scrapling 0.2.99__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff shows the changes between publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (54)
  1. scrapling/__init__.py +18 -31
  2. scrapling/cli.py +818 -20
  3. scrapling/core/_html_utils.py +348 -0
  4. scrapling/core/_types.py +34 -17
  5. scrapling/core/ai.py +611 -0
  6. scrapling/core/custom_types.py +183 -100
  7. scrapling/core/mixins.py +27 -19
  8. scrapling/core/shell.py +647 -0
  9. scrapling/core/{storage_adaptors.py → storage.py} +41 -33
  10. scrapling/core/translator.py +20 -26
  11. scrapling/core/utils.py +49 -54
  12. scrapling/engines/__init__.py +15 -6
  13. scrapling/engines/_browsers/__init__.py +2 -0
  14. scrapling/engines/_browsers/_camoufox.py +759 -0
  15. scrapling/engines/_browsers/_config_tools.py +130 -0
  16. scrapling/engines/_browsers/_controllers.py +644 -0
  17. scrapling/engines/_browsers/_page.py +93 -0
  18. scrapling/engines/_browsers/_validators.py +170 -0
  19. scrapling/engines/constants.py +101 -88
  20. scrapling/engines/static.py +667 -110
  21. scrapling/engines/toolbelt/__init__.py +20 -6
  22. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  23. scrapling/engines/toolbelt/convertor.py +254 -0
  24. scrapling/engines/toolbelt/custom.py +158 -175
  25. scrapling/engines/toolbelt/fingerprints.py +32 -46
  26. scrapling/engines/toolbelt/navigation.py +68 -39
  27. scrapling/fetchers.py +239 -333
  28. scrapling/parser.py +781 -449
  29. scrapling-0.3.1.dist-info/METADATA +411 -0
  30. scrapling-0.3.1.dist-info/RECORD +41 -0
  31. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/WHEEL +1 -1
  32. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/top_level.txt +0 -1
  33. scrapling/defaults.py +0 -25
  34. scrapling/engines/camo.py +0 -339
  35. scrapling/engines/pw.py +0 -465
  36. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  37. scrapling-0.2.99.dist-info/METADATA +0 -290
  38. scrapling-0.2.99.dist-info/RECORD +0 -49
  39. tests/__init__.py +0 -1
  40. tests/fetchers/__init__.py +0 -1
  41. tests/fetchers/async/__init__.py +0 -0
  42. tests/fetchers/async/test_camoufox.py +0 -97
  43. tests/fetchers/async/test_httpx.py +0 -85
  44. tests/fetchers/async/test_playwright.py +0 -101
  45. tests/fetchers/sync/__init__.py +0 -0
  46. tests/fetchers/sync/test_camoufox.py +0 -70
  47. tests/fetchers/sync/test_httpx.py +0 -84
  48. tests/fetchers/sync/test_playwright.py +0 -89
  49. tests/fetchers/test_utils.py +0 -97
  50. tests/parser/__init__.py +0 -0
  51. tests/parser/test_automatch.py +0 -111
  52. tests/parser/test_general.py +0 -330
  53. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/entry_points.txt +0 -0
  54. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/licenses/LICENSE +0 -0
scrapling-0.2.99.dist-info/METADATA DELETED
@@ -1,290 +0,0 @@
- Metadata-Version: 2.4
- Name: scrapling
- Version: 0.2.99
- Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy again! In an internet filled with complications,
- Home-page: https://github.com/D4Vinci/Scrapling
- Author: Karim Shoair
- Author-email: karim.shoair@pm.me
- License: BSD
- Project-URL: Documentation, https://scrapling.readthedocs.io/en/latest/
- Project-URL: Source, https://github.com/D4Vinci/Scrapling
- Project-URL: Tracker, https://github.com/D4Vinci/Scrapling/issues
- Classifier: Operating System :: OS Independent
- Classifier: Development Status :: 4 - Beta
- Classifier: Intended Audience :: Developers
- Classifier: License :: OSI Approved :: BSD License
- Classifier: Natural Language :: English
- Classifier: Topic :: Internet :: WWW/HTTP
- Classifier: Topic :: Text Processing :: Markup
- Classifier: Topic :: Internet :: WWW/HTTP :: Browsers
- Classifier: Topic :: Text Processing :: Markup :: HTML
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3 :: Only
- Classifier: Programming Language :: Python :: 3.9
- Classifier: Programming Language :: Python :: 3.10
- Classifier: Programming Language :: Python :: 3.11
- Classifier: Programming Language :: Python :: 3.12
- Classifier: Programming Language :: Python :: 3.13
- Classifier: Programming Language :: Python :: Implementation :: CPython
- Classifier: Typing :: Typed
- Requires-Python: >=3.9
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: lxml>=5.0
- Requires-Dist: cssselect>=1.2
- Requires-Dist: click
- Requires-Dist: w3lib
- Requires-Dist: orjson>=3
- Requires-Dist: tldextract
- Requires-Dist: httpx[brotli,socks,zstd]
- Requires-Dist: playwright>=1.49.1
- Requires-Dist: rebrowser-playwright>=1.49.1
- Requires-Dist: camoufox[geoip]>=0.4.11
- Dynamic: author
- Dynamic: author-email
- Dynamic: classifier
- Dynamic: description
- Dynamic: description-content-type
- Dynamic: home-page
- Dynamic: license
- Dynamic: license-file
- Dynamic: project-url
- Dynamic: requires-dist
- Dynamic: requires-python
- Dynamic: summary
-
- # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Easy Web Scraping with Python
- <p align="center">
- <a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
- <img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
- <a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
- <img alt="PyPI version" src="https://badge.fury.io/py/Scrapling.svg"></a>
- <a href="https://pepy.tech/project/scrapling" alt="PyPI Downloads">
- <img alt="PyPI Downloads" src="https://static.pepy.tech/badge/scrapling"></a>
- <br/>
- <a href="https://pypi.org/project/scrapling/" alt="Supported Python versions">
- <img alt="Supported Python versions" src="https://img.shields.io/pypi/pyversions/scrapling.svg"></a>
- </p>
- <p align="center">
- <a href="https://scrapling.readthedocs.io/en/latest/#installation">
- Installation
- </a>
- ·
- <a href="https://scrapling.readthedocs.io/en/latest/overview/">
- Overview
- </a>
- ·
- <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
- Selection methods
- </a>
- ·
- <a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing/">
- Choosing a fetcher
- </a>
- ·
- <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
- Migrating from Beautifulsoup
- </a>
- </p>
-
- Dealing with failing web scrapers due to anti-bot protections or website changes? Meet Scrapling.
-
- Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.
-
- ```python
- >> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
- >> StealthyFetcher.auto_match = True
- # Fetch websites' source under the radar!
- >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
- >> print(page.status)
- 200
- >> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
- >> # Later, if the website structure changes, pass `auto_match=True`
- >> products = page.css('.product', auto_match=True) # and Scrapling still finds them!
- ```
-
- # Sponsors
-
- [Scrapeless Deep SerpApi](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci) From $0.10 per 1,000 queries with a 1-2 second response time!
-
- Deep SerpApi is a dedicated search engine designed for large language models (LLMs) and AI agents. It aims to provide real-time, accurate, and unbiased information to help AI applications retrieve and process data efficiently.
- - covering 20+ Google SERP scenarios and mainstream search engines.
- - support real-time data updates to ensure real-time and accurate information.
- - It can integrate information from all available online channels and search engines.
- - Deep SerpApi will simplify the process of integrating dynamic web information into AI solutions, and ultimately achieve an ALL-in-One API for one-click search and extraction of web data.
- - **Developer Support Program**: Integrate Scrapeless Deep SerpApi into your AI tools, applications or projects. [We already support Dify, and will soon support frameworks such as Langchain, Langflow, FlowiseAI]. Then share your results on GitHub or social media, and you will get a 1-12 month free developer support opportunity, up to 500 free usage per month.
- - 🚀 **Scraping API**: Effortless and highly customizable data extraction with a single API call, providing structured data from any website.
- - ⚡ **Scraping Browser**: AI-powered and LLM-driven, it simulates human-like behavior with genuine fingerprints and headless browser support, ensuring seamless, block-free scraping.
- - 🌐 **Proxies**: Use high-quality, rotating proxies to scrape top platforms like Amazon, Shopee, and more, with global coverage in 195+ countries.
-
-
- [![Scrapeless Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/scrapeless.jpg)](https://www.scrapeless.com/en/product/deep-serp-api?utm_source=website&utm_medium=ads&utm_campaign=scraping&utm_term=d4vinci)
- ---
-
- [Evomi](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling) is your Swiss Quality Proxy Provider, starting at **$0.49/GB**
-
- - 👩‍💻 **$0.49 per GB Residential Proxies**: Our price is unbeatable
- - 👩‍💻 **24/7 Expert Support**: We will join your Slack Channel
- - 🌍 **Global Presence**: Available in 150+ Countries
- - ⚡ **Low Latency**
- - 🔒 **Swiss Quality and Privacy**
- - 🎁 **Free Trial**
- - 🛡️ **99.9% Uptime**
- - 🤝 **Special IP Pool selection**: Optimize for fast, quality or quantity of ips
- - 🔧 **Easy Integration**: Compatible with most software and programming languages
-
- [![Evomi Banner](https://my.evomi.com/images/brand/cta.png)](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
- ---
-
- ## Key Features
-
- ### Fetch websites as you prefer with async support
- - **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class.
- - **Dynamic Loading & Automation**: Fetch dynamic websites with the `PlayWrightFetcher` class through your real browser, Scrapling's stealth mode, Playwright's Chrome browser, or [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless!
- - **Anti-bot Protections Bypass**: Easily bypass protections with the `StealthyFetcher` and `PlayWrightFetcher` classes.
-
- ### Adaptive Scraping
- - 🔄 **Smart Element Tracking**: Relocate elements after website changes using an intelligent similarity system and integrated storage.
- - 🎯 **Flexible Selection**: CSS selectors, XPath selectors, filters-based search, text search, regex search, and more.
- - 🔍 **Find Similar Elements**: Automatically locate elements similar to the element you found!
- - 🧠 **Smart Content Scraping**: Extract data from multiple websites using Scrapling's powerful features without specific selectors.
-
- ### High Performance
- - 🚀 **Lightning Fast**: Built from the ground up with performance in mind, outperforming most popular Python scraping libraries.
- - 🔋 **Memory Efficient**: Optimized data structures for minimal memory footprint.
- - ⚡ **Fast JSON serialization**: 10x faster than standard library.
-
- ### Developer Friendly
- - 🛠️ **Powerful Navigation API**: Easy DOM traversal in all directions.
- - 🧬 **Rich Text Processing**: All strings have built-in regex, cleaning methods, and more. All elements' attributes are optimized dictionaries with added methods that consume less memory than standard dictionaries.
- - 📝 **Auto Selectors Generation**: Generate robust short and full CSS/XPath selectors for any element.
- - 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup and the same pseudo-elements used in Scrapy.
- - 📘 **Type hints**: Complete type/doc-strings coverage for future-proofing and best autocompletion support.
-
- ## Getting Started
-
- ```python
- from scrapling.fetchers import Fetcher
-
- # Do HTTP GET request to a web page and create an Adaptor instance
- page = Fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
- # Get all text content from all HTML tags in the page except the `script` and `style` tags
- page.get_all_text(ignore_tags=('script', 'style'))
-
- # Get all quotes elements; any of these methods will return a list of strings directly (TextHandlers)
- quotes = page.css('.quote .text::text') # CSS selector
- quotes = page.xpath('//span[@class="text"]/text()') # XPath
- quotes = page.css('.quote').css('.text::text') # Chained selectors
- quotes = [element.text for element in page.css('.quote .text')] # Slower than bulk query above
-
- # Get the first quote element
- quote = page.css_first('.quote') # same as page.css('.quote').first or page.css('.quote')[0]
-
- # Tired of selectors? Use find_all/find
- # Get all 'div' HTML tags that one of its 'class' values is 'quote'
- quotes = page.find_all('div', {'class': 'quote'})
- # Same as
- quotes = page.find_all('div', class_='quote')
- quotes = page.find_all(['div'], class_='quote')
- quotes = page.find_all(class_='quote') # and so on...
-
- # Working with elements
- quote.html_content # Get the Inner HTML of this element
- quote.prettify() # Prettified version of Inner HTML above
- quote.attrib # Get that element's attributes
- quote.path # DOM path to element (List of all ancestors from <html> tag till the element itself)
- ```
- To keep it simple, all methods can be chained on top of each other!
-
- > [!NOTE]
- > Check out the full documentation from [here](https://scrapling.readthedocs.io/en/latest/)
-
- ## Parsing Performance
-
- Scrapling isn't just powerful - it's also blazing fast. Scrapling implements many best practices, design patterns, and numerous optimizations to save fractions of seconds. All of that while focusing exclusively on parsing HTML documents.
- Here are benchmarks comparing Scrapling to popular Python libraries in two tests.
-
- ### Text Extraction Speed Test (5000 nested elements).
-
- This test consists of extracting the text content of 5000 nested div elements.
-
-
- | # | Library | Time (ms) | vs Scrapling |
- |---|:-----------------:|:---------:|:------------:|
- | 1 | Scrapling | 5.44 | 1.0x |
- | 2 | Parsel/Scrapy | 5.53 | 1.017x |
- | 3 | Raw Lxml | 6.76 | 1.243x |
- | 4 | PyQuery | 21.96 | 4.037x |
- | 5 | Selectolax | 67.12 | 12.338x |
- | 6 | BS4 with Lxml | 1307.03 | 240.263x |
- | 7 | MechanicalSoup | 1322.64 | 243.132x |
- | 8 | BS4 with html5lib | 3373.75 | 620.175x |
-
- As you see, Scrapling is on par with Scrapy and slightly faster than Lxml, which both libraries are built on top of. These are the closest results to Scrapling. PyQuery is also built on top of Lxml, but Scrapling is four times faster.
-
- ### Extraction By Text Speed Test
-
- Scrapling can find elements based on its text content and find elements similar to these elements. The only known library with these two features, too, is AutoScraper.
-
- So, we compared this to see how fast Scrapling can be in these two tasks compared to AutoScraper.
-
- Here are the results:
-
- | Library | Time (ms) | vs Scrapling |
- |-------------|:---------:|:------------:|
- | Scrapling | 2.51 | 1.0x |
- | AutoScraper | 11.41 | 4.546x |
-
- Scrapling can find elements with more methods and returns the entire element's `Adaptor` object, not only text like AutoScraper. So, to make this test fair, both libraries will extract an element with text, find similar elements, and then extract the text content for all of them.
-
- As you see, Scrapling is still 4.5 times faster at the same task.
-
- If we made Scrapling extract the elements only without stopping to extract each element's text, we would get speed twice as fast as this, but as I said, to make it fair comparison a bit :smile:
-
- > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.
-
- ## Installation
- Scrapling is a breeze to get started with. Starting from version 0.2.9, we require at least Python 3.9 to work.
- ```bash
- pip3 install scrapling
- ```
- Then run this command to install browsers' dependencies needed to use Fetcher classes
- ```bash
- scrapling install
- ```
- If you have any installation issues, please open an issue.
-
-
- ## More Sponsors!
- <a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" alt="SerpApi Banner" ></a>
-
-
- ## Contributing
- Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
-
- Please read the [contributing file](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) before doing anything.
-
- ## Disclaimer for Scrapling Project
- > [!CAUTION]
- > This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international data scraping and privacy laws. The authors and contributors are not responsible for any misuse of this software. This library should not be used to violate the rights of others, for unethical purposes, or to use data in an unauthorized or illegal manner. Do not use it on any website unless you have permission from the website owner or within their allowed rules, such as the `robots.txt` file.
-
- ## License
- This work is licensed under BSD-3
-
- ## Acknowledgments
- This project includes code adapted from:
- - Parsel (BSD License) - Used for [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/translator.py) submodule
-
- ## Thanks and References
- - [Daijro](https://github.com/daijro)'s brilliant work on both [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
- - [Vinyzu](https://github.com/Vinyzu)'s work on Playwright's mock on [Botright](https://github.com/Vinyzu/Botright)
- - [brotector](https://github.com/kaliiiiiiiiii/brotector)
- - [fakebrowser](https://github.com/kkoooqq/fakebrowser)
- - [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches)
-
- ## Known Issues
- - In the auto-matching save process, the unique properties of the first element from the selection results are the only ones that get saved. If the selector you are using selects different elements on the page in different locations, auto-matching will return the first element to you only when you relocate it later. This doesn't include combined CSS selectors (Using commas to combine more than one selector, for example), as these selectors get separated, and each selector gets executed alone.
-
- ---
- <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
scrapling-0.2.99.dist-info/RECORD DELETED
@@ -1,49 +0,0 @@
- scrapling/__init__.py,sha256=hNIkfkiY42w_VSw9-Flicz83FYGGdXvY3uWMtcoyJV8,1510
- scrapling/cli.py,sha256=7yTsMhVAqqS8Z27T5dFKrR9_X8vuFjBlwYgAF22W7T8,1292
- scrapling/defaults.py,sha256=07g4E7-A8QPkpDuEW4EP0tI4m_KBQQWIoi7dbRxZyXI,1413
- scrapling/fetchers.py,sha256=fsHq27mAa8hFMl263igdUpT7pphUhYFnxLf025bE6DQ,41718
- scrapling/parser.py,sha256=4BWeHYszXZAN9_t-5G5zE7ZaUSwj0vP8e_tsTfe4hn0,53957
- scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
- scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scrapling/core/_types.py,sha256=dKVi_dUxdxNtTr7sj7ySkHXDfrsmjFTfpCQeO5tGuBY,670
- scrapling/core/custom_types.py,sha256=EWGx5t5scHEB1SMsitzc8duskq-5f-Qaj40IWkNTRzM,12947
- scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
- scrapling/core/storage_adaptors.py,sha256=gZbUpHtLOL7o_oZbES_o40r39zShxTeTM8YK6dXA5Zo,6214
- scrapling/core/translator.py,sha256=3a2VX9KR-q-GzwT1OgGDv1UlzIkvBggkQXUdiMyL-4c,5277
- scrapling/core/utils.py,sha256=KX88B3tV1-SgCAr69TUN3LfmsTDcLnEhYJiPuWd31yA,3704
- scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
- scrapling/engines/camo.py,sha256=4q8f1uHTlyNVWhOF-a7xUJFzquCJ7bd0K4huhhoa5pY,18398
- scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
- scrapling/engines/pw.py,sha256=1_Njrv3hOtXfx5dQya2ivaGuX7d4rIwfBtYvQcxgsv4,23339
- scrapling/engines/static.py,sha256=EjdaR0beqWfEKKavT7vlBnozoayQaVpqeVtaOuzd384,9306
- scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
- scrapling/engines/toolbelt/custom.py,sha256=m5RUTmqaJ7lF2LhVAOzJGIRNEYtxdSx0JgCfHpy0X1A,13428
- scrapling/engines/toolbelt/fingerprints.py,sha256=Zzoqq3p6X_8D7eTxACz3z96cBZWWK61iKOGo2sZUtlg,2924
- scrapling/engines/toolbelt/navigation.py,sha256=fMjDgicqy2MoZZll2h5EvrrxkL6yNrC09v8isTpwAt0,4565
- scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
- scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=3RP1AE_XZRvpupeV_i-WSNVqRxyUy0qd8rQV8j_4j3U,221
- scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
- scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
- scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
- scrapling-0.2.99.dist-info/licenses/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
- tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
- tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
- tests/fetchers/test_utils.py,sha256=ANFu-4FFhtyGFGIwJksUO2M2tTTcKU2M_t6F2aav8lM,4967
- tests/fetchers/async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- tests/fetchers/async/test_camoufox.py,sha256=-Ar5b5d14S3k-S0j2p6yM-8sibaqMBaB9TK_iU8aCIA,3702
- tests/fetchers/async/test_httpx.py,sha256=yABAGK1ZEvceZAV5O4WB25x0j8amb2lwaHqbEizPSzM,3903
- tests/fetchers/async/test_playwright.py,sha256=ubKX8BKZ8VImI6an-cIIcs5RCrrtZiTA_CAkipnO-UY,4125
- tests/fetchers/sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- tests/fetchers/sync/test_camoufox.py,sha256=-N4GrvOd37vDECUzZko9IJUBs3xrDyGw1nSUDwiJnnI,3182
- tests/fetchers/sync/test_httpx.py,sha256=oJeDGqCAzVcHawAZ2_-guUiEKMgvgQOuLkA24RVpP5E,3624
- tests/fetchers/sync/test_playwright.py,sha256=OIcI3I4Tp8zvoJEgHvlP_nHyf5Kf1cMeKcxT6oFmGB4,3837
- tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
- tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
- scrapling-0.2.99.dist-info/METADATA,sha256=93jrvv1iAQ7kWOVDrn9wEvgSOS8WmE3cMpjzpHscD1s,16054
- scrapling-0.2.99.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- scrapling-0.2.99.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
- scrapling-0.2.99.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
- scrapling-0.2.99.dist-info/RECORD,,
tests/__init__.py DELETED
@@ -1 +0,0 @@
- """Package for test project."""
tests/fetchers/__init__.py DELETED
@@ -1 +0,0 @@
- # Because I'm too lazy to mock requests :)
File without changes
tests/fetchers/async/test_camoufox.py DELETED
@@ -1,97 +0,0 @@
- import pytest
- import pytest_httpbin
-
- from scrapling import StealthyFetcher
-
- StealthyFetcher.auto_match = True
-
-
- @pytest_httpbin.use_class_based_httpbin
- @pytest.mark.asyncio
- class TestStealthyFetcher:
-     @pytest.fixture(scope="class")
-     def fetcher(self):
-         return StealthyFetcher
-
-     @pytest.fixture(scope="class")
-     def urls(self, httpbin):
-         url = httpbin.url
-         return {
-             'status_200': f'{url}/status/200',
-             'status_404': f'{url}/status/404',
-             'status_501': f'{url}/status/501',
-             'basic_url': f'{url}/get',
-             'html_url': f'{url}/html',
-             'delayed_url': f'{url}/delay/10',  # 10 Seconds delay response
-             'cookies_url': f"{url}/cookies/set/test/value"
-         }
-
-     async def test_basic_fetch(self, fetcher, urls):
-         """Test doing basic fetch request with multiple statuses"""
-         assert (await fetcher.async_fetch(urls['status_200'])).status == 200
-         assert (await fetcher.async_fetch(urls['status_404'])).status == 404
-         assert (await fetcher.async_fetch(urls['status_501'])).status == 501
-
-     async def test_networkidle(self, fetcher, urls):
-         """Test if waiting for `networkidle` make page does not finish loading or not"""
-         assert (await fetcher.async_fetch(urls['basic_url'], network_idle=True)).status == 200
-
-     async def test_blocking_resources(self, fetcher, urls):
-         """Test if blocking resources make page does not finish loading or not"""
-         assert (await fetcher.async_fetch(urls['basic_url'], block_images=True)).status == 200
-         assert (await fetcher.async_fetch(urls['basic_url'], disable_resources=True)).status == 200
-
-     async def test_waiting_selector(self, fetcher, urls):
-         """Test if waiting for a selector make page does not finish loading or not"""
-         assert (await fetcher.async_fetch(urls['html_url'], wait_selector='h1')).status == 200
-         assert (await fetcher.async_fetch(
-             urls['html_url'],
-             wait_selector='h1',
-             wait_selector_state='visible'
-         )).status == 200
-
-     async def test_cookies_loading(self, fetcher, urls):
-         """Test if cookies are set after the request"""
-         response = await fetcher.async_fetch(urls['cookies_url'])
-         assert response.cookies == {'test': 'value'}
-
-     async def test_automation(self, fetcher, urls):
-         """Test if automation break the code or not"""
-
-         async def scroll_page(page):
-             await page.mouse.wheel(10, 0)
-             await page.mouse.move(100, 400)
-             await page.mouse.up()
-             return page
-
-         assert (await fetcher.async_fetch(urls['html_url'], page_action=scroll_page)).status == 200
-
-     async def test_properties(self, fetcher, urls):
-         """Test if different arguments breaks the code or not"""
-         assert (await fetcher.async_fetch(
-             urls['html_url'],
-             block_webrtc=True,
-             allow_webgl=True
-         )).status == 200
-
-         assert (await fetcher.async_fetch(
-             urls['html_url'],
-             block_webrtc=False,
-             allow_webgl=True
-         )).status == 200
-
-         assert (await fetcher.async_fetch(
-             urls['html_url'],
-             block_webrtc=True,
-             allow_webgl=False
-         )).status == 200
-
-         assert (await fetcher.async_fetch(
-             urls['html_url'],
-             extra_headers={'ayo': ''},
-             os_randomize=True
-         )).status == 200
-
-     async def test_infinite_timeout(self, fetcher, urls):
-         """Test if infinite timeout breaks the code or not"""
-         assert (await fetcher.async_fetch(urls['delayed_url'], timeout=None)).status == 200
tests/fetchers/async/test_httpx.py DELETED
@@ -1,85 +0,0 @@
- import pytest
- import pytest_httpbin
-
- from scrapling.fetchers import AsyncFetcher
-
- AsyncFetcher.auto_match = True
-
-
- @pytest_httpbin.use_class_based_httpbin
- @pytest.mark.asyncio
- class TestAsyncFetcher:
-     @pytest.fixture(scope="class")
-     def fetcher(self):
-         return AsyncFetcher
-
-     @pytest.fixture(scope="class")
-     def urls(self, httpbin):
-         return {
-             'status_200': f'{httpbin.url}/status/200',
-             'status_404': f'{httpbin.url}/status/404',
-             'status_501': f'{httpbin.url}/status/501',
-             'basic_url': f'{httpbin.url}/get',
-             'post_url': f'{httpbin.url}/post',
-             'put_url': f'{httpbin.url}/put',
-             'delete_url': f'{httpbin.url}/delete',
-             'html_url': f'{httpbin.url}/html'
-         }
-
-     async def test_basic_get(self, fetcher, urls):
-         """Test doing basic get request with multiple statuses"""
-         assert (await fetcher.get(urls['status_200'])).status == 200
-         assert (await fetcher.get(urls['status_404'])).status == 404
-         assert (await fetcher.get(urls['status_501'])).status == 501
-
-     async def test_get_properties(self, fetcher, urls):
-         """Test if different arguments with GET request breaks the code or not"""
-         assert (await fetcher.get(urls['status_200'], stealthy_headers=True)).status == 200
-         assert (await fetcher.get(urls['status_200'], follow_redirects=True)).status == 200
-         assert (await fetcher.get(urls['status_200'], timeout=None)).status == 200
-         assert (await fetcher.get(
-             urls['status_200'],
-             stealthy_headers=True,
-             follow_redirects=True,
-             timeout=None
-         )).status == 200
-
-     async def test_post_properties(self, fetcher, urls):
-         """Test if different arguments with POST request breaks the code or not"""
-         assert (await fetcher.post(urls['post_url'], data={'key': 'value'})).status == 200
-         assert (await fetcher.post(urls['post_url'], data={'key': 'value'}, stealthy_headers=True)).status == 200
-         assert (await fetcher.post(urls['post_url'], data={'key': 'value'}, follow_redirects=True)).status == 200
-         assert (await fetcher.post(urls['post_url'], data={'key': 'value'}, timeout=None)).status == 200
-         assert (await fetcher.post(
-             urls['post_url'],
-             data={'key': 'value'},
-             stealthy_headers=True,
-             follow_redirects=True,
-             timeout=None
-         )).status == 200
-
-     async def test_put_properties(self, fetcher, urls):
-         """Test if different arguments with PUT request breaks the code or not"""
-         assert (await fetcher.put(urls['put_url'], data={'key': 'value'})).status in [200, 405]
-         assert (await fetcher.put(urls['put_url'], data={'key': 'value'}, stealthy_headers=True)).status in [200, 405]
-         assert (await fetcher.put(urls['put_url'], data={'key': 'value'}, follow_redirects=True)).status in [200, 405]
-         assert (await fetcher.put(urls['put_url'], data={'key': 'value'}, timeout=None)).status in [200, 405]
-         assert (await fetcher.put(
-             urls['put_url'],
-             data={'key': 'value'},
-             stealthy_headers=True,
-             follow_redirects=True,
-             timeout=None
-         )).status in [200, 405]
-
-     async def test_delete_properties(self, fetcher, urls):
-         """Test if different arguments with DELETE request breaks the code or not"""
-         assert (await fetcher.delete(urls['delete_url'], stealthy_headers=True)).status == 200
-         assert (await fetcher.delete(urls['delete_url'], follow_redirects=True)).status == 200
-         assert (await fetcher.delete(urls['delete_url'], timeout=None)).status == 200
-         assert (await fetcher.delete(
-             urls['delete_url'],
-             stealthy_headers=True,
-             follow_redirects=True,
-             timeout=None
-         )).status == 200
tests/fetchers/async/test_playwright.py DELETED
@@ -1,101 +0,0 @@
- import pytest
- import pytest_httpbin
-
- from scrapling import PlayWrightFetcher
-
- PlayWrightFetcher.auto_match = True
-
-
- @pytest_httpbin.use_class_based_httpbin
- class TestPlayWrightFetcherAsync:
-     @pytest.fixture
-     def fetcher(self):
-         return PlayWrightFetcher
-
-     @pytest.fixture
-     def urls(self, httpbin):
-         return {
-             'status_200': f'{httpbin.url}/status/200',
-             'status_404': f'{httpbin.url}/status/404',
-             'status_501': f'{httpbin.url}/status/501',
-             'basic_url': f'{httpbin.url}/get',
-             'html_url': f'{httpbin.url}/html',
-             'delayed_url': f'{httpbin.url}/delay/10',
-             'cookies_url': f"{httpbin.url}/cookies/set/test/value"
-         }
-
-     @pytest.mark.asyncio
-     async def test_basic_fetch(self, fetcher, urls):
-         """Test doing basic fetch request with multiple statuses"""
-         response = await fetcher.async_fetch(urls['status_200'])
-         assert response.status == 200
-
-     @pytest.mark.asyncio
-     async def test_networkidle(self, fetcher, urls):
-         """Test if waiting for `networkidle` make page does not finish loading or not"""
-         response = await fetcher.async_fetch(urls['basic_url'], network_idle=True)
-         assert response.status == 200
-
-     @pytest.mark.asyncio
-     async def test_blocking_resources(self, fetcher, urls):
-         """Test if blocking resources make page does not finish loading or not"""
-         response = await fetcher.async_fetch(urls['basic_url'], disable_resources=True)
-         assert response.status == 200
-
-     @pytest.mark.asyncio
-     async def test_waiting_selector(self, fetcher, urls):
-         """Test if waiting for a selector make page does not finish loading or not"""
-         response1 = await fetcher.async_fetch(urls['html_url'], wait_selector='h1')
-         assert response1.status == 200
-
-         response2 = await fetcher.async_fetch(urls['html_url'], wait_selector='h1', wait_selector_state='visible')
-         assert response2.status == 200
-
-     @pytest.mark.asyncio
-     async def test_cookies_loading(self, fetcher, urls):
-         """Test if cookies are set after the request"""
-         response = await fetcher.async_fetch(urls['cookies_url'])
-         assert response.cookies == {'test': 'value'}
-
-     @pytest.mark.asyncio
-     async def test_automation(self, fetcher, urls):
-         """Test if automation break the code or not"""
-         async def scroll_page(page):
-             await page.mouse.wheel(10, 0)
-             await page.mouse.move(100, 400)
-             await page.mouse.up()
-             return page
-
-         response = await fetcher.async_fetch(urls['html_url'], page_action=scroll_page)
-         assert response.status == 200
-
-     @pytest.mark.parametrize("kwargs", [
-         {"disable_webgl": True, "hide_canvas": False},
-         {"disable_webgl": False, "hide_canvas": True},
-         # {"stealth": True},  # causes issues with Github Actions
-         {"useragent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0'},
-         {"extra_headers": {'ayo': ''}}
-     ])
-     @pytest.mark.asyncio
-     async def test_properties(self, fetcher, urls, kwargs):
-         """Test if different arguments breaks the code or not"""
-         response = await fetcher.async_fetch(urls['html_url'], **kwargs)
-         assert response.status == 200
-
-     @pytest.mark.asyncio
-     async def test_cdp_url_invalid(self, fetcher, urls):
-         """Test if invalid CDP URLs raise appropriate exceptions"""
-         with pytest.raises(ValueError):
-             await fetcher.async_fetch(urls['html_url'], cdp_url='blahblah')
-
-         with pytest.raises(ValueError):
-             await fetcher.async_fetch(urls['html_url'], cdp_url='blahblah', nstbrowser_mode=True)
-
-         with pytest.raises(Exception):
-             await fetcher.async_fetch(urls['html_url'], cdp_url='ws://blahblah')
-
-     @pytest.mark.asyncio
-     async def test_infinite_timeout(self, fetcher, urls):
-         """Test if infinite timeout breaks the code or not"""
-         response = await fetcher.async_fetch(urls['delayed_url'], timeout=None)
-         assert response.status == 200
File without changes