scrapling 0.3.tar.gz → 0.3.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. {scrapling-0.3/scrapling.egg-info → scrapling-0.3.2}/PKG-INFO +57 -47
  2. {scrapling-0.3 → scrapling-0.3.2}/README.md +44 -37
  3. {scrapling-0.3 → scrapling-0.3.2}/pyproject.toml +10 -6
  4. {scrapling-0.3 → scrapling-0.3.2}/scrapling/__init__.py +1 -1
  5. {scrapling-0.3 → scrapling-0.3.2}/scrapling/cli.py +38 -51
  6. {scrapling-0.3 → scrapling-0.3.2}/scrapling/core/_html_utils.py +3 -9
  7. {scrapling-0.3 → scrapling-0.3.2}/scrapling/core/ai.py +5 -13
  8. {scrapling-0.3 → scrapling-0.3.2}/scrapling/core/custom_types.py +19 -61
  9. {scrapling-0.3 → scrapling-0.3.2}/scrapling/core/mixins.py +6 -28
  10. {scrapling-0.3 → scrapling-0.3.2}/scrapling/core/shell.py +51 -129
  11. {scrapling-0.3 → scrapling-0.3.2}/scrapling/core/storage.py +2 -8
  12. {scrapling-0.3 → scrapling-0.3.2}/scrapling/core/translator.py +8 -20
  13. scrapling-0.3.2/scrapling/core/utils/__init__.py +10 -0
  14. scrapling-0.3.2/scrapling/core/utils/_shell.py +48 -0
  15. scrapling-0.3/scrapling/core/utils.py → scrapling-0.3.2/scrapling/core/utils/_utils.py +5 -21
  16. scrapling-0.3.2/scrapling/engines/__init__.py +0 -0
  17. scrapling-0.3.2/scrapling/engines/_browsers/_base.py +297 -0
  18. {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/_browsers/_camoufox.py +238 -293
  19. {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/_browsers/_config_tools.py +2 -1
  20. {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/_browsers/_controllers.py +220 -278
  21. {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/_browsers/_page.py +37 -15
  22. {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/_browsers/_validators.py +29 -15
  23. {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/constants.py +3 -6
  24. {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/static.py +25 -75
  25. scrapling-0.3.2/scrapling/engines/toolbelt/__init__.py +1 -0
  26. {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/convertor.py +95 -86
  27. {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/custom.py +7 -99
  28. {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/fingerprints.py +1 -3
  29. {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/navigation.py +4 -58
  30. {scrapling-0.3 → scrapling-0.3.2}/scrapling/fetchers.py +41 -24
  31. {scrapling-0.3 → scrapling-0.3.2}/scrapling/parser.py +45 -122
  32. {scrapling-0.3 → scrapling-0.3.2/scrapling.egg-info}/PKG-INFO +57 -47
  33. {scrapling-0.3 → scrapling-0.3.2}/scrapling.egg-info/SOURCES.txt +4 -1
  34. {scrapling-0.3 → scrapling-0.3.2}/scrapling.egg-info/requires.txt +14 -10
  35. {scrapling-0.3 → scrapling-0.3.2}/setup.cfg +1 -1
  36. scrapling-0.3/scrapling/engines/__init__.py +0 -16
  37. scrapling-0.3/scrapling/engines/toolbelt/__init__.py +0 -20
  38. {scrapling-0.3 → scrapling-0.3.2}/LICENSE +0 -0
  39. {scrapling-0.3 → scrapling-0.3.2}/MANIFEST.in +0 -0
  40. {scrapling-0.3 → scrapling-0.3.2}/scrapling/core/__init__.py +0 -0
  41. {scrapling-0.3 → scrapling-0.3.2}/scrapling/core/_types.py +0 -0
  42. {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/_browsers/__init__.py +0 -0
  43. {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
  44. {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
  45. {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
  46. {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
  47. {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
  48. {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
  49. {scrapling-0.3 → scrapling-0.3.2}/scrapling/py.typed +0 -0
  50. {scrapling-0.3 → scrapling-0.3.2}/scrapling.egg-info/dependency_links.txt +0 -0
  51. {scrapling-0.3 → scrapling-0.3.2}/scrapling.egg-info/entry_points.txt +0 -0
  52. {scrapling-0.3 → scrapling-0.3.2}/scrapling.egg-info/not-zip-safe +0 -0
  53. {scrapling-0.3 → scrapling-0.3.2}/scrapling.egg-info/top_level.txt +0 -0
{scrapling-0.3/scrapling.egg-info → scrapling-0.3.2}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: scrapling
- Version: 0.3
+ Version: 0.3.2
  Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
@@ -64,23 +64,26 @@ Classifier: Typing :: Typed
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: lxml>=6.0.0
+ Requires-Dist: lxml>=6.0.1
  Requires-Dist: cssselect>=1.3.0
- Requires-Dist: click>=8.2.1
- Requires-Dist: orjson>=3.11.2
+ Requires-Dist: orjson>=3.11.3
  Requires-Dist: tldextract>=5.3.0
- Requires-Dist: curl_cffi>=0.13.0
- Requires-Dist: playwright>=1.52.0
- Requires-Dist: rebrowser-playwright>=1.52.0
- Requires-Dist: camoufox>=0.4.11
- Requires-Dist: geoip2>=5.1.0
- Requires-Dist: msgspec>=0.19.0
+ Provides-Extra: fetchers
+ Requires-Dist: click>=8.2.1; extra == "fetchers"
+ Requires-Dist: curl_cffi>=0.13.0; extra == "fetchers"
+ Requires-Dist: playwright>=1.52.0; extra == "fetchers"
+ Requires-Dist: rebrowser-playwright>=1.52.0; extra == "fetchers"
+ Requires-Dist: camoufox>=0.4.11; extra == "fetchers"
+ Requires-Dist: geoip2>=5.1.0; extra == "fetchers"
+ Requires-Dist: msgspec>=0.19.0; extra == "fetchers"
  Provides-Extra: ai
- Requires-Dist: mcp>=1.13.0; extra == "ai"
+ Requires-Dist: mcp>=1.14.0; extra == "ai"
  Requires-Dist: markdownify>=1.2.0; extra == "ai"
+ Requires-Dist: scrapling[fetchers]; extra == "ai"
  Provides-Extra: shell
  Requires-Dist: IPython>=8.37; extra == "shell"
  Requires-Dist: markdownify>=1.2.0; extra == "shell"
+ Requires-Dist: scrapling[fetchers]; extra == "shell"
  Provides-Extra: all
  Requires-Dist: scrapling[ai,shell]; extra == "all"
  Dynamic: license-file
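
The hunk above is the heart of the 0.3.2 packaging change: the browser and HTTP-client requirements move behind a new `fetchers` extra, and the `ai` and `shell` extras now pull that extra in. A minimal sketch using only the standard library to confirm what an installed copy declares, assuming scrapling 0.3.2 is already installed in the current environment:

```python
# Minimal sketch: read the installed scrapling metadata to see the new
# "fetchers" extra and the requirements gated behind it.
# Assumes scrapling>=0.3.2 is installed in the current environment.
from importlib.metadata import metadata

meta = metadata("scrapling")
print(meta.get_all("Provides-Extra"))  # expected to include: fetchers, ai, shell, all
print([req for req in meta.get_all("Requires-Dist") if 'extra == "fetchers"' in req])
```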
@@ -155,9 +158,10 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
  <!-- sponsors -->

  <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
- <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
  <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
+ <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
  <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
+ <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>

  <!-- /sponsors -->

@@ -178,7 +182,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
  - 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms.
  - 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more.
  - 🔍 **Find Similar Elements**: Automatically locate elements similar to found elements.
- - 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features custom, powerful capabilities that utilize Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage.
+ - 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features custom, powerful capabilities that utilize Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage. ([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))

  ### High-Performance & battle-tested Architecture
  - 🚀 **Lightning Fast**: Optimized performance outperforming most Python scraping libraries.
@@ -220,7 +224,7 @@ quotes = page.css('.quote .text::text')

  # Advanced stealth mode (Keep the browser open until you finish)
  with StealthySession(headless=True, solve_cloudflare=True) as session:
- page = session.fetch('https://nopecha.com/demo/cloudflare')
+ page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
  data = page.css('#padded_content a')

  # Or use one-off request style, it opens the browser for this request, then closes it after finishing
@@ -229,7 +233,7 @@ data = page.css('#padded_content a')

  # Full browser automation (Keep the browser open until you finish)
  with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session:
- page = session.fetch('https://quotes.toscrape.com/')
+ page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
  data = page.xpath('//span[@class="text"]/text()') # XPath selector if you prefer it

  # Or use one-off request style, it opens the browser for this request, then closes it after finishing
@@ -273,7 +277,7 @@ from scrapling.parser import Selector

  page = Selector("<html>...</html>")
  ```
- And it works exactly the same!
+ And it works precisely the same way!

  ### Async Session Management Examples
  ```python
@@ -302,6 +306,8 @@ async with AsyncStealthySession(max_pages=2) as session:

  Scrapling v0.3 includes a powerful command-line interface:

+ [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
+
  ```bash
  # Launch interactive Web Scraping shell
  scrapling shell
@@ -320,20 +326,20 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.

  ## Performance Benchmarks

- Scrapling isn't just powerful—it's also blazing fast, and version 0.3 delivers exceptional performance improvements across all operations!
+ Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3 deliver exceptional performance improvements across all operations!

  ### Text Extraction Speed Test (5000 nested elements)

  | # | Library | Time (ms) | vs Scrapling |
  |---|:-----------------:|:---------:|:------------:|
- | 1 | Scrapling | 1.88 | 1.0x |
- | 2 | Parsel/Scrapy | 1.96 | 1.043x |
- | 3 | Raw Lxml | 2.32 | 1.234x |
- | 4 | PyQuery | 20.2 | ~11x |
- | 5 | Selectolax | 85.2 | ~45x |
- | 6 | MechanicalSoup | 1305.84 | ~695x |
- | 7 | BS4 with Lxml | 1307.92 | ~696x |
- | 8 | BS4 with html5lib | 3336.28 | ~1775x |
+ | 1 | Scrapling | 1.92 | 1.0x |
+ | 2 | Parsel/Scrapy | 1.99 | 1.036x |
+ | 3 | Raw Lxml | 2.33 | 1.214x |
+ | 4 | PyQuery | 20.61 | ~11x |
+ | 5 | Selectolax | 80.65 | ~42x |
+ | 6 | BS4 with Lxml | 1283.21 | ~698x |
+ | 7 | MechanicalSoup | 1304.57 | ~679x |
+ | 8 | BS4 with html5lib | 3331.96 | ~1735x |

  ### Element Similarity & Text Search Performance

@@ -341,8 +347,8 @@ Scrapling's adaptive element finding capabilities significantly outperform alter

  | Library | Time (ms) | vs Scrapling |
  |-------------|:---------:|:------------:|
- | Scrapling | 2.02 | 1.0x |
- | AutoScraper | 10.26 | 5.08x |
+ | Scrapling | 1.87 | 1.0x |
+ | AutoScraper | 10.24 | 5.476x |


  > All benchmarks represent averages of 100+ runs. See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology.
@@ -355,29 +361,33 @@ Scrapling requires Python 3.10 or higher:
  pip install scrapling
  ```

- #### Fetchers Setup
-
- If you are going to use any of the fetchers or their classes, then install browser dependencies with
- ```bash
- scrapling install
- ```
-
- This downloads all browsers with their system dependencies and fingerprint manipulation dependencies.
+ Starting with v0.3.2, this installation only includes the parser engine and its dependencies, without any fetchers or commandline dependencies.

  ### Optional Dependencies

- - Install the MCP server feature:
- ```bash
- pip install "scrapling[ai]"
- ```
- - Install shell features (Web Scraping shell and the `extract` command):
- ```bash
- pip install "scrapling[shell]"
- ```
- - Install everything:
- ```bash
- pip install "scrapling[all]"
- ```
+ 1. If you are going to use any of the extra features below, the fetchers, or their classes, then you need to install fetchers' dependencies, and then install their browser dependencies with
+ ```bash
+ pip install "scrapling[fetchers]"
+
+ scrapling install
+ ```
+
+ This downloads all browsers with their system dependencies and fingerprint manipulation dependencies.
+
+ 2. Extra features:
+ - Install the MCP server feature:
+ ```bash
+ pip install "scrapling[ai]"
+ ```
+ - Install shell features (Web Scraping shell and the `extract` command):
+ ```bash
+ pip install "scrapling[shell]"
+ ```
+ - Install everything:
+ ```bash
+ pip install "scrapling[all]"
+ ```
+ Don't forget that you need to install the browser dependencies with `scrapling install` after any of these extras (if you didn't already)

  ## Contributing

{scrapling-0.3 → scrapling-0.3.2}/README.md

@@ -68,9 +68,10 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
  <!-- sponsors -->

  <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
- <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
  <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
+ <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
  <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
+ <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>

  <!-- /sponsors -->

@@ -91,7 +92,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
  - 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms.
  - 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more.
  - 🔍 **Find Similar Elements**: Automatically locate elements similar to found elements.
- - 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features custom, powerful capabilities that utilize Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage.
+ - 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features custom, powerful capabilities that utilize Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage. ([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))

  ### High-Performance & battle-tested Architecture
  - 🚀 **Lightning Fast**: Optimized performance outperforming most Python scraping libraries.
@@ -133,7 +134,7 @@ quotes = page.css('.quote .text::text')

  # Advanced stealth mode (Keep the browser open until you finish)
  with StealthySession(headless=True, solve_cloudflare=True) as session:
- page = session.fetch('https://nopecha.com/demo/cloudflare')
+ page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
  data = page.css('#padded_content a')

  # Or use one-off request style, it opens the browser for this request, then closes it after finishing
@@ -142,7 +143,7 @@ data = page.css('#padded_content a')

  # Full browser automation (Keep the browser open until you finish)
  with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session:
- page = session.fetch('https://quotes.toscrape.com/')
+ page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
  data = page.xpath('//span[@class="text"]/text()') # XPath selector if you prefer it

  # Or use one-off request style, it opens the browser for this request, then closes it after finishing
@@ -186,7 +187,7 @@ from scrapling.parser import Selector

  page = Selector("<html>...</html>")
  ```
- And it works exactly the same!
+ And it works precisely the same way!

  ### Async Session Management Examples
  ```python
@@ -215,6 +216,8 @@ async with AsyncStealthySession(max_pages=2) as session:

  Scrapling v0.3 includes a powerful command-line interface:

+ [![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)
+
  ```bash
  # Launch interactive Web Scraping shell
  scrapling shell
@@ -233,20 +236,20 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.

  ## Performance Benchmarks

- Scrapling isn't just powerful—it's also blazing fast, and version 0.3 delivers exceptional performance improvements across all operations!
+ Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3 deliver exceptional performance improvements across all operations!

  ### Text Extraction Speed Test (5000 nested elements)

  | # | Library | Time (ms) | vs Scrapling |
  |---|:-----------------:|:---------:|:------------:|
- | 1 | Scrapling | 1.88 | 1.0x |
- | 2 | Parsel/Scrapy | 1.96 | 1.043x |
- | 3 | Raw Lxml | 2.32 | 1.234x |
- | 4 | PyQuery | 20.2 | ~11x |
- | 5 | Selectolax | 85.2 | ~45x |
- | 6 | MechanicalSoup | 1305.84 | ~695x |
- | 7 | BS4 with Lxml | 1307.92 | ~696x |
- | 8 | BS4 with html5lib | 3336.28 | ~1775x |
+ | 1 | Scrapling | 1.92 | 1.0x |
+ | 2 | Parsel/Scrapy | 1.99 | 1.036x |
+ | 3 | Raw Lxml | 2.33 | 1.214x |
+ | 4 | PyQuery | 20.61 | ~11x |
+ | 5 | Selectolax | 80.65 | ~42x |
+ | 6 | BS4 with Lxml | 1283.21 | ~698x |
+ | 7 | MechanicalSoup | 1304.57 | ~679x |
+ | 8 | BS4 with html5lib | 3331.96 | ~1735x |

  ### Element Similarity & Text Search Performance

@@ -254,8 +257,8 @@ Scrapling's adaptive element finding capabilities significantly outperform alter

  | Library | Time (ms) | vs Scrapling |
  |-------------|:---------:|:------------:|
- | Scrapling | 2.02 | 1.0x |
- | AutoScraper | 10.26 | 5.08x |
+ | Scrapling | 1.87 | 1.0x |
+ | AutoScraper | 10.24 | 5.476x |


  > All benchmarks represent averages of 100+ runs. See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology.
@@ -268,29 +271,33 @@ Scrapling requires Python 3.10 or higher:
  pip install scrapling
  ```

- #### Fetchers Setup
-
- If you are going to use any of the fetchers or their classes, then install browser dependencies with
- ```bash
- scrapling install
- ```
-
- This downloads all browsers with their system dependencies and fingerprint manipulation dependencies.
+ Starting with v0.3.2, this installation only includes the parser engine and its dependencies, without any fetchers or commandline dependencies.

  ### Optional Dependencies

- - Install the MCP server feature:
- ```bash
- pip install "scrapling[ai]"
- ```
- - Install shell features (Web Scraping shell and the `extract` command):
- ```bash
- pip install "scrapling[shell]"
- ```
- - Install everything:
- ```bash
- pip install "scrapling[all]"
- ```
+ 1. If you are going to use any of the extra features below, the fetchers, or their classes, then you need to install fetchers' dependencies, and then install their browser dependencies with
+ ```bash
+ pip install "scrapling[fetchers]"
+
+ scrapling install
+ ```
+
+ This downloads all browsers with their system dependencies and fingerprint manipulation dependencies.
+
+ 2. Extra features:
+ - Install the MCP server feature:
+ ```bash
+ pip install "scrapling[ai]"
+ ```
+ - Install shell features (Web Scraping shell and the `extract` command):
+ ```bash
+ pip install "scrapling[shell]"
+ ```
+ - Install everything:
+ ```bash
+ pip install "scrapling[all]"
+ ```
+ Don't forget that you need to install the browser dependencies with `scrapling install` after any of these extras (if you didn't already)

  ## Contributing

@@ -319,4 +326,4 @@ This project includes code adapted from:
  - [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches) for stealth improvements

  ---
- <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
+ <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
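
Both copies of the README diff update the session examples with two new keyword arguments: `google_search=False` on `StealthySession.fetch` and `load_dom=False` on `DynamicSession.fetch`. Stitched into one runnable sketch with exactly the URLs and flags shown above; it assumes the session classes are importable from `scrapling.fetchers` as in the library's examples, and that `pip install "scrapling[fetchers]"` plus `scrapling install` have been run first:

```python
# Sketch combining the updated README examples from the hunks above.
# Assumes scrapling[fetchers] is installed and `scrapling install` was run.
from scrapling.fetchers import StealthySession, DynamicSession

# Advanced stealth mode, now with the google_search flag from the diff
with StealthySession(headless=True, solve_cloudflare=True) as session:
    page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
    data = page.css('#padded_content a')

# Full browser automation, now with the load_dom flag from the diff
with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session:
    page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
    quotes = page.xpath('//span[@class="text"]/text()')
```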
{scrapling-0.3 → scrapling-0.3.2}/pyproject.toml

@@ -56,11 +56,15 @@ classifiers = [
  "Typing :: Typed",
  ]
  dependencies = [
- "lxml>=6.0.0",
+ "lxml>=6.0.1",
  "cssselect>=1.3.0",
- "click>=8.2.1",
- "orjson>=3.11.2",
+ "orjson>=3.11.3",
  "tldextract>=5.3.0",
+ ]
+
+ [project.optional-dependencies]
+ fetchers = [
+ "click>=8.2.1",
  "curl_cffi>=0.13.0",
  "playwright>=1.52.0",
  "rebrowser-playwright>=1.52.0",
@@ -68,15 +72,15 @@ dependencies = [
  "geoip2>=5.1.0",
  "msgspec>=0.19.0",
  ]
-
- [project.optional-dependencies]
  ai = [
- "mcp>=1.13.0",
+ "mcp>=1.14.0",
  "markdownify>=1.2.0",
+ "scrapling[fetchers]",
  ]
  shell = [
  "IPython>=8.37", # The last version that supports Python 3.10
  "markdownify>=1.2.0",
+ "scrapling[fetchers]",
  ]
  all = [
  "scrapling[ai,shell]",
{scrapling-0.3 → scrapling-0.3.2}/scrapling/__init__.py

@@ -1,5 +1,5 @@
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.3"
+ __version__ = "0.3.2"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"


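A one-line way to check which side of this version bump is installed (a trivial sketch; `__version__` is defined in `scrapling/__init__.py` as shown above):

```python
# Sketch: confirm the installed package matches the bumped version string above.
import scrapling

print(scrapling.__version__)  # "0.3.2" after upgrading, "0.3" before
```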
{scrapling-0.3 → scrapling-0.3.2}/scrapling/cli.py

@@ -2,14 +2,18 @@ from pathlib import Path
  from subprocess import check_output
  from sys import executable as python_executable

- from scrapling.core.utils import log
- from scrapling.engines.toolbelt import Response
+ from scrapling.engines.toolbelt.custom import Response
+ from scrapling.core.utils import log, _CookieParser, _ParseHeaders
  from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable
- from scrapling.fetchers import Fetcher, DynamicFetcher, StealthyFetcher
- from scrapling.core.shell import Convertor, _CookieParser, _ParseHeaders

  from orjson import loads as json_loads, JSONDecodeError
- from click import command, option, Choice, group, argument
+
+ try:
+ from click import command, option, Choice, group, argument
+ except (ImportError, ModuleNotFoundError) as e:
+ raise ModuleNotFoundError(
+ "You need to install scrapling with any of the extras to enable Shell commands. See: https://scrapling.readthedocs.io/en/latest/#installation"
+ ) from e

  __OUTPUT_FILE_HELP__ = "The output file path can be an HTML file, a Markdown file of the HTML content, or the text content itself. Use file extensions (`.html`/`.md`/`.txt`) respectively."
  __PACKAGE_DIR__ = Path(__file__).parent
@@ -40,6 +44,8 @@ def __Request_and_Save(
  **kwargs,
  ) -> None:
  """Make a request using the specified fetcher function and save the result"""
+ from scrapling.core.shell import Convertor
+
  # Handle relative paths - convert to an absolute path based on the current working directory
  output_path = Path(output_file)
  if not output_path.is_absolute():
@@ -72,14 +78,10 @@ def __ParseExtractArguments(
  return parsed_headers, parsed_cookies, parsed_params, parsed_json


- def __BuildRequest(
- headers: List[str], cookies: str, params: str, json: Optional[str] = None, **kwargs
- ) -> Dict:
+ def __BuildRequest(headers: List[str], cookies: str, params: str, json: Optional[str] = None, **kwargs) -> Dict:
  """Build a request object using the specified arguments"""
  # Parse parameters
- parsed_headers, parsed_cookies, parsed_params, parsed_json = (
- __ParseExtractArguments(headers, cookies, params, json)
- )
+ parsed_headers, parsed_cookies, parsed_params, parsed_json = __ParseExtractArguments(headers, cookies, params, json)
  # Build request arguments
  request_kwargs = {
  "headers": parsed_headers if parsed_headers else None,
@@ -106,10 +108,7 @@ def __BuildRequest(
  help="Force Scrapling to reinstall all Fetchers dependencies",
  )
  def install(force): # pragma: no cover
- if (
- force
- or not __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").exists()
- ):
+ if force or not __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").exists():
  __Execute(
  [python_executable, "-m", "playwright", "install", "chromium"],
  "Playwright browsers",
@@ -158,9 +157,7 @@ def mcp():
  "level",
  is_flag=False,
  default="debug",
- type=Choice(
- ["debug", "info", "warning", "error", "critical", "fatal"], case_sensitive=False
- ),
+ type=Choice(["debug", "info", "warning", "error", "critical", "fatal"], case_sensitive=False),
  help="Log level (default: DEBUG)",
  )
  def shell(code, level):
@@ -178,9 +175,7 @@ def extract():
  pass


- @extract.command(
- help=f"Perform a GET request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
- )
+ @extract.command(help=f"Perform a GET request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
  @argument("url", required=True)
  @argument("output_file", required=True)
  @option(
@@ -190,9 +185,7 @@ def extract():
  help='HTTP headers in format "Key: Value" (can be used multiple times)',
  )
  @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
- @option(
- "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
- )
+ @option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
  @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
  @option(
  "--css-selector",
@@ -264,12 +257,12 @@ def get(
  impersonate=impersonate,
  proxy=proxy,
  )
+ from scrapling.fetchers import Fetcher
+
  __Request_and_Save(Fetcher.get, url, output_file, css_selector, **kwargs)


- @extract.command(
- help=f"Perform a POST request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
- )
+ @extract.command(help=f"Perform a POST request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
  @argument("url", required=True)
  @argument("output_file", required=True)
  @option(
@@ -285,9 +278,7 @@ def get(
  help='HTTP headers in format "Key: Value" (can be used multiple times)',
  )
  @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
- @option(
- "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
- )
+ @option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
  @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
  @option(
  "--css-selector",
@@ -364,12 +355,12 @@ def post(
  proxy=proxy,
  data=data,
  )
+ from scrapling.fetchers import Fetcher
+
  __Request_and_Save(Fetcher.post, url, output_file, css_selector, **kwargs)


- @extract.command(
- help=f"Perform a PUT request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
- )
+ @extract.command(help=f"Perform a PUT request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
  @argument("url", required=True)
  @argument("output_file", required=True)
  @option("--data", "-d", help="Form data to include in the request body")
@@ -381,9 +372,7 @@ def post(
  help='HTTP headers in format "Key: Value" (can be used multiple times)',
  )
  @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
- @option(
- "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
- )
+ @option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
  @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
  @option(
  "--css-selector",
@@ -460,12 +449,12 @@ def put(
  proxy=proxy,
  data=data,
  )
+ from scrapling.fetchers import Fetcher
+
  __Request_and_Save(Fetcher.put, url, output_file, css_selector, **kwargs)


- @extract.command(
- help=f"Perform a DELETE request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
- )
+ @extract.command(help=f"Perform a DELETE request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
  @argument("url", required=True)
  @argument("output_file", required=True)
  @option(
@@ -475,9 +464,7 @@ def put(
  help='HTTP headers in format "Key: Value" (can be used multiple times)',
  )
  @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
- @option(
- "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
- )
+ @option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
  @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
  @option(
  "--css-selector",
@@ -549,12 +536,12 @@ def delete(
  impersonate=impersonate,
  proxy=proxy,
  )
+ from scrapling.fetchers import Fetcher
+
  __Request_and_Save(Fetcher.delete, url, output_file, css_selector, **kwargs)


- @extract.command(
- help=f"Use DynamicFetcher to fetch content with browser automation.\n\n{__OUTPUT_FILE_HELP__}"
- )
+ @extract.command(help=f"Use DynamicFetcher to fetch content with browser automation.\n\n{__OUTPUT_FILE_HELP__}")
  @argument("url", required=True)
  @argument("output_file", required=True)
  @option(
@@ -591,9 +578,7 @@ def delete(
  )
  @option("--wait-selector", help="CSS selector to wait for before proceeding")
  @option("--locale", default="en-US", help="Browser locale (default: en-US)")
- @option(
- "--stealth/--no-stealth", default=False, help="Enable stealth mode (default: False)"
- )
+ @option("--stealth/--no-stealth", default=False, help="Enable stealth mode (default: False)")
  @option(
  "--hide-canvas/--show-canvas",
  default=False,
@@ -672,12 +657,12 @@ def fetch(
  if parsed_headers:
  kwargs["extra_headers"] = parsed_headers

+ from scrapling.fetchers import DynamicFetcher
+
  __Request_and_Save(DynamicFetcher.fetch, url, output_file, css_selector, **kwargs)


- @extract.command(
- help=f"Use StealthyFetcher to fetch content with advanced stealth features.\n\n{__OUTPUT_FILE_HELP__}"
- )
+ @extract.command(help=f"Use StealthyFetcher to fetch content with advanced stealth features.\n\n{__OUTPUT_FILE_HELP__}")
  @argument("url", required=True)
  @argument("output_file", required=True)
  @option(
@@ -821,6 +806,8 @@ def stealthy_fetch(
  if parsed_headers:
  kwargs["extra_headers"] = parsed_headers

+ from scrapling.fetchers import StealthyFetcher
+
  __Request_and_Save(StealthyFetcher.fetch, url, output_file, css_selector, **kwargs)


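The recurring pattern across the `cli.py` hunks above is dependency gating: `click` is wrapped in a `try`/`except` that raises a descriptive `ModuleNotFoundError` when the extra is missing, and the heavy fetcher classes are imported inside each command body rather than at module import time. A generic, minimal sketch of that pattern (the command and option names below are illustrative, not Scrapling's actual CLI):

```python
# Generic sketch of the gating pattern used above: fail fast with a clear
# message when an optional dependency is missing, and defer heavy imports
# until the command actually runs. Names below are illustrative only.
try:
    from click import command, option  # optional dependency provided by an extra
except (ImportError, ModuleNotFoundError) as e:
    raise ModuleNotFoundError(
        "Install the package with its CLI extra to enable shell commands."
    ) from e


@command()
@option("--url", required=True)
def fetch_page(url):
    # Imported lazily so `--help` and plain module import stay cheap.
    from scrapling.fetchers import Fetcher  # available once the extra is installed

    page = Fetcher.get(url)
    print(page.status)


if __name__ == "__main__":
    fetch_page()
```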
{scrapling-0.3 → scrapling-0.3.2}/scrapling/core/_html_utils.py

@@ -269,17 +269,13 @@ name2codepoint = {
  }


- def to_unicode(
- text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict"
- ) -> str:
+ def to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict") -> str:
  """Return the Unicode representation of a bytes object `text`. If `text`
  is already a Unicode object, return it as-is."""
  if isinstance(text, str):
  return text
  if not isinstance(text, (bytes, str)):
- raise TypeError(
- f"to_unicode must receive bytes or str, got {type(text).__name__}"
- )
+ raise TypeError(f"to_unicode must receive bytes or str, got {type(text).__name__}")
  if encoding is None:
  encoding = "utf-8"
  return text.decode(encoding, errors)
@@ -328,9 +324,7 @@ def _replace_entities(
  entity_name = groups["named"]
  if entity_name.lower() in keep:
  return m.group(0)
- number = name2codepoint.get(entity_name) or name2codepoint.get(
- entity_name.lower()
- )
+ number = name2codepoint.get(entity_name) or name2codepoint.get(entity_name.lower())
  if number is not None:
  # Browsers typically
  # interpret numeric character references in the 80-9F range as representing the characters mapped
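
The `to_unicode` hunks only reflow the signature and the `TypeError` call, so behavior is unchanged. For reference, this is how the helper behaves per the body shown above (a sketch; it assumes the private module `scrapling.core._html_utils` exposes `to_unicode` exactly as in the diff):

```python
# Sketch of to_unicode's behavior as shown in the hunk above (not a new API).
from scrapling.core._html_utils import to_unicode

assert to_unicode("café") == "café"            # str passes through unchanged
assert to_unicode("café".encode()) == "café"   # bytes decoded, UTF-8 by default
try:
    to_unicode(123)                            # anything else raises TypeError
except TypeError as exc:
    print(exc)  # "to_unicode must receive bytes or str, got int"
```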