scrapling 0.3.tar.gz → 0.3.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scrapling-0.3/scrapling.egg-info → scrapling-0.3.2}/PKG-INFO +57 -47
- {scrapling-0.3 → scrapling-0.3.2}/README.md +44 -37
- {scrapling-0.3 → scrapling-0.3.2}/pyproject.toml +10 -6
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/__init__.py +1 -1
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/cli.py +38 -51
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/core/_html_utils.py +3 -9
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/core/ai.py +5 -13
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/core/custom_types.py +19 -61
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/core/mixins.py +6 -28
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/core/shell.py +51 -129
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/core/storage.py +2 -8
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/core/translator.py +8 -20
- scrapling-0.3.2/scrapling/core/utils/__init__.py +10 -0
- scrapling-0.3.2/scrapling/core/utils/_shell.py +48 -0
- scrapling-0.3/scrapling/core/utils.py → scrapling-0.3.2/scrapling/core/utils/_utils.py +5 -21
- scrapling-0.3.2/scrapling/engines/__init__.py +0 -0
- scrapling-0.3.2/scrapling/engines/_browsers/_base.py +297 -0
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/_browsers/_camoufox.py +238 -293
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/_browsers/_config_tools.py +2 -1
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/_browsers/_controllers.py +220 -278
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/_browsers/_page.py +37 -15
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/_browsers/_validators.py +29 -15
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/constants.py +3 -6
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/static.py +25 -75
- scrapling-0.3.2/scrapling/engines/toolbelt/__init__.py +1 -0
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/convertor.py +95 -86
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/custom.py +7 -99
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/fingerprints.py +1 -3
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/navigation.py +4 -58
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/fetchers.py +41 -24
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/parser.py +45 -122
- {scrapling-0.3 → scrapling-0.3.2/scrapling.egg-info}/PKG-INFO +57 -47
- {scrapling-0.3 → scrapling-0.3.2}/scrapling.egg-info/SOURCES.txt +4 -1
- {scrapling-0.3 → scrapling-0.3.2}/scrapling.egg-info/requires.txt +14 -10
- {scrapling-0.3 → scrapling-0.3.2}/setup.cfg +1 -1
- scrapling-0.3/scrapling/engines/__init__.py +0 -16
- scrapling-0.3/scrapling/engines/toolbelt/__init__.py +0 -20
- {scrapling-0.3 → scrapling-0.3.2}/LICENSE +0 -0
- {scrapling-0.3 → scrapling-0.3.2}/MANIFEST.in +0 -0
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/core/__init__.py +0 -0
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/core/_types.py +0 -0
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/_browsers/__init__.py +0 -0
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.3 → scrapling-0.3.2}/scrapling/py.typed +0 -0
- {scrapling-0.3 → scrapling-0.3.2}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.3 → scrapling-0.3.2}/scrapling.egg-info/entry_points.txt +0 -0
- {scrapling-0.3 → scrapling-0.3.2}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.3 → scrapling-0.3.2}/scrapling.egg-info/top_level.txt +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scrapling
-Version: 0.3
+Version: 0.3.2
 Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -64,23 +64,26 @@ Classifier: Typing :: Typed
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: lxml>=6.0.
+Requires-Dist: lxml>=6.0.1
 Requires-Dist: cssselect>=1.3.0
-Requires-Dist:
-Requires-Dist: orjson>=3.11.2
+Requires-Dist: orjson>=3.11.3
 Requires-Dist: tldextract>=5.3.0
-
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
+Provides-Extra: fetchers
+Requires-Dist: click>=8.2.1; extra == "fetchers"
+Requires-Dist: curl_cffi>=0.13.0; extra == "fetchers"
+Requires-Dist: playwright>=1.52.0; extra == "fetchers"
+Requires-Dist: rebrowser-playwright>=1.52.0; extra == "fetchers"
+Requires-Dist: camoufox>=0.4.11; extra == "fetchers"
+Requires-Dist: geoip2>=5.1.0; extra == "fetchers"
+Requires-Dist: msgspec>=0.19.0; extra == "fetchers"
 Provides-Extra: ai
-Requires-Dist: mcp>=1.
+Requires-Dist: mcp>=1.14.0; extra == "ai"
 Requires-Dist: markdownify>=1.2.0; extra == "ai"
+Requires-Dist: scrapling[fetchers]; extra == "ai"
 Provides-Extra: shell
 Requires-Dist: IPython>=8.37; extra == "shell"
 Requires-Dist: markdownify>=1.2.0; extra == "shell"
+Requires-Dist: scrapling[fetchers]; extra == "shell"
 Provides-Extra: all
 Requires-Dist: scrapling[ai,shell]; extra == "all"
 Dynamic: license-file
@@ -155,9 +158,10 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 <!-- sponsors -->
 
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
-<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
 <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
+<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
 <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
+<a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
 
 <!-- /sponsors -->
 
@@ -178,7 +182,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 - 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms.
 - 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more.
 - 🔍 **Find Similar Elements**: Automatically locate elements similar to found elements.
-- 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features custom, powerful capabilities that utilize Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage.
+- 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features custom, powerful capabilities that utilize Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage. ([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
 
 ### High-Performance & battle-tested Architecture
 - 🚀 **Lightning Fast**: Optimized performance outperforming most Python scraping libraries.
@@ -220,7 +224,7 @@ quotes = page.css('.quote .text::text')
 
 # Advanced stealth mode (Keep the browser open until you finish)
 with StealthySession(headless=True, solve_cloudflare=True) as session:
-    page = session.fetch('https://nopecha.com/demo/cloudflare')
+    page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
     data = page.css('#padded_content a')
 
 # Or use one-off request style, it opens the browser for this request, then closes it after finishing
@@ -229,7 +233,7 @@ data = page.css('#padded_content a')
 
 # Full browser automation (Keep the browser open until you finish)
 with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session:
-    page = session.fetch('https://quotes.toscrape.com/')
+    page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
     data = page.xpath('//span[@class="text"]/text()') # XPath selector if you prefer it
 
 # Or use one-off request style, it opens the browser for this request, then closes it after finishing
@@ -273,7 +277,7 @@ from scrapling.parser import Selector
 
 page = Selector("<html>...</html>")
 ```
-And it works
+And it works precisely the same way!
 
 ### Async Session Management Examples
 ```python
@@ -302,6 +306,8 @@ async with AsyncStealthySession(max_pages=2) as session:
 
 Scrapling v0.3 includes a powerful command-line interface:
 
+[](https://asciinema.org/a/736339)
+
 ```bash
 # Launch interactive Web Scraping shell
 scrapling shell
@@ -320,20 +326,20 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
 
 ## Performance Benchmarks
 
-Scrapling isn't just powerful—it's also blazing fast, and version 0.3
+Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3 deliver exceptional performance improvements across all operations!
 
 ### Text Extraction Speed Test (5000 nested elements)
 
 | # | Library | Time (ms) | vs Scrapling |
 |---|:-----------------:|:---------:|:------------:|
-| 1 | Scrapling | 1.
-| 2 | Parsel/Scrapy | 1.
-| 3 | Raw Lxml | 2.
-| 4 | PyQuery | 20.
-| 5 | Selectolax |
-| 6 |
-| 7 |
-| 8 | BS4 with html5lib |
+| 1 | Scrapling | 1.92 | 1.0x |
+| 2 | Parsel/Scrapy | 1.99 | 1.036x |
+| 3 | Raw Lxml | 2.33 | 1.214x |
+| 4 | PyQuery | 20.61 | ~11x |
+| 5 | Selectolax | 80.65 | ~42x |
+| 6 | BS4 with Lxml | 1283.21 | ~698x |
+| 7 | MechanicalSoup | 1304.57 | ~679x |
+| 8 | BS4 with html5lib | 3331.96 | ~1735x |
 
 ### Element Similarity & Text Search Performance
 
@@ -341,8 +347,8 @@ Scrapling's adaptive element finding capabilities significantly outperform alter
 
 | Library | Time (ms) | vs Scrapling |
 |-------------|:---------:|:------------:|
-| Scrapling |
-| AutoScraper | 10.
+| Scrapling | 1.87 | 1.0x |
+| AutoScraper | 10.24 | 5.476x |
 
 
 > All benchmarks represent averages of 100+ runs. See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology.
@@ -355,29 +361,33 @@ Scrapling requires Python 3.10 or higher:
 pip install scrapling
 ```
 
-
-
-If you are going to use any of the fetchers or their classes, then install browser dependencies with
-```bash
-scrapling install
-```
-
-This downloads all browsers with their system dependencies and fingerprint manipulation dependencies.
+Starting with v0.3.2, this installation only includes the parser engine and its dependencies, without any fetchers or commandline dependencies.
 
 ### Optional Dependencies
 
-
-```bash
-pip install "scrapling[
-
-
-```
-
-
-
-
-
-```
+1. If you are going to use any of the extra features below, the fetchers, or their classes, then you need to install fetchers' dependencies, and then install their browser dependencies with
+```bash
+pip install "scrapling[fetchers]"
+
+scrapling install
+```
+
+This downloads all browsers with their system dependencies and fingerprint manipulation dependencies.
+
+2. Extra features:
+- Install the MCP server feature:
+```bash
+pip install "scrapling[ai]"
+```
+- Install shell features (Web Scraping shell and the `extract` command):
+```bash
+pip install "scrapling[shell]"
+```
+- Install everything:
+```bash
+pip install "scrapling[all]"
+```
+Don't forget that you need to install the browser dependencies with `scrapling install` after any of these extras (if you didn't already)
 
 ## Contributing
 
README.md

@@ -68,9 +68,10 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 <!-- sponsors -->
 
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
-<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
 <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
+<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
 <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
+<a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
 
 <!-- /sponsors -->
 
@@ -91,7 +92,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 - 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms.
 - 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more.
 - 🔍 **Find Similar Elements**: Automatically locate elements similar to found elements.
-- 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features custom, powerful capabilities that utilize Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage.
+- 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features custom, powerful capabilities that utilize Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage. ([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))
 
 ### High-Performance & battle-tested Architecture
 - 🚀 **Lightning Fast**: Optimized performance outperforming most Python scraping libraries.
@@ -133,7 +134,7 @@ quotes = page.css('.quote .text::text')
 
 # Advanced stealth mode (Keep the browser open until you finish)
 with StealthySession(headless=True, solve_cloudflare=True) as session:
-    page = session.fetch('https://nopecha.com/demo/cloudflare')
+    page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)
     data = page.css('#padded_content a')
 
 # Or use one-off request style, it opens the browser for this request, then closes it after finishing
@@ -142,7 +143,7 @@ data = page.css('#padded_content a')
 
 # Full browser automation (Keep the browser open until you finish)
 with DynamicSession(headless=True, disable_resources=False, network_idle=True) as session:
-    page = session.fetch('https://quotes.toscrape.com/')
+    page = session.fetch('https://quotes.toscrape.com/', load_dom=False)
     data = page.xpath('//span[@class="text"]/text()') # XPath selector if you prefer it
 
 # Or use one-off request style, it opens the browser for this request, then closes it after finishing
@@ -186,7 +187,7 @@ from scrapling.parser import Selector
 
 page = Selector("<html>...</html>")
 ```
-And it works
+And it works precisely the same way!
 
 ### Async Session Management Examples
 ```python
@@ -215,6 +216,8 @@ async with AsyncStealthySession(max_pages=2) as session:
 
 Scrapling v0.3 includes a powerful command-line interface:
 
+[](https://asciinema.org/a/736339)
+
 ```bash
 # Launch interactive Web Scraping shell
 scrapling shell
@@ -233,20 +236,20 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
 
 ## Performance Benchmarks
 
-Scrapling isn't just powerful—it's also blazing fast, and version 0.3
+Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3 deliver exceptional performance improvements across all operations!
 
 ### Text Extraction Speed Test (5000 nested elements)
 
 | # | Library | Time (ms) | vs Scrapling |
 |---|:-----------------:|:---------:|:------------:|
-| 1 | Scrapling | 1.
-| 2 | Parsel/Scrapy | 1.
-| 3 | Raw Lxml | 2.
-| 4 | PyQuery | 20.
-| 5 | Selectolax |
-| 6 |
-| 7 |
-| 8 | BS4 with html5lib |
+| 1 | Scrapling | 1.92 | 1.0x |
+| 2 | Parsel/Scrapy | 1.99 | 1.036x |
+| 3 | Raw Lxml | 2.33 | 1.214x |
+| 4 | PyQuery | 20.61 | ~11x |
+| 5 | Selectolax | 80.65 | ~42x |
+| 6 | BS4 with Lxml | 1283.21 | ~698x |
+| 7 | MechanicalSoup | 1304.57 | ~679x |
+| 8 | BS4 with html5lib | 3331.96 | ~1735x |
 
 ### Element Similarity & Text Search Performance
 
@@ -254,8 +257,8 @@ Scrapling's adaptive element finding capabilities significantly outperform alter
 
 | Library | Time (ms) | vs Scrapling |
 |-------------|:---------:|:------------:|
-| Scrapling |
-| AutoScraper | 10.
+| Scrapling | 1.87 | 1.0x |
+| AutoScraper | 10.24 | 5.476x |
 
 
 > All benchmarks represent averages of 100+ runs. See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology.
@@ -268,29 +271,33 @@ Scrapling requires Python 3.10 or higher:
 pip install scrapling
 ```
 
-
-
-If you are going to use any of the fetchers or their classes, then install browser dependencies with
-```bash
-scrapling install
-```
-
-This downloads all browsers with their system dependencies and fingerprint manipulation dependencies.
+Starting with v0.3.2, this installation only includes the parser engine and its dependencies, without any fetchers or commandline dependencies.
 
 ### Optional Dependencies
 
-
-```bash
-pip install "scrapling[
-
-
-```
-
-
-
-
-
-```
+1. If you are going to use any of the extra features below, the fetchers, or their classes, then you need to install fetchers' dependencies, and then install their browser dependencies with
+```bash
+pip install "scrapling[fetchers]"
+
+scrapling install
+```
+
+This downloads all browsers with their system dependencies and fingerprint manipulation dependencies.
+
+2. Extra features:
+- Install the MCP server feature:
+```bash
+pip install "scrapling[ai]"
+```
+- Install shell features (Web Scraping shell and the `extract` command):
+```bash
+pip install "scrapling[shell]"
+```
+- Install everything:
+```bash
+pip install "scrapling[all]"
+```
+Don't forget that you need to install the browser dependencies with `scrapling install` after any of these extras (if you didn't already)
 
 ## Contributing
 
@@ -319,4 +326,4 @@ This project includes code adapted from:
 - [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches) for stealth improvements
 
 ---
-<div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
+<div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
pyproject.toml

@@ -56,11 +56,15 @@ classifiers = [
     "Typing :: Typed",
 ]
 dependencies = [
-    "lxml>=6.0.
+    "lxml>=6.0.1",
     "cssselect>=1.3.0",
-    "
-    "orjson>=3.11.2",
+    "orjson>=3.11.3",
     "tldextract>=5.3.0",
+]
+
+[project.optional-dependencies]
+fetchers = [
+    "click>=8.2.1",
     "curl_cffi>=0.13.0",
     "playwright>=1.52.0",
     "rebrowser-playwright>=1.52.0",
@@ -68,15 +72,15 @@ dependencies = [
     "geoip2>=5.1.0",
     "msgspec>=0.19.0",
 ]
-
-[project.optional-dependencies]
 ai = [
-    "mcp>=1.
+    "mcp>=1.14.0",
     "markdownify>=1.2.0",
+    "scrapling[fetchers]",
 ]
 shell = [
     "IPython>=8.37", # The last version that supports Python 3.10
     "markdownify>=1.2.0",
+    "scrapling[fetchers]",
 ]
 all = [
     "scrapling[ai,shell]",
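The net effect of the pyproject.toml change above: the browser and HTTP stack (click, curl_cffi, playwright, rebrowser-playwright, camoufox, geoip2, msgspec) moves out of the core `dependencies` list into a new `fetchers` extra, and the `ai` and `shell` extras pull it back in through the self-referential `scrapling[fetchers]` requirement. Below is a minimal, hypothetical sketch (not part of the package) of how downstream code could probe for that optional stack at runtime; the import names are assumed to match the distribution names listed in the diff.

```python
# Hypothetical downstream check, not part of Scrapling itself.
from importlib.util import find_spec

# Import names assumed to match the distributions in the new "fetchers" extra.
_FETCHER_MODULES = ("click", "curl_cffi", "playwright", "camoufox")

# True only when `pip install "scrapling[fetchers]"` (or a superset extra) was used.
HAS_FETCHERS = all(find_spec(name) is not None for name in _FETCHER_MODULES)

if HAS_FETCHERS:
    from scrapling.fetchers import Fetcher
else:
    Fetcher = None  # parser-only install: plain `pip install scrapling`
```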
scrapling/cli.py

@@ -2,14 +2,18 @@ from pathlib import Path
 from subprocess import check_output
 from sys import executable as python_executable
 
-from scrapling.
-from scrapling.
+from scrapling.engines.toolbelt.custom import Response
+from scrapling.core.utils import log, _CookieParser, _ParseHeaders
 from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable
-from scrapling.fetchers import Fetcher, DynamicFetcher, StealthyFetcher
-from scrapling.core.shell import Convertor, _CookieParser, _ParseHeaders
 
 from orjson import loads as json_loads, JSONDecodeError
-
+
+try:
+    from click import command, option, Choice, group, argument
+except (ImportError, ModuleNotFoundError) as e:
+    raise ModuleNotFoundError(
+        "You need to install scrapling with any of the extras to enable Shell commands. See: https://scrapling.readthedocs.io/en/latest/#installation"
+    ) from e
 
 __OUTPUT_FILE_HELP__ = "The output file path can be an HTML file, a Markdown file of the HTML content, or the text content itself. Use file extensions (`.html`/`.md`/`.txt`) respectively."
 __PACKAGE_DIR__ = Path(__file__).parent
@@ -40,6 +44,8 @@ def __Request_and_Save(
     **kwargs,
 ) -> None:
     """Make a request using the specified fetcher function and save the result"""
+    from scrapling.core.shell import Convertor
+
     # Handle relative paths - convert to an absolute path based on the current working directory
     output_path = Path(output_file)
     if not output_path.is_absolute():
@@ -72,14 +78,10 @@ def __ParseExtractArguments(
     return parsed_headers, parsed_cookies, parsed_params, parsed_json
 
 
-def __BuildRequest(
-    headers: List[str], cookies: str, params: str, json: Optional[str] = None, **kwargs
-) -> Dict:
+def __BuildRequest(headers: List[str], cookies: str, params: str, json: Optional[str] = None, **kwargs) -> Dict:
     """Build a request object using the specified arguments"""
     # Parse parameters
-    parsed_headers, parsed_cookies, parsed_params, parsed_json = (
-        __ParseExtractArguments(headers, cookies, params, json)
-    )
+    parsed_headers, parsed_cookies, parsed_params, parsed_json = __ParseExtractArguments(headers, cookies, params, json)
     # Build request arguments
     request_kwargs = {
         "headers": parsed_headers if parsed_headers else None,
@@ -106,10 +108,7 @@ def __BuildRequest(
     help="Force Scrapling to reinstall all Fetchers dependencies",
 )
 def install(force): # pragma: no cover
-    if (
-        force
-        or not __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").exists()
-    ):
+    if force or not __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").exists():
         __Execute(
             [python_executable, "-m", "playwright", "install", "chromium"],
             "Playwright browsers",
@@ -158,9 +157,7 @@ def mcp():
     "level",
     is_flag=False,
     default="debug",
-    type=Choice(
-        ["debug", "info", "warning", "error", "critical", "fatal"], case_sensitive=False
-    ),
+    type=Choice(["debug", "info", "warning", "error", "critical", "fatal"], case_sensitive=False),
     help="Log level (default: DEBUG)",
 )
 def shell(code, level):
@@ -178,9 +175,7 @@ def extract():
     pass
 
 
-@extract.command(
-    help=f"Perform a GET request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
-)
+@extract.command(help=f"Perform a GET request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
 @argument("url", required=True)
 @argument("output_file", required=True)
 @option(
@@ -190,9 +185,7 @@ def extract():
     help='HTTP headers in format "Key: Value" (can be used multiple times)',
 )
 @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
-@option(
-    "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
-)
+@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
 @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
 @option(
     "--css-selector",
@@ -264,12 +257,12 @@ def get(
         impersonate=impersonate,
         proxy=proxy,
     )
+    from scrapling.fetchers import Fetcher
+
     __Request_and_Save(Fetcher.get, url, output_file, css_selector, **kwargs)
 
 
-@extract.command(
-    help=f"Perform a POST request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
-)
+@extract.command(help=f"Perform a POST request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
 @argument("url", required=True)
 @argument("output_file", required=True)
 @option(
@@ -285,9 +278,7 @@ def get(
     help='HTTP headers in format "Key: Value" (can be used multiple times)',
 )
 @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
-@option(
-    "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
-)
+@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
 @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
 @option(
     "--css-selector",
@@ -364,12 +355,12 @@ def post(
         proxy=proxy,
         data=data,
     )
+    from scrapling.fetchers import Fetcher
+
     __Request_and_Save(Fetcher.post, url, output_file, css_selector, **kwargs)
 
 
-@extract.command(
-    help=f"Perform a PUT request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
-)
+@extract.command(help=f"Perform a PUT request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
 @argument("url", required=True)
 @argument("output_file", required=True)
 @option("--data", "-d", help="Form data to include in the request body")
@@ -381,9 +372,7 @@ def post(
     help='HTTP headers in format "Key: Value" (can be used multiple times)',
 )
 @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
-@option(
-    "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
-)
+@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
 @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
 @option(
     "--css-selector",
@@ -460,12 +449,12 @@ def put(
         proxy=proxy,
         data=data,
     )
+    from scrapling.fetchers import Fetcher
+
     __Request_and_Save(Fetcher.put, url, output_file, css_selector, **kwargs)
 
 
-@extract.command(
-    help=f"Perform a DELETE request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
-)
+@extract.command(help=f"Perform a DELETE request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}")
 @argument("url", required=True)
 @argument("output_file", required=True)
 @option(
@@ -475,9 +464,7 @@ def put(
     help='HTTP headers in format "Key: Value" (can be used multiple times)',
 )
 @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
-@option(
-    "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
-)
+@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
 @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
 @option(
     "--css-selector",
@@ -549,12 +536,12 @@ def delete(
         impersonate=impersonate,
         proxy=proxy,
     )
+    from scrapling.fetchers import Fetcher
+
     __Request_and_Save(Fetcher.delete, url, output_file, css_selector, **kwargs)
 
 
-@extract.command(
-    help=f"Use DynamicFetcher to fetch content with browser automation.\n\n{__OUTPUT_FILE_HELP__}"
-)
+@extract.command(help=f"Use DynamicFetcher to fetch content with browser automation.\n\n{__OUTPUT_FILE_HELP__}")
 @argument("url", required=True)
 @argument("output_file", required=True)
 @option(
@@ -591,9 +578,7 @@ def delete(
 )
 @option("--wait-selector", help="CSS selector to wait for before proceeding")
 @option("--locale", default="en-US", help="Browser locale (default: en-US)")
-@option(
-    "--stealth/--no-stealth", default=False, help="Enable stealth mode (default: False)"
-)
+@option("--stealth/--no-stealth", default=False, help="Enable stealth mode (default: False)")
 @option(
     "--hide-canvas/--show-canvas",
     default=False,
@@ -672,12 +657,12 @@ def fetch(
     if parsed_headers:
         kwargs["extra_headers"] = parsed_headers
 
+    from scrapling.fetchers import DynamicFetcher
+
     __Request_and_Save(DynamicFetcher.fetch, url, output_file, css_selector, **kwargs)
 
 
-@extract.command(
-    help=f"Use StealthyFetcher to fetch content with advanced stealth features.\n\n{__OUTPUT_FILE_HELP__}"
-)
+@extract.command(help=f"Use StealthyFetcher to fetch content with advanced stealth features.\n\n{__OUTPUT_FILE_HELP__}")
 @argument("url", required=True)
 @argument("output_file", required=True)
 @option(
@@ -821,6 +806,8 @@ def stealthy_fetch(
     if parsed_headers:
         kwargs["extra_headers"] = parsed_headers
 
+    from scrapling.fetchers import StealthyFetcher
+
     __Request_and_Save(StealthyFetcher.fetch, url, output_file, css_selector, **kwargs)
 
 
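The cli.py hunks above boil down to one pattern: fail early with a clear ModuleNotFoundError when click (now only installed with the extras) is missing, and defer the `scrapling.fetchers` imports into the command bodies so that a parser-only install can still import the package. Here is a condensed, illustrative sketch of that pattern under those assumptions; the command and its options are simplified stand-ins, not the real CLI surface.

```python
# Illustrative sketch of the guarded/deferred import pattern from the new cli.py.
try:
    # click is only present when an extra such as "fetchers"/"shell" is installed
    from click import group, argument, option
except (ImportError, ModuleNotFoundError) as e:
    raise ModuleNotFoundError(
        "You need to install scrapling with any of the extras to enable Shell commands."
    ) from e


@group()
def cli():
    """Hypothetical top-level command group."""


@cli.command()
@argument("url", required=True)
@option("--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)")
def get(url, timeout):
    # Deferred import: the fetcher stack is only loaded when the command actually runs.
    from scrapling.fetchers import Fetcher

    page = Fetcher.get(url, timeout=timeout)
    print(page.status)


if __name__ == "__main__":
    cli()
```

In the real module, the commands additionally funnel their arguments through the shared `__BuildRequest` and `__Request_and_Save` helpers shown in the hunks above.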
scrapling/core/_html_utils.py

@@ -269,17 +269,13 @@ name2codepoint = {
 }
 
 
-def to_unicode(
-    text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict"
-) -> str:
+def to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str = "strict") -> str:
     """Return the Unicode representation of a bytes object `text`. If `text`
     is already a Unicode object, return it as-is."""
     if isinstance(text, str):
         return text
     if not isinstance(text, (bytes, str)):
-        raise TypeError(
-            f"to_unicode must receive bytes or str, got {type(text).__name__}"
-        )
+        raise TypeError(f"to_unicode must receive bytes or str, got {type(text).__name__}")
     if encoding is None:
         encoding = "utf-8"
     return text.decode(encoding, errors)
@@ -328,9 +324,7 @@ def _replace_entities(
     entity_name = groups["named"]
     if entity_name.lower() in keep:
         return m.group(0)
-    number = name2codepoint.get(entity_name) or name2codepoint.get(
-        entity_name.lower()
-    )
+    number = name2codepoint.get(entity_name) or name2codepoint.get(entity_name.lower())
     if number is not None:
         # Browsers typically
         # interpret numeric character references in the 80-9F range as representing the characters mapped