scrapling 0.4.2__tar.gz → 0.4.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scrapling-0.4.2/scrapling.egg-info → scrapling-0.4.4}/PKG-INFO +76 -30
- {scrapling-0.4.2 → scrapling-0.4.4}/README.md +70 -25
- {scrapling-0.4.2 → scrapling-0.4.4}/pyproject.toml +8 -7
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/__init__.py +1 -1
- scrapling-0.4.4/scrapling/cli.py +637 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/core/ai.py +370 -158
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/core/custom_types.py +3 -3
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/core/shell.py +37 -2
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/engines/_browsers/_base.py +48 -12
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/engines/_browsers/_controllers.py +35 -7
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/engines/_browsers/_stealth.py +37 -13
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/engines/_browsers/_types.py +2 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/engines/_browsers/_validators.py +9 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/engines/static.py +1 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/engines/toolbelt/convertor.py +39 -22
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/engines/toolbelt/custom.py +13 -1
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/engines/toolbelt/proxy_rotation.py +1 -1
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/spiders/engine.py +85 -6
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/spiders/result.py +2 -0
- scrapling-0.4.4/scrapling/spiders/robotstxt.py +77 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/spiders/session.py +4 -2
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/spiders/spider.py +3 -0
- {scrapling-0.4.2 → scrapling-0.4.4/scrapling.egg-info}/PKG-INFO +76 -30
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling.egg-info/SOURCES.txt +1 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling.egg-info/requires.txt +5 -4
- {scrapling-0.4.2 → scrapling-0.4.4}/setup.cfg +1 -1
- scrapling-0.4.2/scrapling/cli.py +0 -826
- {scrapling-0.4.2 → scrapling-0.4.4}/LICENSE +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/MANIFEST.in +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/core/__init__.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/core/_shell_signatures.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/core/_types.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/core/mixins.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/core/storage.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/core/translator.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/core/utils/__init__.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/core/utils/_shell.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/core/utils/_utils.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/engines/_browsers/__init__.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/engines/_browsers/_config_tools.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/engines/_browsers/_page.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/engines/constants.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/engines/toolbelt/__init__.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/engines/toolbelt/fingerprints.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/engines/toolbelt/navigation.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/fetchers/__init__.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/fetchers/chrome.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/fetchers/requests.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/fetchers/stealth_chrome.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/parser.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/py.typed +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/spiders/__init__.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/spiders/checkpoint.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/spiders/request.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling/spiders/scheduler.py +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling.egg-info/entry_points.txt +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.4.2 → scrapling-0.4.4}/scrapling.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scrapling
|
|
3
|
-
Version: 0.4.
|
|
3
|
+
Version: 0.4.4
|
|
4
4
|
Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
|
|
5
5
|
Home-page: https://github.com/D4Vinci/Scrapling
|
|
6
6
|
Author: Karim Shoair
|
|
@@ -70,19 +70,20 @@ Description-Content-Type: text/markdown
|
|
|
70
70
|
License-File: LICENSE
|
|
71
71
|
Requires-Dist: lxml>=6.0.2
|
|
72
72
|
Requires-Dist: cssselect>=1.4.0
|
|
73
|
-
Requires-Dist: orjson>=3.11.
|
|
73
|
+
Requires-Dist: orjson>=3.11.8
|
|
74
74
|
Requires-Dist: tld>=0.13.2
|
|
75
|
-
Requires-Dist: w3lib>=2.4.
|
|
75
|
+
Requires-Dist: w3lib>=2.4.1
|
|
76
76
|
Requires-Dist: typing_extensions
|
|
77
77
|
Provides-Extra: fetchers
|
|
78
78
|
Requires-Dist: click>=8.3.0; extra == "fetchers"
|
|
79
|
-
Requires-Dist: curl_cffi>=0.
|
|
79
|
+
Requires-Dist: curl_cffi>=0.15.0; extra == "fetchers"
|
|
80
80
|
Requires-Dist: playwright==1.58.0; extra == "fetchers"
|
|
81
81
|
Requires-Dist: patchright==1.58.2; extra == "fetchers"
|
|
82
82
|
Requires-Dist: browserforge>=1.2.4; extra == "fetchers"
|
|
83
|
-
Requires-Dist: apify-fingerprint-datapoints>=0.
|
|
83
|
+
Requires-Dist: apify-fingerprint-datapoints>=0.12.0; extra == "fetchers"
|
|
84
84
|
Requires-Dist: msgspec>=0.20.0; extra == "fetchers"
|
|
85
85
|
Requires-Dist: anyio>=4.12.1; extra == "fetchers"
|
|
86
|
+
Requires-Dist: protego>=0.6.0; extra == "fetchers"
|
|
86
87
|
Provides-Extra: ai
|
|
87
88
|
Requires-Dist: mcp>=1.26.0; extra == "ai"
|
|
88
89
|
Requires-Dist: markdownify>=1.2.0; extra == "ai"
|
|
@@ -111,14 +112,13 @@ Dynamic: license-file
|
|
|
111
112
|
<p align="center">
|
|
112
113
|
<a href="https://trendshift.io/repositories/14244" target="_blank"><img src="https://trendshift.io/api/badge/repositories/14244" alt="D4Vinci%2FScrapling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
|
113
114
|
<br/>
|
|
114
|
-
<a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_AR.md">العربيه</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_ES.md">Español</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_FR.md">Français</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_DE.md">Deutsch</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_CN.md">简体中文</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_JP.md">日本語</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_RU.md">Русский</a>
|
|
115
|
+
<a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_AR.md">العربيه</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_ES.md">Español</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_FR.md">Français</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_DE.md">Deutsch</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_CN.md">简体中文</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_JP.md">日本語</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_RU.md">Русский</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_KR.md">한국어</a>
|
|
115
116
|
<br/>
|
|
116
117
|
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
|
117
118
|
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
|
118
119
|
<a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
|
|
119
120
|
<img alt="PyPI version" src="https://badge.fury.io/py/Scrapling.svg"></a>
|
|
120
|
-
<a href="https://
|
|
121
|
-
<img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/scrapling?period=total&units=INTERNATIONAL_SYSTEM&left_color=GREY&right_color=GREEN&left_text=Downloads"></a>
|
|
121
|
+
<a href="https://clickpy.clickhouse.com/dashboard/scrapling" rel="nofollow"><img src="https://img.shields.io/pypi/dm/scrapling" alt="PyPI package downloads"></a>
|
|
122
122
|
<a href="https://github.com/D4Vinci/Scrapling/tree/main/agent-skill" alt="AI Agent Skill directory">
|
|
123
123
|
<img alt="Static Badge" src="https://img.shields.io/badge/Skill-black?style=flat&label=Agent&link=https%3A%2F%2Fgithub.com%2FD4Vinci%2FScrapling%2Ftree%2Fmain%2Fagent-skill"></a>
|
|
124
124
|
<a href="https://clawhub.ai/D4Vinci/scrapling-official" alt="OpenClaw Skill">
|
|
@@ -136,22 +136,22 @@ Dynamic: license-file
|
|
|
136
136
|
</p>
|
|
137
137
|
|
|
138
138
|
<p align="center">
|
|
139
|
-
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection
|
|
139
|
+
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection.html"><strong>Selection methods</strong></a>
|
|
140
140
|
·
|
|
141
|
-
<a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing
|
|
141
|
+
<a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing.html"><strong>Fetchers</strong></a>
|
|
142
142
|
·
|
|
143
143
|
<a href="https://scrapling.readthedocs.io/en/latest/spiders/architecture.html"><strong>Spiders</strong></a>
|
|
144
144
|
·
|
|
145
145
|
<a href="https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html"><strong>Proxy Rotation</strong></a>
|
|
146
146
|
·
|
|
147
|
-
<a href="https://scrapling.readthedocs.io/en/latest/cli/overview
|
|
147
|
+
<a href="https://scrapling.readthedocs.io/en/latest/cli/overview.html"><strong>CLI</strong></a>
|
|
148
148
|
·
|
|
149
|
-
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server
|
|
149
|
+
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server.html"><strong>MCP</strong></a>
|
|
150
150
|
</p>
|
|
151
151
|
|
|
152
152
|
Scrapling is an adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl.
|
|
153
153
|
|
|
154
|
-
Its parser learns from website changes and automatically relocates your elements when pages update. Its fetchers bypass anti-bot systems like Cloudflare Turnstile out of the box. And its spider framework lets you scale up to concurrent, multi-session crawls with pause/resume and automatic proxy rotation
|
|
154
|
+
Its parser learns from website changes and automatically relocates your elements when pages update. Its fetchers bypass anti-bot systems like Cloudflare Turnstile out of the box. And its spider framework lets you scale up to concurrent, multi-session crawls with pause/resume and automatic proxy rotation - all in a few lines of Python. One library, zero compromises.
|
|
155
155
|
|
|
156
156
|
Blazing fast crawls with real-time stats and streaming. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.
|
|
157
157
|
|
|
@@ -189,7 +189,6 @@ MySpider().start()
|
|
|
189
189
|
<td width="200">
|
|
190
190
|
<a href="https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling" target="_blank" title="Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada">
|
|
191
191
|
<img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png">
|
|
192
|
-
<br />
|
|
193
192
|
</a>
|
|
194
193
|
</td>
|
|
195
194
|
<td> Scrapling handles Cloudflare Turnstile. For enterprise-grade protection, <a href="https://hypersolutions.co?utm_source=github&utm_medium=readme&utm_campaign=scrapling">
|
|
@@ -200,7 +199,6 @@ MySpider().start()
|
|
|
200
199
|
<td width="200">
|
|
201
200
|
<a href="https://birdproxies.com/t/scrapling" target="_blank" title="At Bird Proxies, we eliminate your pains such as banned IPs, geo restriction, and high costs so you can focus on your work.">
|
|
202
201
|
<img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/BirdProxies.jpg">
|
|
203
|
-
<br />
|
|
204
202
|
</a>
|
|
205
203
|
</td>
|
|
206
204
|
<td>Hey, we built <a href="https://birdproxies.com/t/scrapling">
|
|
@@ -213,7 +211,6 @@ MySpider().start()
|
|
|
213
211
|
<td width="200">
|
|
214
212
|
<a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB">
|
|
215
213
|
<img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png">
|
|
216
|
-
<br />
|
|
217
214
|
</a>
|
|
218
215
|
</td>
|
|
219
216
|
<td>
|
|
@@ -225,13 +222,63 @@ MySpider().start()
|
|
|
225
222
|
</tr>
|
|
226
223
|
<tr>
|
|
227
224
|
<td width="200">
|
|
228
|
-
<a href="https://tikhub.io/?
|
|
225
|
+
<a href="https://tikhub.io/?utm_source=github.com/D4Vinci/Scrapling&utm_medium=marketing_social&utm_campaign=retargeting&utm_content=carousel_ad" target="_blank" title="Unlock the Power of Social Media Data & AI">
|
|
229
226
|
<img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TikHub.jpg">
|
|
230
|
-
<br />
|
|
231
227
|
</a>
|
|
232
228
|
</td>
|
|
233
229
|
<td>
|
|
234
|
-
<a href="https://tikhub.io/?
|
|
230
|
+
<a href="https://tikhub.io/?utm_source=github.com/D4Vinci/Scrapling&utm_medium=marketing_social&utm_campaign=retargeting&utm_content=carousel_ad" target="_blank">TikHub.io</a> provides 900+ stable APIs across 16+ platforms including TikTok, X, YouTube & Instagram, with 40M+ datasets. <br /> Also offers <a href="https://ai.tikhub.io/?ref=KarimShoair" target="_blank">DISCOUNTED AI models</a> - Claude, GPT, GEMINI & more up to 71% off.
|
|
231
|
+
</td>
|
|
232
|
+
</tr>
|
|
233
|
+
<tr>
|
|
234
|
+
<td width="200">
|
|
235
|
+
<a href="https://www.nsocks.com/?keyword=2p67aivg" target="_blank" title="Scalable Web Data Access for AI Applications">
|
|
236
|
+
<img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/nsocks.png">
|
|
237
|
+
</a>
|
|
238
|
+
</td>
|
|
239
|
+
<td>
|
|
240
|
+
<a href="https://www.nsocks.com/?keyword=2p67aivg" target="_blank">Nsocks</a> provides fast Residential and ISP proxies for developers and scrapers. Global IP coverage, high anonymity, smart rotation, and reliable performance for automation and data extraction. Use <a href="https://www.xcrawl.com/?keyword=2p67aivg" target="_blank">Xcrawl</a> to simplify large-scale web crawling.
|
|
241
|
+
</td>
|
|
242
|
+
</tr>
|
|
243
|
+
<tr>
|
|
244
|
+
<td width="200">
|
|
245
|
+
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting.">
|
|
246
|
+
<img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png">
|
|
247
|
+
</a>
|
|
248
|
+
</td>
|
|
249
|
+
<td>
|
|
250
|
+
Close your laptop. Your scrapers keep running. <br />
|
|
251
|
+
<a href="https://petrosky.io/d4vinci" target="_blank">PetroSky VPS</a> - cloud servers built for nonstop automation. Windows and Linux machines with full control. From €6.99/mo.
|
|
252
|
+
</td>
|
|
253
|
+
</tr>
|
|
254
|
+
<tr>
|
|
255
|
+
<td width="200">
|
|
256
|
+
<a href="https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling" target="_blank" title="The #1 newsletter dedicated to Web Scraping">
|
|
257
|
+
<img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TWSC.png">
|
|
258
|
+
</a>
|
|
259
|
+
</td>
|
|
260
|
+
<td>
|
|
261
|
+
Read a full review of <a href="https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling" target="_blank">Scrapling on The Web Scraping Club</a> (Nov 2025), the #1 newsletter dedicated to Web Scraping.
|
|
262
|
+
</td>
|
|
263
|
+
</tr>
|
|
264
|
+
<tr>
|
|
265
|
+
<td width="200">
|
|
266
|
+
<a href="https://proxy-seller.com/?partner=CU9CAA5TBYFFT2" target="_blank" title="Proxy-Seller provides reliable proxy infrastructure for Web Scraping">
|
|
267
|
+
<img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxySeller.png">
|
|
268
|
+
</a>
|
|
269
|
+
</td>
|
|
270
|
+
<td>
|
|
271
|
+
<a href="https://proxy-seller.com/?partner=CU9CAA5TBYFFT2" target="_blank">Proxy-Seller</a> provides reliable proxy infrastructure for web scraping, offering IPv4, IPv6, ISP, Residential, and Mobile proxies with stable performance, broad geo coverage, and flexible plans for business-scale data collection.
|
|
272
|
+
</td>
|
|
273
|
+
</tr>
|
|
274
|
+
<tr>
|
|
275
|
+
<td width="200">
|
|
276
|
+
<a href="http://mangoproxy.com/?utm_source=D4Vinci&utm_medium=GitHub&utm_campaign=D4Vinci" target="_blank" title="Proxies You Can Rely On: Residential, Server, and Mobile">
|
|
277
|
+
<img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/MangoProxy.png">
|
|
278
|
+
</a>
|
|
279
|
+
</td>
|
|
280
|
+
<td>
|
|
281
|
+
<a href="http://mangoproxy.com/?utm_source=D4Vinci&utm_medium=GitHub&utm_campaign=D4Vinci" target="_blank">Stable proxies</a> for scraping, automation, and multi-accounting. Clean IPs, fast response, and reliable performance under load. Built for scalable workflows.
|
|
235
282
|
</td>
|
|
236
283
|
</tr>
|
|
237
284
|
</table>
|
|
@@ -241,15 +288,13 @@ MySpider().start()
|
|
|
241
288
|
|
|
242
289
|
<!-- sponsors -->
|
|
243
290
|
|
|
291
|
+
|
|
244
292
|
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
|
245
293
|
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
|
246
|
-
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
|
247
294
|
<a href="https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci" target="_blank" title="The web scraping service that actually beats anti-bot systems!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png"></a>
|
|
248
295
|
<a href="https://proxyempire.io/?ref=scrapling&utm_source=scrapling" target="_blank" title="Collect The Data Your Project Needs with the Best Residential Proxies"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png"></a>
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
|
|
252
|
-
<a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
|
|
296
|
+
<a href="https://www.webshare.io/?referral_code=48r2m2cd5uz1" target="_blank" title="The Most Reliable Proxy with Unparalleled Performance"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/webshare.png"></a>
|
|
297
|
+
<a href="https://www.crawleo.dev/?utm_source=github&utm_medium=sponsor&utm_campaign=scrapling" target="_blank" title="Supercharge your AI with Real-Time Web Intelligence"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/crawleo.png"></a>
|
|
253
298
|
<a href="https://browser.cash/?utm_source=D4Vinci&utm_medium=referral" target="_blank" title="Browser Automation & AI Browser Agent Platform"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png"></a>
|
|
254
299
|
|
|
255
300
|
<!-- /sponsors -->
|
|
@@ -260,13 +305,14 @@ MySpider().start()
|
|
|
260
305
|
|
|
261
306
|
## Key Features
|
|
262
307
|
|
|
263
|
-
### Spiders
|
|
308
|
+
### Spiders - A Full Crawling Framework
|
|
264
309
|
- 🕷️ **Scrapy-like Spider API**: Define spiders with `start_urls`, async `parse` callbacks, and `Request`/`Response` objects.
|
|
265
310
|
- ⚡ **Concurrent Crawling**: Configurable concurrency limits, per-domain throttling, and download delays.
|
|
266
|
-
- 🔄 **Multi-Session Support**: Unified interface for HTTP requests, and stealthy headless browsers in a single spider
|
|
311
|
+
- 🔄 **Multi-Session Support**: Unified interface for HTTP requests and stealthy headless browsers in a single spider - route requests to different sessions by ID.
|
|
267
312
|
- 💾 **Pause & Resume**: Checkpoint-based crawl persistence. Press Ctrl+C for a graceful shutdown; restart to resume from where you left off.
|
|
268
|
-
- 📡 **Streaming Mode**: Stream scraped items as they arrive via `async for item in spider.stream()` with real-time stats
|
|
313
|
+
- 📡 **Streaming Mode**: Stream scraped items as they arrive via `async for item in spider.stream()` with real-time stats - ideal for UI, pipelines, and long-running crawls.
|
|
269
314
|
- 🛡️ **Blocked Request Detection**: Automatic detection and retry of blocked requests with customizable logic.
|
|
315
|
+
- 🤖 **Robots.txt Compliance**: Optional `robots_txt_obey` flag that respects `Disallow`, `Crawl-delay`, and `Request-rate` directives with per-domain caching.
|
|
270
316
|
- 📦 **Built-in Export**: Export results through hooks and your own pipeline or the built-in JSON/JSONL with `result.items.to_json()` / `result.items.to_jsonl()` respectively.
|
|
271
317
|
|
|
272
318
|
### Advanced Websites Fetching with Session Support
|
|
@@ -392,7 +438,7 @@ Pause and resume long crawls with checkpoints by running the spider like this:
|
|
|
392
438
|
```python
|
|
393
439
|
QuotesSpider(crawldir="./crawl_data").start()
|
|
394
440
|
```
|
|
395
|
-
Press Ctrl+C to pause gracefully
|
|
441
|
+
Press Ctrl+C to pause gracefully - progress is saved automatically. Later, when you start the spider again, pass the same `crawldir`, and it will resume from where it stopped.
|
|
396
442
|
|
|
397
443
|
### Advanced Parsing & Navigation
|
|
398
444
|
```python
|
|
@@ -477,7 +523,7 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
|
|
|
477
523
|
|
|
478
524
|
## Performance Benchmarks
|
|
479
525
|
|
|
480
|
-
Scrapling isn't just powerful
|
|
526
|
+
Scrapling isn't just powerful - it's also blazing fast. The following benchmarks compare Scrapling's parser with the latest versions of other popular libraries.
|
|
481
527
|
|
|
482
528
|
### Text Extraction Speed Test (5000 nested elements)
|
|
483
529
|
|
|
@@ -589,7 +635,7 @@ This work is licensed under the BSD-3-Clause License.
|
|
|
589
635
|
## Acknowledgments
|
|
590
636
|
|
|
591
637
|
This project includes code adapted from:
|
|
592
|
-
- Parsel (BSD License)
|
|
638
|
+
- Parsel (BSD License) - Used for the [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) submodule
|
|
593
639
|
|
|
594
640
|
---
|
|
595
641
|
<div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
|
|
@@ -14,14 +14,13 @@
|
|
|
14
14
|
<p align="center">
|
|
15
15
|
<a href="https://trendshift.io/repositories/14244" target="_blank"><img src="https://trendshift.io/api/badge/repositories/14244" alt="D4Vinci%2FScrapling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
|
16
16
|
<br/>
|
|
17
|
-
<a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_AR.md">العربيه</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_ES.md">Español</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_FR.md">Français</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_DE.md">Deutsch</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_CN.md">简体中文</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_JP.md">日本語</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_RU.md">Русский</a>
|
|
17
|
+
<a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_AR.md">العربيه</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_ES.md">Español</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_FR.md">Français</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_DE.md">Deutsch</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_CN.md">简体中文</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_JP.md">日本語</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_RU.md">Русский</a> | <a href="https://github.com/D4Vinci/Scrapling/blob/main/docs/README_KR.md">한국어</a>
|
|
18
18
|
<br/>
|
|
19
19
|
<a href="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml" alt="Tests">
|
|
20
20
|
<img alt="Tests" src="https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg"></a>
|
|
21
21
|
<a href="https://badge.fury.io/py/Scrapling" alt="PyPI version">
|
|
22
22
|
<img alt="PyPI version" src="https://badge.fury.io/py/Scrapling.svg"></a>
|
|
23
|
-
<a href="https://
|
|
24
|
-
<img alt="PyPI Downloads" src="https://static.pepy.tech/personalized-badge/scrapling?period=total&units=INTERNATIONAL_SYSTEM&left_color=GREY&right_color=GREEN&left_text=Downloads"></a>
|
|
23
|
+
<a href="https://clickpy.clickhouse.com/dashboard/scrapling" rel="nofollow"><img src="https://img.shields.io/pypi/dm/scrapling" alt="PyPI package downloads"></a>
|
|
25
24
|
<a href="https://github.com/D4Vinci/Scrapling/tree/main/agent-skill" alt="AI Agent Skill directory">
|
|
26
25
|
<img alt="Static Badge" src="https://img.shields.io/badge/Skill-black?style=flat&label=Agent&link=https%3A%2F%2Fgithub.com%2FD4Vinci%2FScrapling%2Ftree%2Fmain%2Fagent-skill"></a>
|
|
27
26
|
<a href="https://clawhub.ai/D4Vinci/scrapling-official" alt="OpenClaw Skill">
|
|
@@ -39,22 +38,22 @@
|
|
|
39
38
|
</p>
|
|
40
39
|
|
|
41
40
|
<p align="center">
|
|
42
|
-
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection
|
|
41
|
+
<a href="https://scrapling.readthedocs.io/en/latest/parsing/selection.html"><strong>Selection methods</strong></a>
|
|
43
42
|
·
|
|
44
|
-
<a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing
|
|
43
|
+
<a href="https://scrapling.readthedocs.io/en/latest/fetching/choosing.html"><strong>Fetchers</strong></a>
|
|
45
44
|
·
|
|
46
45
|
<a href="https://scrapling.readthedocs.io/en/latest/spiders/architecture.html"><strong>Spiders</strong></a>
|
|
47
46
|
·
|
|
48
47
|
<a href="https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html"><strong>Proxy Rotation</strong></a>
|
|
49
48
|
·
|
|
50
|
-
<a href="https://scrapling.readthedocs.io/en/latest/cli/overview
|
|
49
|
+
<a href="https://scrapling.readthedocs.io/en/latest/cli/overview.html"><strong>CLI</strong></a>
|
|
51
50
|
·
|
|
52
|
-
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server
|
|
51
|
+
<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server.html"><strong>MCP</strong></a>
|
|
53
52
|
</p>
|
|
54
53
|
|
|
55
54
|
Scrapling is an adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl.
|
|
56
55
|
|
|
57
|
-
Its parser learns from website changes and automatically relocates your elements when pages update. Its fetchers bypass anti-bot systems like Cloudflare Turnstile out of the box. And its spider framework lets you scale up to concurrent, multi-session crawls with pause/resume and automatic proxy rotation
|
|
56
|
+
Its parser learns from website changes and automatically relocates your elements when pages update. Its fetchers bypass anti-bot systems like Cloudflare Turnstile out of the box. And its spider framework lets you scale up to concurrent, multi-session crawls with pause/resume and automatic proxy rotation - all in a few lines of Python. One library, zero compromises.
|
|
58
57
|
|
|
59
58
|
Blazing fast crawls with real-time stats and streaming. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.
|
|
60
59
|
|
|
@@ -92,7 +91,6 @@ MySpider().start()
|
|
|
92
91
|
<td width="200">
|
|
93
92
|
<a href="https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling" target="_blank" title="Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada">
|
|
94
93
|
<img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png">
|
|
95
|
-
<br />
|
|
96
94
|
</a>
|
|
97
95
|
</td>
|
|
98
96
|
<td> Scrapling handles Cloudflare Turnstile. For enterprise-grade protection, <a href="https://hypersolutions.co?utm_source=github&utm_medium=readme&utm_campaign=scrapling">
|
|
@@ -103,7 +101,6 @@ MySpider().start()
|
|
|
103
101
|
<td width="200">
|
|
104
102
|
<a href="https://birdproxies.com/t/scrapling" target="_blank" title="At Bird Proxies, we eliminate your pains such as banned IPs, geo restriction, and high costs so you can focus on your work.">
|
|
105
103
|
<img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/BirdProxies.jpg">
|
|
106
|
-
<br />
|
|
107
104
|
</a>
|
|
108
105
|
</td>
|
|
109
106
|
<td>Hey, we built <a href="https://birdproxies.com/t/scrapling">
|
|
@@ -116,7 +113,6 @@ MySpider().start()
|
|
|
116
113
|
<td width="200">
|
|
117
114
|
<a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB">
|
|
118
115
|
<img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png">
|
|
119
|
-
<br />
|
|
120
116
|
</a>
|
|
121
117
|
</td>
|
|
122
118
|
<td>
|
|
@@ -128,13 +124,63 @@ MySpider().start()
|
|
|
128
124
|
</tr>
|
|
129
125
|
<tr>
|
|
130
126
|
<td width="200">
|
|
131
|
-
<a href="https://tikhub.io/?
|
|
127
|
+
<a href="https://tikhub.io/?utm_source=github.com/D4Vinci/Scrapling&utm_medium=marketing_social&utm_campaign=retargeting&utm_content=carousel_ad" target="_blank" title="Unlock the Power of Social Media Data & AI">
|
|
132
128
|
<img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TikHub.jpg">
|
|
133
|
-
<br />
|
|
134
129
|
</a>
|
|
135
130
|
</td>
|
|
136
131
|
<td>
|
|
137
|
-
<a href="https://tikhub.io/?
|
|
132
|
+
<a href="https://tikhub.io/?utm_source=github.com/D4Vinci/Scrapling&utm_medium=marketing_social&utm_campaign=retargeting&utm_content=carousel_ad" target="_blank">TikHub.io</a> provides 900+ stable APIs across 16+ platforms including TikTok, X, YouTube & Instagram, with 40M+ datasets. <br /> Also offers <a href="https://ai.tikhub.io/?ref=KarimShoair" target="_blank">DISCOUNTED AI models</a> - Claude, GPT, GEMINI & more up to 71% off.
|
|
133
|
+
</td>
|
|
134
|
+
</tr>
|
|
135
|
+
<tr>
|
|
136
|
+
<td width="200">
|
|
137
|
+
<a href="https://www.nsocks.com/?keyword=2p67aivg" target="_blank" title="Scalable Web Data Access for AI Applications">
|
|
138
|
+
<img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/nsocks.png">
|
|
139
|
+
</a>
|
|
140
|
+
</td>
|
|
141
|
+
<td>
|
|
142
|
+
<a href="https://www.nsocks.com/?keyword=2p67aivg" target="_blank">Nsocks</a> provides fast Residential and ISP proxies for developers and scrapers. Global IP coverage, high anonymity, smart rotation, and reliable performance for automation and data extraction. Use <a href="https://www.xcrawl.com/?keyword=2p67aivg" target="_blank">Xcrawl</a> to simplify large-scale web crawling.
|
|
143
|
+
</td>
|
|
144
|
+
</tr>
|
|
145
|
+
<tr>
|
|
146
|
+
<td width="200">
|
|
147
|
+
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting.">
|
|
148
|
+
<img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png">
|
|
149
|
+
</a>
|
|
150
|
+
</td>
|
|
151
|
+
<td>
|
|
152
|
+
Close your laptop. Your scrapers keep running. <br />
|
|
153
|
+
<a href="https://petrosky.io/d4vinci" target="_blank">PetroSky VPS</a> - cloud servers built for nonstop automation. Windows and Linux machines with full control. From €6.99/mo.
|
|
154
|
+
</td>
|
|
155
|
+
</tr>
|
|
156
|
+
<tr>
|
|
157
|
+
<td width="200">
|
|
158
|
+
<a href="https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling" target="_blank" title="The #1 newsletter dedicated to Web Scraping">
|
|
159
|
+
<img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TWSC.png">
|
|
160
|
+
</a>
|
|
161
|
+
</td>
|
|
162
|
+
<td>
|
|
163
|
+
Read a full review of <a href="https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling" target="_blank">Scrapling on The Web Scraping Club</a> (Nov 2025), the #1 newsletter dedicated to Web Scraping.
|
|
164
|
+
</td>
|
|
165
|
+
</tr>
|
|
166
|
+
<tr>
|
|
167
|
+
<td width="200">
|
|
168
|
+
<a href="https://proxy-seller.com/?partner=CU9CAA5TBYFFT2" target="_blank" title="Proxy-Seller provides reliable proxy infrastructure for Web Scraping">
|
|
169
|
+
<img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxySeller.png">
|
|
170
|
+
</a>
|
|
171
|
+
</td>
|
|
172
|
+
<td>
|
|
173
|
+
<a href="https://proxy-seller.com/?partner=CU9CAA5TBYFFT2" target="_blank">Proxy-Seller</a> provides reliable proxy infrastructure for web scraping, offering IPv4, IPv6, ISP, Residential, and Mobile proxies with stable performance, broad geo coverage, and flexible plans for business-scale data collection.
|
|
174
|
+
</td>
|
|
175
|
+
</tr>
|
|
176
|
+
<tr>
|
|
177
|
+
<td width="200">
|
|
178
|
+
<a href="http://mangoproxy.com/?utm_source=D4Vinci&utm_medium=GitHub&utm_campaign=D4Vinci" target="_blank" title="Proxies You Can Rely On: Residential, Server, and Mobile">
|
|
179
|
+
<img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/MangoProxy.png">
|
|
180
|
+
</a>
|
|
181
|
+
</td>
|
|
182
|
+
<td>
|
|
183
|
+
<a href="http://mangoproxy.com/?utm_source=D4Vinci&utm_medium=GitHub&utm_campaign=D4Vinci" target="_blank">Stable proxies</a> for scraping, automation, and multi-accounting. Clean IPs, fast response, and reliable performance under load. Built for scalable workflows.
|
|
138
184
|
</td>
|
|
139
185
|
</tr>
|
|
140
186
|
</table>
|
|
@@ -144,15 +190,13 @@ MySpider().start()
|
|
|
144
190
|
|
|
145
191
|
<!-- sponsors -->
|
|
146
192
|
|
|
193
|
+
|
|
147
194
|
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
|
148
195
|
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
|
149
|
-
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
|
150
196
|
<a href="https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci" target="_blank" title="The web scraping service that actually beats anti-bot systems!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png"></a>
|
|
151
197
|
<a href="https://proxyempire.io/?ref=scrapling&utm_source=scrapling" target="_blank" title="Collect The Data Your Project Needs with the Best Residential Proxies"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png"></a>
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
|
|
155
|
-
<a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
|
|
198
|
+
<a href="https://www.webshare.io/?referral_code=48r2m2cd5uz1" target="_blank" title="The Most Reliable Proxy with Unparalleled Performance"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/webshare.png"></a>
|
|
199
|
+
<a href="https://www.crawleo.dev/?utm_source=github&utm_medium=sponsor&utm_campaign=scrapling" target="_blank" title="Supercharge your AI with Real-Time Web Intelligence"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/crawleo.png"></a>
|
|
156
200
|
<a href="https://browser.cash/?utm_source=D4Vinci&utm_medium=referral" target="_blank" title="Browser Automation & AI Browser Agent Platform"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png"></a>
|
|
157
201
|
|
|
158
202
|
<!-- /sponsors -->
|
|
@@ -163,13 +207,14 @@ MySpider().start()
|
|
|
163
207
|
|
|
164
208
|
## Key Features
|
|
165
209
|
|
|
166
|
-
### Spiders
|
|
210
|
+
### Spiders - A Full Crawling Framework
|
|
167
211
|
- 🕷️ **Scrapy-like Spider API**: Define spiders with `start_urls`, async `parse` callbacks, and `Request`/`Response` objects.
|
|
168
212
|
- ⚡ **Concurrent Crawling**: Configurable concurrency limits, per-domain throttling, and download delays.
|
|
169
|
-
- 🔄 **Multi-Session Support**: Unified interface for HTTP requests, and stealthy headless browsers in a single spider
|
|
213
|
+
- 🔄 **Multi-Session Support**: Unified interface for HTTP requests and stealthy headless browsers in a single spider - route requests to different sessions by ID.
|
|
170
214
|
- 💾 **Pause & Resume**: Checkpoint-based crawl persistence. Press Ctrl+C for a graceful shutdown; restart to resume from where you left off.
|
|
171
|
-
- 📡 **Streaming Mode**: Stream scraped items as they arrive via `async for item in spider.stream()` with real-time stats
|
|
215
|
+
- 📡 **Streaming Mode**: Stream scraped items as they arrive via `async for item in spider.stream()` with real-time stats - ideal for UI, pipelines, and long-running crawls.
|
|
172
216
|
- 🛡️ **Blocked Request Detection**: Automatic detection and retry of blocked requests with customizable logic.
|
|
217
|
+
- 🤖 **Robots.txt Compliance**: Optional `robots_txt_obey` flag that respects `Disallow`, `Crawl-delay`, and `Request-rate` directives with per-domain caching.
|
|
173
218
|
- 📦 **Built-in Export**: Export results through hooks and your own pipeline or the built-in JSON/JSONL with `result.items.to_json()` / `result.items.to_jsonl()` respectively.
|
|
174
219
|
|
|
175
220
|
### Advanced Websites Fetching with Session Support
|
|
@@ -295,7 +340,7 @@ Pause and resume long crawls with checkpoints by running the spider like this:
|
|
|
295
340
|
```python
|
|
296
341
|
QuotesSpider(crawldir="./crawl_data").start()
|
|
297
342
|
```
|
|
298
|
-
Press Ctrl+C to pause gracefully
|
|
343
|
+
Press Ctrl+C to pause gracefully - progress is saved automatically. Later, when you start the spider again, pass the same `crawldir`, and it will resume from where it stopped.
|
|
299
344
|
|
|
300
345
|
### Advanced Parsing & Navigation
|
|
301
346
|
```python
|
|
@@ -380,7 +425,7 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
|
|
|
380
425
|
|
|
381
426
|
## Performance Benchmarks
|
|
382
427
|
|
|
383
|
-
Scrapling isn't just powerful
|
|
428
|
+
Scrapling isn't just powerful - it's also blazing fast. The following benchmarks compare Scrapling's parser with the latest versions of other popular libraries.
|
|
384
429
|
|
|
385
430
|
### Text Extraction Speed Test (5000 nested elements)
|
|
386
431
|
|
|
@@ -492,7 +537,7 @@ This work is licensed under the BSD-3-Clause License.
|
|
|
492
537
|
## Acknowledgments
|
|
493
538
|
|
|
494
539
|
This project includes code adapted from:
|
|
495
|
-
- Parsel (BSD License)
|
|
540
|
+
- Parsel (BSD License) - Used for the [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) submodule
|
|
496
541
|
|
|
497
542
|
---
|
|
498
543
|
<div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
|
|
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "scrapling"
|
|
7
7
|
# Static version instead of a dynamic version so we can get better layer caching while building docker, check the docker file to understand
|
|
8
|
-
version = "0.4.2"
|
|
8
|
+
version = "0.4.4"
|
|
9
9
|
description = "Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!"
|
|
10
10
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
11
11
|
license = {file = "LICENSE"}
|
|
@@ -63,22 +63,23 @@ classifiers = [
|
|
|
63
63
|
dependencies = [
|
|
64
64
|
"lxml>=6.0.2",
|
|
65
65
|
"cssselect>=1.4.0",
|
|
66
|
-
"orjson>=3.11.
|
|
66
|
+
"orjson>=3.11.8",
|
|
67
67
|
"tld>=0.13.2",
|
|
68
|
-
"w3lib>=2.4.
|
|
69
|
-
"typing_extensions"
|
|
68
|
+
"w3lib>=2.4.1",
|
|
69
|
+
"typing_extensions"
|
|
70
70
|
]
|
|
71
71
|
|
|
72
72
|
[project.optional-dependencies]
|
|
73
73
|
fetchers = [
|
|
74
74
|
"click>=8.3.0",
|
|
75
|
-
"curl_cffi>=0.
|
|
75
|
+
"curl_cffi>=0.15.0",
|
|
76
76
|
"playwright==1.58.0",
|
|
77
77
|
"patchright==1.58.2",
|
|
78
78
|
"browserforge>=1.2.4",
|
|
79
|
-
"apify-fingerprint-datapoints>=0.
|
|
79
|
+
"apify-fingerprint-datapoints>=0.12.0",
|
|
80
80
|
"msgspec>=0.20.0",
|
|
81
|
-
"anyio>=4.12.1"
|
|
81
|
+
"anyio>=4.12.1",
|
|
82
|
+
"protego>=0.6.0",
|
|
82
83
|
]
|
|
83
84
|
ai = [
|
|
84
85
|
"mcp>=1.26.0",
|