scrapling 0.3.4__tar.gz → 0.3.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scrapling-0.3.4/scrapling.egg-info → scrapling-0.3.6}/PKG-INFO +25 -23
- {scrapling-0.3.4 → scrapling-0.3.6}/README.md +19 -17
- {scrapling-0.3.4 → scrapling-0.3.6}/pyproject.toml +7 -9
- scrapling-0.3.6/scrapling/__init__.py +38 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/cli.py +25 -8
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/core/_types.py +0 -2
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/core/ai.py +22 -14
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/core/custom_types.py +2 -2
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/core/shell.py +6 -5
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/core/storage.py +2 -1
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/core/utils/__init__.py +0 -1
- scrapling-0.3.6/scrapling/engines/_browsers/__init__.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/engines/_browsers/_base.py +11 -36
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/engines/_browsers/_camoufox.py +75 -60
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/engines/_browsers/_controllers.py +43 -52
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/engines/_browsers/_page.py +1 -42
- scrapling-0.3.6/scrapling/engines/_browsers/_validators.py +229 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/engines/constants.py +0 -15
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/engines/static.py +417 -16
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/engines/toolbelt/navigation.py +1 -1
- scrapling-0.3.6/scrapling/fetchers/__init__.py +36 -0
- scrapling-0.3.6/scrapling/fetchers/chrome.py +205 -0
- scrapling-0.3.6/scrapling/fetchers/firefox.py +216 -0
- scrapling-0.3.6/scrapling/fetchers/requests.py +28 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/parser.py +7 -7
- {scrapling-0.3.4 → scrapling-0.3.6/scrapling.egg-info}/PKG-INFO +25 -23
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling.egg-info/SOURCES.txt +5 -2
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling.egg-info/requires.txt +5 -5
- {scrapling-0.3.4 → scrapling-0.3.6}/setup.cfg +1 -1
- scrapling-0.3.4/scrapling/__init__.py +0 -28
- scrapling-0.3.4/scrapling/engines/_browsers/__init__.py +0 -2
- scrapling-0.3.4/scrapling/engines/_browsers/_validators.py +0 -164
- scrapling-0.3.4/scrapling/fetchers.py +0 -444
- {scrapling-0.3.4 → scrapling-0.3.6}/LICENSE +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/MANIFEST.in +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/core/__init__.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/core/_html_utils.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/core/mixins.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/core/translator.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/core/utils/_shell.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/core/utils/_utils.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/engines/_browsers/_config_tools.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/engines/toolbelt/__init__.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/engines/toolbelt/convertor.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/engines/toolbelt/custom.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/engines/toolbelt/fingerprints.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling/py.typed +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling.egg-info/entry_points.txt +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.3.4 → scrapling-0.3.6}/scrapling.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: scrapling
|
3
|
-
Version: 0.3.4
|
3
|
+
Version: 0.3.6
|
4
4
|
Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
|
5
5
|
Home-page: https://github.com/D4Vinci/Scrapling
|
6
6
|
Author: Karim Shoair
|
@@ -64,20 +64,20 @@ Classifier: Typing :: Typed
|
|
64
64
|
Requires-Python: >=3.10
|
65
65
|
Description-Content-Type: text/markdown
|
66
66
|
License-File: LICENSE
|
67
|
-
Requires-Dist: lxml>=6.0.
|
67
|
+
Requires-Dist: lxml>=6.0.2
|
68
68
|
Requires-Dist: cssselect>=1.3.0
|
69
69
|
Requires-Dist: orjson>=3.11.3
|
70
70
|
Requires-Dist: tldextract>=5.3.0
|
71
71
|
Provides-Extra: fetchers
|
72
|
-
Requires-Dist: click>=8.
|
72
|
+
Requires-Dist: click>=8.3.0; extra == "fetchers"
|
73
73
|
Requires-Dist: curl_cffi>=0.13.0; extra == "fetchers"
|
74
|
-
Requires-Dist: playwright>=1.
|
75
|
-
Requires-Dist:
|
74
|
+
Requires-Dist: playwright>=1.55.0; extra == "fetchers"
|
75
|
+
Requires-Dist: patchright>=1.55.2; extra == "fetchers"
|
76
76
|
Requires-Dist: camoufox>=0.4.11; extra == "fetchers"
|
77
77
|
Requires-Dist: geoip2>=5.1.0; extra == "fetchers"
|
78
78
|
Requires-Dist: msgspec>=0.19.0; extra == "fetchers"
|
79
79
|
Provides-Extra: ai
|
80
|
-
Requires-Dist: mcp>=1.
|
80
|
+
Requires-Dist: mcp>=1.15.0; extra == "ai"
|
81
81
|
Requires-Dist: markdownify>=1.2.0; extra == "ai"
|
82
82
|
Requires-Dist: scrapling[fetchers]; extra == "ai"
|
83
83
|
Provides-Extra: shell
|
@@ -139,7 +139,7 @@ Dynamic: license-file
|
|
139
139
|
|
140
140
|
Scrapling isn't just another Web Scraping library. It's the first **adaptive** scraping library that learns from website changes and evolves with them. While other libraries break when websites update their structure, Scrapling automatically relocates your elements and keeps your scrapers running.
|
141
141
|
|
142
|
-
Built for the modern Web, Scrapling
|
142
|
+
Built for the modern Web, Scrapling features its own rapid parsing engine and fetchers to handle all Web Scraping challenges you face or will face. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.
|
143
143
|
|
144
144
|
```python
|
145
145
|
>> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
@@ -157,12 +157,14 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
|
|
157
157
|
|
158
158
|
<!-- sponsors -->
|
159
159
|
|
160
|
+
<a href="https://www.thordata.com/?ls=github&lk=D4Vinci" target="_blank" title="A global network of over 60M+ residential proxies with 99.7% availability, ensuring stable and reliable web data scraping to support AI, BI, and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
|
160
161
|
<a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
|
161
|
-
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
162
162
|
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
163
|
+
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
163
164
|
<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
|
164
|
-
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
165
165
|
<a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
|
166
|
+
<a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
|
167
|
+
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
166
168
|
|
167
169
|
<!-- /sponsors -->
|
168
170
|
|
@@ -175,7 +177,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
|
|
175
177
|
### Advanced Websites Fetching with Session Support
|
176
178
|
- **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use HTTP3.
|
177
179
|
- **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium, real Chrome, and custom stealth mode.
|
178
|
-
- **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` using a modified version of Firefox and fingerprint spoofing. Can bypass all
|
180
|
+
- **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` using a modified version of Firefox and fingerprint spoofing. Can bypass all types of Cloudflare's Turnstile and Interstitial with automation easily.
|
179
181
|
- **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.
|
180
182
|
- **Async Support**: Complete async support across all fetchers and dedicated async session classes.
|
181
183
|
|
@@ -199,13 +201,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
|
|
199
201
|
- 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element.
|
200
202
|
- 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel.
|
201
203
|
- 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion.
|
202
|
-
|
203
|
-
### New Session Architecture
|
204
|
-
Scrapling 0.3 introduces a completely revamped session system:
|
205
|
-
- **Persistent Sessions**: Maintain cookies, headers, and authentication across multiple requests
|
206
|
-
- **Automatic Session Management**: Smart session lifecycle handling with proper cleanup
|
207
|
-
- **Session Inheritance**: All fetchers support both one-off requests and persistent session usage
|
208
|
-
- **Concurrent Session Support**: Run multiple isolated sessions simultaneously
|
204
|
+
- 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed.
|
209
205
|
|
210
206
|
## Getting Started
|
211
207
|
|
@@ -323,11 +319,11 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
|
|
323
319
|
```
|
324
320
|
|
325
321
|
> [!NOTE]
|
326
|
-
> There are many additional features, but we want to keep this page
|
322
|
+
> There are many additional features, but we want to keep this page concise, such as the MCP server and the interactive Web Scraping Shell. Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/)
|
327
323
|
|
328
324
|
## Performance Benchmarks
|
329
325
|
|
330
|
-
Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3
|
326
|
+
Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3 have delivered exceptional performance improvements across all operations.
|
331
327
|
|
332
328
|
### Text Extraction Speed Test (5000 nested elements)
|
333
329
|
|
@@ -390,6 +386,13 @@ Starting with v0.3.2, this installation only includes the parser engine and its
|
|
390
386
|
```
|
391
387
|
Don't forget that you need to install the browser dependencies with `scrapling install` after any of these extras (if you didn't already)
|
392
388
|
|
389
|
+
### Docker
|
390
|
+
You can also install a Docker image with all extras and browsers with the following command:
|
391
|
+
```bash
|
392
|
+
docker pull scrapling
|
393
|
+
```
|
394
|
+
This image is automatically built and pushed to Docker Hub through GitHub actions right here.
|
395
|
+
|
393
396
|
## Contributing
|
394
397
|
|
395
398
|
We welcome contributions! Please read our [contributing guidelines](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) before getting started.
|
@@ -397,7 +400,7 @@ We welcome contributions! Please read our [contributing guidelines](https://gith
|
|
397
400
|
## Disclaimer
|
398
401
|
|
399
402
|
> [!CAUTION]
|
400
|
-
> This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international data scraping and privacy laws. The authors and contributors are not responsible for any misuse of this software. Always respect
|
403
|
+
> This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international data scraping and privacy laws. The authors and contributors are not responsible for any misuse of this software. Always respect the terms of service of websites and robots.txt files.
|
401
404
|
|
402
405
|
## License
|
403
406
|
|
@@ -411,10 +414,9 @@ This project includes code adapted from:
|
|
411
414
|
## Thanks and References
|
412
415
|
|
413
416
|
- [Daijro](https://github.com/daijro)'s brilliant work on [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
|
414
|
-
- [Vinyzu](https://github.com/Vinyzu)'s work on [Botright](https://github.com/Vinyzu/Botright)
|
417
|
+
- [Vinyzu](https://github.com/Vinyzu)'s brilliant work on [Botright](https://github.com/Vinyzu/Botright) and [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
|
415
418
|
- [brotector](https://github.com/kaliiiiiiiiii/brotector) for browser detection bypass techniques
|
416
|
-
- [fakebrowser](https://github.com/kkoooqq/fakebrowser) for fingerprinting research
|
417
|
-
- [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches) for stealth improvements
|
419
|
+
- [fakebrowser](https://github.com/kkoooqq/fakebrowser) and [BotBrowser](https://github.com/botswin/BotBrowser) for fingerprinting research
|
418
420
|
|
419
421
|
---
|
420
422
|
<div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
|
@@ -49,7 +49,7 @@
|
|
49
49
|
|
50
50
|
Scrapling isn't just another Web Scraping library. It's the first **adaptive** scraping library that learns from website changes and evolves with them. While other libraries break when websites update their structure, Scrapling automatically relocates your elements and keeps your scrapers running.
|
51
51
|
|
52
|
-
Built for the modern Web, Scrapling
|
52
|
+
Built for the modern Web, Scrapling features its own rapid parsing engine and fetchers to handle all Web Scraping challenges you face or will face. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.
|
53
53
|
|
54
54
|
```python
|
55
55
|
>> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
@@ -67,12 +67,14 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
|
|
67
67
|
|
68
68
|
<!-- sponsors -->
|
69
69
|
|
70
|
+
<a href="https://www.thordata.com/?ls=github&lk=D4Vinci" target="_blank" title="A global network of over 60M+ residential proxies with 99.7% availability, ensuring stable and reliable web data scraping to support AI, BI, and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
|
70
71
|
<a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
|
71
|
-
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
72
72
|
<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
|
73
|
+
<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
|
73
74
|
<a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
|
74
|
-
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
75
75
|
<a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
|
76
|
+
<a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
|
77
|
+
<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
|
76
78
|
|
77
79
|
<!-- /sponsors -->
|
78
80
|
|
@@ -85,7 +87,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
|
|
85
87
|
### Advanced Websites Fetching with Session Support
|
86
88
|
- **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use HTTP3.
|
87
89
|
- **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium, real Chrome, and custom stealth mode.
|
88
|
-
- **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` using a modified version of Firefox and fingerprint spoofing. Can bypass all
|
90
|
+
- **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` using a modified version of Firefox and fingerprint spoofing. Can bypass all types of Cloudflare's Turnstile and Interstitial with automation easily.
|
89
91
|
- **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.
|
90
92
|
- **Async Support**: Complete async support across all fetchers and dedicated async session classes.
|
91
93
|
|
@@ -109,13 +111,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
|
|
109
111
|
- 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element.
|
110
112
|
- 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel.
|
111
113
|
- 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion.
|
112
|
-
|
113
|
-
### New Session Architecture
|
114
|
-
Scrapling 0.3 introduces a completely revamped session system:
|
115
|
-
- **Persistent Sessions**: Maintain cookies, headers, and authentication across multiple requests
|
116
|
-
- **Automatic Session Management**: Smart session lifecycle handling with proper cleanup
|
117
|
-
- **Session Inheritance**: All fetchers support both one-off requests and persistent session usage
|
118
|
-
- **Concurrent Session Support**: Run multiple isolated sessions simultaneously
|
114
|
+
- 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed.
|
119
115
|
|
120
116
|
## Getting Started
|
121
117
|
|
@@ -233,11 +229,11 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
|
|
233
229
|
```
|
234
230
|
|
235
231
|
> [!NOTE]
|
236
|
-
> There are many additional features, but we want to keep this page
|
232
|
+
> There are many additional features, but we want to keep this page concise, such as the MCP server and the interactive Web Scraping Shell. Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/)
|
237
233
|
|
238
234
|
## Performance Benchmarks
|
239
235
|
|
240
|
-
Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3
|
236
|
+
Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3 have delivered exceptional performance improvements across all operations.
|
241
237
|
|
242
238
|
### Text Extraction Speed Test (5000 nested elements)
|
243
239
|
|
@@ -300,6 +296,13 @@ Starting with v0.3.2, this installation only includes the parser engine and its
|
|
300
296
|
```
|
301
297
|
Don't forget that you need to install the browser dependencies with `scrapling install` after any of these extras (if you didn't already)
|
302
298
|
|
299
|
+
### Docker
|
300
|
+
You can also install a Docker image with all extras and browsers with the following command:
|
301
|
+
```bash
|
302
|
+
docker pull scrapling
|
303
|
+
```
|
304
|
+
This image is automatically built and pushed to Docker Hub through GitHub actions right here.
|
305
|
+
|
303
306
|
## Contributing
|
304
307
|
|
305
308
|
We welcome contributions! Please read our [contributing guidelines](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) before getting started.
|
@@ -307,7 +310,7 @@ We welcome contributions! Please read our [contributing guidelines](https://gith
|
|
307
310
|
## Disclaimer
|
308
311
|
|
309
312
|
> [!CAUTION]
|
310
|
-
> This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international data scraping and privacy laws. The authors and contributors are not responsible for any misuse of this software. Always respect
|
313
|
+
> This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international data scraping and privacy laws. The authors and contributors are not responsible for any misuse of this software. Always respect the terms of service of websites and robots.txt files.
|
311
314
|
|
312
315
|
## License
|
313
316
|
|
@@ -321,10 +324,9 @@ This project includes code adapted from:
|
|
321
324
|
## Thanks and References
|
322
325
|
|
323
326
|
- [Daijro](https://github.com/daijro)'s brilliant work on [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
|
324
|
-
- [Vinyzu](https://github.com/Vinyzu)'s work on [Botright](https://github.com/Vinyzu/Botright)
|
327
|
+
- [Vinyzu](https://github.com/Vinyzu)'s brilliant work on [Botright](https://github.com/Vinyzu/Botright) and [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
|
325
328
|
- [brotector](https://github.com/kaliiiiiiiiii/brotector) for browser detection bypass techniques
|
326
|
-
- [fakebrowser](https://github.com/kkoooqq/fakebrowser) for fingerprinting research
|
327
|
-
- [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches) for stealth improvements
|
329
|
+
- [fakebrowser](https://github.com/kkoooqq/fakebrowser) and [BotBrowser](https://github.com/botswin/BotBrowser) for fingerprinting research
|
328
330
|
|
329
331
|
---
|
330
332
|
<div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
|
@@ -4,7 +4,8 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "scrapling"
|
7
|
-
dynamic = ["version"]
|
7
|
+
# Static version instead of dynamic version so we can get better layer caching while building docker, check the docker file to understand
|
8
|
+
version = "0.3.6"
|
8
9
|
description = "Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!"
|
9
10
|
readme = {file = "README.md", content-type = "text/markdown"}
|
10
11
|
license = {file = "LICENSE"}
|
@@ -56,7 +57,7 @@ classifiers = [
|
|
56
57
|
"Typing :: Typed",
|
57
58
|
]
|
58
59
|
dependencies = [
|
59
|
-
"lxml>=6.0.
|
60
|
+
"lxml>=6.0.2",
|
60
61
|
"cssselect>=1.3.0",
|
61
62
|
"orjson>=3.11.3",
|
62
63
|
"tldextract>=5.3.0",
|
@@ -64,16 +65,16 @@ dependencies = [
|
|
64
65
|
|
65
66
|
[project.optional-dependencies]
|
66
67
|
fetchers = [
|
67
|
-
"click>=8.
|
68
|
+
"click>=8.3.0",
|
68
69
|
"curl_cffi>=0.13.0",
|
69
|
-
"playwright>=1.
|
70
|
-
"
|
70
|
+
"playwright>=1.55.0",
|
71
|
+
"patchright>=1.55.2",
|
71
72
|
"camoufox>=0.4.11",
|
72
73
|
"geoip2>=5.1.0",
|
73
74
|
"msgspec>=0.19.0",
|
74
75
|
]
|
75
76
|
ai = [
|
76
|
-
"mcp>=1.
|
77
|
+
"mcp>=1.15.0",
|
77
78
|
"markdownify>=1.2.0",
|
78
79
|
"scrapling[fetchers]",
|
79
80
|
]
|
@@ -99,9 +100,6 @@ scrapling = "scrapling.cli:main"
|
|
99
100
|
zip-safe = false
|
100
101
|
include-package-data = true
|
101
102
|
|
102
|
-
[tool.setuptools.dynamic]
|
103
|
-
version = {attr = "scrapling.__version__"}
|
104
|
-
|
105
103
|
[tool.setuptools.packages.find]
|
106
104
|
where = ["."]
|
107
105
|
include = ["scrapling*"]
|
@@ -0,0 +1,38 @@
|
|
1
|
+
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
2
|
+
__version__ = "0.3.6"
|
3
|
+
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
4
|
+
|
5
|
+
from typing import Any, TYPE_CHECKING
|
6
|
+
|
7
|
+
if TYPE_CHECKING:
|
8
|
+
from scrapling.parser import Selector, Selectors
|
9
|
+
from scrapling.core.custom_types import AttributesHandler, TextHandler
|
10
|
+
from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
|
11
|
+
|
12
|
+
|
13
|
+
# Lazy import mapping
|
14
|
+
_LAZY_IMPORTS = {
|
15
|
+
"Fetcher": ("scrapling.fetchers", "Fetcher"),
|
16
|
+
"Selector": ("scrapling.parser", "Selector"),
|
17
|
+
"Selectors": ("scrapling.parser", "Selectors"),
|
18
|
+
"AttributesHandler": ("scrapling.core.custom_types", "AttributesHandler"),
|
19
|
+
"TextHandler": ("scrapling.core.custom_types", "TextHandler"),
|
20
|
+
"AsyncFetcher": ("scrapling.fetchers", "AsyncFetcher"),
|
21
|
+
"StealthyFetcher": ("scrapling.fetchers", "StealthyFetcher"),
|
22
|
+
"DynamicFetcher": ("scrapling.fetchers", "DynamicFetcher"),
|
23
|
+
}
|
24
|
+
__all__ = ["Selector", "Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]
|
25
|
+
|
26
|
+
|
27
|
+
def __getattr__(name: str) -> Any:
|
28
|
+
if name in _LAZY_IMPORTS:
|
29
|
+
module_path, class_name = _LAZY_IMPORTS[name]
|
30
|
+
module = __import__(module_path, fromlist=[class_name])
|
31
|
+
return getattr(module, class_name)
|
32
|
+
else:
|
33
|
+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
34
|
+
|
35
|
+
|
36
|
+
def __dir__() -> list[str]:
|
37
|
+
"""Support for dir() and autocomplete."""
|
38
|
+
return sorted(__all__ + ["fetchers", "parser", "cli", "core", "__author__", "__version__", "__copyright__"])
|
@@ -2,8 +2,9 @@ from pathlib import Path
|
|
2
2
|
from subprocess import check_output
|
3
3
|
from sys import executable as python_executable
|
4
4
|
|
5
|
+
from scrapling.core.utils import log
|
5
6
|
from scrapling.engines.toolbelt.custom import Response
|
6
|
-
from scrapling.core.utils import
|
7
|
+
from scrapling.core.utils._shell import _CookieParser, _ParseHeaders
|
7
8
|
from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable
|
8
9
|
|
9
10
|
from orjson import loads as json_loads, JSONDecodeError
|
@@ -32,8 +33,8 @@ def __ParseJSONData(json_string: Optional[str] = None) -> Optional[Dict[str, Any
|
|
32
33
|
|
33
34
|
try:
|
34
35
|
return json_loads(json_string)
|
35
|
-
except JSONDecodeError as
|
36
|
-
raise ValueError(f"Invalid JSON data '{json_string}': {
|
36
|
+
except JSONDecodeError as err: # pragma: no cover
|
37
|
+
raise ValueError(f"Invalid JSON data '{json_string}': {err}")
|
37
38
|
|
38
39
|
|
39
40
|
def __Request_and_Save(
|
@@ -65,8 +66,8 @@ def __ParseExtractArguments(
|
|
65
66
|
for key, value in _CookieParser(cookies):
|
66
67
|
try:
|
67
68
|
parsed_cookies[key] = value
|
68
|
-
except Exception as
|
69
|
-
raise ValueError(f"Could not parse cookies '{cookies}': {
|
69
|
+
except Exception as err:
|
70
|
+
raise ValueError(f"Could not parse cookies '{cookies}': {err}")
|
70
71
|
|
71
72
|
parsed_json = __ParseJSONData(json)
|
72
73
|
parsed_params = {}
|
@@ -135,10 +136,26 @@ def install(force): # pragma: no cover
|
|
135
136
|
|
136
137
|
|
137
138
|
@command(help="Run Scrapling's MCP server (Check the docs for more info).")
|
138
|
-
|
139
|
+
@option(
|
140
|
+
"--http",
|
141
|
+
is_flag=True,
|
142
|
+
default=False,
|
143
|
+
help="Whether to run the MCP server in streamable-http transport or leave it as stdio (Default: False)",
|
144
|
+
)
|
145
|
+
@option(
|
146
|
+
"--host",
|
147
|
+
type=str,
|
148
|
+
default="0.0.0.0",
|
149
|
+
help="The host to use if streamable-http transport is enabled (Default: '0.0.0.0')",
|
150
|
+
)
|
151
|
+
@option(
|
152
|
+
"--port", type=int, default=8000, help="The port to use if streamable-http transport is enabled (Default: 8000)"
|
153
|
+
)
|
154
|
+
def mcp(http, host, port):
|
139
155
|
from scrapling.core.ai import ScraplingMCPServer
|
140
156
|
|
141
|
-
ScraplingMCPServer()
|
157
|
+
server = ScraplingMCPServer()
|
158
|
+
server.serve(http, host, port)
|
142
159
|
|
143
160
|
|
144
161
|
@command(help="Interactive scraping console")
|
@@ -766,7 +783,7 @@ def stealthy_fetch(
|
|
766
783
|
:param disable_resources: Drop requests of unnecessary resources for a speed boost.
|
767
784
|
:param block_webrtc: Blocks WebRTC entirely.
|
768
785
|
:param humanize: Humanize the cursor movement.
|
769
|
-
:param solve_cloudflare: Solves all
|
786
|
+
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges.
|
770
787
|
:param allow_webgl: Allow WebGL (recommended to keep enabled).
|
771
788
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
772
789
|
:param disable_ads: Install the uBlock Origin addon on the browser.
|
@@ -42,10 +42,7 @@ def _ContentTranslator(content: Generator[str, None, None], page: _ScraplingResp
|
|
42
42
|
|
43
43
|
|
44
44
|
class ScraplingMCPServer:
|
45
|
-
_server = FastMCP(name="Scrapling")
|
46
|
-
|
47
45
|
@staticmethod
|
48
|
-
@_server.tool()
|
49
46
|
def get(
|
50
47
|
url: str,
|
51
48
|
impersonate: Optional[BrowserTypeLiteral] = "chrome",
|
@@ -124,7 +121,6 @@ class ScraplingMCPServer:
|
|
124
121
|
)
|
125
122
|
|
126
123
|
@staticmethod
|
127
|
-
@_server.tool()
|
128
124
|
async def bulk_get(
|
129
125
|
urls: Tuple[str, ...],
|
130
126
|
impersonate: Optional[BrowserTypeLiteral] = "chrome",
|
@@ -211,7 +207,6 @@ class ScraplingMCPServer:
|
|
211
207
|
]
|
212
208
|
|
213
209
|
@staticmethod
|
214
|
-
@_server.tool()
|
215
210
|
async def fetch(
|
216
211
|
url: str,
|
217
212
|
extraction_type: extraction_types = "markdown",
|
@@ -263,7 +258,7 @@ class ScraplingMCPServer:
|
|
263
258
|
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
264
259
|
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
265
260
|
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
|
266
|
-
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers
|
261
|
+
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
267
262
|
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
268
263
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
269
264
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
@@ -300,7 +295,6 @@ class ScraplingMCPServer:
|
|
300
295
|
)
|
301
296
|
|
302
297
|
@staticmethod
|
303
|
-
@_server.tool()
|
304
298
|
async def bulk_fetch(
|
305
299
|
urls: Tuple[str, ...],
|
306
300
|
extraction_type: extraction_types = "markdown",
|
@@ -352,7 +346,7 @@ class ScraplingMCPServer:
|
|
352
346
|
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
353
347
|
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
354
348
|
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
|
355
|
-
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers
|
349
|
+
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
|
356
350
|
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
357
351
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
358
352
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
@@ -394,7 +388,6 @@ class ScraplingMCPServer:
|
|
394
388
|
]
|
395
389
|
|
396
390
|
@staticmethod
|
397
|
-
@_server.tool()
|
398
391
|
async def stealthy_fetch(
|
399
392
|
url: str,
|
400
393
|
extraction_type: extraction_types = "markdown",
|
@@ -443,7 +436,7 @@ class ScraplingMCPServer:
|
|
443
436
|
:param cookies: Set cookies for the next request.
|
444
437
|
:param addons: List of Firefox addons to use. Must be paths to extracted addons.
|
445
438
|
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
446
|
-
:param solve_cloudflare: Solves all
|
439
|
+
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
447
440
|
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
448
441
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
449
442
|
:param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
|
@@ -494,7 +487,6 @@ class ScraplingMCPServer:
|
|
494
487
|
)
|
495
488
|
|
496
489
|
@staticmethod
|
497
|
-
@_server.tool()
|
498
490
|
async def bulk_stealthy_fetch(
|
499
491
|
urls: Tuple[str, ...],
|
500
492
|
extraction_type: extraction_types = "markdown",
|
@@ -543,7 +535,7 @@ class ScraplingMCPServer:
|
|
543
535
|
:param cookies: Set cookies for the next request.
|
544
536
|
:param addons: List of Firefox addons to use. Must be paths to extracted addons.
|
545
537
|
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
546
|
-
:param solve_cloudflare: Solves all
|
538
|
+
:param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
|
547
539
|
:param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
|
548
540
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
549
541
|
:param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
|
@@ -598,6 +590,22 @@ class ScraplingMCPServer:
|
|
598
590
|
for page in responses
|
599
591
|
]
|
600
592
|
|
601
|
-
def serve(self):
|
593
|
+
def serve(self, http: bool, host: str, port: int):
|
602
594
|
"""Serve the MCP server."""
|
603
|
-
|
595
|
+
server = FastMCP(name="Scrapling", host=host, port=port)
|
596
|
+
server.add_tool(self.get, title="get", description=self.get.__doc__, structured_output=True)
|
597
|
+
server.add_tool(self.bulk_get, title="bulk_get", description=self.bulk_get.__doc__, structured_output=True)
|
598
|
+
server.add_tool(self.fetch, title="fetch", description=self.fetch.__doc__, structured_output=True)
|
599
|
+
server.add_tool(
|
600
|
+
self.bulk_fetch, title="bulk_fetch", description=self.bulk_fetch.__doc__, structured_output=True
|
601
|
+
)
|
602
|
+
server.add_tool(
|
603
|
+
self.stealthy_fetch, title="stealthy_fetch", description=self.stealthy_fetch.__doc__, structured_output=True
|
604
|
+
)
|
605
|
+
server.add_tool(
|
606
|
+
self.bulk_stealthy_fetch,
|
607
|
+
title="bulk_stealthy_fetch",
|
608
|
+
description=self.bulk_stealthy_fetch.__doc__,
|
609
|
+
structured_output=True,
|
610
|
+
)
|
611
|
+
server.run(transport="stdio" if not http else "streamable-http")
|
@@ -145,7 +145,7 @@ class TextHandler(str):
|
|
145
145
|
clean_match: bool = False,
|
146
146
|
case_sensitive: bool = True,
|
147
147
|
check_match: Literal[False] = False,
|
148
|
-
) -> "TextHandlers
|
148
|
+
) -> "TextHandlers": ...
|
149
149
|
|
150
150
|
def re(
|
151
151
|
self,
|
@@ -241,7 +241,7 @@ class TextHandlers(List[TextHandler]):
|
|
241
241
|
replace_entities: bool = True,
|
242
242
|
clean_match: bool = False,
|
243
243
|
case_sensitive: bool = True,
|
244
|
-
) -> "TextHandlers
|
244
|
+
) -> "TextHandlers":
|
245
245
|
"""Call the ``.re()`` method for each element in this list and return
|
246
246
|
their results flattened as TextHandlers.
|
247
247
|
|
@@ -22,10 +22,11 @@ from logging import (
|
|
22
22
|
from orjson import loads as json_loads, JSONDecodeError
|
23
23
|
|
24
24
|
from scrapling import __version__
|
25
|
+
from scrapling.core.utils import log
|
25
26
|
from scrapling.parser import Selector, Selectors
|
26
27
|
from scrapling.core.custom_types import TextHandler
|
27
28
|
from scrapling.engines.toolbelt.custom import Response
|
28
|
-
from scrapling.core.utils import
|
29
|
+
from scrapling.core.utils._shell import _ParseHeaders, _CookieParser
|
29
30
|
from scrapling.core._types import (
|
30
31
|
Optional,
|
31
32
|
Dict,
|
@@ -201,7 +202,7 @@ class CurlParser:
|
|
201
202
|
data_payload = parsed_args.data_binary # Fallback to string
|
202
203
|
|
203
204
|
elif parsed_args.data_raw is not None:
|
204
|
-
data_payload = parsed_args.data_raw
|
205
|
+
data_payload = parsed_args.data_raw.lstrip("$")
|
205
206
|
|
206
207
|
elif parsed_args.data is not None:
|
207
208
|
data_payload = parsed_args.data
|
@@ -317,8 +318,8 @@ def show_page_in_browser(page: Selector): # pragma: no cover
|
|
317
318
|
|
318
319
|
try:
|
319
320
|
fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
|
320
|
-
with open(fd, "
|
321
|
-
f.write(page.
|
321
|
+
with open(fd, "w", encoding=page.encoding) as f:
|
322
|
+
f.write(page.html_content)
|
322
323
|
|
323
324
|
open_in_browser(f"file://{fname}")
|
324
325
|
except IOError as e:
|
@@ -545,7 +546,7 @@ class Convertor:
|
|
545
546
|
for page in pages:
|
546
547
|
match extraction_type:
|
547
548
|
case "markdown":
|
548
|
-
yield cls._convert_to_markdown(page.
|
549
|
+
yield cls._convert_to_markdown(page.html_content)
|
549
550
|
case "html":
|
550
551
|
yield page.body
|
551
552
|
case "text":
|
@@ -6,7 +6,6 @@ from sqlite3 import connect as db_connect
|
|
6
6
|
|
7
7
|
from orjson import dumps, loads
|
8
8
|
from lxml.html import HtmlElement
|
9
|
-
from tldextract import extract as tld
|
10
9
|
|
11
10
|
from scrapling.core.utils import _StorageTools, log
|
12
11
|
from scrapling.core._types import Dict, Optional, Any
|
@@ -26,6 +25,8 @@ class StorageSystemMixin(ABC): # pragma: no cover
|
|
26
25
|
return default_value
|
27
26
|
|
28
27
|
try:
|
28
|
+
from tldextract import extract as tld
|
29
|
+
|
29
30
|
extracted = tld(self.url)
|
30
31
|
return extracted.top_domain_under_public_suffix or extracted.domain or default_value
|
31
32
|
except AttributeError:
|
File without changes
|