scrapling 0.3.5__tar.gz → 0.3.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {scrapling-0.3.5/scrapling.egg-info → scrapling-0.3.6}/PKG-INFO +17 -15
  2. {scrapling-0.3.5 → scrapling-0.3.6}/README.md +14 -12
  3. {scrapling-0.3.5 → scrapling-0.3.6}/pyproject.toml +4 -6
  4. scrapling-0.3.6/scrapling/__init__.py +38 -0
  5. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/cli.py +21 -4
  6. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/core/_types.py +0 -2
  7. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/core/ai.py +22 -14
  8. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/core/shell.py +2 -1
  9. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/core/storage.py +2 -1
  10. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/core/utils/__init__.py +0 -1
  11. scrapling-0.3.6/scrapling/engines/_browsers/__init__.py +0 -0
  12. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/_browsers/_base.py +9 -8
  13. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/_browsers/_camoufox.py +36 -22
  14. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/_browsers/_controllers.py +2 -2
  15. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/constants.py +0 -15
  16. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/static.py +419 -16
  17. scrapling-0.3.6/scrapling/fetchers/__init__.py +36 -0
  18. scrapling-0.3.6/scrapling/fetchers/chrome.py +205 -0
  19. scrapling-0.3.6/scrapling/fetchers/firefox.py +216 -0
  20. scrapling-0.3.6/scrapling/fetchers/requests.py +28 -0
  21. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/parser.py +4 -4
  22. {scrapling-0.3.5 → scrapling-0.3.6/scrapling.egg-info}/PKG-INFO +17 -15
  23. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling.egg-info/SOURCES.txt +5 -2
  24. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling.egg-info/requires.txt +2 -2
  25. {scrapling-0.3.5 → scrapling-0.3.6}/setup.cfg +1 -1
  26. scrapling-0.3.5/scrapling/__init__.py +0 -28
  27. scrapling-0.3.5/scrapling/engines/_browsers/__init__.py +0 -2
  28. scrapling-0.3.5/scrapling/fetchers.py +0 -444
  29. {scrapling-0.3.5 → scrapling-0.3.6}/LICENSE +0 -0
  30. {scrapling-0.3.5 → scrapling-0.3.6}/MANIFEST.in +0 -0
  31. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/core/__init__.py +0 -0
  32. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/core/_html_utils.py +0 -0
  33. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/core/custom_types.py +0 -0
  34. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/core/mixins.py +0 -0
  35. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/core/translator.py +0 -0
  36. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/core/utils/_shell.py +0 -0
  37. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/core/utils/_utils.py +0 -0
  38. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/__init__.py +0 -0
  39. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/_browsers/_config_tools.py +0 -0
  40. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/_browsers/_page.py +0 -0
  41. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/_browsers/_validators.py +0 -0
  42. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/toolbelt/__init__.py +0 -0
  43. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
  44. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
  45. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
  46. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
  47. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
  48. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
  49. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/toolbelt/convertor.py +0 -0
  50. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/toolbelt/custom.py +0 -0
  51. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/toolbelt/fingerprints.py +0 -0
  52. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/engines/toolbelt/navigation.py +0 -0
  53. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling/py.typed +0 -0
  54. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling.egg-info/dependency_links.txt +0 -0
  55. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling.egg-info/entry_points.txt +0 -0
  56. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling.egg-info/not-zip-safe +0 -0
  57. {scrapling-0.3.5 → scrapling-0.3.6}/scrapling.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scrapling
3
- Version: 0.3.5
3
+ Version: 0.3.6
4
4
  Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
5
5
  Home-page: https://github.com/D4Vinci/Scrapling
6
6
  Author: Karim Shoair
@@ -64,7 +64,7 @@ Classifier: Typing :: Typed
64
64
  Requires-Python: >=3.10
65
65
  Description-Content-Type: text/markdown
66
66
  License-File: LICENSE
67
- Requires-Dist: lxml>=6.0.1
67
+ Requires-Dist: lxml>=6.0.2
68
68
  Requires-Dist: cssselect>=1.3.0
69
69
  Requires-Dist: orjson>=3.11.3
70
70
  Requires-Dist: tldextract>=5.3.0
@@ -77,7 +77,7 @@ Requires-Dist: camoufox>=0.4.11; extra == "fetchers"
77
77
  Requires-Dist: geoip2>=5.1.0; extra == "fetchers"
78
78
  Requires-Dist: msgspec>=0.19.0; extra == "fetchers"
79
79
  Provides-Extra: ai
80
- Requires-Dist: mcp>=1.14.1; extra == "ai"
80
+ Requires-Dist: mcp>=1.15.0; extra == "ai"
81
81
  Requires-Dist: markdownify>=1.2.0; extra == "ai"
82
82
  Requires-Dist: scrapling[fetchers]; extra == "ai"
83
83
  Provides-Extra: shell
@@ -139,7 +139,7 @@ Dynamic: license-file
139
139
 
140
140
  Scrapling isn't just another Web Scraping library. It's the first **adaptive** scraping library that learns from website changes and evolves with them. While other libraries break when websites update their structure, Scrapling automatically relocates your elements and keeps your scrapers running.
141
141
 
142
- Built for the modern Web, Scrapling has its own rapid parsing engine and its fetchers to handle all Web Scraping challenges you are facing or will face. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.
142
+ Built for the modern Web, Scrapling features its own rapid parsing engine and fetchers to handle all Web Scraping challenges you face or will face. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.
143
143
 
144
144
  ```python
145
145
  >> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
@@ -163,6 +163,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
163
163
  <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
164
164
  <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
165
165
  <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
166
+ <a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
166
167
  <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
167
168
 
168
169
  <!-- /sponsors -->
@@ -176,7 +177,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
176
177
  ### Advanced Websites Fetching with Session Support
177
178
  - **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use HTTP3.
178
179
  - **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium, real Chrome, and custom stealth mode.
179
- - **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` using a modified version of Firefox and fingerprint spoofing. Can bypass all levels of Cloudflare's Turnstile with automation easily.
180
+ - **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` using a modified version of Firefox and fingerprint spoofing. Can bypass all types of Cloudflare's Turnstile and Interstitial with automation easily.
180
181
  - **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.
181
182
  - **Async Support**: Complete async support across all fetchers and dedicated async session classes.
182
183
 
@@ -200,13 +201,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
200
201
  - 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element.
201
202
  - 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel.
202
203
  - 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion.
203
-
204
- ### New Session Architecture
205
- Scrapling 0.3 introduces a completely revamped session system:
206
- - **Persistent Sessions**: Maintain cookies, headers, and authentication across multiple requests
207
- - **Automatic Session Management**: Smart session lifecycle handling with proper cleanup
208
- - **Session Inheritance**: All fetchers support both one-off requests and persistent session usage
209
- - **Concurrent Session Support**: Run multiple isolated sessions simultaneously
204
+ - 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed.
210
205
 
211
206
  ## Getting Started
212
207
 
@@ -324,11 +319,11 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
324
319
  ```
325
320
 
326
321
  > [!NOTE]
327
- > There are many additional features, but we want to keep this page short, like the MCP server and the interactive Web Scraping Shell. Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/)
322
+ > There are many additional features, such as the MCP server and the interactive Web Scraping Shell, but we want to keep this page concise. Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/)
328
323
 
329
324
  ## Performance Benchmarks
330
325
 
331
- Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3 deliver exceptional performance improvements across all operations!
326
+ Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3 have delivered exceptional performance improvements across all operations.
332
327
 
333
328
  ### Text Extraction Speed Test (5000 nested elements)
334
329
 
@@ -391,6 +386,13 @@ Starting with v0.3.2, this installation only includes the parser engine and its
391
386
  ```
392
387
  Don't forget that you need to install the browser dependencies with `scrapling install` after any of these extras (if you didn't already)
393
388
 
389
+ ### Docker
390
+ You can also install a Docker image with all extras and browsers with the following command:
391
+ ```bash
392
+ docker pull scrapling
393
+ ```
394
+ This image is automatically built and pushed to Docker Hub through GitHub actions right here.
395
+
394
396
  ## Contributing
395
397
 
396
398
  We welcome contributions! Please read our [contributing guidelines](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) before getting started.
@@ -398,7 +400,7 @@ We welcome contributions! Please read our [contributing guidelines](https://gith
398
400
  ## Disclaimer
399
401
 
400
402
  > [!CAUTION]
401
- > This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international data scraping and privacy laws. The authors and contributors are not responsible for any misuse of this software. Always respect website terms of service and robots.txt files.
403
+ > This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international data scraping and privacy laws. The authors and contributors are not responsible for any misuse of this software. Always respect the terms of service of websites and robots.txt files.
402
404
 
403
405
  ## License
404
406
 
@@ -49,7 +49,7 @@
49
49
 
50
50
  Scrapling isn't just another Web Scraping library. It's the first **adaptive** scraping library that learns from website changes and evolves with them. While other libraries break when websites update their structure, Scrapling automatically relocates your elements and keeps your scrapers running.
51
51
 
52
- Built for the modern Web, Scrapling has its own rapid parsing engine and its fetchers to handle all Web Scraping challenges you are facing or will face. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.
52
+ Built for the modern Web, Scrapling features its own rapid parsing engine and fetchers to handle all Web Scraping challenges you face or will face. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.
53
53
 
54
54
  ```python
55
55
  >> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
@@ -73,6 +73,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
73
73
  <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
74
74
  <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
75
75
  <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
76
+ <a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
76
77
  <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
77
78
 
78
79
  <!-- /sponsors -->
@@ -86,7 +87,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
86
87
  ### Advanced Websites Fetching with Session Support
87
88
  - **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use HTTP3.
88
89
  - **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium, real Chrome, and custom stealth mode.
89
- - **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` using a modified version of Firefox and fingerprint spoofing. Can bypass all levels of Cloudflare's Turnstile with automation easily.
90
+ - **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` using a modified version of Firefox and fingerprint spoofing. Can bypass all types of Cloudflare's Turnstile and Interstitial with automation easily.
90
91
  - **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.
91
92
  - **Async Support**: Complete async support across all fetchers and dedicated async session classes.
92
93
 
@@ -110,13 +111,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
110
111
  - 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element.
111
112
  - 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel.
112
113
  - 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion.
113
-
114
- ### New Session Architecture
115
- Scrapling 0.3 introduces a completely revamped session system:
116
- - **Persistent Sessions**: Maintain cookies, headers, and authentication across multiple requests
117
- - **Automatic Session Management**: Smart session lifecycle handling with proper cleanup
118
- - **Session Inheritance**: All fetchers support both one-off requests and persistent session usage
119
- - **Concurrent Session Support**: Run multiple isolated sessions simultaneously
114
+ - 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed.
120
115
 
121
116
  ## Getting Started
122
117
 
@@ -234,11 +229,11 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
234
229
  ```
235
230
 
236
231
  > [!NOTE]
237
- > There are many additional features, but we want to keep this page short, like the MCP server and the interactive Web Scraping Shell. Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/)
232
+ > There are many additional features, such as the MCP server and the interactive Web Scraping Shell, but we want to keep this page concise. Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/)
238
233
 
239
234
  ## Performance Benchmarks
240
235
 
241
- Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3 deliver exceptional performance improvements across all operations!
236
+ Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3 have delivered exceptional performance improvements across all operations.
242
237
 
243
238
  ### Text Extraction Speed Test (5000 nested elements)
244
239
 
@@ -301,6 +296,13 @@ Starting with v0.3.2, this installation only includes the parser engine and its
301
296
  ```
302
297
  Don't forget that you need to install the browser dependencies with `scrapling install` after any of these extras (if you didn't already)
303
298
 
299
+ ### Docker
300
+ You can also install a Docker image with all extras and browsers with the following command:
301
+ ```bash
302
+ docker pull scrapling
303
+ ```
304
+ This image is automatically built and pushed to Docker Hub through GitHub actions right here.
305
+
304
306
  ## Contributing
305
307
 
306
308
  We welcome contributions! Please read our [contributing guidelines](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) before getting started.
@@ -308,7 +310,7 @@ We welcome contributions! Please read our [contributing guidelines](https://gith
308
310
  ## Disclaimer
309
311
 
310
312
  > [!CAUTION]
311
- > This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international data scraping and privacy laws. The authors and contributors are not responsible for any misuse of this software. Always respect website terms of service and robots.txt files.
313
+ > This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international data scraping and privacy laws. The authors and contributors are not responsible for any misuse of this software. Always respect the terms of service of websites and robots.txt files.
312
314
 
313
315
  ## License
314
316
 
@@ -4,7 +4,8 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "scrapling"
7
- dynamic = ["version"]
7
+ # Use a static version instead of a dynamic one to get better layer caching when building the Docker image; see the Dockerfile for details.
8
+ version = "0.3.6"
8
9
  description = "Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!"
9
10
  readme = {file = "README.md", content-type = "text/markdown"}
10
11
  license = {file = "LICENSE"}
@@ -56,7 +57,7 @@ classifiers = [
56
57
  "Typing :: Typed",
57
58
  ]
58
59
  dependencies = [
59
- "lxml>=6.0.1",
60
+ "lxml>=6.0.2",
60
61
  "cssselect>=1.3.0",
61
62
  "orjson>=3.11.3",
62
63
  "tldextract>=5.3.0",
@@ -73,7 +74,7 @@ fetchers = [
73
74
  "msgspec>=0.19.0",
74
75
  ]
75
76
  ai = [
76
- "mcp>=1.14.1",
77
+ "mcp>=1.15.0",
77
78
  "markdownify>=1.2.0",
78
79
  "scrapling[fetchers]",
79
80
  ]
@@ -99,9 +100,6 @@ scrapling = "scrapling.cli:main"
99
100
  zip-safe = false
100
101
  include-package-data = true
101
102
 
102
- [tool.setuptools.dynamic]
103
- version = {attr = "scrapling.__version__"}
104
-
105
103
  [tool.setuptools.packages.find]
106
104
  where = ["."]
107
105
  include = ["scrapling*"]
@@ -0,0 +1,38 @@
1
+ __author__ = "Karim Shoair (karim.shoair@pm.me)"
2
+ __version__ = "0.3.6"
3
+ __copyright__ = "Copyright (c) 2024 Karim Shoair"
4
+
5
+ from typing import Any, TYPE_CHECKING
6
+
7
+ if TYPE_CHECKING:
8
+ from scrapling.parser import Selector, Selectors
9
+ from scrapling.core.custom_types import AttributesHandler, TextHandler
10
+ from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
11
+
12
+
13
+ # Lazy import mapping
14
+ _LAZY_IMPORTS = {
15
+ "Fetcher": ("scrapling.fetchers", "Fetcher"),
16
+ "Selector": ("scrapling.parser", "Selector"),
17
+ "Selectors": ("scrapling.parser", "Selectors"),
18
+ "AttributesHandler": ("scrapling.core.custom_types", "AttributesHandler"),
19
+ "TextHandler": ("scrapling.core.custom_types", "TextHandler"),
20
+ "AsyncFetcher": ("scrapling.fetchers", "AsyncFetcher"),
21
+ "StealthyFetcher": ("scrapling.fetchers", "StealthyFetcher"),
22
+ "DynamicFetcher": ("scrapling.fetchers", "DynamicFetcher"),
23
+ }
24
+ __all__ = ["Selector", "Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]
25
+
26
+
27
+ def __getattr__(name: str) -> Any:
28
+ if name in _LAZY_IMPORTS:
29
+ module_path, class_name = _LAZY_IMPORTS[name]
30
+ module = __import__(module_path, fromlist=[class_name])
31
+ return getattr(module, class_name)
32
+ else:
33
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
34
+
35
+
36
+ def __dir__() -> list[str]:
37
+ """Support for dir() and autocomplete."""
38
+ return sorted(__all__ + ["fetchers", "parser", "cli", "core", "__author__", "__version__", "__copyright__"])
@@ -2,8 +2,9 @@ from pathlib import Path
2
2
  from subprocess import check_output
3
3
  from sys import executable as python_executable
4
4
 
5
+ from scrapling.core.utils import log
5
6
  from scrapling.engines.toolbelt.custom import Response
6
- from scrapling.core.utils import log, _CookieParser, _ParseHeaders
7
+ from scrapling.core.utils._shell import _CookieParser, _ParseHeaders
7
8
  from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable
8
9
 
9
10
  from orjson import loads as json_loads, JSONDecodeError
@@ -135,10 +136,26 @@ def install(force): # pragma: no cover
135
136
 
136
137
 
137
138
  @command(help="Run Scrapling's MCP server (Check the docs for more info).")
138
- def mcp():
139
+ @option(
140
+ "--http",
141
+ is_flag=True,
142
+ default=False,
143
+ help="Whether to run the MCP server in streamable-http transport or leave it as stdio (Default: False)",
144
+ )
145
+ @option(
146
+ "--host",
147
+ type=str,
148
+ default="0.0.0.0",
149
+ help="The host to use if streamable-http transport is enabled (Default: '0.0.0.0')",
150
+ )
151
+ @option(
152
+ "--port", type=int, default=8000, help="The port to use if streamable-http transport is enabled (Default: 8000)"
153
+ )
154
+ def mcp(http, host, port):
139
155
  from scrapling.core.ai import ScraplingMCPServer
140
156
 
141
- ScraplingMCPServer().serve()
157
+ server = ScraplingMCPServer()
158
+ server.serve(http, host, port)
142
159
 
143
160
 
144
161
  @command(help="Interactive scraping console")
@@ -766,7 +783,7 @@ def stealthy_fetch(
766
783
  :param disable_resources: Drop requests of unnecessary resources for a speed boost.
767
784
  :param block_webrtc: Blocks WebRTC entirely.
768
785
  :param humanize: Humanize the cursor movement.
769
- :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page.
786
+ :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges.
770
787
  :param allow_webgl: Allow WebGL (recommended to keep enabled).
771
788
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
772
789
  :param disable_ads: Install the uBlock Origin addon on the browser.
@@ -39,6 +39,4 @@ except ImportError: # pragma: no cover
39
39
  try:
40
40
  from typing_extensions import Self # Backport
41
41
  except ImportError:
42
- from typing import TypeVar
43
-
44
42
  Self = object
@@ -42,10 +42,7 @@ def _ContentTranslator(content: Generator[str, None, None], page: _ScraplingResp
42
42
 
43
43
 
44
44
  class ScraplingMCPServer:
45
- _server = FastMCP(name="Scrapling")
46
-
47
45
  @staticmethod
48
- @_server.tool()
49
46
  def get(
50
47
  url: str,
51
48
  impersonate: Optional[BrowserTypeLiteral] = "chrome",
@@ -124,7 +121,6 @@ class ScraplingMCPServer:
124
121
  )
125
122
 
126
123
  @staticmethod
127
- @_server.tool()
128
124
  async def bulk_get(
129
125
  urls: Tuple[str, ...],
130
126
  impersonate: Optional[BrowserTypeLiteral] = "chrome",
@@ -211,7 +207,6 @@ class ScraplingMCPServer:
211
207
  ]
212
208
 
213
209
  @staticmethod
214
- @_server.tool()
215
210
  async def fetch(
216
211
  url: str,
217
212
  extraction_type: extraction_types = "markdown",
@@ -263,7 +258,7 @@ class ScraplingMCPServer:
263
258
  :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
264
259
  :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
265
260
  :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
266
- :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
261
+ :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
267
262
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
268
263
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
269
264
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
@@ -300,7 +295,6 @@ class ScraplingMCPServer:
300
295
  )
301
296
 
302
297
  @staticmethod
303
- @_server.tool()
304
298
  async def bulk_fetch(
305
299
  urls: Tuple[str, ...],
306
300
  extraction_type: extraction_types = "markdown",
@@ -352,7 +346,7 @@ class ScraplingMCPServer:
352
346
  :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
353
347
  :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
354
348
  :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
355
- :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
349
+ :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
356
350
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
357
351
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
358
352
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
@@ -394,7 +388,6 @@ class ScraplingMCPServer:
394
388
  ]
395
389
 
396
390
  @staticmethod
397
- @_server.tool()
398
391
  async def stealthy_fetch(
399
392
  url: str,
400
393
  extraction_type: extraction_types = "markdown",
@@ -443,7 +436,7 @@ class ScraplingMCPServer:
443
436
  :param cookies: Set cookies for the next request.
444
437
  :param addons: List of Firefox addons to use. Must be paths to extracted addons.
445
438
  :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
446
- :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
439
+ :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
447
440
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
448
441
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
449
442
  :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
@@ -494,7 +487,6 @@ class ScraplingMCPServer:
494
487
  )
495
488
 
496
489
  @staticmethod
497
- @_server.tool()
498
490
  async def bulk_stealthy_fetch(
499
491
  urls: Tuple[str, ...],
500
492
  extraction_type: extraction_types = "markdown",
@@ -543,7 +535,7 @@ class ScraplingMCPServer:
543
535
  :param cookies: Set cookies for the next request.
544
536
  :param addons: List of Firefox addons to use. Must be paths to extracted addons.
545
537
  :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
546
- :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
538
+ :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
547
539
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
548
540
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
549
541
  :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
@@ -598,6 +590,22 @@ class ScraplingMCPServer:
598
590
  for page in responses
599
591
  ]
600
592
 
601
- def serve(self):
593
+ def serve(self, http: bool, host: str, port: int):
602
594
  """Serve the MCP server."""
603
- self._server.run(transport="stdio")
595
+ server = FastMCP(name="Scrapling", host=host, port=port)
596
+ server.add_tool(self.get, title="get", description=self.get.__doc__, structured_output=True)
597
+ server.add_tool(self.bulk_get, title="bulk_get", description=self.bulk_get.__doc__, structured_output=True)
598
+ server.add_tool(self.fetch, title="fetch", description=self.fetch.__doc__, structured_output=True)
599
+ server.add_tool(
600
+ self.bulk_fetch, title="bulk_fetch", description=self.bulk_fetch.__doc__, structured_output=True
601
+ )
602
+ server.add_tool(
603
+ self.stealthy_fetch, title="stealthy_fetch", description=self.stealthy_fetch.__doc__, structured_output=True
604
+ )
605
+ server.add_tool(
606
+ self.bulk_stealthy_fetch,
607
+ title="bulk_stealthy_fetch",
608
+ description=self.bulk_stealthy_fetch.__doc__,
609
+ structured_output=True,
610
+ )
611
+ server.run(transport="stdio" if not http else "streamable-http")
@@ -22,10 +22,11 @@ from logging import (
22
22
  from orjson import loads as json_loads, JSONDecodeError
23
23
 
24
24
  from scrapling import __version__
25
+ from scrapling.core.utils import log
25
26
  from scrapling.parser import Selector, Selectors
26
27
  from scrapling.core.custom_types import TextHandler
27
28
  from scrapling.engines.toolbelt.custom import Response
28
- from scrapling.core.utils import log, _ParseHeaders, _CookieParser
29
+ from scrapling.core.utils._shell import _ParseHeaders, _CookieParser
29
30
  from scrapling.core._types import (
30
31
  Optional,
31
32
  Dict,
@@ -6,7 +6,6 @@ from sqlite3 import connect as db_connect
6
6
 
7
7
  from orjson import dumps, loads
8
8
  from lxml.html import HtmlElement
9
- from tldextract import extract as tld
10
9
 
11
10
  from scrapling.core.utils import _StorageTools, log
12
11
  from scrapling.core._types import Dict, Optional, Any
@@ -26,6 +25,8 @@ class StorageSystemMixin(ABC): # pragma: no cover
26
25
  return default_value
27
26
 
28
27
  try:
28
+ from tldextract import extract as tld
29
+
29
30
  extracted = tld(self.url)
30
31
  return extracted.top_domain_under_public_suffix or extracted.domain or default_value
31
32
  except AttributeError:
@@ -7,4 +7,3 @@ from ._utils import (
7
7
  clean_spaces,
8
8
  html_forbidden,
9
9
  )
10
- from ._shell import _CookieParser, _ParseHeaders
@@ -12,17 +12,13 @@ from camoufox.utils import (
12
12
  installed_verstr as camoufox_version,
13
13
  )
14
14
 
15
- from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route
16
- from scrapling.core._types import (
17
- Any,
18
- Dict,
19
- Optional,
20
- )
21
15
  from ._page import PageInfo, PagePool
22
- from ._config_tools import _compiled_stealth_scripts
23
- from ._config_tools import _launch_kwargs, _context_kwargs
16
+ from scrapling.parser import Selector
17
+ from scrapling.core._types import Dict, Optional
24
18
  from scrapling.engines.toolbelt.fingerprints import get_os_name
25
19
  from ._validators import validate, PlaywrightConfig, CamoufoxConfig
20
+ from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
21
+ from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route
26
22
 
27
23
  __ff_version_str__ = camoufox_version().split(".", 1)[0]
28
24
 
@@ -268,4 +264,9 @@ class StealthySessionMixin:
268
264
  if f"cType: '{ctype}'" in page_content:
269
265
  return ctype
270
266
 
267
+ # Check if turnstile captcha is embedded inside the page (Usually inside a closed Shadow iframe)
268
+ selector = Selector(content=page_content)
269
+ if selector.css('script[src*="challenges.cloudflare.com/turnstile/v"]'):
270
+ return "embedded"
271
+
271
272
  return None
@@ -116,7 +116,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
116
116
  :param cookies: Set cookies for the next request.
117
117
  :param addons: List of Firefox addons to use. Must be paths to extracted addons.
118
118
  :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
119
- :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
119
+ :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
120
120
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
121
121
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
122
122
  :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
@@ -237,26 +237,33 @@ class StealthySession(StealthySessionMixin, SyncSession):
237
237
  return
238
238
 
239
239
  else:
240
- while "Verifying you are human." in self._get_page_content(page):
241
- # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
242
- page.wait_for_timeout(500)
240
+ box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
241
+ if challenge_type != "embedded":
242
+ box_selector = ".main-content p+div>div>div"
243
+ while "Verifying you are human." in self._get_page_content(page):
244
+ # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
245
+ page.wait_for_timeout(500)
243
246
 
244
247
  iframe = page.frame(url=__CF_PATTERN__)
245
248
  if iframe is None:
246
- log.info("Didn't find Cloudflare iframe!")
249
+ log.error("Didn't find Cloudflare iframe!")
247
250
  return
248
251
 
249
- while not iframe.frame_element().is_visible():
250
- # Double-checking that the iframe is loaded
251
- page.wait_for_timeout(500)
252
+ if challenge_type != "embedded":
253
+ while not iframe.frame_element().is_visible():
254
+ # Double-checking that the iframe is loaded
255
+ page.wait_for_timeout(500)
252
256
 
257
+ iframe.wait_for_load_state(state="domcontentloaded")
258
+ iframe.wait_for_load_state("networkidle")
253
259
  # Calculate the Captcha coordinates for any viewport
254
- outer_box = page.locator(".main-content p+div>div>div").bounding_box()
260
+ outer_box = page.locator(box_selector).last.bounding_box()
255
261
  captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
256
262
 
257
263
  # Move the mouse to the center of the window, then press and hold the left mouse button
258
264
  page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
259
- page.locator(".zone-name-title").wait_for(state="hidden")
265
+ if challenge_type != "embedded":
266
+ page.locator(".zone-name-title").wait_for(state="hidden")
260
267
  page.wait_for_load_state(state="domcontentloaded")
261
268
 
262
269
  log.info("Cloudflare captcha is solved")
@@ -293,7 +300,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
293
300
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
294
301
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
295
302
  :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
296
- :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
303
+ :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
297
304
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
298
305
  :return: A `Response` object.
299
306
  """
@@ -435,7 +442,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
435
442
  :param cookies: Set cookies for the next request.
436
443
  :param addons: List of Firefox addons to use. Must be paths to extracted addons.
437
444
  :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
438
- :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
445
+ :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
439
446
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
440
447
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
441
448
  :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
@@ -556,26 +563,33 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
556
563
  return
557
564
 
558
565
  else:
559
- while "Verifying you are human." in (await self._get_page_content(page)):
560
- # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
561
- await page.wait_for_timeout(500)
566
+ box_selector = "#cf_turnstile div, #cf-turnstile div, .turnstile>div>div"
567
+ if challenge_type != "embedded":
568
+ box_selector = ".main-content p+div>div>div"
569
+ while "Verifying you are human." in (await self._get_page_content(page)):
570
+ # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
571
+ await page.wait_for_timeout(500)
562
572
 
563
573
  iframe = page.frame(url=__CF_PATTERN__)
564
574
  if iframe is None:
565
- log.info("Didn't find Cloudflare iframe!")
575
+ log.error("Didn't find Cloudflare iframe!")
566
576
  return
567
577
 
568
- while not await (await iframe.frame_element()).is_visible():
569
- # Double-checking that the iframe is loaded
570
- await page.wait_for_timeout(500)
578
+ if challenge_type != "embedded":
579
+ while not await (await iframe.frame_element()).is_visible():
580
+ # Double-checking that the iframe is loaded
581
+ await page.wait_for_timeout(500)
571
582
 
583
+ await iframe.wait_for_load_state(state="domcontentloaded")
584
+ await iframe.wait_for_load_state("networkidle")
572
585
  # Calculate the Captcha coordinates for any viewport
573
- outer_box = await page.locator(".main-content p+div>div>div").bounding_box()
586
+ outer_box = await page.locator(box_selector).last.bounding_box()
574
587
  captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
575
588
 
576
589
  # Move the mouse to the center of the window, then press and hold the left mouse button
577
590
  await page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
578
- await page.locator(".zone-name-title").wait_for(state="hidden")
591
+ if challenge_type != "embedded":
592
+ await page.locator(".zone-name-title").wait_for(state="hidden")
579
593
  await page.wait_for_load_state(state="domcontentloaded")
580
594
 
581
595
  log.info("Cloudflare captcha is solved")
@@ -612,7 +626,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
612
626
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
613
627
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
614
628
  :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
615
- :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
629
+ :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
616
630
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
617
631
  :return: A `Response` object.
618
632
  """