scrapling 0.3.4.tar.gz → 0.3.5.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scrapling-0.3.4/scrapling.egg-info → scrapling-0.3.5}/PKG-INFO +10 -10
- {scrapling-0.3.4 → scrapling-0.3.5}/README.md +5 -5
- {scrapling-0.3.4 → scrapling-0.3.5}/pyproject.toml +4 -4
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/__init__.py +1 -1
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/cli.py +4 -4
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/custom_types.py +2 -2
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/shell.py +4 -4
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/_browsers/_base.py +2 -28
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/_browsers/_camoufox.py +39 -38
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/_browsers/_controllers.py +41 -50
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/_browsers/_page.py +1 -42
- scrapling-0.3.5/scrapling/engines/_browsers/_validators.py +229 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/static.py +2 -4
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/navigation.py +1 -1
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/parser.py +3 -3
- {scrapling-0.3.4 → scrapling-0.3.5/scrapling.egg-info}/PKG-INFO +10 -10
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling.egg-info/requires.txt +4 -4
- {scrapling-0.3.4 → scrapling-0.3.5}/setup.cfg +1 -1
- scrapling-0.3.4/scrapling/engines/_browsers/_validators.py +0 -164
- {scrapling-0.3.4 → scrapling-0.3.5}/LICENSE +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/MANIFEST.in +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/__init__.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/_html_utils.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/_types.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/ai.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/mixins.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/storage.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/translator.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/utils/__init__.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/utils/_shell.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/utils/_utils.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/_browsers/__init__.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/_browsers/_config_tools.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/constants.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/__init__.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/convertor.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/custom.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/fingerprints.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/fetchers.py +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/py.typed +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling.egg-info/SOURCES.txt +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling.egg-info/entry_points.txt +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.3.4 → scrapling-0.3.5}/scrapling.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scrapling
-Version: 0.3.4
+Version: 0.3.5
 Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -69,15 +69,15 @@ Requires-Dist: cssselect>=1.3.0
 Requires-Dist: orjson>=3.11.3
 Requires-Dist: tldextract>=5.3.0
 Provides-Extra: fetchers
-Requires-Dist: click>=8.…
+Requires-Dist: click>=8.3.0; extra == "fetchers"
 Requires-Dist: curl_cffi>=0.13.0; extra == "fetchers"
-Requires-Dist: playwright>=1.…
-Requires-Dist: …
+Requires-Dist: playwright>=1.55.0; extra == "fetchers"
+Requires-Dist: patchright>=1.55.2; extra == "fetchers"
 Requires-Dist: camoufox>=0.4.11; extra == "fetchers"
 Requires-Dist: geoip2>=5.1.0; extra == "fetchers"
 Requires-Dist: msgspec>=0.19.0; extra == "fetchers"
 Provides-Extra: ai
-Requires-Dist: mcp>=1.14.…
+Requires-Dist: mcp>=1.14.1; extra == "ai"
 Requires-Dist: markdownify>=1.2.0; extra == "ai"
 Requires-Dist: scrapling[fetchers]; extra == "ai"
 Provides-Extra: shell
@@ -157,12 +157,13 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 
 <!-- sponsors -->
 
+<a href="https://www.thordata.com/?ls=github&lk=D4Vinci" target="_blank" title="A global network of over 60M+ residential proxies with 99.7% availability, ensuring stable and reliable web data scraping to support AI, BI, and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
-<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
 <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
+<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
-<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
+<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 
 <!-- /sponsors -->
 
@@ -411,10 +412,9 @@ This project includes code adapted from:
 ## Thanks and References
 
 - [Daijro](https://github.com/daijro)'s brilliant work on [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
-- [Vinyzu](https://github.com/Vinyzu)'s work on [Botright](https://github.com/Vinyzu/Botright)
+- [Vinyzu](https://github.com/Vinyzu)'s brilliant work on [Botright](https://github.com/Vinyzu/Botright) and [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
 - [brotector](https://github.com/kaliiiiiiiiii/brotector) for browser detection bypass techniques
-- [fakebrowser](https://github.com/kkoooqq/fakebrowser) for fingerprinting research
-- [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches) for stealth improvements
+- [fakebrowser](https://github.com/kkoooqq/fakebrowser) and [BotBrowser](https://github.com/botswin/BotBrowser) for fingerprinting research
 
 ---
 <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
@@ -67,12 +67,13 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 
 <!-- sponsors -->
 
+<a href="https://www.thordata.com/?ls=github&lk=D4Vinci" target="_blank" title="A global network of over 60M+ residential proxies with 99.7% availability, ensuring stable and reliable web data scraping to support AI, BI, and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
-<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
 <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
+<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
-<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
+<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 
 <!-- /sponsors -->
 
@@ -321,10 +322,9 @@ This project includes code adapted from:
 ## Thanks and References
 
 - [Daijro](https://github.com/daijro)'s brilliant work on [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
-- [Vinyzu](https://github.com/Vinyzu)'s work on [Botright](https://github.com/Vinyzu/Botright)
+- [Vinyzu](https://github.com/Vinyzu)'s brilliant work on [Botright](https://github.com/Vinyzu/Botright) and [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
 - [brotector](https://github.com/kaliiiiiiiiii/brotector) for browser detection bypass techniques
-- [fakebrowser](https://github.com/kkoooqq/fakebrowser) for fingerprinting research
-- [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches) for stealth improvements
+- [fakebrowser](https://github.com/kkoooqq/fakebrowser) and [BotBrowser](https://github.com/botswin/BotBrowser) for fingerprinting research
 
 ---
 <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
@@ -64,16 +64,16 @@ dependencies = [
 
 [project.optional-dependencies]
 fetchers = [
-    "click>=8.…",
+    "click>=8.3.0",
     "curl_cffi>=0.13.0",
-    "playwright>=1.…",
-    "…",
+    "playwright>=1.55.0",
+    "patchright>=1.55.2",
     "camoufox>=0.4.11",
     "geoip2>=5.1.0",
    "msgspec>=0.19.0",
 ]
 ai = [
-    "mcp>=1.14.…",
+    "mcp>=1.14.1",
     "markdownify>=1.2.0",
     "scrapling[fetchers]",
 ]
@@ -32,8 +32,8 @@ def __ParseJSONData(json_string: Optional[str] = None) -> Optional[Dict[str, Any
 
     try:
         return json_loads(json_string)
-    except JSONDecodeError as …
-        raise ValueError(f"Invalid JSON data '{json_string}': {…
+    except JSONDecodeError as err:  # pragma: no cover
+        raise ValueError(f"Invalid JSON data '{json_string}': {err}")
 
 
 def __Request_and_Save(
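The CLI change above only renames the caught exception variable and marks the branch as excluded from coverage. For context, here is a minimal standalone sketch of the same wrap-and-re-raise pattern, using the stdlib json module rather than the CLI's own imports:

```python
from json import loads, JSONDecodeError

def parse_json_or_fail(json_string: str) -> dict:
    # Surface both the offending input and the decoder error in one ValueError,
    # mirroring the CLI helper's behaviour.
    try:
        return loads(json_string)
    except JSONDecodeError as err:
        raise ValueError(f"Invalid JSON data '{json_string}': {err}")

print(parse_json_or_fail('{"ok": true}'))  # {'ok': True}
# parse_json_or_fail('{broken')            # ValueError: Invalid JSON data '{broken': ...
```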
@@ -65,8 +65,8 @@ def __ParseExtractArguments(
     for key, value in _CookieParser(cookies):
         try:
             parsed_cookies[key] = value
-        except Exception as …
-            raise ValueError(f"Could not parse cookies '{cookies}': {…
+        except Exception as err:
+            raise ValueError(f"Could not parse cookies '{cookies}': {err}")
 
     parsed_json = __ParseJSONData(json)
     parsed_params = {}
@@ -145,7 +145,7 @@ class TextHandler(str):
         clean_match: bool = False,
         case_sensitive: bool = True,
         check_match: Literal[False] = False,
-    ) -> "TextHandlers…
+    ) -> "TextHandlers": ...
 
     def re(
         self,
@@ -241,7 +241,7 @@ class TextHandlers(List[TextHandler]):
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
-    ) -> "TextHandlers…
+    ) -> "TextHandlers":
         """Call the ``.re()`` method for each element in this list and return
         their results flattened as TextHandlers.
 
@@ -201,7 +201,7 @@ class CurlParser:
             data_payload = parsed_args.data_binary  # Fallback to string
 
         elif parsed_args.data_raw is not None:
-            data_payload = parsed_args.data_raw
+            data_payload = parsed_args.data_raw.lstrip("$")
 
         elif parsed_args.data is not None:
             data_payload = parsed_args.data
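The new `.lstrip("$")` most plausibly targets curl commands that wrap their payload in Bash ANSI-C quoting (`--data-raw $'…'`) — an assumption, since the diff itself gives no rationale. After shell-style tokenization the leading `$` survives and would corrupt the body:

```python
import shlex

# Hypothetical tail of a copied curl command:  --data-raw $'{"id": 1}'
tokens = shlex.split("""--data-raw $'{"id": 1}'""")
payload = tokens[1]
print(payload)              # ${"id": 1}   <- leading "$" left over from $'...'
print(payload.lstrip("$"))  # {"id": 1}    <- what the parser now forwards
```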
@@ -317,8 +317,8 @@ def show_page_in_browser(page: Selector):  # pragma: no cover
 
     try:
         fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
-        with open(fd, "…
-            f.write(page.…
+        with open(fd, "w", encoding=page.encoding) as f:
+            f.write(page.html_content)
 
         open_in_browser(f"file://{fname}")
     except IOError as e:
@@ -545,7 +545,7 @@ class Convertor:
         for page in pages:
             match extraction_type:
                 case "markdown":
-                    yield cls._convert_to_markdown(page.…
+                    yield cls._convert_to_markdown(page.html_content)
                 case "html":
                     yield page.body
                 case "text":
@@ -1,4 +1,4 @@
-from time import time…
+from time import time
 from asyncio import sleep as asyncio_sleep, Lock
 
 from camoufox import DefaultAddons
@@ -44,23 +44,7 @@ class SyncSession:
     ) -> PageInfo:  # pragma: no cover
         """Get a new page to use"""
 
-        # …
-        self.page_pool.close_all_finished_pages()
-
-        # If we're at max capacity after cleanup, wait for busy pages to finish
-        if self.page_pool.pages_count >= self.max_pages:
-            start_time = time()
-            while time() - start_time < self._max_wait_for_page:
-                # Wait for any pages to finish, then clean them up
-                sleep(0.05)
-                self.page_pool.close_all_finished_pages()
-                if self.page_pool.pages_count < self.max_pages:
-                    break
-            else:
-                raise TimeoutError(
-                    f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
-                )
-
+        # No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc.
         page = self.context.new_page()
         page.set_default_navigation_timeout(timeout)
         page.set_default_timeout(timeout)
@@ -76,11 +60,6 @@ class SyncSession:
 
         return self.page_pool.add_page(page)
 
-    @staticmethod
-    def _get_with_precedence(request_value: Any, session_value: Any, sentinel_value: object) -> Any:
-        """Get value with request-level priority over session-level"""
-        return request_value if request_value is not sentinel_value else session_value
-
     def get_pool_stats(self) -> Dict[str, int]:
        """Get statistics about the current page pool"""
         return {
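The removed static helper is not gone conceptually; the same request-over-session precedence now lives in `validate_fetch()` in the new `_validators.py` shown later in this diff. A minimal sketch of the sentinel pattern it relies on (names illustrative):

```python
# A module-level sentinel means "argument not passed", so even None or False
# can be given explicitly as a per-request override.
_UNSET = object()

def pick(request_value, session_value, sentinel=_UNSET):
    return request_value if request_value is not sentinel else session_value

session_timeout = 30000
print(pick(_UNSET, session_timeout))  # 30000 -> falls back to the session default
print(pick(5000, session_timeout))    # 5000  -> explicit request value wins
```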
@@ -105,16 +84,11 @@ class AsyncSession(SyncSession):
     ) -> PageInfo:  # pragma: no cover
         """Get a new page to use"""
         async with self._lock:
-            # Close all finished pages to ensure clean state
-            await self.page_pool.aclose_all_finished_pages()
-
             # If we're at max capacity after cleanup, wait for busy pages to finish
             if self.page_pool.pages_count >= self.max_pages:
                 start_time = time()
                 while time() - start_time < self._max_wait_for_page:
-                    # Wait for any pages to finish, then clean them up
                     await asyncio_sleep(0.05)
-                    await self.page_pool.aclose_all_finished_pages()
                     if self.page_pool.pages_count < self.max_pages:
                         break
                 else:
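The async path keeps its bounded polling loop but no longer closes "finished" pages inside it. A self-contained sketch of the wait-for-capacity pattern, with illustrative names rather than Scrapling's actual session API:

```python
import asyncio
from time import time

async def wait_for_free_slot(pages_count, max_pages: int, max_wait: float = 60.0) -> None:
    # Poll every 50 ms until the pool drops below capacity, else give up.
    start_time = time()
    while time() - start_time < max_wait:
        await asyncio.sleep(0.05)
        if pages_count() < max_pages:
            return
    raise TimeoutError(f"No pages finished to clear place in the pool within the {max_wait}s timeout period")

# A pool reporting zero busy pages frees up after a single poll tick.
asyncio.run(wait_for_free_slot(lambda: 0, max_pages=1))
```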
@@ -16,7 +16,7 @@ from playwright.async_api import (
 )
 from playwright._impl._errors import Error as PlaywrightError
 
-from ._validators import …
+from ._validators import validate_fetch as _validate
 from ._base import SyncSession, AsyncSession, StealthySessionMixin
 from scrapling.core.utils import log
 from scrapling.core._types import (
@@ -297,23 +297,22 @@ class StealthySession(StealthySessionMixin, SyncSession):
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
-        …
-            CamoufoxConfig,
+        params = _validate(
+            [
+                ("google_search", google_search, self.google_search),
+                ("timeout", timeout, self.timeout),
+                ("wait", wait, self.wait),
+                ("page_action", page_action, self.page_action),
+                ("extra_headers", extra_headers, self.extra_headers),
+                ("disable_resources", disable_resources, self.disable_resources),
+                ("wait_selector", wait_selector, self.wait_selector),
+                ("wait_selector_state", wait_selector_state, self.wait_selector_state),
+                ("network_idle", network_idle, self.network_idle),
+                ("load_dom", load_dom, self.load_dom),
+                ("solve_cloudflare", solve_cloudflare, self.solve_cloudflare),
+                ("selector_config", selector_config, self.selector_config),
+            ],
+            _UNSET,
         )
 
         if self._closed:  # pragma: no cover
@@ -381,8 +380,9 @@ class StealthySession(StealthySessionMixin, SyncSession):
                 page_info.page, first_response, final_response, params.selector_config
             )
 
-            # …
-            page_info.…
+            # Close the page, to free up resources
+            page_info.page.close()
+            self.page_pool.pages.remove(page_info)
 
             return response
 
@@ -616,22 +616,22 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
-        params = …
-        …
+        params = _validate(
+            [
+                ("google_search", google_search, self.google_search),
+                ("timeout", timeout, self.timeout),
+                ("wait", wait, self.wait),
+                ("page_action", page_action, self.page_action),
+                ("extra_headers", extra_headers, self.extra_headers),
+                ("disable_resources", disable_resources, self.disable_resources),
+                ("wait_selector", wait_selector, self.wait_selector),
+                ("wait_selector_state", wait_selector_state, self.wait_selector_state),
+                ("network_idle", network_idle, self.network_idle),
+                ("load_dom", load_dom, self.load_dom),
+                ("solve_cloudflare", solve_cloudflare, self.solve_cloudflare),
+                ("selector_config", selector_config, self.selector_config),
+            ],
+            _UNSET,
         )
 
         if self._closed:  # pragma: no cover
@@ -701,8 +701,9 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
                 page_info.page, first_response, final_response, params.selector_config
             )
 
-            # …
-            page_info.…
+            # Close the page, to free up resources
+            await page_info.page.close()
+            self.page_pool.pages.remove(page_info)
 
             return response
 
@@ -11,14 +11,12 @@ from playwright.async_api import (
     Playwright as AsyncPlaywright,
     Locator as AsyncLocator,
 )
-from …
-from …
-    async_playwright as async_rebrowser_playwright,
-)
+from patchright.sync_api import sync_playwright as sync_patchright
+from patchright.async_api import async_playwright as async_patchright
 
 from scrapling.core.utils import log
 from ._base import SyncSession, AsyncSession, DynamicSessionMixin
-from ._validators import …
+from ._validators import validate_fetch as _validate
 from scrapling.core._types import (
     Dict,
     List,
@@ -154,10 +152,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
 
     def __create__(self):
         """Create a browser for this instance and context."""
-        sync_context = …
-        if not self.stealth or self.real_chrome:
-            # Because rebrowser_playwright doesn't play well with real browsers
-            sync_context = sync_playwright
+        sync_context = sync_patchright if self.stealth else sync_playwright
 
         self.playwright: Playwright = sync_context().start()
 
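With rebrowser-playwright dropped, the stealth flag alone now decides which driver backs a dynamic session. A minimal usage sketch of that selection, assuming both playwright and patchright are installed (patchright mirrors Playwright's API; the real session keeps the driver alive via `.start()` rather than a with-block):

```python
from playwright.sync_api import sync_playwright
from patchright.sync_api import sync_playwright as sync_patchright

stealth = True
# Same one-liner as the new __create__: patchright for stealth, vanilla Playwright otherwise.
sync_context = sync_patchright if stealth else sync_playwright

with sync_context() as pw:
    browser = pw.chromium.launch(headless=True)
    browser.close()
```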
@@ -229,22 +224,21 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
-        …
-            PlaywrightConfig,
+        params = _validate(
+            [
+                ("google_search", google_search, self.google_search),
+                ("timeout", timeout, self.timeout),
+                ("wait", wait, self.wait),
+                ("page_action", page_action, self.page_action),
+                ("extra_headers", extra_headers, self.extra_headers),
+                ("disable_resources", disable_resources, self.disable_resources),
+                ("wait_selector", wait_selector, self.wait_selector),
+                ("wait_selector_state", wait_selector_state, self.wait_selector_state),
+                ("network_idle", network_idle, self.network_idle),
+                ("load_dom", load_dom, self.load_dom),
+                ("selector_config", selector_config, self.selector_config),
+            ],
+            _UNSET,
         )
 
         if self._closed:  # pragma: no cover
@@ -305,8 +299,9 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
                 page_info.page, first_response, final_response, params.selector_config
             )
 
-            # …
-            page_info.…
+            # Close the page, to free up resources
+            page_info.page.close()
+            self.page_pool.pages.remove(page_info)
 
             return response
 
@@ -402,10 +397,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
 
     async def __create__(self):
         """Create a browser for this instance and context."""
-        async_context = …
-        if not self.stealth or self.real_chrome:
-            # Because rebrowser_playwright doesn't play well with real browsers
-            async_context = async_playwright
+        async_context = async_patchright if self.stealth else async_playwright
 
         self.playwright: AsyncPlaywright = await async_context().start()
 
@@ -478,22 +470,21 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
-        …
-            PlaywrightConfig,
+        params = _validate(
+            [
+                ("google_search", google_search, self.google_search),
+                ("timeout", timeout, self.timeout),
+                ("wait", wait, self.wait),
+                ("page_action", page_action, self.page_action),
+                ("extra_headers", extra_headers, self.extra_headers),
+                ("disable_resources", disable_resources, self.disable_resources),
+                ("wait_selector", wait_selector, self.wait_selector),
+                ("wait_selector_state", wait_selector_state, self.wait_selector_state),
+                ("network_idle", network_idle, self.network_idle),
+                ("load_dom", load_dom, self.load_dom),
+                ("selector_config", selector_config, self.selector_config),
+            ],
+            _UNSET,
         )
 
         if self._closed:  # pragma: no cover
@@ -554,9 +545,9 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
                 page_info.page, first_response, final_response, params.selector_config
             )
 
-            # …
-            page_info.…
-
+            # Close the page, to free up resources
+            await page_info.page.close()
+            self.page_pool.pages.remove(page_info)
             return response
 
         except Exception as e:  # pragma: no cover
@@ -6,7 +6,7 @@ from playwright.async_api import Page as AsyncPage
 
 from scrapling.core._types import Optional, List, Literal
 
-PageState = Literal["…
+PageState = Literal["ready", "busy", "error"]  # States that a page can be in
 
 
 @dataclass
@@ -23,11 +23,6 @@ class PageInfo:
         self.state = "busy"
         self.url = url
 
-    def mark_finished(self):
-        """Mark the page as finished for new requests"""
-        self.state = "finished"
-        self.url = ""
-
     def mark_error(self):
         """Mark the page as having an error"""
         self.state = "error"
@@ -67,12 +62,6 @@ class PagePool:
         """Get the total number of pages"""
         return len(self.pages)
 
-    @property
-    def finished_count(self) -> int:
-        """Get the number of finished pages"""
-        with self._lock:
-            return sum(1 for p in self.pages if p.state == "finished")
-
     @property
     def busy_count(self) -> int:
         """Get the number of busy pages"""
@@ -83,33 +72,3 @@ class PagePool:
         """Remove pages in error state"""
         with self._lock:
             self.pages = [p for p in self.pages if p.state != "error"]
-
-    def close_all_finished_pages(self):
-        """Close all pages in finished state and remove them from the pool"""
-        with self._lock:
-            pages_to_remove = []
-            for page_info in self.pages:
-                if page_info.state == "finished":
-                    try:
-                        page_info.page.close()
-                    except Exception:
-                        pass
-                    pages_to_remove.append(page_info)
-
-            for page_info in pages_to_remove:
-                self.pages.remove(page_info)
-
-    async def aclose_all_finished_pages(self):
-        """Async version: Close all pages in finished state and remove them from the pool"""
-        with self._lock:
-            pages_to_remove = []
-            for page_info in self.pages:
-                if page_info.state == "finished":
-                    try:
-                        await page_info.page.close()
-                    except Exception:
-                        pass
-                    pages_to_remove.append(page_info)
-
-            for page_info in pages_to_remove:
-                self.pages.remove(page_info)
@@ -0,0 +1,229 @@
+from pathlib import Path
+from typing import Annotated
+from dataclasses import dataclass
+from urllib.parse import urlparse
+
+from msgspec import Struct, Meta, convert, ValidationError
+
+from scrapling.core._types import (
+    Dict,
+    List,
+    Tuple,
+    Optional,
+    Callable,
+    SelectorWaitStates,
+)
+from scrapling.engines.toolbelt.navigation import construct_proxy_dict
+
+
+# Custom validators for msgspec
+def _validate_file_path(value: str):
+    """Fast file path validation"""
+    path = Path(value)
+    if not path.exists():
+        raise ValueError(f"Init script path not found: {value}")
+    if not path.is_file():
+        raise ValueError(f"Init script is not a file: {value}")
+    if not path.is_absolute():
+        raise ValueError(f"Init script is not a absolute path: {value}")
+
+
+def _validate_addon_path(value: str):
+    """Fast addon path validation"""
+    path = Path(value)
+    if not path.exists():
+        raise FileNotFoundError(f"Addon path not found: {value}")
+    if not path.is_dir():
+        raise ValueError(f"Addon path must be a directory of the extracted addon: {value}")
+
+
+def _validate_cdp_url(cdp_url: str):
+    """Fast CDP URL validation"""
+    try:
+        # Check the scheme
+        if not cdp_url.startswith(("ws://", "wss://")):
+            raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
+
+        # Validate hostname and port
+        if not urlparse(cdp_url).netloc:
+            raise ValueError("Invalid hostname for the CDP URL")
+
+    except AttributeError as e:
+        raise ValueError(f"Malformed CDP URL: {cdp_url}: {str(e)}")
+
+    except Exception as e:
+        raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(e)}")
+
+
+# Type aliases for cleaner annotations
+PagesCount = Annotated[int, Meta(ge=1, le=50)]
+Seconds = Annotated[int, float, Meta(ge=0)]
+
+
+class PlaywrightConfig(Struct, kw_only=True, frozen=False):
+    """Configuration struct for validation"""
+
+    max_pages: PagesCount = 1
+    cdp_url: Optional[str] = None
+    headless: bool = True
+    google_search: bool = True
+    hide_canvas: bool = False
+    disable_webgl: bool = False
+    real_chrome: bool = False
+    stealth: bool = False
+    wait: Seconds = 0
+    page_action: Optional[Callable] = None
+    proxy: Optional[str | Dict[str, str]] = None  # The default value for proxy in Playwright's source is `None`
+    locale: str = "en-US"
+    extra_headers: Optional[Dict[str, str]] = None
+    useragent: Optional[str] = None
+    timeout: Seconds = 30000
+    init_script: Optional[str] = None
+    disable_resources: bool = False
+    wait_selector: Optional[str] = None
+    cookies: Optional[List[Dict]] = None
+    network_idle: bool = False
+    load_dom: bool = True
+    wait_selector_state: SelectorWaitStates = "attached"
+    selector_config: Optional[Dict] = None
+
+    def __post_init__(self):
+        """Custom validation after msgspec validation"""
+        if self.page_action and not callable(self.page_action):
+            raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
+        if self.proxy:
+            self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
+        if self.cdp_url:
+            _validate_cdp_url(self.cdp_url)
+
+        if not self.cookies:
+            self.cookies = []
+        if not self.selector_config:
+            self.selector_config = {}
+
+        if self.init_script is not None:
+            _validate_file_path(self.init_script)
+
+
+class CamoufoxConfig(Struct, kw_only=True, frozen=False):
+    """Configuration struct for validation"""
+
+    max_pages: PagesCount = 1
+    headless: bool = True  # noqa: F821
+    block_images: bool = False
+    disable_resources: bool = False
+    block_webrtc: bool = False
+    allow_webgl: bool = True
+    network_idle: bool = False
+    load_dom: bool = True
+    humanize: bool | float = True
+    solve_cloudflare: bool = False
+    wait: Seconds = 0
+    timeout: Seconds = 30000
+    init_script: Optional[str] = None
+    page_action: Optional[Callable] = None
+    wait_selector: Optional[str] = None
+    addons: Optional[List[str]] = None
+    wait_selector_state: SelectorWaitStates = "attached"
+    cookies: Optional[List[Dict]] = None
+    google_search: bool = True
+    extra_headers: Optional[Dict[str, str]] = None
+    proxy: Optional[str | Dict[str, str]] = None  # The default value for proxy in Playwright's source is `None`
+    os_randomize: bool = False
+    disable_ads: bool = False
+    geoip: bool = False
+    selector_config: Optional[Dict] = None
+    additional_args: Optional[Dict] = None
+
+    def __post_init__(self):
+        """Custom validation after msgspec validation"""
+        if self.page_action and not callable(self.page_action):
+            raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
+        if self.proxy:
+            self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
+
+        if self.addons and isinstance(self.addons, list):
+            for addon in self.addons:
+                _validate_addon_path(addon)
+        else:
+            self.addons = []
+
+        if self.init_script is not None:
+            _validate_file_path(self.init_script)
+
+        if not self.cookies:
+            self.cookies = []
+        # Cloudflare timeout adjustment
+        if self.solve_cloudflare and self.timeout < 60_000:
+            self.timeout = 60_000
+        if not self.selector_config:
+            self.selector_config = {}
+        if not self.additional_args:
+            self.additional_args = {}
+
+
+# Code parts to validate `fetch` in the least possible numbers of lines overall
+class FetchConfig(Struct, kw_only=True):
+    """Configuration struct for `fetch` calls validation"""
+
+    google_search: bool = True
+    timeout: Seconds = 30000
+    wait: Seconds = 0
+    page_action: Optional[Callable] = None
+    extra_headers: Optional[Dict[str, str]] = None
+    disable_resources: bool = False
+    wait_selector: Optional[str] = None
+    wait_selector_state: SelectorWaitStates = "attached"
+    network_idle: bool = False
+    load_dom: bool = True
+    solve_cloudflare: bool = False
+    selector_config: Optional[Dict] = {}
+
+    def to_dict(self):
+        return {f: getattr(self, f) for f in self.__struct_fields__}
+
+
+@dataclass
+class _fetch_params:
+    """A dataclass of all parameters used by `fetch` calls"""
+
+    google_search: bool
+    timeout: Seconds
+    wait: Seconds
+    page_action: Optional[Callable]
+    extra_headers: Optional[Dict[str, str]]
+    disable_resources: bool
+    wait_selector: Optional[str]
+    wait_selector_state: SelectorWaitStates
+    network_idle: bool
+    load_dom: bool
+    solve_cloudflare: bool
+    selector_config: Optional[Dict]
+
+
+def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
+    result = {}
+    overrides = {}
+
+    for arg, request_value, session_value in params:
+        if request_value is not sentinel:
+            overrides[arg] = request_value
+        else:
+            result[arg] = session_value
+
+    if overrides:
+        overrides = validate(overrides, FetchConfig).to_dict()
+        overrides.update(result)
+        return _fetch_params(**overrides)
+
+    if not result.get("solve_cloudflare"):
+        result["solve_cloudflare"] = False
+
+    return _fetch_params(**result)
+
+
+def validate(params: Dict, model) -> PlaywrightConfig | CamoufoxConfig | FetchConfig:
+    try:
+        return convert(params, model)
+    except ValidationError as e:
+        raise TypeError(f"Invalid argument type: {e}") from e
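The new module delegates type checks and range constraints to msgspec in a single `convert()` call instead of the hand-written if-chains of the old validators. A small standalone demo of that style (struct and field names here are illustrative, not Scrapling's):

```python
from typing import Annotated, Optional
from msgspec import Struct, Meta, convert, ValidationError

class DemoConfig(Struct, kw_only=True):
    # Same constraint style as the new PagesCount / Seconds aliases
    max_pages: Annotated[int, Meta(ge=1, le=50)] = 1
    timeout: Annotated[float, Meta(ge=0)] = 30000.0
    wait_selector: Optional[str] = None

print(convert({"max_pages": 3}, DemoConfig))  # DemoConfig(max_pages=3, timeout=30000.0, wait_selector=None)

try:
    convert({"max_pages": 999}, DemoConfig)   # violates le=50
except ValidationError as e:
    print(f"Invalid argument type: {e}")      # validate() re-raises this as a TypeError
```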
@@ -94,8 +94,8 @@ class FetcherSession:
         self.default_http3 = http3
         self.selector_config = selector_config or {}
 
-        self._curl_session: Optional[CurlSession] = None
-        self._async_curl_session: Optional[AsyncCurlSession] = None
+        self._curl_session: Optional[CurlSession] | bool = None
+        self._async_curl_session: Optional[AsyncCurlSession] | bool = None
 
     def _merge_request_args(self, **kwargs) -> Dict[str, Any]:
         """Merge request-specific arguments with default session arguments."""
@@ -239,7 +239,6 @@ class FetcherSession:
         Perform an HTTP request using the configured session.
 
         :param method: HTTP method to be used, supported methods are ["GET", "POST", "PUT", "DELETE"]
-        :param url: Target URL for the request.
         :param request_args: Arguments to be passed to the session's `request()` method.
         :param max_retries: Maximum number of retries for the request.
         :param retry_delay: Number of seconds to wait between retries.
@@ -280,7 +279,6 @@ class FetcherSession:
         Perform an HTTP request using the configured session.
 
         :param method: HTTP method to be used, supported methods are ["GET", "POST", "PUT", "DELETE"]
-        :param url: Target URL for the request.
         :param request_args: Arguments to be passed to the session's `request()` method.
         :param max_retries: Maximum number of retries for the request.
         :param retry_delay: Number of seconds to wait between retries.
@@ -4,7 +4,7 @@ Functions related to files and URLs
 
 from pathlib import Path
 from functools import lru_cache
-from urllib.parse import …
+from urllib.parse import urlparse
 
 from playwright.async_api import Route as async_Route
 from msgspec import Struct, structs, convert, ValidationError
@@ -239,7 +239,7 @@ class Selector(SelectorsGeneration):
         )
 
     def __handle_element(
-        self, element: HtmlElement | _ElementUnicodeResult
+        self, element: Optional[HtmlElement | _ElementUnicodeResult]
     ) -> Optional[Union[TextHandler, "Selector"]]:
         """Used internally in all functions to convert a single element to type (Selector|TextHandler) when possible"""
         if element is None:
@@ -345,7 +345,7 @@ class Selector(SelectorsGeneration):
         return TextHandler(content)
 
     @property
-    def body(self):
+    def body(self) -> str | bytes:
         """Return the raw body of the current `Selector` without any processing. Useful for binary and non-HTML requests."""
         return self._raw_body
 
@@ -1259,7 +1259,7 @@ class Selectors(List[Selector]):
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
         :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
         """
-        results = [n.…
+        results = [n.re(regex, replace_entities, clean_match, case_sensitive) for n in self]
         return TextHandlers(flatten(results))
 
     def re_first(
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scrapling
-Version: 0.3.4
+Version: 0.3.5
 Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair

@@ -69,15 +69,15 @@ Requires-Dist: cssselect>=1.3.0
 Requires-Dist: orjson>=3.11.3
 Requires-Dist: tldextract>=5.3.0
 Provides-Extra: fetchers
-Requires-Dist: click>=8.…
+Requires-Dist: click>=8.3.0; extra == "fetchers"
 Requires-Dist: curl_cffi>=0.13.0; extra == "fetchers"
-Requires-Dist: playwright>=1.…
-Requires-Dist: …
+Requires-Dist: playwright>=1.55.0; extra == "fetchers"
+Requires-Dist: patchright>=1.55.2; extra == "fetchers"
 Requires-Dist: camoufox>=0.4.11; extra == "fetchers"
 Requires-Dist: geoip2>=5.1.0; extra == "fetchers"
 Requires-Dist: msgspec>=0.19.0; extra == "fetchers"
 Provides-Extra: ai
-Requires-Dist: mcp>=1.14.…
+Requires-Dist: mcp>=1.14.1; extra == "ai"
 Requires-Dist: markdownify>=1.2.0; extra == "ai"
 Requires-Dist: scrapling[fetchers]; extra == "ai"
 Provides-Extra: shell

@@ -157,12 +157,13 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 
 <!-- sponsors -->
 
+<a href="https://www.thordata.com/?ls=github&lk=D4Vinci" target="_blank" title="A global network of over 60M+ residential proxies with 99.7% availability, ensuring stable and reliable web data scraping to support AI, BI, and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
-<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
 <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
+<a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
-<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
+<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 
 <!-- /sponsors -->
 

@@ -411,10 +412,9 @@ This project includes code adapted from:
 ## Thanks and References
 
 - [Daijro](https://github.com/daijro)'s brilliant work on [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
-- [Vinyzu](https://github.com/Vinyzu)'s work on [Botright](https://github.com/Vinyzu/Botright)
+- [Vinyzu](https://github.com/Vinyzu)'s brilliant work on [Botright](https://github.com/Vinyzu/Botright) and [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
 - [brotector](https://github.com/kaliiiiiiiiii/brotector) for browser detection bypass techniques
-- [fakebrowser](https://github.com/kkoooqq/fakebrowser) for fingerprinting research
-- [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches) for stealth improvements
+- [fakebrowser](https://github.com/kkoooqq/fakebrowser) and [BotBrowser](https://github.com/botswin/BotBrowser) for fingerprinting research
 
 ---
 <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
@@ -4,7 +4,7 @@ orjson>=3.11.3
 tldextract>=5.3.0
 
 [ai]
-mcp>=1.14.…
+mcp>=1.14.1
 markdownify>=1.2.0
 scrapling[fetchers]
 
@@ -12,10 +12,10 @@ scrapling[fetchers]
 scrapling[ai,shell]
 
 [fetchers]
-click>=8.…
+click>=8.3.0
 curl_cffi>=0.13.0
-playwright>=1.…
-…
+playwright>=1.55.0
+patchright>=1.55.2
 camoufox>=0.4.11
 geoip2>=5.1.0
 msgspec>=0.19.0
@@ -1,6 +1,6 @@
 [metadata]
 name = scrapling
-version = 0.3.4
+version = 0.3.5
 author = Karim Shoair
 author_email = karim.shoair@pm.me
 description = Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
@@ -1,164 +0,0 @@
-from msgspec import Struct, convert, ValidationError
-from urllib.parse import urlparse
-from pathlib import Path
-
-from scrapling.core._types import (
-    Optional,
-    Dict,
-    Callable,
-    List,
-    SelectorWaitStates,
-)
-from scrapling.engines.toolbelt.navigation import construct_proxy_dict
-
-
-class PlaywrightConfig(Struct, kw_only=True, frozen=False):
-    """Configuration struct for validation"""
-
-    max_pages: int = 1
-    cdp_url: Optional[str] = None
-    headless: bool = True
-    google_search: bool = True
-    hide_canvas: bool = False
-    disable_webgl: bool = False
-    real_chrome: bool = False
-    stealth: bool = False
-    wait: int | float = 0
-    page_action: Optional[Callable] = None
-    proxy: Optional[str | Dict[str, str]] = None  # The default value for proxy in Playwright's source is `None`
-    locale: str = "en-US"
-    extra_headers: Optional[Dict[str, str]] = None
-    useragent: Optional[str] = None
-    timeout: int | float = 30000
-    init_script: Optional[str] = None
-    disable_resources: bool = False
-    wait_selector: Optional[str] = None
-    cookies: Optional[List[Dict]] = None
-    network_idle: bool = False
-    load_dom: bool = True
-    wait_selector_state: SelectorWaitStates = "attached"
-    selector_config: Optional[Dict] = None
-
-    def __post_init__(self):
-        """Custom validation after msgspec validation"""
-        if self.max_pages < 1 or self.max_pages > 50:
-            raise ValueError("max_pages must be between 1 and 50")
-        if self.timeout < 0:
-            raise ValueError("timeout must be >= 0")
-        if self.page_action and not callable(self.page_action):
-            raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
-        if self.proxy:
-            self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
-        if self.cdp_url:
-            self.__validate_cdp(self.cdp_url)
-        if not self.cookies:
-            self.cookies = []
-        if not self.selector_config:
-            self.selector_config = {}
-
-        if self.init_script is not None:
-            script_path = Path(self.init_script)
-            if not script_path.exists():
-                raise ValueError("Init script path not found")
-            elif not script_path.is_file():
-                raise ValueError("Init script is not a file")
-            elif not script_path.is_absolute():
-                raise ValueError("Init script is not a absolute path")
-
-    @staticmethod
-    def __validate_cdp(cdp_url):
-        try:
-            # Check the scheme
-            if not cdp_url.startswith(("ws://", "wss://")):
-                raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
-
-            # Validate hostname and port
-            if not urlparse(cdp_url).netloc:
-                raise ValueError("Invalid hostname for the CDP URL")
-
-        except AttributeError as e:
-            raise ValueError(f"Malformed CDP URL: {cdp_url}: {str(e)}")
-
-        except Exception as e:
-            raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(e)}")
-
-
-class CamoufoxConfig(Struct, kw_only=True, frozen=False):
-    """Configuration struct for validation"""
-
-    max_pages: int = 1
-    headless: bool = True  # noqa: F821
-    block_images: bool = False
-    disable_resources: bool = False
-    block_webrtc: bool = False
-    allow_webgl: bool = True
-    network_idle: bool = False
-    load_dom: bool = True
-    humanize: bool | float = True
-    solve_cloudflare: bool = False
-    wait: int | float = 0
-    timeout: int | float = 30000
-    init_script: Optional[str] = None
-    page_action: Optional[Callable] = None
-    wait_selector: Optional[str] = None
-    addons: Optional[List[str]] = None
-    wait_selector_state: SelectorWaitStates = "attached"
-    cookies: Optional[List[Dict]] = None
-    google_search: bool = True
-    extra_headers: Optional[Dict[str, str]] = None
-    proxy: Optional[str | Dict[str, str]] = None  # The default value for proxy in Playwright's source is `None`
-    os_randomize: bool = False
-    disable_ads: bool = False
-    geoip: bool = False
-    selector_config: Optional[Dict] = None
-    additional_args: Optional[Dict] = None
-
-    def __post_init__(self):
-        """Custom validation after msgspec validation"""
-        if self.max_pages < 1 or self.max_pages > 50:
-            raise ValueError("max_pages must be between 1 and 50")
-        if self.timeout < 0:
-            raise ValueError("timeout must be >= 0")
-        if self.page_action and not callable(self.page_action):
-            raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
-        if self.proxy:
-            self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
-
-        if not self.addons:
-            self.addons = []
-        else:
-            for addon in self.addons:
-                addon_path = Path(addon)
-                if not addon_path.exists():
-                    raise FileNotFoundError(f"Addon's path not found: {addon}")
-                elif not addon_path.is_dir():
-                    raise ValueError(
-                        f"Addon's path is not a folder, you need to pass a folder of the extracted addon: {addon}"
-                    )
-
-        if self.init_script is not None:
-            script_path = Path(self.init_script)
-            if not script_path.exists():
-                raise ValueError("Init script path not found")
-            elif not script_path.is_file():
-                raise ValueError("Init script is not a file")
-            elif not script_path.is_absolute():
-                raise ValueError("Init script is not a absolute path")
-
-        if not self.cookies:
-            self.cookies = []
-        if self.solve_cloudflare and self.timeout < 60_000:
-            self.timeout = 60_000
-        if not self.selector_config:
-            self.selector_config = {}
-        if not self.additional_args:
-            self.additional_args = {}
-
-
-def validate(params, model):
-    try:
-        config = convert(params, model)
-    except ValidationError as e:
-        raise TypeError(f"Invalid argument type: {e}")
-
-    return config