scrapling 0.3.3__tar.gz → 0.3.5__tar.gz
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registry.
- {scrapling-0.3.3/scrapling.egg-info → scrapling-0.3.5}/PKG-INFO +18 -17
- {scrapling-0.3.3 → scrapling-0.3.5}/README.md +13 -12
- {scrapling-0.3.3 → scrapling-0.3.5}/pyproject.toml +4 -4
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/__init__.py +1 -1
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/cli.py +4 -4
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/custom_types.py +2 -2
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/shell.py +21 -6
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/_browsers/_base.py +5 -31
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/_browsers/_camoufox.py +74 -44
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/_browsers/_controllers.py +41 -50
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/_browsers/_page.py +1 -42
- scrapling-0.3.5/scrapling/engines/_browsers/_validators.py +229 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/static.py +2 -4
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/navigation.py +1 -1
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/parser.py +16 -12
- {scrapling-0.3.3 → scrapling-0.3.5/scrapling.egg-info}/PKG-INFO +18 -17
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling.egg-info/requires.txt +4 -4
- {scrapling-0.3.3 → scrapling-0.3.5}/setup.cfg +1 -1
- scrapling-0.3.3/scrapling/engines/_browsers/_validators.py +0 -164
- {scrapling-0.3.3 → scrapling-0.3.5}/LICENSE +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/MANIFEST.in +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/__init__.py +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/_html_utils.py +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/_types.py +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/ai.py +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/mixins.py +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/storage.py +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/translator.py +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/utils/__init__.py +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/utils/_shell.py +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/utils/_utils.py +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/_browsers/__init__.py +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/_browsers/_config_tools.py +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/constants.py +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/__init__.py +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/convertor.py +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/custom.py +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/fingerprints.py +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/fetchers.py +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/py.typed +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling.egg-info/SOURCES.txt +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling.egg-info/entry_points.txt +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.3.3 → scrapling-0.3.5}/scrapling.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scrapling
-Version: 0.3.3
+Version: 0.3.5
 Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -69,15 +69,15 @@ Requires-Dist: cssselect>=1.3.0
 Requires-Dist: orjson>=3.11.3
 Requires-Dist: tldextract>=5.3.0
 Provides-Extra: fetchers
-Requires-Dist: click>=8.…
+Requires-Dist: click>=8.3.0; extra == "fetchers"
 Requires-Dist: curl_cffi>=0.13.0; extra == "fetchers"
-Requires-Dist: playwright>=1.…
-Requires-Dist: …
+Requires-Dist: playwright>=1.55.0; extra == "fetchers"
+Requires-Dist: patchright>=1.55.2; extra == "fetchers"
 Requires-Dist: camoufox>=0.4.11; extra == "fetchers"
 Requires-Dist: geoip2>=5.1.0; extra == "fetchers"
 Requires-Dist: msgspec>=0.19.0; extra == "fetchers"
 Provides-Extra: ai
-Requires-Dist: mcp>=1.14.…
+Requires-Dist: mcp>=1.14.1; extra == "ai"
 Requires-Dist: markdownify>=1.2.0; extra == "ai"
 Requires-Dist: scrapling[fetchers]; extra == "ai"
 Provides-Extra: shell
@@ -114,14 +114,6 @@ Dynamic: license-file
 </p>

 <p align="center">
-<a href="https://scrapling.readthedocs.io/en/latest/#installation">
-Installation
-</a>
-·
-<a href="https://scrapling.readthedocs.io/en/latest/overview/">
-Overview
-</a>
-·
 <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
 Selection methods
 </a>
@@ -130,6 +122,14 @@ Dynamic: license-file
 Choosing a fetcher
 </a>
 ·
+<a href="https://scrapling.readthedocs.io/en/latest/cli/overview/">
+CLI
+</a>
+·
+<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
+MCP mode
+</a>
+·
 <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
 Migrating from Beautifulsoup
 </a>
@@ -157,11 +157,13 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet

 <!-- sponsors -->

+<a href="https://www.thordata.com/?ls=github&lk=D4Vinci" target="_blank" title="A global network of over 60M+ residential proxies with 99.7% availability, ensuring stable and reliable web data scraping to support AI, BI, and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
+<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
 <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
-<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
+<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>

 <!-- /sponsors -->

@@ -410,10 +412,9 @@ This project includes code adapted from:
 ## Thanks and References

 - [Daijro](https://github.com/daijro)'s brilliant work on [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
-- [Vinyzu](https://github.com/Vinyzu)'s work on [Botright](https://github.com/Vinyzu/Botright)
+- [Vinyzu](https://github.com/Vinyzu)'s brilliant work on [Botright](https://github.com/Vinyzu/Botright) and [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
 - [brotector](https://github.com/kaliiiiiiiiii/brotector) for browser detection bypass techniques
-- [fakebrowser](https://github.com/kkoooqq/fakebrowser) for fingerprinting research
-- [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches) for stealth improvements
+- [fakebrowser](https://github.com/kkoooqq/fakebrowser) and [BotBrowser](https://github.com/botswin/BotBrowser) for fingerprinting research

 ---
 <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
README.md
@@ -24,14 +24,6 @@
 </p>

 <p align="center">
-<a href="https://scrapling.readthedocs.io/en/latest/#installation">
-Installation
-</a>
-·
-<a href="https://scrapling.readthedocs.io/en/latest/overview/">
-Overview
-</a>
-·
 <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
 Selection methods
 </a>
@@ -40,6 +32,14 @@
 Choosing a fetcher
 </a>
 ·
+<a href="https://scrapling.readthedocs.io/en/latest/cli/overview/">
+CLI
+</a>
+·
+<a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
+MCP mode
+</a>
+·
 <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
 Migrating from Beautifulsoup
 </a>
@@ -67,11 +67,13 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet

 <!-- sponsors -->

+<a href="https://www.thordata.com/?ls=github&lk=D4Vinci" target="_blank" title="A global network of over 60M+ residential proxies with 99.7% availability, ensuring stable and reliable web data scraping to support AI, BI, and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
+<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
 <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
-<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
+<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>

 <!-- /sponsors -->

@@ -320,10 +322,9 @@ This project includes code adapted from:
 ## Thanks and References

 - [Daijro](https://github.com/daijro)'s brilliant work on [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
-- [Vinyzu](https://github.com/Vinyzu)'s work on [Botright](https://github.com/Vinyzu/Botright)
+- [Vinyzu](https://github.com/Vinyzu)'s brilliant work on [Botright](https://github.com/Vinyzu/Botright) and [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
 - [brotector](https://github.com/kaliiiiiiiiii/brotector) for browser detection bypass techniques
-- [fakebrowser](https://github.com/kkoooqq/fakebrowser) for fingerprinting research
-- [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches) for stealth improvements
+- [fakebrowser](https://github.com/kkoooqq/fakebrowser) and [BotBrowser](https://github.com/botswin/BotBrowser) for fingerprinting research

 ---
 <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
pyproject.toml
@@ -64,16 +64,16 @@ dependencies = [

 [project.optional-dependencies]
 fetchers = [
-    "click>=8.…
+    "click>=8.3.0",
     "curl_cffi>=0.13.0",
-    "playwright>=1.…
-    "…
+    "playwright>=1.55.0",
+    "patchright>=1.55.2",
     "camoufox>=0.4.11",
     "geoip2>=5.1.0",
     "msgspec>=0.19.0",
 ]
 ai = [
-    "mcp>=1.14.…
+    "mcp>=1.14.1",
     "markdownify>=1.2.0",
     "scrapling[fetchers]",
 ]
scrapling/cli.py
@@ -32,8 +32,8 @@ def __ParseJSONData(json_string: Optional[str] = None) -> Optional[Dict[str, Any

     try:
         return json_loads(json_string)
-    except JSONDecodeError as …
-        raise ValueError(f"Invalid JSON data '{json_string}': {…
+    except JSONDecodeError as err:  # pragma: no cover
+        raise ValueError(f"Invalid JSON data '{json_string}': {err}")


 def __Request_and_Save(
@@ -65,8 +65,8 @@ def __ParseExtractArguments(
     for key, value in _CookieParser(cookies):
         try:
             parsed_cookies[key] = value
-        except Exception as …
-            raise ValueError(f"Could not parse cookies '{cookies}': {…
+        except Exception as err:
+            raise ValueError(f"Could not parse cookies '{cookies}': {err}")

     parsed_json = __ParseJSONData(json)
     parsed_params = {}
scrapling/core/custom_types.py
@@ -145,7 +145,7 @@ class TextHandler(str):
         clean_match: bool = False,
         case_sensitive: bool = True,
         check_match: Literal[False] = False,
-    ) -> "TextHandlers…
+    ) -> "TextHandlers": ...

     def re(
         self,
@@ -241,7 +241,7 @@ class TextHandlers(List[TextHandler]):
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
-    ) -> "TextHandlers…
+    ) -> "TextHandlers":
         """Call the ``.re()`` method for each element in this list and return
         their results flattened as TextHandlers.

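The corrected annotations above state that `.re()` returns a `TextHandlers` collection. A minimal sketch of what that means in practice, assuming the first positional argument of `.re()` is the regex pattern (that parameter is not visible in this hunk, so treat it as an assumption):

```python
from scrapling.core.custom_types import TextHandler

# Illustrative only: the pattern argument and the printed output are assumptions,
# not taken from this diff.
text = TextHandler("Price: 10 USD, was 15 USD")
matches = text.re(r"\d+")       # returns a TextHandlers list per the fixed annotation
print(type(matches).__name__)   # TextHandlers
print(list(matches))            # e.g. ['10', '15']
```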
scrapling/core/shell.py
@@ -201,7 +201,7 @@ class CurlParser:
            data_payload = parsed_args.data_binary  # Fallback to string

        elif parsed_args.data_raw is not None:
-           data_payload = parsed_args.data_raw
+           data_payload = parsed_args.data_raw.lstrip("$")

        elif parsed_args.data is not None:
            data_payload = parsed_args.data
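The `.lstrip("$")` above handles curl commands copied from browser devtools, which often wrap the payload in ANSI-C `$'...'` quoting. A minimal sketch of the problem, assuming a POSIX-style tokenizer such as `shlex` (the actual `CurlParser` internals are not shown in this diff):

```python
import shlex

# ANSI-C quoting keeps its leading "$" after POSIX tokenization, so the raw
# payload would otherwise start with a stray "$".
cmd = """curl 'https://httpbin.org/post' --data-raw $'{"query": "scrapling"}'"""
tokens = shlex.split(cmd)
raw = tokens[tokens.index("--data-raw") + 1]
print(raw)              # ${"query": "scrapling"}
print(raw.lstrip("$"))  # {"query": "scrapling"}
```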
@@ -318,7 +318,7 @@ def show_page_in_browser(page: Selector):  # pragma: no cover
    try:
        fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
        with open(fd, "w", encoding=page.encoding) as f:
-           f.write(page.…
+           f.write(page.html_content)

        open_in_browser(f"file://{fname}")
    except IOError as e:
@@ -335,15 +335,25 @@ class CustomShell:
        from scrapling.fetchers import (
            Fetcher as __Fetcher,
            AsyncFetcher as __AsyncFetcher,
+           FetcherSession as __FetcherSession,
            DynamicFetcher as __DynamicFetcher,
+           DynamicSession as __DynamicSession,
+           AsyncDynamicSession as __AsyncDynamicSession,
            StealthyFetcher as __StealthyFetcher,
+           StealthySession as __StealthySession,
+           AsyncStealthySession as __AsyncStealthySession,
        )

        self.__InteractiveShellEmbed = __InteractiveShellEmbed
        self.__Fetcher = __Fetcher
        self.__AsyncFetcher = __AsyncFetcher
+       self.__FetcherSession = __FetcherSession
        self.__DynamicFetcher = __DynamicFetcher
+       self.__DynamicSession = __DynamicSession
+       self.__AsyncDynamicSession = __AsyncDynamicSession
        self.__StealthyFetcher = __StealthyFetcher
+       self.__StealthySession = __StealthySession
+       self.__AsyncStealthySession = __AsyncStealthySession
        self.code = code
        self.page = None
        self.pages = Selectors([])
@@ -379,9 +389,9 @@ class CustomShell:
        """Create a custom banner for the shell"""
        return f"""
-> Available Scrapling objects:
-   - Fetcher/AsyncFetcher
-   - DynamicFetcher
-   - StealthyFetcher
+   - Fetcher/AsyncFetcher/FetcherSession
+   - DynamicFetcher/DynamicSession/AsyncDynamicSession
+   - StealthyFetcher/StealthySession/AsyncStealthySession
    - Selector

-> Useful shortcuts:
@@ -449,6 +459,11 @@ Type 'exit' or press Ctrl+D to exit.
            "delete": delete,
            "Fetcher": self.__Fetcher,
            "AsyncFetcher": self.__AsyncFetcher,
+           "FetcherSession": self.__FetcherSession,
+           "DynamicSession": self.__DynamicSession,
+           "AsyncDynamicSession": self.__AsyncDynamicSession,
+           "StealthySession": self.__StealthySession,
+           "AsyncStealthySession": self.__AsyncStealthySession,
            "fetch": dynamic_fetch,
            "DynamicFetcher": self.__DynamicFetcher,
            "stealthy_fetch": stealthy_fetch,
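With the session classes now exposed in the shell namespace above, they can be used directly alongside the one-off fetchers in `scrapling shell`. A rough sketch of how that might look; the constructor keywords, context-manager support, and the `fetch()`/selection calls here are assumptions, not confirmed by this diff:

```python
# Hypothetical usage inside the interactive shell (names come from the
# namespace dict above; headless/max_pages kwargs and the Response API are
# assumed, not verified against the library).
with StealthySession(headless=True, max_pages=2) as session:
    page = session.fetch("https://example.com")
    print(page.status)              # HTTP status of the final response
    print(page.css("title::text"))  # selector API assumed from Scrapling's parser
```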
@@ -530,7 +545,7 @@ class Convertor:
        for page in pages:
            match extraction_type:
                case "markdown":
-                   yield cls._convert_to_markdown(page.…
+                   yield cls._convert_to_markdown(page.html_content)
                case "html":
                    yield page.body
                case "text":
scrapling/engines/_browsers/_base.py
@@ -1,4 +1,4 @@
-from time import time…
+from time import time
 from asyncio import sleep as asyncio_sleep, Lock

 from camoufox import DefaultAddons
@@ -31,7 +31,7 @@ class SyncSession:
    def __init__(self, max_pages: int = 1):
        self.max_pages = max_pages
        self.page_pool = PagePool(max_pages)
-       self.…
+       self._max_wait_for_page = 60
        self.playwright: Optional[Playwright] = None
        self.context: Optional[BrowserContext] = None
        self._closed = False
@@ -44,23 +44,7 @@ class SyncSession:
    ) -> PageInfo:  # pragma: no cover
        """Get a new page to use"""

-       # …
-       self.page_pool.close_all_finished_pages()
-
-       # If we're at max capacity after cleanup, wait for busy pages to finish
-       if self.page_pool.pages_count >= self.max_pages:
-           start_time = time()
-           while time() - start_time < self.__max_wait_for_page:
-               # Wait for any pages to finish, then clean them up
-               sleep(0.05)
-               self.page_pool.close_all_finished_pages()
-               if self.page_pool.pages_count < self.max_pages:
-                   break
-           else:
-               raise TimeoutError(
-                   f"No pages finished to clear place in the pool within the {self.__max_wait_for_page}s timeout period"
-               )
-
+       # No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc.
        page = self.context.new_page()
        page.set_default_navigation_timeout(timeout)
        page.set_default_timeout(timeout)
@@ -76,11 +60,6 @@ class SyncSession:

        return self.page_pool.add_page(page)

-   @staticmethod
-   def _get_with_precedence(request_value: Any, session_value: Any, sentinel_value: object) -> Any:
-       """Get value with request-level priority over session-level"""
-       return request_value if request_value is not sentinel_value else session_value
-
    def get_pool_stats(self) -> Dict[str, int]:
        """Get statistics about the current page pool"""
        return {
@@ -105,21 +84,16 @@ class AsyncSession(SyncSession):
    ) -> PageInfo:  # pragma: no cover
        """Get a new page to use"""
        async with self._lock:
-           # Close all finished pages to ensure clean state
-           await self.page_pool.aclose_all_finished_pages()
-
            # If we're at max capacity after cleanup, wait for busy pages to finish
            if self.page_pool.pages_count >= self.max_pages:
                start_time = time()
-               while time() - start_time < self.…
-                   # Wait for any pages to finish, then clean them up
+               while time() - start_time < self._max_wait_for_page:
                    await asyncio_sleep(0.05)
-                   await self.page_pool.aclose_all_finished_pages()
                    if self.page_pool.pages_count < self.max_pages:
                        break
                else:
                    raise TimeoutError(
-                       f"No pages finished to clear place in the pool within the {self.…
+                       f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
                    )

            page = await self.context.new_page()
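The page pool above is what bounds how many tabs a session keeps open: when `pages_count` reaches `max_pages`, the async path polls every 50 ms for up to `_max_wait_for_page` seconds before raising `TimeoutError`. A rough sketch of the behaviour this enables, assuming `AsyncStealthySession` accepts `max_pages`, works as an async context manager, and returns responses with a `.status` attribute (none of which is shown in this hunk):

```python
import asyncio
from scrapling.fetchers import AsyncStealthySession

async def main():
    # Assumed constructor/usage: with max_pages=2, the third fetch waits for a
    # free slot in the page pool instead of opening a third tab.
    async with AsyncStealthySession(max_pages=2) as session:
        results = await asyncio.gather(
            session.fetch("https://example.com/a"),
            session.fetch("https://example.com/b"),
            session.fetch("https://example.com/c"),
        )
        print([r.status for r in results])

asyncio.run(main())
```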
scrapling/engines/_browsers/_camoufox.py
@@ -14,8 +14,9 @@ from playwright.async_api import (
    Locator as AsyncLocator,
    Page as async_Page,
 )
+from playwright._impl._errors import Error as PlaywrightError

-from ._validators import …
+from ._validators import validate_fetch as _validate
 from ._base import SyncSession, AsyncSession, StealthySessionMixin
 from scrapling.core.utils import log
 from scrapling.core._types import (
@@ -201,20 +202,34 @@ class StealthySession(StealthySessionMixin, SyncSession):

        self._closed = True

+   @staticmethod
+   def _get_page_content(page: Page) -> str | None:
+       """
+       A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+       :param page: The page to extract content from.
+       :return:
+       """
+       while True:
+           try:
+               return page.content() or ""
+           except PlaywrightError:
+               page.wait_for_timeout(1000)
+               continue
+
    def _solve_cloudflare(self, page: Page) -> None:  # pragma: no cover
        """Solve the cloudflare challenge displayed on the playwright page passed

        :param page: The targeted page
        :return:
        """
-       challenge_type = self._detect_cloudflare(…
+       challenge_type = self._detect_cloudflare(self._get_page_content(page))
        if not challenge_type:
            log.error("No Cloudflare challenge found.")
            return
        else:
            log.info(f'The turnstile version discovered is "{challenge_type}"')
            if challenge_type == "non-interactive":
-               while "<title>Just a moment...</title>" in (…
+               while "<title>Just a moment...</title>" in (self._get_page_content(page)):
                    log.info("Waiting for Cloudflare wait page to disappear.")
                    page.wait_for_timeout(1000)
                    page.wait_for_load_state()
@@ -222,7 +237,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
                return

            else:
-               while "Verifying you are human." in …
+               while "Verifying you are human." in self._get_page_content(page):
                    # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
                    page.wait_for_timeout(500)

@@ -282,23 +297,22 @@ class StealthySession(StealthySessionMixin, SyncSession):
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :return: A `Response` object.
        """
-       … (16 removed lines not preserved in this rendering)
-           CamoufoxConfig,
+       params = _validate(
+           [
+               ("google_search", google_search, self.google_search),
+               ("timeout", timeout, self.timeout),
+               ("wait", wait, self.wait),
+               ("page_action", page_action, self.page_action),
+               ("extra_headers", extra_headers, self.extra_headers),
+               ("disable_resources", disable_resources, self.disable_resources),
+               ("wait_selector", wait_selector, self.wait_selector),
+               ("wait_selector_state", wait_selector_state, self.wait_selector_state),
+               ("network_idle", network_idle, self.network_idle),
+               ("load_dom", load_dom, self.load_dom),
+               ("solve_cloudflare", solve_cloudflare, self.solve_cloudflare),
+               ("selector_config", selector_config, self.selector_config),
+           ],
+           _UNSET,
        )

        if self._closed:  # pragma: no cover
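Each triple passed to `_validate` pairs a request-level argument with its session-level default, with `_UNSET` acting as the "not passed" sentinel; this replaces the `_get_with_precedence` helper removed from `_base.py`. A minimal sketch of the precedence rule only; the real `validate_fetch` in the new `_validators.py` also performs validation that is not reproduced here:

```python
# Hypothetical, simplified resolution: the request value wins unless it is the sentinel.
_UNSET = object()

def resolve(params, sentinel=_UNSET):
    return {
        name: session_value if request_value is sentinel else request_value
        for name, request_value, session_value in params
    }

print(resolve([
    ("timeout", _UNSET, 30000),  # not passed per request -> session default wins
    ("wait", 500, 0),            # passed per request -> overrides the session value
]))
# {'timeout': 30000, 'wait': 500}
```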
@@ -366,8 +380,9 @@ class StealthySession(StealthySessionMixin, SyncSession):
            page_info.page, first_response, final_response, params.selector_config
        )

-       # …
-       page_info.…
+       # Close the page, to free up resources
+       page_info.page.close()
+       self.page_pool.pages.remove(page_info)

        return response

@@ -506,20 +521,34 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):

        self._closed = True

+   @staticmethod
+   async def _get_page_content(page: async_Page) -> str | None:
+       """
+       A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+       :param page: The page to extract content from.
+       :return:
+       """
+       while True:
+           try:
+               return (await page.content()) or ""
+           except PlaywrightError:
+               await page.wait_for_timeout(1000)
+               continue
+
    async def _solve_cloudflare(self, page: async_Page):
        """Solve the cloudflare challenge displayed on the playwright page passed. The async version

        :param page: The async targeted page
        :return:
        """
-       challenge_type = self._detect_cloudflare(await …
+       challenge_type = self._detect_cloudflare(await self._get_page_content(page))
        if not challenge_type:
            log.error("No Cloudflare challenge found.")
            return
        else:
            log.info(f'The turnstile version discovered is "{challenge_type}"')
            if challenge_type == "non-interactive":  # pragma: no cover
-               while "<title>Just a moment...</title>" in (await …
+               while "<title>Just a moment...</title>" in (await self._get_page_content(page)):
                    log.info("Waiting for Cloudflare wait page to disappear.")
                    await page.wait_for_timeout(1000)
                    await page.wait_for_load_state()
@@ -527,7 +556,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
                return

            else:
-               while "Verifying you are human." in (await …
+               while "Verifying you are human." in (await self._get_page_content(page)):
                    # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
                    await page.wait_for_timeout(500)

@@ -587,22 +616,22 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        :return: A `Response` object.
        """
-       params = …
-       … (15 removed lines not preserved in this rendering)
+       params = _validate(
+           [
+               ("google_search", google_search, self.google_search),
+               ("timeout", timeout, self.timeout),
+               ("wait", wait, self.wait),
+               ("page_action", page_action, self.page_action),
+               ("extra_headers", extra_headers, self.extra_headers),
+               ("disable_resources", disable_resources, self.disable_resources),
+               ("wait_selector", wait_selector, self.wait_selector),
+               ("wait_selector_state", wait_selector_state, self.wait_selector_state),
+               ("network_idle", network_idle, self.network_idle),
+               ("load_dom", load_dom, self.load_dom),
+               ("solve_cloudflare", solve_cloudflare, self.solve_cloudflare),
+               ("selector_config", selector_config, self.selector_config),
+           ],
+           _UNSET,
        )

        if self._closed:  # pragma: no cover
@@ -672,8 +701,9 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
            page_info.page, first_response, final_response, params.selector_config
        )

-       # …
-       page_info.…
+       # Close the page, to free up resources
+       await page_info.page.close()
+       self.page_pool.pages.remove(page_info)

        return response
