scrapling 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/cli.py +38 -51
- scrapling/core/_html_utils.py +3 -9
- scrapling/core/ai.py +5 -13
- scrapling/core/custom_types.py +19 -61
- scrapling/core/mixins.py +6 -28
- scrapling/core/shell.py +49 -127
- scrapling/core/storage.py +2 -8
- scrapling/core/translator.py +8 -20
- scrapling/core/utils/__init__.py +10 -0
- scrapling/core/utils/_shell.py +48 -0
- scrapling/core/{utils.py → utils/_utils.py} +5 -21
- scrapling/engines/__init__.py +0 -16
- scrapling/engines/_browsers/_base.py +297 -0
- scrapling/engines/_browsers/_camoufox.py +219 -296
- scrapling/engines/_browsers/_config_tools.py +2 -1
- scrapling/engines/_browsers/_controllers.py +201 -281
- scrapling/engines/_browsers/_page.py +37 -15
- scrapling/engines/_browsers/_validators.py +9 -15
- scrapling/engines/constants.py +3 -6
- scrapling/engines/static.py +25 -75
- scrapling/engines/toolbelt/__init__.py +1 -20
- scrapling/engines/toolbelt/convertor.py +95 -86
- scrapling/engines/toolbelt/custom.py +7 -99
- scrapling/engines/toolbelt/fingerprints.py +1 -3
- scrapling/engines/toolbelt/navigation.py +4 -58
- scrapling/fetchers.py +29 -24
- scrapling/parser.py +45 -122
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/METADATA +54 -46
- scrapling-0.3.3.dist-info/RECORD +44 -0
- scrapling-0.3.1.dist-info/RECORD +0 -41
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/WHEEL +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,297 @@
|
|
1
|
+
from time import time, sleep
|
2
|
+
from asyncio import sleep as asyncio_sleep, Lock
|
3
|
+
|
4
|
+
from camoufox import DefaultAddons
|
5
|
+
from playwright.sync_api import BrowserContext, Playwright
|
6
|
+
from playwright.async_api import (
|
7
|
+
BrowserContext as AsyncBrowserContext,
|
8
|
+
Playwright as AsyncPlaywright,
|
9
|
+
)
|
10
|
+
from camoufox.utils import (
|
11
|
+
launch_options as generate_launch_options,
|
12
|
+
installed_verstr as camoufox_version,
|
13
|
+
)
|
14
|
+
|
15
|
+
from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route
|
16
|
+
from scrapling.core._types import (
|
17
|
+
Any,
|
18
|
+
Dict,
|
19
|
+
Optional,
|
20
|
+
)
|
21
|
+
from ._page import PageInfo, PagePool
|
22
|
+
from ._config_tools import _compiled_stealth_scripts
|
23
|
+
from ._config_tools import _launch_kwargs, _context_kwargs
|
24
|
+
from scrapling.engines.toolbelt.fingerprints import get_os_name
|
25
|
+
from ._validators import validate, PlaywrightConfig, CamoufoxConfig
|
26
|
+
|
27
|
+
__ff_version_str__ = camoufox_version().split(".", 1)[0]  # First dotted component of the installed Camoufox version string — presumably the Firefox major version passed to launch options; TODO confirm against camoufox docs
|
28
|
+
|
29
|
+
|
30
|
+
class SyncSession:
    """Holds a synchronous Playwright browser context together with a bounded page pool.

    Pages are recycled through `PagePool`; when the pool is saturated, acquisition
    polls until a busy page finishes or a fixed wait budget is exhausted.
    """

    def __init__(self, max_pages: int = 1):
        self.max_pages = max_pages
        self.page_pool = PagePool(max_pages)
        self.__max_wait_for_page = 60  # seconds to wait for a free slot before giving up
        self.playwright: Optional[Playwright] = None
        self.context: Optional[BrowserContext] = None
        self._closed = False

    def _get_page(
        self,
        timeout: int | float,
        extra_headers: Optional[Dict[str, str]],
        disable_resources: bool,
    ) -> PageInfo:  # pragma: no cover
        """Get a new page to use"""

        # Drop every page that already finished so the pool count reflects reality
        self.page_pool.close_all_finished_pages()

        # Pool saturated even after cleanup? Poll until a busy page frees a slot.
        if self.page_pool.pages_count >= self.max_pages:
            deadline = time() + self.__max_wait_for_page
            while True:
                if time() >= deadline:
                    raise TimeoutError(
                        f"No pages finished to clear place in the pool within the {self.__max_wait_for_page}s timeout period"
                    )
                sleep(0.05)
                self.page_pool.close_all_finished_pages()
                if self.page_pool.pages_count < self.max_pages:
                    break

        page = self.context.new_page()
        page.set_default_navigation_timeout(timeout)
        page.set_default_timeout(timeout)
        if extra_headers:
            page.set_extra_http_headers(extra_headers)

        if disable_resources:
            page.route("**/*", intercept_route)

        # `stealth` is only set by some subclasses/mixins, hence the defensive getattr
        if getattr(self, "stealth", False):
            for script in _compiled_stealth_scripts():
                page.add_init_script(script=script)

        return self.page_pool.add_page(page)

    @staticmethod
    def _get_with_precedence(request_value: Any, session_value: Any, sentinel_value: object) -> Any:
        """Get value with request-level priority over session-level"""
        if request_value is sentinel_value:
            return session_value
        return request_value

    def get_pool_stats(self) -> Dict[str, int]:
        """Get statistics about the current page pool"""
        return dict(
            total_pages=self.page_pool.pages_count,
            busy_pages=self.page_pool.busy_count,
            max_pages=self.max_pages,
        )
|
91
|
+
|
92
|
+
|
93
|
+
class AsyncSession(SyncSession):
    """Async counterpart of `SyncSession`: an async Playwright context plus a page pool.

    Reuses the pool bookkeeping from `SyncSession`, but guards page acquisition with
    an asyncio `Lock` so concurrent coroutines cannot over-fill the pool.
    """

    def __init__(self, max_pages: int = 1):
        super().__init__(max_pages)
        # BUGFIX: `__max_wait_for_page` is name-mangled per *defining* class. The value set
        # by `SyncSession.__init__` is stored as `_SyncSession__max_wait_for_page`, while
        # the references in `_get_page` below (compiled inside this class) resolve to
        # `_AsyncSession__max_wait_for_page`. Without this assignment the pool-full wait
        # loop raised AttributeError instead of polling and timing out.
        self.__max_wait_for_page = 60
        self.playwright: Optional[AsyncPlaywright] = None
        self.context: Optional[AsyncBrowserContext] = None
        self._lock = Lock()

    async def _get_page(
        self,
        timeout: int | float,
        extra_headers: Optional[Dict[str, str]],
        disable_resources: bool,
    ) -> PageInfo:  # pragma: no cover
        """Get a new page to use"""
        async with self._lock:
            # Close all finished pages to ensure clean state
            await self.page_pool.aclose_all_finished_pages()

            # If we're at max capacity after cleanup, wait for busy pages to finish
            if self.page_pool.pages_count >= self.max_pages:
                start_time = time()
                while time() - start_time < self.__max_wait_for_page:
                    # Wait for any pages to finish, then clean them up
                    await asyncio_sleep(0.05)
                    await self.page_pool.aclose_all_finished_pages()
                    if self.page_pool.pages_count < self.max_pages:
                        break
                else:
                    raise TimeoutError(
                        f"No pages finished to clear place in the pool within the {self.__max_wait_for_page}s timeout period"
                    )

            page = await self.context.new_page()
            page.set_default_navigation_timeout(timeout)
            page.set_default_timeout(timeout)
            if extra_headers:
                await page.set_extra_http_headers(extra_headers)

            if disable_resources:
                await page.route("**/*", async_intercept_route)

            # `stealth` is only set by some subclasses/mixins, hence the defensive getattr
            if getattr(self, "stealth", False):
                for script in _compiled_stealth_scripts():
                    await page.add_init_script(script=script)

            return self.page_pool.add_page(page)
|
139
|
+
|
140
|
+
|
141
|
+
class DynamicSessionMixin:
    """Shared configuration plumbing for Playwright-based (dynamic) sessions."""

    def __validate__(self, **params):
        """Validate session parameters against `PlaywrightConfig` and copy them onto `self`."""
        config = validate(params, model=PlaywrightConfig)

        # Mirror every validated option onto the instance under the same name
        for option in (
            "max_pages",
            "headless",
            "hide_canvas",
            "disable_webgl",
            "real_chrome",
            "stealth",
            "google_search",
            "wait",
            "proxy",
            "locale",
            "extra_headers",
            "useragent",
            "timeout",
            "cookies",
            "disable_resources",
            "cdp_url",
            "network_idle",
            "load_dom",
            "wait_selector",
            "init_script",
            "wait_selector_state",
            "selector_config",
            "page_action",
        ):
            setattr(self, option, getattr(config, option))

        # Lower-cased header names, used for case-insensitive header lookups elsewhere
        self._headers_keys = {header.lower() for header in self.extra_headers} if self.extra_headers else set()
        self.__initiate_browser_options__()

    def __initiate_browser_options__(self):
        """Build `launch_options`/`context_options` from the validated configuration."""
        header_items = tuple(self.extra_headers.items()) if self.extra_headers else tuple()
        if self.cdp_url:
            # CDP mode: the browser is remote, so only context options matter
            self.launch_options = dict()
            self.context_options = dict(
                _context_kwargs(
                    self.proxy,
                    self.locale,
                    header_items,
                    self.useragent,
                    self.stealth,
                )
            )
            self.context_options["extra_http_headers"] = dict(self.context_options["extra_http_headers"])
            self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
        else:
            # Persistent-context mode: everything goes into the launch options
            self.launch_options = dict(
                _launch_kwargs(
                    self.headless,
                    self.proxy,
                    self.locale,
                    header_items,
                    self.useragent,
                    self.real_chrome,
                    self.stealth,
                    self.hide_canvas,
                    self.disable_webgl,
                )
            )
            self.launch_options["extra_http_headers"] = dict(self.launch_options["extra_http_headers"])
            self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
            self.context_options = dict()
|
204
|
+
|
205
|
+
|
206
|
+
class StealthySessionMixin:
|
207
|
+
def __validate__(self, **params):
|
208
|
+
config = validate(params, model=CamoufoxConfig)
|
209
|
+
|
210
|
+
self.max_pages = config.max_pages
|
211
|
+
self.headless = config.headless
|
212
|
+
self.block_images = config.block_images
|
213
|
+
self.disable_resources = config.disable_resources
|
214
|
+
self.block_webrtc = config.block_webrtc
|
215
|
+
self.allow_webgl = config.allow_webgl
|
216
|
+
self.network_idle = config.network_idle
|
217
|
+
self.load_dom = config.load_dom
|
218
|
+
self.humanize = config.humanize
|
219
|
+
self.solve_cloudflare = config.solve_cloudflare
|
220
|
+
self.wait = config.wait
|
221
|
+
self.timeout = config.timeout
|
222
|
+
self.page_action = config.page_action
|
223
|
+
self.wait_selector = config.wait_selector
|
224
|
+
self.init_script = config.init_script
|
225
|
+
self.addons = config.addons
|
226
|
+
self.wait_selector_state = config.wait_selector_state
|
227
|
+
self.cookies = config.cookies
|
228
|
+
self.google_search = config.google_search
|
229
|
+
self.extra_headers = config.extra_headers
|
230
|
+
self.proxy = config.proxy
|
231
|
+
self.os_randomize = config.os_randomize
|
232
|
+
self.disable_ads = config.disable_ads
|
233
|
+
self.geoip = config.geoip
|
234
|
+
self.selector_config = config.selector_config
|
235
|
+
self.additional_args = config.additional_args
|
236
|
+
self.page_action = config.page_action
|
237
|
+
self._headers_keys = set(map(str.lower, self.extra_headers.keys())) if self.extra_headers else set()
|
238
|
+
self.__initiate_browser_options__()
|
239
|
+
|
240
|
+
def __initiate_browser_options__(self):
|
241
|
+
"""Initiate browser options."""
|
242
|
+
self.launch_options = generate_launch_options(
|
243
|
+
**{
|
244
|
+
"geoip": self.geoip,
|
245
|
+
"proxy": dict(self.proxy) if self.proxy else self.proxy,
|
246
|
+
"addons": self.addons,
|
247
|
+
"exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
|
248
|
+
"headless": self.headless,
|
249
|
+
"humanize": True if self.solve_cloudflare else self.humanize,
|
250
|
+
"i_know_what_im_doing": True, # To turn warnings off with the user configurations
|
251
|
+
"allow_webgl": self.allow_webgl,
|
252
|
+
"block_webrtc": self.block_webrtc,
|
253
|
+
"block_images": self.block_images, # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
|
254
|
+
"os": None if self.os_randomize else get_os_name(),
|
255
|
+
"user_data_dir": "",
|
256
|
+
"ff_version": __ff_version_str__,
|
257
|
+
"firefox_user_prefs": {
|
258
|
+
# This is what enabling `enable_cache` does internally, so we do it from here instead
|
259
|
+
"browser.sessionhistory.max_entries": 10,
|
260
|
+
"browser.sessionhistory.max_total_viewers": -1,
|
261
|
+
"browser.cache.memory.enable": True,
|
262
|
+
"browser.cache.disk_cache_ssl": True,
|
263
|
+
"browser.cache.disk.smart_size.enabled": True,
|
264
|
+
},
|
265
|
+
**self.additional_args,
|
266
|
+
}
|
267
|
+
)
|
268
|
+
|
269
|
+
@staticmethod
|
270
|
+
def _detect_cloudflare(page_content: str) -> str | None:
|
271
|
+
"""
|
272
|
+
Detect the type of Cloudflare challenge present in the provided page content.
|
273
|
+
|
274
|
+
This function analyzes the given page content to identify whether a specific
|
275
|
+
type of Cloudflare challenge is present. It checks for three predefined
|
276
|
+
challenge types: non-interactive, managed, and interactive. If a challenge
|
277
|
+
type is detected, it returns the corresponding type as a string. If no
|
278
|
+
challenge type is detected, it returns None.
|
279
|
+
|
280
|
+
Args:
|
281
|
+
page_content (str): The content of the page to analyze for Cloudflare
|
282
|
+
challenge types.
|
283
|
+
|
284
|
+
Returns:
|
285
|
+
str: A string representing the detected Cloudflare challenge type, if
|
286
|
+
found. Returns None if no challenge matches.
|
287
|
+
"""
|
288
|
+
challenge_types = (
|
289
|
+
"non-interactive",
|
290
|
+
"managed",
|
291
|
+
"interactive",
|
292
|
+
)
|
293
|
+
for ctype in challenge_types:
|
294
|
+
if f"cType: '{ctype}'" in page_content:
|
295
|
+
return ctype
|
296
|
+
|
297
|
+
return None
|