scrapling 0.3.7__py3-none-any.whl → 0.3.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/engines/_browsers/_base.py +140 -9
- scrapling/engines/_browsers/_camoufox.py +47 -164
- scrapling/engines/_browsers/_config_tools.py +8 -2
- scrapling/engines/_browsers/_controllers.py +25 -96
- scrapling/engines/_browsers/_validators.py +72 -61
- scrapling/engines/toolbelt/convertor.py +37 -2
- scrapling/engines/toolbelt/custom.py +0 -12
- scrapling/engines/toolbelt/fingerprints.py +6 -8
- scrapling/fetchers/chrome.py +6 -0
- {scrapling-0.3.7.dist-info → scrapling-0.3.8.dist-info}/METADATA +6 -4
- {scrapling-0.3.7.dist-info → scrapling-0.3.8.dist-info}/RECORD +16 -16
- {scrapling-0.3.7.dist-info → scrapling-0.3.8.dist-info}/WHEEL +0 -0
- {scrapling-0.3.7.dist-info → scrapling-0.3.8.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.7.dist-info → scrapling-0.3.8.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.7.dist-info → scrapling-0.3.8.dist-info}/top_level.txt +0 -0
scrapling/engines/_browsers/_controllers.py

```diff
@@ -1,23 +1,20 @@
 from playwright.sync_api import (
-    Response as SyncPlaywrightResponse,
-    sync_playwright,
-    Playwright,
     Locator,
+    Playwright,
+    sync_playwright,
 )
 from playwright.async_api import (
     async_playwright,
-    Response as AsyncPlaywrightResponse,
-    BrowserContext as AsyncBrowserContext,
-    Playwright as AsyncPlaywright,
     Locator as AsyncLocator,
-
+    Playwright as AsyncPlaywright,
+    BrowserContext as AsyncBrowserContext,
 )
 from patchright.sync_api import sync_playwright as sync_patchright
 from patchright.async_api import async_playwright as async_patchright

 from scrapling.core.utils import log
 from ._base import SyncSession, AsyncSession, DynamicSessionMixin
-from ._validators import validate_fetch as _validate
+from ._validators import validate_fetch as _validate, PlaywrightConfig
 from scrapling.core._types import (
     Any,
     Dict,
```
```diff
@@ -98,6 +95,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
         user_data_dir: str = "",
+        extra_flags: Optional[List[str]] = None,
         selector_config: Optional[Dict] = None,
         additional_args: Optional[Dict] = None,
     ):
@@ -127,6 +125,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         """
@@ -152,6 +151,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            extra_flags=extra_flags,
             selector_config=selector_config,
             additional_args=additional_args,
             disable_resources=disable_resources,
```
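Both constructors gain an `extra_flags` parameter that is stored on the session config and, per its docstring, passed to the browser at launch. A hypothetical usage sketch — the `scrapling.fetchers` import path and the Chromium flag values are assumptions for illustration, not taken from this diff:

```python
from scrapling.fetchers import DynamicSession  # assumed public export

# extra_flags (new in 0.3.8) appends extra CLI switches to the browser launch,
# on top of the flags Scrapling already sets itself.
with DynamicSession(extra_flags=["--disable-gpu", "--mute-audio"]) as session:
    page = session.fetch("https://example.com")
```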
```diff
@@ -178,28 +178,6 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         if self.cookies: # pragma: no cover
             self.context.add_cookies(self.cookies)

-    def __enter__(self):
-        self.__create__()
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.close()
-
-    def close(self): # pragma: no cover
-        """Close all resources"""
-        if self._closed:
-            return
-
-        if self.context:
-            self.context.close()
-            self.context = None
-
-        if self.playwright:
-            self.playwright.stop()
-            self.playwright = None # pyright: ignore
-
-        self._closed = True
-
     def fetch(
         self,
         url: str,
```
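The `__enter__`/`__exit__`/`close` plumbing disappears from `DynamicSession` here (and from `AsyncDynamicSession` below); since `_base.py` grows by 140 lines in the same release, the lifecycle code was presumably hoisted into the shared `SyncSession`/`AsyncSession` bases. A sketch of the sync side, reconstructed from the deleted lines — its placement in `_base.py` is an inference, not confirmed by this diff:

```python
class SyncSession:  # assumed new home of the deleted methods
    def __enter__(self):
        self.__create__()  # builds self.playwright / self.context (defined elsewhere)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        """Close all resources, idempotently."""
        if self._closed:
            return
        if self.context:
            self.context.close()
            self.context = None
        if self.playwright:
            self.playwright.stop()
            self.playwright = None
        self._closed = True
```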
```diff
@@ -247,38 +225,26 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
                 ("load_dom", load_dom, self.load_dom),
                 ("selector_config", selector_config, self.selector_config),
             ],
+            PlaywrightConfig,
            _UNSET,
        )

        if self._closed: # pragma: no cover
            raise RuntimeError("Context manager has been closed")

-        final_response = None
        referer = (
            generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
        )

-        def handle_response(finished_response: SyncPlaywrightResponse):
-            nonlocal final_response
-            if (
-                finished_response.request.resource_type == "document"
-                and finished_response.request.is_navigation_request()
-                and finished_response.request.frame == page_info.page.main_frame
-            ):
-                final_response = finished_response
-
        page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
-
+        final_response = [None]
+        handle_response = self._create_response_handler(page_info, final_response)

        try: # pragma: no cover
            # Navigate to URL and wait for a specified state
            page_info.page.on("response", handle_response)
            first_response = page_info.page.goto(url, referer=referer)
-
-            page_info.page.wait_for_load_state(state="domcontentloaded")
-
-            if params.network_idle:
-                page_info.page.wait_for_load_state("networkidle")
+            self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)

            if not first_response:
                raise RuntimeError(f"Failed to get response for {url}")
```
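The `handle_response` closure, which needed `nonlocal` to rebind `final_response`, is replaced by a one-element list that a handler built by the new `_create_response_handler` mutates in place. The factory body below is inferred from the deleted closure and the `final_response[0]` read at the call site; it is not copied from the 0.3.8 source of `_base.py`:

```python
def _create_response_handler(self, page_info, final_response):
    """Build a "response" event callback that records the main document response."""

    def handle_response(finished_response):
        if (
            finished_response.request.resource_type == "document"
            and finished_response.request.is_navigation_request()
            and finished_response.request.frame == page_info.page.main_frame
        ):
            final_response[0] = finished_response  # mutate the holder; no nonlocal

    return handle_response
```

Because the callback only assigns into the list, the same synchronous factory can presumably serve both the sync and async sessions (the old async path had to define a separate `async def` closure).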
```diff
@@ -294,11 +260,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
                waiter: Locator = page_info.page.locator(params.wait_selector)
                waiter.first.wait_for(state=params.wait_selector_state)
                # Wait again after waiting for the selector, helpful with protections like Cloudflare
-                page_info.page.
-                if params.load_dom:
-                    page_info.page.wait_for_load_state(state="domcontentloaded")
-                if params.network_idle:
-                    page_info.page.wait_for_load_state("networkidle")
+                self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
            except Exception as e: # pragma: no cover
                log.error(f"Error waiting for selector {params.wait_selector}: {e}")

```
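`_wait_for_page_stability` now stands in for the same wait sequence in four places: after `goto` and after the selector wait, in both the sync and async sessions. Reconstructed from the deleted inline waits, the sync helper plausibly reduces to the following (a sketch; the real helper lives in `_base.py` and may differ in details):

```python
def _wait_for_page_stability(self, page, load_dom: bool, network_idle: bool) -> None:
    # One home for the repeated post-navigation waits.
    if load_dom:
        page.wait_for_load_state(state="domcontentloaded")
    if network_idle:
        page.wait_for_load_state("networkidle")
```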
```diff
@@ -306,7 +268,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):

        # Create response object
        response = ResponseFactory.from_playwright_response(
-            page_info.page, first_response, final_response, params.selector_config
+            page_info.page, first_response, final_response[0], params.selector_config, bool(params.page_action)
        )

        # Close the page to free up resources
```
```diff
@@ -348,6 +310,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
         user_data_dir: str = "",
+        extra_flags: Optional[List[str]] = None,
         selector_config: Optional[Dict] = None,
         additional_args: Optional[Dict] = None,
     ):
@@ -378,6 +341,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
         :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         """
@@ -404,6 +368,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            extra_flags=extra_flags,
             selector_config=selector_config,
             additional_args=additional_args,
             disable_resources=disable_resources,
@@ -431,28 +396,6 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         if self.cookies:
             await self.context.add_cookies(self.cookies) # pyright: ignore

-    async def __aenter__(self):
-        await self.__create__()
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        await self.close()
-
-    async def close(self):
-        """Close all resources"""
-        if self._closed: # pragma: no cover
-            return
-
-        if self.context:
-            await self.context.close()
-            self.context = None # pyright: ignore
-
-        if self.playwright:
-            await self.playwright.stop()
-            self.playwright = None # pyright: ignore
-
-        self._closed = True
-
     async def fetch(
         self,
         url: str,
@@ -500,30 +443,24 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
                ("load_dom", load_dom, self.load_dom),
                ("selector_config", selector_config, self.selector_config),
            ],
+            PlaywrightConfig,
            _UNSET,
        )

        if self._closed: # pragma: no cover
            raise RuntimeError("Context manager has been closed")

-        final_response = None
        referer = (
            generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
        )

-        async def handle_response(finished_response: AsyncPlaywrightResponse):
-            nonlocal final_response
-            if (
-                finished_response.request.resource_type == "document"
-                and finished_response.request.is_navigation_request()
-                and finished_response.request.frame == page_info.page.main_frame
-            ):
-                final_response = finished_response
-
        page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
-
+        final_response = [None]
+        handle_response = self._create_response_handler(page_info, final_response)

        if TYPE_CHECKING:
+            from playwright.async_api import Page as async_Page
+
            if not isinstance(page_info.page, async_Page):
                raise TypeError
@@ -531,11 +468,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
            # Navigate to URL and wait for a specified state
            page_info.page.on("response", handle_response)
            first_response = await page_info.page.goto(url, referer=referer)
-
-            await page_info.page.wait_for_load_state(state="domcontentloaded")
-
-            if params.network_idle:
-                await page_info.page.wait_for_load_state("networkidle")
+            await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)

            if not first_response:
                raise RuntimeError(f"Failed to get response for {url}")
@@ -551,11 +484,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
                waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
                await waiter.first.wait_for(state=params.wait_selector_state)
                # Wait again after waiting for the selector, helpful with protections like Cloudflare
-                await page_info.page.
-                if self.load_dom:
-                    await page_info.page.wait_for_load_state(state="domcontentloaded")
-                if params.network_idle:
-                    await page_info.page.wait_for_load_state("networkidle")
+                await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
            except Exception as e:
                log.error(f"Error waiting for selector {params.wait_selector}: {e}")

@@ -563,7 +492,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):

        # Create response object
        response = await ResponseFactory.from_async_playwright_response(
-            page_info.page, first_response, final_response, params.selector_config
+            page_info.page, first_response, final_response[0], params.selector_config, bool(params.page_action)
        )

        # Close the page to free up resources
```
scrapling/engines/_browsers/_validators.py

```diff
@@ -1,7 +1,8 @@
 from pathlib import Path
 from typing import Annotated
-from
+from functools import lru_cache
 from urllib.parse import urlparse
+from dataclasses import dataclass, fields

 from msgspec import Struct, Meta, convert, ValidationError

```
```diff
@@ -19,18 +20,20 @@ from scrapling.engines.toolbelt.navigation import construct_proxy_dict


 # Custom validators for msgspec
-
+@lru_cache(8)
+def _is_invalid_file_path(value: str) -> bool | str:
     """Fast file path validation"""
     path = Path(value)
     if not path.exists():
-
+        return f"Init script path not found: {value}"
     if not path.is_file():
-
+        return f"Init script is not a file: {value}"
     if not path.is_absolute():
-
+        return f"Init script is not a absolute path: {value}"
+    return False


-def _validate_addon_path(value: str):
+def _validate_addon_path(value: str) -> None:
     """Fast addon path validation"""
     path = Path(value)
     if not path.exists():
@@ -39,22 +42,16 @@ def _validate_addon_path(value: str):
        raise ValueError(f"Addon path must be a directory of the extracted addon: {value}")


-
+@lru_cache(2)
+def _is_invalid_cdp_url(cdp_url: str) -> bool | str:
     """Fast CDP URL validation"""
-
-
-    if not cdp_url.startswith(("ws://", "wss://")):
-        raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
-
-    # Validate hostname and port
-    if not urlparse(cdp_url).netloc:
-        raise ValueError("Invalid hostname for the CDP URL")
+    if not cdp_url.startswith(("ws://", "wss://")):
+        return "CDP URL must use 'ws://' or 'wss://' scheme"

-
-
-
-
-        raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(e)}")
+    netloc = urlparse(cdp_url).netloc
+    if not netloc:
+        return "Invalid hostname for the CDP URL"
+    return False


 # Type aliases for cleaner annotations
```
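The path and CDP validators flip from raising `ValueError` to returning an error message (or `False`), which is what lets the new `@lru_cache` decorators help: `functools.lru_cache` caches only values a function returns, never exceptions it raises, so a raising validator would redo its filesystem or URL checks on every repeated call. A standalone demonstration of the pattern with a toy validator (not the package's):

```python
from functools import lru_cache

calls = 0

@lru_cache(8)
def is_invalid(value: str) -> bool | str:
    global calls
    calls += 1
    return f"empty value: {value!r}" if not value else False

msg = is_invalid("")
if msg:  # the caller raises at the call site, as __post_init__ now does
    print(msg)
is_invalid("")  # second check is served from the cache
assert calls == 1
```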
```diff
@@ -62,7 +59,7 @@ PagesCount = Annotated[int, Meta(ge=1, le=50)]
 Seconds = Annotated[int, float, Meta(ge=0)]


-class PlaywrightConfig(Struct, kw_only=True, frozen=False):
+class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
     """Configuration struct for validation"""

     max_pages: PagesCount = 1
@@ -88,6 +85,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
     load_dom: bool = True
     wait_selector_state: SelectorWaitStates = "attached"
     user_data_dir: str = ""
+    extra_flags: Optional[List[str]] = None
     selector_config: Optional[Dict] = {}
     additional_args: Optional[Dict] = {}

```
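Both config Structs also gain `weakref=True`. msgspec Structs are slots-based and do not support weak references unless this is opted into, so the change allows instances to be targets of `weakref.ref` (useful, for example, for caches that should not keep configs alive). A minimal check:

```python
import weakref

from msgspec import Struct


class Cfg(Struct, weakref=True):  # without weakref=True, weakref.ref raises TypeError
    max_pages: int = 1


cfg = Cfg()
assert weakref.ref(cfg)() is cfg
```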
```diff
@@ -98,20 +96,26 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
        if self.proxy:
            self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
        if self.cdp_url:
-
+            cdp_msg = _is_invalid_cdp_url(self.cdp_url)
+            if cdp_msg:
+                raise ValueError(cdp_msg)

        if not self.cookies:
            self.cookies = []
+        if not self.extra_flags:
+            self.extra_flags = []
        if not self.selector_config:
            self.selector_config = {}
        if not self.additional_args:
            self.additional_args = {}

        if self.init_script is not None:
-
+            validation_msg = _is_invalid_file_path(self.init_script)
+            if validation_msg:
+                raise ValueError(validation_msg)


-class CamoufoxConfig(Struct, kw_only=True, frozen=False):
+class CamoufoxConfig(Struct, kw_only=True, frozen=False, weakref=True):
    """Configuration struct for validation"""

    max_pages: PagesCount = 1
@@ -149,14 +153,16 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
        if self.proxy:
            self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)

-        if self.addons
+        if self.addons:
            for addon in self.addons:
                _validate_addon_path(addon)
        else:
            self.addons = []

        if self.init_script is not None:
-
+            validation_msg = _is_invalid_file_path(self.init_script)
+            if validation_msg:
+                raise ValueError(validation_msg)

        if not self.cookies:
            self.cookies = []
@@ -169,27 +175,6 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
            self.additional_args = {}


-# Code parts to validate `fetch` in the least possible numbers of lines overall
-class FetchConfig(Struct, kw_only=True):
-    """Configuration struct for `fetch` calls validation"""
-
-    google_search: bool = True
-    timeout: Seconds = 30000
-    wait: Seconds = 0
-    page_action: Optional[Callable] = None
-    extra_headers: Optional[Dict[str, str]] = None
-    disable_resources: bool = False
-    wait_selector: Optional[str] = None
-    wait_selector_state: SelectorWaitStates = "attached"
-    network_idle: bool = False
-    load_dom: bool = True
-    solve_cloudflare: bool = False
-    selector_config: Dict = {}
-
-    def to_dict(self):
-        return {f: getattr(self, f) for f in self.__struct_fields__}
-
-
 @dataclass
 class _fetch_params:
     """A dataclass of all parameters used by `fetch` calls"""
```
```diff
@@ -208,7 +193,9 @@ class _fetch_params:
     selector_config: Dict


-def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
+def validate_fetch(
+    params: List[Tuple], model: type[PlaywrightConfig] | type[CamoufoxConfig], sentinel=None
+) -> _fetch_params:
     result = {}
     overrides = {}

@@ -219,32 +206,56 @@ def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
            result[arg] = session_value

    if overrides:
-
-
-
+        validated_config = validate(overrides, model)
+        # Extract only the fields that _fetch_params needs from validated_config
+        validated_dict = {
+            f.name: getattr(validated_config, f.name)
+            for f in fields(_fetch_params)
+            if hasattr(validated_config, f.name)
+        }
+        # solve_cloudflare defaults to False for models that don't have it (PlaywrightConfig)
+        validated_dict.setdefault("solve_cloudflare", False)

-
-
+        validated_dict.update(result)
+        return _fetch_params(**validated_dict)
+
+    result.setdefault("solve_cloudflare", False)

    return _fetch_params(**result)


-
-
+# Cache default values for each model to reduce validation overhead
+models_default_values = {}
+
+for _model in (CamoufoxConfig, PlaywrightConfig):
+    _defaults = {}
+    if hasattr(_model, "__struct_defaults__") and hasattr(_model, "__struct_fields__"):
+        for field_name, default_value in zip(_model.__struct_fields__, _model.__struct_defaults__):  # type: ignore
+            # Skip factory defaults - these are msgspec._core.Factory instances
+            if type(default_value).__name__ != "Factory":
+                _defaults[field_name] = default_value
+
+    models_default_values[_model.__name__] = _defaults.copy()
+
+
+def _filter_defaults(params: Dict, model: str) -> Dict:
+    """Filter out parameters that match their default values to reduce validation overhead."""
+    defaults = models_default_values[model]
+    return {k: v for k, v in params.items() if k not in defaults or v != defaults[k]}


 @overload
-def validate(params: Dict, model: type[
+def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...


 @overload
-def validate(params: Dict, model: type[
+def validate(params: Dict, model: type[CamoufoxConfig]) -> CamoufoxConfig: ...


-def validate(
-    params: Dict, model: type[PlaywrightConfig] | type[CamoufoxConfig] | type[FetchConfig]
-) -> PlaywrightConfig | CamoufoxConfig | FetchConfig:
+def validate(params: Dict, model: type[PlaywrightConfig] | type[CamoufoxConfig]) -> PlaywrightConfig | CamoufoxConfig:
    try:
-
+        # Filter out params with the default values (no need to validate them) to speed up validation
+        filtered = _filter_defaults(params, model.__name__)
+        return convert(filtered, model)
    except ValidationError as e:
        raise TypeError(f"Invalid argument type: {e}") from e
```
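The module now precomputes each config Struct's non-factory defaults once at import time so that `validate` can drop every parameter still equal to its default before calling `msgspec.convert`, shrinking the validation work per `fetch` call. The mechanism in isolation, using a toy Struct where every field has a default (the zip of `__struct_fields__` with `__struct_defaults__` relies on that):

```python
from msgspec import Struct, convert


class Toy(Struct, kw_only=True):
    retries: int = 3
    timeout: float = 30.0


# Align field names with their defaults (valid here because all fields have one).
defaults = dict(zip(Toy.__struct_fields__, Toy.__struct_defaults__))

params = {"retries": 3, "timeout": 5.0}  # retries matches its default
filtered = {k: v for k, v in params.items() if k not in defaults or v != defaults[k]}
assert filtered == {"timeout": 5.0}
print(convert(filtered, Toy))  # Toy(retries=3, timeout=5.0) -- defaults fill the gaps
```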
scrapling/engines/toolbelt/convertor.py

```diff
@@ -2,6 +2,7 @@ from functools import lru_cache
 from re import compile as re_compile

 from curl_cffi.requests import Response as CurlResponse
+from playwright._impl._errors import Error as PlaywrightError
 from playwright.sync_api import Page as SyncPage, Response as SyncResponse
 from playwright.async_api import Page as AsyncPage, Response as AsyncResponse

@@ -84,6 +85,7 @@ class ResponseFactory:
         first_response: SyncResponse,
         final_response: Optional[SyncResponse],
         parser_arguments: Dict,
+        automated_page: bool = False,
     ) -> Response:
         """
         Transforms a Playwright response into an internal `Response` object, encapsulating
@@ -99,6 +101,7 @@ class ResponseFactory:
         :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
         :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
             the `Response` object.
+        :param automated_page: If True, it means the `page_action` argument was being used, so the response retrieving method changes to use Playwright's page instead of the final response.

         :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
         :rtype: Response
@@ -114,7 +117,7 @@ class ResponseFactory:

        history = cls._process_response_history(first_response, parser_arguments)
        try:
-            page_content = final_response.text()
+            page_content = final_response.text() if not automated_page else cls._get_page_content(page)
        except Exception as e:  # pragma: no cover
            log.error(f"Error getting page content: {e}")
            page_content = ""
@@ -179,6 +182,36 @@ class ResponseFactory:

        return history

+    @classmethod
+    def _get_page_content(cls, page: SyncPage) -> str:
+        """
+        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        :param page: The page to extract content from.
+        :return:
+        """
+        while True:
+            try:
+                return page.content() or ""
+            except PlaywrightError:
+                page.wait_for_timeout(500)
+                continue
+        return ""  # pyright: ignore
+
+    @classmethod
+    async def _get_async_page_content(cls, page: AsyncPage) -> str:
+        """
+        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        :param page: The page to extract content from.
+        :return:
+        """
+        while True:
+            try:
+                return (await page.content()) or ""
+            except PlaywrightError:
+                await page.wait_for_timeout(500)
+                continue
+        return ""  # pyright: ignore
+
     @classmethod
     async def from_async_playwright_response(
         cls,
@@ -186,6 +219,7 @@ class ResponseFactory:
         first_response: AsyncResponse,
         final_response: Optional[AsyncResponse],
         parser_arguments: Dict,
+        automated_page: bool = False,
     ) -> Response:
         """
         Transforms a Playwright response into an internal `Response` object, encapsulating
@@ -201,6 +235,7 @@ class ResponseFactory:
         :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
         :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
             the `Response` object.
+        :param automated_page: If True, it means the `page_action` argument was being used, so the response retrieving method changes to use Playwright's page instead of the final response.

         :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
         :rtype: Response
@@ -216,7 +251,7 @@ class ResponseFactory:

        history = await cls._async_process_response_history(first_response, parser_arguments)
        try:
-            page_content = await final_response.text()
+            page_content = await (final_response.text() if not automated_page else cls._get_async_page_content(page))
        except Exception as e:  # pragma: no cover
            log.error(f"Error getting page content in async: {e}")
            page_content = ""
```
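The two new `_get_page_content` helpers work around Playwright's "unable to retrieve content because the page is navigating" failure (microsoft/playwright#16108): `page.content()` raises if a navigation is in flight, so the factory now waits 500 ms and retries instead of surfacing the error. The same pattern using Playwright's public error alias (the diff imports the private `playwright._impl._errors` path; `playwright.sync_api.Error` is the supported spelling):

```python
from playwright.sync_api import Error as PlaywrightError, Page


def content_with_retry(page: Page) -> str:
    while True:
        try:
            return page.content() or ""
        except PlaywrightError:
            page.wait_for_timeout(500)  # navigation in flight; try again shortly
```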
scrapling/engines/toolbelt/custom.py

```diff
@@ -209,15 +209,3 @@ class StatusText:
     def get(cls, status_code: int) -> str:
         """Get the phrase for a given HTTP status code."""
         return cls._phrases.get(status_code, "Unknown Status Code")
-
-
-def get_variable_name(var: Any) -> Optional[str]:
-    """Get the name of a variable using global and local scopes.
-    :param var: The variable to find the name for
-    :return: The name of the variable if found, None otherwise
-    """
-    for scope in [globals(), locals()]:
-        for name, value in scope.items():
-            if value is var:
-                return name
-    return None
```
scrapling/engines/toolbelt/fingerprints.py

```diff
@@ -7,8 +7,9 @@ from platform import system as platform_system

 from tldextract import extract
 from browserforge.headers import Browser, HeaderGenerator
+from browserforge.headers.generator import SUPPORTED_OPERATING_SYSTEMS

-from scrapling.core._types import Dict, Literal
+from scrapling.core._types import Dict, Literal, Tuple

 __OS_NAME__ = platform_system()
 OSName = Literal["linux", "macos", "windows"]
@@ -29,12 +30,12 @@ def generate_convincing_referer(url: str) -> str:


 @lru_cache(1, typed=True)
-def get_os_name() -> OSName | None:
+def get_os_name() -> OSName | Tuple:
     """Get the current OS name in the same format needed for browserforge, if the OS is Unknown, return None so browserforge uses all.

     :return: Current OS name or `None` otherwise
     """
-    match __OS_NAME__:
+    match __OS_NAME__:  # pragma: no cover
         case "Linux":
             return "linux"
         case "Darwin":
@@ -42,7 +43,7 @@ def get_os_name() -> OSName | None:
         case "Windows":
             return "windows"
         case _:
-            return
+            return SUPPORTED_OPERATING_SYSTEMS
@@ -63,10 +64,7 @@ def generate_headers(browser_mode: bool = False) -> Dict:
            Browser(name="edge", min_version=130),
        ]
    )
-
-        return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()
-    else:
-        return HeaderGenerator(browser=browsers, device="desktop").generate()
+    return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()


 __default_useragent__ = generate_headers(browser_mode=False).get("User-Agent")
```
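The `get_os_name` change is what lets the `if/else` in `generate_headers` collapse: instead of returning `None` on unrecognized platforms (the docstring still describes the old behavior), it now returns browserforge's `SUPPORTED_OPERATING_SYSTEMS` tuple, and `HeaderGenerator` accepts either a single OS name or a sequence of candidates. A sketch of the resulting call (values illustrative):

```python
from browserforge.headers import Browser, HeaderGenerator
from browserforge.headers.generator import SUPPORTED_OPERATING_SYSTEMS

os_name = "linux"  # or SUPPORTED_OPERATING_SYSTEMS on an unknown platform
headers = HeaderGenerator(
    browser=[Browser(name="chrome", min_version=130)],
    os=os_name,
    device="desktop",
).generate()
print(headers.get("User-Agent"))
```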