scrapling 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/core/_types.py +3 -0
- scrapling/core/ai.py +2 -1
- scrapling/core/custom_types.py +20 -27
- scrapling/core/mixins.py +15 -9
- scrapling/core/shell.py +4 -3
- scrapling/core/storage.py +5 -5
- scrapling/core/translator.py +13 -8
- scrapling/engines/_browsers/_base.py +175 -21
- scrapling/engines/_browsers/_camoufox.py +95 -171
- scrapling/engines/_browsers/_config_tools.py +9 -3
- scrapling/engines/_browsers/_controllers.py +51 -101
- scrapling/engines/_browsers/_validators.py +95 -63
- scrapling/engines/static.py +678 -668
- scrapling/engines/toolbelt/convertor.py +48 -15
- scrapling/engines/toolbelt/custom.py +6 -21
- scrapling/engines/toolbelt/fingerprints.py +14 -9
- scrapling/engines/toolbelt/navigation.py +11 -3
- scrapling/fetchers/__init__.py +11 -1
- scrapling/fetchers/chrome.py +15 -4
- scrapling/fetchers/firefox.py +0 -4
- scrapling/parser.py +105 -80
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/METADATA +7 -6
- scrapling-0.3.8.dist-info/RECORD +47 -0
- scrapling-0.3.6.dist-info/RECORD +0 -47
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/WHEEL +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.8.dist-info}/top_level.txt +0 -0
|
@@ -1,27 +1,27 @@
|
|
|
1
1
|
from playwright.sync_api import (
|
|
2
|
-
Response as SyncPlaywrightResponse,
|
|
3
|
-
sync_playwright,
|
|
4
|
-
Playwright,
|
|
5
2
|
Locator,
|
|
3
|
+
Playwright,
|
|
4
|
+
sync_playwright,
|
|
6
5
|
)
|
|
7
6
|
from playwright.async_api import (
|
|
8
7
|
async_playwright,
|
|
9
|
-
Response as AsyncPlaywrightResponse,
|
|
10
|
-
BrowserContext as AsyncBrowserContext,
|
|
11
|
-
Playwright as AsyncPlaywright,
|
|
12
8
|
Locator as AsyncLocator,
|
|
9
|
+
Playwright as AsyncPlaywright,
|
|
10
|
+
BrowserContext as AsyncBrowserContext,
|
|
13
11
|
)
|
|
14
12
|
from patchright.sync_api import sync_playwright as sync_patchright
|
|
15
13
|
from patchright.async_api import async_playwright as async_patchright
|
|
16
14
|
|
|
17
15
|
from scrapling.core.utils import log
|
|
18
16
|
from ._base import SyncSession, AsyncSession, DynamicSessionMixin
|
|
19
|
-
from ._validators import validate_fetch as _validate
|
|
17
|
+
from ._validators import validate_fetch as _validate, PlaywrightConfig
|
|
20
18
|
from scrapling.core._types import (
|
|
19
|
+
Any,
|
|
21
20
|
Dict,
|
|
22
21
|
List,
|
|
23
22
|
Optional,
|
|
24
23
|
Callable,
|
|
24
|
+
TYPE_CHECKING,
|
|
25
25
|
SelectorWaitStates,
|
|
26
26
|
)
|
|
27
27
|
from scrapling.engines.toolbelt.convertor import (
|
|
@@ -30,7 +30,7 @@ from scrapling.engines.toolbelt.convertor import (
|
|
|
30
30
|
)
|
|
31
31
|
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
|
32
32
|
|
|
33
|
-
_UNSET = object()
|
|
33
|
+
_UNSET: Any = object()
|
|
34
34
|
|
|
35
35
|
|
|
36
36
|
class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
@@ -94,7 +94,10 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
|
94
94
|
network_idle: bool = False,
|
|
95
95
|
load_dom: bool = True,
|
|
96
96
|
wait_selector_state: SelectorWaitStates = "attached",
|
|
97
|
+
user_data_dir: str = "",
|
|
98
|
+
extra_flags: Optional[List[str]] = None,
|
|
97
99
|
selector_config: Optional[Dict] = None,
|
|
100
|
+
additional_args: Optional[Dict] = None,
|
|
98
101
|
):
|
|
99
102
|
"""A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.
|
|
100
103
|
|
|
@@ -121,7 +124,10 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
|
121
124
|
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
|
122
125
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
|
123
126
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
|
127
|
+
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
|
128
|
+
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
|
124
129
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
|
130
|
+
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
|
125
131
|
"""
|
|
126
132
|
self.__validate__(
|
|
127
133
|
wait=wait,
|
|
@@ -140,11 +146,14 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
|
140
146
|
hide_canvas=hide_canvas,
|
|
141
147
|
init_script=init_script,
|
|
142
148
|
network_idle=network_idle,
|
|
149
|
+
user_data_dir=user_data_dir,
|
|
143
150
|
google_search=google_search,
|
|
144
151
|
extra_headers=extra_headers,
|
|
145
152
|
wait_selector=wait_selector,
|
|
146
153
|
disable_webgl=disable_webgl,
|
|
154
|
+
extra_flags=extra_flags,
|
|
147
155
|
selector_config=selector_config,
|
|
156
|
+
additional_args=additional_args,
|
|
148
157
|
disable_resources=disable_resources,
|
|
149
158
|
wait_selector_state=wait_selector_state,
|
|
150
159
|
)
|
|
@@ -154,14 +163,14 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
|
154
163
|
"""Create a browser for this instance and context."""
|
|
155
164
|
sync_context = sync_patchright if self.stealth else sync_playwright
|
|
156
165
|
|
|
157
|
-
self.playwright: Playwright = sync_context().start()
|
|
166
|
+
self.playwright: Playwright = sync_context().start() # pyright: ignore [reportAttributeAccessIssue]
|
|
158
167
|
|
|
159
168
|
if self.cdp_url: # pragma: no cover
|
|
160
169
|
self.context = self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url).new_context(
|
|
161
170
|
**self.context_options
|
|
162
171
|
)
|
|
163
172
|
else:
|
|
164
|
-
self.context = self.playwright.chromium.launch_persistent_context(
|
|
173
|
+
self.context = self.playwright.chromium.launch_persistent_context(**self.launch_options)
|
|
165
174
|
|
|
166
175
|
if self.init_script: # pragma: no cover
|
|
167
176
|
self.context.add_init_script(path=self.init_script)
|
|
@@ -169,28 +178,6 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
|
169
178
|
if self.cookies: # pragma: no cover
|
|
170
179
|
self.context.add_cookies(self.cookies)
|
|
171
180
|
|
|
172
|
-
def __enter__(self):
|
|
173
|
-
self.__create__()
|
|
174
|
-
return self
|
|
175
|
-
|
|
176
|
-
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
177
|
-
self.close()
|
|
178
|
-
|
|
179
|
-
def close(self): # pragma: no cover
|
|
180
|
-
"""Close all resources"""
|
|
181
|
-
if self._closed:
|
|
182
|
-
return
|
|
183
|
-
|
|
184
|
-
if self.context:
|
|
185
|
-
self.context.close()
|
|
186
|
-
self.context = None
|
|
187
|
-
|
|
188
|
-
if self.playwright:
|
|
189
|
-
self.playwright.stop()
|
|
190
|
-
self.playwright = None
|
|
191
|
-
|
|
192
|
-
self._closed = True
|
|
193
|
-
|
|
194
181
|
def fetch(
|
|
195
182
|
self,
|
|
196
183
|
url: str,
|
|
@@ -238,37 +225,26 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
|
238
225
|
("load_dom", load_dom, self.load_dom),
|
|
239
226
|
("selector_config", selector_config, self.selector_config),
|
|
240
227
|
],
|
|
228
|
+
PlaywrightConfig,
|
|
241
229
|
_UNSET,
|
|
242
230
|
)
|
|
243
231
|
|
|
244
232
|
if self._closed: # pragma: no cover
|
|
245
233
|
raise RuntimeError("Context manager has been closed")
|
|
246
234
|
|
|
247
|
-
final_response = None
|
|
248
235
|
referer = (
|
|
249
236
|
generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
|
|
250
237
|
)
|
|
251
238
|
|
|
252
|
-
def handle_response(finished_response: SyncPlaywrightResponse):
|
|
253
|
-
nonlocal final_response
|
|
254
|
-
if (
|
|
255
|
-
finished_response.request.resource_type == "document"
|
|
256
|
-
and finished_response.request.is_navigation_request()
|
|
257
|
-
):
|
|
258
|
-
final_response = finished_response
|
|
259
|
-
|
|
260
239
|
page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
|
|
261
|
-
|
|
240
|
+
final_response = [None]
|
|
241
|
+
handle_response = self._create_response_handler(page_info, final_response)
|
|
262
242
|
|
|
263
243
|
try: # pragma: no cover
|
|
264
244
|
# Navigate to URL and wait for a specified state
|
|
265
245
|
page_info.page.on("response", handle_response)
|
|
266
246
|
first_response = page_info.page.goto(url, referer=referer)
|
|
267
|
-
|
|
268
|
-
page_info.page.wait_for_load_state(state="domcontentloaded")
|
|
269
|
-
|
|
270
|
-
if params.network_idle:
|
|
271
|
-
page_info.page.wait_for_load_state("networkidle")
|
|
247
|
+
self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
|
|
272
248
|
|
|
273
249
|
if not first_response:
|
|
274
250
|
raise RuntimeError(f"Failed to get response for {url}")
|
|
@@ -284,11 +260,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
|
284
260
|
waiter: Locator = page_info.page.locator(params.wait_selector)
|
|
285
261
|
waiter.first.wait_for(state=params.wait_selector_state)
|
|
286
262
|
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
|
287
|
-
page_info.page.
|
|
288
|
-
if params.load_dom:
|
|
289
|
-
page_info.page.wait_for_load_state(state="domcontentloaded")
|
|
290
|
-
if params.network_idle:
|
|
291
|
-
page_info.page.wait_for_load_state("networkidle")
|
|
263
|
+
self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
|
|
292
264
|
except Exception as e: # pragma: no cover
|
|
293
265
|
log.error(f"Error waiting for selector {params.wait_selector}: {e}")
|
|
294
266
|
|
|
@@ -296,10 +268,10 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
|
|
|
296
268
|
|
|
297
269
|
# Create response object
|
|
298
270
|
response = ResponseFactory.from_playwright_response(
|
|
299
|
-
page_info.page, first_response, final_response, params.selector_config
|
|
271
|
+
page_info.page, first_response, final_response[0], params.selector_config, bool(params.page_action)
|
|
300
272
|
)
|
|
301
273
|
|
|
302
|
-
# Close the page
|
|
274
|
+
# Close the page to free up resources
|
|
303
275
|
page_info.page.close()
|
|
304
276
|
self.page_pool.pages.remove(page_info)
|
|
305
277
|
|
|
@@ -337,7 +309,10 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
|
337
309
|
network_idle: bool = False,
|
|
338
310
|
load_dom: bool = True,
|
|
339
311
|
wait_selector_state: SelectorWaitStates = "attached",
|
|
312
|
+
user_data_dir: str = "",
|
|
313
|
+
extra_flags: Optional[List[str]] = None,
|
|
340
314
|
selector_config: Optional[Dict] = None,
|
|
315
|
+
additional_args: Optional[Dict] = None,
|
|
341
316
|
):
|
|
342
317
|
"""A Browser session manager with page pooling
|
|
343
318
|
|
|
@@ -365,7 +340,10 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
|
365
340
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
|
366
341
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
|
367
342
|
:param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
|
|
343
|
+
:param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
|
|
344
|
+
:param extra_flags: A list of additional browser flags to pass to the browser on launch.
|
|
368
345
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
|
346
|
+
:param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
|
|
369
347
|
"""
|
|
370
348
|
|
|
371
349
|
self.__validate__(
|
|
@@ -385,11 +363,14 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
|
385
363
|
hide_canvas=hide_canvas,
|
|
386
364
|
init_script=init_script,
|
|
387
365
|
network_idle=network_idle,
|
|
366
|
+
user_data_dir=user_data_dir,
|
|
388
367
|
google_search=google_search,
|
|
389
368
|
extra_headers=extra_headers,
|
|
390
369
|
wait_selector=wait_selector,
|
|
391
370
|
disable_webgl=disable_webgl,
|
|
371
|
+
extra_flags=extra_flags,
|
|
392
372
|
selector_config=selector_config,
|
|
373
|
+
additional_args=additional_args,
|
|
393
374
|
disable_resources=disable_resources,
|
|
394
375
|
wait_selector_state=wait_selector_state,
|
|
395
376
|
)
|
|
@@ -399,43 +380,21 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
|
399
380
|
"""Create a browser for this instance and context."""
|
|
400
381
|
async_context = async_patchright if self.stealth else async_playwright
|
|
401
382
|
|
|
402
|
-
self.playwright: AsyncPlaywright = await async_context().start()
|
|
383
|
+
self.playwright: AsyncPlaywright = await async_context().start() # pyright: ignore [reportAttributeAccessIssue]
|
|
403
384
|
|
|
404
385
|
if self.cdp_url:
|
|
405
386
|
browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url)
|
|
406
387
|
self.context: AsyncBrowserContext = await browser.new_context(**self.context_options)
|
|
407
388
|
else:
|
|
408
389
|
self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
|
|
409
|
-
|
|
390
|
+
**self.launch_options
|
|
410
391
|
)
|
|
411
392
|
|
|
412
393
|
if self.init_script: # pragma: no cover
|
|
413
394
|
await self.context.add_init_script(path=self.init_script)
|
|
414
395
|
|
|
415
396
|
if self.cookies:
|
|
416
|
-
await self.context.add_cookies(self.cookies)
|
|
417
|
-
|
|
418
|
-
async def __aenter__(self):
|
|
419
|
-
await self.__create__()
|
|
420
|
-
return self
|
|
421
|
-
|
|
422
|
-
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
423
|
-
await self.close()
|
|
424
|
-
|
|
425
|
-
async def close(self):
|
|
426
|
-
"""Close all resources"""
|
|
427
|
-
if self._closed: # pragma: no cover
|
|
428
|
-
return
|
|
429
|
-
|
|
430
|
-
if self.context:
|
|
431
|
-
await self.context.close()
|
|
432
|
-
self.context = None
|
|
433
|
-
|
|
434
|
-
if self.playwright:
|
|
435
|
-
await self.playwright.stop()
|
|
436
|
-
self.playwright = None
|
|
437
|
-
|
|
438
|
-
self._closed = True
|
|
397
|
+
await self.context.add_cookies(self.cookies) # pyright: ignore
|
|
439
398
|
|
|
440
399
|
async def fetch(
|
|
441
400
|
self,
|
|
@@ -484,37 +443,32 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
|
484
443
|
("load_dom", load_dom, self.load_dom),
|
|
485
444
|
("selector_config", selector_config, self.selector_config),
|
|
486
445
|
],
|
|
446
|
+
PlaywrightConfig,
|
|
487
447
|
_UNSET,
|
|
488
448
|
)
|
|
489
449
|
|
|
490
450
|
if self._closed: # pragma: no cover
|
|
491
451
|
raise RuntimeError("Context manager has been closed")
|
|
492
452
|
|
|
493
|
-
final_response = None
|
|
494
453
|
referer = (
|
|
495
454
|
generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
|
|
496
455
|
)
|
|
497
456
|
|
|
498
|
-
async def handle_response(finished_response: AsyncPlaywrightResponse):
|
|
499
|
-
nonlocal final_response
|
|
500
|
-
if (
|
|
501
|
-
finished_response.request.resource_type == "document"
|
|
502
|
-
and finished_response.request.is_navigation_request()
|
|
503
|
-
):
|
|
504
|
-
final_response = finished_response
|
|
505
|
-
|
|
506
457
|
page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
|
|
507
|
-
|
|
458
|
+
final_response = [None]
|
|
459
|
+
handle_response = self._create_response_handler(page_info, final_response)
|
|
460
|
+
|
|
461
|
+
if TYPE_CHECKING:
|
|
462
|
+
from playwright.async_api import Page as async_Page
|
|
463
|
+
|
|
464
|
+
if not isinstance(page_info.page, async_Page):
|
|
465
|
+
raise TypeError
|
|
508
466
|
|
|
509
467
|
try:
|
|
510
468
|
# Navigate to URL and wait for a specified state
|
|
511
469
|
page_info.page.on("response", handle_response)
|
|
512
470
|
first_response = await page_info.page.goto(url, referer=referer)
|
|
513
|
-
|
|
514
|
-
await page_info.page.wait_for_load_state(state="domcontentloaded")
|
|
515
|
-
|
|
516
|
-
if params.network_idle:
|
|
517
|
-
await page_info.page.wait_for_load_state("networkidle")
|
|
471
|
+
await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
|
|
518
472
|
|
|
519
473
|
if not first_response:
|
|
520
474
|
raise RuntimeError(f"Failed to get response for {url}")
|
|
@@ -530,11 +484,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
|
530
484
|
waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
|
|
531
485
|
await waiter.first.wait_for(state=params.wait_selector_state)
|
|
532
486
|
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
|
533
|
-
await page_info.page.
|
|
534
|
-
if self.load_dom:
|
|
535
|
-
await page_info.page.wait_for_load_state(state="domcontentloaded")
|
|
536
|
-
if params.network_idle:
|
|
537
|
-
await page_info.page.wait_for_load_state("networkidle")
|
|
487
|
+
await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
|
|
538
488
|
except Exception as e:
|
|
539
489
|
log.error(f"Error waiting for selector {params.wait_selector}: {e}")
|
|
540
490
|
|
|
@@ -542,10 +492,10 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
|
|
542
492
|
|
|
543
493
|
# Create response object
|
|
544
494
|
response = await ResponseFactory.from_async_playwright_response(
|
|
545
|
-
page_info.page, first_response, final_response, params.selector_config
|
|
495
|
+
page_info.page, first_response, final_response[0], params.selector_config, bool(params.page_action)
|
|
546
496
|
)
|
|
547
497
|
|
|
548
|
-
# Close the page
|
|
498
|
+
# Close the page to free up resources
|
|
549
499
|
await page_info.page.close()
|
|
550
500
|
self.page_pool.pages.remove(page_info)
|
|
551
501
|
return response
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
2
|
from typing import Annotated
|
|
3
|
-
from
|
|
3
|
+
from functools import lru_cache
|
|
4
4
|
from urllib.parse import urlparse
|
|
5
|
+
from dataclasses import dataclass, fields
|
|
5
6
|
|
|
6
7
|
from msgspec import Struct, Meta, convert, ValidationError
|
|
7
8
|
|
|
@@ -11,24 +12,28 @@ from scrapling.core._types import (
|
|
|
11
12
|
Tuple,
|
|
12
13
|
Optional,
|
|
13
14
|
Callable,
|
|
15
|
+
Iterable,
|
|
14
16
|
SelectorWaitStates,
|
|
17
|
+
overload,
|
|
15
18
|
)
|
|
16
19
|
from scrapling.engines.toolbelt.navigation import construct_proxy_dict
|
|
17
20
|
|
|
18
21
|
|
|
19
22
|
# Custom validators for msgspec
|
|
20
|
-
|
|
23
|
+
@lru_cache(8)
|
|
24
|
+
def _is_invalid_file_path(value: str) -> bool | str:
|
|
21
25
|
"""Fast file path validation"""
|
|
22
26
|
path = Path(value)
|
|
23
27
|
if not path.exists():
|
|
24
|
-
|
|
28
|
+
return f"Init script path not found: {value}"
|
|
25
29
|
if not path.is_file():
|
|
26
|
-
|
|
30
|
+
return f"Init script is not a file: {value}"
|
|
27
31
|
if not path.is_absolute():
|
|
28
|
-
|
|
32
|
+
return f"Init script is not a absolute path: {value}"
|
|
33
|
+
return False
|
|
29
34
|
|
|
30
35
|
|
|
31
|
-
def _validate_addon_path(value: str):
|
|
36
|
+
def _validate_addon_path(value: str) -> None:
|
|
32
37
|
"""Fast addon path validation"""
|
|
33
38
|
path = Path(value)
|
|
34
39
|
if not path.exists():
|
|
@@ -37,22 +42,16 @@ def _validate_addon_path(value: str):
|
|
|
37
42
|
raise ValueError(f"Addon path must be a directory of the extracted addon: {value}")
|
|
38
43
|
|
|
39
44
|
|
|
40
|
-
|
|
45
|
+
@lru_cache(2)
|
|
46
|
+
def _is_invalid_cdp_url(cdp_url: str) -> bool | str:
|
|
41
47
|
"""Fast CDP URL validation"""
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
if not cdp_url.startswith(("ws://", "wss://")):
|
|
45
|
-
raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
|
|
46
|
-
|
|
47
|
-
# Validate hostname and port
|
|
48
|
-
if not urlparse(cdp_url).netloc:
|
|
49
|
-
raise ValueError("Invalid hostname for the CDP URL")
|
|
50
|
-
|
|
51
|
-
except AttributeError as e:
|
|
52
|
-
raise ValueError(f"Malformed CDP URL: {cdp_url}: {str(e)}")
|
|
48
|
+
if not cdp_url.startswith(("ws://", "wss://")):
|
|
49
|
+
return "CDP URL must use 'ws://' or 'wss://' scheme"
|
|
53
50
|
|
|
54
|
-
|
|
55
|
-
|
|
51
|
+
netloc = urlparse(cdp_url).netloc
|
|
52
|
+
if not netloc:
|
|
53
|
+
return "Invalid hostname for the CDP URL"
|
|
54
|
+
return False
|
|
56
55
|
|
|
57
56
|
|
|
58
57
|
# Type aliases for cleaner annotations
|
|
@@ -60,7 +59,7 @@ PagesCount = Annotated[int, Meta(ge=1, le=50)]
|
|
|
60
59
|
Seconds = Annotated[int, float, Meta(ge=0)]
|
|
61
60
|
|
|
62
61
|
|
|
63
|
-
class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
|
62
|
+
class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
|
|
64
63
|
"""Configuration struct for validation"""
|
|
65
64
|
|
|
66
65
|
max_pages: PagesCount = 1
|
|
@@ -73,7 +72,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
|
|
73
72
|
stealth: bool = False
|
|
74
73
|
wait: Seconds = 0
|
|
75
74
|
page_action: Optional[Callable] = None
|
|
76
|
-
proxy: Optional[str | Dict[str, str]] = None # The default value for proxy in Playwright's source is `None`
|
|
75
|
+
proxy: Optional[str | Dict[str, str] | Tuple] = None # The default value for proxy in Playwright's source is `None`
|
|
77
76
|
locale: str = "en-US"
|
|
78
77
|
extra_headers: Optional[Dict[str, str]] = None
|
|
79
78
|
useragent: Optional[str] = None
|
|
@@ -81,11 +80,14 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
|
|
81
80
|
init_script: Optional[str] = None
|
|
82
81
|
disable_resources: bool = False
|
|
83
82
|
wait_selector: Optional[str] = None
|
|
84
|
-
cookies: Optional[
|
|
83
|
+
cookies: Optional[Iterable[Dict]] = None
|
|
85
84
|
network_idle: bool = False
|
|
86
85
|
load_dom: bool = True
|
|
87
86
|
wait_selector_state: SelectorWaitStates = "attached"
|
|
88
|
-
|
|
87
|
+
user_data_dir: str = ""
|
|
88
|
+
extra_flags: Optional[List[str]] = None
|
|
89
|
+
selector_config: Optional[Dict] = {}
|
|
90
|
+
additional_args: Optional[Dict] = {}
|
|
89
91
|
|
|
90
92
|
def __post_init__(self):
|
|
91
93
|
"""Custom validation after msgspec validation"""
|
|
@@ -94,18 +96,26 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
|
|
|
94
96
|
if self.proxy:
|
|
95
97
|
self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
|
|
96
98
|
if self.cdp_url:
|
|
97
|
-
|
|
99
|
+
cdp_msg = _is_invalid_cdp_url(self.cdp_url)
|
|
100
|
+
if cdp_msg:
|
|
101
|
+
raise ValueError(cdp_msg)
|
|
98
102
|
|
|
99
103
|
if not self.cookies:
|
|
100
104
|
self.cookies = []
|
|
105
|
+
if not self.extra_flags:
|
|
106
|
+
self.extra_flags = []
|
|
101
107
|
if not self.selector_config:
|
|
102
108
|
self.selector_config = {}
|
|
109
|
+
if not self.additional_args:
|
|
110
|
+
self.additional_args = {}
|
|
103
111
|
|
|
104
112
|
if self.init_script is not None:
|
|
105
|
-
|
|
113
|
+
validation_msg = _is_invalid_file_path(self.init_script)
|
|
114
|
+
if validation_msg:
|
|
115
|
+
raise ValueError(validation_msg)
|
|
106
116
|
|
|
107
117
|
|
|
108
|
-
class CamoufoxConfig(Struct, kw_only=True, frozen=False):
|
|
118
|
+
class CamoufoxConfig(Struct, kw_only=True, frozen=False, weakref=True):
|
|
109
119
|
"""Configuration struct for validation"""
|
|
110
120
|
|
|
111
121
|
max_pages: PagesCount = 1
|
|
@@ -125,15 +135,16 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
|
|
|
125
135
|
wait_selector: Optional[str] = None
|
|
126
136
|
addons: Optional[List[str]] = None
|
|
127
137
|
wait_selector_state: SelectorWaitStates = "attached"
|
|
128
|
-
cookies: Optional[
|
|
138
|
+
cookies: Optional[Iterable[Dict]] = None
|
|
129
139
|
google_search: bool = True
|
|
130
140
|
extra_headers: Optional[Dict[str, str]] = None
|
|
131
|
-
proxy: Optional[str | Dict[str, str]] = None # The default value for proxy in Playwright's source is `None`
|
|
141
|
+
proxy: Optional[str | Dict[str, str] | Tuple] = None # The default value for proxy in Playwright's source is `None`
|
|
132
142
|
os_randomize: bool = False
|
|
133
143
|
disable_ads: bool = False
|
|
134
144
|
geoip: bool = False
|
|
135
|
-
|
|
136
|
-
|
|
145
|
+
user_data_dir: str = ""
|
|
146
|
+
selector_config: Optional[Dict] = {}
|
|
147
|
+
additional_args: Optional[Dict] = {}
|
|
137
148
|
|
|
138
149
|
def __post_init__(self):
|
|
139
150
|
"""Custom validation after msgspec validation"""
|
|
@@ -142,14 +153,16 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
|
|
|
142
153
|
if self.proxy:
|
|
143
154
|
self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
|
|
144
155
|
|
|
145
|
-
if self.addons
|
|
156
|
+
if self.addons:
|
|
146
157
|
for addon in self.addons:
|
|
147
158
|
_validate_addon_path(addon)
|
|
148
159
|
else:
|
|
149
160
|
self.addons = []
|
|
150
161
|
|
|
151
162
|
if self.init_script is not None:
|
|
152
|
-
|
|
163
|
+
validation_msg = _is_invalid_file_path(self.init_script)
|
|
164
|
+
if validation_msg:
|
|
165
|
+
raise ValueError(validation_msg)
|
|
153
166
|
|
|
154
167
|
if not self.cookies:
|
|
155
168
|
self.cookies = []
|
|
@@ -162,27 +175,6 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
|
|
|
162
175
|
self.additional_args = {}
|
|
163
176
|
|
|
164
177
|
|
|
165
|
-
# Code parts to validate `fetch` in the least possible numbers of lines overall
|
|
166
|
-
class FetchConfig(Struct, kw_only=True):
|
|
167
|
-
"""Configuration struct for `fetch` calls validation"""
|
|
168
|
-
|
|
169
|
-
google_search: bool = True
|
|
170
|
-
timeout: Seconds = 30000
|
|
171
|
-
wait: Seconds = 0
|
|
172
|
-
page_action: Optional[Callable] = None
|
|
173
|
-
extra_headers: Optional[Dict[str, str]] = None
|
|
174
|
-
disable_resources: bool = False
|
|
175
|
-
wait_selector: Optional[str] = None
|
|
176
|
-
wait_selector_state: SelectorWaitStates = "attached"
|
|
177
|
-
network_idle: bool = False
|
|
178
|
-
load_dom: bool = True
|
|
179
|
-
solve_cloudflare: bool = False
|
|
180
|
-
selector_config: Optional[Dict] = {}
|
|
181
|
-
|
|
182
|
-
def to_dict(self):
|
|
183
|
-
return {f: getattr(self, f) for f in self.__struct_fields__}
|
|
184
|
-
|
|
185
|
-
|
|
186
178
|
@dataclass
|
|
187
179
|
class _fetch_params:
|
|
188
180
|
"""A dataclass of all parameters used by `fetch` calls"""
|
|
@@ -198,10 +190,12 @@ class _fetch_params:
|
|
|
198
190
|
network_idle: bool
|
|
199
191
|
load_dom: bool
|
|
200
192
|
solve_cloudflare: bool
|
|
201
|
-
selector_config:
|
|
193
|
+
selector_config: Dict
|
|
202
194
|
|
|
203
195
|
|
|
204
|
-
def validate_fetch(
|
|
196
|
+
def validate_fetch(
|
|
197
|
+
params: List[Tuple], model: type[PlaywrightConfig] | type[CamoufoxConfig], sentinel=None
|
|
198
|
+
) -> _fetch_params:
|
|
205
199
|
result = {}
|
|
206
200
|
overrides = {}
|
|
207
201
|
|
|
@@ -212,18 +206,56 @@ def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
|
|
|
212
206
|
result[arg] = session_value
|
|
213
207
|
|
|
214
208
|
if overrides:
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
209
|
+
validated_config = validate(overrides, model)
|
|
210
|
+
# Extract only the fields that _fetch_params needs from validated_config
|
|
211
|
+
validated_dict = {
|
|
212
|
+
f.name: getattr(validated_config, f.name)
|
|
213
|
+
for f in fields(_fetch_params)
|
|
214
|
+
if hasattr(validated_config, f.name)
|
|
215
|
+
}
|
|
216
|
+
# solve_cloudflare defaults to False for models that don't have it (PlaywrightConfig)
|
|
217
|
+
validated_dict.setdefault("solve_cloudflare", False)
|
|
218
|
+
|
|
219
|
+
validated_dict.update(result)
|
|
220
|
+
return _fetch_params(**validated_dict)
|
|
218
221
|
|
|
219
|
-
|
|
220
|
-
result["solve_cloudflare"] = False
|
|
222
|
+
result.setdefault("solve_cloudflare", False)
|
|
221
223
|
|
|
222
224
|
return _fetch_params(**result)
|
|
223
225
|
|
|
224
226
|
|
|
225
|
-
|
|
227
|
+
# Cache default values for each model to reduce validation overhead
|
|
228
|
+
models_default_values = {}
|
|
229
|
+
|
|
230
|
+
for _model in (CamoufoxConfig, PlaywrightConfig):
|
|
231
|
+
_defaults = {}
|
|
232
|
+
if hasattr(_model, "__struct_defaults__") and hasattr(_model, "__struct_fields__"):
|
|
233
|
+
for field_name, default_value in zip(_model.__struct_fields__, _model.__struct_defaults__): # type: ignore
|
|
234
|
+
# Skip factory defaults - these are msgspec._core.Factory instances
|
|
235
|
+
if type(default_value).__name__ != "Factory":
|
|
236
|
+
_defaults[field_name] = default_value
|
|
237
|
+
|
|
238
|
+
models_default_values[_model.__name__] = _defaults.copy()
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _filter_defaults(params: Dict, model: str) -> Dict:
|
|
242
|
+
"""Filter out parameters that match their default values to reduce validation overhead."""
|
|
243
|
+
defaults = models_default_values[model]
|
|
244
|
+
return {k: v for k, v in params.items() if k not in defaults or v != defaults[k]}
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
@overload
|
|
248
|
+
def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
@overload
|
|
252
|
+
def validate(params: Dict, model: type[CamoufoxConfig]) -> CamoufoxConfig: ...
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def validate(params: Dict, model: type[PlaywrightConfig] | type[CamoufoxConfig]) -> PlaywrightConfig | CamoufoxConfig:
|
|
226
256
|
try:
|
|
227
|
-
|
|
257
|
+
# Filter out params with the default values (no need to validate them) to speed up validation
|
|
258
|
+
filtered = _filter_defaults(params, model.__name__)
|
|
259
|
+
return convert(filtered, model)
|
|
228
260
|
except ValidationError as e:
|
|
229
261
|
raise TypeError(f"Invalid argument type: {e}") from e
|