scrapling 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -1,27 +1,27 @@
 from playwright.sync_api import (
-    Response as SyncPlaywrightResponse,
-    sync_playwright,
-    Playwright,
     Locator,
+    Playwright,
+    sync_playwright,
 )
 from playwright.async_api import (
     async_playwright,
-    Response as AsyncPlaywrightResponse,
-    BrowserContext as AsyncBrowserContext,
-    Playwright as AsyncPlaywright,
     Locator as AsyncLocator,
+    Playwright as AsyncPlaywright,
+    BrowserContext as AsyncBrowserContext,
 )
 from patchright.sync_api import sync_playwright as sync_patchright
 from patchright.async_api import async_playwright as async_patchright
 
 from scrapling.core.utils import log
 from ._base import SyncSession, AsyncSession, DynamicSessionMixin
-from ._validators import validate_fetch as _validate
+from ._validators import validate_fetch as _validate, PlaywrightConfig
 from scrapling.core._types import (
+    Any,
     Dict,
     List,
     Optional,
     Callable,
+    TYPE_CHECKING,
     SelectorWaitStates,
 )
 from scrapling.engines.toolbelt.convertor import (
@@ -30,7 +30,7 @@ from scrapling.engines.toolbelt.convertor import (
 )
 from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
 
-_UNSET = object()
+_UNSET: Any = object()
 
 
 class DynamicSession(DynamicSessionMixin, SyncSession):
@@ -94,7 +94,10 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
+        user_data_dir: str = "",
+        extra_flags: Optional[List[str]] = None,
         selector_config: Optional[Dict] = None,
+        additional_args: Optional[Dict] = None,
     ):
         """A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.
 
@@ -121,7 +124,10 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         """
         self.__validate__(
             wait=wait,
@@ -140,11 +146,14 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
             hide_canvas=hide_canvas,
             init_script=init_script,
             network_idle=network_idle,
+            user_data_dir=user_data_dir,
             google_search=google_search,
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            extra_flags=extra_flags,
             selector_config=selector_config,
+            additional_args=additional_args,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
         )
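
Taken together, the new `user_data_dir`, `extra_flags`, and `additional_args` arguments documented above give per-session control over the persistent profile, the launch flags, and the Playwright context options. Below is a minimal sketch of how they might be combined; the import path assumes the 0.3.x re-export of the session classes from `scrapling.fetchers`, and every value is illustrative rather than taken from the package:

```python
# Illustrative values only; DynamicSession is assumed to be importable as in Scrapling 0.3.x.
from scrapling.fetchers import DynamicSession

with DynamicSession(
    user_data_dir="/tmp/scrapling-profile",      # reuse cookies/local storage instead of a temp dir
    extra_flags=["--window-size=1280,720"],      # raw browser flags appended at launch
    additional_args={"color_scheme": "dark"},    # forwarded to Playwright's context, overriding Scrapling's defaults
) as session:
    page = session.fetch("https://example.com")
```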
@@ -154,14 +163,14 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         """Create a browser for this instance and context."""
         sync_context = sync_patchright if self.stealth else sync_playwright
 
-        self.playwright: Playwright = sync_context().start()
+        self.playwright: Playwright = sync_context().start()  # pyright: ignore [reportAttributeAccessIssue]
 
         if self.cdp_url:  # pragma: no cover
             self.context = self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url).new_context(
                 **self.context_options
             )
         else:
-            self.context = self.playwright.chromium.launch_persistent_context(user_data_dir="", **self.launch_options)
+            self.context = self.playwright.chromium.launch_persistent_context(**self.launch_options)
 
         if self.init_script:  # pragma: no cover
             self.context.add_init_script(path=self.init_script)
@@ -169,28 +178,6 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         if self.cookies:  # pragma: no cover
             self.context.add_cookies(self.cookies)
 
-    def __enter__(self):
-        self.__create__()
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.close()
-
-    def close(self):  # pragma: no cover
-        """Close all resources"""
-        if self._closed:
-            return
-
-        if self.context:
-            self.context.close()
-            self.context = None
-
-        if self.playwright:
-            self.playwright.stop()
-            self.playwright = None
-
-        self._closed = True
-
     def fetch(
         self,
         url: str,
@@ -238,37 +225,26 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
                 ("load_dom", load_dom, self.load_dom),
                 ("selector_config", selector_config, self.selector_config),
             ],
+            PlaywrightConfig,
            _UNSET,
         )
 
         if self._closed:  # pragma: no cover
             raise RuntimeError("Context manager has been closed")
 
-        final_response = None
         referer = (
             generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
         )
 
-        def handle_response(finished_response: SyncPlaywrightResponse):
-            nonlocal final_response
-            if (
-                finished_response.request.resource_type == "document"
-                and finished_response.request.is_navigation_request()
-            ):
-                final_response = finished_response
-
         page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
-        page_info.mark_busy(url=url)
+        final_response = [None]
+        handle_response = self._create_response_handler(page_info, final_response)
 
         try:  # pragma: no cover
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
             first_response = page_info.page.goto(url, referer=referer)
-            if params.load_dom:
-                page_info.page.wait_for_load_state(state="domcontentloaded")
-
-            if params.network_idle:
-                page_info.page.wait_for_load_state("networkidle")
+            self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
 
             if not first_response:
                 raise RuntimeError(f"Failed to get response for {url}")
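
The deleted inline logic shows what the two new mixin helpers are expected to cover: `_wait_for_page_stability` bundles the `wait_for_load_state` calls, and `_create_response_handler` replaces the `nonlocal` closure, which is why `final_response` becomes a one-element list read back as `final_response[0]`. The following standalone sketch reconstructs that behaviour from the removed lines; the real helpers live on the session mixin and may differ in detail, for example whether they also wait for the "load" state:

```python
from typing import Callable

from playwright.sync_api import Page, Response


def wait_for_page_stability(page: Page, load_dom: bool, network_idle: bool) -> None:
    # Same waits the removed inline code performed after page.goto()
    if load_dom:
        page.wait_for_load_state(state="domcontentloaded")
    if network_idle:
        page.wait_for_load_state("networkidle")


def make_response_handler(final_response: list) -> Callable[[Response], None]:
    # A mutable one-element list stands in for the old `nonlocal final_response`
    def handle_response(finished_response: Response) -> None:
        if (
            finished_response.request.resource_type == "document"
            and finished_response.request.is_navigation_request()
        ):
            final_response[0] = finished_response

    return handle_response
```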
@@ -284,11 +260,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
                     waiter: Locator = page_info.page.locator(params.wait_selector)
                     waiter.first.wait_for(state=params.wait_selector_state)
                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
-                    page_info.page.wait_for_load_state(state="load")
-                    if params.load_dom:
-                        page_info.page.wait_for_load_state(state="domcontentloaded")
-                    if params.network_idle:
-                        page_info.page.wait_for_load_state("networkidle")
+                    self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
                 except Exception as e:  # pragma: no cover
                     log.error(f"Error waiting for selector {params.wait_selector}: {e}")
 
@@ -296,10 +268,10 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
 
         # Create response object
         response = ResponseFactory.from_playwright_response(
-            page_info.page, first_response, final_response, params.selector_config
+            page_info.page, first_response, final_response[0], params.selector_config, bool(params.page_action)
         )
 
-        # Close the page, to free up resources
+        # Close the page to free up resources
         page_info.page.close()
         self.page_pool.pages.remove(page_info)
 
@@ -337,7 +309,10 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
+        user_data_dir: str = "",
+        extra_flags: Optional[List[str]] = None,
         selector_config: Optional[Dict] = None,
+        additional_args: Optional[Dict] = None,
     ):
         """A Browser session manager with page pooling
 
@@ -365,7 +340,10 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
+        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         """
 
         self.__validate__(
@@ -385,11 +363,14 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
             hide_canvas=hide_canvas,
             init_script=init_script,
             network_idle=network_idle,
+            user_data_dir=user_data_dir,
             google_search=google_search,
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            extra_flags=extra_flags,
             selector_config=selector_config,
+            additional_args=additional_args,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
         )
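
The async session takes the same three additions; a hedged sketch of the equivalent usage follows, with the same caveats about the import path and the illustrative values as in the synchronous example above:

```python
import asyncio

from scrapling.fetchers import AsyncDynamicSession


async def main() -> None:
    async with AsyncDynamicSession(
        max_pages=2,                               # tabs rotated through the PagePool, per the docstring
        user_data_dir="/tmp/scrapling-profile",    # illustrative path
        extra_flags=["--lang=en-US"],
        additional_args={"timezone_id": "UTC"},    # forwarded to the Playwright context
    ) as session:
        page = await session.fetch("https://example.com")


asyncio.run(main())
```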
@@ -399,43 +380,21 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         """Create a browser for this instance and context."""
         async_context = async_patchright if self.stealth else async_playwright
 
-        self.playwright: AsyncPlaywright = await async_context().start()
+        self.playwright: AsyncPlaywright = await async_context().start()  # pyright: ignore [reportAttributeAccessIssue]
 
         if self.cdp_url:
             browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url)
             self.context: AsyncBrowserContext = await browser.new_context(**self.context_options)
         else:
             self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
-                user_data_dir="", **self.launch_options
+                **self.launch_options
             )
 
         if self.init_script:  # pragma: no cover
             await self.context.add_init_script(path=self.init_script)
 
         if self.cookies:
-            await self.context.add_cookies(self.cookies)
-
-    async def __aenter__(self):
-        await self.__create__()
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        await self.close()
-
-    async def close(self):
-        """Close all resources"""
-        if self._closed:  # pragma: no cover
-            return
-
-        if self.context:
-            await self.context.close()
-            self.context = None
-
-        if self.playwright:
-            await self.playwright.stop()
-            self.playwright = None
-
-        self._closed = True
+            await self.context.add_cookies(self.cookies)  # pyright: ignore
 
     async def fetch(
         self,
@@ -484,37 +443,32 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
                 ("load_dom", load_dom, self.load_dom),
                 ("selector_config", selector_config, self.selector_config),
             ],
+            PlaywrightConfig,
             _UNSET,
         )
 
         if self._closed:  # pragma: no cover
             raise RuntimeError("Context manager has been closed")
 
-        final_response = None
         referer = (
             generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
         )
 
-        async def handle_response(finished_response: AsyncPlaywrightResponse):
-            nonlocal final_response
-            if (
-                finished_response.request.resource_type == "document"
-                and finished_response.request.is_navigation_request()
-            ):
-                final_response = finished_response
-
         page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
-        page_info.mark_busy(url=url)
+        final_response = [None]
+        handle_response = self._create_response_handler(page_info, final_response)
+
+        if TYPE_CHECKING:
+            from playwright.async_api import Page as async_Page
+
+            if not isinstance(page_info.page, async_Page):
+                raise TypeError
 
         try:
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
             first_response = await page_info.page.goto(url, referer=referer)
-            if self.load_dom:
-                await page_info.page.wait_for_load_state(state="domcontentloaded")
-
-            if params.network_idle:
-                await page_info.page.wait_for_load_state("networkidle")
+            await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
 
             if not first_response:
                 raise RuntimeError(f"Failed to get response for {url}")
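
The added `if TYPE_CHECKING:` block never runs: `TYPE_CHECKING` is `False` at runtime, so the import, the `isinstance` check, and the bare `raise TypeError` exist only so static checkers narrow `page_info.page` to the async `Page` type for the `await` calls that follow. A self-contained illustration of the same narrowing trick, with names invented for the demo:

```python
from typing import TYPE_CHECKING, Union


def increment(value: Union[int, str]) -> int:
    if TYPE_CHECKING:
        # Skipped at runtime; type checkers treat TYPE_CHECKING as True,
        # so after this block they assume `value` is an int.
        if not isinstance(value, int):
            raise TypeError
    return value + 1  # no longer flagged as "int | str" arithmetic
```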
@@ -530,11 +484,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
                     waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
                     await waiter.first.wait_for(state=params.wait_selector_state)
                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
-                    await page_info.page.wait_for_load_state(state="load")
-                    if self.load_dom:
-                        await page_info.page.wait_for_load_state(state="domcontentloaded")
-                    if params.network_idle:
-                        await page_info.page.wait_for_load_state("networkidle")
+                    await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
                 except Exception as e:
                     log.error(f"Error waiting for selector {params.wait_selector}: {e}")
 
@@ -542,10 +492,10 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
 
         # Create response object
         response = await ResponseFactory.from_async_playwright_response(
-            page_info.page, first_response, final_response, params.selector_config
+            page_info.page, first_response, final_response[0], params.selector_config, bool(params.page_action)
        )
 
-        # Close the page, to free up resources
+        # Close the page to free up resources
         await page_info.page.close()
         self.page_pool.pages.remove(page_info)
         return response
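
The hunks below come from the validators module that the session classes import above as `._validators`; it defines `validate_fetch`, `PlaywrightConfig`, and `CamoufoxConfig`.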
@@ -1,7 +1,8 @@
 from pathlib import Path
 from typing import Annotated
-from dataclasses import dataclass
+from functools import lru_cache
 from urllib.parse import urlparse
+from dataclasses import dataclass, fields
 
 from msgspec import Struct, Meta, convert, ValidationError
 
@@ -11,24 +12,28 @@ from scrapling.core._types import (
     Tuple,
     Optional,
     Callable,
+    Iterable,
     SelectorWaitStates,
+    overload,
 )
 from scrapling.engines.toolbelt.navigation import construct_proxy_dict
 
 
 # Custom validators for msgspec
-def _validate_file_path(value: str):
+@lru_cache(8)
+def _is_invalid_file_path(value: str) -> bool | str:
     """Fast file path validation"""
     path = Path(value)
     if not path.exists():
-        raise ValueError(f"Init script path not found: {value}")
+        return f"Init script path not found: {value}"
     if not path.is_file():
-        raise ValueError(f"Init script is not a file: {value}")
+        return f"Init script is not a file: {value}"
     if not path.is_absolute():
-        raise ValueError(f"Init script is not a absolute path: {value}")
+        return f"Init script is not a absolute path: {value}"
+    return False
 
 
-def _validate_addon_path(value: str):
+def _validate_addon_path(value: str) -> None:
     """Fast addon path validation"""
     path = Path(value)
     if not path.exists():
@@ -37,22 +42,16 @@ def _validate_addon_path(value: str):
         raise ValueError(f"Addon path must be a directory of the extracted addon: {value}")
 
 
-def _validate_cdp_url(cdp_url: str):
+@lru_cache(2)
+def _is_invalid_cdp_url(cdp_url: str) -> bool | str:
     """Fast CDP URL validation"""
-    try:
-        # Check the scheme
-        if not cdp_url.startswith(("ws://", "wss://")):
-            raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
-
-        # Validate hostname and port
-        if not urlparse(cdp_url).netloc:
-            raise ValueError("Invalid hostname for the CDP URL")
-
-    except AttributeError as e:
-        raise ValueError(f"Malformed CDP URL: {cdp_url}: {str(e)}")
+    if not cdp_url.startswith(("ws://", "wss://")):
+        return "CDP URL must use 'ws://' or 'wss://' scheme"
 
-    except Exception as e:
-        raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(e)}")
+    netloc = urlparse(cdp_url).netloc
+    if not netloc:
+        return "Invalid hostname for the CDP URL"
+    return False
 
 
 # Type aliases for cleaner annotations
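
Both validators now return an error message (or `False`) instead of raising, presumably so the `@lru_cache` decoration can actually help: a raised exception is never stored by `lru_cache`, while a returned message or `False` is memoised for repeated inputs such as the same `init_script` path being validated on every request. A small sketch of the pattern with an invented validator:

```python
from functools import lru_cache


@lru_cache(8)
def _is_invalid_port(value: int) -> bool | str:
    """Return an error message when invalid, False when valid; both results are cacheable."""
    if not 0 < value < 65536:
        return f"Port out of range: {value}"
    return False


# The caller decides whether to raise, mirroring the __post_init__ hooks further down.
assert _is_invalid_port(9222) is False
assert _is_invalid_port(70000) == "Port out of range: 70000"
```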
@@ -60,7 +59,7 @@ PagesCount = Annotated[int, Meta(ge=1, le=50)]
 Seconds = Annotated[int, float, Meta(ge=0)]
 
 
-class PlaywrightConfig(Struct, kw_only=True, frozen=False):
+class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
     """Configuration struct for validation"""
 
     max_pages: PagesCount = 1
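
`weakref=True` is a standard msgspec `Struct` option: Structs are slotted classes, so without it their instances cannot be the target of weak references. A quick illustration:

```python
import weakref

from msgspec import Struct


class Config(Struct, weakref=True):
    retries: int = 3


cfg = Config()
ref = weakref.ref(cfg)   # raises TypeError if the class is defined without weakref=True
assert ref() is cfg
```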
@@ -73,7 +72,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
     stealth: bool = False
     wait: Seconds = 0
     page_action: Optional[Callable] = None
-    proxy: Optional[str | Dict[str, str]] = None  # The default value for proxy in Playwright's source is `None`
+    proxy: Optional[str | Dict[str, str] | Tuple] = None  # The default value for proxy in Playwright's source is `None`
     locale: str = "en-US"
     extra_headers: Optional[Dict[str, str]] = None
     useragent: Optional[str] = None
@@ -81,11 +80,14 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
     init_script: Optional[str] = None
     disable_resources: bool = False
     wait_selector: Optional[str] = None
-    cookies: Optional[List[Dict]] = None
+    cookies: Optional[Iterable[Dict]] = None
     network_idle: bool = False
     load_dom: bool = True
     wait_selector_state: SelectorWaitStates = "attached"
-    selector_config: Optional[Dict] = None
+    user_data_dir: str = ""
+    extra_flags: Optional[List[str]] = None
+    selector_config: Optional[Dict] = {}
+    additional_args: Optional[Dict] = {}
 
     def __post_init__(self):
         """Custom validation after msgspec validation"""
@@ -94,18 +96,26 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
         if self.proxy:
             self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
         if self.cdp_url:
-            _validate_cdp_url(self.cdp_url)
+            cdp_msg = _is_invalid_cdp_url(self.cdp_url)
+            if cdp_msg:
+                raise ValueError(cdp_msg)
 
         if not self.cookies:
             self.cookies = []
+        if not self.extra_flags:
+            self.extra_flags = []
         if not self.selector_config:
             self.selector_config = {}
+        if not self.additional_args:
+            self.additional_args = {}
 
         if self.init_script is not None:
-            _validate_file_path(self.init_script)
+            validation_msg = _is_invalid_file_path(self.init_script)
+            if validation_msg:
+                raise ValueError(validation_msg)
 
 
-class CamoufoxConfig(Struct, kw_only=True, frozen=False):
+class CamoufoxConfig(Struct, kw_only=True, frozen=False, weakref=True):
     """Configuration struct for validation"""
 
     max_pages: PagesCount = 1
@@ -125,15 +135,16 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
     wait_selector: Optional[str] = None
     addons: Optional[List[str]] = None
     wait_selector_state: SelectorWaitStates = "attached"
-    cookies: Optional[List[Dict]] = None
+    cookies: Optional[Iterable[Dict]] = None
     google_search: bool = True
     extra_headers: Optional[Dict[str, str]] = None
-    proxy: Optional[str | Dict[str, str]] = None  # The default value for proxy in Playwright's source is `None`
+    proxy: Optional[str | Dict[str, str] | Tuple] = None  # The default value for proxy in Playwright's source is `None`
     os_randomize: bool = False
     disable_ads: bool = False
     geoip: bool = False
-    selector_config: Optional[Dict] = None
-    additional_args: Optional[Dict] = None
+    user_data_dir: str = ""
+    selector_config: Optional[Dict] = {}
+    additional_args: Optional[Dict] = {}
 
     def __post_init__(self):
         """Custom validation after msgspec validation"""
@@ -142,14 +153,16 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
         if self.proxy:
             self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
 
-        if self.addons and isinstance(self.addons, list):
+        if self.addons:
             for addon in self.addons:
                 _validate_addon_path(addon)
         else:
             self.addons = []
 
         if self.init_script is not None:
-            _validate_file_path(self.init_script)
+            validation_msg = _is_invalid_file_path(self.init_script)
+            if validation_msg:
+                raise ValueError(validation_msg)
 
         if not self.cookies:
             self.cookies = []
@@ -162,27 +175,6 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
             self.additional_args = {}
 
 
-# Code parts to validate `fetch` in the least possible numbers of lines overall
-class FetchConfig(Struct, kw_only=True):
-    """Configuration struct for `fetch` calls validation"""
-
-    google_search: bool = True
-    timeout: Seconds = 30000
-    wait: Seconds = 0
-    page_action: Optional[Callable] = None
-    extra_headers: Optional[Dict[str, str]] = None
-    disable_resources: bool = False
-    wait_selector: Optional[str] = None
-    wait_selector_state: SelectorWaitStates = "attached"
-    network_idle: bool = False
-    load_dom: bool = True
-    solve_cloudflare: bool = False
-    selector_config: Optional[Dict] = {}
-
-    def to_dict(self):
-        return {f: getattr(self, f) for f in self.__struct_fields__}
-
-
 @dataclass
 class _fetch_params:
     """A dataclass of all parameters used by `fetch` calls"""
@@ -198,10 +190,12 @@ class _fetch_params:
     network_idle: bool
     load_dom: bool
     solve_cloudflare: bool
-    selector_config: Optional[Dict]
+    selector_config: Dict
 
 
-def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
+def validate_fetch(
+    params: List[Tuple], model: type[PlaywrightConfig] | type[CamoufoxConfig], sentinel=None
+) -> _fetch_params:
     result = {}
     overrides = {}
 
@@ -212,18 +206,56 @@ def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
             result[arg] = session_value
 
     if overrides:
-        overrides = validate(overrides, FetchConfig).to_dict()
-        overrides.update(result)
-        return _fetch_params(**overrides)
+        validated_config = validate(overrides, model)
+        # Extract only the fields that _fetch_params needs from validated_config
+        validated_dict = {
+            f.name: getattr(validated_config, f.name)
+            for f in fields(_fetch_params)
+            if hasattr(validated_config, f.name)
+        }
+        # solve_cloudflare defaults to False for models that don't have it (PlaywrightConfig)
+        validated_dict.setdefault("solve_cloudflare", False)
+
+        validated_dict.update(result)
+        return _fetch_params(**validated_dict)
 
-    if not result.get("solve_cloudflare"):
-        result["solve_cloudflare"] = False
+    result.setdefault("solve_cloudflare", False)
 
     return _fetch_params(**result)
 
 
-def validate(params: Dict, model) -> PlaywrightConfig | CamoufoxConfig | FetchConfig:
+# Cache default values for each model to reduce validation overhead
+models_default_values = {}
+
+for _model in (CamoufoxConfig, PlaywrightConfig):
+    _defaults = {}
+    if hasattr(_model, "__struct_defaults__") and hasattr(_model, "__struct_fields__"):
+        for field_name, default_value in zip(_model.__struct_fields__, _model.__struct_defaults__):  # type: ignore
+            # Skip factory defaults - these are msgspec._core.Factory instances
+            if type(default_value).__name__ != "Factory":
+                _defaults[field_name] = default_value
+
+    models_default_values[_model.__name__] = _defaults.copy()
+
+
+def _filter_defaults(params: Dict, model: str) -> Dict:
+    """Filter out parameters that match their default values to reduce validation overhead."""
+    defaults = models_default_values[model]
+    return {k: v for k, v in params.items() if k not in defaults or v != defaults[k]}
+
+
+@overload
+def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...
+
+
+@overload
+def validate(params: Dict, model: type[CamoufoxConfig]) -> CamoufoxConfig: ...
+
+
+def validate(params: Dict, model: type[PlaywrightConfig] | type[CamoufoxConfig]) -> PlaywrightConfig | CamoufoxConfig:
     try:
-        return convert(params, model)
+        # Filter out params with the default values (no need to validate them) to speed up validation
+        filtered = _filter_defaults(params, model.__name__)
+        return convert(filtered, model)
     except ValidationError as e:
         raise TypeError(f"Invalid argument type: {e}") from e
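
The `_filter_defaults` step means `msgspec.convert` only has to validate keys whose values differ from the model's declared defaults; anything filtered out is simply re-applied by the Struct's own defaults. A condensed, self-contained sketch of that flow with an invented config model (the real code also skips msgspec `Factory` defaults, omitted here):

```python
from msgspec import Struct, convert


class DemoConfig(Struct, kw_only=True):
    timeout: int = 30000
    network_idle: bool = False
    wait_selector: str | None = None


# Defaults pulled from the Struct metadata, mirroring models_default_values above
defaults = dict(zip(DemoConfig.__struct_fields__, DemoConfig.__struct_defaults__))

params = {"timeout": 30000, "network_idle": True, "wait_selector": None}
filtered = {k: v for k, v in params.items() if k not in defaults or v != defaults[k]}

config = convert(filtered, DemoConfig)   # only {"network_idle": True} actually gets validated
assert config.timeout == 30000 and config.network_idle is True
```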