scrapling 0.3.7__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
@@ -1,23 +1,20 @@
 from playwright.sync_api import (
-    Response as SyncPlaywrightResponse,
-    sync_playwright,
-    Playwright,
     Locator,
+    Playwright,
+    sync_playwright,
 )
 from playwright.async_api import (
     async_playwright,
-    Response as AsyncPlaywrightResponse,
-    BrowserContext as AsyncBrowserContext,
-    Playwright as AsyncPlaywright,
     Locator as AsyncLocator,
-    Page as async_Page,
+    Playwright as AsyncPlaywright,
+    BrowserContext as AsyncBrowserContext,
 )
 from patchright.sync_api import sync_playwright as sync_patchright
 from patchright.async_api import async_playwright as async_patchright

 from scrapling.core.utils import log
 from ._base import SyncSession, AsyncSession, DynamicSessionMixin
-from ._validators import validate_fetch as _validate
+from ._validators import validate_fetch as _validate, PlaywrightConfig
 from scrapling.core._types import (
     Any,
     Dict,
@@ -98,6 +95,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
         user_data_dir: str = "",
+        extra_flags: Optional[List[str]] = None,
         selector_config: Optional[Dict] = None,
         additional_args: Optional[Dict] = None,
     ):
@@ -127,6 +125,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         """
@@ -152,6 +151,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            extra_flags=extra_flags,
             selector_config=selector_config,
             additional_args=additional_args,
             disable_resources=disable_resources,
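The new `extra_flags` parameter is threaded from the session constructor through `__validate__` (above) and, per its docstring, passed to the browser on launch. A minimal usage sketch; the import path and the `headless` argument are assumptions about Scrapling's public API rather than taken from this diff, and the flag values are ordinary Chromium switches chosen purely for illustration:

    from scrapling.fetchers import DynamicSession  # import path assumed

    with DynamicSession(
        headless=True,  # assumed constructor argument
        extra_flags=["--disable-gpu", "--lang=en-US"],  # forwarded to the browser on launch
    ) as session:
        resp = session.fetch("https://example.com")
        print(resp.status)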
@@ -178,28 +178,6 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         if self.cookies:  # pragma: no cover
             self.context.add_cookies(self.cookies)

-    def __enter__(self):
-        self.__create__()
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.close()
-
-    def close(self):  # pragma: no cover
-        """Close all resources"""
-        if self._closed:
-            return
-
-        if self.context:
-            self.context.close()
-            self.context = None
-
-        if self.playwright:
-            self.playwright.stop()
-            self.playwright = None  # pyright: ignore
-
-        self._closed = True
-
     def fetch(
         self,
         url: str,
@@ -247,38 +225,26 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
                 ("load_dom", load_dom, self.load_dom),
                 ("selector_config", selector_config, self.selector_config),
             ],
+            PlaywrightConfig,
             _UNSET,
         )

         if self._closed:  # pragma: no cover
             raise RuntimeError("Context manager has been closed")

-        final_response = None
         referer = (
             generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
         )

-        def handle_response(finished_response: SyncPlaywrightResponse):
-            nonlocal final_response
-            if (
-                finished_response.request.resource_type == "document"
-                and finished_response.request.is_navigation_request()
-                and finished_response.request.frame == page_info.page.main_frame
-            ):
-                final_response = finished_response
-
         page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
-        page_info.mark_busy(url=url)
+        final_response = [None]
+        handle_response = self._create_response_handler(page_info, final_response)

         try:  # pragma: no cover
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
             first_response = page_info.page.goto(url, referer=referer)
-            if params.load_dom:
-                page_info.page.wait_for_load_state(state="domcontentloaded")
-
-            if params.network_idle:
-                page_info.page.wait_for_load_state("networkidle")
+            self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)

             if not first_response:
                 raise RuntimeError(f"Failed to get response for {url}")
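The `page_info.mark_busy()` call and the inline `handle_response` callback are folded into a single `self._create_response_handler(page_info, final_response)` call, with `final_response` now a one-element list acting as a mutable cell the callback writes into (so no `nonlocal` is needed). The helper itself lives in the shared session base and is not part of this diff; reconstructed from the removed inline handler above, it plausibly looks like:

    # Reconstructed sketch; the real helper is defined in `_base.py`, which this diff does not show.
    def _create_response_handler(self, page_info, final_response: list):
        """Return a callback that stores the main document's navigation response in final_response[0]."""

        def handle_response(finished_response):
            # Only the main frame's navigation document counts as the "final" response
            if (
                finished_response.request.resource_type == "document"
                and finished_response.request.is_navigation_request()
                and finished_response.request.frame == page_info.page.main_frame
            ):
                final_response[0] = finished_response

        return handle_response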
@@ -294,11 +260,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
                     waiter: Locator = page_info.page.locator(params.wait_selector)
                     waiter.first.wait_for(state=params.wait_selector_state)
                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
-                    page_info.page.wait_for_load_state(state="load")
-                    if params.load_dom:
-                        page_info.page.wait_for_load_state(state="domcontentloaded")
-                    if params.network_idle:
-                        page_info.page.wait_for_load_state("networkidle")
+                    self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
                 except Exception as e:  # pragma: no cover
                     log.error(f"Error waiting for selector {params.wait_selector}: {e}")

@@ -306,7 +268,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):

             # Create response object
             response = ResponseFactory.from_playwright_response(
-                page_info.page, first_response, final_response, params.selector_config
+                page_info.page, first_response, final_response[0], params.selector_config, bool(params.page_action)
             )

             # Close the page to free up resources
@@ -348,6 +310,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
         user_data_dir: str = "",
+        extra_flags: Optional[List[str]] = None,
         selector_config: Optional[Dict] = None,
         additional_args: Optional[Dict] = None,
     ):
@@ -378,6 +341,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
         :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         """
@@ -404,6 +368,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            extra_flags=extra_flags,
             selector_config=selector_config,
             additional_args=additional_args,
             disable_resources=disable_resources,
@@ -431,28 +396,6 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         if self.cookies:
             await self.context.add_cookies(self.cookies)  # pyright: ignore

-    async def __aenter__(self):
-        await self.__create__()
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        await self.close()
-
-    async def close(self):
-        """Close all resources"""
-        if self._closed:  # pragma: no cover
-            return
-
-        if self.context:
-            await self.context.close()
-            self.context = None  # pyright: ignore
-
-        if self.playwright:
-            await self.playwright.stop()
-            self.playwright = None  # pyright: ignore
-
-        self._closed = True
-
     async def fetch(
         self,
         url: str,
@@ -500,30 +443,24 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
                 ("load_dom", load_dom, self.load_dom),
                 ("selector_config", selector_config, self.selector_config),
             ],
+            PlaywrightConfig,
             _UNSET,
         )

         if self._closed:  # pragma: no cover
             raise RuntimeError("Context manager has been closed")

-        final_response = None
         referer = (
             generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
         )

-        async def handle_response(finished_response: AsyncPlaywrightResponse):
-            nonlocal final_response
-            if (
-                finished_response.request.resource_type == "document"
-                and finished_response.request.is_navigation_request()
-                and finished_response.request.frame == page_info.page.main_frame
-            ):
-                final_response = finished_response
-
         page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
-        page_info.mark_busy(url=url)
+        final_response = [None]
+        handle_response = self._create_response_handler(page_info, final_response)

         if TYPE_CHECKING:
+            from playwright.async_api import Page as async_Page
+
             if not isinstance(page_info.page, async_Page):
                 raise TypeError

@@ -531,11 +468,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
             first_response = await page_info.page.goto(url, referer=referer)
-            if self.load_dom:
-                await page_info.page.wait_for_load_state(state="domcontentloaded")
-
-            if params.network_idle:
-                await page_info.page.wait_for_load_state("networkidle")
+            await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)

             if not first_response:
                 raise RuntimeError(f"Failed to get response for {url}")
@@ -551,11 +484,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
                     waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
                     await waiter.first.wait_for(state=params.wait_selector_state)
                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
-                    await page_info.page.wait_for_load_state(state="load")
-                    if self.load_dom:
-                        await page_info.page.wait_for_load_state(state="domcontentloaded")
-                    if params.network_idle:
-                        await page_info.page.wait_for_load_state("networkidle")
+                    await self._wait_for_page_stability(page_info.page, params.load_dom, params.network_idle)
                 except Exception as e:
                     log.error(f"Error waiting for selector {params.wait_selector}: {e}")

@@ -563,7 +492,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):

             # Create response object
             response = await ResponseFactory.from_async_playwright_response(
-                page_info.page, first_response, final_response, params.selector_config
+                page_info.page, first_response, final_response[0], params.selector_config, bool(params.page_action)
             )

             # Close the page to free up resources
@@ -1,7 +1,8 @@
 from pathlib import Path
 from typing import Annotated
-from dataclasses import dataclass
+from functools import lru_cache
 from urllib.parse import urlparse
+from dataclasses import dataclass, fields

 from msgspec import Struct, Meta, convert, ValidationError

@@ -19,18 +20,20 @@ from scrapling.engines.toolbelt.navigation import construct_proxy_dict


 # Custom validators for msgspec
-def _validate_file_path(value: str):
+@lru_cache(8)
+def _is_invalid_file_path(value: str) -> bool | str:
     """Fast file path validation"""
     path = Path(value)
     if not path.exists():
-        raise ValueError(f"Init script path not found: {value}")
+        return f"Init script path not found: {value}"
     if not path.is_file():
-        raise ValueError(f"Init script is not a file: {value}")
+        return f"Init script is not a file: {value}"
     if not path.is_absolute():
-        raise ValueError(f"Init script is not a absolute path: {value}")
+        return f"Init script is not a absolute path: {value}"
+    return False


-def _validate_addon_path(value: str):
+def _validate_addon_path(value: str) -> None:
     """Fast addon path validation"""
     path = Path(value)
     if not path.exists():
@@ -39,22 +42,16 @@ def _validate_addon_path(value: str):
         raise ValueError(f"Addon path must be a directory of the extracted addon: {value}")


-def _validate_cdp_url(cdp_url: str):
+@lru_cache(2)
+def _is_invalid_cdp_url(cdp_url: str) -> bool | str:
     """Fast CDP URL validation"""
-    try:
-        # Check the scheme
-        if not cdp_url.startswith(("ws://", "wss://")):
-            raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
-
-        # Validate hostname and port
-        if not urlparse(cdp_url).netloc:
-            raise ValueError("Invalid hostname for the CDP URL")
+    if not cdp_url.startswith(("ws://", "wss://")):
+        return "CDP URL must use 'ws://' or 'wss://' scheme"

-    except AttributeError as e:
-        raise ValueError(f"Malformed CDP URL: {cdp_url}: {str(e)}")
-
-    except Exception as e:
-        raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(e)}")
+    netloc = urlparse(cdp_url).netloc
+    if not netloc:
+        return "Invalid hostname for the CDP URL"
+    return False


 # Type aliases for cleaner annotations
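Both validators now return an error message, or `False` when the value is valid, instead of raising; callers turn a truthy result back into an exception (see `__post_init__` below). This shape also fits the new `@lru_cache` decorators, since `functools.lru_cache` does not memoize calls that raise, so the old raising versions could not have been cached. Caller-side pattern, for illustration only:

    msg = _is_invalid_cdp_url("http://127.0.0.1:9222")  # wrong scheme, returns a message string
    if msg:
        raise ValueError(msg)

    assert _is_invalid_cdp_url("ws://127.0.0.1:9222") is False  # valid, falsy, and now cached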
@@ -62,7 +59,7 @@ PagesCount = Annotated[int, Meta(ge=1, le=50)]
 Seconds = Annotated[int, float, Meta(ge=0)]


-class PlaywrightConfig(Struct, kw_only=True, frozen=False):
+class PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):
     """Configuration struct for validation"""

     max_pages: PagesCount = 1
@@ -88,6 +85,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
     load_dom: bool = True
     wait_selector_state: SelectorWaitStates = "attached"
     user_data_dir: str = ""
+    extra_flags: Optional[List[str]] = None
     selector_config: Optional[Dict] = {}
     additional_args: Optional[Dict] = {}

@@ -98,20 +96,26 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
         if self.proxy:
             self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
         if self.cdp_url:
-            _validate_cdp_url(self.cdp_url)
+            cdp_msg = _is_invalid_cdp_url(self.cdp_url)
+            if cdp_msg:
+                raise ValueError(cdp_msg)

         if not self.cookies:
             self.cookies = []
+        if not self.extra_flags:
+            self.extra_flags = []
         if not self.selector_config:
             self.selector_config = {}
         if not self.additional_args:
             self.additional_args = {}

         if self.init_script is not None:
-            _validate_file_path(self.init_script)
+            validation_msg = _is_invalid_file_path(self.init_script)
+            if validation_msg:
+                raise ValueError(validation_msg)


@@ -149,14 +153,16 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
         if self.proxy:
             self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)

-        if self.addons and isinstance(self.addons, list):
+        if self.addons:
             for addon in self.addons:
                 _validate_addon_path(addon)
         else:
             self.addons = []

         if self.init_script is not None:
-            _validate_file_path(self.init_script)
+            validation_msg = _is_invalid_file_path(self.init_script)
+            if validation_msg:
+                raise ValueError(validation_msg)

         if not self.cookies:
             self.cookies = []
@@ -169,27 +175,6 @@
             self.additional_args = {}


-# Code parts to validate `fetch` in the least possible numbers of lines overall
-class FetchConfig(Struct, kw_only=True):
-    """Configuration struct for `fetch` calls validation"""
-
-    google_search: bool = True
-    timeout: Seconds = 30000
-    wait: Seconds = 0
-    page_action: Optional[Callable] = None
-    extra_headers: Optional[Dict[str, str]] = None
-    disable_resources: bool = False
-    wait_selector: Optional[str] = None
-    wait_selector_state: SelectorWaitStates = "attached"
-    network_idle: bool = False
-    load_dom: bool = True
-    solve_cloudflare: bool = False
-    selector_config: Dict = {}
-
-    def to_dict(self):
-        return {f: getattr(self, f) for f in self.__struct_fields__}
-
-
 @dataclass
 class _fetch_params:
     """A dataclass of all parameters used by `fetch` calls"""
@@ -208,7 +193,9 @@ class _fetch_params:
     selector_config: Dict


-def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
+def validate_fetch(
+    params: List[Tuple], model: type[PlaywrightConfig] | type[CamoufoxConfig], sentinel=None
+) -> _fetch_params:
     result = {}
     overrides = {}

@@ -219,32 +206,56 @@ def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
             result[arg] = session_value

     if overrides:
-        overrides = validate(overrides, FetchConfig).to_dict()
-        overrides.update(result)
-        return _fetch_params(**overrides)
+        validated_config = validate(overrides, model)
+        # Extract only the fields that _fetch_params needs from validated_config
+        validated_dict = {
+            f.name: getattr(validated_config, f.name)
+            for f in fields(_fetch_params)
+            if hasattr(validated_config, f.name)
+        }
+        # solve_cloudflare defaults to False for models that don't have it (PlaywrightConfig)
+        validated_dict.setdefault("solve_cloudflare", False)

-    if not result.get("solve_cloudflare"):
-        result["solve_cloudflare"] = False
+        validated_dict.update(result)
+        return _fetch_params(**validated_dict)
+
+    result.setdefault("solve_cloudflare", False)

     return _fetch_params(**result)


-@overload
-def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...
+# Cache default values for each model to reduce validation overhead
+models_default_values = {}
+
+for _model in (CamoufoxConfig, PlaywrightConfig):
+    _defaults = {}
+    if hasattr(_model, "__struct_defaults__") and hasattr(_model, "__struct_fields__"):
+        for field_name, default_value in zip(_model.__struct_fields__, _model.__struct_defaults__):  # type: ignore
+            # Skip factory defaults - these are msgspec._core.Factory instances
+            if type(default_value).__name__ != "Factory":
+                _defaults[field_name] = default_value
+
+    models_default_values[_model.__name__] = _defaults.copy()
+
+
+def _filter_defaults(params: Dict, model: str) -> Dict:
+    """Filter out parameters that match their default values to reduce validation overhead."""
+    defaults = models_default_values[model]
+    return {k: v for k, v in params.items() if k not in defaults or v != defaults[k]}


 @overload
-def validate(params: Dict, model: type[CamoufoxConfig]) -> CamoufoxConfig: ...
+def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...


 @overload
-def validate(params: Dict, model: type[FetchConfig]) -> FetchConfig: ...
+def validate(params: Dict, model: type[CamoufoxConfig]) -> CamoufoxConfig: ...


-def validate(
-    params: Dict, model: type[PlaywrightConfig] | type[CamoufoxConfig] | type[FetchConfig]
-) -> PlaywrightConfig | CamoufoxConfig | FetchConfig:
+def validate(params: Dict, model: type[PlaywrightConfig] | type[CamoufoxConfig]) -> PlaywrightConfig | CamoufoxConfig:
     try:
-        return convert(params, model)
+        # Filter out params with the default values (no need to validate them) to speed up validation
+        filtered = _filter_defaults(params, model.__name__)
+        return convert(filtered, model)
     except ValidationError as e:
         raise TypeError(f"Invalid argument type: {e}") from e
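`validate` now strips out any parameter that still equals its model default before handing the rest to msgspec's `convert`, so only explicitly overridden values pay the validation cost; the defaults are collected once at import time from each model's `__struct_fields__`/`__struct_defaults__`. A small self-contained illustration of the idea, using a toy struct rather than the library's own models:

    # Toy illustration of the default-filtering idea; not the library's code.
    from msgspec import Struct, convert

    class ToyConfig(Struct, kw_only=True):
        timeout: int = 30000
        network_idle: bool = False

    defaults = {"timeout": 30000, "network_idle": False}

    def filter_defaults(params: dict) -> dict:
        # Drop keys whose value matches the known default
        return {k: v for k, v in params.items() if k not in defaults or v != defaults[k]}

    raw = {"timeout": 30000, "network_idle": True}
    cfg = convert(filter_defaults(raw), ToyConfig)  # only {"network_idle": True} is actually validated
    assert cfg.timeout == 30000 and cfg.network_idle is True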
@@ -2,6 +2,7 @@ from functools import lru_cache
 from re import compile as re_compile

 from curl_cffi.requests import Response as CurlResponse
+from playwright._impl._errors import Error as PlaywrightError
 from playwright.sync_api import Page as SyncPage, Response as SyncResponse
 from playwright.async_api import Page as AsyncPage, Response as AsyncResponse

@@ -84,6 +85,7 @@ class ResponseFactory:
         first_response: SyncResponse,
         final_response: Optional[SyncResponse],
         parser_arguments: Dict,
+        automated_page: bool = False,
     ) -> Response:
         """
         Transforms a Playwright response into an internal `Response` object, encapsulating
@@ -99,6 +101,7 @@ class ResponseFactory:
         :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
         :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
             the `Response` object.
+        :param automated_page: If True, it means the `page_action` argument was being used, so the response retrieving method changes to use Playwright's page instead of the final response.

         :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
         :rtype: Response
@@ -114,7 +117,7 @@

         history = cls._process_response_history(first_response, parser_arguments)
         try:
-            page_content = final_response.text()
+            page_content = final_response.text() if not automated_page else cls._get_page_content(page)
         except Exception as e:  # pragma: no cover
             log.error(f"Error getting page content: {e}")
             page_content = ""
@@ -179,6 +182,36 @@

         return history

+    @classmethod
+    def _get_page_content(cls, page: SyncPage) -> str:
+        """
+        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        :param page: The page to extract content from.
+        :return:
+        """
+        while True:
+            try:
+                return page.content() or ""
+            except PlaywrightError:
+                page.wait_for_timeout(500)
+                continue
+        return ""  # pyright: ignore
+
+    @classmethod
+    async def _get_async_page_content(cls, page: AsyncPage) -> str:
+        """
+        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        :param page: The page to extract content from.
+        :return:
+        """
+        while True:
+            try:
+                return (await page.content()) or ""
+            except PlaywrightError:
+                await page.wait_for_timeout(500)
+                continue
+        return ""  # pyright: ignore
+
     @classmethod
     async def from_async_playwright_response(
         cls,
@@ -186,6 +219,7 @@ class ResponseFactory:
         first_response: AsyncResponse,
         final_response: Optional[AsyncResponse],
         parser_arguments: Dict,
+        automated_page: bool = False,
     ) -> Response:
         """
         Transforms a Playwright response into an internal `Response` object, encapsulating
@@ -201,6 +235,7 @@ class ResponseFactory:
         :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
         :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
             the `Response` object.
+        :param automated_page: If True, it means the `page_action` argument was being used, so the response retrieving method changes to use Playwright's page instead of the final response.

         :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
         :rtype: Response
@@ -216,7 +251,7 @@ class ResponseFactory:

         history = await cls._async_process_response_history(first_response, parser_arguments)
         try:
-            page_content = await final_response.text()
+            page_content = await (final_response.text() if not automated_page else cls._get_async_page_content(page))
         except Exception as e:  # pragma: no cover
             log.error(f"Error getting page content in async: {e}")
             page_content = ""
@@ -209,15 +209,3 @@ class StatusText:
     def get(cls, status_code: int) -> str:
         """Get the phrase for a given HTTP status code."""
         return cls._phrases.get(status_code, "Unknown Status Code")
-
-
-def get_variable_name(var: Any) -> Optional[str]:
-    """Get the name of a variable using global and local scopes.
-    :param var: The variable to find the name for
-    :return: The name of the variable if found, None otherwise
-    """
-    for scope in [globals(), locals()]:
-        for name, value in scope.items():
-            if value is var:
-                return name
-    return None
@@ -7,8 +7,9 @@ from platform import system as platform_system

 from tldextract import extract
 from browserforge.headers import Browser, HeaderGenerator
+from browserforge.headers.generator import SUPPORTED_OPERATING_SYSTEMS

-from scrapling.core._types import Dict, Literal
+from scrapling.core._types import Dict, Literal, Tuple

 __OS_NAME__ = platform_system()
 OSName = Literal["linux", "macos", "windows"]
@@ -29,12 +30,12 @@ def generate_convincing_referer(url: str) -> str:


 @lru_cache(1, typed=True)
-def get_os_name() -> OSName | None:
+def get_os_name() -> OSName | Tuple:
     """Get the current OS name in the same format needed for browserforge, if the OS is Unknown, return None so browserforge uses all.

     :return: Current OS name or `None` otherwise
     """
-    match __OS_NAME__:
+    match __OS_NAME__:  # pragma: no cover
         case "Linux":
             return "linux"
         case "Darwin":
@@ -42,7 +43,7 @@ def get_os_name() -> OSName | None:
         case "Windows":
             return "windows"
         case _:
-            return None
+            return SUPPORTED_OPERATING_SYSTEMS


 def generate_headers(browser_mode: bool = False) -> Dict:
@@ -63,10 +64,7 @@ def generate_headers(browser_mode: bool = False) -> Dict:
             Browser(name="edge", min_version=130),
         ]
     )
-    if os_name:
-        return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()
-    else:
-        return HeaderGenerator(browser=browsers, device="desktop").generate()
+    return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()


 __default_useragent__ = generate_headers(browser_mode=False).get("User-Agent")