scrapling-0.3.1-py3-none-any.whl → scrapling-0.3.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. scrapling/__init__.py +1 -1
  2. scrapling/cli.py +38 -51
  3. scrapling/core/_html_utils.py +3 -9
  4. scrapling/core/ai.py +5 -13
  5. scrapling/core/custom_types.py +19 -61
  6. scrapling/core/mixins.py +6 -28
  7. scrapling/core/shell.py +49 -127
  8. scrapling/core/storage.py +2 -8
  9. scrapling/core/translator.py +8 -20
  10. scrapling/core/utils/__init__.py +10 -0
  11. scrapling/core/utils/_shell.py +48 -0
  12. scrapling/core/{utils.py → utils/_utils.py} +5 -21
  13. scrapling/engines/__init__.py +0 -16
  14. scrapling/engines/_browsers/_base.py +297 -0
  15. scrapling/engines/_browsers/_camoufox.py +219 -296
  16. scrapling/engines/_browsers/_config_tools.py +2 -1
  17. scrapling/engines/_browsers/_controllers.py +201 -281
  18. scrapling/engines/_browsers/_page.py +37 -15
  19. scrapling/engines/_browsers/_validators.py +9 -15
  20. scrapling/engines/constants.py +3 -6
  21. scrapling/engines/static.py +25 -75
  22. scrapling/engines/toolbelt/__init__.py +1 -20
  23. scrapling/engines/toolbelt/convertor.py +95 -86
  24. scrapling/engines/toolbelt/custom.py +7 -99
  25. scrapling/engines/toolbelt/fingerprints.py +1 -3
  26. scrapling/engines/toolbelt/navigation.py +4 -58
  27. scrapling/fetchers.py +29 -24
  28. scrapling/parser.py +45 -122
  29. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/METADATA +54 -46
  30. scrapling-0.3.3.dist-info/RECORD +44 -0
  31. scrapling-0.3.1.dist-info/RECORD +0 -41
  32. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/WHEEL +0 -0
  33. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/entry_points.txt +0 -0
  34. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/licenses/LICENSE +0 -0
  35. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/top_level.txt +0 -0
scrapling/engines/_browsers/_camoufox.py

@@ -1,14 +1,8 @@
-from time import time, sleep
 from re import compile as re_compile
-from asyncio import sleep as asyncio_sleep, Lock
 
-from camoufox import DefaultAddons
-from camoufox.utils import launch_options as generate_launch_options
 from playwright.sync_api import (
     Response as SyncPlaywrightResponse,
     sync_playwright,
-    BrowserContext,
-    Playwright,
     Locator,
     Page,
 )
@@ -21,9 +15,9 @@ from playwright.async_api import (
     Page as async_Page,
 )
 
-from scrapling.core.utils import log
-from ._page import PageInfo, PagePool
 from ._validators import validate, CamoufoxConfig
+from ._base import SyncSession, AsyncSession, StealthySessionMixin
+from scrapling.core.utils import log
 from scrapling.core._types import (
     Dict,
     List,
@@ -31,19 +25,17 @@ from scrapling.core._types import (
     Callable,
     SelectorWaitStates,
 )
-from scrapling.engines.toolbelt import (
+from scrapling.engines.toolbelt.convertor import (
     Response,
     ResponseFactory,
-    async_intercept_route,
-    generate_convincing_referer,
-    get_os_name,
-    intercept_route,
 )
+from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
 
 __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
+_UNSET = object()
 
 
-class StealthySession:
+class StealthySession(StealthySessionMixin, SyncSession):
     """A Stealthy session manager with page pooling."""
 
     __slots__ = (
@@ -54,6 +46,7 @@ class StealthySession:
         "block_webrtc",
         "allow_webgl",
         "network_idle",
+        "load_dom",
         "humanize",
         "solve_cloudflare",
         "wait",
@@ -83,13 +76,14 @@ class StealthySession:
 
     def __init__(
         self,
-        max_pages: int = 1,
+        __max_pages: int = 1,
         headless: bool = True,  # noqa: F821
        block_images: bool = False,
         disable_resources: bool = False,
         block_webrtc: bool = False,
         allow_webgl: bool = True,
         network_idle: bool = False,
+        load_dom: bool = True,
         humanize: bool | float = True,
         solve_cloudflare: bool = False,
         wait: int | float = 0,
@@ -124,11 +118,12 @@ class StealthySession:
         :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
         :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
         :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
-        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
         :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
@@ -137,108 +132,47 @@ class StealthySession:
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
-        :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
         """
 
-        params = {
-            "max_pages": max_pages,
-            "headless": headless,
-            "block_images": block_images,
-            "disable_resources": disable_resources,
-            "block_webrtc": block_webrtc,
-            "allow_webgl": allow_webgl,
-            "network_idle": network_idle,
-            "humanize": humanize,
-            "solve_cloudflare": solve_cloudflare,
-            "wait": wait,
-            "timeout": timeout,
-            "page_action": page_action,
-            "wait_selector": wait_selector,
-            "init_script": init_script,
-            "addons": addons,
-            "wait_selector_state": wait_selector_state,
-            "cookies": cookies,
-            "google_search": google_search,
-            "extra_headers": extra_headers,
-            "proxy": proxy,
-            "os_randomize": os_randomize,
-            "disable_ads": disable_ads,
-            "geoip": geoip,
-            "selector_config": selector_config,
-            "additional_args": additional_args,
-        }
-        config = validate(params, CamoufoxConfig)
-
-        self.max_pages = config.max_pages
-        self.headless = config.headless
-        self.block_images = config.block_images
-        self.disable_resources = config.disable_resources
-        self.block_webrtc = config.block_webrtc
-        self.allow_webgl = config.allow_webgl
-        self.network_idle = config.network_idle
-        self.humanize = config.humanize
-        self.solve_cloudflare = config.solve_cloudflare
-        self.wait = config.wait
-        self.timeout = config.timeout
-        self.page_action = config.page_action
-        self.wait_selector = config.wait_selector
-        self.init_script = config.init_script
-        self.addons = config.addons
-        self.wait_selector_state = config.wait_selector_state
-        self.cookies = config.cookies
-        self.google_search = config.google_search
-        self.extra_headers = config.extra_headers
-        self.proxy = config.proxy
-        self.os_randomize = config.os_randomize
-        self.disable_ads = config.disable_ads
-        self.geoip = config.geoip
-        self.selector_config = config.selector_config
-        self.additional_args = config.additional_args
-
-        self.playwright: Optional[Playwright] = None
-        self.context: Optional[BrowserContext] = None
-        self.page_pool = PagePool(self.max_pages)
-        self._closed = False
-        self.selector_config = config.selector_config
-        self.page_action = config.page_action
-        self._headers_keys = (
-            set(map(str.lower, self.extra_headers.keys()))
-            if self.extra_headers
-            else set()
-        )
-        self.__initiate_browser_options__()
-
-    def __initiate_browser_options__(self):
-        """Initiate browser options."""
-        self.launch_options = generate_launch_options(
-            **{
-                "geoip": self.geoip,
-                "proxy": dict(self.proxy) if self.proxy else self.proxy,
-                "enable_cache": True,
-                "addons": self.addons,
-                "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
-                "headless": self.headless,
-                "humanize": True if self.solve_cloudflare else self.humanize,
-                "i_know_what_im_doing": True,  # To turn warnings off with the user configurations
-                "allow_webgl": self.allow_webgl,
-                "block_webrtc": self.block_webrtc,
-                "block_images": self.block_images,  # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
-                "os": None if self.os_randomize else get_os_name(),
-                "user_data_dir": "",
-                **self.additional_args,
-            }
+        self.__validate__(
+            wait=wait,
+            proxy=proxy,
+            geoip=geoip,
+            addons=addons,
+            timeout=timeout,
+            cookies=cookies,
+            headless=headless,
+            humanize=humanize,
+            load_dom=load_dom,
+            max_pages=__max_pages,
+            disable_ads=disable_ads,
+            allow_webgl=allow_webgl,
+            page_action=page_action,
+            init_script=init_script,
+            network_idle=network_idle,
+            block_images=block_images,
+            block_webrtc=block_webrtc,
+            os_randomize=os_randomize,
+            wait_selector=wait_selector,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            additional_args=additional_args,
+            selector_config=selector_config,
+            solve_cloudflare=solve_cloudflare,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
         )
+        super().__init__(max_pages=self.max_pages)
 
     def __create__(self):
         """Create a browser for this instance and context."""
         self.playwright = sync_playwright().start()
-        self.context = (
-            self.playwright.firefox.launch_persistent_context(  # pragma: no cover
-                **self.launch_options
-            )
+        self.context = self.playwright.firefox.launch_persistent_context(  # pragma: no cover
+            **self.launch_options
         )
+
         if self.init_script:  # pragma: no cover
             self.context.add_init_script(path=self.init_script)
 
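The `_UNSET = object()` sentinel added near the top of the file is what lets the new per-request overrides distinguish "argument not passed" from legitimate falsy values such as `False`, `0`, or `None`. The `__validate__` and `_get_with_precedence` helpers come from the new `scrapling/engines/_browsers/_base.py` (item 14 in the file list), which is not reproduced in this section; a minimal sketch of the precedence pattern, assuming `_base.py` implements it in the obvious way:

# Sketch of the sentinel-precedence pattern; the real helpers live in
# scrapling/engines/_browsers/_base.py and may be implemented differently.
_UNSET = object()  # unique marker meaning "the caller did not pass this keyword at all"


def _get_with_precedence(request_value, session_default, sentinel):
    """Prefer the per-request value when it was explicitly given, else fall back to the session default."""
    return session_default if request_value is sentinel else request_value


# An explicit False wins over the session default, while "not passed" falls back to it.
assert _get_with_precedence(False, True, _UNSET) is False
assert _get_with_precedence(_UNSET, True, _UNSET) is True

This is also why every optional parameter of the new `fetch()` signatures below defaults to `_UNSET` rather than to `None`.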
@@ -267,68 +201,6 @@ class StealthySession:
 
         self._closed = True
 
-    def _get_or_create_page(self) -> PageInfo:  # pragma: no cover
-        """Get an available page or create a new one"""
-        # Try to get a ready page first
-        page_info = self.page_pool.get_ready_page()
-        if page_info:
-            return page_info
-
-        # Create a new page if under limit
-        if self.page_pool.pages_count < self.max_pages:
-            page = self.context.new_page()
-            page.set_default_navigation_timeout(self.timeout)
-            page.set_default_timeout(self.timeout)
-            if self.extra_headers:
-                page.set_extra_http_headers(self.extra_headers)
-
-            if self.disable_resources:
-                page.route("**/*", intercept_route)
-
-            return self.page_pool.add_page(page)
-
-        # Wait for a page to become available
-        max_wait = 30
-        start_time = time()
-
-        while time() - start_time < max_wait:
-            page_info = self.page_pool.get_ready_page()
-            if page_info:
-                return page_info
-            sleep(0.05)
-
-        raise TimeoutError("No pages available within timeout period")
-
-    @staticmethod
-    def _detect_cloudflare(page_content):
-        """
-        Detect the type of Cloudflare challenge present in the provided page content.
-
-        This function analyzes the given page content to identify whether a specific
-        type of Cloudflare challenge is present. It checks for three predefined
-        challenge types: non-interactive, managed, and interactive. If a challenge
-        type is detected, it returns the corresponding type as a string. If no
-        challenge type is detected, it returns None.
-
-        Args:
-            page_content (str): The content of the page to analyze for Cloudflare
-                challenge types.
-
-        Returns:
-            str: A string representing the detected Cloudflare challenge type, if
-                found. Returns None if no challenge matches.
-        """
-        challenge_types = (
-            "non-interactive",
-            "managed",
-            "interactive",
-        )
-        for ctype in challenge_types:
-            if f"cType: '{ctype}'" in page_content:
-                return ctype
-
-        return None
-
     def _solve_cloudflare(self, page: Page) -> None:  # pragma: no cover
         """Solve the cloudflare challenge displayed on the playwright page passed
 
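The page-pool bookkeeping (`_get_or_create_page`) and the Cloudflare challenge sniffing (`_detect_cloudflare`) are deleted here rather than rewritten: judging by the new imports at the top of the file and the new `_browsers/_base.py` in the file list, they now live in the shared `SyncSession`/`AsyncSession`/`StealthySessionMixin` classes, and `fetch()` calls `self._get_page(timeout, extra_headers, disable_resources)` instead. A rough reconstruction of what that shared helper presumably does, based only on the 0.3.1 code removed above (the real `_base.py` is not part of this diff, and its pool method names may have changed along with `mark_ready` → `mark_finished`):

from time import time, sleep


# Sketch only: approximates the removed sync _get_or_create_page() with the per-call
# arguments that _get_page() receives in 0.3.3. `session` stands for a StealthySession
# instance and `intercept_route` for scrapling's resource-blocking route handler.
def _get_page_sketch(session, timeout, extra_headers, disable_resources, intercept_route):
    page_info = session.page_pool.get_ready_page()  # reuse an idle page if one exists
    if page_info:
        return page_info

    if session.page_pool.pages_count < session.max_pages:  # room left in the pool: open a new tab
        page = session.context.new_page()
        page.set_default_navigation_timeout(timeout)
        page.set_default_timeout(timeout)
        if extra_headers:
            page.set_extra_http_headers(extra_headers)
        if disable_resources:
            page.route("**/*", intercept_route)
        return session.page_pool.add_page(page)

    start_time = time()  # pool is full: wait up to 30 seconds for a page to be released
    while time() - start_time < 30:
        page_info = session.page_pool.get_ready_page()
        if page_info:
            return page_info
        sleep(0.05)
    raise TimeoutError("No pages available within timeout period")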
@@ -375,20 +247,66 @@ class StealthySession:
             log.info("Cloudflare captcha is solved")
             return
 
-    def fetch(self, url: str) -> Response:
+    def fetch(
+        self,
+        url: str,
+        google_search: bool = _UNSET,
+        timeout: int | float = _UNSET,
+        wait: int | float = _UNSET,
+        page_action: Optional[Callable] = _UNSET,
+        extra_headers: Optional[Dict[str, str]] = _UNSET,
+        disable_resources: bool = _UNSET,
+        wait_selector: Optional[str] = _UNSET,
+        wait_selector_state: SelectorWaitStates = _UNSET,
+        network_idle: bool = _UNSET,
+        load_dom: bool = _UNSET,
+        solve_cloudflare: bool = _UNSET,
+        selector_config: Optional[Dict] = _UNSET,
+    ) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
         :param url: The Target url.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
+        # Validate all resolved parameters
+        params = validate(
+            dict(
+                google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
+                timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
+                wait=self._get_with_precedence(wait, self.wait, _UNSET),
+                page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
+                extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
+                disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
+                wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
+                wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
+                network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
+                load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
+                solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
+                selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
+            ),
+            CamoufoxConfig,
+        )
+
         if self._closed:  # pragma: no cover
             raise RuntimeError("Context manager has been closed")
 
         final_response = None
         referer = (
-            generate_convincing_referer(url)
-            if (self.google_search and "referer" not in self._headers_keys)
-            else None
+            generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
         )
 
         def handle_response(finished_response: SyncPlaywrightResponse):
@@ -399,54 +317,57 @@ class StealthySession:
             ):
                 final_response = finished_response
 
-        page_info = self._get_or_create_page()
+        page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)
 
         try:  # pragma: no cover
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
             first_response = page_info.page.goto(url, referer=referer)
-            page_info.page.wait_for_load_state(state="domcontentloaded")
+            if params.load_dom:
+                page_info.page.wait_for_load_state(state="domcontentloaded")
 
-            if self.network_idle:
+            if params.network_idle:
                 page_info.page.wait_for_load_state("networkidle")
 
             if not first_response:
                 raise RuntimeError(f"Failed to get response for {url}")
 
-            if self.solve_cloudflare:
+            if params.solve_cloudflare:
                 self._solve_cloudflare(page_info.page)
                 # Make sure the page is fully loaded after the captcha
                 page_info.page.wait_for_load_state(state="load")
-                page_info.page.wait_for_load_state(state="domcontentloaded")
-                if self.network_idle:
+                if params.load_dom:
+                    page_info.page.wait_for_load_state(state="domcontentloaded")
+                if params.network_idle:
                     page_info.page.wait_for_load_state("networkidle")
 
-            if self.page_action is not None:
+            if params.page_action:
                 try:
-                    page_info.page = self.page_action(page_info.page)
+                    _ = params.page_action(page_info.page)
                 except Exception as e:
                     log.error(f"Error executing page_action: {e}")
 
-            if self.wait_selector:
+            if params.wait_selector:
                 try:
-                    waiter: Locator = page_info.page.locator(self.wait_selector)
-                    waiter.first.wait_for(state=self.wait_selector_state)
+                    waiter: Locator = page_info.page.locator(params.wait_selector)
+                    waiter.first.wait_for(state=params.wait_selector_state)
                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
                     page_info.page.wait_for_load_state(state="load")
-                    page_info.page.wait_for_load_state(state="domcontentloaded")
-                    if self.network_idle:
+                    if params.load_dom:
+                        page_info.page.wait_for_load_state(state="domcontentloaded")
+                    if params.network_idle:
                         page_info.page.wait_for_load_state("networkidle")
                 except Exception as e:
-                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")
+                    log.error(f"Error waiting for selector {params.wait_selector}: {e}")
 
-            page_info.page.wait_for_timeout(self.wait)
+            page_info.page.wait_for_timeout(params.wait)
             response = ResponseFactory.from_playwright_response(
-                page_info.page, first_response, final_response, self.selector_config
+                page_info.page, first_response, final_response, params.selector_config
             )
 
-            # Mark the page as ready for next use
-            page_info.mark_ready()
+            # Mark the page as finished for next use
+            page_info.mark_finished()
 
             return response
 
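Two behavior changes in the sync `fetch()` above are easy to miss: the `page_action` callback's return value is now ignored (in 0.3.1 it had to return the `page` object), and the DOM/network-idle waits are gated by the per-call `load_dom` and `network_idle` values. A hedged usage sketch; the URL and CSS selector are placeholders, and the class may also be reachable through a higher-level re-export such as `scrapling.fetchers`:

from scrapling.engines._browsers._camoufox import StealthySession  # the module changed in this diff


def accept_cookies(page):
    # Since 0.3.3 a page_action only performs its actions; returning `page` is no longer required.
    page.click("button#accept-cookies")  # placeholder selector


with StealthySession(headless=True, google_search=True) as session:
    response = session.fetch(
        "https://example.com",       # placeholder URL
        page_action=accept_cookies,  # per-request override of the session-level page_action
        load_dom=False,              # new flag: skip the DOMContentLoaded wait for this request only
        wait=1_000,                  # linger one second before the page goes back to the pool
    )
    print(response.status)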
@@ -454,17 +375,8 @@ class StealthySession:
             page_info.mark_error()
             raise e
 
-    def get_pool_stats(self) -> Dict[str, int]:
-        """Get statistics about the current page pool"""
-        return {
-            "total_pages": self.page_pool.pages_count,
-            "ready_pages": self.page_pool.ready_count,
-            "busy_pages": self.page_pool.busy_count,
-            "max_pages": self.max_pages,
-        }
-
 
-class AsyncStealthySession(StealthySession):
+class AsyncStealthySession(StealthySessionMixin, AsyncSession):
     """A Stealthy session manager with page pooling."""
 
     def __init__(
@@ -476,6 +388,7 @@ class AsyncStealthySession(StealthySession):
         block_webrtc: bool = False,
         allow_webgl: bool = True,
         network_idle: bool = False,
+        load_dom: bool = True,
         humanize: bool | float = True,
         solve_cloudflare: bool = False,
         wait: int | float = 0,
@@ -510,11 +423,12 @@ class AsyncStealthySession(StealthySession):
         :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
         :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
         :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
-        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
         :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
@@ -527,47 +441,43 @@ class AsyncStealthySession(StealthySession):
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
         """
-        super().__init__(
-            max_pages,
-            headless,
-            block_images,
-            disable_resources,
-            block_webrtc,
-            allow_webgl,
-            network_idle,
-            humanize,
-            solve_cloudflare,
-            wait,
-            timeout,
-            page_action,
-            wait_selector,
-            init_script,
-            addons,
-            wait_selector_state,
-            cookies,
-            google_search,
-            extra_headers,
-            proxy,
-            os_randomize,
-            disable_ads,
-            geoip,
-            selector_config,
-            additional_args,
+        self.__validate__(
+            wait=wait,
+            proxy=proxy,
+            geoip=geoip,
+            addons=addons,
+            timeout=timeout,
+            cookies=cookies,
+            headless=headless,
+            load_dom=load_dom,
+            humanize=humanize,
+            max_pages=max_pages,
+            disable_ads=disable_ads,
+            allow_webgl=allow_webgl,
+            page_action=page_action,
+            init_script=init_script,
+            network_idle=network_idle,
+            block_images=block_images,
+            block_webrtc=block_webrtc,
+            os_randomize=os_randomize,
+            wait_selector=wait_selector,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            additional_args=additional_args,
+            selector_config=selector_config,
+            solve_cloudflare=solve_cloudflare,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
         )
-        self.playwright: Optional[AsyncPlaywright] = None
-        self.context: Optional[AsyncBrowserContext] = None
-        self._lock = Lock()
-        self.__enter__ = None
-        self.__exit__ = None
+        super().__init__(max_pages=self.max_pages)
 
     async def __create__(self):
         """Create a browser for this instance and context."""
         self.playwright: AsyncPlaywright = await async_playwright().start()
-        self.context: AsyncBrowserContext = (
-            await self.playwright.firefox.launch_persistent_context(
-                **self.launch_options
-            )
+        self.context: AsyncBrowserContext = await self.playwright.firefox.launch_persistent_context(
+            **self.launch_options
         )
+
         if self.init_script:  # pragma: no cover
             await self.context.add_init_script(path=self.init_script)
 
@@ -596,39 +506,6 @@ class AsyncStealthySession(StealthySession):
 
         self._closed = True
 
-    async def _get_or_create_page(self) -> PageInfo:
-        """Get an available page or create a new one"""
-        async with self._lock:
-            # Try to get a ready page first
-            page_info = self.page_pool.get_ready_page()
-            if page_info:
-                return page_info
-
-            # Create a new page if under limit
-            if self.page_pool.pages_count < self.max_pages:
-                page = await self.context.new_page()
-                page.set_default_navigation_timeout(self.timeout)
-                page.set_default_timeout(self.timeout)
-                if self.extra_headers:
-                    await page.set_extra_http_headers(self.extra_headers)
-
-                if self.disable_resources:
-                    await page.route("**/*", async_intercept_route)
-
-                return self.page_pool.add_page(page)
-
-            # Wait for a page to become available
-            max_wait = 30
-            start_time = time()
-
-            while time() - start_time < max_wait:  # pragma: no cover
-                page_info = self.page_pool.get_ready_page()
-                if page_info:
-                    return page_info
-                await asyncio_sleep(0.05)
-
-            raise TimeoutError("No pages available within timeout period")
-
     async def _solve_cloudflare(self, page: async_Page):
         """Solve the cloudflare challenge displayed on the playwright page passed. The async version
 
@@ -664,9 +541,7 @@ class AsyncStealthySession(StealthySession):
             await page.wait_for_timeout(500)
 
             # Calculate the Captcha coordinates for any viewport
-            outer_box = await page.locator(
-                ".main-content p+div>div>div"
-            ).bounding_box()
+            outer_box = await page.locator(".main-content p+div>div>div").bounding_box()
             captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
 
             # Move the mouse to the center of the window, then press and hold the left mouse button
@@ -677,20 +552,65 @@ class AsyncStealthySession(StealthySession):
             log.info("Cloudflare captcha is solved")
             return
 
-    async def fetch(self, url: str) -> Response:
+    async def fetch(
+        self,
+        url: str,
+        google_search: bool = _UNSET,
+        timeout: int | float = _UNSET,
+        wait: int | float = _UNSET,
+        page_action: Optional[Callable] = _UNSET,
+        extra_headers: Optional[Dict[str, str]] = _UNSET,
+        disable_resources: bool = _UNSET,
+        wait_selector: Optional[str] = _UNSET,
+        wait_selector_state: SelectorWaitStates = _UNSET,
+        network_idle: bool = _UNSET,
+        load_dom: bool = _UNSET,
+        solve_cloudflare: bool = _UNSET,
+        selector_config: Optional[Dict] = _UNSET,
+    ) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
         :param url: The Target url.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
+        params = validate(
+            dict(
+                google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
+                timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
+                wait=self._get_with_precedence(wait, self.wait, _UNSET),
+                page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
+                extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
+                disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
+                wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
+                wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
+                network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
+                load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
+                solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
+                selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
+            ),
+            CamoufoxConfig,
+        )
+
         if self._closed:  # pragma: no cover
             raise RuntimeError("Context manager has been closed")
 
         final_response = None
         referer = (
-            generate_convincing_referer(url)
-            if (self.google_search and "referer" not in self._headers_keys)
-            else None
+            generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
        )
 
         async def handle_response(finished_response: AsyncPlaywrightResponse):
@@ -701,56 +621,59 @@ class AsyncStealthySession(StealthySession):
             ):
                 final_response = finished_response
 
-        page_info = await self._get_or_create_page()
+        page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)
 
         try:
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
             first_response = await page_info.page.goto(url, referer=referer)
-            await page_info.page.wait_for_load_state(state="domcontentloaded")
+            if params.load_dom:
+                await page_info.page.wait_for_load_state(state="domcontentloaded")
 
-            if self.network_idle:
+            if params.network_idle:
                 await page_info.page.wait_for_load_state("networkidle")
 
             if not first_response:
                 raise RuntimeError(f"Failed to get response for {url}")
 
-            if self.solve_cloudflare:
+            if params.solve_cloudflare:
                 await self._solve_cloudflare(page_info.page)
                 # Make sure the page is fully loaded after the captcha
                 await page_info.page.wait_for_load_state(state="load")
-                await page_info.page.wait_for_load_state(state="domcontentloaded")
-                if self.network_idle:
+                if params.load_dom:
+                    await page_info.page.wait_for_load_state(state="domcontentloaded")
+                if params.network_idle:
                     await page_info.page.wait_for_load_state("networkidle")
 
-            if self.page_action is not None:
+            if params.page_action:
                 try:
-                    page_info.page = await self.page_action(page_info.page)
+                    _ = await params.page_action(page_info.page)
                 except Exception as e:
                     log.error(f"Error executing page_action: {e}")
 
-            if self.wait_selector:
+            if params.wait_selector:
                 try:
-                    waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
-                    await waiter.first.wait_for(state=self.wait_selector_state)
+                    waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
+                    await waiter.first.wait_for(state=params.wait_selector_state)
                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
                     await page_info.page.wait_for_load_state(state="load")
-                    await page_info.page.wait_for_load_state(state="domcontentloaded")
-                    if self.network_idle:
+                    if params.load_dom:
+                        await page_info.page.wait_for_load_state(state="domcontentloaded")
+                    if params.network_idle:
                         await page_info.page.wait_for_load_state("networkidle")
                 except Exception as e:
-                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")
+                    log.error(f"Error waiting for selector {params.wait_selector}: {e}")
 
-            await page_info.page.wait_for_timeout(self.wait)
+            await page_info.page.wait_for_timeout(params.wait)
 
             # Create response object
             response = await ResponseFactory.from_async_playwright_response(
-                page_info.page, first_response, final_response, self.selector_config
+                page_info.page, first_response, final_response, params.selector_config
             )
 
-            # Mark the page as ready for next use
-            page_info.mark_ready()
+            # Mark the page as finished for next use
+            page_info.mark_finished()
 
             return response
 
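The async class mirrors the same changes, and since `AsyncStealthySession` keeps a public `max_pages` argument and the page pool, the per-request overrides compose naturally with concurrent fetches. A minimal sketch under the same assumptions (placeholder URLs, import path as in this diff):

import asyncio

from scrapling.engines._browsers._camoufox import AsyncStealthySession  # the module changed in this diff


async def main():
    # Two tabs in the pool; each fetch can override the session defaults independently.
    async with AsyncStealthySession(max_pages=2, solve_cloudflare=False) as session:
        responses = await asyncio.gather(
            session.fetch("https://example.com/a", network_idle=True),     # placeholder URLs
            session.fetch("https://example.com/b", solve_cloudflare=True),
        )
        for response in responses:
            print(response.status)


asyncio.run(main())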