scrapling-0.3.1-py3-none-any.whl → scrapling-0.3.2-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. scrapling/__init__.py +1 -1
  2. scrapling/cli.py +38 -51
  3. scrapling/core/_html_utils.py +3 -9
  4. scrapling/core/ai.py +5 -13
  5. scrapling/core/custom_types.py +19 -61
  6. scrapling/core/mixins.py +6 -28
  7. scrapling/core/shell.py +49 -127
  8. scrapling/core/storage.py +2 -8
  9. scrapling/core/translator.py +8 -20
  10. scrapling/core/utils/__init__.py +10 -0
  11. scrapling/core/utils/_shell.py +48 -0
  12. scrapling/core/{utils.py → utils/_utils.py} +5 -21
  13. scrapling/engines/__init__.py +0 -16
  14. scrapling/engines/_browsers/_base.py +297 -0
  15. scrapling/engines/_browsers/_camoufox.py +227 -296
  16. scrapling/engines/_browsers/_config_tools.py +2 -1
  17. scrapling/engines/_browsers/_controllers.py +209 -281
  18. scrapling/engines/_browsers/_page.py +37 -15
  19. scrapling/engines/_browsers/_validators.py +9 -15
  20. scrapling/engines/constants.py +3 -6
  21. scrapling/engines/static.py +25 -75
  22. scrapling/engines/toolbelt/__init__.py +1 -20
  23. scrapling/engines/toolbelt/convertor.py +95 -86
  24. scrapling/engines/toolbelt/custom.py +7 -99
  25. scrapling/engines/toolbelt/fingerprints.py +1 -3
  26. scrapling/engines/toolbelt/navigation.py +4 -58
  27. scrapling/fetchers.py +29 -24
  28. scrapling/parser.py +45 -122
  29. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/METADATA +54 -46
  30. scrapling-0.3.2.dist-info/RECORD +44 -0
  31. scrapling-0.3.1.dist-info/RECORD +0 -41
  32. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/WHEEL +0 -0
  33. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/entry_points.txt +0 -0
  34. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/licenses/LICENSE +0 -0
  35. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,8 @@
1
- from time import time, sleep
2
1
  from re import compile as re_compile
3
- from asyncio import sleep as asyncio_sleep, Lock
4
2
 
5
- from camoufox import DefaultAddons
6
- from camoufox.utils import launch_options as generate_launch_options
7
3
  from playwright.sync_api import (
8
4
  Response as SyncPlaywrightResponse,
9
5
  sync_playwright,
10
- BrowserContext,
11
- Playwright,
12
6
  Locator,
13
7
  Page,
14
8
  )
@@ -21,9 +15,9 @@ from playwright.async_api import (
21
15
  Page as async_Page,
22
16
  )
23
17
 
24
- from scrapling.core.utils import log
25
- from ._page import PageInfo, PagePool
26
18
  from ._validators import validate, CamoufoxConfig
19
+ from ._base import SyncSession, AsyncSession, StealthySessionMixin
20
+ from scrapling.core.utils import log
27
21
  from scrapling.core._types import (
28
22
  Dict,
29
23
  List,
@@ -31,19 +25,17 @@ from scrapling.core._types import (
31
25
  Callable,
32
26
  SelectorWaitStates,
33
27
  )
34
- from scrapling.engines.toolbelt import (
28
+ from scrapling.engines.toolbelt.convertor import (
35
29
  Response,
36
30
  ResponseFactory,
37
- async_intercept_route,
38
- generate_convincing_referer,
39
- get_os_name,
40
- intercept_route,
41
31
  )
32
+ from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
42
33
 
43
34
  __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
35
+ _UNSET = object()
44
36
 
45
37
 
46
- class StealthySession:
38
+ class StealthySession(StealthySessionMixin, SyncSession):
47
39
  """A Stealthy session manager with page pooling."""
48
40
 
49
41
  __slots__ = (
@@ -54,6 +46,7 @@ class StealthySession:
54
46
  "block_webrtc",
55
47
  "allow_webgl",
56
48
  "network_idle",
49
+ "load_dom",
57
50
  "humanize",
58
51
  "solve_cloudflare",
59
52
  "wait",
@@ -83,13 +76,14 @@ class StealthySession:
83
76
 
84
77
  def __init__(
85
78
  self,
86
- max_pages: int = 1,
79
+ __max_pages: int = 1,
87
80
  headless: bool = True, # noqa: F821
88
81
  block_images: bool = False,
89
82
  disable_resources: bool = False,
90
83
  block_webrtc: bool = False,
91
84
  allow_webgl: bool = True,
92
85
  network_idle: bool = False,
86
+ load_dom: bool = True,
93
87
  humanize: bool | float = True,
94
88
  solve_cloudflare: bool = False,
95
89
  wait: int | float = 0,
@@ -124,11 +118,12 @@ class StealthySession:
124
118
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
125
119
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
126
120
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
121
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
127
122
  :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
128
123
  :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
129
124
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
130
125
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
131
- :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
126
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
132
127
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
133
128
  :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
134
129
  :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
@@ -137,108 +132,51 @@ class StealthySession:
137
132
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
138
133
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
139
134
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
140
- :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
141
135
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
142
136
  :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
143
137
  """
144
138
 
145
- params = {
146
- "max_pages": max_pages,
147
- "headless": headless,
148
- "block_images": block_images,
149
- "disable_resources": disable_resources,
150
- "block_webrtc": block_webrtc,
151
- "allow_webgl": allow_webgl,
152
- "network_idle": network_idle,
153
- "humanize": humanize,
154
- "solve_cloudflare": solve_cloudflare,
155
- "wait": wait,
156
- "timeout": timeout,
157
- "page_action": page_action,
158
- "wait_selector": wait_selector,
159
- "init_script": init_script,
160
- "addons": addons,
161
- "wait_selector_state": wait_selector_state,
162
- "cookies": cookies,
163
- "google_search": google_search,
164
- "extra_headers": extra_headers,
165
- "proxy": proxy,
166
- "os_randomize": os_randomize,
167
- "disable_ads": disable_ads,
168
- "geoip": geoip,
169
- "selector_config": selector_config,
170
- "additional_args": additional_args,
171
- }
172
- config = validate(params, CamoufoxConfig)
173
-
174
- self.max_pages = config.max_pages
175
- self.headless = config.headless
176
- self.block_images = config.block_images
177
- self.disable_resources = config.disable_resources
178
- self.block_webrtc = config.block_webrtc
179
- self.allow_webgl = config.allow_webgl
180
- self.network_idle = config.network_idle
181
- self.humanize = config.humanize
182
- self.solve_cloudflare = config.solve_cloudflare
183
- self.wait = config.wait
184
- self.timeout = config.timeout
185
- self.page_action = config.page_action
186
- self.wait_selector = config.wait_selector
187
- self.init_script = config.init_script
188
- self.addons = config.addons
189
- self.wait_selector_state = config.wait_selector_state
190
- self.cookies = config.cookies
191
- self.google_search = config.google_search
192
- self.extra_headers = config.extra_headers
193
- self.proxy = config.proxy
194
- self.os_randomize = config.os_randomize
195
- self.disable_ads = config.disable_ads
196
- self.geoip = config.geoip
197
- self.selector_config = config.selector_config
198
- self.additional_args = config.additional_args
199
-
200
- self.playwright: Optional[Playwright] = None
201
- self.context: Optional[BrowserContext] = None
202
- self.page_pool = PagePool(self.max_pages)
203
- self._closed = False
204
- self.selector_config = config.selector_config
205
- self.page_action = config.page_action
206
- self._headers_keys = (
207
- set(map(str.lower, self.extra_headers.keys()))
208
- if self.extra_headers
209
- else set()
210
- )
211
- self.__initiate_browser_options__()
212
-
213
- def __initiate_browser_options__(self):
214
- """Initiate browser options."""
215
- self.launch_options = generate_launch_options(
216
- **{
217
- "geoip": self.geoip,
218
- "proxy": dict(self.proxy) if self.proxy else self.proxy,
219
- "enable_cache": True,
220
- "addons": self.addons,
221
- "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
222
- "headless": self.headless,
223
- "humanize": True if self.solve_cloudflare else self.humanize,
224
- "i_know_what_im_doing": True, # To turn warnings off with the user configurations
225
- "allow_webgl": self.allow_webgl,
226
- "block_webrtc": self.block_webrtc,
227
- "block_images": self.block_images, # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
228
- "os": None if self.os_randomize else get_os_name(),
229
- "user_data_dir": "",
230
- **self.additional_args,
231
- }
139
+ self.__validate__(
140
+ wait=wait,
141
+ proxy=proxy,
142
+ geoip=geoip,
143
+ addons=addons,
144
+ timeout=timeout,
145
+ cookies=cookies,
146
+ headless=headless,
147
+ humanize=humanize,
148
+ load_dom=load_dom,
149
+ max_pages=__max_pages,
150
+ disable_ads=disable_ads,
151
+ allow_webgl=allow_webgl,
152
+ page_action=page_action,
153
+ init_script=init_script,
154
+ network_idle=network_idle,
155
+ block_images=block_images,
156
+ block_webrtc=block_webrtc,
157
+ os_randomize=os_randomize,
158
+ wait_selector=wait_selector,
159
+ google_search=google_search,
160
+ extra_headers=extra_headers,
161
+ additional_args=additional_args,
162
+ selector_config=selector_config,
163
+ solve_cloudflare=solve_cloudflare,
164
+ disable_resources=disable_resources,
165
+ wait_selector_state=wait_selector_state,
232
166
  )
167
+ super().__init__(max_pages=self.max_pages)
233
168
 
234
169
  def __create__(self):
235
170
  """Create a browser for this instance and context."""
236
171
  self.playwright = sync_playwright().start()
237
- self.context = (
238
- self.playwright.firefox.launch_persistent_context( # pragma: no cover
239
- **self.launch_options
240
- )
172
+ self.context = self.playwright.firefox.launch_persistent_context( # pragma: no cover
173
+ **self.launch_options
241
174
  )
175
+
176
+ # Get the default page and close it
177
+ default_page = self.context.pages[0]
178
+ default_page.close()
179
+
242
180
  if self.init_script: # pragma: no cover
243
181
  self.context.add_init_script(path=self.init_script)
244
182
 
@@ -267,68 +205,6 @@ class StealthySession:
267
205
 
268
206
  self._closed = True
269
207
 
270
- def _get_or_create_page(self) -> PageInfo: # pragma: no cover
271
- """Get an available page or create a new one"""
272
- # Try to get a ready page first
273
- page_info = self.page_pool.get_ready_page()
274
- if page_info:
275
- return page_info
276
-
277
- # Create a new page if under limit
278
- if self.page_pool.pages_count < self.max_pages:
279
- page = self.context.new_page()
280
- page.set_default_navigation_timeout(self.timeout)
281
- page.set_default_timeout(self.timeout)
282
- if self.extra_headers:
283
- page.set_extra_http_headers(self.extra_headers)
284
-
285
- if self.disable_resources:
286
- page.route("**/*", intercept_route)
287
-
288
- return self.page_pool.add_page(page)
289
-
290
- # Wait for a page to become available
291
- max_wait = 30
292
- start_time = time()
293
-
294
- while time() - start_time < max_wait:
295
- page_info = self.page_pool.get_ready_page()
296
- if page_info:
297
- return page_info
298
- sleep(0.05)
299
-
300
- raise TimeoutError("No pages available within timeout period")
301
-
302
- @staticmethod
303
- def _detect_cloudflare(page_content):
304
- """
305
- Detect the type of Cloudflare challenge present in the provided page content.
306
-
307
- This function analyzes the given page content to identify whether a specific
308
- type of Cloudflare challenge is present. It checks for three predefined
309
- challenge types: non-interactive, managed, and interactive. If a challenge
310
- type is detected, it returns the corresponding type as a string. If no
311
- challenge type is detected, it returns None.
312
-
313
- Args:
314
- page_content (str): The content of the page to analyze for Cloudflare
315
- challenge types.
316
-
317
- Returns:
318
- str: A string representing the detected Cloudflare challenge type, if
319
- found. Returns None if no challenge matches.
320
- """
321
- challenge_types = (
322
- "non-interactive",
323
- "managed",
324
- "interactive",
325
- )
326
- for ctype in challenge_types:
327
- if f"cType: '{ctype}'" in page_content:
328
- return ctype
329
-
330
- return None
331
-
332
208
  def _solve_cloudflare(self, page: Page) -> None: # pragma: no cover
333
209
  """Solve the cloudflare challenge displayed on the playwright page passed
334
210
 
@@ -375,20 +251,66 @@ class StealthySession:
375
251
  log.info("Cloudflare captcha is solved")
376
252
  return
377
253
 
378
- def fetch(self, url: str) -> Response:
254
+ def fetch(
255
+ self,
256
+ url: str,
257
+ google_search: bool = _UNSET,
258
+ timeout: int | float = _UNSET,
259
+ wait: int | float = _UNSET,
260
+ page_action: Optional[Callable] = _UNSET,
261
+ extra_headers: Optional[Dict[str, str]] = _UNSET,
262
+ disable_resources: bool = _UNSET,
263
+ wait_selector: Optional[str] = _UNSET,
264
+ wait_selector_state: SelectorWaitStates = _UNSET,
265
+ network_idle: bool = _UNSET,
266
+ load_dom: bool = _UNSET,
267
+ solve_cloudflare: bool = _UNSET,
268
+ selector_config: Optional[Dict] = _UNSET,
269
+ ) -> Response:
379
270
  """Opens up the browser and do your request based on your chosen options.
380
271
 
381
272
  :param url: The Target url.
273
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
274
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
275
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
276
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
277
+ :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
278
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
279
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
280
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
281
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
282
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
283
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
284
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
285
+ :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
286
+ :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
382
287
  :return: A `Response` object.
383
288
  """
289
+ # Validate all resolved parameters
290
+ params = validate(
291
+ dict(
292
+ google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
293
+ timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
294
+ wait=self._get_with_precedence(wait, self.wait, _UNSET),
295
+ page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
296
+ extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
297
+ disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
298
+ wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
299
+ wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
300
+ network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
301
+ load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
302
+ solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
303
+ selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
304
+ ),
305
+ CamoufoxConfig,
306
+ )
307
+
384
308
  if self._closed: # pragma: no cover
385
309
  raise RuntimeError("Context manager has been closed")
386
310
 
387
311
  final_response = None
388
312
  referer = (
389
- generate_convincing_referer(url)
390
- if (self.google_search and "referer" not in self._headers_keys)
391
- else None
313
+ generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
392
314
  )
393
315
 
394
316
  def handle_response(finished_response: SyncPlaywrightResponse):
@@ -399,54 +321,57 @@ class StealthySession:
399
321
  ):
400
322
  final_response = finished_response
401
323
 
402
- page_info = self._get_or_create_page()
324
+ page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
403
325
  page_info.mark_busy(url=url)
404
326
 
405
327
  try: # pragma: no cover
406
328
  # Navigate to URL and wait for a specified state
407
329
  page_info.page.on("response", handle_response)
408
330
  first_response = page_info.page.goto(url, referer=referer)
409
- page_info.page.wait_for_load_state(state="domcontentloaded")
331
+ if params.load_dom:
332
+ page_info.page.wait_for_load_state(state="domcontentloaded")
410
333
 
411
- if self.network_idle:
334
+ if params.network_idle:
412
335
  page_info.page.wait_for_load_state("networkidle")
413
336
 
414
337
  if not first_response:
415
338
  raise RuntimeError(f"Failed to get response for {url}")
416
339
 
417
- if self.solve_cloudflare:
340
+ if params.solve_cloudflare:
418
341
  self._solve_cloudflare(page_info.page)
419
342
  # Make sure the page is fully loaded after the captcha
420
343
  page_info.page.wait_for_load_state(state="load")
421
- page_info.page.wait_for_load_state(state="domcontentloaded")
422
- if self.network_idle:
344
+ if params.load_dom:
345
+ page_info.page.wait_for_load_state(state="domcontentloaded")
346
+ if params.network_idle:
423
347
  page_info.page.wait_for_load_state("networkidle")
424
348
 
425
- if self.page_action is not None:
349
+ if params.page_action:
426
350
  try:
427
- page_info.page = self.page_action(page_info.page)
351
+ _ = params.page_action(page_info.page)
428
352
  except Exception as e:
429
353
  log.error(f"Error executing page_action: {e}")
430
354
 
431
- if self.wait_selector:
355
+ if params.wait_selector:
432
356
  try:
433
- waiter: Locator = page_info.page.locator(self.wait_selector)
434
- waiter.first.wait_for(state=self.wait_selector_state)
357
+ waiter: Locator = page_info.page.locator(params.wait_selector)
358
+ waiter.first.wait_for(state=params.wait_selector_state)
435
359
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
436
360
  page_info.page.wait_for_load_state(state="load")
437
- page_info.page.wait_for_load_state(state="domcontentloaded")
438
- if self.network_idle:
361
+ if params.load_dom:
362
+ page_info.page.wait_for_load_state(state="domcontentloaded")
363
+ if params.network_idle:
439
364
  page_info.page.wait_for_load_state("networkidle")
440
365
  except Exception as e:
441
- log.error(f"Error waiting for selector {self.wait_selector}: {e}")
366
+ log.error(f"Error waiting for selector {params.wait_selector}: {e}")
442
367
 
443
- page_info.page.wait_for_timeout(self.wait)
368
+ page_info.page.wait_for_timeout(params.wait)
444
369
  response = ResponseFactory.from_playwright_response(
445
- page_info.page, first_response, final_response, self.selector_config
370
+ page_info.page, first_response, final_response, params.selector_config
446
371
  )
447
372
 
448
- # Mark the page as ready for next use
449
- page_info.mark_ready()
373
+ # Mark the page as finished for next use
374
+ page_info.mark_finished()
450
375
 
451
376
  return response
452
377
 
@@ -454,17 +379,8 @@ class StealthySession:
454
379
  page_info.mark_error()
455
380
  raise e
456
381
 
457
- def get_pool_stats(self) -> Dict[str, int]:
458
- """Get statistics about the current page pool"""
459
- return {
460
- "total_pages": self.page_pool.pages_count,
461
- "ready_pages": self.page_pool.ready_count,
462
- "busy_pages": self.page_pool.busy_count,
463
- "max_pages": self.max_pages,
464
- }
465
-
466
382
 
467
- class AsyncStealthySession(StealthySession):
383
+ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
468
384
  """A Stealthy session manager with page pooling."""
469
385
 
470
386
  def __init__(
@@ -476,6 +392,7 @@ class AsyncStealthySession(StealthySession):
476
392
  block_webrtc: bool = False,
477
393
  allow_webgl: bool = True,
478
394
  network_idle: bool = False,
395
+ load_dom: bool = True,
479
396
  humanize: bool | float = True,
480
397
  solve_cloudflare: bool = False,
481
398
  wait: int | float = 0,
@@ -510,11 +427,12 @@ class AsyncStealthySession(StealthySession):
510
427
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
511
428
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
512
429
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
430
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
513
431
  :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
514
432
  :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
515
433
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
516
434
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
517
- :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
435
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
518
436
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
519
437
  :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
520
438
  :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
@@ -527,47 +445,47 @@ class AsyncStealthySession(StealthySession):
527
445
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
528
446
  :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
529
447
  """
530
- super().__init__(
531
- max_pages,
532
- headless,
533
- block_images,
534
- disable_resources,
535
- block_webrtc,
536
- allow_webgl,
537
- network_idle,
538
- humanize,
539
- solve_cloudflare,
540
- wait,
541
- timeout,
542
- page_action,
543
- wait_selector,
544
- init_script,
545
- addons,
546
- wait_selector_state,
547
- cookies,
548
- google_search,
549
- extra_headers,
550
- proxy,
551
- os_randomize,
552
- disable_ads,
553
- geoip,
554
- selector_config,
555
- additional_args,
448
+ self.__validate__(
449
+ wait=wait,
450
+ proxy=proxy,
451
+ geoip=geoip,
452
+ addons=addons,
453
+ timeout=timeout,
454
+ cookies=cookies,
455
+ headless=headless,
456
+ load_dom=load_dom,
457
+ humanize=humanize,
458
+ max_pages=max_pages,
459
+ disable_ads=disable_ads,
460
+ allow_webgl=allow_webgl,
461
+ page_action=page_action,
462
+ init_script=init_script,
463
+ network_idle=network_idle,
464
+ block_images=block_images,
465
+ block_webrtc=block_webrtc,
466
+ os_randomize=os_randomize,
467
+ wait_selector=wait_selector,
468
+ google_search=google_search,
469
+ extra_headers=extra_headers,
470
+ additional_args=additional_args,
471
+ selector_config=selector_config,
472
+ solve_cloudflare=solve_cloudflare,
473
+ disable_resources=disable_resources,
474
+ wait_selector_state=wait_selector_state,
556
475
  )
557
- self.playwright: Optional[AsyncPlaywright] = None
558
- self.context: Optional[AsyncBrowserContext] = None
559
- self._lock = Lock()
560
- self.__enter__ = None
561
- self.__exit__ = None
476
+ super().__init__(max_pages=self.max_pages)
562
477
 
563
478
  async def __create__(self):
564
479
  """Create a browser for this instance and context."""
565
480
  self.playwright: AsyncPlaywright = await async_playwright().start()
566
- self.context: AsyncBrowserContext = (
567
- await self.playwright.firefox.launch_persistent_context(
568
- **self.launch_options
569
- )
481
+ self.context: AsyncBrowserContext = await self.playwright.firefox.launch_persistent_context(
482
+ **self.launch_options
570
483
  )
484
+
485
+ # Get the default page and close it
486
+ default_page = self.context.pages[0]
487
+ await default_page.close()
488
+
571
489
  if self.init_script: # pragma: no cover
572
490
  await self.context.add_init_script(path=self.init_script)
573
491
 
@@ -596,39 +514,6 @@ class AsyncStealthySession(StealthySession):
596
514
 
597
515
  self._closed = True
598
516
 
599
- async def _get_or_create_page(self) -> PageInfo:
600
- """Get an available page or create a new one"""
601
- async with self._lock:
602
- # Try to get a ready page first
603
- page_info = self.page_pool.get_ready_page()
604
- if page_info:
605
- return page_info
606
-
607
- # Create a new page if under limit
608
- if self.page_pool.pages_count < self.max_pages:
609
- page = await self.context.new_page()
610
- page.set_default_navigation_timeout(self.timeout)
611
- page.set_default_timeout(self.timeout)
612
- if self.extra_headers:
613
- await page.set_extra_http_headers(self.extra_headers)
614
-
615
- if self.disable_resources:
616
- await page.route("**/*", async_intercept_route)
617
-
618
- return self.page_pool.add_page(page)
619
-
620
- # Wait for a page to become available
621
- max_wait = 30
622
- start_time = time()
623
-
624
- while time() - start_time < max_wait: # pragma: no cover
625
- page_info = self.page_pool.get_ready_page()
626
- if page_info:
627
- return page_info
628
- await asyncio_sleep(0.05)
629
-
630
- raise TimeoutError("No pages available within timeout period")
631
-
632
517
  async def _solve_cloudflare(self, page: async_Page):
633
518
  """Solve the cloudflare challenge displayed on the playwright page passed. The async version
634
519
 
@@ -664,9 +549,7 @@ class AsyncStealthySession(StealthySession):
664
549
  await page.wait_for_timeout(500)
665
550
 
666
551
  # Calculate the Captcha coordinates for any viewport
667
- outer_box = await page.locator(
668
- ".main-content p+div>div>div"
669
- ).bounding_box()
552
+ outer_box = await page.locator(".main-content p+div>div>div").bounding_box()
670
553
  captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
671
554
 
672
555
  # Move the mouse to the center of the window, then press and hold the left mouse button
@@ -677,20 +560,65 @@ class AsyncStealthySession(StealthySession):
677
560
  log.info("Cloudflare captcha is solved")
678
561
  return
679
562
 
680
- async def fetch(self, url: str) -> Response:
563
+ async def fetch(
564
+ self,
565
+ url: str,
566
+ google_search: bool = _UNSET,
567
+ timeout: int | float = _UNSET,
568
+ wait: int | float = _UNSET,
569
+ page_action: Optional[Callable] = _UNSET,
570
+ extra_headers: Optional[Dict[str, str]] = _UNSET,
571
+ disable_resources: bool = _UNSET,
572
+ wait_selector: Optional[str] = _UNSET,
573
+ wait_selector_state: SelectorWaitStates = _UNSET,
574
+ network_idle: bool = _UNSET,
575
+ load_dom: bool = _UNSET,
576
+ solve_cloudflare: bool = _UNSET,
577
+ selector_config: Optional[Dict] = _UNSET,
578
+ ) -> Response:
681
579
  """Opens up the browser and do your request based on your chosen options.
682
580
 
683
581
  :param url: The Target url.
582
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
583
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
584
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
585
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
586
+ :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
587
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
588
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
589
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
590
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
591
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
592
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
593
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
594
+ :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
595
+ :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
684
596
  :return: A `Response` object.
685
597
  """
598
+ params = validate(
599
+ dict(
600
+ google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
601
+ timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
602
+ wait=self._get_with_precedence(wait, self.wait, _UNSET),
603
+ page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
604
+ extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
605
+ disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
606
+ wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
607
+ wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
608
+ network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
609
+ load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
610
+ solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
611
+ selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
612
+ ),
613
+ CamoufoxConfig,
614
+ )
615
+
686
616
  if self._closed: # pragma: no cover
687
617
  raise RuntimeError("Context manager has been closed")
688
618
 
689
619
  final_response = None
690
620
  referer = (
691
- generate_convincing_referer(url)
692
- if (self.google_search and "referer" not in self._headers_keys)
693
- else None
621
+ generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
694
622
  )
695
623
 
696
624
  async def handle_response(finished_response: AsyncPlaywrightResponse):
@@ -701,56 +629,59 @@ class AsyncStealthySession(StealthySession):
701
629
  ):
702
630
  final_response = finished_response
703
631
 
704
- page_info = await self._get_or_create_page()
632
+ page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
705
633
  page_info.mark_busy(url=url)
706
634
 
707
635
  try:
708
636
  # Navigate to URL and wait for a specified state
709
637
  page_info.page.on("response", handle_response)
710
638
  first_response = await page_info.page.goto(url, referer=referer)
711
- await page_info.page.wait_for_load_state(state="domcontentloaded")
639
+ if params.load_dom:
640
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
712
641
 
713
- if self.network_idle:
642
+ if params.network_idle:
714
643
  await page_info.page.wait_for_load_state("networkidle")
715
644
 
716
645
  if not first_response:
717
646
  raise RuntimeError(f"Failed to get response for {url}")
718
647
 
719
- if self.solve_cloudflare:
648
+ if params.solve_cloudflare:
720
649
  await self._solve_cloudflare(page_info.page)
721
650
  # Make sure the page is fully loaded after the captcha
722
651
  await page_info.page.wait_for_load_state(state="load")
723
- await page_info.page.wait_for_load_state(state="domcontentloaded")
724
- if self.network_idle:
652
+ if params.load_dom:
653
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
654
+ if params.network_idle:
725
655
  await page_info.page.wait_for_load_state("networkidle")
726
656
 
727
- if self.page_action is not None:
657
+ if params.page_action:
728
658
  try:
729
- page_info.page = await self.page_action(page_info.page)
659
+ _ = await params.page_action(page_info.page)
730
660
  except Exception as e:
731
661
  log.error(f"Error executing page_action: {e}")
732
662
 
733
- if self.wait_selector:
663
+ if params.wait_selector:
734
664
  try:
735
- waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
736
- await waiter.first.wait_for(state=self.wait_selector_state)
665
+ waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
666
+ await waiter.first.wait_for(state=params.wait_selector_state)
737
667
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
738
668
  await page_info.page.wait_for_load_state(state="load")
739
- await page_info.page.wait_for_load_state(state="domcontentloaded")
740
- if self.network_idle:
669
+ if params.load_dom:
670
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
671
+ if params.network_idle:
741
672
  await page_info.page.wait_for_load_state("networkidle")
742
673
  except Exception as e:
743
- log.error(f"Error waiting for selector {self.wait_selector}: {e}")
674
+ log.error(f"Error waiting for selector {params.wait_selector}: {e}")
744
675
 
745
- await page_info.page.wait_for_timeout(self.wait)
676
+ await page_info.page.wait_for_timeout(params.wait)
746
677
 
747
678
  # Create response object
748
679
  response = await ResponseFactory.from_async_playwright_response(
749
- page_info.page, first_response, final_response, self.selector_config
680
+ page_info.page, first_response, final_response, params.selector_config
750
681
  )
751
682
 
752
- # Mark the page as ready for next use
753
- page_info.mark_ready()
683
+ # Mark the page as finished for next use
684
+ page_info.mark_finished()
754
685
 
755
686
  return response
756
687