scrapling 0.3__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. scrapling/__init__.py +1 -1
  2. scrapling/cli.py +38 -51
  3. scrapling/core/_html_utils.py +3 -9
  4. scrapling/core/ai.py +5 -13
  5. scrapling/core/custom_types.py +19 -61
  6. scrapling/core/mixins.py +6 -28
  7. scrapling/core/shell.py +51 -129
  8. scrapling/core/storage.py +2 -8
  9. scrapling/core/translator.py +8 -20
  10. scrapling/core/utils/__init__.py +10 -0
  11. scrapling/core/utils/_shell.py +48 -0
  12. scrapling/core/{utils.py → utils/_utils.py} +5 -21
  13. scrapling/engines/__init__.py +0 -16
  14. scrapling/engines/_browsers/_base.py +297 -0
  15. scrapling/engines/_browsers/_camoufox.py +238 -293
  16. scrapling/engines/_browsers/_config_tools.py +2 -1
  17. scrapling/engines/_browsers/_controllers.py +220 -278
  18. scrapling/engines/_browsers/_page.py +37 -15
  19. scrapling/engines/_browsers/_validators.py +29 -15
  20. scrapling/engines/constants.py +3 -6
  21. scrapling/engines/static.py +25 -75
  22. scrapling/engines/toolbelt/__init__.py +1 -20
  23. scrapling/engines/toolbelt/convertor.py +95 -86
  24. scrapling/engines/toolbelt/custom.py +7 -99
  25. scrapling/engines/toolbelt/fingerprints.py +1 -3
  26. scrapling/engines/toolbelt/navigation.py +4 -58
  27. scrapling/fetchers.py +41 -24
  28. scrapling/parser.py +45 -122
  29. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/METADATA +57 -47
  30. scrapling-0.3.2.dist-info/RECORD +44 -0
  31. scrapling-0.3.dist-info/RECORD +0 -41
  32. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/WHEEL +0 -0
  33. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/entry_points.txt +0 -0
  34. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/licenses/LICENSE +0 -0
  35. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,8 @@
1
- from time import time, sleep
2
1
  from re import compile as re_compile
3
- from asyncio import sleep as asyncio_sleep, Lock
4
2
 
5
- from camoufox import DefaultAddons
6
- from camoufox.utils import launch_options as generate_launch_options
7
3
  from playwright.sync_api import (
8
4
  Response as SyncPlaywrightResponse,
9
5
  sync_playwright,
10
- BrowserContext,
11
- Playwright,
12
6
  Locator,
13
7
  Page,
14
8
  )
@@ -21,9 +15,9 @@ from playwright.async_api import (
21
15
  Page as async_Page,
22
16
  )
23
17
 
24
- from scrapling.core.utils import log
25
- from ._page import PageInfo, PagePool
26
18
  from ._validators import validate, CamoufoxConfig
19
+ from ._base import SyncSession, AsyncSession, StealthySessionMixin
20
+ from scrapling.core.utils import log
27
21
  from scrapling.core._types import (
28
22
  Dict,
29
23
  List,
@@ -31,19 +25,17 @@ from scrapling.core._types import (
31
25
  Callable,
32
26
  SelectorWaitStates,
33
27
  )
34
- from scrapling.engines.toolbelt import (
28
+ from scrapling.engines.toolbelt.convertor import (
35
29
  Response,
36
30
  ResponseFactory,
37
- async_intercept_route,
38
- generate_convincing_referer,
39
- get_os_name,
40
- intercept_route,
41
31
  )
32
+ from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
42
33
 
43
34
  __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
35
+ _UNSET = object()
44
36
 
45
37
 
46
- class StealthySession:
38
+ class StealthySession(StealthySessionMixin, SyncSession):
47
39
  """A Stealthy session manager with page pooling."""
48
40
 
49
41
  __slots__ = (
@@ -54,12 +46,14 @@ class StealthySession:
54
46
  "block_webrtc",
55
47
  "allow_webgl",
56
48
  "network_idle",
49
+ "load_dom",
57
50
  "humanize",
58
51
  "solve_cloudflare",
59
52
  "wait",
60
53
  "timeout",
61
54
  "page_action",
62
55
  "wait_selector",
56
+ "init_script",
63
57
  "addons",
64
58
  "wait_selector_state",
65
59
  "cookies",
@@ -82,19 +76,21 @@ class StealthySession:
82
76
 
83
77
  def __init__(
84
78
  self,
85
- max_pages: int = 1,
79
+ __max_pages: int = 1,
86
80
  headless: bool = True, # noqa: F821
87
81
  block_images: bool = False,
88
82
  disable_resources: bool = False,
89
83
  block_webrtc: bool = False,
90
84
  allow_webgl: bool = True,
91
85
  network_idle: bool = False,
86
+ load_dom: bool = True,
92
87
  humanize: bool | float = True,
93
88
  solve_cloudflare: bool = False,
94
89
  wait: int | float = 0,
95
90
  timeout: int | float = 30000,
96
91
  page_action: Optional[Callable] = None,
97
92
  wait_selector: Optional[str] = None,
93
+ init_script: Optional[str] = None,
98
94
  addons: Optional[List[str]] = None,
99
95
  wait_selector_state: SelectorWaitStates = "attached",
100
96
  cookies: Optional[List[Dict]] = None,
@@ -122,118 +118,68 @@ class StealthySession:
122
118
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
123
119
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
124
120
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
121
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
125
122
  :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
126
123
  :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
127
124
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
128
125
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
129
- :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
126
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
130
127
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
128
+ :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
131
129
  :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
132
130
  It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
133
131
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
134
132
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
135
133
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
136
134
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
137
- :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
138
135
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
139
136
  :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
140
137
  """
141
138
 
142
- params = {
143
- "max_pages": max_pages,
144
- "headless": headless,
145
- "block_images": block_images,
146
- "disable_resources": disable_resources,
147
- "block_webrtc": block_webrtc,
148
- "allow_webgl": allow_webgl,
149
- "network_idle": network_idle,
150
- "humanize": humanize,
151
- "solve_cloudflare": solve_cloudflare,
152
- "wait": wait,
153
- "timeout": timeout,
154
- "page_action": page_action,
155
- "wait_selector": wait_selector,
156
- "addons": addons,
157
- "wait_selector_state": wait_selector_state,
158
- "cookies": cookies,
159
- "google_search": google_search,
160
- "extra_headers": extra_headers,
161
- "proxy": proxy,
162
- "os_randomize": os_randomize,
163
- "disable_ads": disable_ads,
164
- "geoip": geoip,
165
- "selector_config": selector_config,
166
- "additional_args": additional_args,
167
- }
168
- config = validate(params, CamoufoxConfig)
169
-
170
- self.max_pages = config.max_pages
171
- self.headless = config.headless
172
- self.block_images = config.block_images
173
- self.disable_resources = config.disable_resources
174
- self.block_webrtc = config.block_webrtc
175
- self.allow_webgl = config.allow_webgl
176
- self.network_idle = config.network_idle
177
- self.humanize = config.humanize
178
- self.solve_cloudflare = config.solve_cloudflare
179
- self.wait = config.wait
180
- self.timeout = config.timeout
181
- self.page_action = config.page_action
182
- self.wait_selector = config.wait_selector
183
- self.addons = config.addons
184
- self.wait_selector_state = config.wait_selector_state
185
- self.cookies = config.cookies
186
- self.google_search = config.google_search
187
- self.extra_headers = config.extra_headers
188
- self.proxy = config.proxy
189
- self.os_randomize = config.os_randomize
190
- self.disable_ads = config.disable_ads
191
- self.geoip = config.geoip
192
- self.selector_config = config.selector_config
193
- self.additional_args = config.additional_args
194
-
195
- self.playwright: Optional[Playwright] = None
196
- self.context: Optional[BrowserContext] = None
197
- self.page_pool = PagePool(self.max_pages)
198
- self._closed = False
199
- self.selector_config = config.selector_config
200
- self.page_action = config.page_action
201
- self._headers_keys = (
202
- set(map(str.lower, self.extra_headers.keys()))
203
- if self.extra_headers
204
- else set()
205
- )
206
- self.__initiate_browser_options__()
207
-
208
- def __initiate_browser_options__(self):
209
- """Initiate browser options."""
210
- self.launch_options = generate_launch_options(
211
- **{
212
- "geoip": self.geoip,
213
- "proxy": dict(self.proxy) if self.proxy else self.proxy,
214
- "enable_cache": True,
215
- "addons": self.addons,
216
- "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
217
- "headless": self.headless,
218
- "humanize": True if self.solve_cloudflare else self.humanize,
219
- "i_know_what_im_doing": True, # To turn warnings off with the user configurations
220
- "allow_webgl": self.allow_webgl,
221
- "block_webrtc": self.block_webrtc,
222
- "block_images": self.block_images, # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
223
- "os": None if self.os_randomize else get_os_name(),
224
- "user_data_dir": "",
225
- **self.additional_args,
226
- }
139
+ self.__validate__(
140
+ wait=wait,
141
+ proxy=proxy,
142
+ geoip=geoip,
143
+ addons=addons,
144
+ timeout=timeout,
145
+ cookies=cookies,
146
+ headless=headless,
147
+ humanize=humanize,
148
+ load_dom=load_dom,
149
+ max_pages=__max_pages,
150
+ disable_ads=disable_ads,
151
+ allow_webgl=allow_webgl,
152
+ page_action=page_action,
153
+ init_script=init_script,
154
+ network_idle=network_idle,
155
+ block_images=block_images,
156
+ block_webrtc=block_webrtc,
157
+ os_randomize=os_randomize,
158
+ wait_selector=wait_selector,
159
+ google_search=google_search,
160
+ extra_headers=extra_headers,
161
+ additional_args=additional_args,
162
+ selector_config=selector_config,
163
+ solve_cloudflare=solve_cloudflare,
164
+ disable_resources=disable_resources,
165
+ wait_selector_state=wait_selector_state,
227
166
  )
167
+ super().__init__(max_pages=self.max_pages)
228
168
 
229
169
  def __create__(self):
230
170
  """Create a browser for this instance and context."""
231
171
  self.playwright = sync_playwright().start()
232
- self.context = (
233
- self.playwright.firefox.launch_persistent_context( # pragma: no cover
234
- **self.launch_options
235
- )
172
+ self.context = self.playwright.firefox.launch_persistent_context( # pragma: no cover
173
+ **self.launch_options
236
174
  )
175
+
176
+ # Get the default page and close it
177
+ default_page = self.context.pages[0]
178
+ default_page.close()
179
+
180
+ if self.init_script: # pragma: no cover
181
+ self.context.add_init_script(path=self.init_script)
182
+
237
183
  if self.cookies: # pragma: no cover
238
184
  self.context.add_cookies(self.cookies)
239
185
 
@@ -259,68 +205,6 @@ class StealthySession:
259
205
 
260
206
  self._closed = True
261
207
 
262
- def _get_or_create_page(self) -> PageInfo: # pragma: no cover
263
- """Get an available page or create a new one"""
264
- # Try to get a ready page first
265
- page_info = self.page_pool.get_ready_page()
266
- if page_info:
267
- return page_info
268
-
269
- # Create a new page if under limit
270
- if self.page_pool.pages_count < self.max_pages:
271
- page = self.context.new_page()
272
- page.set_default_navigation_timeout(self.timeout)
273
- page.set_default_timeout(self.timeout)
274
- if self.extra_headers:
275
- page.set_extra_http_headers(self.extra_headers)
276
-
277
- if self.disable_resources:
278
- page.route("**/*", intercept_route)
279
-
280
- return self.page_pool.add_page(page)
281
-
282
- # Wait for a page to become available
283
- max_wait = 30
284
- start_time = time()
285
-
286
- while time() - start_time < max_wait:
287
- page_info = self.page_pool.get_ready_page()
288
- if page_info:
289
- return page_info
290
- sleep(0.05)
291
-
292
- raise TimeoutError("No pages available within timeout period")
293
-
294
- @staticmethod
295
- def _detect_cloudflare(page_content):
296
- """
297
- Detect the type of Cloudflare challenge present in the provided page content.
298
-
299
- This function analyzes the given page content to identify whether a specific
300
- type of Cloudflare challenge is present. It checks for three predefined
301
- challenge types: non-interactive, managed, and interactive. If a challenge
302
- type is detected, it returns the corresponding type as a string. If no
303
- challenge type is detected, it returns None.
304
-
305
- Args:
306
- page_content (str): The content of the page to analyze for Cloudflare
307
- challenge types.
308
-
309
- Returns:
310
- str: A string representing the detected Cloudflare challenge type, if
311
- found. Returns None if no challenge matches.
312
- """
313
- challenge_types = (
314
- "non-interactive",
315
- "managed",
316
- "interactive",
317
- )
318
- for ctype in challenge_types:
319
- if f"cType: '{ctype}'" in page_content:
320
- return ctype
321
-
322
- return None
323
-
324
208
  def _solve_cloudflare(self, page: Page) -> None: # pragma: no cover
325
209
  """Solve the cloudflare challenge displayed on the playwright page passed
326
210
 
@@ -367,20 +251,66 @@ class StealthySession:
367
251
  log.info("Cloudflare captcha is solved")
368
252
  return
369
253
 
370
- def fetch(self, url: str) -> Response:
254
+ def fetch(
255
+ self,
256
+ url: str,
257
+ google_search: bool = _UNSET,
258
+ timeout: int | float = _UNSET,
259
+ wait: int | float = _UNSET,
260
+ page_action: Optional[Callable] = _UNSET,
261
+ extra_headers: Optional[Dict[str, str]] = _UNSET,
262
+ disable_resources: bool = _UNSET,
263
+ wait_selector: Optional[str] = _UNSET,
264
+ wait_selector_state: SelectorWaitStates = _UNSET,
265
+ network_idle: bool = _UNSET,
266
+ load_dom: bool = _UNSET,
267
+ solve_cloudflare: bool = _UNSET,
268
+ selector_config: Optional[Dict] = _UNSET,
269
+ ) -> Response:
371
270
  """Opens up the browser and do your request based on your chosen options.
372
271
 
373
272
  :param url: The Target url.
273
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
274
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
275
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
276
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
277
+ :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
278
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
279
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
280
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
281
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
282
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
283
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
284
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
285
+ :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
286
+ :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
374
287
  :return: A `Response` object.
375
288
  """
289
+ # Validate all resolved parameters
290
+ params = validate(
291
+ dict(
292
+ google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
293
+ timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
294
+ wait=self._get_with_precedence(wait, self.wait, _UNSET),
295
+ page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
296
+ extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
297
+ disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
298
+ wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
299
+ wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
300
+ network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
301
+ load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
302
+ solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
303
+ selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
304
+ ),
305
+ CamoufoxConfig,
306
+ )
307
+
376
308
  if self._closed: # pragma: no cover
377
309
  raise RuntimeError("Context manager has been closed")
378
310
 
379
311
  final_response = None
380
312
  referer = (
381
- generate_convincing_referer(url)
382
- if (self.google_search and "referer" not in self._headers_keys)
383
- else None
313
+ generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
384
314
  )
385
315
 
386
316
  def handle_response(finished_response: SyncPlaywrightResponse):
@@ -391,54 +321,57 @@ class StealthySession:
391
321
  ):
392
322
  final_response = finished_response
393
323
 
394
- page_info = self._get_or_create_page()
324
+ page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
395
325
  page_info.mark_busy(url=url)
396
326
 
397
327
  try: # pragma: no cover
398
328
  # Navigate to URL and wait for a specified state
399
329
  page_info.page.on("response", handle_response)
400
330
  first_response = page_info.page.goto(url, referer=referer)
401
- page_info.page.wait_for_load_state(state="domcontentloaded")
331
+ if params.load_dom:
332
+ page_info.page.wait_for_load_state(state="domcontentloaded")
402
333
 
403
- if self.network_idle:
334
+ if params.network_idle:
404
335
  page_info.page.wait_for_load_state("networkidle")
405
336
 
406
337
  if not first_response:
407
338
  raise RuntimeError(f"Failed to get response for {url}")
408
339
 
409
- if self.solve_cloudflare:
340
+ if params.solve_cloudflare:
410
341
  self._solve_cloudflare(page_info.page)
411
342
  # Make sure the page is fully loaded after the captcha
412
343
  page_info.page.wait_for_load_state(state="load")
413
- page_info.page.wait_for_load_state(state="domcontentloaded")
414
- if self.network_idle:
344
+ if params.load_dom:
345
+ page_info.page.wait_for_load_state(state="domcontentloaded")
346
+ if params.network_idle:
415
347
  page_info.page.wait_for_load_state("networkidle")
416
348
 
417
- if self.page_action is not None:
349
+ if params.page_action:
418
350
  try:
419
- page_info.page = self.page_action(page_info.page)
351
+ _ = params.page_action(page_info.page)
420
352
  except Exception as e:
421
353
  log.error(f"Error executing page_action: {e}")
422
354
 
423
- if self.wait_selector:
355
+ if params.wait_selector:
424
356
  try:
425
- waiter: Locator = page_info.page.locator(self.wait_selector)
426
- waiter.first.wait_for(state=self.wait_selector_state)
357
+ waiter: Locator = page_info.page.locator(params.wait_selector)
358
+ waiter.first.wait_for(state=params.wait_selector_state)
427
359
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
428
360
  page_info.page.wait_for_load_state(state="load")
429
- page_info.page.wait_for_load_state(state="domcontentloaded")
430
- if self.network_idle:
361
+ if params.load_dom:
362
+ page_info.page.wait_for_load_state(state="domcontentloaded")
363
+ if params.network_idle:
431
364
  page_info.page.wait_for_load_state("networkidle")
432
365
  except Exception as e:
433
- log.error(f"Error waiting for selector {self.wait_selector}: {e}")
366
+ log.error(f"Error waiting for selector {params.wait_selector}: {e}")
434
367
 
435
- page_info.page.wait_for_timeout(self.wait)
368
+ page_info.page.wait_for_timeout(params.wait)
436
369
  response = ResponseFactory.from_playwright_response(
437
- page_info.page, first_response, final_response, self.selector_config
370
+ page_info.page, first_response, final_response, params.selector_config
438
371
  )
439
372
 
440
- # Mark the page as ready for next use
441
- page_info.mark_ready()
373
+ # Mark the page as finished for next use
374
+ page_info.mark_finished()
442
375
 
443
376
  return response
444
377
 
@@ -446,17 +379,8 @@ class StealthySession:
446
379
  page_info.mark_error()
447
380
  raise e
448
381
 
449
- def get_pool_stats(self) -> Dict[str, int]:
450
- """Get statistics about the current page pool"""
451
- return {
452
- "total_pages": self.page_pool.pages_count,
453
- "ready_pages": self.page_pool.ready_count,
454
- "busy_pages": self.page_pool.busy_count,
455
- "max_pages": self.max_pages,
456
- }
457
-
458
382
 
459
- class AsyncStealthySession(StealthySession):
383
+ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
460
384
  """A Stealthy session manager with page pooling."""
461
385
 
462
386
  def __init__(
@@ -468,12 +392,14 @@ class AsyncStealthySession(StealthySession):
468
392
  block_webrtc: bool = False,
469
393
  allow_webgl: bool = True,
470
394
  network_idle: bool = False,
395
+ load_dom: bool = True,
471
396
  humanize: bool | float = True,
472
397
  solve_cloudflare: bool = False,
473
398
  wait: int | float = 0,
474
399
  timeout: int | float = 30000,
475
400
  page_action: Optional[Callable] = None,
476
401
  wait_selector: Optional[str] = None,
402
+ init_script: Optional[str] = None,
477
403
  addons: Optional[List[str]] = None,
478
404
  wait_selector_state: SelectorWaitStates = "attached",
479
405
  cookies: Optional[List[Dict]] = None,
@@ -501,12 +427,14 @@ class AsyncStealthySession(StealthySession):
501
427
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
502
428
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
503
429
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
430
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
504
431
  :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
505
432
  :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
506
433
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
507
434
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
508
- :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
435
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
509
436
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
437
+ :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
510
438
  :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
511
439
  It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
512
440
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
@@ -517,46 +445,50 @@ class AsyncStealthySession(StealthySession):
517
445
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
518
446
  :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
519
447
  """
520
- super().__init__(
521
- max_pages,
522
- headless,
523
- block_images,
524
- disable_resources,
525
- block_webrtc,
526
- allow_webgl,
527
- network_idle,
528
- humanize,
529
- solve_cloudflare,
530
- wait,
531
- timeout,
532
- page_action,
533
- wait_selector,
534
- addons,
535
- wait_selector_state,
536
- cookies,
537
- google_search,
538
- extra_headers,
539
- proxy,
540
- os_randomize,
541
- disable_ads,
542
- geoip,
543
- selector_config,
544
- additional_args,
448
+ self.__validate__(
449
+ wait=wait,
450
+ proxy=proxy,
451
+ geoip=geoip,
452
+ addons=addons,
453
+ timeout=timeout,
454
+ cookies=cookies,
455
+ headless=headless,
456
+ load_dom=load_dom,
457
+ humanize=humanize,
458
+ max_pages=max_pages,
459
+ disable_ads=disable_ads,
460
+ allow_webgl=allow_webgl,
461
+ page_action=page_action,
462
+ init_script=init_script,
463
+ network_idle=network_idle,
464
+ block_images=block_images,
465
+ block_webrtc=block_webrtc,
466
+ os_randomize=os_randomize,
467
+ wait_selector=wait_selector,
468
+ google_search=google_search,
469
+ extra_headers=extra_headers,
470
+ additional_args=additional_args,
471
+ selector_config=selector_config,
472
+ solve_cloudflare=solve_cloudflare,
473
+ disable_resources=disable_resources,
474
+ wait_selector_state=wait_selector_state,
545
475
  )
546
- self.playwright: Optional[AsyncPlaywright] = None
547
- self.context: Optional[AsyncBrowserContext] = None
548
- self._lock = Lock()
549
- self.__enter__ = None
550
- self.__exit__ = None
476
+ super().__init__(max_pages=self.max_pages)
551
477
 
552
478
  async def __create__(self):
553
479
  """Create a browser for this instance and context."""
554
480
  self.playwright: AsyncPlaywright = await async_playwright().start()
555
- self.context: AsyncBrowserContext = (
556
- await self.playwright.firefox.launch_persistent_context(
557
- **self.launch_options
558
- )
481
+ self.context: AsyncBrowserContext = await self.playwright.firefox.launch_persistent_context(
482
+ **self.launch_options
559
483
  )
484
+
485
+ # Get the default page and close it
486
+ default_page = self.context.pages[0]
487
+ await default_page.close()
488
+
489
+ if self.init_script: # pragma: no cover
490
+ await self.context.add_init_script(path=self.init_script)
491
+
560
492
  if self.cookies:
561
493
  await self.context.add_cookies(self.cookies)
562
494
 
@@ -582,39 +514,6 @@ class AsyncStealthySession(StealthySession):
582
514
 
583
515
  self._closed = True
584
516
 
585
- async def _get_or_create_page(self) -> PageInfo:
586
- """Get an available page or create a new one"""
587
- async with self._lock:
588
- # Try to get a ready page first
589
- page_info = self.page_pool.get_ready_page()
590
- if page_info:
591
- return page_info
592
-
593
- # Create a new page if under limit
594
- if self.page_pool.pages_count < self.max_pages:
595
- page = await self.context.new_page()
596
- page.set_default_navigation_timeout(self.timeout)
597
- page.set_default_timeout(self.timeout)
598
- if self.extra_headers:
599
- await page.set_extra_http_headers(self.extra_headers)
600
-
601
- if self.disable_resources:
602
- await page.route("**/*", async_intercept_route)
603
-
604
- return self.page_pool.add_page(page)
605
-
606
- # Wait for a page to become available
607
- max_wait = 30
608
- start_time = time()
609
-
610
- while time() - start_time < max_wait: # pragma: no cover
611
- page_info = self.page_pool.get_ready_page()
612
- if page_info:
613
- return page_info
614
- await asyncio_sleep(0.05)
615
-
616
- raise TimeoutError("No pages available within timeout period")
617
-
618
517
  async def _solve_cloudflare(self, page: async_Page):
619
518
  """Solve the cloudflare challenge displayed on the playwright page passed. The async version
620
519
 
@@ -650,9 +549,7 @@ class AsyncStealthySession(StealthySession):
650
549
  await page.wait_for_timeout(500)
651
550
 
652
551
  # Calculate the Captcha coordinates for any viewport
653
- outer_box = await page.locator(
654
- ".main-content p+div>div>div"
655
- ).bounding_box()
552
+ outer_box = await page.locator(".main-content p+div>div>div").bounding_box()
656
553
  captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
657
554
 
658
555
  # Move the mouse to the center of the window, then press and hold the left mouse button
@@ -663,20 +560,65 @@ class AsyncStealthySession(StealthySession):
663
560
  log.info("Cloudflare captcha is solved")
664
561
  return
665
562
 
666
- async def fetch(self, url: str) -> Response:
563
+ async def fetch(
564
+ self,
565
+ url: str,
566
+ google_search: bool = _UNSET,
567
+ timeout: int | float = _UNSET,
568
+ wait: int | float = _UNSET,
569
+ page_action: Optional[Callable] = _UNSET,
570
+ extra_headers: Optional[Dict[str, str]] = _UNSET,
571
+ disable_resources: bool = _UNSET,
572
+ wait_selector: Optional[str] = _UNSET,
573
+ wait_selector_state: SelectorWaitStates = _UNSET,
574
+ network_idle: bool = _UNSET,
575
+ load_dom: bool = _UNSET,
576
+ solve_cloudflare: bool = _UNSET,
577
+ selector_config: Optional[Dict] = _UNSET,
578
+ ) -> Response:
667
579
  """Opens up the browser and do your request based on your chosen options.
668
580
 
669
581
  :param url: The Target url.
582
+ :param google_search: Enabled by default. Scrapling will set the referer header as if this request came from a Google search for this website's domain name.
583
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
584
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
585
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
586
+ :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
587
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
588
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
589
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
590
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
591
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
592
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
593
+ :param load_dom: Enabled by default. Waits for all JavaScript on the page(s) to fully load and execute.
594
+ :param solve_cloudflare: Solves all 3 types of Cloudflare's Turnstile wait page before returning the response to you.
595
+ :param selector_config: The arguments that will be passed when creating the final Selector class.
670
596
  :return: A `Response` object.
671
597
  """
598
+ params = validate(
599
+ dict(
600
+ google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
601
+ timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
602
+ wait=self._get_with_precedence(wait, self.wait, _UNSET),
603
+ page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
604
+ extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
605
+ disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
606
+ wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
607
+ wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
608
+ network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
609
+ load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
610
+ solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
611
+ selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
612
+ ),
613
+ CamoufoxConfig,
614
+ )
615
+
672
616
  if self._closed: # pragma: no cover
673
617
  raise RuntimeError("Context manager has been closed")
674
618
 
675
619
  final_response = None
676
620
  referer = (
677
- generate_convincing_referer(url)
678
- if (self.google_search and "referer" not in self._headers_keys)
679
- else None
621
+ generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
680
622
  )
681
623
 
682
624
  async def handle_response(finished_response: AsyncPlaywrightResponse):
@@ -687,56 +629,59 @@ class AsyncStealthySession(StealthySession):
687
629
  ):
688
630
  final_response = finished_response
689
631
 
690
- page_info = await self._get_or_create_page()
632
+ page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
691
633
  page_info.mark_busy(url=url)
692
634
 
693
635
  try:
694
636
  # Navigate to URL and wait for a specified state
695
637
  page_info.page.on("response", handle_response)
696
638
  first_response = await page_info.page.goto(url, referer=referer)
697
- await page_info.page.wait_for_load_state(state="domcontentloaded")
639
+ if params.load_dom:
640
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
698
641
 
699
- if self.network_idle:
642
+ if params.network_idle:
700
643
  await page_info.page.wait_for_load_state("networkidle")
701
644
 
702
645
  if not first_response:
703
646
  raise RuntimeError(f"Failed to get response for {url}")
704
647
 
705
- if self.solve_cloudflare:
648
+ if params.solve_cloudflare:
706
649
  await self._solve_cloudflare(page_info.page)
707
650
  # Make sure the page is fully loaded after the captcha
708
651
  await page_info.page.wait_for_load_state(state="load")
709
- await page_info.page.wait_for_load_state(state="domcontentloaded")
710
- if self.network_idle:
652
+ if params.load_dom:
653
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
654
+ if params.network_idle:
711
655
  await page_info.page.wait_for_load_state("networkidle")
712
656
 
713
- if self.page_action is not None:
657
+ if params.page_action:
714
658
  try:
715
- page_info.page = await self.page_action(page_info.page)
659
+ _ = await params.page_action(page_info.page)
716
660
  except Exception as e:
717
661
  log.error(f"Error executing page_action: {e}")
718
662
 
719
- if self.wait_selector:
663
+ if params.wait_selector:
720
664
  try:
721
- waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
722
- await waiter.first.wait_for(state=self.wait_selector_state)
665
+ waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
666
+ await waiter.first.wait_for(state=params.wait_selector_state)
723
667
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
724
668
  await page_info.page.wait_for_load_state(state="load")
725
- await page_info.page.wait_for_load_state(state="domcontentloaded")
726
- if self.network_idle:
669
+ if params.load_dom:
670
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
671
+ if params.network_idle:
727
672
  await page_info.page.wait_for_load_state("networkidle")
728
673
  except Exception as e:
729
- log.error(f"Error waiting for selector {self.wait_selector}: {e}")
674
+ log.error(f"Error waiting for selector {params.wait_selector}: {e}")
730
675
 
731
- await page_info.page.wait_for_timeout(self.wait)
676
+ await page_info.page.wait_for_timeout(params.wait)
732
677
 
733
678
  # Create response object
734
679
  response = await ResponseFactory.from_async_playwright_response(
735
- page_info.page, first_response, final_response, self.selector_config
680
+ page_info.page, first_response, final_response, params.selector_config
736
681
  )
737
682
 
738
- # Mark the page as ready for next use
739
- page_info.mark_ready()
683
+ # Mark the page as finished for next use
684
+ page_info.mark_finished()
740
685
 
741
686
  return response
742
687