scrapling 0.2.99__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. scrapling/__init__.py +18 -31
  2. scrapling/cli.py +818 -20
  3. scrapling/core/_html_utils.py +348 -0
  4. scrapling/core/_types.py +34 -17
  5. scrapling/core/ai.py +611 -0
  6. scrapling/core/custom_types.py +183 -100
  7. scrapling/core/mixins.py +27 -19
  8. scrapling/core/shell.py +647 -0
  9. scrapling/core/{storage_adaptors.py → storage.py} +41 -33
  10. scrapling/core/translator.py +20 -26
  11. scrapling/core/utils.py +49 -54
  12. scrapling/engines/__init__.py +15 -6
  13. scrapling/engines/_browsers/__init__.py +2 -0
  14. scrapling/engines/_browsers/_camoufox.py +759 -0
  15. scrapling/engines/_browsers/_config_tools.py +130 -0
  16. scrapling/engines/_browsers/_controllers.py +644 -0
  17. scrapling/engines/_browsers/_page.py +93 -0
  18. scrapling/engines/_browsers/_validators.py +170 -0
  19. scrapling/engines/constants.py +101 -88
  20. scrapling/engines/static.py +667 -110
  21. scrapling/engines/toolbelt/__init__.py +20 -6
  22. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  23. scrapling/engines/toolbelt/convertor.py +254 -0
  24. scrapling/engines/toolbelt/custom.py +158 -175
  25. scrapling/engines/toolbelt/fingerprints.py +32 -46
  26. scrapling/engines/toolbelt/navigation.py +68 -39
  27. scrapling/fetchers.py +239 -333
  28. scrapling/parser.py +781 -449
  29. scrapling-0.3.1.dist-info/METADATA +411 -0
  30. scrapling-0.3.1.dist-info/RECORD +41 -0
  31. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/WHEEL +1 -1
  32. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/top_level.txt +0 -1
  33. scrapling/defaults.py +0 -25
  34. scrapling/engines/camo.py +0 -339
  35. scrapling/engines/pw.py +0 -465
  36. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  37. scrapling-0.2.99.dist-info/METADATA +0 -290
  38. scrapling-0.2.99.dist-info/RECORD +0 -49
  39. tests/__init__.py +0 -1
  40. tests/fetchers/__init__.py +0 -1
  41. tests/fetchers/async/__init__.py +0 -0
  42. tests/fetchers/async/test_camoufox.py +0 -97
  43. tests/fetchers/async/test_httpx.py +0 -85
  44. tests/fetchers/async/test_playwright.py +0 -101
  45. tests/fetchers/sync/__init__.py +0 -0
  46. tests/fetchers/sync/test_camoufox.py +0 -70
  47. tests/fetchers/sync/test_httpx.py +0 -84
  48. tests/fetchers/sync/test_playwright.py +0 -89
  49. tests/fetchers/test_utils.py +0 -97
  50. tests/parser/__init__.py +0 -0
  51. tests/parser/test_automatch.py +0 -111
  52. tests/parser/test_general.py +0 -330
  53. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/entry_points.txt +0 -0
  54. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/licenses/LICENSE +0 -0
scrapling/engines/_browsers/_camoufox.py (new file)
@@ -0,0 +1,759 @@
+ from time import time, sleep
+ from re import compile as re_compile
+ from asyncio import sleep as asyncio_sleep, Lock
+
+ from camoufox import DefaultAddons
+ from camoufox.utils import launch_options as generate_launch_options
+ from playwright.sync_api import (
+     Response as SyncPlaywrightResponse,
+     sync_playwright,
+     BrowserContext,
+     Playwright,
+     Locator,
+     Page,
+ )
+ from playwright.async_api import (
+     async_playwright,
+     Response as AsyncPlaywrightResponse,
+     BrowserContext as AsyncBrowserContext,
+     Playwright as AsyncPlaywright,
+     Locator as AsyncLocator,
+     Page as async_Page,
+ )
+
+ from scrapling.core.utils import log
+ from ._page import PageInfo, PagePool
+ from ._validators import validate, CamoufoxConfig
+ from scrapling.core._types import (
+     Dict,
+     List,
+     Optional,
+     Callable,
+     SelectorWaitStates,
+ )
+ from scrapling.engines.toolbelt import (
+     Response,
+     ResponseFactory,
+     async_intercept_route,
+     generate_convincing_referer,
+     get_os_name,
+     intercept_route,
+ )
+
+ __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
+
+
+ class StealthySession:
+     """A Stealthy session manager with page pooling."""
+
+     __slots__ = (
+         "max_pages",
+         "headless",
+         "block_images",
+         "disable_resources",
+         "block_webrtc",
+         "allow_webgl",
+         "network_idle",
+         "humanize",
+         "solve_cloudflare",
+         "wait",
+         "timeout",
+         "page_action",
+         "wait_selector",
+         "init_script",
+         "addons",
+         "wait_selector_state",
+         "cookies",
+         "google_search",
+         "extra_headers",
+         "proxy",
+         "os_randomize",
+         "disable_ads",
+         "geoip",
+         "selector_config",
+         "additional_args",
+         "playwright",
+         "browser",
+         "context",
+         "page_pool",
+         "_closed",
+         "launch_options",
+         "_headers_keys",
+     )
+
+     def __init__(
+         self,
+         max_pages: int = 1,
+         headless: bool = True,  # noqa: F821
+         block_images: bool = False,
+         disable_resources: bool = False,
+         block_webrtc: bool = False,
+         allow_webgl: bool = True,
+         network_idle: bool = False,
+         humanize: bool | float = True,
+         solve_cloudflare: bool = False,
+         wait: int | float = 0,
+         timeout: int | float = 30000,
+         page_action: Optional[Callable] = None,
+         wait_selector: Optional[str] = None,
+         init_script: Optional[str] = None,
+         addons: Optional[List[str]] = None,
+         wait_selector_state: SelectorWaitStates = "attached",
+         cookies: Optional[List[Dict]] = None,
+         google_search: bool = True,
+         extra_headers: Optional[Dict[str, str]] = None,
+         proxy: Optional[str | Dict[str, str]] = None,
+         os_randomize: bool = False,
+         disable_ads: bool = False,
+         geoip: bool = False,
+         selector_config: Optional[Dict] = None,
+         additional_args: Optional[Dict] = None,
+     ):
+         """A Browser session manager with page pooling
+
+         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+         :param block_images: Prevent the loading of images through Firefox preferences.
+             This can help save your proxy usage, but be careful with this option as it makes some websites never finish loading.
+         :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+             This can help save your proxy usage, but be careful with this option as it makes some websites never finish loading.
+         :param block_webrtc: Blocks WebRTC entirely.
+         :param cookies: Set cookies for the next request.
+         :param addons: List of Firefox addons to use. Must be paths to extracted addons.
+         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
+         :param solve_cloudflare: Solves all three types of Cloudflare's Turnstile wait page before returning the response to you.
+         :param allow_webgl: Enabled by default. Disabling WebGL is not recommended, as many WAFs now check whether WebGL is enabled.
+         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+         :param disable_ads: Disabled by default; this installs the `uBlock Origin` addon on the browser if enabled.
+         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
+         :param wait: The time (in milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
+         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000.
+         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+         :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
+         :param geoip: Recommended to use with proxies; automatically uses the IP's longitude, latitude, timezone, country, and locale, and spoofs the WebRTC IP address.
+             It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
+         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+         :param google_search: Enabled by default; Scrapling will set the referer header as if this request came from a Google search of this website's domain name.
+         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+         :param proxy: The proxy to be used with requests. It can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+         :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
+         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+         :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
+         """
+
+         params = {
+             "max_pages": max_pages,
+             "headless": headless,
+             "block_images": block_images,
+             "disable_resources": disable_resources,
+             "block_webrtc": block_webrtc,
+             "allow_webgl": allow_webgl,
+             "network_idle": network_idle,
+             "humanize": humanize,
+             "solve_cloudflare": solve_cloudflare,
+             "wait": wait,
+             "timeout": timeout,
+             "page_action": page_action,
+             "wait_selector": wait_selector,
+             "init_script": init_script,
+             "addons": addons,
+             "wait_selector_state": wait_selector_state,
+             "cookies": cookies,
+             "google_search": google_search,
+             "extra_headers": extra_headers,
+             "proxy": proxy,
+             "os_randomize": os_randomize,
+             "disable_ads": disable_ads,
+             "geoip": geoip,
+             "selector_config": selector_config,
+             "additional_args": additional_args,
+         }
+         config = validate(params, CamoufoxConfig)
+
+         self.max_pages = config.max_pages
+         self.headless = config.headless
+         self.block_images = config.block_images
+         self.disable_resources = config.disable_resources
+         self.block_webrtc = config.block_webrtc
+         self.allow_webgl = config.allow_webgl
+         self.network_idle = config.network_idle
+         self.humanize = config.humanize
+         self.solve_cloudflare = config.solve_cloudflare
+         self.wait = config.wait
+         self.timeout = config.timeout
+         self.page_action = config.page_action
+         self.wait_selector = config.wait_selector
+         self.init_script = config.init_script
+         self.addons = config.addons
+         self.wait_selector_state = config.wait_selector_state
+         self.cookies = config.cookies
+         self.google_search = config.google_search
+         self.extra_headers = config.extra_headers
+         self.proxy = config.proxy
+         self.os_randomize = config.os_randomize
+         self.disable_ads = config.disable_ads
+         self.geoip = config.geoip
+         self.selector_config = config.selector_config
+         self.additional_args = config.additional_args
+
+         self.playwright: Optional[Playwright] = None
+         self.context: Optional[BrowserContext] = None
+         self.page_pool = PagePool(self.max_pages)
+         self._closed = False
+         self.selector_config = config.selector_config
+         self.page_action = config.page_action
+         self._headers_keys = (
+             set(map(str.lower, self.extra_headers.keys()))
+             if self.extra_headers
+             else set()
+         )
+         self.__initiate_browser_options__()
+
+     def __initiate_browser_options__(self):
+         """Initiate browser options."""
+         self.launch_options = generate_launch_options(
+             **{
+                 "geoip": self.geoip,
+                 "proxy": dict(self.proxy) if self.proxy else self.proxy,
+                 "enable_cache": True,
+                 "addons": self.addons,
+                 "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
+                 "headless": self.headless,
+                 "humanize": True if self.solve_cloudflare else self.humanize,
+                 "i_know_what_im_doing": True,  # Turn off Camoufox warnings about the user's configuration
+                 "allow_webgl": self.allow_webgl,
+                 "block_webrtc": self.block_webrtc,
+                 "block_images": self.block_images,  # Careful! This makes some websites, like StackOverflow, never finish loading, even in headful mode.
+                 "os": None if self.os_randomize else get_os_name(),
+                 "user_data_dir": "",
+                 **self.additional_args,
+             }
+         )
+
+     def __create__(self):
+         """Create a browser for this instance and context."""
+         self.playwright = sync_playwright().start()
+         self.context = (
+             self.playwright.firefox.launch_persistent_context(  # pragma: no cover
+                 **self.launch_options
+             )
+         )
+         if self.init_script:  # pragma: no cover
+             self.context.add_init_script(path=self.init_script)
+
+         if self.cookies:  # pragma: no cover
+             self.context.add_cookies(self.cookies)
+
+     def __enter__(self):  # pragma: no cover
+         self.__create__()
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.close()
+
+     def close(self):  # pragma: no cover
+         """Close all resources"""
+         if self._closed:  # pragma: no cover
+             return
+
+         if self.context:
+             self.context.close()
+             self.context = None
+
+         if self.playwright:
+             self.playwright.stop()
+             self.playwright = None
+
+         self._closed = True
+
+     def _get_or_create_page(self) -> PageInfo:  # pragma: no cover
+         """Get an available page or create a new one"""
+         # Try to get a ready page first
+         page_info = self.page_pool.get_ready_page()
+         if page_info:
+             return page_info
+
+         # Create a new page if under limit
+         if self.page_pool.pages_count < self.max_pages:
+             page = self.context.new_page()
+             page.set_default_navigation_timeout(self.timeout)
+             page.set_default_timeout(self.timeout)
+             if self.extra_headers:
+                 page.set_extra_http_headers(self.extra_headers)
+
+             if self.disable_resources:
+                 page.route("**/*", intercept_route)
+
+             return self.page_pool.add_page(page)
+
+         # Wait for a page to become available
+         max_wait = 30
+         start_time = time()
+
+         while time() - start_time < max_wait:
+             page_info = self.page_pool.get_ready_page()
+             if page_info:
+                 return page_info
+             sleep(0.05)
+
+         raise TimeoutError("No pages available within timeout period")
+
+     @staticmethod
+     def _detect_cloudflare(page_content):
+         """
+         Detect the type of Cloudflare challenge present in the provided page content.
+
+         This function analyzes the given page content to identify whether a specific
+         type of Cloudflare challenge is present. It checks for three predefined
+         challenge types: non-interactive, managed, and interactive. If a challenge
+         type is detected, it returns the corresponding type as a string. If no
+         challenge type is detected, it returns None.
+
+         Args:
+             page_content (str): The content of the page to analyze for Cloudflare
+                 challenge types.
+
+         Returns:
+             str: A string representing the detected Cloudflare challenge type, if
+                 found. Returns None if no challenge matches.
+         """
+         challenge_types = (
+             "non-interactive",
+             "managed",
+             "interactive",
+         )
+         for ctype in challenge_types:
+             if f"cType: '{ctype}'" in page_content:
+                 return ctype
+
+         return None
+
+     def _solve_cloudflare(self, page: Page) -> None:  # pragma: no cover
+         """Solve the Cloudflare challenge displayed on the Playwright page passed
+
+         :param page: The targeted page
+         :return:
+         """
+         challenge_type = self._detect_cloudflare(page.content())
+         if not challenge_type:
+             log.error("No Cloudflare challenge found.")
+             return
+         else:
+             log.info(f'The turnstile version discovered is "{challenge_type}"')
+             if challenge_type == "non-interactive":
+                 while "<title>Just a moment...</title>" in (page.content()):
+                     log.info("Waiting for Cloudflare wait page to disappear.")
+                     page.wait_for_timeout(1000)
+                     page.wait_for_load_state()
+                 log.info("Cloudflare captcha is solved")
+                 return
+
+             else:
+                 while "Verifying you are human." in page.content():
+                     # Wait for the verify spinner to disappear, checking every 500 ms
+                     page.wait_for_timeout(500)
+
+                 iframe = page.frame(url=__CF_PATTERN__)
+                 if iframe is None:
+                     log.info("Didn't find Cloudflare iframe!")
+                     return
+
+                 while not iframe.frame_element().is_visible():
+                     # Double-checking that the iframe is loaded
+                     page.wait_for_timeout(500)
+
+                 # Calculate the captcha coordinates for any viewport
+                 outer_box = page.locator(".main-content p+div>div>div").bounding_box()
+                 captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
+
+                 # Click the Turnstile checkbox at the calculated coordinates
+                 page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
+                 page.locator(".zone-name-title").wait_for(state="hidden")
+                 page.wait_for_load_state(state="domcontentloaded")
+
+                 log.info("Cloudflare captcha is solved")
+                 return
+
+     def fetch(self, url: str) -> Response:
+         """Opens the browser and performs the request based on your chosen options.
+
+         :param url: The target URL.
+         :return: A `Response` object.
+         """
+         if self._closed:  # pragma: no cover
+             raise RuntimeError("Context manager has been closed")
+
+         final_response = None
+         referer = (
+             generate_convincing_referer(url)
+             if (self.google_search and "referer" not in self._headers_keys)
+             else None
+         )
+
+         def handle_response(finished_response: SyncPlaywrightResponse):
+             nonlocal final_response
+             if (
+                 finished_response.request.resource_type == "document"
+                 and finished_response.request.is_navigation_request()
+             ):
+                 final_response = finished_response
+
+         page_info = self._get_or_create_page()
+         page_info.mark_busy(url=url)
+
+         try:  # pragma: no cover
+             # Navigate to URL and wait for a specified state
+             page_info.page.on("response", handle_response)
+             first_response = page_info.page.goto(url, referer=referer)
+             page_info.page.wait_for_load_state(state="domcontentloaded")
+
+             if self.network_idle:
+                 page_info.page.wait_for_load_state("networkidle")
+
+             if not first_response:
+                 raise RuntimeError(f"Failed to get response for {url}")
+
+             if self.solve_cloudflare:
+                 self._solve_cloudflare(page_info.page)
+                 # Make sure the page is fully loaded after the captcha
+                 page_info.page.wait_for_load_state(state="load")
+                 page_info.page.wait_for_load_state(state="domcontentloaded")
+                 if self.network_idle:
+                     page_info.page.wait_for_load_state("networkidle")
+
+             if self.page_action is not None:
+                 try:
+                     page_info.page = self.page_action(page_info.page)
+                 except Exception as e:
+                     log.error(f"Error executing page_action: {e}")
+
+             if self.wait_selector:
+                 try:
+                     waiter: Locator = page_info.page.locator(self.wait_selector)
+                     waiter.first.wait_for(state=self.wait_selector_state)
+                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                     page_info.page.wait_for_load_state(state="load")
+                     page_info.page.wait_for_load_state(state="domcontentloaded")
+                     if self.network_idle:
+                         page_info.page.wait_for_load_state("networkidle")
+                 except Exception as e:
+                     log.error(f"Error waiting for selector {self.wait_selector}: {e}")
+
+             page_info.page.wait_for_timeout(self.wait)
+             response = ResponseFactory.from_playwright_response(
+                 page_info.page, first_response, final_response, self.selector_config
+             )
+
+             # Mark the page as ready for next use
+             page_info.mark_ready()
+
+             return response
+
+         except Exception as e:  # pragma: no cover
+             page_info.mark_error()
+             raise e
+
+     def get_pool_stats(self) -> Dict[str, int]:
+         """Get statistics about the current page pool"""
+         return {
+             "total_pages": self.page_pool.pages_count,
+             "ready_pages": self.page_pool.ready_count,
+             "busy_pages": self.page_pool.busy_count,
+             "max_pages": self.max_pages,
+         }
+
+
+ class AsyncStealthySession(StealthySession):
+     """An async Stealthy session manager with page pooling."""
+
+     def __init__(
+         self,
+         max_pages: int = 1,
+         headless: bool = True,  # noqa: F821
+         block_images: bool = False,
+         disable_resources: bool = False,
+         block_webrtc: bool = False,
+         allow_webgl: bool = True,
+         network_idle: bool = False,
+         humanize: bool | float = True,
+         solve_cloudflare: bool = False,
+         wait: int | float = 0,
+         timeout: int | float = 30000,
+         page_action: Optional[Callable] = None,
+         wait_selector: Optional[str] = None,
+         init_script: Optional[str] = None,
+         addons: Optional[List[str]] = None,
+         wait_selector_state: SelectorWaitStates = "attached",
+         cookies: Optional[List[Dict]] = None,
+         google_search: bool = True,
+         extra_headers: Optional[Dict[str, str]] = None,
+         proxy: Optional[str | Dict[str, str]] = None,
+         os_randomize: bool = False,
+         disable_ads: bool = False,
+         geoip: bool = False,
+         selector_config: Optional[Dict] = None,
+         additional_args: Optional[Dict] = None,
+     ):
+         """A Browser session manager with page pooling
+
+         :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
+         :param block_images: Prevent the loading of images through Firefox preferences.
+             This can help save your proxy usage, but be careful with this option as it makes some websites never finish loading.
+         :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+             This can help save your proxy usage, but be careful with this option as it makes some websites never finish loading.
+         :param block_webrtc: Blocks WebRTC entirely.
+         :param cookies: Set cookies for the next request.
+         :param addons: List of Firefox addons to use. Must be paths to extracted addons.
+         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
+         :param solve_cloudflare: Solves all three types of Cloudflare's Turnstile wait page before returning the response to you.
+         :param allow_webgl: Enabled by default. Disabling WebGL is not recommended, as many WAFs now check whether WebGL is enabled.
+         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+         :param disable_ads: Disabled by default; this installs the `uBlock Origin` addon on the browser if enabled.
+         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
+         :param wait: The time (in milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
+         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000.
+         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+         :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
+         :param geoip: Recommended to use with proxies; automatically uses the IP's longitude, latitude, timezone, country, and locale, and spoofs the WebRTC IP address.
+             It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
+         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+         :param google_search: Enabled by default; Scrapling will set the referer header as if this request came from a Google search of this website's domain name.
+         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+         :param proxy: The proxy to be used with requests. It can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+         :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
+         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+         :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
+         """
+         super().__init__(
+             max_pages,
+             headless,
+             block_images,
+             disable_resources,
+             block_webrtc,
+             allow_webgl,
+             network_idle,
+             humanize,
+             solve_cloudflare,
+             wait,
+             timeout,
+             page_action,
+             wait_selector,
+             init_script,
+             addons,
+             wait_selector_state,
+             cookies,
+             google_search,
+             extra_headers,
+             proxy,
+             os_randomize,
+             disable_ads,
+             geoip,
+             selector_config,
+             additional_args,
+         )
+         self.playwright: Optional[AsyncPlaywright] = None
+         self.context: Optional[AsyncBrowserContext] = None
+         self._lock = Lock()
+         self.__enter__ = None
+         self.__exit__ = None
+
+     async def __create__(self):
+         """Create a browser for this instance and context."""
+         self.playwright: AsyncPlaywright = await async_playwright().start()
+         self.context: AsyncBrowserContext = (
+             await self.playwright.firefox.launch_persistent_context(
+                 **self.launch_options
+             )
+         )
+         if self.init_script:  # pragma: no cover
+             await self.context.add_init_script(path=self.init_script)
+
+         if self.cookies:
+             await self.context.add_cookies(self.cookies)
+
+     async def __aenter__(self):
+         await self.__create__()
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         await self.close()
+
+     async def close(self):
+         """Close all resources"""
+         if self._closed:  # pragma: no cover
+             return
+
+         if self.context:
+             await self.context.close()
+             self.context = None
+
+         if self.playwright:
+             await self.playwright.stop()
+             self.playwright = None
+
+         self._closed = True
+
+     async def _get_or_create_page(self) -> PageInfo:
+         """Get an available page or create a new one"""
+         async with self._lock:
+             # Try to get a ready page first
+             page_info = self.page_pool.get_ready_page()
+             if page_info:
+                 return page_info
+
+             # Create a new page if under limit
+             if self.page_pool.pages_count < self.max_pages:
+                 page = await self.context.new_page()
+                 page.set_default_navigation_timeout(self.timeout)
+                 page.set_default_timeout(self.timeout)
+                 if self.extra_headers:
+                     await page.set_extra_http_headers(self.extra_headers)
+
+                 if self.disable_resources:
+                     await page.route("**/*", async_intercept_route)
+
+                 return self.page_pool.add_page(page)
+
+             # Wait for a page to become available
+             max_wait = 30
+             start_time = time()
+
+             while time() - start_time < max_wait:  # pragma: no cover
+                 page_info = self.page_pool.get_ready_page()
+                 if page_info:
+                     return page_info
+                 await asyncio_sleep(0.05)
+
+             raise TimeoutError("No pages available within timeout period")
+
+     async def _solve_cloudflare(self, page: async_Page):
+         """Solve the Cloudflare challenge displayed on the Playwright page passed. The async version
+
+         :param page: The async targeted page
+         :return:
+         """
+         challenge_type = self._detect_cloudflare(await page.content())
+         if not challenge_type:
+             log.error("No Cloudflare challenge found.")
+             return
+         else:
+             log.info(f'The turnstile version discovered is "{challenge_type}"')
+             if challenge_type == "non-interactive":  # pragma: no cover
+                 while "<title>Just a moment...</title>" in (await page.content()):
+                     log.info("Waiting for Cloudflare wait page to disappear.")
+                     await page.wait_for_timeout(1000)
+                     await page.wait_for_load_state()
+                 log.info("Cloudflare captcha is solved")
+                 return
+
+             else:
+                 while "Verifying you are human." in (await page.content()):
+                     # Wait for the verify spinner to disappear, checking every 500 ms
+                     await page.wait_for_timeout(500)
+
+                 iframe = page.frame(url=__CF_PATTERN__)
+                 if iframe is None:
+                     log.info("Didn't find Cloudflare iframe!")
+                     return
+
+                 while not await (await iframe.frame_element()).is_visible():
+                     # Double-checking that the iframe is loaded
+                     await page.wait_for_timeout(500)
+
+                 # Calculate the captcha coordinates for any viewport
+                 outer_box = await page.locator(
+                     ".main-content p+div>div>div"
+                 ).bounding_box()
+                 captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
+
+                 # Click the Turnstile checkbox at the calculated coordinates
+                 await page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
+                 await page.locator(".zone-name-title").wait_for(state="hidden")
+                 await page.wait_for_load_state(state="domcontentloaded")
+
+                 log.info("Cloudflare captcha is solved")
+                 return
+
+     async def fetch(self, url: str) -> Response:
+         """Opens the browser and performs the request based on your chosen options.
+
+         :param url: The target URL.
+         :return: A `Response` object.
+         """
+         if self._closed:  # pragma: no cover
+             raise RuntimeError("Context manager has been closed")
+
+         final_response = None
+         referer = (
+             generate_convincing_referer(url)
+             if (self.google_search and "referer" not in self._headers_keys)
+             else None
+         )
+
+         async def handle_response(finished_response: AsyncPlaywrightResponse):
+             nonlocal final_response
+             if (
+                 finished_response.request.resource_type == "document"
+                 and finished_response.request.is_navigation_request()
+             ):
+                 final_response = finished_response
+
+         page_info = await self._get_or_create_page()
+         page_info.mark_busy(url=url)
+
+         try:
+             # Navigate to URL and wait for a specified state
+             page_info.page.on("response", handle_response)
+             first_response = await page_info.page.goto(url, referer=referer)
+             await page_info.page.wait_for_load_state(state="domcontentloaded")
+
+             if self.network_idle:
+                 await page_info.page.wait_for_load_state("networkidle")
+
+             if not first_response:
+                 raise RuntimeError(f"Failed to get response for {url}")
+
+             if self.solve_cloudflare:
+                 await self._solve_cloudflare(page_info.page)
+                 # Make sure the page is fully loaded after the captcha
+                 await page_info.page.wait_for_load_state(state="load")
+                 await page_info.page.wait_for_load_state(state="domcontentloaded")
+                 if self.network_idle:
+                     await page_info.page.wait_for_load_state("networkidle")
+
+             if self.page_action is not None:
+                 try:
+                     page_info.page = await self.page_action(page_info.page)
+                 except Exception as e:
+                     log.error(f"Error executing page_action: {e}")
+
+             if self.wait_selector:
+                 try:
+                     waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
+                     await waiter.first.wait_for(state=self.wait_selector_state)
+                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                     await page_info.page.wait_for_load_state(state="load")
+                     await page_info.page.wait_for_load_state(state="domcontentloaded")
+                     if self.network_idle:
+                         await page_info.page.wait_for_load_state("networkidle")
+                 except Exception as e:
+                     log.error(f"Error waiting for selector {self.wait_selector}: {e}")
+
+             await page_info.page.wait_for_timeout(self.wait)
+
+             # Create response object
+             response = await ResponseFactory.from_async_playwright_response(
+                 page_info.page, first_response, final_response, self.selector_config
+             )
+
+             # Mark the page as ready for next use
+             page_info.mark_ready()
+
+             return response
+
+         except Exception as e:
+             page_info.mark_error()
+             raise e
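
For orientation, a minimal usage sketch of the new session classes in this file, based only on the constructors, `fetch`, and the context-manager methods shown above. The import path assumes the module location added in this diff; the public API may re-export these names elsewhere (for example through `scrapling.fetchers`), and the exact attributes of the returned `Response` come from `scrapling.engines.toolbelt` and are not shown here.

# Hedged sketch, not part of the package diff: assumes the classes are importable
# from the module path added above and that the toolbelt Response exposes `.status`.
import asyncio

from scrapling.engines._browsers._camoufox import AsyncStealthySession, StealthySession


def sync_example() -> None:
    # One Camoufox browser context, up to two pooled tabs reused across fetch() calls
    with StealthySession(max_pages=2, headless=True) as session:
        page = session.fetch("https://example.com")
        # After a single fetch, expect one page total and one ready page in the pool
        print(session.get_pool_stats())


async def async_example() -> None:
    # Same options as the sync class; fetch() is awaited and page creation is lock-guarded
    async with AsyncStealthySession(max_pages=2) as session:
        page = await session.fetch("https://example.com")
        print(page.status)  # attribute assumed from scrapling's Response class


if __name__ == "__main__":
    sync_example()
    asyncio.run(async_example())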