scrapling 0.2.99__py3-none-any.whl → 0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. scrapling/__init__.py +18 -31
  2. scrapling/cli.py +818 -20
  3. scrapling/core/_html_utils.py +348 -0
  4. scrapling/core/_types.py +34 -17
  5. scrapling/core/ai.py +611 -0
  6. scrapling/core/custom_types.py +183 -100
  7. scrapling/core/mixins.py +27 -19
  8. scrapling/core/shell.py +647 -0
  9. scrapling/core/{storage_adaptors.py → storage.py} +41 -33
  10. scrapling/core/translator.py +20 -26
  11. scrapling/core/utils.py +49 -54
  12. scrapling/engines/__init__.py +15 -6
  13. scrapling/engines/_browsers/__init__.py +2 -0
  14. scrapling/engines/_browsers/_camoufox.py +745 -0
  15. scrapling/engines/_browsers/_config_tools.py +130 -0
  16. scrapling/engines/_browsers/_controllers.py +630 -0
  17. scrapling/engines/_browsers/_page.py +93 -0
  18. scrapling/engines/_browsers/_validators.py +150 -0
  19. scrapling/engines/constants.py +101 -88
  20. scrapling/engines/static.py +667 -110
  21. scrapling/engines/toolbelt/__init__.py +20 -6
  22. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  23. scrapling/engines/toolbelt/convertor.py +254 -0
  24. scrapling/engines/toolbelt/custom.py +158 -175
  25. scrapling/engines/toolbelt/fingerprints.py +32 -46
  26. scrapling/engines/toolbelt/navigation.py +68 -39
  27. scrapling/fetchers.py +227 -333
  28. scrapling/parser.py +781 -449
  29. scrapling-0.3.dist-info/METADATA +409 -0
  30. scrapling-0.3.dist-info/RECORD +41 -0
  31. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
  32. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
  33. scrapling/defaults.py +0 -25
  34. scrapling/engines/camo.py +0 -339
  35. scrapling/engines/pw.py +0 -465
  36. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  37. scrapling-0.2.99.dist-info/METADATA +0 -290
  38. scrapling-0.2.99.dist-info/RECORD +0 -49
  39. tests/__init__.py +0 -1
  40. tests/fetchers/__init__.py +0 -1
  41. tests/fetchers/async/__init__.py +0 -0
  42. tests/fetchers/async/test_camoufox.py +0 -97
  43. tests/fetchers/async/test_httpx.py +0 -85
  44. tests/fetchers/async/test_playwright.py +0 -101
  45. tests/fetchers/sync/__init__.py +0 -0
  46. tests/fetchers/sync/test_camoufox.py +0 -70
  47. tests/fetchers/sync/test_httpx.py +0 -84
  48. tests/fetchers/sync/test_playwright.py +0 -89
  49. tests/fetchers/test_utils.py +0 -97
  50. tests/parser/__init__.py +0 -0
  51. tests/parser/test_automatch.py +0 -111
  52. tests/parser/test_general.py +0 -330
  53. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
  54. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,745 @@
1
+ from time import time, sleep
2
+ from re import compile as re_compile
3
+ from asyncio import sleep as asyncio_sleep, Lock
4
+
5
+ from camoufox import DefaultAddons
6
+ from camoufox.utils import launch_options as generate_launch_options
7
+ from playwright.sync_api import (
8
+ Response as SyncPlaywrightResponse,
9
+ sync_playwright,
10
+ BrowserContext,
11
+ Playwright,
12
+ Locator,
13
+ Page,
14
+ )
15
+ from playwright.async_api import (
16
+ async_playwright,
17
+ Response as AsyncPlaywrightResponse,
18
+ BrowserContext as AsyncBrowserContext,
19
+ Playwright as AsyncPlaywright,
20
+ Locator as AsyncLocator,
21
+ Page as async_Page,
22
+ )
23
+
24
+ from scrapling.core.utils import log
25
+ from ._page import PageInfo, PagePool
26
+ from ._validators import validate, CamoufoxConfig
27
+ from scrapling.core._types import (
28
+ Dict,
29
+ List,
30
+ Optional,
31
+ Callable,
32
+ SelectorWaitStates,
33
+ )
34
+ from scrapling.engines.toolbelt import (
35
+ Response,
36
+ ResponseFactory,
37
+ async_intercept_route,
38
+ generate_convincing_referer,
39
+ get_os_name,
40
+ intercept_route,
41
+ )
42
+
43
+ __CF_PATTERN__ = re_compile("challenges.cloudflare.com/cdn-cgi/challenge-platform/.*")
44
+
45
+
46
class StealthySession:
    """Stealthy (Camoufox/Firefox) browser session manager backed by a page pool."""

    # All configuration knobs plus the runtime handles (playwright/context/pool).
    __slots__ = (
        "max_pages", "headless", "block_images", "disable_resources",
        "block_webrtc", "allow_webgl", "network_idle", "humanize",
        "solve_cloudflare", "wait", "timeout", "page_action", "wait_selector",
        "addons", "wait_selector_state", "cookies", "google_search",
        "extra_headers", "proxy", "os_randomize", "disable_ads", "geoip",
        "selector_config", "additional_args", "playwright", "browser",
        "context", "page_pool", "_closed", "launch_options", "_headers_keys",
    )
82
+
83
+ def __init__(
84
+ self,
85
+ max_pages: int = 1,
86
+ headless: bool = True, # noqa: F821
87
+ block_images: bool = False,
88
+ disable_resources: bool = False,
89
+ block_webrtc: bool = False,
90
+ allow_webgl: bool = True,
91
+ network_idle: bool = False,
92
+ humanize: bool | float = True,
93
+ solve_cloudflare: bool = False,
94
+ wait: int | float = 0,
95
+ timeout: int | float = 30000,
96
+ page_action: Optional[Callable] = None,
97
+ wait_selector: Optional[str] = None,
98
+ addons: Optional[List[str]] = None,
99
+ wait_selector_state: SelectorWaitStates = "attached",
100
+ cookies: Optional[List[Dict]] = None,
101
+ google_search: bool = True,
102
+ extra_headers: Optional[Dict[str, str]] = None,
103
+ proxy: Optional[str | Dict[str, str]] = None,
104
+ os_randomize: bool = False,
105
+ disable_ads: bool = False,
106
+ geoip: bool = False,
107
+ selector_config: Optional[Dict] = None,
108
+ additional_args: Optional[Dict] = None,
109
+ ):
110
+ """A Browser session manager with page pooling
111
+
112
+ :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
113
+ :param block_images: Prevent the loading of images through Firefox preferences.
114
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
115
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
116
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
117
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
118
+ :param block_webrtc: Blocks WebRTC entirely.
119
+ :param cookies: Set cookies for the next request.
120
+ :param addons: List of Firefox addons to use. Must be paths to extracted addons.
121
+ :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
122
+ :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
123
+ :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
124
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
125
+ :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
126
+ :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
127
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
128
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
129
+ :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
130
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
131
+ :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
132
+ It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
133
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
134
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
135
+ :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
136
+ :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
137
+ :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
138
+ :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
139
+ :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
140
+ """
141
+
142
+ params = {
143
+ "max_pages": max_pages,
144
+ "headless": headless,
145
+ "block_images": block_images,
146
+ "disable_resources": disable_resources,
147
+ "block_webrtc": block_webrtc,
148
+ "allow_webgl": allow_webgl,
149
+ "network_idle": network_idle,
150
+ "humanize": humanize,
151
+ "solve_cloudflare": solve_cloudflare,
152
+ "wait": wait,
153
+ "timeout": timeout,
154
+ "page_action": page_action,
155
+ "wait_selector": wait_selector,
156
+ "addons": addons,
157
+ "wait_selector_state": wait_selector_state,
158
+ "cookies": cookies,
159
+ "google_search": google_search,
160
+ "extra_headers": extra_headers,
161
+ "proxy": proxy,
162
+ "os_randomize": os_randomize,
163
+ "disable_ads": disable_ads,
164
+ "geoip": geoip,
165
+ "selector_config": selector_config,
166
+ "additional_args": additional_args,
167
+ }
168
+ config = validate(params, CamoufoxConfig)
169
+
170
+ self.max_pages = config.max_pages
171
+ self.headless = config.headless
172
+ self.block_images = config.block_images
173
+ self.disable_resources = config.disable_resources
174
+ self.block_webrtc = config.block_webrtc
175
+ self.allow_webgl = config.allow_webgl
176
+ self.network_idle = config.network_idle
177
+ self.humanize = config.humanize
178
+ self.solve_cloudflare = config.solve_cloudflare
179
+ self.wait = config.wait
180
+ self.timeout = config.timeout
181
+ self.page_action = config.page_action
182
+ self.wait_selector = config.wait_selector
183
+ self.addons = config.addons
184
+ self.wait_selector_state = config.wait_selector_state
185
+ self.cookies = config.cookies
186
+ self.google_search = config.google_search
187
+ self.extra_headers = config.extra_headers
188
+ self.proxy = config.proxy
189
+ self.os_randomize = config.os_randomize
190
+ self.disable_ads = config.disable_ads
191
+ self.geoip = config.geoip
192
+ self.selector_config = config.selector_config
193
+ self.additional_args = config.additional_args
194
+
195
+ self.playwright: Optional[Playwright] = None
196
+ self.context: Optional[BrowserContext] = None
197
+ self.page_pool = PagePool(self.max_pages)
198
+ self._closed = False
199
+ self.selector_config = config.selector_config
200
+ self.page_action = config.page_action
201
+ self._headers_keys = (
202
+ set(map(str.lower, self.extra_headers.keys()))
203
+ if self.extra_headers
204
+ else set()
205
+ )
206
+ self.__initiate_browser_options__()
207
+
208
+ def __initiate_browser_options__(self):
209
+ """Initiate browser options."""
210
+ self.launch_options = generate_launch_options(
211
+ **{
212
+ "geoip": self.geoip,
213
+ "proxy": dict(self.proxy) if self.proxy else self.proxy,
214
+ "enable_cache": True,
215
+ "addons": self.addons,
216
+ "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
217
+ "headless": self.headless,
218
+ "humanize": True if self.solve_cloudflare else self.humanize,
219
+ "i_know_what_im_doing": True, # To turn warnings off with the user configurations
220
+ "allow_webgl": self.allow_webgl,
221
+ "block_webrtc": self.block_webrtc,
222
+ "block_images": self.block_images, # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
223
+ "os": None if self.os_randomize else get_os_name(),
224
+ "user_data_dir": "",
225
+ **self.additional_args,
226
+ }
227
+ )
228
+
229
+ def __create__(self):
230
+ """Create a browser for this instance and context."""
231
+ self.playwright = sync_playwright().start()
232
+ self.context = (
233
+ self.playwright.firefox.launch_persistent_context( # pragma: no cover
234
+ **self.launch_options
235
+ )
236
+ )
237
+ if self.cookies: # pragma: no cover
238
+ self.context.add_cookies(self.cookies)
239
+
240
+ def __enter__(self): # pragma: no cover
241
+ self.__create__()
242
+ return self
243
+
244
+ def __exit__(self, exc_type, exc_val, exc_tb):
245
+ self.close()
246
+
247
+ def close(self): # pragma: no cover
248
+ """Close all resources"""
249
+ if self._closed: # pragma: no cover
250
+ return
251
+
252
+ if self.context:
253
+ self.context.close()
254
+ self.context = None
255
+
256
+ if self.playwright:
257
+ self.playwright.stop()
258
+ self.playwright = None
259
+
260
+ self._closed = True
261
+
262
+ def _get_or_create_page(self) -> PageInfo: # pragma: no cover
263
+ """Get an available page or create a new one"""
264
+ # Try to get a ready page first
265
+ page_info = self.page_pool.get_ready_page()
266
+ if page_info:
267
+ return page_info
268
+
269
+ # Create a new page if under limit
270
+ if self.page_pool.pages_count < self.max_pages:
271
+ page = self.context.new_page()
272
+ page.set_default_navigation_timeout(self.timeout)
273
+ page.set_default_timeout(self.timeout)
274
+ if self.extra_headers:
275
+ page.set_extra_http_headers(self.extra_headers)
276
+
277
+ if self.disable_resources:
278
+ page.route("**/*", intercept_route)
279
+
280
+ return self.page_pool.add_page(page)
281
+
282
+ # Wait for a page to become available
283
+ max_wait = 30
284
+ start_time = time()
285
+
286
+ while time() - start_time < max_wait:
287
+ page_info = self.page_pool.get_ready_page()
288
+ if page_info:
289
+ return page_info
290
+ sleep(0.05)
291
+
292
+ raise TimeoutError("No pages available within timeout period")
293
+
294
+ @staticmethod
295
+ def _detect_cloudflare(page_content):
296
+ """
297
+ Detect the type of Cloudflare challenge present in the provided page content.
298
+
299
+ This function analyzes the given page content to identify whether a specific
300
+ type of Cloudflare challenge is present. It checks for three predefined
301
+ challenge types: non-interactive, managed, and interactive. If a challenge
302
+ type is detected, it returns the corresponding type as a string. If no
303
+ challenge type is detected, it returns None.
304
+
305
+ Args:
306
+ page_content (str): The content of the page to analyze for Cloudflare
307
+ challenge types.
308
+
309
+ Returns:
310
+ str: A string representing the detected Cloudflare challenge type, if
311
+ found. Returns None if no challenge matches.
312
+ """
313
+ challenge_types = (
314
+ "non-interactive",
315
+ "managed",
316
+ "interactive",
317
+ )
318
+ for ctype in challenge_types:
319
+ if f"cType: '{ctype}'" in page_content:
320
+ return ctype
321
+
322
+ return None
323
+
324
+ def _solve_cloudflare(self, page: Page) -> None: # pragma: no cover
325
+ """Solve the cloudflare challenge displayed on the playwright page passed
326
+
327
+ :param page: The targeted page
328
+ :return:
329
+ """
330
+ challenge_type = self._detect_cloudflare(page.content())
331
+ if not challenge_type:
332
+ log.error("No Cloudflare challenge found.")
333
+ return
334
+ else:
335
+ log.info(f'The turnstile version discovered is "{challenge_type}"')
336
+ if challenge_type == "non-interactive":
337
+ while "<title>Just a moment...</title>" in (page.content()):
338
+ log.info("Waiting for Cloudflare wait page to disappear.")
339
+ page.wait_for_timeout(1000)
340
+ page.wait_for_load_state()
341
+ log.info("Cloudflare captcha is solved")
342
+ return
343
+
344
+ else:
345
+ while "Verifying you are human." in page.content():
346
+ # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
347
+ page.wait_for_timeout(500)
348
+
349
+ iframe = page.frame(url=__CF_PATTERN__)
350
+ if iframe is None:
351
+ log.info("Didn't find Cloudflare iframe!")
352
+ return
353
+
354
+ while not iframe.frame_element().is_visible():
355
+ # Double-checking that the iframe is loaded
356
+ page.wait_for_timeout(500)
357
+
358
+ # Calculate the Captcha coordinates for any viewport
359
+ outer_box = page.locator(".main-content p+div>div>div").bounding_box()
360
+ captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
361
+
362
+ # Move the mouse to the center of the window, then press and hold the left mouse button
363
+ page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
364
+ page.locator(".zone-name-title").wait_for(state="hidden")
365
+ page.wait_for_load_state(state="domcontentloaded")
366
+
367
+ log.info("Cloudflare captcha is solved")
368
+ return
369
+
370
+ def fetch(self, url: str) -> Response:
371
+ """Opens up the browser and do your request based on your chosen options.
372
+
373
+ :param url: The Target url.
374
+ :return: A `Response` object.
375
+ """
376
+ if self._closed: # pragma: no cover
377
+ raise RuntimeError("Context manager has been closed")
378
+
379
+ final_response = None
380
+ referer = (
381
+ generate_convincing_referer(url)
382
+ if (self.google_search and "referer" not in self._headers_keys)
383
+ else None
384
+ )
385
+
386
+ def handle_response(finished_response: SyncPlaywrightResponse):
387
+ nonlocal final_response
388
+ if (
389
+ finished_response.request.resource_type == "document"
390
+ and finished_response.request.is_navigation_request()
391
+ ):
392
+ final_response = finished_response
393
+
394
+ page_info = self._get_or_create_page()
395
+ page_info.mark_busy(url=url)
396
+
397
+ try: # pragma: no cover
398
+ # Navigate to URL and wait for a specified state
399
+ page_info.page.on("response", handle_response)
400
+ first_response = page_info.page.goto(url, referer=referer)
401
+ page_info.page.wait_for_load_state(state="domcontentloaded")
402
+
403
+ if self.network_idle:
404
+ page_info.page.wait_for_load_state("networkidle")
405
+
406
+ if not first_response:
407
+ raise RuntimeError(f"Failed to get response for {url}")
408
+
409
+ if self.solve_cloudflare:
410
+ self._solve_cloudflare(page_info.page)
411
+ # Make sure the page is fully loaded after the captcha
412
+ page_info.page.wait_for_load_state(state="load")
413
+ page_info.page.wait_for_load_state(state="domcontentloaded")
414
+ if self.network_idle:
415
+ page_info.page.wait_for_load_state("networkidle")
416
+
417
+ if self.page_action is not None:
418
+ try:
419
+ page_info.page = self.page_action(page_info.page)
420
+ except Exception as e:
421
+ log.error(f"Error executing page_action: {e}")
422
+
423
+ if self.wait_selector:
424
+ try:
425
+ waiter: Locator = page_info.page.locator(self.wait_selector)
426
+ waiter.first.wait_for(state=self.wait_selector_state)
427
+ # Wait again after waiting for the selector, helpful with protections like Cloudflare
428
+ page_info.page.wait_for_load_state(state="load")
429
+ page_info.page.wait_for_load_state(state="domcontentloaded")
430
+ if self.network_idle:
431
+ page_info.page.wait_for_load_state("networkidle")
432
+ except Exception as e:
433
+ log.error(f"Error waiting for selector {self.wait_selector}: {e}")
434
+
435
+ page_info.page.wait_for_timeout(self.wait)
436
+ response = ResponseFactory.from_playwright_response(
437
+ page_info.page, first_response, final_response, self.selector_config
438
+ )
439
+
440
+ # Mark the page as ready for next use
441
+ page_info.mark_ready()
442
+
443
+ return response
444
+
445
+ except Exception as e: # pragma: no cover
446
+ page_info.mark_error()
447
+ raise e
448
+
449
+ def get_pool_stats(self) -> Dict[str, int]:
450
+ """Get statistics about the current page pool"""
451
+ return {
452
+ "total_pages": self.page_pool.pages_count,
453
+ "ready_pages": self.page_pool.ready_count,
454
+ "busy_pages": self.page_pool.busy_count,
455
+ "max_pages": self.max_pages,
456
+ }
457
+
458
+
459
class AsyncStealthySession(StealthySession):
    """Async variant of `StealthySession`: the same stealthy browser session
    manager with page pooling, driven through Playwright's async API."""

    def __init__(
        self,
        max_pages: int = 1,
        headless: bool = True,  # noqa: F821
        block_images: bool = False,
        disable_resources: bool = False,
        block_webrtc: bool = False,
        allow_webgl: bool = True,
        network_idle: bool = False,
        humanize: bool | float = True,
        solve_cloudflare: bool = False,
        wait: int | float = 0,
        timeout: int | float = 30000,
        page_action: Optional[Callable] = None,
        wait_selector: Optional[str] = None,
        addons: Optional[List[str]] = None,
        wait_selector_state: SelectorWaitStates = "attached",
        cookies: Optional[List[Dict]] = None,
        google_search: bool = True,
        extra_headers: Optional[Dict[str, str]] = None,
        proxy: Optional[str | Dict[str, str]] = None,
        os_randomize: bool = False,
        disable_ads: bool = False,
        geoip: bool = False,
        selector_config: Optional[Dict] = None,
        additional_args: Optional[Dict] = None,
    ):
        """A Browser session manager with page pooling (async).

        All parameters have the same meaning as in `StealthySession.__init__`;
        see that docstring for the full description of each option.
        """
        super().__init__(
            max_pages,
            headless,
            block_images,
            disable_resources,
            block_webrtc,
            allow_webgl,
            network_idle,
            humanize,
            solve_cloudflare,
            wait,
            timeout,
            page_action,
            wait_selector,
            addons,
            wait_selector_state,
            cookies,
            google_search,
            extra_headers,
            proxy,
            os_randomize,
            disable_ads,
            geoip,
            selector_config,
            additional_args,
        )
        # Re-declare the runtime handles with their async types.
        self.playwright: Optional[AsyncPlaywright] = None
        self.context: Optional[AsyncBrowserContext] = None
        # Serializes page creation across concurrent fetches.
        self._lock = Lock()
        # FIX: removed the previous `self.__enter__ = None` / `self.__exit__ = None`
        # assignments — dunder lookup happens on the type, so instance-level
        # assignments never disabled the inherited sync context manager; they
        # were dead code.
551
+
552
+ async def __create__(self):
553
+ """Create a browser for this instance and context."""
554
+ self.playwright: AsyncPlaywright = await async_playwright().start()
555
+ self.context: AsyncBrowserContext = (
556
+ await self.playwright.firefox.launch_persistent_context(
557
+ **self.launch_options
558
+ )
559
+ )
560
+ if self.cookies:
561
+ await self.context.add_cookies(self.cookies)
562
+
563
+ async def __aenter__(self):
564
+ await self.__create__()
565
+ return self
566
+
567
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
568
+ await self.close()
569
+
570
+ async def close(self):
571
+ """Close all resources"""
572
+ if self._closed: # pragma: no cover
573
+ return
574
+
575
+ if self.context:
576
+ await self.context.close()
577
+ self.context = None
578
+
579
+ if self.playwright:
580
+ await self.playwright.stop()
581
+ self.playwright = None
582
+
583
+ self._closed = True
584
+
585
+ async def _get_or_create_page(self) -> PageInfo:
586
+ """Get an available page or create a new one"""
587
+ async with self._lock:
588
+ # Try to get a ready page first
589
+ page_info = self.page_pool.get_ready_page()
590
+ if page_info:
591
+ return page_info
592
+
593
+ # Create a new page if under limit
594
+ if self.page_pool.pages_count < self.max_pages:
595
+ page = await self.context.new_page()
596
+ page.set_default_navigation_timeout(self.timeout)
597
+ page.set_default_timeout(self.timeout)
598
+ if self.extra_headers:
599
+ await page.set_extra_http_headers(self.extra_headers)
600
+
601
+ if self.disable_resources:
602
+ await page.route("**/*", async_intercept_route)
603
+
604
+ return self.page_pool.add_page(page)
605
+
606
+ # Wait for a page to become available
607
+ max_wait = 30
608
+ start_time = time()
609
+
610
+ while time() - start_time < max_wait: # pragma: no cover
611
+ page_info = self.page_pool.get_ready_page()
612
+ if page_info:
613
+ return page_info
614
+ await asyncio_sleep(0.05)
615
+
616
+ raise TimeoutError("No pages available within timeout period")
617
+
618
+ async def _solve_cloudflare(self, page: async_Page):
619
+ """Solve the cloudflare challenge displayed on the playwright page passed. The async version
620
+
621
+ :param page: The async targeted page
622
+ :return:
623
+ """
624
+ challenge_type = self._detect_cloudflare(await page.content())
625
+ if not challenge_type:
626
+ log.error("No Cloudflare challenge found.")
627
+ return
628
+ else:
629
+ log.info(f'The turnstile version discovered is "{challenge_type}"')
630
+ if challenge_type == "non-interactive": # pragma: no cover
631
+ while "<title>Just a moment...</title>" in (await page.content()):
632
+ log.info("Waiting for Cloudflare wait page to disappear.")
633
+ await page.wait_for_timeout(1000)
634
+ await page.wait_for_load_state()
635
+ log.info("Cloudflare captcha is solved")
636
+ return
637
+
638
+ else:
639
+ while "Verifying you are human." in (await page.content()):
640
+ # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
641
+ await page.wait_for_timeout(500)
642
+
643
+ iframe = page.frame(url=__CF_PATTERN__)
644
+ if iframe is None:
645
+ log.info("Didn't find Cloudflare iframe!")
646
+ return
647
+
648
+ while not await (await iframe.frame_element()).is_visible():
649
+ # Double-checking that the iframe is loaded
650
+ await page.wait_for_timeout(500)
651
+
652
+ # Calculate the Captcha coordinates for any viewport
653
+ outer_box = await page.locator(
654
+ ".main-content p+div>div>div"
655
+ ).bounding_box()
656
+ captcha_x, captcha_y = outer_box["x"] + 26, outer_box["y"] + 25
657
+
658
+ # Move the mouse to the center of the window, then press and hold the left mouse button
659
+ await page.mouse.click(captcha_x, captcha_y, delay=60, button="left")
660
+ await page.locator(".zone-name-title").wait_for(state="hidden")
661
+ await page.wait_for_load_state(state="domcontentloaded")
662
+
663
+ log.info("Cloudflare captcha is solved")
664
+ return
665
+
666
+ async def fetch(self, url: str) -> Response:
667
+ """Opens up the browser and do your request based on your chosen options.
668
+
669
+ :param url: The Target url.
670
+ :return: A `Response` object.
671
+ """
672
+ if self._closed: # pragma: no cover
673
+ raise RuntimeError("Context manager has been closed")
674
+
675
+ final_response = None
676
+ referer = (
677
+ generate_convincing_referer(url)
678
+ if (self.google_search and "referer" not in self._headers_keys)
679
+ else None
680
+ )
681
+
682
+ async def handle_response(finished_response: AsyncPlaywrightResponse):
683
+ nonlocal final_response
684
+ if (
685
+ finished_response.request.resource_type == "document"
686
+ and finished_response.request.is_navigation_request()
687
+ ):
688
+ final_response = finished_response
689
+
690
+ page_info = await self._get_or_create_page()
691
+ page_info.mark_busy(url=url)
692
+
693
+ try:
694
+ # Navigate to URL and wait for a specified state
695
+ page_info.page.on("response", handle_response)
696
+ first_response = await page_info.page.goto(url, referer=referer)
697
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
698
+
699
+ if self.network_idle:
700
+ await page_info.page.wait_for_load_state("networkidle")
701
+
702
+ if not first_response:
703
+ raise RuntimeError(f"Failed to get response for {url}")
704
+
705
+ if self.solve_cloudflare:
706
+ await self._solve_cloudflare(page_info.page)
707
+ # Make sure the page is fully loaded after the captcha
708
+ await page_info.page.wait_for_load_state(state="load")
709
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
710
+ if self.network_idle:
711
+ await page_info.page.wait_for_load_state("networkidle")
712
+
713
+ if self.page_action is not None:
714
+ try:
715
+ page_info.page = await self.page_action(page_info.page)
716
+ except Exception as e:
717
+ log.error(f"Error executing page_action: {e}")
718
+
719
+ if self.wait_selector:
720
+ try:
721
+ waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
722
+ await waiter.first.wait_for(state=self.wait_selector_state)
723
+ # Wait again after waiting for the selector, helpful with protections like Cloudflare
724
+ await page_info.page.wait_for_load_state(state="load")
725
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
726
+ if self.network_idle:
727
+ await page_info.page.wait_for_load_state("networkidle")
728
+ except Exception as e:
729
+ log.error(f"Error waiting for selector {self.wait_selector}: {e}")
730
+
731
+ await page_info.page.wait_for_timeout(self.wait)
732
+
733
+ # Create response object
734
+ response = await ResponseFactory.from_async_playwright_response(
735
+ page_info.page, first_response, final_response, self.selector_config
736
+ )
737
+
738
+ # Mark the page as ready for next use
739
+ page_info.mark_ready()
740
+
741
+ return response
742
+
743
+ except Exception as e:
744
+ page_info.mark_error()
745
+ raise e