scrapling 0.2.99__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. scrapling/__init__.py +18 -31
  2. scrapling/cli.py +818 -20
  3. scrapling/core/_html_utils.py +348 -0
  4. scrapling/core/_types.py +34 -17
  5. scrapling/core/ai.py +611 -0
  6. scrapling/core/custom_types.py +183 -100
  7. scrapling/core/mixins.py +27 -19
  8. scrapling/core/shell.py +647 -0
  9. scrapling/core/{storage_adaptors.py → storage.py} +41 -33
  10. scrapling/core/translator.py +20 -26
  11. scrapling/core/utils.py +49 -54
  12. scrapling/engines/__init__.py +15 -6
  13. scrapling/engines/_browsers/__init__.py +2 -0
  14. scrapling/engines/_browsers/_camoufox.py +759 -0
  15. scrapling/engines/_browsers/_config_tools.py +130 -0
  16. scrapling/engines/_browsers/_controllers.py +644 -0
  17. scrapling/engines/_browsers/_page.py +93 -0
  18. scrapling/engines/_browsers/_validators.py +170 -0
  19. scrapling/engines/constants.py +101 -88
  20. scrapling/engines/static.py +667 -110
  21. scrapling/engines/toolbelt/__init__.py +20 -6
  22. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  23. scrapling/engines/toolbelt/convertor.py +254 -0
  24. scrapling/engines/toolbelt/custom.py +158 -175
  25. scrapling/engines/toolbelt/fingerprints.py +32 -46
  26. scrapling/engines/toolbelt/navigation.py +68 -39
  27. scrapling/fetchers.py +239 -333
  28. scrapling/parser.py +781 -449
  29. scrapling-0.3.1.dist-info/METADATA +411 -0
  30. scrapling-0.3.1.dist-info/RECORD +41 -0
  31. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/WHEEL +1 -1
  32. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/top_level.txt +0 -1
  33. scrapling/defaults.py +0 -25
  34. scrapling/engines/camo.py +0 -339
  35. scrapling/engines/pw.py +0 -465
  36. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  37. scrapling-0.2.99.dist-info/METADATA +0 -290
  38. scrapling-0.2.99.dist-info/RECORD +0 -49
  39. tests/__init__.py +0 -1
  40. tests/fetchers/__init__.py +0 -1
  41. tests/fetchers/async/__init__.py +0 -0
  42. tests/fetchers/async/test_camoufox.py +0 -97
  43. tests/fetchers/async/test_httpx.py +0 -85
  44. tests/fetchers/async/test_playwright.py +0 -101
  45. tests/fetchers/sync/__init__.py +0 -0
  46. tests/fetchers/sync/test_camoufox.py +0 -70
  47. tests/fetchers/sync/test_httpx.py +0 -84
  48. tests/fetchers/sync/test_playwright.py +0 -89
  49. tests/fetchers/test_utils.py +0 -97
  50. tests/parser/__init__.py +0 -0
  51. tests/parser/test_automatch.py +0 -111
  52. tests/parser/test_general.py +0 -330
  53. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/entry_points.txt +0 -0
  54. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,644 @@
1
+ from time import time, sleep
2
+ from asyncio import sleep as asyncio_sleep, Lock
3
+
4
+ from playwright.sync_api import (
5
+ Response as SyncPlaywrightResponse,
6
+ sync_playwright,
7
+ BrowserContext,
8
+ Playwright,
9
+ Locator,
10
+ )
11
+ from playwright.async_api import (
12
+ async_playwright,
13
+ Response as AsyncPlaywrightResponse,
14
+ BrowserContext as AsyncBrowserContext,
15
+ Playwright as AsyncPlaywright,
16
+ Locator as AsyncLocator,
17
+ )
18
+ from rebrowser_playwright.sync_api import sync_playwright as sync_rebrowser_playwright
19
+ from rebrowser_playwright.async_api import (
20
+ async_playwright as async_rebrowser_playwright,
21
+ )
22
+
23
+ from scrapling.core.utils import log
24
+ from ._page import PageInfo, PagePool
25
+ from ._validators import validate, PlaywrightConfig
26
+ from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
27
+ from scrapling.core._types import (
28
+ Dict,
29
+ List,
30
+ Optional,
31
+ Callable,
32
+ SelectorWaitStates,
33
+ )
34
+ from scrapling.engines.toolbelt import (
35
+ Response,
36
+ ResponseFactory,
37
+ generate_convincing_referer,
38
+ intercept_route,
39
+ async_intercept_route,
40
+ )
41
+
42
+
43
class DynamicSession:
    """A Browser session manager with page pooling."""

    # __slots__ keeps per-instance memory low; every attribute assigned in
    # __init__ / __initiate_browser_options__ must be declared here.
    __slots__ = (
        "max_pages",
        "headless",
        "hide_canvas",
        "disable_webgl",
        "real_chrome",
        "stealth",
        "google_search",
        "proxy",
        "locale",
        "extra_headers",
        "useragent",
        "timeout",
        "cookies",
        "disable_resources",
        "network_idle",
        "wait_selector",
        "init_script",
        "wait_selector_state",
        "wait",
        "playwright",
        "browser",
        "context",
        "page_pool",
        "_closed",
        "selector_config",
        "page_action",
        "launch_options",
        "context_options",
        "cdp_url",
        "_headers_keys",
    )

    def __init__(
        self,
        # Name-mangled to `_DynamicSession__max_pages`, which makes this
        # argument effectively positional-only for callers.
        __max_pages: int = 1,
        headless: bool = True,
        google_search: bool = True,
        hide_canvas: bool = False,
        disable_webgl: bool = False,
        real_chrome: bool = False,
        stealth: bool = False,
        wait: int | float = 0,
        page_action: Optional[Callable] = None,
        proxy: Optional[str | Dict[str, str]] = None,
        locale: str = "en-US",
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        cdp_url: Optional[str] = None,
        timeout: int | float = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        init_script: Optional[str] = None,
        cookies: Optional[List[Dict]] = None,
        network_idle: bool = False,
        wait_selector_state: SelectorWaitStates = "attached",
        selector_config: Optional[Dict] = None,
    ):
        """A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.

        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        """

        # Validate and normalize every argument in one place through the
        # PlaywrightConfig model, then assign the validated values.
        params = {
            "max_pages": __max_pages,
            "headless": headless,
            "google_search": google_search,
            "hide_canvas": hide_canvas,
            "disable_webgl": disable_webgl,
            "real_chrome": real_chrome,
            "stealth": stealth,
            "wait": wait,
            "page_action": page_action,
            "proxy": proxy,
            "locale": locale,
            "extra_headers": extra_headers,
            "useragent": useragent,
            "timeout": timeout,
            "selector_config": selector_config,
            "disable_resources": disable_resources,
            "wait_selector": wait_selector,
            "init_script": init_script,
            "cookies": cookies,
            "network_idle": network_idle,
            "wait_selector_state": wait_selector_state,
            "cdp_url": cdp_url,
        }
        config = validate(params, PlaywrightConfig)

        self.max_pages = config.max_pages
        self.headless = config.headless
        self.hide_canvas = config.hide_canvas
        self.disable_webgl = config.disable_webgl
        self.real_chrome = config.real_chrome
        self.stealth = config.stealth
        self.google_search = config.google_search
        self.wait = config.wait
        self.proxy = config.proxy
        self.locale = config.locale
        self.extra_headers = config.extra_headers
        self.useragent = config.useragent
        self.timeout = config.timeout
        self.cookies = config.cookies
        self.disable_resources = config.disable_resources
        self.cdp_url = config.cdp_url
        self.network_idle = config.network_idle
        self.wait_selector = config.wait_selector
        self.init_script = config.init_script
        self.wait_selector_state = config.wait_selector_state

        # Browser/context handles are created lazily in __create__().
        self.playwright: Optional[Playwright] = None
        self.context: Optional[BrowserContext] = None
        self.page_pool = PagePool(self.max_pages)
        self._closed = False
        self.selector_config = config.selector_config
        self.page_action = config.page_action
        # Lower-cased header names, pre-computed so fetch() can cheaply check
        # whether the caller already supplied a referer.
        self._headers_keys = (
            set(map(str.lower, self.extra_headers.keys()))
            if self.extra_headers
            else set()
        )
        self.__initiate_browser_options__()

    def __initiate_browser_options__(self):
        """Build launch/context keyword dicts depending on whether we launch a
        browser ourselves (persistent context) or connect over CDP."""
        if not self.cdp_url:
            # `launch_options` is used with persistent context
            self.launch_options = dict(
                _launch_kwargs(
                    self.headless,
                    self.proxy,
                    self.locale,
                    tuple(self.extra_headers.items())
                    if self.extra_headers
                    else tuple(),
                    self.useragent,
                    self.real_chrome,
                    self.stealth,
                    self.hide_canvas,
                    self.disable_webgl,
                )
            )
            # The kwargs helpers return hashable tuples (so they can be cached);
            # convert them back to the dicts Playwright expects.
            self.launch_options["extra_http_headers"] = dict(
                self.launch_options["extra_http_headers"]
            )
            self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
            self.context_options = dict()
        else:
            # while `context_options` is left to be used when cdp mode is enabled
            self.launch_options = dict()
            self.context_options = dict(
                _context_kwargs(
                    self.proxy,
                    self.locale,
                    tuple(self.extra_headers.items())
                    if self.extra_headers
                    else tuple(),
                    self.useragent,
                    self.stealth,
                )
            )
            self.context_options["extra_http_headers"] = dict(
                self.context_options["extra_http_headers"]
            )
            self.context_options["proxy"] = dict(self.context_options["proxy"]) or None

    def __create__(self):
        """Create a browser for this instance and context."""
        sync_context = sync_rebrowser_playwright
        if not self.stealth or self.real_chrome:
            # Because rebrowser_playwright doesn't play well with real browsers
            sync_context = sync_playwright

        self.playwright = sync_context().start()

        if self.cdp_url:  # pragma: no cover
            self.context = self.playwright.chromium.connect_over_cdp(
                endpoint_url=self.cdp_url
            ).new_context(**self.context_options)
        else:
            # Empty user_data_dir -> Playwright creates a temporary profile.
            self.context = self.playwright.chromium.launch_persistent_context(
                user_data_dir="", **self.launch_options
            )

        if self.init_script:  # pragma: no cover
            self.context.add_init_script(path=self.init_script)

        if self.cookies:  # pragma: no cover
            self.context.add_cookies(self.cookies)

    def __enter__(self):
        self.__create__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):  # pragma: no cover
        """Close all resources"""
        if self._closed:
            return

        if self.context:
            self.context.close()
            self.context = None

        if self.playwright:
            self.playwright.stop()
            self.playwright = None

        self._closed = True

    def _get_or_create_page(self) -> PageInfo:  # pragma: no cover
        """Get an available page or create a new one"""
        # Try to get a ready page first
        page_info = self.page_pool.get_ready_page()
        if page_info:
            return page_info

        # Create a new page if under limit
        if self.page_pool.pages_count < self.max_pages:
            page = self.context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)
            if self.extra_headers:
                page.set_extra_http_headers(self.extra_headers)

            if self.disable_resources:
                page.route("**/*", intercept_route)

            if self.stealth:
                for script in _compiled_stealth_scripts():
                    page.add_init_script(script=script)

            return self.page_pool.add_page(page)

        # Wait for a page to become available
        max_wait = 30
        start_time = time()

        while time() - start_time < max_wait:
            page_info = self.page_pool.get_ready_page()
            if page_info:
                return page_info
            sleep(0.05)

        raise TimeoutError("No pages available within timeout period")

    def fetch(self, url: str) -> Response:
        """Opens up the browser and do your request based on your chosen options.

        :param url: The Target url.
        :return: A `Response` object.
        """
        if self._closed:  # pragma: no cover
            raise RuntimeError("Context manager has been closed")

        final_response = None
        # Only fake a Google-search referer when the caller didn't set one.
        referer = (
            generate_convincing_referer(url)
            if (self.google_search and "referer" not in self._headers_keys)
            else None
        )

        def handle_response(finished_response: SyncPlaywrightResponse):
            # Track the last navigation response of the main document, which
            # may differ from `first_response` after redirects/challenges.
            nonlocal final_response
            if (
                finished_response.request.resource_type == "document"
                and finished_response.request.is_navigation_request()
            ):
                final_response = finished_response

        page_info = self._get_or_create_page()
        page_info.mark_busy(url=url)
        # Attach the per-fetch listener outside the try block so the cleanup
        # in `finally` never tries to remove a handler that wasn't registered.
        page_info.page.on("response", handle_response)

        try:  # pragma: no cover
            # Navigate to URL and wait for a specified state
            first_response = page_info.page.goto(url, referer=referer)
            page_info.page.wait_for_load_state(state="domcontentloaded")

            if self.network_idle:
                page_info.page.wait_for_load_state("networkidle")

            if not first_response:
                raise RuntimeError(f"Failed to get response for {url}")

            if self.page_action is not None:
                try:
                    page_info.page = self.page_action(page_info.page)
                except Exception as e:  # pragma: no cover
                    log.error(f"Error executing page_action: {e}")

            if self.wait_selector:
                try:
                    waiter: Locator = page_info.page.locator(self.wait_selector)
                    waiter.first.wait_for(state=self.wait_selector_state)
                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
                    page_info.page.wait_for_load_state(state="load")
                    page_info.page.wait_for_load_state(state="domcontentloaded")
                    if self.network_idle:
                        page_info.page.wait_for_load_state("networkidle")
                except Exception as e:  # pragma: no cover
                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            page_info.page.wait_for_timeout(self.wait)

            # Create response object
            response = ResponseFactory.from_playwright_response(
                page_info.page, first_response, final_response, self.selector_config
            )

            # Mark the page as ready for next use
            page_info.mark_ready()

            return response

        except Exception:
            page_info.mark_error()
            # Bare raise preserves the original traceback.
            raise
        finally:
            # Pages are pooled and reused; without this, a new "response"
            # handler would pile up on the page on every fetch.
            page_info.page.remove_listener("response", handle_response)

    def get_pool_stats(self) -> Dict[str, int]:
        """Get statistics about the current page pool"""
        return {
            "total_pages": self.page_pool.pages_count,
            "ready_pages": self.page_pool.ready_count,
            "busy_pages": self.page_pool.busy_count,
            "max_pages": self.max_pages,
        }
396
+
397
+
398
class AsyncDynamicSession(DynamicSession):
    """An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory."""

    def __init__(
        self,
        max_pages: int = 1,
        headless: bool = True,
        google_search: bool = True,
        hide_canvas: bool = False,
        disable_webgl: bool = False,
        real_chrome: bool = False,
        stealth: bool = False,
        wait: int | float = 0,
        page_action: Optional[Callable] = None,
        proxy: Optional[str | Dict[str, str]] = None,
        locale: str = "en-US",
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        cdp_url: Optional[str] = None,
        timeout: int | float = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        init_script: Optional[str] = None,
        cookies: Optional[List[Dict]] = None,
        network_idle: bool = False,
        wait_selector_state: SelectorWaitStates = "attached",
        selector_config: Optional[Dict] = None,
    ):
        """A Browser session manager with page pooling

        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        """

        super().__init__(
            max_pages,
            headless,
            google_search,
            hide_canvas,
            disable_webgl,
            real_chrome,
            stealth,
            wait,
            page_action,
            proxy,
            locale,
            extra_headers,
            useragent,
            cdp_url,
            timeout,
            disable_resources,
            wait_selector,
            init_script,
            cookies,
            network_idle,
            wait_selector_state,
            selector_config,
        )

        self.playwright: Optional[AsyncPlaywright] = None
        self.context: Optional[AsyncBrowserContext] = None
        # Serializes page creation/retrieval across concurrent fetch() calls.
        # (This subclass declares no __slots__, so instances get a __dict__
        # and these extra attributes are fine.)
        self._lock = Lock()
        # NOTE(review): instance-level dunder assignment does not disable the
        # inherited sync context manager — `with obj:` looks dunders up on the
        # type, not the instance. Confirm whether sync usage should raise.
        self.__enter__ = None
        self.__exit__ = None

    async def __create__(self):
        """Create a browser for this instance and context."""
        async_context = async_rebrowser_playwright
        if not self.stealth or self.real_chrome:
            # Because rebrowser_playwright doesn't play well with real browsers
            async_context = async_playwright

        self.playwright: AsyncPlaywright = await async_context().start()

        if self.cdp_url:
            browser = await self.playwright.chromium.connect_over_cdp(
                endpoint_url=self.cdp_url
            )
            self.context: AsyncBrowserContext = await browser.new_context(
                **self.context_options
            )
        else:
            # Empty user_data_dir -> Playwright creates a temporary profile.
            self.context: AsyncBrowserContext = (
                await self.playwright.chromium.launch_persistent_context(
                    user_data_dir="", **self.launch_options
                )
            )

        if self.init_script:  # pragma: no cover
            await self.context.add_init_script(path=self.init_script)

        if self.cookies:
            await self.context.add_cookies(self.cookies)

    async def __aenter__(self):
        await self.__create__()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def close(self):
        """Close all resources"""
        if self._closed:  # pragma: no cover
            return

        if self.context:
            await self.context.close()
            self.context = None

        if self.playwright:
            await self.playwright.stop()
            self.playwright = None

        self._closed = True

    async def _get_or_create_page(self) -> PageInfo:
        """Get an available page or create a new one"""
        async with self._lock:
            # Try to get a ready page first
            page_info = self.page_pool.get_ready_page()
            if page_info:
                return page_info

            # Create a new page if under limit
            if self.page_pool.pages_count < self.max_pages:
                page = await self.context.new_page()
                page.set_default_navigation_timeout(self.timeout)
                page.set_default_timeout(self.timeout)
                if self.extra_headers:
                    await page.set_extra_http_headers(self.extra_headers)

                if self.disable_resources:
                    await page.route("**/*", async_intercept_route)

                if self.stealth:
                    for script in _compiled_stealth_scripts():
                        await page.add_init_script(script=script)

                return self.page_pool.add_page(page)

            # Wait for a page to become available
            max_wait = 30  # seconds
            start_time = time()

            while time() - start_time < max_wait:  # pragma: no cover
                page_info = self.page_pool.get_ready_page()
                if page_info:
                    return page_info
                await asyncio_sleep(0.05)

            raise TimeoutError("No pages available within timeout period")

    async def fetch(self, url: str) -> Response:
        """Opens up the browser and do your request based on your chosen options.

        :param url: The Target url.
        :return: A `Response` object.
        """
        if self._closed:  # pragma: no cover
            raise RuntimeError("Context manager has been closed")

        final_response = None
        # Only fake a Google-search referer when the caller didn't set one.
        referer = (
            generate_convincing_referer(url)
            if (self.google_search and "referer" not in self._headers_keys)
            else None
        )

        async def handle_response(finished_response: AsyncPlaywrightResponse):
            # Track the last navigation response of the main document, which
            # may differ from `first_response` after redirects/challenges.
            nonlocal final_response
            if (
                finished_response.request.resource_type == "document"
                and finished_response.request.is_navigation_request()
            ):
                final_response = finished_response

        page_info = await self._get_or_create_page()
        page_info.mark_busy(url=url)
        # Attach the per-fetch listener outside the try block so the cleanup
        # in `finally` never tries to remove a handler that wasn't registered.
        page_info.page.on("response", handle_response)

        try:
            # Navigate to URL and wait for a specified state
            first_response = await page_info.page.goto(url, referer=referer)
            await page_info.page.wait_for_load_state(state="domcontentloaded")

            if self.network_idle:
                await page_info.page.wait_for_load_state("networkidle")

            if not first_response:
                raise RuntimeError(f"Failed to get response for {url}")

            if self.page_action is not None:
                try:
                    page_info.page = await self.page_action(page_info.page)
                except Exception as e:
                    log.error(f"Error executing page_action: {e}")

            if self.wait_selector:
                try:
                    waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
                    await waiter.first.wait_for(state=self.wait_selector_state)
                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
                    await page_info.page.wait_for_load_state(state="load")
                    await page_info.page.wait_for_load_state(state="domcontentloaded")
                    if self.network_idle:
                        await page_info.page.wait_for_load_state("networkidle")
                except Exception as e:
                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            await page_info.page.wait_for_timeout(self.wait)

            # Create response object
            response = await ResponseFactory.from_async_playwright_response(
                page_info.page, first_response, final_response, self.selector_config
            )

            # Mark the page as ready for next use
            page_info.mark_ready()

            return response

        except Exception:  # pragma: no cover
            page_info.mark_error()
            # Bare raise preserves the original traceback.
            raise
        finally:
            # Pages are pooled and reused; without this, a new "response"
            # handler would pile up on the page on every fetch.
            # (remove_listener is synchronous in Playwright's async API too.)
            page_info.page.remove_listener("response", handle_response)