scrapling 0.2.98__py3-none-any.whl → 0.3__py3-none-any.whl

Files changed (54)
  1. scrapling/__init__.py +18 -31
  2. scrapling/cli.py +818 -20
  3. scrapling/core/_html_utils.py +348 -0
  4. scrapling/core/_types.py +34 -17
  5. scrapling/core/ai.py +611 -0
  6. scrapling/core/custom_types.py +183 -100
  7. scrapling/core/mixins.py +27 -19
  8. scrapling/core/shell.py +647 -0
  9. scrapling/core/{storage_adaptors.py → storage.py} +41 -33
  10. scrapling/core/translator.py +20 -26
  11. scrapling/core/utils.py +49 -54
  12. scrapling/engines/__init__.py +15 -6
  13. scrapling/engines/_browsers/__init__.py +2 -0
  14. scrapling/engines/_browsers/_camoufox.py +745 -0
  15. scrapling/engines/_browsers/_config_tools.py +130 -0
  16. scrapling/engines/_browsers/_controllers.py +630 -0
  17. scrapling/engines/_browsers/_page.py +93 -0
  18. scrapling/engines/_browsers/_validators.py +150 -0
  19. scrapling/engines/constants.py +101 -88
  20. scrapling/engines/static.py +667 -110
  21. scrapling/engines/toolbelt/__init__.py +20 -6
  22. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  23. scrapling/engines/toolbelt/convertor.py +254 -0
  24. scrapling/engines/toolbelt/custom.py +205 -186
  25. scrapling/engines/toolbelt/fingerprints.py +32 -46
  26. scrapling/engines/toolbelt/navigation.py +68 -39
  27. scrapling/fetchers.py +255 -260
  28. scrapling/parser.py +781 -449
  29. scrapling-0.3.dist-info/METADATA +409 -0
  30. scrapling-0.3.dist-info/RECORD +41 -0
  31. {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
  32. {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
  33. scrapling/defaults.py +0 -19
  34. scrapling/engines/camo.py +0 -299
  35. scrapling/engines/pw.py +0 -428
  36. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  37. scrapling-0.2.98.dist-info/METADATA +0 -867
  38. scrapling-0.2.98.dist-info/RECORD +0 -49
  39. tests/__init__.py +0 -1
  40. tests/fetchers/__init__.py +0 -1
  41. tests/fetchers/async/__init__.py +0 -0
  42. tests/fetchers/async/test_camoufox.py +0 -95
  43. tests/fetchers/async/test_httpx.py +0 -83
  44. tests/fetchers/async/test_playwright.py +0 -99
  45. tests/fetchers/sync/__init__.py +0 -0
  46. tests/fetchers/sync/test_camoufox.py +0 -68
  47. tests/fetchers/sync/test_httpx.py +0 -82
  48. tests/fetchers/sync/test_playwright.py +0 -87
  49. tests/fetchers/test_utils.py +0 -97
  50. tests/parser/__init__.py +0 -0
  51. tests/parser/test_automatch.py +0 -111
  52. tests/parser/test_general.py +0 -330
  53. {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
  54. {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info/licenses}/LICENSE +0 -0
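
The hunk below is the new `scrapling/engines/_browsers/_controllers.py` module. Going by the code in this diff, `DynamicSession` is a context manager whose `fetch()` returns a `Response`; here is a minimal usage sketch (the target URL and the `scroll_down` callback are illustrative, not part of the package):

    from scrapling.engines._browsers._controllers import DynamicSession

    def scroll_down(page):
        # `page_action` receives the Playwright page object and must return it.
        page.mouse.wheel(0, 600)
        return page

    with DynamicSession(headless=True, page_action=scroll_down) as session:
        response = session.fetch("https://example.com")  # illustrative URL
        print(session.get_pool_stats())  # e.g. {"total_pages": 1, "ready_pages": 1, "busy_pages": 0, "max_pages": 1}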
scrapling/engines/_browsers/_controllers.py (new file)
@@ -0,0 +1,630 @@
+ from time import time, sleep
+ from asyncio import sleep as asyncio_sleep, Lock
+
+ from playwright.sync_api import (
+     Response as SyncPlaywrightResponse,
+     sync_playwright,
+     BrowserContext,
+     Playwright,
+     Locator,
+ )
+ from playwright.async_api import (
+     async_playwright,
+     Response as AsyncPlaywrightResponse,
+     BrowserContext as AsyncBrowserContext,
+     Playwright as AsyncPlaywright,
+     Locator as AsyncLocator,
+ )
+ from rebrowser_playwright.sync_api import sync_playwright as sync_rebrowser_playwright
+ from rebrowser_playwright.async_api import (
+     async_playwright as async_rebrowser_playwright,
+ )
+
+ from scrapling.core.utils import log
+ from ._page import PageInfo, PagePool
+ from ._validators import validate, PlaywrightConfig
+ from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
+ from scrapling.core._types import (
+     Dict,
+     List,
+     Optional,
+     Callable,
+     SelectorWaitStates,
+ )
+ from scrapling.engines.toolbelt import (
+     Response,
+     ResponseFactory,
+     generate_convincing_referer,
+     intercept_route,
+     async_intercept_route,
+ )
+
+
+ class DynamicSession:
+     """A browser session manager with page pooling."""
+
+     __slots__ = (
+         "max_pages",
+         "headless",
+         "hide_canvas",
+         "disable_webgl",
+         "real_chrome",
+         "stealth",
+         "google_search",
+         "proxy",
+         "locale",
+         "extra_headers",
+         "useragent",
+         "timeout",
+         "cookies",
+         "disable_resources",
+         "network_idle",
+         "wait_selector",
+         "wait_selector_state",
+         "wait",
+         "playwright",
+         "browser",
+         "context",
+         "page_pool",
+         "_closed",
+         "selector_config",
+         "page_action",
+         "launch_options",
+         "context_options",
+         "cdp_url",
+         "_headers_keys",
+     )
+
+     def __init__(
+         self,
+         __max_pages: int = 1,
+         headless: bool = True,
+         google_search: bool = True,
+         hide_canvas: bool = False,
+         disable_webgl: bool = False,
+         real_chrome: bool = False,
+         stealth: bool = False,
+         wait: int | float = 0,
+         page_action: Optional[Callable] = None,
+         proxy: Optional[str | Dict[str, str]] = None,
+         locale: str = "en-US",
+         extra_headers: Optional[Dict[str, str]] = None,
+         useragent: Optional[str] = None,
+         cdp_url: Optional[str] = None,
+         timeout: int | float = 30000,
+         disable_resources: bool = False,
+         wait_selector: Optional[str] = None,
+         cookies: Optional[List[Dict]] = None,
+         network_idle: bool = False,
+         wait_selector_state: SelectorWaitStates = "attached",
+         selector_config: Optional[Dict] = None,
+     ):
+         """A browser session manager with page pooling; it uses a persistent browser context by default with a temporary user profile directory.
+
+         :param headless: Run the browser in headless/hidden (default) or headful/visible mode.
+         :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+             This can help save your proxy usage, but be careful with this option, as it makes some websites never finish loading.
+         :param useragent: Pass a useragent string to be used. Otherwise, the fetcher will generate a real useragent of the same browser and use it.
+         :param cookies: Set cookies for the next request.
+         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+         :param timeout: The timeout in milliseconds used in all operations and waits through the page. The default is 30,000.
+         :param wait: The time (in milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
+         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+         :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
+         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+         :param stealth: Enables stealth mode; check the documentation to see what stealth mode currently does.
+         :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the fetcher will launch an instance of your browser and use it.
+         :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+         :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
+         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
+         :param google_search: Enabled by default; Scrapling will set the referer header as if this request came from a Google search for this website's domain name.
+         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if both are used._
+         :param proxy: The proxy to be used with requests. It can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+         """
+
+         params = {
+             "max_pages": __max_pages,
+             "headless": headless,
+             "google_search": google_search,
+             "hide_canvas": hide_canvas,
+             "disable_webgl": disable_webgl,
+             "real_chrome": real_chrome,
+             "stealth": stealth,
+             "wait": wait,
+             "page_action": page_action,
+             "proxy": proxy,
+             "locale": locale,
+             "extra_headers": extra_headers,
+             "useragent": useragent,
+             "timeout": timeout,
+             "selector_config": selector_config,
+             "disable_resources": disable_resources,
+             "wait_selector": wait_selector,
+             "cookies": cookies,
+             "network_idle": network_idle,
+             "wait_selector_state": wait_selector_state,
+             "cdp_url": cdp_url,
+         }
+         config = validate(params, PlaywrightConfig)
+
+         self.max_pages = config.max_pages
+         self.headless = config.headless
+         self.hide_canvas = config.hide_canvas
+         self.disable_webgl = config.disable_webgl
+         self.real_chrome = config.real_chrome
+         self.stealth = config.stealth
+         self.google_search = config.google_search
+         self.wait = config.wait
+         self.proxy = config.proxy
+         self.locale = config.locale
+         self.extra_headers = config.extra_headers
+         self.useragent = config.useragent
+         self.timeout = config.timeout
+         self.cookies = config.cookies
+         self.disable_resources = config.disable_resources
+         self.cdp_url = config.cdp_url
+         self.network_idle = config.network_idle
+         self.wait_selector = config.wait_selector
+         self.wait_selector_state = config.wait_selector_state
+
+         self.playwright: Optional[Playwright] = None
+         self.context: Optional[BrowserContext] = None
+         self.page_pool = PagePool(self.max_pages)
+         self._closed = False
+         self.selector_config = config.selector_config
+         self.page_action = config.page_action
+         self._headers_keys = (
+             set(map(str.lower, self.extra_headers.keys()))
+             if self.extra_headers
+             else set()
+         )
+         self.__initiate_browser_options__()
+
+     def __initiate_browser_options__(self):
+         if not self.cdp_url:
+             # `launch_options` is used with persistent context
+             self.launch_options = dict(
+                 _launch_kwargs(
+                     self.headless,
+                     self.proxy,
+                     self.locale,
+                     tuple(self.extra_headers.items())
+                     if self.extra_headers
+                     else tuple(),
+                     self.useragent,
+                     self.real_chrome,
+                     self.stealth,
+                     self.hide_canvas,
+                     self.disable_webgl,
+                 )
+             )
+             self.launch_options["extra_http_headers"] = dict(
+                 self.launch_options["extra_http_headers"]
+             )
+             self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
+             self.context_options = dict()
+         else:
+             # while `context_options` is left to be used when CDP mode is enabled
+             self.launch_options = dict()
+             self.context_options = dict(
+                 _context_kwargs(
+                     self.proxy,
+                     self.locale,
+                     tuple(self.extra_headers.items())
+                     if self.extra_headers
+                     else tuple(),
+                     self.useragent,
+                     self.stealth,
+                 )
+             )
+             self.context_options["extra_http_headers"] = dict(
+                 self.context_options["extra_http_headers"]
+             )
+             self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
+
+     def __create__(self):
+         """Create a browser for this instance and context."""
+         sync_context = sync_rebrowser_playwright
+         if not self.stealth or self.real_chrome:
+             # Because rebrowser_playwright doesn't play well with real browsers
+             sync_context = sync_playwright
+
+         self.playwright = sync_context().start()
+
+         if self.cdp_url:  # pragma: no cover
+             self.context = self.playwright.chromium.connect_over_cdp(
+                 endpoint_url=self.cdp_url
+             ).new_context(**self.context_options)
+         else:
+             self.context = self.playwright.chromium.launch_persistent_context(
+                 user_data_dir="", **self.launch_options
+             )
+
+         if self.cookies:  # pragma: no cover
+             self.context.add_cookies(self.cookies)
+
+     def __enter__(self):
+         self.__create__()
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.close()
+
+     def close(self):  # pragma: no cover
+         """Close all resources"""
+         if self._closed:
+             return
+
+         if self.context:
+             self.context.close()
+             self.context = None
+
+         if self.playwright:
+             self.playwright.stop()
+             self.playwright = None
+
+         self._closed = True
+
+     def _get_or_create_page(self) -> PageInfo:  # pragma: no cover
+         """Get an available page or create a new one"""
+         # Try to get a ready page first
+         page_info = self.page_pool.get_ready_page()
+         if page_info:
+             return page_info
+
+         # Create a new page if under limit
+         if self.page_pool.pages_count < self.max_pages:
+             page = self.context.new_page()
+             page.set_default_navigation_timeout(self.timeout)
+             page.set_default_timeout(self.timeout)
+             if self.extra_headers:
+                 page.set_extra_http_headers(self.extra_headers)
+
+             if self.disable_resources:
+                 page.route("**/*", intercept_route)
+
+             if self.stealth:
+                 for script in _compiled_stealth_scripts():
+                     page.add_init_script(script=script)
+
+             return self.page_pool.add_page(page)
+
+         # Wait for a page to become available
+         max_wait = 30
+         start_time = time()
+
+         while time() - start_time < max_wait:
+             page_info = self.page_pool.get_ready_page()
+             if page_info:
+                 return page_info
+             sleep(0.05)
+
+         raise TimeoutError("No pages available within timeout period")
+
+     def fetch(self, url: str) -> Response:
+         """Opens up the browser and does your request based on your chosen options.
+
+         :param url: The target URL.
+         :return: A `Response` object.
+         """
+         if self._closed:  # pragma: no cover
+             raise RuntimeError("Context manager has been closed")
+
+         final_response = None
+         referer = (
+             generate_convincing_referer(url)
+             if (self.google_search and "referer" not in self._headers_keys)
+             else None
+         )
+
+         def handle_response(finished_response: SyncPlaywrightResponse):
+             nonlocal final_response
+             if (
+                 finished_response.request.resource_type == "document"
+                 and finished_response.request.is_navigation_request()
+             ):
+                 final_response = finished_response
+
+         page_info = self._get_or_create_page()
+         page_info.mark_busy(url=url)
+
+         try:  # pragma: no cover
+             # Navigate to URL and wait for a specified state
+             page_info.page.on("response", handle_response)
+             first_response = page_info.page.goto(url, referer=referer)
+             page_info.page.wait_for_load_state(state="domcontentloaded")
+
+             if self.network_idle:
+                 page_info.page.wait_for_load_state("networkidle")
+
+             if not first_response:
+                 raise RuntimeError(f"Failed to get response for {url}")
+
+             if self.page_action is not None:
+                 try:
+                     page_info.page = self.page_action(page_info.page)
+                 except Exception as e:  # pragma: no cover
+                     log.error(f"Error executing page_action: {e}")
+
+             if self.wait_selector:
+                 try:
+                     waiter: Locator = page_info.page.locator(self.wait_selector)
+                     waiter.first.wait_for(state=self.wait_selector_state)
+                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                     page_info.page.wait_for_load_state(state="load")
+                     page_info.page.wait_for_load_state(state="domcontentloaded")
+                     if self.network_idle:
+                         page_info.page.wait_for_load_state("networkidle")
+                 except Exception as e:  # pragma: no cover
+                     log.error(f"Error waiting for selector {self.wait_selector}: {e}")
+
+             page_info.page.wait_for_timeout(self.wait)
+
+             # Create response object
+             response = ResponseFactory.from_playwright_response(
+                 page_info.page, first_response, final_response, self.selector_config
+             )
+
+             # Mark the page as ready for next use
+             page_info.mark_ready()
+
+             return response
+
+         except Exception as e:
+             page_info.mark_error()
+             raise e
+
+     def get_pool_stats(self) -> Dict[str, int]:
+         """Get statistics about the current page pool"""
+         return {
+             "total_pages": self.page_pool.pages_count,
+             "ready_pages": self.page_pool.ready_count,
+             "busy_pages": self.page_pool.busy_count,
+             "max_pages": self.max_pages,
+         }
+
+
+ class AsyncDynamicSession(DynamicSession):
+     """An async browser session manager with page pooling; it uses a persistent browser context by default with a temporary user profile directory."""
+
+     def __init__(
+         self,
+         max_pages: int = 1,
+         headless: bool = True,
+         google_search: bool = True,
+         hide_canvas: bool = False,
+         disable_webgl: bool = False,
+         real_chrome: bool = False,
+         stealth: bool = False,
+         wait: int | float = 0,
+         page_action: Optional[Callable] = None,
+         proxy: Optional[str | Dict[str, str]] = None,
+         locale: str = "en-US",
+         extra_headers: Optional[Dict[str, str]] = None,
+         useragent: Optional[str] = None,
+         cdp_url: Optional[str] = None,
+         timeout: int | float = 30000,
+         disable_resources: bool = False,
+         wait_selector: Optional[str] = None,
+         cookies: Optional[List[Dict]] = None,
+         network_idle: bool = False,
+         wait_selector_state: SelectorWaitStates = "attached",
+         selector_config: Optional[Dict] = None,
+     ):
+         """A browser session manager with page pooling.
+
+         :param headless: Run the browser in headless/hidden (default) or headful/visible mode.
+         :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+             This can help save your proxy usage, but be careful with this option, as it makes some websites never finish loading.
+         :param useragent: Pass a useragent string to be used. Otherwise, the fetcher will generate a real useragent of the same browser and use it.
+         :param cookies: Set cookies for the next request.
+         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+         :param timeout: The timeout in milliseconds used in all operations and waits through the page. The default is 30,000.
+         :param wait: The time (in milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
+         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+         :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
+         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+         :param stealth: Enables stealth mode; check the documentation to see what stealth mode currently does.
+         :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the fetcher will launch an instance of your browser and use it.
+         :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+         :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
+         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
+         :param google_search: Enabled by default; Scrapling will set the referer header as if this request came from a Google search for this website's domain name.
+         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if both are used._
+         :param proxy: The proxy to be used with requests. It can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+         :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
+         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+         """
+
+         super().__init__(
+             max_pages,
+             headless,
+             google_search,
+             hide_canvas,
+             disable_webgl,
+             real_chrome,
+             stealth,
+             wait,
+             page_action,
+             proxy,
+             locale,
+             extra_headers,
+             useragent,
+             cdp_url,
+             timeout,
+             disable_resources,
+             wait_selector,
+             cookies,
+             network_idle,
+             wait_selector_state,
+             selector_config,
+         )
+
+         self.playwright: Optional[AsyncPlaywright] = None
+         self.context: Optional[AsyncBrowserContext] = None
+         self._lock = Lock()
+         self.__enter__ = None
+         self.__exit__ = None
+
+     async def __create__(self):
+         """Create a browser for this instance and context."""
+         async_context = async_rebrowser_playwright
+         if not self.stealth or self.real_chrome:
+             # Because rebrowser_playwright doesn't play well with real browsers
+             async_context = async_playwright
+
+         self.playwright: AsyncPlaywright = await async_context().start()
+
+         if self.cdp_url:
+             browser = await self.playwright.chromium.connect_over_cdp(
+                 endpoint_url=self.cdp_url
+             )
+             self.context: AsyncBrowserContext = await browser.new_context(
+                 **self.context_options
+             )
+         else:
+             self.context: AsyncBrowserContext = (
+                 await self.playwright.chromium.launch_persistent_context(
+                     user_data_dir="", **self.launch_options
+                 )
+             )
+
+         if self.cookies:
+             await self.context.add_cookies(self.cookies)
+
+     async def __aenter__(self):
+         await self.__create__()
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         await self.close()
+
+     async def close(self):
+         """Close all resources"""
+         if self._closed:  # pragma: no cover
+             return
+
+         if self.context:
+             await self.context.close()
+             self.context = None
+
+         if self.playwright:
+             await self.playwright.stop()
+             self.playwright = None
+
+         self._closed = True
+
+     async def _get_or_create_page(self) -> PageInfo:
+         """Get an available page or create a new one"""
+         async with self._lock:
+             # Try to get a ready page first
+             page_info = self.page_pool.get_ready_page()
+             if page_info:
+                 return page_info
+
+             # Create a new page if under limit
+             if self.page_pool.pages_count < self.max_pages:
+                 page = await self.context.new_page()
+                 page.set_default_navigation_timeout(self.timeout)
+                 page.set_default_timeout(self.timeout)
+                 if self.extra_headers:
+                     await page.set_extra_http_headers(self.extra_headers)
+
+                 if self.disable_resources:
+                     await page.route("**/*", async_intercept_route)
+
+                 if self.stealth:
+                     for script in _compiled_stealth_scripts():
+                         await page.add_init_script(script=script)
+
+                 return self.page_pool.add_page(page)
+
+             # Wait for a page to become available
+             max_wait = 30  # seconds
+             start_time = time()
+
+             while time() - start_time < max_wait:  # pragma: no cover
+                 page_info = self.page_pool.get_ready_page()
+                 if page_info:
+                     return page_info
+                 await asyncio_sleep(0.05)
+
+             raise TimeoutError("No pages available within timeout period")
+
+     async def fetch(self, url: str) -> Response:
+         """Opens up the browser and does your request based on your chosen options.
+
+         :param url: The target URL.
+         :return: A `Response` object.
+         """
+         if self._closed:  # pragma: no cover
+             raise RuntimeError("Context manager has been closed")
+
+         final_response = None
+         referer = (
+             generate_convincing_referer(url)
+             if (self.google_search and "referer" not in self._headers_keys)
+             else None
+         )
+
+         async def handle_response(finished_response: AsyncPlaywrightResponse):
+             nonlocal final_response
+             if (
+                 finished_response.request.resource_type == "document"
+                 and finished_response.request.is_navigation_request()
+             ):
+                 final_response = finished_response
+
+         page_info = await self._get_or_create_page()
+         page_info.mark_busy(url=url)
+
+         try:
+             # Navigate to URL and wait for a specified state
+             page_info.page.on("response", handle_response)
+             first_response = await page_info.page.goto(url, referer=referer)
+             await page_info.page.wait_for_load_state(state="domcontentloaded")
+
+             if self.network_idle:
+                 await page_info.page.wait_for_load_state("networkidle")
+
+             if not first_response:
+                 raise RuntimeError(f"Failed to get response for {url}")
+
+             if self.page_action is not None:
+                 try:
+                     page_info.page = await self.page_action(page_info.page)
+                 except Exception as e:
+                     log.error(f"Error executing page_action: {e}")
+
+             if self.wait_selector:
+                 try:
+                     waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
+                     await waiter.first.wait_for(state=self.wait_selector_state)
+                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
+                     await page_info.page.wait_for_load_state(state="load")
+                     await page_info.page.wait_for_load_state(state="domcontentloaded")
+                     if self.network_idle:
+                         await page_info.page.wait_for_load_state("networkidle")
+                 except Exception as e:
+                     log.error(f"Error waiting for selector {self.wait_selector}: {e}")
+
+             await page_info.page.wait_for_timeout(self.wait)
+
+             # Create response object
+             response = await ResponseFactory.from_async_playwright_response(
+                 page_info.page, first_response, final_response, self.selector_config
+             )
+
+             # Mark the page as ready for next use
+             page_info.mark_ready()
+
+             return response
+
+         except Exception as e:  # pragma: no cover
+             page_info.mark_error()
+             raise e
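
Mirroring the sync class, `AsyncDynamicSession` guards page acquisition with an asyncio `Lock` and is entered with `async with` (its inherited `__enter__`/`__exit__` are set to `None`). A minimal sketch under the same assumptions as the sync example above, with `max_pages=2` so two concurrent fetches can each hold a page from the pool:

    import asyncio
    from scrapling.engines._browsers._controllers import AsyncDynamicSession

    async def main():
        # With max_pages=2, a third concurrent fetch would wait up to ~30s for a free page.
        async with AsyncDynamicSession(max_pages=2) as session:
            responses = await asyncio.gather(
                session.fetch("https://example.com"),  # illustrative URLs
                session.fetch("https://example.org"),
            )
            print(session.get_pool_stats())  # {"total_pages": 2, ...} once both pages were created

    asyncio.run(main())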