scrapling 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. scrapling/__init__.py +1 -1
  2. scrapling/cli.py +38 -51
  3. scrapling/core/_html_utils.py +3 -9
  4. scrapling/core/ai.py +5 -13
  5. scrapling/core/custom_types.py +19 -61
  6. scrapling/core/mixins.py +6 -28
  7. scrapling/core/shell.py +49 -127
  8. scrapling/core/storage.py +2 -8
  9. scrapling/core/translator.py +8 -20
  10. scrapling/core/utils/__init__.py +10 -0
  11. scrapling/core/utils/_shell.py +48 -0
  12. scrapling/core/{utils.py → utils/_utils.py} +5 -21
  13. scrapling/engines/__init__.py +0 -16
  14. scrapling/engines/_browsers/_base.py +297 -0
  15. scrapling/engines/_browsers/_camoufox.py +219 -296
  16. scrapling/engines/_browsers/_config_tools.py +2 -1
  17. scrapling/engines/_browsers/_controllers.py +201 -281
  18. scrapling/engines/_browsers/_page.py +37 -15
  19. scrapling/engines/_browsers/_validators.py +9 -15
  20. scrapling/engines/constants.py +3 -6
  21. scrapling/engines/static.py +25 -75
  22. scrapling/engines/toolbelt/__init__.py +1 -20
  23. scrapling/engines/toolbelt/convertor.py +95 -86
  24. scrapling/engines/toolbelt/custom.py +7 -99
  25. scrapling/engines/toolbelt/fingerprints.py +1 -3
  26. scrapling/engines/toolbelt/navigation.py +4 -58
  27. scrapling/fetchers.py +29 -24
  28. scrapling/parser.py +45 -122
  29. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/METADATA +54 -46
  30. scrapling-0.3.3.dist-info/RECORD +44 -0
  31. scrapling-0.3.1.dist-info/RECORD +0 -41
  32. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/WHEEL +0 -0
  33. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/entry_points.txt +0 -0
  34. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/licenses/LICENSE +0 -0
  35. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/top_level.txt +0 -0
scrapling/engines/_browsers/_controllers.py +201 -281

@@ -1,10 +1,6 @@
-from time import time, sleep
-from asyncio import sleep as asyncio_sleep, Lock
-
 from playwright.sync_api import (
     Response as SyncPlaywrightResponse,
     sync_playwright,
-    BrowserContext,
     Playwright,
     Locator,
 )
@@ -21,9 +17,8 @@ from rebrowser_playwright.async_api import (
 )

 from scrapling.core.utils import log
-from ._page import PageInfo, PagePool
+from ._base import SyncSession, AsyncSession, DynamicSessionMixin
 from ._validators import validate, PlaywrightConfig
-from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
 from scrapling.core._types import (
     Dict,
     List,
@@ -31,16 +26,16 @@ from scrapling.core._types import (
     Callable,
     SelectorWaitStates,
 )
-from scrapling.engines.toolbelt import (
+from scrapling.engines.toolbelt.convertor import (
     Response,
     ResponseFactory,
-    generate_convincing_referer,
-    intercept_route,
-    async_intercept_route,
 )
+from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
+
+_UNSET = object()


-class DynamicSession:
+class DynamicSession(DynamicSessionMixin, SyncSession):
     """A Browser session manager with page pooling."""

     __slots__ = (
@@ -59,6 +54,7 @@ class DynamicSession:
         "cookies",
         "disable_resources",
         "network_idle",
+        "load_dom",
         "wait_selector",
         "init_script",
         "wait_selector_state",
@@ -98,6 +94,7 @@ class DynamicSession:
         init_script: Optional[str] = None,
         cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
+        load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
         selector_config: Optional[Dict] = None,
     ):
@@ -112,7 +109,7 @@ class DynamicSession:
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
         :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
-        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
         :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
@@ -121,114 +118,39 @@ class DynamicSession:
         :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
         :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
         :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         """
-
-        params = {
-            "max_pages": __max_pages,
-            "headless": headless,
-            "google_search": google_search,
-            "hide_canvas": hide_canvas,
-            "disable_webgl": disable_webgl,
-            "real_chrome": real_chrome,
-            "stealth": stealth,
-            "wait": wait,
-            "page_action": page_action,
-            "proxy": proxy,
-            "locale": locale,
-            "extra_headers": extra_headers,
-            "useragent": useragent,
-            "timeout": timeout,
-            "selector_config": selector_config,
-            "disable_resources": disable_resources,
-            "wait_selector": wait_selector,
-            "init_script": init_script,
-            "cookies": cookies,
-            "network_idle": network_idle,
-            "wait_selector_state": wait_selector_state,
-            "cdp_url": cdp_url,
-        }
-        config = validate(params, PlaywrightConfig)
-
-        self.max_pages = config.max_pages
-        self.headless = config.headless
-        self.hide_canvas = config.hide_canvas
-        self.disable_webgl = config.disable_webgl
-        self.real_chrome = config.real_chrome
-        self.stealth = config.stealth
-        self.google_search = config.google_search
-        self.wait = config.wait
-        self.proxy = config.proxy
-        self.locale = config.locale
-        self.extra_headers = config.extra_headers
-        self.useragent = config.useragent
-        self.timeout = config.timeout
-        self.cookies = config.cookies
-        self.disable_resources = config.disable_resources
-        self.cdp_url = config.cdp_url
-        self.network_idle = config.network_idle
-        self.wait_selector = config.wait_selector
-        self.init_script = config.init_script
-        self.wait_selector_state = config.wait_selector_state
-
-        self.playwright: Optional[Playwright] = None
-        self.context: Optional[BrowserContext] = None
-        self.page_pool = PagePool(self.max_pages)
-        self._closed = False
-        self.selector_config = config.selector_config
-        self.page_action = config.page_action
-        self._headers_keys = (
-            set(map(str.lower, self.extra_headers.keys()))
-            if self.extra_headers
-            else set()
+        self.__validate__(
+            wait=wait,
+            proxy=proxy,
+            locale=locale,
+            timeout=timeout,
+            stealth=stealth,
+            cdp_url=cdp_url,
+            cookies=cookies,
+            load_dom=load_dom,
+            headless=headless,
+            useragent=useragent,
+            max_pages=__max_pages,
+            real_chrome=real_chrome,
+            page_action=page_action,
+            hide_canvas=hide_canvas,
+            init_script=init_script,
+            network_idle=network_idle,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            wait_selector=wait_selector,
+            disable_webgl=disable_webgl,
+            selector_config=selector_config,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
         )
-        self.__initiate_browser_options__()
-
-    def __initiate_browser_options__(self):
-        if not self.cdp_url:
-            # `launch_options` is used with persistent context
-            self.launch_options = dict(
-                _launch_kwargs(
-                    self.headless,
-                    self.proxy,
-                    self.locale,
-                    tuple(self.extra_headers.items())
-                    if self.extra_headers
-                    else tuple(),
-                    self.useragent,
-                    self.real_chrome,
-                    self.stealth,
-                    self.hide_canvas,
-                    self.disable_webgl,
-                )
-            )
-            self.launch_options["extra_http_headers"] = dict(
-                self.launch_options["extra_http_headers"]
-            )
-            self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
-            self.context_options = dict()
-        else:
-            # while `context_options` is left to be used when cdp mode is enabled
-            self.launch_options = dict()
-            self.context_options = dict(
-                _context_kwargs(
-                    self.proxy,
-                    self.locale,
-                    tuple(self.extra_headers.items())
-                    if self.extra_headers
-                    else tuple(),
-                    self.useragent,
-                    self.stealth,
-                )
-            )
-            self.context_options["extra_http_headers"] = dict(
-                self.context_options["extra_http_headers"]
-            )
-            self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
+        super().__init__(max_pages=self.max_pages)

     def __create__(self):
         """Create a browser for this instance and context."""
@@ -237,16 +159,14 @@ class DynamicSession:
             # Because rebrowser_playwright doesn't play well with real browsers
             sync_context = sync_playwright

-        self.playwright = sync_context().start()
+        self.playwright: Playwright = sync_context().start()

         if self.cdp_url:  # pragma: no cover
-            self.context = self.playwright.chromium.connect_over_cdp(
-                endpoint_url=self.cdp_url
-            ).new_context(**self.context_options)
-        else:
-            self.context = self.playwright.chromium.launch_persistent_context(
-                user_data_dir="", **self.launch_options
+            self.context = self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url).new_context(
+                **self.context_options
             )
+        else:
+            self.context = self.playwright.chromium.launch_persistent_context(user_data_dir="", **self.launch_options)

         if self.init_script:  # pragma: no cover
             self.context.add_init_script(path=self.init_script)
@@ -276,56 +196,63 @@ class DynamicSession:

         self._closed = True

-    def _get_or_create_page(self) -> PageInfo:  # pragma: no cover
-        """Get an available page or create a new one"""
-        # Try to get a ready page first
-        page_info = self.page_pool.get_ready_page()
-        if page_info:
-            return page_info
-
-        # Create a new page if under limit
-        if self.page_pool.pages_count < self.max_pages:
-            page = self.context.new_page()
-            page.set_default_navigation_timeout(self.timeout)
-            page.set_default_timeout(self.timeout)
-            if self.extra_headers:
-                page.set_extra_http_headers(self.extra_headers)
-
-            if self.disable_resources:
-                page.route("**/*", intercept_route)
-
-            if self.stealth:
-                for script in _compiled_stealth_scripts():
-                    page.add_init_script(script=script)
-
-            return self.page_pool.add_page(page)
-
-        # Wait for a page to become available
-        max_wait = 30
-        start_time = time()
-
-        while time() - start_time < max_wait:
-            page_info = self.page_pool.get_ready_page()
-            if page_info:
-                return page_info
-            sleep(0.05)
-
-        raise TimeoutError("No pages available within timeout period")
-
-    def fetch(self, url: str) -> Response:
+    def fetch(
+        self,
+        url: str,
+        google_search: bool = _UNSET,
+        timeout: int | float = _UNSET,
+        wait: int | float = _UNSET,
+        page_action: Optional[Callable] = _UNSET,
+        extra_headers: Optional[Dict[str, str]] = _UNSET,
+        disable_resources: bool = _UNSET,
+        wait_selector: Optional[str] = _UNSET,
+        wait_selector_state: SelectorWaitStates = _UNSET,
+        network_idle: bool = _UNSET,
+        load_dom: bool = _UNSET,
+        selector_config: Optional[Dict] = _UNSET,
+    ) -> Response:
         """Opens up the browser and do your request based on your chosen options.

         :param url: The Target url.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
+        # Validate all resolved parameters
+        params = validate(
+            dict(
+                google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
+                timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
+                wait=self._get_with_precedence(wait, self.wait, _UNSET),
+                page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
+                extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
+                disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
+                wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
+                wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
+                network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
+                load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
+                selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
+            ),
+            PlaywrightConfig,
+        )
+
         if self._closed:  # pragma: no cover
             raise RuntimeError("Context manager has been closed")

         final_response = None
         referer = (
-            generate_convincing_referer(url)
-            if (self.google_search and "referer" not in self._headers_keys)
-            else None
+            generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
         )

         def handle_response(finished_response: SyncPlaywrightResponse):
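The new module-level `_UNSET` sentinel is what makes these per-request overrides work: every `fetch()` keyword defaults to `_UNSET`, and `_get_with_precedence` (inherited from the new `_base.py`, so its body is an assumption here) falls back to the session-level value only when the caller did not pass that argument. A sketch of the presumed helper:

```python
# Presumed implementation; the real one lives in the new _base.py, not shown in this diff.
_UNSET = object()  # unique sentinel: unlike None, it can never collide with a real argument value

def _get_with_precedence(value, default, sentinel):
    """Per-call argument wins; otherwise fall back to the session default."""
    return default if value is sentinel else value
```

Defaulting to `None` instead would make it impossible to distinguish "not passed" from an explicit `None`, which is a legitimate value for arguments like `page_action` or `extra_headers`.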
@@ -336,48 +263,50 @@ class DynamicSession:
             ):
                 final_response = finished_response

-        page_info = self._get_or_create_page()
+        page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)

         try:  # pragma: no cover
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
             first_response = page_info.page.goto(url, referer=referer)
-            page_info.page.wait_for_load_state(state="domcontentloaded")
+            if params.load_dom:
+                page_info.page.wait_for_load_state(state="domcontentloaded")

-            if self.network_idle:
+            if params.network_idle:
                 page_info.page.wait_for_load_state("networkidle")

             if not first_response:
                 raise RuntimeError(f"Failed to get response for {url}")

-            if self.page_action is not None:
+            if params.page_action:
                 try:
-                    page_info.page = self.page_action(page_info.page)
+                    _ = params.page_action(page_info.page)
                 except Exception as e:  # pragma: no cover
                     log.error(f"Error executing page_action: {e}")

-            if self.wait_selector:
+            if params.wait_selector:
                 try:
-                    waiter: Locator = page_info.page.locator(self.wait_selector)
-                    waiter.first.wait_for(state=self.wait_selector_state)
+                    waiter: Locator = page_info.page.locator(params.wait_selector)
+                    waiter.first.wait_for(state=params.wait_selector_state)
                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
                     page_info.page.wait_for_load_state(state="load")
-                    page_info.page.wait_for_load_state(state="domcontentloaded")
-                    if self.network_idle:
+                    if params.load_dom:
+                        page_info.page.wait_for_load_state(state="domcontentloaded")
+                    if params.network_idle:
                         page_info.page.wait_for_load_state("networkidle")
                 except Exception as e:  # pragma: no cover
-                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")
+                    log.error(f"Error waiting for selector {params.wait_selector}: {e}")

-            page_info.page.wait_for_timeout(self.wait)
+            page_info.page.wait_for_timeout(params.wait)

             # Create response object
             response = ResponseFactory.from_playwright_response(
-                page_info.page, first_response, final_response, self.selector_config
+                page_info.page, first_response, final_response, params.selector_config
             )

-            # Mark the page as ready for next use
-            page_info.mark_ready()
+            # Mark the page as finished for next use
+            page_info.mark_finished()

             return response

@@ -385,17 +314,8 @@ class DynamicSession:
             page_info.mark_error()
             raise e

-    def get_pool_stats(self) -> Dict[str, int]:
-        """Get statistics about the current page pool"""
-        return {
-            "total_pages": self.page_pool.pages_count,
-            "ready_pages": self.page_pool.ready_count,
-            "busy_pages": self.page_pool.busy_count,
-            "max_pages": self.max_pages,
-        }
-

-class AsyncDynamicSession(DynamicSession):
+class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
     """An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory."""

     def __init__(
@@ -420,6 +340,7 @@ class AsyncDynamicSession(DynamicSession):
         init_script: Optional[str] = None,
         cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
+        load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
         selector_config: Optional[Dict] = None,
     ):
@@ -432,9 +353,10 @@ class AsyncDynamicSession(DynamicSession):
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param cookies: Set cookies for the next request.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
         :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
-        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
         :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
@@ -451,36 +373,32 @@ class AsyncDynamicSession(DynamicSession):
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         """

-        super().__init__(
-            max_pages,
-            headless,
-            google_search,
-            hide_canvas,
-            disable_webgl,
-            real_chrome,
-            stealth,
-            wait,
-            page_action,
-            proxy,
-            locale,
-            extra_headers,
-            useragent,
-            cdp_url,
-            timeout,
-            disable_resources,
-            wait_selector,
-            init_script,
-            cookies,
-            network_idle,
-            wait_selector_state,
-            selector_config,
+        self.__validate__(
+            wait=wait,
+            proxy=proxy,
+            locale=locale,
+            timeout=timeout,
+            stealth=stealth,
+            cdp_url=cdp_url,
+            cookies=cookies,
+            load_dom=load_dom,
+            headless=headless,
+            useragent=useragent,
+            max_pages=max_pages,
+            real_chrome=real_chrome,
+            page_action=page_action,
+            hide_canvas=hide_canvas,
+            init_script=init_script,
+            network_idle=network_idle,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            wait_selector=wait_selector,
+            disable_webgl=disable_webgl,
+            selector_config=selector_config,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
         )
-
-        self.playwright: Optional[AsyncPlaywright] = None
-        self.context: Optional[AsyncBrowserContext] = None
-        self._lock = Lock()
-        self.__enter__ = None
-        self.__exit__ = None
+        super().__init__(max_pages=self.max_pages)

     async def __create__(self):
         """Create a browser for this instance and context."""
@@ -492,17 +410,11 @@ class AsyncDynamicSession(DynamicSession):
         self.playwright: AsyncPlaywright = await async_context().start()

         if self.cdp_url:
-            browser = await self.playwright.chromium.connect_over_cdp(
-                endpoint_url=self.cdp_url
-            )
-            self.context: AsyncBrowserContext = await browser.new_context(
-                **self.context_options
-            )
+            browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url)
+            self.context: AsyncBrowserContext = await browser.new_context(**self.context_options)
         else:
-            self.context: AsyncBrowserContext = (
-                await self.playwright.chromium.launch_persistent_context(
-                    user_data_dir="", **self.launch_options
-                )
+            self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
+                user_data_dir="", **self.launch_options
             )

         if self.init_script:  # pragma: no cover
@@ -533,57 +445,63 @@ class AsyncDynamicSession(DynamicSession):

         self._closed = True

-    async def _get_or_create_page(self) -> PageInfo:
-        """Get an available page or create a new one"""
-        async with self._lock:
-            # Try to get a ready page first
-            page_info = self.page_pool.get_ready_page()
-            if page_info:
-                return page_info
-
-            # Create a new page if under limit
-            if self.page_pool.pages_count < self.max_pages:
-                page = await self.context.new_page()
-                page.set_default_navigation_timeout(self.timeout)
-                page.set_default_timeout(self.timeout)
-                if self.extra_headers:
-                    await page.set_extra_http_headers(self.extra_headers)
-
-                if self.disable_resources:
-                    await page.route("**/*", async_intercept_route)
-
-                if self.stealth:
-                    for script in _compiled_stealth_scripts():
-                        await page.add_init_script(script=script)
-
-                return self.page_pool.add_page(page)
-
-            # Wait for a page to become available
-            max_wait = 30  # seconds
-            start_time = time()
-
-            while time() - start_time < max_wait:  # pragma: no cover
-                page_info = self.page_pool.get_ready_page()
-                if page_info:
-                    return page_info
-                await asyncio_sleep(0.05)
-
-            raise TimeoutError("No pages available within timeout period")
-
-    async def fetch(self, url: str) -> Response:
+    async def fetch(
+        self,
+        url: str,
+        google_search: bool = _UNSET,
+        timeout: int | float = _UNSET,
+        wait: int | float = _UNSET,
+        page_action: Optional[Callable] = _UNSET,
+        extra_headers: Optional[Dict[str, str]] = _UNSET,
+        disable_resources: bool = _UNSET,
+        wait_selector: Optional[str] = _UNSET,
+        wait_selector_state: SelectorWaitStates = _UNSET,
+        network_idle: bool = _UNSET,
+        load_dom: bool = _UNSET,
+        selector_config: Optional[Dict] = _UNSET,
+    ) -> Response:
         """Opens up the browser and do your request based on your chosen options.

         :param url: The Target url.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
+        # Validate all resolved parameters
+        params = validate(
+            dict(
+                google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
+                timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
+                wait=self._get_with_precedence(wait, self.wait, _UNSET),
+                page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
+                extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
+                disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
+                wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
+                wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
+                network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
+                load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
+                selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
+            ),
+            PlaywrightConfig,
+        )
+
         if self._closed:  # pragma: no cover
             raise RuntimeError("Context manager has been closed")

         final_response = None
         referer = (
-            generate_convincing_referer(url)
-            if (self.google_search and "referer" not in self._headers_keys)
-            else None
+            generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
         )

         async def handle_response(finished_response: AsyncPlaywrightResponse):
@@ -594,48 +512,50 @@ class AsyncDynamicSession(DynamicSession):
             ):
                 final_response = finished_response

-        page_info = await self._get_or_create_page()
+        page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)

         try:
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
             first_response = await page_info.page.goto(url, referer=referer)
-            await page_info.page.wait_for_load_state(state="domcontentloaded")
+            if self.load_dom:
+                await page_info.page.wait_for_load_state(state="domcontentloaded")

-            if self.network_idle:
+            if params.network_idle:
                 await page_info.page.wait_for_load_state("networkidle")

             if not first_response:
                 raise RuntimeError(f"Failed to get response for {url}")

-            if self.page_action is not None:
+            if params.page_action:
                 try:
-                    page_info.page = await self.page_action(page_info.page)
+                    _ = await params.page_action(page_info.page)
                 except Exception as e:
                     log.error(f"Error executing page_action: {e}")

-            if self.wait_selector:
+            if params.wait_selector:
                 try:
-                    waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
-                    await waiter.first.wait_for(state=self.wait_selector_state)
+                    waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
+                    await waiter.first.wait_for(state=params.wait_selector_state)
                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
                     await page_info.page.wait_for_load_state(state="load")
-                    await page_info.page.wait_for_load_state(state="domcontentloaded")
-                    if self.network_idle:
+                    if self.load_dom:
+                        await page_info.page.wait_for_load_state(state="domcontentloaded")
+                    if params.network_idle:
                         await page_info.page.wait_for_load_state("networkidle")
                 except Exception as e:
-                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")
+                    log.error(f"Error waiting for selector {params.wait_selector}: {e}")

-            await page_info.page.wait_for_timeout(self.wait)
+            await page_info.page.wait_for_timeout(params.wait)

             # Create response object
             response = await ResponseFactory.from_async_playwright_response(
-                page_info.page, first_response, final_response, self.selector_config
+                page_info.page, first_response, final_response, params.selector_config
             )

-            # Mark the page as ready for next use
-            page_info.mark_ready()
+            # Mark the page as finished for next use
+            page_info.mark_finished()

             return response
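Taken together, the signature changes mean a session object can carry defaults while individual requests override them. A hypothetical usage sketch (the import path and context-manager support are assumptions based on the rest of the release, not shown in this hunk):

```python
from scrapling.fetchers import DynamicSession  # assumed re-export path

# Session-level defaults apply to every fetch from this browser context
with DynamicSession(headless=True, network_idle=True) as session:
    home = session.fetch("https://example.com")  # uses the session defaults

    # Per-request overrides: only these two settings differ for this call,
    # resolved through the _UNSET sentinel precedence shown above
    product = session.fetch(
        "https://example.com/product/1",
        network_idle=False,
        wait_selector="div.price",
    )
```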