scrapling 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. scrapling/__init__.py +1 -1
  2. scrapling/cli.py +38 -51
  3. scrapling/core/_html_utils.py +3 -9
  4. scrapling/core/ai.py +5 -13
  5. scrapling/core/custom_types.py +19 -61
  6. scrapling/core/mixins.py +6 -28
  7. scrapling/core/shell.py +49 -127
  8. scrapling/core/storage.py +2 -8
  9. scrapling/core/translator.py +8 -20
  10. scrapling/core/utils/__init__.py +10 -0
  11. scrapling/core/utils/_shell.py +48 -0
  12. scrapling/core/{utils.py → utils/_utils.py} +5 -21
  13. scrapling/engines/__init__.py +0 -16
  14. scrapling/engines/_browsers/_base.py +297 -0
  15. scrapling/engines/_browsers/_camoufox.py +227 -296
  16. scrapling/engines/_browsers/_config_tools.py +2 -1
  17. scrapling/engines/_browsers/_controllers.py +209 -281
  18. scrapling/engines/_browsers/_page.py +37 -15
  19. scrapling/engines/_browsers/_validators.py +9 -15
  20. scrapling/engines/constants.py +3 -6
  21. scrapling/engines/static.py +25 -75
  22. scrapling/engines/toolbelt/__init__.py +1 -20
  23. scrapling/engines/toolbelt/convertor.py +95 -86
  24. scrapling/engines/toolbelt/custom.py +7 -99
  25. scrapling/engines/toolbelt/fingerprints.py +1 -3
  26. scrapling/engines/toolbelt/navigation.py +4 -58
  27. scrapling/fetchers.py +29 -24
  28. scrapling/parser.py +45 -122
  29. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/METADATA +54 -46
  30. scrapling-0.3.2.dist-info/RECORD +44 -0
  31. scrapling-0.3.1.dist-info/RECORD +0 -41
  32. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/WHEEL +0 -0
  33. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/entry_points.txt +0 -0
  34. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/licenses/LICENSE +0 -0
  35. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,6 @@
1
- from time import time, sleep
2
- from asyncio import sleep as asyncio_sleep, Lock
3
-
4
1
  from playwright.sync_api import (
5
2
  Response as SyncPlaywrightResponse,
6
3
  sync_playwright,
7
- BrowserContext,
8
4
  Playwright,
9
5
  Locator,
10
6
  )
@@ -21,9 +17,8 @@ from rebrowser_playwright.async_api import (
21
17
  )
22
18
 
23
19
  from scrapling.core.utils import log
24
- from ._page import PageInfo, PagePool
20
+ from ._base import SyncSession, AsyncSession, DynamicSessionMixin
25
21
  from ._validators import validate, PlaywrightConfig
26
- from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
27
22
  from scrapling.core._types import (
28
23
  Dict,
29
24
  List,
@@ -31,16 +26,16 @@ from scrapling.core._types import (
31
26
  Callable,
32
27
  SelectorWaitStates,
33
28
  )
34
- from scrapling.engines.toolbelt import (
29
+ from scrapling.engines.toolbelt.convertor import (
35
30
  Response,
36
31
  ResponseFactory,
37
- generate_convincing_referer,
38
- intercept_route,
39
- async_intercept_route,
40
32
  )
33
+ from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
34
+
35
+ _UNSET = object()
41
36
 
42
37
 
43
- class DynamicSession:
38
+ class DynamicSession(DynamicSessionMixin, SyncSession):
44
39
  """A Browser session manager with page pooling."""
45
40
 
46
41
  __slots__ = (
@@ -59,6 +54,7 @@ class DynamicSession:
59
54
  "cookies",
60
55
  "disable_resources",
61
56
  "network_idle",
57
+ "load_dom",
62
58
  "wait_selector",
63
59
  "init_script",
64
60
  "wait_selector_state",
@@ -98,6 +94,7 @@ class DynamicSession:
98
94
  init_script: Optional[str] = None,
99
95
  cookies: Optional[List[Dict]] = None,
100
96
  network_idle: bool = False,
97
+ load_dom: bool = True,
101
98
  wait_selector_state: SelectorWaitStates = "attached",
102
99
  selector_config: Optional[Dict] = None,
103
100
  ):
@@ -112,7 +109,7 @@ class DynamicSession:
112
109
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
113
110
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
114
111
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
115
- :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
112
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
116
113
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
117
114
  :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
118
115
  :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
@@ -121,114 +118,39 @@ class DynamicSession:
121
118
  :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
122
119
  :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
123
120
  :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
121
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
124
122
  :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
125
123
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
126
124
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
127
125
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
128
126
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
129
127
  """
130
-
131
- params = {
132
- "max_pages": __max_pages,
133
- "headless": headless,
134
- "google_search": google_search,
135
- "hide_canvas": hide_canvas,
136
- "disable_webgl": disable_webgl,
137
- "real_chrome": real_chrome,
138
- "stealth": stealth,
139
- "wait": wait,
140
- "page_action": page_action,
141
- "proxy": proxy,
142
- "locale": locale,
143
- "extra_headers": extra_headers,
144
- "useragent": useragent,
145
- "timeout": timeout,
146
- "selector_config": selector_config,
147
- "disable_resources": disable_resources,
148
- "wait_selector": wait_selector,
149
- "init_script": init_script,
150
- "cookies": cookies,
151
- "network_idle": network_idle,
152
- "wait_selector_state": wait_selector_state,
153
- "cdp_url": cdp_url,
154
- }
155
- config = validate(params, PlaywrightConfig)
156
-
157
- self.max_pages = config.max_pages
158
- self.headless = config.headless
159
- self.hide_canvas = config.hide_canvas
160
- self.disable_webgl = config.disable_webgl
161
- self.real_chrome = config.real_chrome
162
- self.stealth = config.stealth
163
- self.google_search = config.google_search
164
- self.wait = config.wait
165
- self.proxy = config.proxy
166
- self.locale = config.locale
167
- self.extra_headers = config.extra_headers
168
- self.useragent = config.useragent
169
- self.timeout = config.timeout
170
- self.cookies = config.cookies
171
- self.disable_resources = config.disable_resources
172
- self.cdp_url = config.cdp_url
173
- self.network_idle = config.network_idle
174
- self.wait_selector = config.wait_selector
175
- self.init_script = config.init_script
176
- self.wait_selector_state = config.wait_selector_state
177
-
178
- self.playwright: Optional[Playwright] = None
179
- self.context: Optional[BrowserContext] = None
180
- self.page_pool = PagePool(self.max_pages)
181
- self._closed = False
182
- self.selector_config = config.selector_config
183
- self.page_action = config.page_action
184
- self._headers_keys = (
185
- set(map(str.lower, self.extra_headers.keys()))
186
- if self.extra_headers
187
- else set()
128
+ self.__validate__(
129
+ wait=wait,
130
+ proxy=proxy,
131
+ locale=locale,
132
+ timeout=timeout,
133
+ stealth=stealth,
134
+ cdp_url=cdp_url,
135
+ cookies=cookies,
136
+ load_dom=load_dom,
137
+ headless=headless,
138
+ useragent=useragent,
139
+ max_pages=__max_pages,
140
+ real_chrome=real_chrome,
141
+ page_action=page_action,
142
+ hide_canvas=hide_canvas,
143
+ init_script=init_script,
144
+ network_idle=network_idle,
145
+ google_search=google_search,
146
+ extra_headers=extra_headers,
147
+ wait_selector=wait_selector,
148
+ disable_webgl=disable_webgl,
149
+ selector_config=selector_config,
150
+ disable_resources=disable_resources,
151
+ wait_selector_state=wait_selector_state,
188
152
  )
189
- self.__initiate_browser_options__()
190
-
191
- def __initiate_browser_options__(self):
192
- if not self.cdp_url:
193
- # `launch_options` is used with persistent context
194
- self.launch_options = dict(
195
- _launch_kwargs(
196
- self.headless,
197
- self.proxy,
198
- self.locale,
199
- tuple(self.extra_headers.items())
200
- if self.extra_headers
201
- else tuple(),
202
- self.useragent,
203
- self.real_chrome,
204
- self.stealth,
205
- self.hide_canvas,
206
- self.disable_webgl,
207
- )
208
- )
209
- self.launch_options["extra_http_headers"] = dict(
210
- self.launch_options["extra_http_headers"]
211
- )
212
- self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
213
- self.context_options = dict()
214
- else:
215
- # while `context_options` is left to be used when cdp mode is enabled
216
- self.launch_options = dict()
217
- self.context_options = dict(
218
- _context_kwargs(
219
- self.proxy,
220
- self.locale,
221
- tuple(self.extra_headers.items())
222
- if self.extra_headers
223
- else tuple(),
224
- self.useragent,
225
- self.stealth,
226
- )
227
- )
228
- self.context_options["extra_http_headers"] = dict(
229
- self.context_options["extra_http_headers"]
230
- )
231
- self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
153
+ super().__init__(max_pages=self.max_pages)
232
154
 
233
155
  def __create__(self):
234
156
  """Create a browser for this instance and context."""
@@ -237,16 +159,18 @@ class DynamicSession:
237
159
  # Because rebrowser_playwright doesn't play well with real browsers
238
160
  sync_context = sync_playwright
239
161
 
240
- self.playwright = sync_context().start()
162
+ self.playwright: Playwright = sync_context().start()
241
163
 
242
164
  if self.cdp_url: # pragma: no cover
243
- self.context = self.playwright.chromium.connect_over_cdp(
244
- endpoint_url=self.cdp_url
245
- ).new_context(**self.context_options)
246
- else:
247
- self.context = self.playwright.chromium.launch_persistent_context(
248
- user_data_dir="", **self.launch_options
165
+ self.context = self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url).new_context(
166
+ **self.context_options
249
167
  )
168
+ else:
169
+ self.context = self.playwright.chromium.launch_persistent_context(user_data_dir="", **self.launch_options)
170
+
171
+ # Get the default page and close it
172
+ default_page = self.context.pages[0]
173
+ default_page.close()
250
174
 
251
175
  if self.init_script: # pragma: no cover
252
176
  self.context.add_init_script(path=self.init_script)
@@ -276,56 +200,63 @@ class DynamicSession:
276
200
 
277
201
  self._closed = True
278
202
 
279
- def _get_or_create_page(self) -> PageInfo: # pragma: no cover
280
- """Get an available page or create a new one"""
281
- # Try to get a ready page first
282
- page_info = self.page_pool.get_ready_page()
283
- if page_info:
284
- return page_info
285
-
286
- # Create a new page if under limit
287
- if self.page_pool.pages_count < self.max_pages:
288
- page = self.context.new_page()
289
- page.set_default_navigation_timeout(self.timeout)
290
- page.set_default_timeout(self.timeout)
291
- if self.extra_headers:
292
- page.set_extra_http_headers(self.extra_headers)
293
-
294
- if self.disable_resources:
295
- page.route("**/*", intercept_route)
296
-
297
- if self.stealth:
298
- for script in _compiled_stealth_scripts():
299
- page.add_init_script(script=script)
300
-
301
- return self.page_pool.add_page(page)
302
-
303
- # Wait for a page to become available
304
- max_wait = 30
305
- start_time = time()
306
-
307
- while time() - start_time < max_wait:
308
- page_info = self.page_pool.get_ready_page()
309
- if page_info:
310
- return page_info
311
- sleep(0.05)
312
-
313
- raise TimeoutError("No pages available within timeout period")
314
-
315
- def fetch(self, url: str) -> Response:
203
+ def fetch(
204
+ self,
205
+ url: str,
206
+ google_search: bool = _UNSET,
207
+ timeout: int | float = _UNSET,
208
+ wait: int | float = _UNSET,
209
+ page_action: Optional[Callable] = _UNSET,
210
+ extra_headers: Optional[Dict[str, str]] = _UNSET,
211
+ disable_resources: bool = _UNSET,
212
+ wait_selector: Optional[str] = _UNSET,
213
+ wait_selector_state: SelectorWaitStates = _UNSET,
214
+ network_idle: bool = _UNSET,
215
+ load_dom: bool = _UNSET,
216
+ selector_config: Optional[Dict] = _UNSET,
217
+ ) -> Response:
316
218
  """Opens up the browser and do your request based on your chosen options.
317
219
 
318
220
  :param url: The Target url.
221
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
222
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
223
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
224
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
225
+ :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
226
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
227
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
228
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
229
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
230
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
231
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
232
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
233
+ :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
319
234
  :return: A `Response` object.
320
235
  """
236
+ # Validate all resolved parameters
237
+ params = validate(
238
+ dict(
239
+ google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
240
+ timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
241
+ wait=self._get_with_precedence(wait, self.wait, _UNSET),
242
+ page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
243
+ extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
244
+ disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
245
+ wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
246
+ wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
247
+ network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
248
+ load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
249
+ selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
250
+ ),
251
+ PlaywrightConfig,
252
+ )
253
+
321
254
  if self._closed: # pragma: no cover
322
255
  raise RuntimeError("Context manager has been closed")
323
256
 
324
257
  final_response = None
325
258
  referer = (
326
- generate_convincing_referer(url)
327
- if (self.google_search and "referer" not in self._headers_keys)
328
- else None
259
+ generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
329
260
  )
330
261
 
331
262
  def handle_response(finished_response: SyncPlaywrightResponse):
@@ -336,48 +267,50 @@ class DynamicSession:
336
267
  ):
337
268
  final_response = finished_response
338
269
 
339
- page_info = self._get_or_create_page()
270
+ page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
340
271
  page_info.mark_busy(url=url)
341
272
 
342
273
  try: # pragma: no cover
343
274
  # Navigate to URL and wait for a specified state
344
275
  page_info.page.on("response", handle_response)
345
276
  first_response = page_info.page.goto(url, referer=referer)
346
- page_info.page.wait_for_load_state(state="domcontentloaded")
277
+ if params.load_dom:
278
+ page_info.page.wait_for_load_state(state="domcontentloaded")
347
279
 
348
- if self.network_idle:
280
+ if params.network_idle:
349
281
  page_info.page.wait_for_load_state("networkidle")
350
282
 
351
283
  if not first_response:
352
284
  raise RuntimeError(f"Failed to get response for {url}")
353
285
 
354
- if self.page_action is not None:
286
+ if params.page_action:
355
287
  try:
356
- page_info.page = self.page_action(page_info.page)
288
+ _ = params.page_action(page_info.page)
357
289
  except Exception as e: # pragma: no cover
358
290
  log.error(f"Error executing page_action: {e}")
359
291
 
360
- if self.wait_selector:
292
+ if params.wait_selector:
361
293
  try:
362
- waiter: Locator = page_info.page.locator(self.wait_selector)
363
- waiter.first.wait_for(state=self.wait_selector_state)
294
+ waiter: Locator = page_info.page.locator(params.wait_selector)
295
+ waiter.first.wait_for(state=params.wait_selector_state)
364
296
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
365
297
  page_info.page.wait_for_load_state(state="load")
366
- page_info.page.wait_for_load_state(state="domcontentloaded")
367
- if self.network_idle:
298
+ if params.load_dom:
299
+ page_info.page.wait_for_load_state(state="domcontentloaded")
300
+ if params.network_idle:
368
301
  page_info.page.wait_for_load_state("networkidle")
369
302
  except Exception as e: # pragma: no cover
370
- log.error(f"Error waiting for selector {self.wait_selector}: {e}")
303
+ log.error(f"Error waiting for selector {params.wait_selector}: {e}")
371
304
 
372
- page_info.page.wait_for_timeout(self.wait)
305
+ page_info.page.wait_for_timeout(params.wait)
373
306
 
374
307
  # Create response object
375
308
  response = ResponseFactory.from_playwright_response(
376
- page_info.page, first_response, final_response, self.selector_config
309
+ page_info.page, first_response, final_response, params.selector_config
377
310
  )
378
311
 
379
- # Mark the page as ready for next use
380
- page_info.mark_ready()
312
+ # Mark the page as finished for next use
313
+ page_info.mark_finished()
381
314
 
382
315
  return response
383
316
 
@@ -385,17 +318,8 @@ class DynamicSession:
385
318
  page_info.mark_error()
386
319
  raise e
387
320
 
388
- def get_pool_stats(self) -> Dict[str, int]:
389
- """Get statistics about the current page pool"""
390
- return {
391
- "total_pages": self.page_pool.pages_count,
392
- "ready_pages": self.page_pool.ready_count,
393
- "busy_pages": self.page_pool.busy_count,
394
- "max_pages": self.max_pages,
395
- }
396
-
397
321
 
398
- class AsyncDynamicSession(DynamicSession):
322
+ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
399
323
  """An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory."""
400
324
 
401
325
  def __init__(
@@ -420,6 +344,7 @@ class AsyncDynamicSession(DynamicSession):
420
344
  init_script: Optional[str] = None,
421
345
  cookies: Optional[List[Dict]] = None,
422
346
  network_idle: bool = False,
347
+ load_dom: bool = True,
423
348
  wait_selector_state: SelectorWaitStates = "attached",
424
349
  selector_config: Optional[Dict] = None,
425
350
  ):
@@ -432,9 +357,10 @@ class AsyncDynamicSession(DynamicSession):
432
357
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
433
358
  :param cookies: Set cookies for the next request.
434
359
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
360
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
435
361
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
436
362
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
437
- :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
363
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
438
364
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
439
365
  :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
440
366
  :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
@@ -451,36 +377,32 @@ class AsyncDynamicSession(DynamicSession):
451
377
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
452
378
  """
453
379
 
454
- super().__init__(
455
- max_pages,
456
- headless,
457
- google_search,
458
- hide_canvas,
459
- disable_webgl,
460
- real_chrome,
461
- stealth,
462
- wait,
463
- page_action,
464
- proxy,
465
- locale,
466
- extra_headers,
467
- useragent,
468
- cdp_url,
469
- timeout,
470
- disable_resources,
471
- wait_selector,
472
- init_script,
473
- cookies,
474
- network_idle,
475
- wait_selector_state,
476
- selector_config,
380
+ self.__validate__(
381
+ wait=wait,
382
+ proxy=proxy,
383
+ locale=locale,
384
+ timeout=timeout,
385
+ stealth=stealth,
386
+ cdp_url=cdp_url,
387
+ cookies=cookies,
388
+ load_dom=load_dom,
389
+ headless=headless,
390
+ useragent=useragent,
391
+ max_pages=max_pages,
392
+ real_chrome=real_chrome,
393
+ page_action=page_action,
394
+ hide_canvas=hide_canvas,
395
+ init_script=init_script,
396
+ network_idle=network_idle,
397
+ google_search=google_search,
398
+ extra_headers=extra_headers,
399
+ wait_selector=wait_selector,
400
+ disable_webgl=disable_webgl,
401
+ selector_config=selector_config,
402
+ disable_resources=disable_resources,
403
+ wait_selector_state=wait_selector_state,
477
404
  )
478
-
479
- self.playwright: Optional[AsyncPlaywright] = None
480
- self.context: Optional[AsyncBrowserContext] = None
481
- self._lock = Lock()
482
- self.__enter__ = None
483
- self.__exit__ = None
405
+ super().__init__(max_pages=self.max_pages)
484
406
 
485
407
  async def __create__(self):
486
408
  """Create a browser for this instance and context."""
@@ -492,19 +414,17 @@ class AsyncDynamicSession(DynamicSession):
492
414
  self.playwright: AsyncPlaywright = await async_context().start()
493
415
 
494
416
  if self.cdp_url:
495
- browser = await self.playwright.chromium.connect_over_cdp(
496
- endpoint_url=self.cdp_url
497
- )
498
- self.context: AsyncBrowserContext = await browser.new_context(
499
- **self.context_options
500
- )
417
+ browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url)
418
+ self.context: AsyncBrowserContext = await browser.new_context(**self.context_options)
501
419
  else:
502
- self.context: AsyncBrowserContext = (
503
- await self.playwright.chromium.launch_persistent_context(
504
- user_data_dir="", **self.launch_options
505
- )
420
+ self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
421
+ user_data_dir="", **self.launch_options
506
422
  )
507
423
 
424
+ # Get the default page and close it
425
+ default_page = self.context.pages[0]
426
+ await default_page.close()
427
+
508
428
  if self.init_script: # pragma: no cover
509
429
  await self.context.add_init_script(path=self.init_script)
510
430
 
@@ -533,57 +453,63 @@ class AsyncDynamicSession(DynamicSession):
533
453
 
534
454
  self._closed = True
535
455
 
536
- async def _get_or_create_page(self) -> PageInfo:
537
- """Get an available page or create a new one"""
538
- async with self._lock:
539
- # Try to get a ready page first
540
- page_info = self.page_pool.get_ready_page()
541
- if page_info:
542
- return page_info
543
-
544
- # Create a new page if under limit
545
- if self.page_pool.pages_count < self.max_pages:
546
- page = await self.context.new_page()
547
- page.set_default_navigation_timeout(self.timeout)
548
- page.set_default_timeout(self.timeout)
549
- if self.extra_headers:
550
- await page.set_extra_http_headers(self.extra_headers)
551
-
552
- if self.disable_resources:
553
- await page.route("**/*", async_intercept_route)
554
-
555
- if self.stealth:
556
- for script in _compiled_stealth_scripts():
557
- await page.add_init_script(script=script)
558
-
559
- return self.page_pool.add_page(page)
560
-
561
- # Wait for a page to become available
562
- max_wait = 30 # seconds
563
- start_time = time()
564
-
565
- while time() - start_time < max_wait: # pragma: no cover
566
- page_info = self.page_pool.get_ready_page()
567
- if page_info:
568
- return page_info
569
- await asyncio_sleep(0.05)
570
-
571
- raise TimeoutError("No pages available within timeout period")
572
-
573
- async def fetch(self, url: str) -> Response:
456
+ async def fetch(
457
+ self,
458
+ url: str,
459
+ google_search: bool = _UNSET,
460
+ timeout: int | float = _UNSET,
461
+ wait: int | float = _UNSET,
462
+ page_action: Optional[Callable] = _UNSET,
463
+ extra_headers: Optional[Dict[str, str]] = _UNSET,
464
+ disable_resources: bool = _UNSET,
465
+ wait_selector: Optional[str] = _UNSET,
466
+ wait_selector_state: SelectorWaitStates = _UNSET,
467
+ network_idle: bool = _UNSET,
468
+ load_dom: bool = _UNSET,
469
+ selector_config: Optional[Dict] = _UNSET,
470
+ ) -> Response:
574
471
  """Opens up the browser and do your request based on your chosen options.
575
472
 
576
473
  :param url: The Target url.
474
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
475
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
476
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
477
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
478
+ :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
479
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
480
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
481
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
482
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
483
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
484
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
485
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
486
+ :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
577
487
  :return: A `Response` object.
578
488
  """
489
+ # Validate all resolved parameters
490
+ params = validate(
491
+ dict(
492
+ google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
493
+ timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
494
+ wait=self._get_with_precedence(wait, self.wait, _UNSET),
495
+ page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
496
+ extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
497
+ disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
498
+ wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
499
+ wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
500
+ network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
501
+ load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
502
+ selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
503
+ ),
504
+ PlaywrightConfig,
505
+ )
506
+
579
507
  if self._closed: # pragma: no cover
580
508
  raise RuntimeError("Context manager has been closed")
581
509
 
582
510
  final_response = None
583
511
  referer = (
584
- generate_convincing_referer(url)
585
- if (self.google_search and "referer" not in self._headers_keys)
586
- else None
512
+ generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
587
513
  )
588
514
 
589
515
  async def handle_response(finished_response: AsyncPlaywrightResponse):
@@ -594,48 +520,50 @@ class AsyncDynamicSession(DynamicSession):
594
520
  ):
595
521
  final_response = finished_response
596
522
 
597
- page_info = await self._get_or_create_page()
523
+ page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
598
524
  page_info.mark_busy(url=url)
599
525
 
600
526
  try:
601
527
  # Navigate to URL and wait for a specified state
602
528
  page_info.page.on("response", handle_response)
603
529
  first_response = await page_info.page.goto(url, referer=referer)
604
- await page_info.page.wait_for_load_state(state="domcontentloaded")
530
+ if params.load_dom:
531
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
605
532
 
606
- if self.network_idle:
533
+ if params.network_idle:
607
534
  await page_info.page.wait_for_load_state("networkidle")
608
535
 
609
536
  if not first_response:
610
537
  raise RuntimeError(f"Failed to get response for {url}")
611
538
 
612
- if self.page_action is not None:
539
+ if params.page_action:
613
540
  try:
614
- page_info.page = await self.page_action(page_info.page)
541
+ _ = await params.page_action(page_info.page)
615
542
  except Exception as e:
616
543
  log.error(f"Error executing page_action: {e}")
617
544
 
618
- if self.wait_selector:
545
+ if params.wait_selector:
619
546
  try:
620
- waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
621
- await waiter.first.wait_for(state=self.wait_selector_state)
547
+ waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
548
+ await waiter.first.wait_for(state=params.wait_selector_state)
622
549
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
623
550
  await page_info.page.wait_for_load_state(state="load")
624
- await page_info.page.wait_for_load_state(state="domcontentloaded")
625
- if self.network_idle:
551
+ if params.load_dom:
552
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
553
+ if params.network_idle:
626
554
  await page_info.page.wait_for_load_state("networkidle")
627
555
  except Exception as e:
628
- log.error(f"Error waiting for selector {self.wait_selector}: {e}")
556
+ log.error(f"Error waiting for selector {params.wait_selector}: {e}")
629
557
 
630
- await page_info.page.wait_for_timeout(self.wait)
558
+ await page_info.page.wait_for_timeout(params.wait)
631
559
 
632
560
  # Create response object
633
561
  response = await ResponseFactory.from_async_playwright_response(
634
- page_info.page, first_response, final_response, self.selector_config
562
+ page_info.page, first_response, final_response, params.selector_config
635
563
  )
636
564
 
637
- # Mark the page as ready for next use
638
- page_info.mark_ready()
565
+ # Mark the page as finished for next use
566
+ page_info.mark_finished()
639
567
 
640
568
  return response
641
569