scrapling 0.3__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35):
  1. scrapling/__init__.py +1 -1
  2. scrapling/cli.py +38 -51
  3. scrapling/core/_html_utils.py +3 -9
  4. scrapling/core/ai.py +5 -13
  5. scrapling/core/custom_types.py +19 -61
  6. scrapling/core/mixins.py +6 -28
  7. scrapling/core/shell.py +51 -129
  8. scrapling/core/storage.py +2 -8
  9. scrapling/core/translator.py +8 -20
  10. scrapling/core/utils/__init__.py +10 -0
  11. scrapling/core/utils/_shell.py +48 -0
  12. scrapling/core/{utils.py → utils/_utils.py} +5 -21
  13. scrapling/engines/__init__.py +0 -16
  14. scrapling/engines/_browsers/_base.py +297 -0
  15. scrapling/engines/_browsers/_camoufox.py +238 -293
  16. scrapling/engines/_browsers/_config_tools.py +2 -1
  17. scrapling/engines/_browsers/_controllers.py +220 -278
  18. scrapling/engines/_browsers/_page.py +37 -15
  19. scrapling/engines/_browsers/_validators.py +29 -15
  20. scrapling/engines/constants.py +3 -6
  21. scrapling/engines/static.py +25 -75
  22. scrapling/engines/toolbelt/__init__.py +1 -20
  23. scrapling/engines/toolbelt/convertor.py +95 -86
  24. scrapling/engines/toolbelt/custom.py +7 -99
  25. scrapling/engines/toolbelt/fingerprints.py +1 -3
  26. scrapling/engines/toolbelt/navigation.py +4 -58
  27. scrapling/fetchers.py +41 -24
  28. scrapling/parser.py +45 -122
  29. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/METADATA +57 -47
  30. scrapling-0.3.2.dist-info/RECORD +44 -0
  31. scrapling-0.3.dist-info/RECORD +0 -41
  32. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/WHEEL +0 -0
  33. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/entry_points.txt +0 -0
  34. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/licenses/LICENSE +0 -0
  35. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,6 @@
1
- from time import time, sleep
2
- from asyncio import sleep as asyncio_sleep, Lock
3
-
4
1
  from playwright.sync_api import (
5
2
  Response as SyncPlaywrightResponse,
6
3
  sync_playwright,
7
- BrowserContext,
8
4
  Playwright,
9
5
  Locator,
10
6
  )
@@ -21,9 +17,8 @@ from rebrowser_playwright.async_api import (
21
17
  )
22
18
 
23
19
  from scrapling.core.utils import log
24
- from ._page import PageInfo, PagePool
20
+ from ._base import SyncSession, AsyncSession, DynamicSessionMixin
25
21
  from ._validators import validate, PlaywrightConfig
26
- from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
27
22
  from scrapling.core._types import (
28
23
  Dict,
29
24
  List,
@@ -31,16 +26,16 @@ from scrapling.core._types import (
31
26
  Callable,
32
27
  SelectorWaitStates,
33
28
  )
34
- from scrapling.engines.toolbelt import (
29
+ from scrapling.engines.toolbelt.convertor import (
35
30
  Response,
36
31
  ResponseFactory,
37
- generate_convincing_referer,
38
- intercept_route,
39
- async_intercept_route,
40
32
  )
33
+ from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
34
+
35
+ _UNSET = object()
41
36
 
42
37
 
43
- class DynamicSession:
38
+ class DynamicSession(DynamicSessionMixin, SyncSession):
44
39
  """A Browser session manager with page pooling."""
45
40
 
46
41
  __slots__ = (
@@ -59,7 +54,9 @@ class DynamicSession:
59
54
  "cookies",
60
55
  "disable_resources",
61
56
  "network_idle",
57
+ "load_dom",
62
58
  "wait_selector",
59
+ "init_script",
63
60
  "wait_selector_state",
64
61
  "wait",
65
62
  "playwright",
@@ -94,8 +91,10 @@ class DynamicSession:
94
91
  timeout: int | float = 30000,
95
92
  disable_resources: bool = False,
96
93
  wait_selector: Optional[str] = None,
94
+ init_script: Optional[str] = None,
97
95
  cookies: Optional[List[Dict]] = None,
98
96
  network_idle: bool = False,
97
+ load_dom: bool = True,
99
98
  wait_selector_state: SelectorWaitStates = "attached",
100
99
  selector_config: Optional[Dict] = None,
101
100
  ):
@@ -110,120 +109,48 @@ class DynamicSession:
110
109
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
111
110
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
112
111
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
113
- :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
112
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
114
113
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
114
+ :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
115
115
  :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
116
116
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
117
117
  :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
118
118
  :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
119
119
  :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
120
120
  :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
121
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
121
122
  :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
122
123
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
123
124
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
124
125
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
125
126
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
126
127
  """
127
-
128
- params = {
129
- "max_pages": __max_pages,
130
- "headless": headless,
131
- "google_search": google_search,
132
- "hide_canvas": hide_canvas,
133
- "disable_webgl": disable_webgl,
134
- "real_chrome": real_chrome,
135
- "stealth": stealth,
136
- "wait": wait,
137
- "page_action": page_action,
138
- "proxy": proxy,
139
- "locale": locale,
140
- "extra_headers": extra_headers,
141
- "useragent": useragent,
142
- "timeout": timeout,
143
- "selector_config": selector_config,
144
- "disable_resources": disable_resources,
145
- "wait_selector": wait_selector,
146
- "cookies": cookies,
147
- "network_idle": network_idle,
148
- "wait_selector_state": wait_selector_state,
149
- "cdp_url": cdp_url,
150
- }
151
- config = validate(params, PlaywrightConfig)
152
-
153
- self.max_pages = config.max_pages
154
- self.headless = config.headless
155
- self.hide_canvas = config.hide_canvas
156
- self.disable_webgl = config.disable_webgl
157
- self.real_chrome = config.real_chrome
158
- self.stealth = config.stealth
159
- self.google_search = config.google_search
160
- self.wait = config.wait
161
- self.proxy = config.proxy
162
- self.locale = config.locale
163
- self.extra_headers = config.extra_headers
164
- self.useragent = config.useragent
165
- self.timeout = config.timeout
166
- self.cookies = config.cookies
167
- self.disable_resources = config.disable_resources
168
- self.cdp_url = config.cdp_url
169
- self.network_idle = config.network_idle
170
- self.wait_selector = config.wait_selector
171
- self.wait_selector_state = config.wait_selector_state
172
-
173
- self.playwright: Optional[Playwright] = None
174
- self.context: Optional[BrowserContext] = None
175
- self.page_pool = PagePool(self.max_pages)
176
- self._closed = False
177
- self.selector_config = config.selector_config
178
- self.page_action = config.page_action
179
- self._headers_keys = (
180
- set(map(str.lower, self.extra_headers.keys()))
181
- if self.extra_headers
182
- else set()
128
+ self.__validate__(
129
+ wait=wait,
130
+ proxy=proxy,
131
+ locale=locale,
132
+ timeout=timeout,
133
+ stealth=stealth,
134
+ cdp_url=cdp_url,
135
+ cookies=cookies,
136
+ load_dom=load_dom,
137
+ headless=headless,
138
+ useragent=useragent,
139
+ max_pages=__max_pages,
140
+ real_chrome=real_chrome,
141
+ page_action=page_action,
142
+ hide_canvas=hide_canvas,
143
+ init_script=init_script,
144
+ network_idle=network_idle,
145
+ google_search=google_search,
146
+ extra_headers=extra_headers,
147
+ wait_selector=wait_selector,
148
+ disable_webgl=disable_webgl,
149
+ selector_config=selector_config,
150
+ disable_resources=disable_resources,
151
+ wait_selector_state=wait_selector_state,
183
152
  )
184
- self.__initiate_browser_options__()
185
-
186
- def __initiate_browser_options__(self):
187
- if not self.cdp_url:
188
- # `launch_options` is used with persistent context
189
- self.launch_options = dict(
190
- _launch_kwargs(
191
- self.headless,
192
- self.proxy,
193
- self.locale,
194
- tuple(self.extra_headers.items())
195
- if self.extra_headers
196
- else tuple(),
197
- self.useragent,
198
- self.real_chrome,
199
- self.stealth,
200
- self.hide_canvas,
201
- self.disable_webgl,
202
- )
203
- )
204
- self.launch_options["extra_http_headers"] = dict(
205
- self.launch_options["extra_http_headers"]
206
- )
207
- self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
208
- self.context_options = dict()
209
- else:
210
- # while `context_options` is left to be used when cdp mode is enabled
211
- self.launch_options = dict()
212
- self.context_options = dict(
213
- _context_kwargs(
214
- self.proxy,
215
- self.locale,
216
- tuple(self.extra_headers.items())
217
- if self.extra_headers
218
- else tuple(),
219
- self.useragent,
220
- self.stealth,
221
- )
222
- )
223
- self.context_options["extra_http_headers"] = dict(
224
- self.context_options["extra_http_headers"]
225
- )
226
- self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
153
+ super().__init__(max_pages=self.max_pages)
227
154
 
228
155
  def __create__(self):
229
156
  """Create a browser for this instance and context."""
@@ -232,16 +159,21 @@ class DynamicSession:
232
159
  # Because rebrowser_playwright doesn't play well with real browsers
233
160
  sync_context = sync_playwright
234
161
 
235
- self.playwright = sync_context().start()
162
+ self.playwright: Playwright = sync_context().start()
236
163
 
237
164
  if self.cdp_url: # pragma: no cover
238
- self.context = self.playwright.chromium.connect_over_cdp(
239
- endpoint_url=self.cdp_url
240
- ).new_context(**self.context_options)
241
- else:
242
- self.context = self.playwright.chromium.launch_persistent_context(
243
- user_data_dir="", **self.launch_options
165
+ self.context = self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url).new_context(
166
+ **self.context_options
244
167
  )
168
+ else:
169
+ self.context = self.playwright.chromium.launch_persistent_context(user_data_dir="", **self.launch_options)
170
+
171
+ # Get the default page and close it
172
+ default_page = self.context.pages[0]
173
+ default_page.close()
174
+
175
+ if self.init_script: # pragma: no cover
176
+ self.context.add_init_script(path=self.init_script)
245
177
 
246
178
  if self.cookies: # pragma: no cover
247
179
  self.context.add_cookies(self.cookies)
@@ -268,56 +200,63 @@ class DynamicSession:
268
200
 
269
201
  self._closed = True
270
202
 
271
- def _get_or_create_page(self) -> PageInfo: # pragma: no cover
272
- """Get an available page or create a new one"""
273
- # Try to get a ready page first
274
- page_info = self.page_pool.get_ready_page()
275
- if page_info:
276
- return page_info
277
-
278
- # Create a new page if under limit
279
- if self.page_pool.pages_count < self.max_pages:
280
- page = self.context.new_page()
281
- page.set_default_navigation_timeout(self.timeout)
282
- page.set_default_timeout(self.timeout)
283
- if self.extra_headers:
284
- page.set_extra_http_headers(self.extra_headers)
285
-
286
- if self.disable_resources:
287
- page.route("**/*", intercept_route)
288
-
289
- if self.stealth:
290
- for script in _compiled_stealth_scripts():
291
- page.add_init_script(script=script)
292
-
293
- return self.page_pool.add_page(page)
294
-
295
- # Wait for a page to become available
296
- max_wait = 30
297
- start_time = time()
298
-
299
- while time() - start_time < max_wait:
300
- page_info = self.page_pool.get_ready_page()
301
- if page_info:
302
- return page_info
303
- sleep(0.05)
304
-
305
- raise TimeoutError("No pages available within timeout period")
306
-
307
- def fetch(self, url: str) -> Response:
203
+ def fetch(
204
+ self,
205
+ url: str,
206
+ google_search: bool = _UNSET,
207
+ timeout: int | float = _UNSET,
208
+ wait: int | float = _UNSET,
209
+ page_action: Optional[Callable] = _UNSET,
210
+ extra_headers: Optional[Dict[str, str]] = _UNSET,
211
+ disable_resources: bool = _UNSET,
212
+ wait_selector: Optional[str] = _UNSET,
213
+ wait_selector_state: SelectorWaitStates = _UNSET,
214
+ network_idle: bool = _UNSET,
215
+ load_dom: bool = _UNSET,
216
+ selector_config: Optional[Dict] = _UNSET,
217
+ ) -> Response:
308
218
  """Opens up the browser and do your request based on your chosen options.
309
219
 
310
220
  :param url: The Target url.
221
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
222
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
223
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
224
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
225
+ :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
226
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
227
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
228
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
229
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
230
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
231
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
232
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
233
+ :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
311
234
  :return: A `Response` object.
312
235
  """
236
+ # Validate all resolved parameters
237
+ params = validate(
238
+ dict(
239
+ google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
240
+ timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
241
+ wait=self._get_with_precedence(wait, self.wait, _UNSET),
242
+ page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
243
+ extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
244
+ disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
245
+ wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
246
+ wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
247
+ network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
248
+ load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
249
+ selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
250
+ ),
251
+ PlaywrightConfig,
252
+ )
253
+
313
254
  if self._closed: # pragma: no cover
314
255
  raise RuntimeError("Context manager has been closed")
315
256
 
316
257
  final_response = None
317
258
  referer = (
318
- generate_convincing_referer(url)
319
- if (self.google_search and "referer" not in self._headers_keys)
320
- else None
259
+ generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
321
260
  )
322
261
 
323
262
  def handle_response(finished_response: SyncPlaywrightResponse):
@@ -328,48 +267,50 @@ class DynamicSession:
328
267
  ):
329
268
  final_response = finished_response
330
269
 
331
- page_info = self._get_or_create_page()
270
+ page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
332
271
  page_info.mark_busy(url=url)
333
272
 
334
273
  try: # pragma: no cover
335
274
  # Navigate to URL and wait for a specified state
336
275
  page_info.page.on("response", handle_response)
337
276
  first_response = page_info.page.goto(url, referer=referer)
338
- page_info.page.wait_for_load_state(state="domcontentloaded")
277
+ if params.load_dom:
278
+ page_info.page.wait_for_load_state(state="domcontentloaded")
339
279
 
340
- if self.network_idle:
280
+ if params.network_idle:
341
281
  page_info.page.wait_for_load_state("networkidle")
342
282
 
343
283
  if not first_response:
344
284
  raise RuntimeError(f"Failed to get response for {url}")
345
285
 
346
- if self.page_action is not None:
286
+ if params.page_action:
347
287
  try:
348
- page_info.page = self.page_action(page_info.page)
288
+ _ = params.page_action(page_info.page)
349
289
  except Exception as e: # pragma: no cover
350
290
  log.error(f"Error executing page_action: {e}")
351
291
 
352
- if self.wait_selector:
292
+ if params.wait_selector:
353
293
  try:
354
- waiter: Locator = page_info.page.locator(self.wait_selector)
355
- waiter.first.wait_for(state=self.wait_selector_state)
294
+ waiter: Locator = page_info.page.locator(params.wait_selector)
295
+ waiter.first.wait_for(state=params.wait_selector_state)
356
296
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
357
297
  page_info.page.wait_for_load_state(state="load")
358
- page_info.page.wait_for_load_state(state="domcontentloaded")
359
- if self.network_idle:
298
+ if params.load_dom:
299
+ page_info.page.wait_for_load_state(state="domcontentloaded")
300
+ if params.network_idle:
360
301
  page_info.page.wait_for_load_state("networkidle")
361
302
  except Exception as e: # pragma: no cover
362
- log.error(f"Error waiting for selector {self.wait_selector}: {e}")
303
+ log.error(f"Error waiting for selector {params.wait_selector}: {e}")
363
304
 
364
- page_info.page.wait_for_timeout(self.wait)
305
+ page_info.page.wait_for_timeout(params.wait)
365
306
 
366
307
  # Create response object
367
308
  response = ResponseFactory.from_playwright_response(
368
- page_info.page, first_response, final_response, self.selector_config
309
+ page_info.page, first_response, final_response, params.selector_config
369
310
  )
370
311
 
371
- # Mark the page as ready for next use
372
- page_info.mark_ready()
312
+ # Mark the page as finished for next use
313
+ page_info.mark_finished()
373
314
 
374
315
  return response
375
316
 
@@ -377,17 +318,8 @@ class DynamicSession:
377
318
  page_info.mark_error()
378
319
  raise e
379
320
 
380
- def get_pool_stats(self) -> Dict[str, int]:
381
- """Get statistics about the current page pool"""
382
- return {
383
- "total_pages": self.page_pool.pages_count,
384
- "ready_pages": self.page_pool.ready_count,
385
- "busy_pages": self.page_pool.busy_count,
386
- "max_pages": self.max_pages,
387
- }
388
321
 
389
-
390
- class AsyncDynamicSession(DynamicSession):
322
+ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
391
323
  """An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory."""
392
324
 
393
325
  def __init__(
@@ -409,8 +341,10 @@ class AsyncDynamicSession(DynamicSession):
409
341
  timeout: int | float = 30000,
410
342
  disable_resources: bool = False,
411
343
  wait_selector: Optional[str] = None,
344
+ init_script: Optional[str] = None,
412
345
  cookies: Optional[List[Dict]] = None,
413
346
  network_idle: bool = False,
347
+ load_dom: bool = True,
414
348
  wait_selector_state: SelectorWaitStates = "attached",
415
349
  selector_config: Optional[Dict] = None,
416
350
  ):
@@ -423,10 +357,12 @@ class AsyncDynamicSession(DynamicSession):
423
357
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
424
358
  :param cookies: Set cookies for the next request.
425
359
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
360
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
426
361
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
427
362
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
428
- :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
363
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
429
364
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
365
+ :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
430
366
  :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
431
367
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
432
368
  :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
@@ -441,35 +377,32 @@ class AsyncDynamicSession(DynamicSession):
441
377
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
442
378
  """
443
379
 
444
- super().__init__(
445
- max_pages,
446
- headless,
447
- google_search,
448
- hide_canvas,
449
- disable_webgl,
450
- real_chrome,
451
- stealth,
452
- wait,
453
- page_action,
454
- proxy,
455
- locale,
456
- extra_headers,
457
- useragent,
458
- cdp_url,
459
- timeout,
460
- disable_resources,
461
- wait_selector,
462
- cookies,
463
- network_idle,
464
- wait_selector_state,
465
- selector_config,
380
+ self.__validate__(
381
+ wait=wait,
382
+ proxy=proxy,
383
+ locale=locale,
384
+ timeout=timeout,
385
+ stealth=stealth,
386
+ cdp_url=cdp_url,
387
+ cookies=cookies,
388
+ load_dom=load_dom,
389
+ headless=headless,
390
+ useragent=useragent,
391
+ max_pages=max_pages,
392
+ real_chrome=real_chrome,
393
+ page_action=page_action,
394
+ hide_canvas=hide_canvas,
395
+ init_script=init_script,
396
+ network_idle=network_idle,
397
+ google_search=google_search,
398
+ extra_headers=extra_headers,
399
+ wait_selector=wait_selector,
400
+ disable_webgl=disable_webgl,
401
+ selector_config=selector_config,
402
+ disable_resources=disable_resources,
403
+ wait_selector_state=wait_selector_state,
466
404
  )
467
-
468
- self.playwright: Optional[AsyncPlaywright] = None
469
- self.context: Optional[AsyncBrowserContext] = None
470
- self._lock = Lock()
471
- self.__enter__ = None
472
- self.__exit__ = None
405
+ super().__init__(max_pages=self.max_pages)
473
406
 
474
407
  async def __create__(self):
475
408
  """Create a browser for this instance and context."""
@@ -481,19 +414,20 @@ class AsyncDynamicSession(DynamicSession):
481
414
  self.playwright: AsyncPlaywright = await async_context().start()
482
415
 
483
416
  if self.cdp_url:
484
- browser = await self.playwright.chromium.connect_over_cdp(
485
- endpoint_url=self.cdp_url
486
- )
487
- self.context: AsyncBrowserContext = await browser.new_context(
488
- **self.context_options
489
- )
417
+ browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url)
418
+ self.context: AsyncBrowserContext = await browser.new_context(**self.context_options)
490
419
  else:
491
- self.context: AsyncBrowserContext = (
492
- await self.playwright.chromium.launch_persistent_context(
493
- user_data_dir="", **self.launch_options
494
- )
420
+ self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
421
+ user_data_dir="", **self.launch_options
495
422
  )
496
423
 
424
+ # Get the default page and close it
425
+ default_page = self.context.pages[0]
426
+ await default_page.close()
427
+
428
+ if self.init_script: # pragma: no cover
429
+ await self.context.add_init_script(path=self.init_script)
430
+
497
431
  if self.cookies:
498
432
  await self.context.add_cookies(self.cookies)
499
433
 
@@ -519,57 +453,63 @@ class AsyncDynamicSession(DynamicSession):
519
453
 
520
454
  self._closed = True
521
455
 
522
- async def _get_or_create_page(self) -> PageInfo:
523
- """Get an available page or create a new one"""
524
- async with self._lock:
525
- # Try to get a ready page first
526
- page_info = self.page_pool.get_ready_page()
527
- if page_info:
528
- return page_info
529
-
530
- # Create a new page if under limit
531
- if self.page_pool.pages_count < self.max_pages:
532
- page = await self.context.new_page()
533
- page.set_default_navigation_timeout(self.timeout)
534
- page.set_default_timeout(self.timeout)
535
- if self.extra_headers:
536
- await page.set_extra_http_headers(self.extra_headers)
537
-
538
- if self.disable_resources:
539
- await page.route("**/*", async_intercept_route)
540
-
541
- if self.stealth:
542
- for script in _compiled_stealth_scripts():
543
- await page.add_init_script(script=script)
544
-
545
- return self.page_pool.add_page(page)
546
-
547
- # Wait for a page to become available
548
- max_wait = 30 # seconds
549
- start_time = time()
550
-
551
- while time() - start_time < max_wait: # pragma: no cover
552
- page_info = self.page_pool.get_ready_page()
553
- if page_info:
554
- return page_info
555
- await asyncio_sleep(0.05)
556
-
557
- raise TimeoutError("No pages available within timeout period")
558
-
559
- async def fetch(self, url: str) -> Response:
456
+ async def fetch(
457
+ self,
458
+ url: str,
459
+ google_search: bool = _UNSET,
460
+ timeout: int | float = _UNSET,
461
+ wait: int | float = _UNSET,
462
+ page_action: Optional[Callable] = _UNSET,
463
+ extra_headers: Optional[Dict[str, str]] = _UNSET,
464
+ disable_resources: bool = _UNSET,
465
+ wait_selector: Optional[str] = _UNSET,
466
+ wait_selector_state: SelectorWaitStates = _UNSET,
467
+ network_idle: bool = _UNSET,
468
+ load_dom: bool = _UNSET,
469
+ selector_config: Optional[Dict] = _UNSET,
470
+ ) -> Response:
560
471
  """Opens up the browser and do your request based on your chosen options.
561
472
 
562
473
  :param url: The Target url.
474
+ :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
475
+ :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
476
+ :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
477
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
478
+ :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
479
+ :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
480
+ Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
481
+ This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
482
+ :param wait_selector: Wait for a specific CSS selector to be in a specific state.
483
+ :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
484
+ :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
485
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
486
+ :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
563
487
  :return: A `Response` object.
564
488
  """
489
+ # Validate all resolved parameters
490
+ params = validate(
491
+ dict(
492
+ google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
493
+ timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
494
+ wait=self._get_with_precedence(wait, self.wait, _UNSET),
495
+ page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
496
+ extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
497
+ disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
498
+ wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
499
+ wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
500
+ network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
501
+ load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
502
+ selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
503
+ ),
504
+ PlaywrightConfig,
505
+ )
506
+
565
507
  if self._closed: # pragma: no cover
566
508
  raise RuntimeError("Context manager has been closed")
567
509
 
568
510
  final_response = None
569
511
  referer = (
570
- generate_convincing_referer(url)
571
- if (self.google_search and "referer" not in self._headers_keys)
572
- else None
512
+ generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
573
513
  )
574
514
 
575
515
  async def handle_response(finished_response: AsyncPlaywrightResponse):
@@ -580,48 +520,50 @@ class AsyncDynamicSession(DynamicSession):
580
520
  ):
581
521
  final_response = finished_response
582
522
 
583
- page_info = await self._get_or_create_page()
523
+ page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
584
524
  page_info.mark_busy(url=url)
585
525
 
586
526
  try:
587
527
  # Navigate to URL and wait for a specified state
588
528
  page_info.page.on("response", handle_response)
589
529
  first_response = await page_info.page.goto(url, referer=referer)
590
- await page_info.page.wait_for_load_state(state="domcontentloaded")
530
+ if self.load_dom:
531
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
591
532
 
592
- if self.network_idle:
533
+ if params.network_idle:
593
534
  await page_info.page.wait_for_load_state("networkidle")
594
535
 
595
536
  if not first_response:
596
537
  raise RuntimeError(f"Failed to get response for {url}")
597
538
 
598
- if self.page_action is not None:
539
+ if params.page_action:
599
540
  try:
600
- page_info.page = await self.page_action(page_info.page)
541
+ _ = await params.page_action(page_info.page)
601
542
  except Exception as e:
602
543
  log.error(f"Error executing page_action: {e}")
603
544
 
604
- if self.wait_selector:
545
+ if params.wait_selector:
605
546
  try:
606
- waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
607
- await waiter.first.wait_for(state=self.wait_selector_state)
547
+ waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
548
+ await waiter.first.wait_for(state=params.wait_selector_state)
608
549
  # Wait again after waiting for the selector, helpful with protections like Cloudflare
609
550
  await page_info.page.wait_for_load_state(state="load")
610
- await page_info.page.wait_for_load_state(state="domcontentloaded")
611
- if self.network_idle:
551
+ if self.load_dom:
552
+ await page_info.page.wait_for_load_state(state="domcontentloaded")
553
+ if params.network_idle:
612
554
  await page_info.page.wait_for_load_state("networkidle")
613
555
  except Exception as e:
614
- log.error(f"Error waiting for selector {self.wait_selector}: {e}")
556
+ log.error(f"Error waiting for selector {params.wait_selector}: {e}")
615
557
 
616
- await page_info.page.wait_for_timeout(self.wait)
558
+ await page_info.page.wait_for_timeout(params.wait)
617
559
 
618
560
  # Create response object
619
561
  response = await ResponseFactory.from_async_playwright_response(
620
- page_info.page, first_response, final_response, self.selector_config
562
+ page_info.page, first_response, final_response, params.selector_config
621
563
  )
622
564
 
623
- # Mark the page as ready for next use
624
- page_info.mark_ready()
565
+ # Mark the page as finished for next use
566
+ page_info.mark_finished()
625
567
 
626
568
  return response
627
569