scrapling 0.3.1-py3-none-any.whl → 0.3.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/cli.py +38 -51
- scrapling/core/_html_utils.py +3 -9
- scrapling/core/ai.py +5 -13
- scrapling/core/custom_types.py +19 -61
- scrapling/core/mixins.py +6 -28
- scrapling/core/shell.py +49 -127
- scrapling/core/storage.py +2 -8
- scrapling/core/translator.py +8 -20
- scrapling/core/utils/__init__.py +10 -0
- scrapling/core/utils/_shell.py +48 -0
- scrapling/core/{utils.py → utils/_utils.py} +5 -21
- scrapling/engines/__init__.py +0 -16
- scrapling/engines/_browsers/_base.py +297 -0
- scrapling/engines/_browsers/_camoufox.py +219 -296
- scrapling/engines/_browsers/_config_tools.py +2 -1
- scrapling/engines/_browsers/_controllers.py +201 -281
- scrapling/engines/_browsers/_page.py +37 -15
- scrapling/engines/_browsers/_validators.py +9 -15
- scrapling/engines/constants.py +3 -6
- scrapling/engines/static.py +25 -75
- scrapling/engines/toolbelt/__init__.py +1 -20
- scrapling/engines/toolbelt/convertor.py +95 -86
- scrapling/engines/toolbelt/custom.py +7 -99
- scrapling/engines/toolbelt/fingerprints.py +1 -3
- scrapling/engines/toolbelt/navigation.py +4 -58
- scrapling/fetchers.py +29 -24
- scrapling/parser.py +45 -122
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/METADATA +54 -46
- scrapling-0.3.3.dist-info/RECORD +44 -0
- scrapling-0.3.1.dist-info/RECORD +0 -41
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/WHEEL +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/top_level.txt +0 -0
scrapling/engines/_browsers/_controllers.py

@@ -1,10 +1,6 @@
-from time import time, sleep
-from asyncio import sleep as asyncio_sleep, Lock
-
 from playwright.sync_api import (
     Response as SyncPlaywrightResponse,
     sync_playwright,
-    BrowserContext,
     Playwright,
     Locator,
 )
@@ -21,9 +17,8 @@ from rebrowser_playwright.async_api import (
 )
 
 from scrapling.core.utils import log
-from .
+from ._base import SyncSession, AsyncSession, DynamicSessionMixin
 from ._validators import validate, PlaywrightConfig
-from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
 from scrapling.core._types import (
     Dict,
     List,
@@ -31,16 +26,16 @@ from scrapling.core._types import (
     Callable,
     SelectorWaitStates,
 )
-from scrapling.engines.toolbelt import (
+from scrapling.engines.toolbelt.convertor import (
     Response,
     ResponseFactory,
-    generate_convincing_referer,
-    intercept_route,
-    async_intercept_route,
 )
+from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
+
+_UNSET = object()
 
 
-class DynamicSession:
+class DynamicSession(DynamicSessionMixin, SyncSession):
     """A Browser session manager with page pooling."""
 
     __slots__ = (
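The new module-level `_UNSET = object()` sentinel lets later code distinguish "argument not passed" from legitimate falsy values like `False` or `0`. A minimal sketch of the pattern, assuming a helper shaped like the `_get_with_precedence` calls that appear later in this diff (the real helper lives in the new `_browsers/_base.py`):

    # Sketch only, not scrapling's actual helper.
    _UNSET = object()

    def _get_with_precedence(value, session_default, sentinel=_UNSET):
        # An explicitly passed per-call argument wins; the sentinel means
        # "not passed", so the session-level default applies instead.
        return session_default if value is sentinel else value

    assert _get_with_precedence(_UNSET, 30_000) == 30_000  # falls back to default
    assert _get_with_precedence(0, 30_000) == 0            # falsy values still win

A sentinel is used instead of `None` because `None` is a meaningful value for several of these options.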
@@ -59,6 +54,7 @@ class DynamicSession:
         "cookies",
         "disable_resources",
         "network_idle",
+        "load_dom",
         "wait_selector",
         "init_script",
         "wait_selector_state",
@@ -98,6 +94,7 @@ class DynamicSession:
         init_script: Optional[str] = None,
         cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
+        load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
         selector_config: Optional[Dict] = None,
     ):
@@ -112,7 +109,7 @@
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
         :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
-        :param page_action: Added for automation. A function that takes the `page` object
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
         :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
@@ -121,114 +118,39 @@
         :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
         :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
         :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         """
[... 24 removed lines (old 130-153) not captured in this view ...]
-        }
-        config = validate(params, PlaywrightConfig)
-
-        self.max_pages = config.max_pages
-        self.headless = config.headless
-        self.hide_canvas = config.hide_canvas
-        self.disable_webgl = config.disable_webgl
-        self.real_chrome = config.real_chrome
-        self.stealth = config.stealth
-        self.google_search = config.google_search
-        self.wait = config.wait
-        self.proxy = config.proxy
-        self.locale = config.locale
-        self.extra_headers = config.extra_headers
-        self.useragent = config.useragent
-        self.timeout = config.timeout
-        self.cookies = config.cookies
-        self.disable_resources = config.disable_resources
-        self.cdp_url = config.cdp_url
-        self.network_idle = config.network_idle
-        self.wait_selector = config.wait_selector
-        self.init_script = config.init_script
-        self.wait_selector_state = config.wait_selector_state
-
-        self.playwright: Optional[Playwright] = None
-        self.context: Optional[BrowserContext] = None
-        self.page_pool = PagePool(self.max_pages)
-        self._closed = False
-        self.selector_config = config.selector_config
-        self.page_action = config.page_action
-        self._headers_keys = (
-            set(map(str.lower, self.extra_headers.keys()))
-            if self.extra_headers
-            else set()
+        self.__validate__(
+            wait=wait,
+            proxy=proxy,
+            locale=locale,
+            timeout=timeout,
+            stealth=stealth,
+            cdp_url=cdp_url,
+            cookies=cookies,
+            load_dom=load_dom,
+            headless=headless,
+            useragent=useragent,
+            max_pages=__max_pages,
+            real_chrome=real_chrome,
+            page_action=page_action,
+            hide_canvas=hide_canvas,
+            init_script=init_script,
+            network_idle=network_idle,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            wait_selector=wait_selector,
+            disable_webgl=disable_webgl,
+            selector_config=selector_config,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
         )
-        self.
-
-    def __initiate_browser_options__(self):
-        if not self.cdp_url:
-            # `launch_options` is used with persistent context
-            self.launch_options = dict(
-                _launch_kwargs(
-                    self.headless,
-                    self.proxy,
-                    self.locale,
-                    tuple(self.extra_headers.items())
-                    if self.extra_headers
-                    else tuple(),
-                    self.useragent,
-                    self.real_chrome,
-                    self.stealth,
-                    self.hide_canvas,
-                    self.disable_webgl,
-                )
-            )
-            self.launch_options["extra_http_headers"] = dict(
-                self.launch_options["extra_http_headers"]
-            )
-            self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
-            self.context_options = dict()
-        else:
-            # while `context_options` is left to be used when cdp mode is enabled
-            self.launch_options = dict()
-            self.context_options = dict(
-                _context_kwargs(
-                    self.proxy,
-                    self.locale,
-                    tuple(self.extra_headers.items())
-                    if self.extra_headers
-                    else tuple(),
-                    self.useragent,
-                    self.stealth,
-                )
-            )
-            self.context_options["extra_http_headers"] = dict(
-                self.context_options["extra_http_headers"]
-            )
-            self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
+        super().__init__(max_pages=self.max_pages)
 
     def __create__(self):
         """Create a browser for this instance and context."""
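The ~75 lines of manual config plumbing and the `__initiate_browser_options__` method are replaced by two calls: `self.__validate__(...)` from the new `DynamicSessionMixin` and `super().__init__(...)` from `SyncSession` (both added in `_browsers/_base.py`, +297 lines in this release). A rough sketch of how that cooperative structure fits together, assuming `__validate__` stores validated config on the instance; this is not scrapling's actual base-class code:

    class DynamicSessionMixin:
        def __validate__(self, **kwargs):
            # The real method validates through PlaywrightConfig; here we just
            # attach the values so later code can read self.max_pages etc.
            for name, value in kwargs.items():
                setattr(self, name, value)

    class SyncSession:
        def __init__(self, max_pages: int = 1):
            self.max_pages = max_pages
            self._closed = False

    class DynamicSession(DynamicSessionMixin, SyncSession):
        def __init__(self, max_pages: int = 1, **options):
            self.__validate__(max_pages=max_pages, **options)
            # MRO: DynamicSessionMixin has no __init__, so this reaches SyncSession.
            super().__init__(max_pages=self.max_pages)

Note that `__validate__` is not name-mangled (it has trailing double underscores), which is why the mixin method is callable as written.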
@@ -237,16 +159,14 @@
         # Because rebrowser_playwright doesn't play well with real browsers
         sync_context = sync_playwright
 
-        self.playwright = sync_context().start()
+        self.playwright: Playwright = sync_context().start()
 
         if self.cdp_url: # pragma: no cover
-            self.context = self.playwright.chromium.connect_over_cdp(
-
-            ).new_context(**self.context_options)
-        else:
-            self.context = self.playwright.chromium.launch_persistent_context(
-                user_data_dir="", **self.launch_options
+            self.context = self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url).new_context(
+                **self.context_options
             )
+        else:
+            self.context = self.playwright.chromium.launch_persistent_context(user_data_dir="", **self.launch_options)
 
         if self.init_script: # pragma: no cover
             self.context.add_init_script(path=self.init_script)
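Both launch paths are now single-call forms, and `connect_over_cdp` receives its endpoint through the explicit `endpoint_url` keyword, which is the parameter's actual name in Playwright's Python API. For reference, the two paths look roughly like this in plain Playwright, assuming a Chromium instance already listening on a debug port at `http://localhost:9222`:

    from playwright.sync_api import sync_playwright

    with sync_playwright() as pw:
        # CDP mode: attach to an already-running browser over its debug port.
        browser = pw.chromium.connect_over_cdp(endpoint_url="http://localhost:9222")
        context = browser.new_context()

        # Default mode: a persistent context with a throwaway profile directory,
        # which is what the non-CDP branch above does.
        # context = pw.chromium.launch_persistent_context(user_data_dir="", headless=True)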
@@ -276,56 +196,63 @@
 
         self._closed = True
 
-    def
[... 14 removed lines (old 280-293) not captured in this view ...]
-        if self.disable_resources:
-            page.route("**/*", intercept_route)
-
-        if self.stealth:
-            for script in _compiled_stealth_scripts():
-                page.add_init_script(script=script)
-
-        return self.page_pool.add_page(page)
-
-        # Wait for a page to become available
-        max_wait = 30
-        start_time = time()
-
-        while time() - start_time < max_wait:
-            page_info = self.page_pool.get_ready_page()
-            if page_info:
-                return page_info
-            sleep(0.05)
-
-        raise TimeoutError("No pages available within timeout period")
-
-    def fetch(self, url: str) -> Response:
+    def fetch(
+        self,
+        url: str,
+        google_search: bool = _UNSET,
+        timeout: int | float = _UNSET,
+        wait: int | float = _UNSET,
+        page_action: Optional[Callable] = _UNSET,
+        extra_headers: Optional[Dict[str, str]] = _UNSET,
+        disable_resources: bool = _UNSET,
+        wait_selector: Optional[str] = _UNSET,
+        wait_selector_state: SelectorWaitStates = _UNSET,
+        network_idle: bool = _UNSET,
+        load_dom: bool = _UNSET,
+        selector_config: Optional[Dict] = _UNSET,
+    ) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
         :param url: The Target url.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
+        # Validate all resolved parameters
+        params = validate(
+            dict(
+                google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
+                timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
+                wait=self._get_with_precedence(wait, self.wait, _UNSET),
+                page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
+                extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
+                disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
+                wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
+                wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
+                network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
+                load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
+                selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
+            ),
+            PlaywrightConfig,
+        )
+
         if self._closed: # pragma: no cover
             raise RuntimeError("Context manager has been closed")
 
         final_response = None
         referer = (
-            generate_convincing_referer(url)
-            if (self.google_search and "referer" not in self._headers_keys)
-            else None
+            generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
        )
 
         def handle_response(finished_response: SyncPlaywrightResponse):
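The expanded docstring spells out the `page_action` contract: a plain callable that receives the Playwright `page` object, with its return value discarded (the `_ = params.page_action(...)` call in the next hunk). A hypothetical example; the selector and timeout are illustrative:

    # Dismiss a cookie banner before scraping continues; a side-effecting
    # function is enough since the session ignores the return value.
    def accept_cookies(page):
        page.click("button#accept-cookies", timeout=5_000)

    # e.g. session.fetch("https://example.com", page_action=accept_cookies)

For the async session further down, the callable is awaited, so an `async def` should be passed there instead.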
@@ -336,48 +263,50 @@
         ):
             final_response = finished_response
 
-        page_info = self.
+        page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)
 
         try: # pragma: no cover
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
             first_response = page_info.page.goto(url, referer=referer)
-
+            if params.load_dom:
+                page_info.page.wait_for_load_state(state="domcontentloaded")
 
-            if
+            if params.network_idle:
                 page_info.page.wait_for_load_state("networkidle")
 
             if not first_response:
                 raise RuntimeError(f"Failed to get response for {url}")
 
-            if
+            if params.page_action:
                 try:
-
+                    _ = params.page_action(page_info.page)
                 except Exception as e: # pragma: no cover
                     log.error(f"Error executing page_action: {e}")
 
-            if
+            if params.wait_selector:
                 try:
-                    waiter: Locator = page_info.page.locator(
-                    waiter.first.wait_for(state=
+                    waiter: Locator = page_info.page.locator(params.wait_selector)
+                    waiter.first.wait_for(state=params.wait_selector_state)
                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
                     page_info.page.wait_for_load_state(state="load")
-
-
+                    if params.load_dom:
+                        page_info.page.wait_for_load_state(state="domcontentloaded")
+                    if params.network_idle:
                         page_info.page.wait_for_load_state("networkidle")
                 except Exception as e: # pragma: no cover
-                    log.error(f"Error waiting for selector {
+                    log.error(f"Error waiting for selector {params.wait_selector}: {e}")
 
-            page_info.page.wait_for_timeout(
+            page_info.page.wait_for_timeout(params.wait)
 
             # Create response object
             response = ResponseFactory.from_playwright_response(
-                page_info.page, first_response, final_response,
+                page_info.page, first_response, final_response, params.selector_config
             )
 
-            # Mark the page as
-            page_info.
+            # Mark the page as finished for next use
+            page_info.mark_finished()
 
             return response
 
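Taken together, session-level options now act as defaults that any single `fetch()` call can override, resolved through `_get_with_precedence` and re-validated per request. A hypothetical usage sketch; the import path is assumed, so check scrapling's docs for the canonical one:

    from scrapling.fetchers import DynamicSession  # import path assumed

    with DynamicSession(headless=True, network_idle=True, timeout=30_000) as session:
        # Uses the session defaults set above.
        page = session.fetch("https://example.com")
        # Per-call overrides win over the session defaults:
        slow = session.fetch("https://example.com/reports", network_idle=False, timeout=90_000)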
@@ -385,17 +314,8 @@
             page_info.mark_error()
             raise e
 
-    def get_pool_stats(self) -> Dict[str, int]:
-        """Get statistics about the current page pool"""
-        return {
-            "total_pages": self.page_pool.pages_count,
-            "ready_pages": self.page_pool.ready_count,
-            "busy_pages": self.page_pool.busy_count,
-            "max_pages": self.max_pages,
-        }
-
 
-class AsyncDynamicSession(
+class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
     """An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory."""
 
     def __init__(
@@ -420,6 +340,7 @@ class AsyncDynamicSession(DynamicSession):
         init_script: Optional[str] = None,
         cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
+        load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
         selector_config: Optional[Dict] = None,
     ):
@@ -432,9 +353,10 @@ class AsyncDynamicSession(DynamicSession):
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param cookies: Set cookies for the next request.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
         :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
-        :param page_action: Added for automation. A function that takes the `page` object
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
         :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
         :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
@@ -451,36 +373,32 @@ class AsyncDynamicSession(DynamicSession):
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         """
 
[... 23 removed lines (old 454-476) not captured in this view ...]
+        self.__validate__(
+            wait=wait,
+            proxy=proxy,
+            locale=locale,
+            timeout=timeout,
+            stealth=stealth,
+            cdp_url=cdp_url,
+            cookies=cookies,
+            load_dom=load_dom,
+            headless=headless,
+            useragent=useragent,
+            max_pages=max_pages,
+            real_chrome=real_chrome,
+            page_action=page_action,
+            hide_canvas=hide_canvas,
+            init_script=init_script,
+            network_idle=network_idle,
+            google_search=google_search,
+            extra_headers=extra_headers,
+            wait_selector=wait_selector,
+            disable_webgl=disable_webgl,
+            selector_config=selector_config,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
         )
-
-        self.playwright: Optional[AsyncPlaywright] = None
-        self.context: Optional[AsyncBrowserContext] = None
-        self._lock = Lock()
-        self.__enter__ = None
-        self.__exit__ = None
+        super().__init__(max_pages=self.max_pages)
 
     async def __create__(self):
         """Create a browser for this instance and context."""
@@ -492,17 +410,11 @@ class AsyncDynamicSession(DynamicSession):
         self.playwright: AsyncPlaywright = await async_context().start()
 
         if self.cdp_url:
-            browser = await self.playwright.chromium.connect_over_cdp(
-
-            )
-            self.context: AsyncBrowserContext = await browser.new_context(
-                **self.context_options
-            )
+            browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url)
+            self.context: AsyncBrowserContext = await browser.new_context(**self.context_options)
         else:
-            self.context: AsyncBrowserContext = (
-
-                user_data_dir="", **self.launch_options
-            )
+            self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
+                user_data_dir="", **self.launch_options
             )
 
         if self.init_script: # pragma: no cover
@@ -533,57 +445,63 @@ class AsyncDynamicSession(DynamicSession):
 
         self._closed = True
 
-    async def
[... 15 removed lines (old 537-551) not captured in this view ...]
-        if self.disable_resources:
-            await page.route("**/*", async_intercept_route)
-
-        if self.stealth:
-            for script in _compiled_stealth_scripts():
-                await page.add_init_script(script=script)
-
-        return self.page_pool.add_page(page)
-
-        # Wait for a page to become available
-        max_wait = 30 # seconds
-        start_time = time()
-
-        while time() - start_time < max_wait: # pragma: no cover
-            page_info = self.page_pool.get_ready_page()
-            if page_info:
-                return page_info
-            await asyncio_sleep(0.05)
-
-        raise TimeoutError("No pages available within timeout period")
-
-    async def fetch(self, url: str) -> Response:
+    async def fetch(
+        self,
+        url: str,
+        google_search: bool = _UNSET,
+        timeout: int | float = _UNSET,
+        wait: int | float = _UNSET,
+        page_action: Optional[Callable] = _UNSET,
+        extra_headers: Optional[Dict[str, str]] = _UNSET,
+        disable_resources: bool = _UNSET,
+        wait_selector: Optional[str] = _UNSET,
+        wait_selector_state: SelectorWaitStates = _UNSET,
+        network_idle: bool = _UNSET,
+        load_dom: bool = _UNSET,
+        selector_config: Optional[Dict] = _UNSET,
+    ) -> Response:
         """Opens up the browser and do your request based on your chosen options.
 
         :param url: The Target url.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
+        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
+        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
+        # Validate all resolved parameters
+        params = validate(
+            dict(
+                google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
+                timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
+                wait=self._get_with_precedence(wait, self.wait, _UNSET),
+                page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
+                extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
+                disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
+                wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
+                wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
+                network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
+                load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
+                selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
+            ),
+            PlaywrightConfig,
+        )
+
         if self._closed: # pragma: no cover
             raise RuntimeError("Context manager has been closed")
 
         final_response = None
         referer = (
-            generate_convincing_referer(url)
-            if (self.google_search and "referer" not in self._headers_keys)
-            else None
+            generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
         )
 
         async def handle_response(finished_response: AsyncPlaywrightResponse):
@@ -594,48 +512,50 @@
         ):
             final_response = finished_response
 
-        page_info = await self.
+        page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)
 
         try:
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
             first_response = await page_info.page.goto(url, referer=referer)
-
+            if self.load_dom:
+                await page_info.page.wait_for_load_state(state="domcontentloaded")
 
-            if
+            if params.network_idle:
                 await page_info.page.wait_for_load_state("networkidle")
 
             if not first_response:
                 raise RuntimeError(f"Failed to get response for {url}")
 
-            if
+            if params.page_action:
                 try:
-
+                    _ = await params.page_action(page_info.page)
                 except Exception as e:
                     log.error(f"Error executing page_action: {e}")
 
-            if
+            if params.wait_selector:
                 try:
-                    waiter: AsyncLocator = page_info.page.locator(
-                    await waiter.first.wait_for(state=
+                    waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
+                    await waiter.first.wait_for(state=params.wait_selector_state)
                     # Wait again after waiting for the selector, helpful with protections like Cloudflare
                     await page_info.page.wait_for_load_state(state="load")
-
-
+                    if self.load_dom:
+                        await page_info.page.wait_for_load_state(state="domcontentloaded")
+                    if params.network_idle:
                         await page_info.page.wait_for_load_state("networkidle")
                 except Exception as e:
-                    log.error(f"Error waiting for selector {
+                    log.error(f"Error waiting for selector {params.wait_selector}: {e}")
 
-            await page_info.page.wait_for_timeout(
+            await page_info.page.wait_for_timeout(params.wait)
 
             # Create response object
             response = await ResponseFactory.from_async_playwright_response(
-                page_info.page, first_response, final_response,
+                page_info.page, first_response, final_response, params.selector_config
             )
 
-            # Mark the page as
-            page_info.
+            # Mark the page as finished for next use
+            page_info.mark_finished()
 
             return response
 
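The async session mirrors the sync API one-for-one, so the same per-call override precedence applies with `await`. A hypothetical sketch; the import path and async context-manager usage are assumed, and a `page_action` passed here should be an `async def` since the session awaits it:

    import asyncio
    from scrapling.fetchers import AsyncDynamicSession  # import path assumed

    async def main():
        async with AsyncDynamicSession(max_pages=2) as session:
            # Pages come from the shared pool; per-call arguments still win.
            page = await session.fetch("https://example.com", wait_selector="#content")

    asyncio.run(main())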