scrapling-0.3.1-py3-none-any.whl → scrapling-0.3.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/cli.py +38 -51
- scrapling/core/_html_utils.py +3 -9
- scrapling/core/ai.py +5 -13
- scrapling/core/custom_types.py +19 -61
- scrapling/core/mixins.py +6 -28
- scrapling/core/shell.py +49 -127
- scrapling/core/storage.py +2 -8
- scrapling/core/translator.py +8 -20
- scrapling/core/utils/__init__.py +10 -0
- scrapling/core/utils/_shell.py +48 -0
- scrapling/core/{utils.py → utils/_utils.py} +5 -21
- scrapling/engines/__init__.py +0 -16
- scrapling/engines/_browsers/_base.py +297 -0
- scrapling/engines/_browsers/_camoufox.py +227 -296
- scrapling/engines/_browsers/_config_tools.py +2 -1
- scrapling/engines/_browsers/_controllers.py +209 -281
- scrapling/engines/_browsers/_page.py +37 -15
- scrapling/engines/_browsers/_validators.py +9 -15
- scrapling/engines/constants.py +3 -6
- scrapling/engines/static.py +25 -75
- scrapling/engines/toolbelt/__init__.py +1 -20
- scrapling/engines/toolbelt/convertor.py +95 -86
- scrapling/engines/toolbelt/custom.py +7 -99
- scrapling/engines/toolbelt/fingerprints.py +1 -3
- scrapling/engines/toolbelt/navigation.py +4 -58
- scrapling/fetchers.py +29 -24
- scrapling/parser.py +45 -122
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/METADATA +54 -46
- scrapling-0.3.2.dist-info/RECORD +44 -0
- scrapling-0.3.1.dist-info/RECORD +0 -41
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/WHEEL +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,6 @@
|
|
1
|
-
from time import time, sleep
|
2
|
-
from asyncio import sleep as asyncio_sleep, Lock
|
3
|
-
|
4
1
|
from playwright.sync_api import (
|
5
2
|
Response as SyncPlaywrightResponse,
|
6
3
|
sync_playwright,
|
7
|
-
BrowserContext,
|
8
4
|
Playwright,
|
9
5
|
Locator,
|
10
6
|
)
|
@@ -21,9 +17,8 @@ from rebrowser_playwright.async_api import (
|
|
21
17
|
)
|
22
18
|
|
23
19
|
from scrapling.core.utils import log
|
24
|
-
from .
|
20
|
+
from ._base import SyncSession, AsyncSession, DynamicSessionMixin
|
25
21
|
from ._validators import validate, PlaywrightConfig
|
26
|
-
from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
|
27
22
|
from scrapling.core._types import (
|
28
23
|
Dict,
|
29
24
|
List,
|
@@ -31,16 +26,16 @@ from scrapling.core._types import (
|
|
31
26
|
Callable,
|
32
27
|
SelectorWaitStates,
|
33
28
|
)
|
34
|
-
from scrapling.engines.toolbelt import (
|
29
|
+
from scrapling.engines.toolbelt.convertor import (
|
35
30
|
Response,
|
36
31
|
ResponseFactory,
|
37
|
-
generate_convincing_referer,
|
38
|
-
intercept_route,
|
39
|
-
async_intercept_route,
|
40
32
|
)
|
33
|
+
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
34
|
+
|
35
|
+
_UNSET = object()
|
41
36
|
|
42
37
|
|
43
|
-
class DynamicSession:
|
38
|
+
class DynamicSession(DynamicSessionMixin, SyncSession):
|
44
39
|
"""A Browser session manager with page pooling."""
|
45
40
|
|
46
41
|
__slots__ = (
|
@@ -59,6 +54,7 @@ class DynamicSession:
|
|
59
54
|
"cookies",
|
60
55
|
"disable_resources",
|
61
56
|
"network_idle",
|
57
|
+
"load_dom",
|
62
58
|
"wait_selector",
|
63
59
|
"init_script",
|
64
60
|
"wait_selector_state",
|
@@ -98,6 +94,7 @@ class DynamicSession:
|
|
98
94
|
init_script: Optional[str] = None,
|
99
95
|
cookies: Optional[List[Dict]] = None,
|
100
96
|
network_idle: bool = False,
|
97
|
+
load_dom: bool = True,
|
101
98
|
wait_selector_state: SelectorWaitStates = "attached",
|
102
99
|
selector_config: Optional[Dict] = None,
|
103
100
|
):
|
@@ -112,7 +109,7 @@ class DynamicSession:
|
|
112
109
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
113
110
|
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
114
111
|
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
115
|
-
:param page_action: Added for automation. A function that takes the `page` object
|
112
|
+
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
116
113
|
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
117
114
|
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
118
115
|
:param locale: Set the locale for the browser if wanted. The default value is `en-US`.
|
@@ -121,114 +118,39 @@ class DynamicSession:
|
|
121
118
|
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
122
119
|
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
123
120
|
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
|
121
|
+
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
124
122
|
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
|
125
123
|
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
126
124
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
127
125
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
128
126
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
129
127
|
"""
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
}
|
155
|
-
config = validate(params, PlaywrightConfig)
|
156
|
-
|
157
|
-
self.max_pages = config.max_pages
|
158
|
-
self.headless = config.headless
|
159
|
-
self.hide_canvas = config.hide_canvas
|
160
|
-
self.disable_webgl = config.disable_webgl
|
161
|
-
self.real_chrome = config.real_chrome
|
162
|
-
self.stealth = config.stealth
|
163
|
-
self.google_search = config.google_search
|
164
|
-
self.wait = config.wait
|
165
|
-
self.proxy = config.proxy
|
166
|
-
self.locale = config.locale
|
167
|
-
self.extra_headers = config.extra_headers
|
168
|
-
self.useragent = config.useragent
|
169
|
-
self.timeout = config.timeout
|
170
|
-
self.cookies = config.cookies
|
171
|
-
self.disable_resources = config.disable_resources
|
172
|
-
self.cdp_url = config.cdp_url
|
173
|
-
self.network_idle = config.network_idle
|
174
|
-
self.wait_selector = config.wait_selector
|
175
|
-
self.init_script = config.init_script
|
176
|
-
self.wait_selector_state = config.wait_selector_state
|
177
|
-
|
178
|
-
self.playwright: Optional[Playwright] = None
|
179
|
-
self.context: Optional[BrowserContext] = None
|
180
|
-
self.page_pool = PagePool(self.max_pages)
|
181
|
-
self._closed = False
|
182
|
-
self.selector_config = config.selector_config
|
183
|
-
self.page_action = config.page_action
|
184
|
-
self._headers_keys = (
|
185
|
-
set(map(str.lower, self.extra_headers.keys()))
|
186
|
-
if self.extra_headers
|
187
|
-
else set()
|
128
|
+
self.__validate__(
|
129
|
+
wait=wait,
|
130
|
+
proxy=proxy,
|
131
|
+
locale=locale,
|
132
|
+
timeout=timeout,
|
133
|
+
stealth=stealth,
|
134
|
+
cdp_url=cdp_url,
|
135
|
+
cookies=cookies,
|
136
|
+
load_dom=load_dom,
|
137
|
+
headless=headless,
|
138
|
+
useragent=useragent,
|
139
|
+
max_pages=__max_pages,
|
140
|
+
real_chrome=real_chrome,
|
141
|
+
page_action=page_action,
|
142
|
+
hide_canvas=hide_canvas,
|
143
|
+
init_script=init_script,
|
144
|
+
network_idle=network_idle,
|
145
|
+
google_search=google_search,
|
146
|
+
extra_headers=extra_headers,
|
147
|
+
wait_selector=wait_selector,
|
148
|
+
disable_webgl=disable_webgl,
|
149
|
+
selector_config=selector_config,
|
150
|
+
disable_resources=disable_resources,
|
151
|
+
wait_selector_state=wait_selector_state,
|
188
152
|
)
|
189
|
-
self.
|
190
|
-
|
191
|
-
def __initiate_browser_options__(self):
|
192
|
-
if not self.cdp_url:
|
193
|
-
# `launch_options` is used with persistent context
|
194
|
-
self.launch_options = dict(
|
195
|
-
_launch_kwargs(
|
196
|
-
self.headless,
|
197
|
-
self.proxy,
|
198
|
-
self.locale,
|
199
|
-
tuple(self.extra_headers.items())
|
200
|
-
if self.extra_headers
|
201
|
-
else tuple(),
|
202
|
-
self.useragent,
|
203
|
-
self.real_chrome,
|
204
|
-
self.stealth,
|
205
|
-
self.hide_canvas,
|
206
|
-
self.disable_webgl,
|
207
|
-
)
|
208
|
-
)
|
209
|
-
self.launch_options["extra_http_headers"] = dict(
|
210
|
-
self.launch_options["extra_http_headers"]
|
211
|
-
)
|
212
|
-
self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
|
213
|
-
self.context_options = dict()
|
214
|
-
else:
|
215
|
-
# while `context_options` is left to be used when cdp mode is enabled
|
216
|
-
self.launch_options = dict()
|
217
|
-
self.context_options = dict(
|
218
|
-
_context_kwargs(
|
219
|
-
self.proxy,
|
220
|
-
self.locale,
|
221
|
-
tuple(self.extra_headers.items())
|
222
|
-
if self.extra_headers
|
223
|
-
else tuple(),
|
224
|
-
self.useragent,
|
225
|
-
self.stealth,
|
226
|
-
)
|
227
|
-
)
|
228
|
-
self.context_options["extra_http_headers"] = dict(
|
229
|
-
self.context_options["extra_http_headers"]
|
230
|
-
)
|
231
|
-
self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
|
153
|
+
super().__init__(max_pages=self.max_pages)
|
232
154
|
|
233
155
|
def __create__(self):
|
234
156
|
"""Create a browser for this instance and context."""
|
@@ -237,16 +159,18 @@ class DynamicSession:
|
|
237
159
|
# Because rebrowser_playwright doesn't play well with real browsers
|
238
160
|
sync_context = sync_playwright
|
239
161
|
|
240
|
-
self.playwright = sync_context().start()
|
162
|
+
self.playwright: Playwright = sync_context().start()
|
241
163
|
|
242
164
|
if self.cdp_url: # pragma: no cover
|
243
|
-
self.context = self.playwright.chromium.connect_over_cdp(
|
244
|
-
|
245
|
-
).new_context(**self.context_options)
|
246
|
-
else:
|
247
|
-
self.context = self.playwright.chromium.launch_persistent_context(
|
248
|
-
user_data_dir="", **self.launch_options
|
165
|
+
self.context = self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url).new_context(
|
166
|
+
**self.context_options
|
249
167
|
)
|
168
|
+
else:
|
169
|
+
self.context = self.playwright.chromium.launch_persistent_context(user_data_dir="", **self.launch_options)
|
170
|
+
|
171
|
+
# Get the default page and close it
|
172
|
+
default_page = self.context.pages[0]
|
173
|
+
default_page.close()
|
250
174
|
|
251
175
|
if self.init_script: # pragma: no cover
|
252
176
|
self.context.add_init_script(path=self.init_script)
|
@@ -276,56 +200,63 @@ class DynamicSession:
|
|
276
200
|
|
277
201
|
self._closed = True
|
278
202
|
|
279
|
-
def
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
if self.disable_resources:
|
295
|
-
page.route("**/*", intercept_route)
|
296
|
-
|
297
|
-
if self.stealth:
|
298
|
-
for script in _compiled_stealth_scripts():
|
299
|
-
page.add_init_script(script=script)
|
300
|
-
|
301
|
-
return self.page_pool.add_page(page)
|
302
|
-
|
303
|
-
# Wait for a page to become available
|
304
|
-
max_wait = 30
|
305
|
-
start_time = time()
|
306
|
-
|
307
|
-
while time() - start_time < max_wait:
|
308
|
-
page_info = self.page_pool.get_ready_page()
|
309
|
-
if page_info:
|
310
|
-
return page_info
|
311
|
-
sleep(0.05)
|
312
|
-
|
313
|
-
raise TimeoutError("No pages available within timeout period")
|
314
|
-
|
315
|
-
def fetch(self, url: str) -> Response:
|
203
|
+
def fetch(
|
204
|
+
self,
|
205
|
+
url: str,
|
206
|
+
google_search: bool = _UNSET,
|
207
|
+
timeout: int | float = _UNSET,
|
208
|
+
wait: int | float = _UNSET,
|
209
|
+
page_action: Optional[Callable] = _UNSET,
|
210
|
+
extra_headers: Optional[Dict[str, str]] = _UNSET,
|
211
|
+
disable_resources: bool = _UNSET,
|
212
|
+
wait_selector: Optional[str] = _UNSET,
|
213
|
+
wait_selector_state: SelectorWaitStates = _UNSET,
|
214
|
+
network_idle: bool = _UNSET,
|
215
|
+
load_dom: bool = _UNSET,
|
216
|
+
selector_config: Optional[Dict] = _UNSET,
|
217
|
+
) -> Response:
|
316
218
|
"""Opens up the browser and do your request based on your chosen options.
|
317
219
|
|
318
220
|
:param url: The Target url.
|
221
|
+
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
222
|
+
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
223
|
+
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
224
|
+
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
225
|
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
226
|
+
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
227
|
+
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
228
|
+
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
229
|
+
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
230
|
+
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
231
|
+
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
232
|
+
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
233
|
+
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
319
234
|
:return: A `Response` object.
|
320
235
|
"""
|
236
|
+
# Validate all resolved parameters
|
237
|
+
params = validate(
|
238
|
+
dict(
|
239
|
+
google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
|
240
|
+
timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
|
241
|
+
wait=self._get_with_precedence(wait, self.wait, _UNSET),
|
242
|
+
page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
|
243
|
+
extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
|
244
|
+
disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
|
245
|
+
wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
|
246
|
+
wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
|
247
|
+
network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
|
248
|
+
load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
|
249
|
+
selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
|
250
|
+
),
|
251
|
+
PlaywrightConfig,
|
252
|
+
)
|
253
|
+
|
321
254
|
if self._closed: # pragma: no cover
|
322
255
|
raise RuntimeError("Context manager has been closed")
|
323
256
|
|
324
257
|
final_response = None
|
325
258
|
referer = (
|
326
|
-
generate_convincing_referer(url)
|
327
|
-
if (self.google_search and "referer" not in self._headers_keys)
|
328
|
-
else None
|
259
|
+
generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
|
329
260
|
)
|
330
261
|
|
331
262
|
def handle_response(finished_response: SyncPlaywrightResponse):
|
@@ -336,48 +267,50 @@ class DynamicSession:
|
|
336
267
|
):
|
337
268
|
final_response = finished_response
|
338
269
|
|
339
|
-
page_info = self.
|
270
|
+
page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
|
340
271
|
page_info.mark_busy(url=url)
|
341
272
|
|
342
273
|
try: # pragma: no cover
|
343
274
|
# Navigate to URL and wait for a specified state
|
344
275
|
page_info.page.on("response", handle_response)
|
345
276
|
first_response = page_info.page.goto(url, referer=referer)
|
346
|
-
|
277
|
+
if params.load_dom:
|
278
|
+
page_info.page.wait_for_load_state(state="domcontentloaded")
|
347
279
|
|
348
|
-
if
|
280
|
+
if params.network_idle:
|
349
281
|
page_info.page.wait_for_load_state("networkidle")
|
350
282
|
|
351
283
|
if not first_response:
|
352
284
|
raise RuntimeError(f"Failed to get response for {url}")
|
353
285
|
|
354
|
-
if
|
286
|
+
if params.page_action:
|
355
287
|
try:
|
356
|
-
|
288
|
+
_ = params.page_action(page_info.page)
|
357
289
|
except Exception as e: # pragma: no cover
|
358
290
|
log.error(f"Error executing page_action: {e}")
|
359
291
|
|
360
|
-
if
|
292
|
+
if params.wait_selector:
|
361
293
|
try:
|
362
|
-
waiter: Locator = page_info.page.locator(
|
363
|
-
waiter.first.wait_for(state=
|
294
|
+
waiter: Locator = page_info.page.locator(params.wait_selector)
|
295
|
+
waiter.first.wait_for(state=params.wait_selector_state)
|
364
296
|
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
365
297
|
page_info.page.wait_for_load_state(state="load")
|
366
|
-
|
367
|
-
|
298
|
+
if params.load_dom:
|
299
|
+
page_info.page.wait_for_load_state(state="domcontentloaded")
|
300
|
+
if params.network_idle:
|
368
301
|
page_info.page.wait_for_load_state("networkidle")
|
369
302
|
except Exception as e: # pragma: no cover
|
370
|
-
log.error(f"Error waiting for selector {
|
303
|
+
log.error(f"Error waiting for selector {params.wait_selector}: {e}")
|
371
304
|
|
372
|
-
page_info.page.wait_for_timeout(
|
305
|
+
page_info.page.wait_for_timeout(params.wait)
|
373
306
|
|
374
307
|
# Create response object
|
375
308
|
response = ResponseFactory.from_playwright_response(
|
376
|
-
page_info.page, first_response, final_response,
|
309
|
+
page_info.page, first_response, final_response, params.selector_config
|
377
310
|
)
|
378
311
|
|
379
|
-
# Mark the page as
|
380
|
-
page_info.
|
312
|
+
# Mark the page as finished for next use
|
313
|
+
page_info.mark_finished()
|
381
314
|
|
382
315
|
return response
|
383
316
|
|
@@ -385,17 +318,8 @@ class DynamicSession:
|
|
385
318
|
page_info.mark_error()
|
386
319
|
raise e
|
387
320
|
|
388
|
-
def get_pool_stats(self) -> Dict[str, int]:
|
389
|
-
"""Get statistics about the current page pool"""
|
390
|
-
return {
|
391
|
-
"total_pages": self.page_pool.pages_count,
|
392
|
-
"ready_pages": self.page_pool.ready_count,
|
393
|
-
"busy_pages": self.page_pool.busy_count,
|
394
|
-
"max_pages": self.max_pages,
|
395
|
-
}
|
396
|
-
|
397
321
|
|
398
|
-
class AsyncDynamicSession(
|
322
|
+
class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
399
323
|
"""An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory."""
|
400
324
|
|
401
325
|
def __init__(
|
@@ -420,6 +344,7 @@ class AsyncDynamicSession(DynamicSession):
|
|
420
344
|
init_script: Optional[str] = None,
|
421
345
|
cookies: Optional[List[Dict]] = None,
|
422
346
|
network_idle: bool = False,
|
347
|
+
load_dom: bool = True,
|
423
348
|
wait_selector_state: SelectorWaitStates = "attached",
|
424
349
|
selector_config: Optional[Dict] = None,
|
425
350
|
):
|
@@ -432,9 +357,10 @@ class AsyncDynamicSession(DynamicSession):
|
|
432
357
|
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
433
358
|
:param cookies: Set cookies for the next request.
|
434
359
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
360
|
+
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
435
361
|
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
436
362
|
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
437
|
-
:param page_action: Added for automation. A function that takes the `page` object
|
363
|
+
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
438
364
|
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
439
365
|
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
440
366
|
:param locale: Set the locale for the browser if wanted. The default value is `en-US`.
|
@@ -451,36 +377,32 @@ class AsyncDynamicSession(DynamicSession):
|
|
451
377
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
452
378
|
"""
|
453
379
|
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
380
|
+
self.__validate__(
|
381
|
+
wait=wait,
|
382
|
+
proxy=proxy,
|
383
|
+
locale=locale,
|
384
|
+
timeout=timeout,
|
385
|
+
stealth=stealth,
|
386
|
+
cdp_url=cdp_url,
|
387
|
+
cookies=cookies,
|
388
|
+
load_dom=load_dom,
|
389
|
+
headless=headless,
|
390
|
+
useragent=useragent,
|
391
|
+
max_pages=max_pages,
|
392
|
+
real_chrome=real_chrome,
|
393
|
+
page_action=page_action,
|
394
|
+
hide_canvas=hide_canvas,
|
395
|
+
init_script=init_script,
|
396
|
+
network_idle=network_idle,
|
397
|
+
google_search=google_search,
|
398
|
+
extra_headers=extra_headers,
|
399
|
+
wait_selector=wait_selector,
|
400
|
+
disable_webgl=disable_webgl,
|
401
|
+
selector_config=selector_config,
|
402
|
+
disable_resources=disable_resources,
|
403
|
+
wait_selector_state=wait_selector_state,
|
477
404
|
)
|
478
|
-
|
479
|
-
self.playwright: Optional[AsyncPlaywright] = None
|
480
|
-
self.context: Optional[AsyncBrowserContext] = None
|
481
|
-
self._lock = Lock()
|
482
|
-
self.__enter__ = None
|
483
|
-
self.__exit__ = None
|
405
|
+
super().__init__(max_pages=self.max_pages)
|
484
406
|
|
485
407
|
async def __create__(self):
|
486
408
|
"""Create a browser for this instance and context."""
|
@@ -492,19 +414,17 @@ class AsyncDynamicSession(DynamicSession):
|
|
492
414
|
self.playwright: AsyncPlaywright = await async_context().start()
|
493
415
|
|
494
416
|
if self.cdp_url:
|
495
|
-
browser = await self.playwright.chromium.connect_over_cdp(
|
496
|
-
|
497
|
-
)
|
498
|
-
self.context: AsyncBrowserContext = await browser.new_context(
|
499
|
-
**self.context_options
|
500
|
-
)
|
417
|
+
browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url)
|
418
|
+
self.context: AsyncBrowserContext = await browser.new_context(**self.context_options)
|
501
419
|
else:
|
502
|
-
self.context: AsyncBrowserContext = (
|
503
|
-
|
504
|
-
user_data_dir="", **self.launch_options
|
505
|
-
)
|
420
|
+
self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
|
421
|
+
user_data_dir="", **self.launch_options
|
506
422
|
)
|
507
423
|
|
424
|
+
# Get the default page and close it
|
425
|
+
default_page = self.context.pages[0]
|
426
|
+
await default_page.close()
|
427
|
+
|
508
428
|
if self.init_script: # pragma: no cover
|
509
429
|
await self.context.add_init_script(path=self.init_script)
|
510
430
|
|
@@ -533,57 +453,63 @@ class AsyncDynamicSession(DynamicSession):
|
|
533
453
|
|
534
454
|
self._closed = True
|
535
455
|
|
536
|
-
async def
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
if self.disable_resources:
|
553
|
-
await page.route("**/*", async_intercept_route)
|
554
|
-
|
555
|
-
if self.stealth:
|
556
|
-
for script in _compiled_stealth_scripts():
|
557
|
-
await page.add_init_script(script=script)
|
558
|
-
|
559
|
-
return self.page_pool.add_page(page)
|
560
|
-
|
561
|
-
# Wait for a page to become available
|
562
|
-
max_wait = 30 # seconds
|
563
|
-
start_time = time()
|
564
|
-
|
565
|
-
while time() - start_time < max_wait: # pragma: no cover
|
566
|
-
page_info = self.page_pool.get_ready_page()
|
567
|
-
if page_info:
|
568
|
-
return page_info
|
569
|
-
await asyncio_sleep(0.05)
|
570
|
-
|
571
|
-
raise TimeoutError("No pages available within timeout period")
|
572
|
-
|
573
|
-
async def fetch(self, url: str) -> Response:
|
456
|
+
async def fetch(
|
457
|
+
self,
|
458
|
+
url: str,
|
459
|
+
google_search: bool = _UNSET,
|
460
|
+
timeout: int | float = _UNSET,
|
461
|
+
wait: int | float = _UNSET,
|
462
|
+
page_action: Optional[Callable] = _UNSET,
|
463
|
+
extra_headers: Optional[Dict[str, str]] = _UNSET,
|
464
|
+
disable_resources: bool = _UNSET,
|
465
|
+
wait_selector: Optional[str] = _UNSET,
|
466
|
+
wait_selector_state: SelectorWaitStates = _UNSET,
|
467
|
+
network_idle: bool = _UNSET,
|
468
|
+
load_dom: bool = _UNSET,
|
469
|
+
selector_config: Optional[Dict] = _UNSET,
|
470
|
+
) -> Response:
|
574
471
|
"""Opens up the browser and do your request based on your chosen options.
|
575
472
|
|
576
473
|
:param url: The Target url.
|
474
|
+
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
475
|
+
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
476
|
+
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
477
|
+
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
478
|
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
479
|
+
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
480
|
+
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
481
|
+
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
482
|
+
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
483
|
+
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
484
|
+
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
485
|
+
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
486
|
+
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
577
487
|
:return: A `Response` object.
|
578
488
|
"""
|
489
|
+
# Validate all resolved parameters
|
490
|
+
params = validate(
|
491
|
+
dict(
|
492
|
+
google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
|
493
|
+
timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
|
494
|
+
wait=self._get_with_precedence(wait, self.wait, _UNSET),
|
495
|
+
page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
|
496
|
+
extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
|
497
|
+
disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
|
498
|
+
wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
|
499
|
+
wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
|
500
|
+
network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
|
501
|
+
load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
|
502
|
+
selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
|
503
|
+
),
|
504
|
+
PlaywrightConfig,
|
505
|
+
)
|
506
|
+
|
579
507
|
if self._closed: # pragma: no cover
|
580
508
|
raise RuntimeError("Context manager has been closed")
|
581
509
|
|
582
510
|
final_response = None
|
583
511
|
referer = (
|
584
|
-
generate_convincing_referer(url)
|
585
|
-
if (self.google_search and "referer" not in self._headers_keys)
|
586
|
-
else None
|
512
|
+
generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
|
587
513
|
)
|
588
514
|
|
589
515
|
async def handle_response(finished_response: AsyncPlaywrightResponse):
|
@@ -594,48 +520,50 @@ class AsyncDynamicSession(DynamicSession):
|
|
594
520
|
):
|
595
521
|
final_response = finished_response
|
596
522
|
|
597
|
-
page_info = await self.
|
523
|
+
page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
|
598
524
|
page_info.mark_busy(url=url)
|
599
525
|
|
600
526
|
try:
|
601
527
|
# Navigate to URL and wait for a specified state
|
602
528
|
page_info.page.on("response", handle_response)
|
603
529
|
first_response = await page_info.page.goto(url, referer=referer)
|
604
|
-
|
530
|
+
if self.load_dom:
|
531
|
+
await page_info.page.wait_for_load_state(state="domcontentloaded")
|
605
532
|
|
606
|
-
if
|
533
|
+
if params.network_idle:
|
607
534
|
await page_info.page.wait_for_load_state("networkidle")
|
608
535
|
|
609
536
|
if not first_response:
|
610
537
|
raise RuntimeError(f"Failed to get response for {url}")
|
611
538
|
|
612
|
-
if
|
539
|
+
if params.page_action:
|
613
540
|
try:
|
614
|
-
|
541
|
+
_ = await params.page_action(page_info.page)
|
615
542
|
except Exception as e:
|
616
543
|
log.error(f"Error executing page_action: {e}")
|
617
544
|
|
618
|
-
if
|
545
|
+
if params.wait_selector:
|
619
546
|
try:
|
620
|
-
waiter: AsyncLocator = page_info.page.locator(
|
621
|
-
await waiter.first.wait_for(state=
|
547
|
+
waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
|
548
|
+
await waiter.first.wait_for(state=params.wait_selector_state)
|
622
549
|
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
623
550
|
await page_info.page.wait_for_load_state(state="load")
|
624
|
-
|
625
|
-
|
551
|
+
if self.load_dom:
|
552
|
+
await page_info.page.wait_for_load_state(state="domcontentloaded")
|
553
|
+
if params.network_idle:
|
626
554
|
await page_info.page.wait_for_load_state("networkidle")
|
627
555
|
except Exception as e:
|
628
|
-
log.error(f"Error waiting for selector {
|
556
|
+
log.error(f"Error waiting for selector {params.wait_selector}: {e}")
|
629
557
|
|
630
|
-
await page_info.page.wait_for_timeout(
|
558
|
+
await page_info.page.wait_for_timeout(params.wait)
|
631
559
|
|
632
560
|
# Create response object
|
633
561
|
response = await ResponseFactory.from_async_playwright_response(
|
634
|
-
page_info.page, first_response, final_response,
|
562
|
+
page_info.page, first_response, final_response, params.selector_config
|
635
563
|
)
|
636
564
|
|
637
|
-
# Mark the page as
|
638
|
-
page_info.
|
565
|
+
# Mark the page as finished for next use
|
566
|
+
page_info.mark_finished()
|
639
567
|
|
640
568
|
return response
|
641
569
|
|