scrapling-0.3-py3-none-any.whl → scrapling-0.3.2-py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/cli.py +38 -51
- scrapling/core/_html_utils.py +3 -9
- scrapling/core/ai.py +5 -13
- scrapling/core/custom_types.py +19 -61
- scrapling/core/mixins.py +6 -28
- scrapling/core/shell.py +51 -129
- scrapling/core/storage.py +2 -8
- scrapling/core/translator.py +8 -20
- scrapling/core/utils/__init__.py +10 -0
- scrapling/core/utils/_shell.py +48 -0
- scrapling/core/{utils.py → utils/_utils.py} +5 -21
- scrapling/engines/__init__.py +0 -16
- scrapling/engines/_browsers/_base.py +297 -0
- scrapling/engines/_browsers/_camoufox.py +238 -293
- scrapling/engines/_browsers/_config_tools.py +2 -1
- scrapling/engines/_browsers/_controllers.py +220 -278
- scrapling/engines/_browsers/_page.py +37 -15
- scrapling/engines/_browsers/_validators.py +29 -15
- scrapling/engines/constants.py +3 -6
- scrapling/engines/static.py +25 -75
- scrapling/engines/toolbelt/__init__.py +1 -20
- scrapling/engines/toolbelt/convertor.py +95 -86
- scrapling/engines/toolbelt/custom.py +7 -99
- scrapling/engines/toolbelt/fingerprints.py +1 -3
- scrapling/engines/toolbelt/navigation.py +4 -58
- scrapling/fetchers.py +41 -24
- scrapling/parser.py +45 -122
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/METADATA +57 -47
- scrapling-0.3.2.dist-info/RECORD +44 -0
- scrapling-0.3.dist-info/RECORD +0 -41
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/WHEEL +0 -0
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,6 @@
|
|
1
|
-
from time import time, sleep
|
2
|
-
from asyncio import sleep as asyncio_sleep, Lock
|
3
|
-
|
4
1
|
from playwright.sync_api import (
|
5
2
|
Response as SyncPlaywrightResponse,
|
6
3
|
sync_playwright,
|
7
|
-
BrowserContext,
|
8
4
|
Playwright,
|
9
5
|
Locator,
|
10
6
|
)
|
@@ -21,9 +17,8 @@ from rebrowser_playwright.async_api import (
|
|
21
17
|
)
|
22
18
|
|
23
19
|
from scrapling.core.utils import log
|
24
|
-
from .
|
20
|
+
from ._base import SyncSession, AsyncSession, DynamicSessionMixin
|
25
21
|
from ._validators import validate, PlaywrightConfig
|
26
|
-
from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
|
27
22
|
from scrapling.core._types import (
|
28
23
|
Dict,
|
29
24
|
List,
|
@@ -31,16 +26,16 @@ from scrapling.core._types import (
|
|
31
26
|
Callable,
|
32
27
|
SelectorWaitStates,
|
33
28
|
)
|
34
|
-
from scrapling.engines.toolbelt import (
|
29
|
+
from scrapling.engines.toolbelt.convertor import (
|
35
30
|
Response,
|
36
31
|
ResponseFactory,
|
37
|
-
generate_convincing_referer,
|
38
|
-
intercept_route,
|
39
|
-
async_intercept_route,
|
40
32
|
)
|
33
|
+
from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
|
34
|
+
|
35
|
+
_UNSET = object()
|
41
36
|
|
42
37
|
|
43
|
-
class DynamicSession:
|
38
|
+
class DynamicSession(DynamicSessionMixin, SyncSession):
|
44
39
|
"""A Browser session manager with page pooling."""
|
45
40
|
|
46
41
|
__slots__ = (
|
@@ -59,7 +54,9 @@ class DynamicSession:
|
|
59
54
|
"cookies",
|
60
55
|
"disable_resources",
|
61
56
|
"network_idle",
|
57
|
+
"load_dom",
|
62
58
|
"wait_selector",
|
59
|
+
"init_script",
|
63
60
|
"wait_selector_state",
|
64
61
|
"wait",
|
65
62
|
"playwright",
|
@@ -94,8 +91,10 @@ class DynamicSession:
|
|
94
91
|
timeout: int | float = 30000,
|
95
92
|
disable_resources: bool = False,
|
96
93
|
wait_selector: Optional[str] = None,
|
94
|
+
init_script: Optional[str] = None,
|
97
95
|
cookies: Optional[List[Dict]] = None,
|
98
96
|
network_idle: bool = False,
|
97
|
+
load_dom: bool = True,
|
99
98
|
wait_selector_state: SelectorWaitStates = "attached",
|
100
99
|
selector_config: Optional[Dict] = None,
|
101
100
|
):
|
@@ -110,120 +109,48 @@ class DynamicSession:
|
|
110
109
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
111
110
|
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
112
111
|
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
113
|
-
:param page_action: Added for automation. A function that takes the `page` object
|
112
|
+
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
114
113
|
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
114
|
+
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
115
115
|
:param locale: Set the locale for the browser if wanted. The default value is `en-US`.
|
116
116
|
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
117
117
|
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
|
118
118
|
:param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
|
119
119
|
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
120
120
|
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
|
121
|
+
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
121
122
|
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
|
122
123
|
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
123
124
|
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
124
125
|
:param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
|
125
126
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
126
127
|
"""
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
config = validate(params, PlaywrightConfig)
|
152
|
-
|
153
|
-
self.max_pages = config.max_pages
|
154
|
-
self.headless = config.headless
|
155
|
-
self.hide_canvas = config.hide_canvas
|
156
|
-
self.disable_webgl = config.disable_webgl
|
157
|
-
self.real_chrome = config.real_chrome
|
158
|
-
self.stealth = config.stealth
|
159
|
-
self.google_search = config.google_search
|
160
|
-
self.wait = config.wait
|
161
|
-
self.proxy = config.proxy
|
162
|
-
self.locale = config.locale
|
163
|
-
self.extra_headers = config.extra_headers
|
164
|
-
self.useragent = config.useragent
|
165
|
-
self.timeout = config.timeout
|
166
|
-
self.cookies = config.cookies
|
167
|
-
self.disable_resources = config.disable_resources
|
168
|
-
self.cdp_url = config.cdp_url
|
169
|
-
self.network_idle = config.network_idle
|
170
|
-
self.wait_selector = config.wait_selector
|
171
|
-
self.wait_selector_state = config.wait_selector_state
|
172
|
-
|
173
|
-
self.playwright: Optional[Playwright] = None
|
174
|
-
self.context: Optional[BrowserContext] = None
|
175
|
-
self.page_pool = PagePool(self.max_pages)
|
176
|
-
self._closed = False
|
177
|
-
self.selector_config = config.selector_config
|
178
|
-
self.page_action = config.page_action
|
179
|
-
self._headers_keys = (
|
180
|
-
set(map(str.lower, self.extra_headers.keys()))
|
181
|
-
if self.extra_headers
|
182
|
-
else set()
|
128
|
+
self.__validate__(
|
129
|
+
wait=wait,
|
130
|
+
proxy=proxy,
|
131
|
+
locale=locale,
|
132
|
+
timeout=timeout,
|
133
|
+
stealth=stealth,
|
134
|
+
cdp_url=cdp_url,
|
135
|
+
cookies=cookies,
|
136
|
+
load_dom=load_dom,
|
137
|
+
headless=headless,
|
138
|
+
useragent=useragent,
|
139
|
+
max_pages=__max_pages,
|
140
|
+
real_chrome=real_chrome,
|
141
|
+
page_action=page_action,
|
142
|
+
hide_canvas=hide_canvas,
|
143
|
+
init_script=init_script,
|
144
|
+
network_idle=network_idle,
|
145
|
+
google_search=google_search,
|
146
|
+
extra_headers=extra_headers,
|
147
|
+
wait_selector=wait_selector,
|
148
|
+
disable_webgl=disable_webgl,
|
149
|
+
selector_config=selector_config,
|
150
|
+
disable_resources=disable_resources,
|
151
|
+
wait_selector_state=wait_selector_state,
|
183
152
|
)
|
184
|
-
self.
|
185
|
-
|
186
|
-
def __initiate_browser_options__(self):
|
187
|
-
if not self.cdp_url:
|
188
|
-
# `launch_options` is used with persistent context
|
189
|
-
self.launch_options = dict(
|
190
|
-
_launch_kwargs(
|
191
|
-
self.headless,
|
192
|
-
self.proxy,
|
193
|
-
self.locale,
|
194
|
-
tuple(self.extra_headers.items())
|
195
|
-
if self.extra_headers
|
196
|
-
else tuple(),
|
197
|
-
self.useragent,
|
198
|
-
self.real_chrome,
|
199
|
-
self.stealth,
|
200
|
-
self.hide_canvas,
|
201
|
-
self.disable_webgl,
|
202
|
-
)
|
203
|
-
)
|
204
|
-
self.launch_options["extra_http_headers"] = dict(
|
205
|
-
self.launch_options["extra_http_headers"]
|
206
|
-
)
|
207
|
-
self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
|
208
|
-
self.context_options = dict()
|
209
|
-
else:
|
210
|
-
# while `context_options` is left to be used when cdp mode is enabled
|
211
|
-
self.launch_options = dict()
|
212
|
-
self.context_options = dict(
|
213
|
-
_context_kwargs(
|
214
|
-
self.proxy,
|
215
|
-
self.locale,
|
216
|
-
tuple(self.extra_headers.items())
|
217
|
-
if self.extra_headers
|
218
|
-
else tuple(),
|
219
|
-
self.useragent,
|
220
|
-
self.stealth,
|
221
|
-
)
|
222
|
-
)
|
223
|
-
self.context_options["extra_http_headers"] = dict(
|
224
|
-
self.context_options["extra_http_headers"]
|
225
|
-
)
|
226
|
-
self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
|
153
|
+
super().__init__(max_pages=self.max_pages)
|
227
154
|
|
228
155
|
def __create__(self):
|
229
156
|
"""Create a browser for this instance and context."""
|
@@ -232,16 +159,21 @@ class DynamicSession:
|
|
232
159
|
# Because rebrowser_playwright doesn't play well with real browsers
|
233
160
|
sync_context = sync_playwright
|
234
161
|
|
235
|
-
self.playwright = sync_context().start()
|
162
|
+
self.playwright: Playwright = sync_context().start()
|
236
163
|
|
237
164
|
if self.cdp_url: # pragma: no cover
|
238
|
-
self.context = self.playwright.chromium.connect_over_cdp(
|
239
|
-
|
240
|
-
).new_context(**self.context_options)
|
241
|
-
else:
|
242
|
-
self.context = self.playwright.chromium.launch_persistent_context(
|
243
|
-
user_data_dir="", **self.launch_options
|
165
|
+
self.context = self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url).new_context(
|
166
|
+
**self.context_options
|
244
167
|
)
|
168
|
+
else:
|
169
|
+
self.context = self.playwright.chromium.launch_persistent_context(user_data_dir="", **self.launch_options)
|
170
|
+
|
171
|
+
# Get the default page and close it
|
172
|
+
default_page = self.context.pages[0]
|
173
|
+
default_page.close()
|
174
|
+
|
175
|
+
if self.init_script: # pragma: no cover
|
176
|
+
self.context.add_init_script(path=self.init_script)
|
245
177
|
|
246
178
|
if self.cookies: # pragma: no cover
|
247
179
|
self.context.add_cookies(self.cookies)
|
@@ -268,56 +200,63 @@ class DynamicSession:
|
|
268
200
|
|
269
201
|
self._closed = True
|
270
202
|
|
271
|
-
def
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
if self.disable_resources:
|
287
|
-
page.route("**/*", intercept_route)
|
288
|
-
|
289
|
-
if self.stealth:
|
290
|
-
for script in _compiled_stealth_scripts():
|
291
|
-
page.add_init_script(script=script)
|
292
|
-
|
293
|
-
return self.page_pool.add_page(page)
|
294
|
-
|
295
|
-
# Wait for a page to become available
|
296
|
-
max_wait = 30
|
297
|
-
start_time = time()
|
298
|
-
|
299
|
-
while time() - start_time < max_wait:
|
300
|
-
page_info = self.page_pool.get_ready_page()
|
301
|
-
if page_info:
|
302
|
-
return page_info
|
303
|
-
sleep(0.05)
|
304
|
-
|
305
|
-
raise TimeoutError("No pages available within timeout period")
|
306
|
-
|
307
|
-
def fetch(self, url: str) -> Response:
|
203
|
+
def fetch(
|
204
|
+
self,
|
205
|
+
url: str,
|
206
|
+
google_search: bool = _UNSET,
|
207
|
+
timeout: int | float = _UNSET,
|
208
|
+
wait: int | float = _UNSET,
|
209
|
+
page_action: Optional[Callable] = _UNSET,
|
210
|
+
extra_headers: Optional[Dict[str, str]] = _UNSET,
|
211
|
+
disable_resources: bool = _UNSET,
|
212
|
+
wait_selector: Optional[str] = _UNSET,
|
213
|
+
wait_selector_state: SelectorWaitStates = _UNSET,
|
214
|
+
network_idle: bool = _UNSET,
|
215
|
+
load_dom: bool = _UNSET,
|
216
|
+
selector_config: Optional[Dict] = _UNSET,
|
217
|
+
) -> Response:
|
308
218
|
"""Opens up the browser and do your request based on your chosen options.
|
309
219
|
|
310
220
|
:param url: The Target url.
|
221
|
+
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
222
|
+
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
223
|
+
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
224
|
+
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
225
|
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
226
|
+
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
227
|
+
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
228
|
+
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
229
|
+
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
230
|
+
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
231
|
+
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
232
|
+
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
233
|
+
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
311
234
|
:return: A `Response` object.
|
312
235
|
"""
|
236
|
+
# Validate all resolved parameters
|
237
|
+
params = validate(
|
238
|
+
dict(
|
239
|
+
google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
|
240
|
+
timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
|
241
|
+
wait=self._get_with_precedence(wait, self.wait, _UNSET),
|
242
|
+
page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
|
243
|
+
extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
|
244
|
+
disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
|
245
|
+
wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
|
246
|
+
wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
|
247
|
+
network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
|
248
|
+
load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
|
249
|
+
selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
|
250
|
+
),
|
251
|
+
PlaywrightConfig,
|
252
|
+
)
|
253
|
+
|
313
254
|
if self._closed: # pragma: no cover
|
314
255
|
raise RuntimeError("Context manager has been closed")
|
315
256
|
|
316
257
|
final_response = None
|
317
258
|
referer = (
|
318
|
-
generate_convincing_referer(url)
|
319
|
-
if (self.google_search and "referer" not in self._headers_keys)
|
320
|
-
else None
|
259
|
+
generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
|
321
260
|
)
|
322
261
|
|
323
262
|
def handle_response(finished_response: SyncPlaywrightResponse):
|
@@ -328,48 +267,50 @@ class DynamicSession:
|
|
328
267
|
):
|
329
268
|
final_response = finished_response
|
330
269
|
|
331
|
-
page_info = self.
|
270
|
+
page_info = self._get_page(params.timeout, params.extra_headers, params.disable_resources)
|
332
271
|
page_info.mark_busy(url=url)
|
333
272
|
|
334
273
|
try: # pragma: no cover
|
335
274
|
# Navigate to URL and wait for a specified state
|
336
275
|
page_info.page.on("response", handle_response)
|
337
276
|
first_response = page_info.page.goto(url, referer=referer)
|
338
|
-
|
277
|
+
if params.load_dom:
|
278
|
+
page_info.page.wait_for_load_state(state="domcontentloaded")
|
339
279
|
|
340
|
-
if
|
280
|
+
if params.network_idle:
|
341
281
|
page_info.page.wait_for_load_state("networkidle")
|
342
282
|
|
343
283
|
if not first_response:
|
344
284
|
raise RuntimeError(f"Failed to get response for {url}")
|
345
285
|
|
346
|
-
if
|
286
|
+
if params.page_action:
|
347
287
|
try:
|
348
|
-
|
288
|
+
_ = params.page_action(page_info.page)
|
349
289
|
except Exception as e: # pragma: no cover
|
350
290
|
log.error(f"Error executing page_action: {e}")
|
351
291
|
|
352
|
-
if
|
292
|
+
if params.wait_selector:
|
353
293
|
try:
|
354
|
-
waiter: Locator = page_info.page.locator(
|
355
|
-
waiter.first.wait_for(state=
|
294
|
+
waiter: Locator = page_info.page.locator(params.wait_selector)
|
295
|
+
waiter.first.wait_for(state=params.wait_selector_state)
|
356
296
|
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
357
297
|
page_info.page.wait_for_load_state(state="load")
|
358
|
-
|
359
|
-
|
298
|
+
if params.load_dom:
|
299
|
+
page_info.page.wait_for_load_state(state="domcontentloaded")
|
300
|
+
if params.network_idle:
|
360
301
|
page_info.page.wait_for_load_state("networkidle")
|
361
302
|
except Exception as e: # pragma: no cover
|
362
|
-
log.error(f"Error waiting for selector {
|
303
|
+
log.error(f"Error waiting for selector {params.wait_selector}: {e}")
|
363
304
|
|
364
|
-
page_info.page.wait_for_timeout(
|
305
|
+
page_info.page.wait_for_timeout(params.wait)
|
365
306
|
|
366
307
|
# Create response object
|
367
308
|
response = ResponseFactory.from_playwright_response(
|
368
|
-
page_info.page, first_response, final_response,
|
309
|
+
page_info.page, first_response, final_response, params.selector_config
|
369
310
|
)
|
370
311
|
|
371
|
-
# Mark the page as
|
372
|
-
page_info.
|
312
|
+
# Mark the page as finished for next use
|
313
|
+
page_info.mark_finished()
|
373
314
|
|
374
315
|
return response
|
375
316
|
|
@@ -377,17 +318,8 @@ class DynamicSession:
|
|
377
318
|
page_info.mark_error()
|
378
319
|
raise e
|
379
320
|
|
380
|
-
def get_pool_stats(self) -> Dict[str, int]:
|
381
|
-
"""Get statistics about the current page pool"""
|
382
|
-
return {
|
383
|
-
"total_pages": self.page_pool.pages_count,
|
384
|
-
"ready_pages": self.page_pool.ready_count,
|
385
|
-
"busy_pages": self.page_pool.busy_count,
|
386
|
-
"max_pages": self.max_pages,
|
387
|
-
}
|
388
321
|
|
389
|
-
|
390
|
-
class AsyncDynamicSession(DynamicSession):
|
322
|
+
class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
|
391
323
|
"""An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory."""
|
392
324
|
|
393
325
|
def __init__(
|
@@ -409,8 +341,10 @@ class AsyncDynamicSession(DynamicSession):
|
|
409
341
|
timeout: int | float = 30000,
|
410
342
|
disable_resources: bool = False,
|
411
343
|
wait_selector: Optional[str] = None,
|
344
|
+
init_script: Optional[str] = None,
|
412
345
|
cookies: Optional[List[Dict]] = None,
|
413
346
|
network_idle: bool = False,
|
347
|
+
load_dom: bool = True,
|
414
348
|
wait_selector_state: SelectorWaitStates = "attached",
|
415
349
|
selector_config: Optional[Dict] = None,
|
416
350
|
):
|
@@ -423,10 +357,12 @@ class AsyncDynamicSession(DynamicSession):
|
|
423
357
|
:param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
|
424
358
|
:param cookies: Set cookies for the next request.
|
425
359
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
360
|
+
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
426
361
|
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
427
362
|
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
428
|
-
:param page_action: Added for automation. A function that takes the `page` object
|
363
|
+
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
429
364
|
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
365
|
+
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
|
430
366
|
:param locale: Set the locale for the browser if wanted. The default value is `en-US`.
|
431
367
|
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
432
368
|
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
|
@@ -441,35 +377,32 @@ class AsyncDynamicSession(DynamicSession):
|
|
441
377
|
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
442
378
|
"""
|
443
379
|
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
selector_config,
|
380
|
+
self.__validate__(
|
381
|
+
wait=wait,
|
382
|
+
proxy=proxy,
|
383
|
+
locale=locale,
|
384
|
+
timeout=timeout,
|
385
|
+
stealth=stealth,
|
386
|
+
cdp_url=cdp_url,
|
387
|
+
cookies=cookies,
|
388
|
+
load_dom=load_dom,
|
389
|
+
headless=headless,
|
390
|
+
useragent=useragent,
|
391
|
+
max_pages=max_pages,
|
392
|
+
real_chrome=real_chrome,
|
393
|
+
page_action=page_action,
|
394
|
+
hide_canvas=hide_canvas,
|
395
|
+
init_script=init_script,
|
396
|
+
network_idle=network_idle,
|
397
|
+
google_search=google_search,
|
398
|
+
extra_headers=extra_headers,
|
399
|
+
wait_selector=wait_selector,
|
400
|
+
disable_webgl=disable_webgl,
|
401
|
+
selector_config=selector_config,
|
402
|
+
disable_resources=disable_resources,
|
403
|
+
wait_selector_state=wait_selector_state,
|
466
404
|
)
|
467
|
-
|
468
|
-
self.playwright: Optional[AsyncPlaywright] = None
|
469
|
-
self.context: Optional[AsyncBrowserContext] = None
|
470
|
-
self._lock = Lock()
|
471
|
-
self.__enter__ = None
|
472
|
-
self.__exit__ = None
|
405
|
+
super().__init__(max_pages=self.max_pages)
|
473
406
|
|
474
407
|
async def __create__(self):
|
475
408
|
"""Create a browser for this instance and context."""
|
@@ -481,19 +414,20 @@ class AsyncDynamicSession(DynamicSession):
|
|
481
414
|
self.playwright: AsyncPlaywright = await async_context().start()
|
482
415
|
|
483
416
|
if self.cdp_url:
|
484
|
-
browser = await self.playwright.chromium.connect_over_cdp(
|
485
|
-
|
486
|
-
)
|
487
|
-
self.context: AsyncBrowserContext = await browser.new_context(
|
488
|
-
**self.context_options
|
489
|
-
)
|
417
|
+
browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url)
|
418
|
+
self.context: AsyncBrowserContext = await browser.new_context(**self.context_options)
|
490
419
|
else:
|
491
|
-
self.context: AsyncBrowserContext = (
|
492
|
-
|
493
|
-
user_data_dir="", **self.launch_options
|
494
|
-
)
|
420
|
+
self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
|
421
|
+
user_data_dir="", **self.launch_options
|
495
422
|
)
|
496
423
|
|
424
|
+
# Get the default page and close it
|
425
|
+
default_page = self.context.pages[0]
|
426
|
+
await default_page.close()
|
427
|
+
|
428
|
+
if self.init_script: # pragma: no cover
|
429
|
+
await self.context.add_init_script(path=self.init_script)
|
430
|
+
|
497
431
|
if self.cookies:
|
498
432
|
await self.context.add_cookies(self.cookies)
|
499
433
|
|
@@ -519,57 +453,63 @@ class AsyncDynamicSession(DynamicSession):
|
|
519
453
|
|
520
454
|
self._closed = True
|
521
455
|
|
522
|
-
async def
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
if self.disable_resources:
|
539
|
-
await page.route("**/*", async_intercept_route)
|
540
|
-
|
541
|
-
if self.stealth:
|
542
|
-
for script in _compiled_stealth_scripts():
|
543
|
-
await page.add_init_script(script=script)
|
544
|
-
|
545
|
-
return self.page_pool.add_page(page)
|
546
|
-
|
547
|
-
# Wait for a page to become available
|
548
|
-
max_wait = 30 # seconds
|
549
|
-
start_time = time()
|
550
|
-
|
551
|
-
while time() - start_time < max_wait: # pragma: no cover
|
552
|
-
page_info = self.page_pool.get_ready_page()
|
553
|
-
if page_info:
|
554
|
-
return page_info
|
555
|
-
await asyncio_sleep(0.05)
|
556
|
-
|
557
|
-
raise TimeoutError("No pages available within timeout period")
|
558
|
-
|
559
|
-
async def fetch(self, url: str) -> Response:
|
456
|
+
async def fetch(
|
457
|
+
self,
|
458
|
+
url: str,
|
459
|
+
google_search: bool = _UNSET,
|
460
|
+
timeout: int | float = _UNSET,
|
461
|
+
wait: int | float = _UNSET,
|
462
|
+
page_action: Optional[Callable] = _UNSET,
|
463
|
+
extra_headers: Optional[Dict[str, str]] = _UNSET,
|
464
|
+
disable_resources: bool = _UNSET,
|
465
|
+
wait_selector: Optional[str] = _UNSET,
|
466
|
+
wait_selector_state: SelectorWaitStates = _UNSET,
|
467
|
+
network_idle: bool = _UNSET,
|
468
|
+
load_dom: bool = _UNSET,
|
469
|
+
selector_config: Optional[Dict] = _UNSET,
|
470
|
+
) -> Response:
|
560
471
|
"""Opens up the browser and do your request based on your chosen options.
|
561
472
|
|
562
473
|
:param url: The Target url.
|
474
|
+
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
|
475
|
+
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
|
476
|
+
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
|
477
|
+
:param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
|
478
|
+
:param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
|
479
|
+
:param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
|
480
|
+
Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
|
481
|
+
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
482
|
+
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
|
483
|
+
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
|
484
|
+
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
485
|
+
:param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
|
486
|
+
:param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
|
563
487
|
:return: A `Response` object.
|
564
488
|
"""
|
489
|
+
# Validate all resolved parameters
|
490
|
+
params = validate(
|
491
|
+
dict(
|
492
|
+
google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
|
493
|
+
timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
|
494
|
+
wait=self._get_with_precedence(wait, self.wait, _UNSET),
|
495
|
+
page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
|
496
|
+
extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
|
497
|
+
disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
|
498
|
+
wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
|
499
|
+
wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
|
500
|
+
network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
|
501
|
+
load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
|
502
|
+
selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
|
503
|
+
),
|
504
|
+
PlaywrightConfig,
|
505
|
+
)
|
506
|
+
|
565
507
|
if self._closed: # pragma: no cover
|
566
508
|
raise RuntimeError("Context manager has been closed")
|
567
509
|
|
568
510
|
final_response = None
|
569
511
|
referer = (
|
570
|
-
generate_convincing_referer(url)
|
571
|
-
if (self.google_search and "referer" not in self._headers_keys)
|
572
|
-
else None
|
512
|
+
generate_convincing_referer(url) if (params.google_search and "referer" not in self._headers_keys) else None
|
573
513
|
)
|
574
514
|
|
575
515
|
async def handle_response(finished_response: AsyncPlaywrightResponse):
|
@@ -580,48 +520,50 @@ class AsyncDynamicSession(DynamicSession):
|
|
580
520
|
):
|
581
521
|
final_response = finished_response
|
582
522
|
|
583
|
-
page_info = await self.
|
523
|
+
page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
|
584
524
|
page_info.mark_busy(url=url)
|
585
525
|
|
586
526
|
try:
|
587
527
|
# Navigate to URL and wait for a specified state
|
588
528
|
page_info.page.on("response", handle_response)
|
589
529
|
first_response = await page_info.page.goto(url, referer=referer)
|
590
|
-
|
530
|
+
if self.load_dom:
|
531
|
+
await page_info.page.wait_for_load_state(state="domcontentloaded")
|
591
532
|
|
592
|
-
if
|
533
|
+
if params.network_idle:
|
593
534
|
await page_info.page.wait_for_load_state("networkidle")
|
594
535
|
|
595
536
|
if not first_response:
|
596
537
|
raise RuntimeError(f"Failed to get response for {url}")
|
597
538
|
|
598
|
-
if
|
539
|
+
if params.page_action:
|
599
540
|
try:
|
600
|
-
|
541
|
+
_ = await params.page_action(page_info.page)
|
601
542
|
except Exception as e:
|
602
543
|
log.error(f"Error executing page_action: {e}")
|
603
544
|
|
604
|
-
if
|
545
|
+
if params.wait_selector:
|
605
546
|
try:
|
606
|
-
waiter: AsyncLocator = page_info.page.locator(
|
607
|
-
await waiter.first.wait_for(state=
|
547
|
+
waiter: AsyncLocator = page_info.page.locator(params.wait_selector)
|
548
|
+
await waiter.first.wait_for(state=params.wait_selector_state)
|
608
549
|
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
609
550
|
await page_info.page.wait_for_load_state(state="load")
|
610
|
-
|
611
|
-
|
551
|
+
if self.load_dom:
|
552
|
+
await page_info.page.wait_for_load_state(state="domcontentloaded")
|
553
|
+
if params.network_idle:
|
612
554
|
await page_info.page.wait_for_load_state("networkidle")
|
613
555
|
except Exception as e:
|
614
|
-
log.error(f"Error waiting for selector {
|
556
|
+
log.error(f"Error waiting for selector {params.wait_selector}: {e}")
|
615
557
|
|
616
|
-
await page_info.page.wait_for_timeout(
|
558
|
+
await page_info.page.wait_for_timeout(params.wait)
|
617
559
|
|
618
560
|
# Create response object
|
619
561
|
response = await ResponseFactory.from_async_playwright_response(
|
620
|
-
page_info.page, first_response, final_response,
|
562
|
+
page_info.page, first_response, final_response, params.selector_config
|
621
563
|
)
|
622
564
|
|
623
|
-
# Mark the page as
|
624
|
-
page_info.
|
565
|
+
# Mark the page as finished for next use
|
566
|
+
page_info.mark_finished()
|
625
567
|
|
626
568
|
return response
|
627
569
|
|