scrapling 0.2.99__py3-none-any.whl → 0.3__py3-none-any.whl
This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- scrapling/__init__.py +18 -31
- scrapling/cli.py +818 -20
- scrapling/core/_html_utils.py +348 -0
- scrapling/core/_types.py +34 -17
- scrapling/core/ai.py +611 -0
- scrapling/core/custom_types.py +183 -100
- scrapling/core/mixins.py +27 -19
- scrapling/core/shell.py +647 -0
- scrapling/core/{storage_adaptors.py → storage.py} +41 -33
- scrapling/core/translator.py +20 -26
- scrapling/core/utils.py +49 -54
- scrapling/engines/__init__.py +15 -6
- scrapling/engines/_browsers/__init__.py +2 -0
- scrapling/engines/_browsers/_camoufox.py +745 -0
- scrapling/engines/_browsers/_config_tools.py +130 -0
- scrapling/engines/_browsers/_controllers.py +630 -0
- scrapling/engines/_browsers/_page.py +93 -0
- scrapling/engines/_browsers/_validators.py +150 -0
- scrapling/engines/constants.py +101 -88
- scrapling/engines/static.py +667 -110
- scrapling/engines/toolbelt/__init__.py +20 -6
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
- scrapling/engines/toolbelt/convertor.py +254 -0
- scrapling/engines/toolbelt/custom.py +158 -175
- scrapling/engines/toolbelt/fingerprints.py +32 -46
- scrapling/engines/toolbelt/navigation.py +68 -39
- scrapling/fetchers.py +227 -333
- scrapling/parser.py +781 -449
- scrapling-0.3.dist-info/METADATA +409 -0
- scrapling-0.3.dist-info/RECORD +41 -0
- {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
- {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
- scrapling/defaults.py +0 -25
- scrapling/engines/camo.py +0 -339
- scrapling/engines/pw.py +0 -465
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
- scrapling-0.2.99.dist-info/METADATA +0 -290
- scrapling-0.2.99.dist-info/RECORD +0 -49
- tests/__init__.py +0 -1
- tests/fetchers/__init__.py +0 -1
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +0 -97
- tests/fetchers/async/test_httpx.py +0 -85
- tests/fetchers/async/test_playwright.py +0 -101
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +0 -70
- tests/fetchers/sync/test_httpx.py +0 -84
- tests/fetchers/sync/test_playwright.py +0 -89
- tests/fetchers/test_utils.py +0 -97
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +0 -111
- tests/parser/test_general.py +0 -330
- {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/licenses/LICENSE +0 -0
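scrapling/engines/_browsers/_controllers.py (new file; every line in the hunk below is an addition)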
@@ -0,0 +1,630 @@
from time import time, sleep
from asyncio import sleep as asyncio_sleep, Lock

from playwright.sync_api import (
    Response as SyncPlaywrightResponse,
    sync_playwright,
    BrowserContext,
    Playwright,
    Locator,
)
from playwright.async_api import (
    async_playwright,
    Response as AsyncPlaywrightResponse,
    BrowserContext as AsyncBrowserContext,
    Playwright as AsyncPlaywright,
    Locator as AsyncLocator,
)
from rebrowser_playwright.sync_api import sync_playwright as sync_rebrowser_playwright
from rebrowser_playwright.async_api import (
    async_playwright as async_rebrowser_playwright,
)

from scrapling.core.utils import log
from ._page import PageInfo, PagePool
from ._validators import validate, PlaywrightConfig
from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
from scrapling.core._types import (
    Dict,
    List,
    Optional,
    Callable,
    SelectorWaitStates,
)
from scrapling.engines.toolbelt import (
    Response,
    ResponseFactory,
    generate_convincing_referer,
    intercept_route,
    async_intercept_route,
)


class DynamicSession:
    """A browser session manager with page pooling."""

    __slots__ = (
        "max_pages",
        "headless",
        "hide_canvas",
        "disable_webgl",
        "real_chrome",
        "stealth",
        "google_search",
        "proxy",
        "locale",
        "extra_headers",
        "useragent",
        "timeout",
        "cookies",
        "disable_resources",
        "network_idle",
        "wait_selector",
        "wait_selector_state",
        "wait",
        "playwright",
        "browser",
        "context",
        "page_pool",
        "_closed",
        "selector_config",
        "page_action",
        "launch_options",
        "context_options",
        "cdp_url",
        "_headers_keys",
    )

    def __init__(
        self,
        __max_pages: int = 1,
        headless: bool = True,
        google_search: bool = True,
        hide_canvas: bool = False,
        disable_webgl: bool = False,
        real_chrome: bool = False,
        stealth: bool = False,
        wait: int | float = 0,
        page_action: Optional[Callable] = None,
        proxy: Optional[str | Dict[str, str]] = None,
        locale: str = "en-US",
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        cdp_url: Optional[str] = None,
        timeout: int | float = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        cookies: Optional[List[Dict]] = None,
        network_idle: bool = False,
        wait_selector_state: SelectorWaitStates = "attached",
        selector_config: Optional[Dict] = None,
    ):
        """A browser session manager with page pooling. It uses a persistent browser context by default with a temporary user-profile directory.

        :param headless: Run the browser in headless/hidden (default) or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost. Results vary, but it made requests ~25% faster in my tests on some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage, but be careful with this option, as it makes some websites never finish loading.
        :param useragent: Pass a user-agent string to be used. Otherwise, the fetcher will generate and use a real user agent for the same browser.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds used for all operations and waits through the page. The default is 30,000.
        :param wait: The time (in milliseconds) the fetcher waits after everything finishes before closing the page and returning the `Response` object.
        :param page_action: Added for automation. A function that takes the `page` object, performs the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
        :param wait_selector_state: The state to wait for on the selector given with `wait_selector`. The default state is `attached`.
        :param stealth: Enables stealth mode; check the documentation to see what stealth mode currently does.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param google_search: Enabled by default; Scrapling will set the referer header as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if both are used._
        :param proxy: The proxy to be used with requests; it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        """

        params = {
            "max_pages": __max_pages,
            "headless": headless,
            "google_search": google_search,
            "hide_canvas": hide_canvas,
            "disable_webgl": disable_webgl,
            "real_chrome": real_chrome,
            "stealth": stealth,
            "wait": wait,
            "page_action": page_action,
            "proxy": proxy,
            "locale": locale,
            "extra_headers": extra_headers,
            "useragent": useragent,
            "timeout": timeout,
            "selector_config": selector_config,
            "disable_resources": disable_resources,
            "wait_selector": wait_selector,
            "cookies": cookies,
            "network_idle": network_idle,
            "wait_selector_state": wait_selector_state,
            "cdp_url": cdp_url,
        }
        config = validate(params, PlaywrightConfig)

        self.max_pages = config.max_pages
        self.headless = config.headless
        self.hide_canvas = config.hide_canvas
        self.disable_webgl = config.disable_webgl
        self.real_chrome = config.real_chrome
        self.stealth = config.stealth
        self.google_search = config.google_search
        self.wait = config.wait
        self.proxy = config.proxy
        self.locale = config.locale
        self.extra_headers = config.extra_headers
        self.useragent = config.useragent
        self.timeout = config.timeout
        self.cookies = config.cookies
        self.disable_resources = config.disable_resources
        self.cdp_url = config.cdp_url
        self.network_idle = config.network_idle
        self.wait_selector = config.wait_selector
        self.wait_selector_state = config.wait_selector_state

        self.playwright: Optional[Playwright] = None
        self.context: Optional[BrowserContext] = None
        self.page_pool = PagePool(self.max_pages)
        self._closed = False
        self.selector_config = config.selector_config
        self.page_action = config.page_action
        self._headers_keys = (
            set(map(str.lower, self.extra_headers.keys()))
            if self.extra_headers
            else set()
        )
        self.__initiate_browser_options__()

    def __initiate_browser_options__(self):
        if not self.cdp_url:
            # `launch_options` is used with persistent context
            self.launch_options = dict(
                _launch_kwargs(
                    self.headless,
                    self.proxy,
                    self.locale,
                    tuple(self.extra_headers.items())
                    if self.extra_headers
                    else tuple(),
                    self.useragent,
                    self.real_chrome,
                    self.stealth,
                    self.hide_canvas,
                    self.disable_webgl,
                )
            )
            self.launch_options["extra_http_headers"] = dict(
                self.launch_options["extra_http_headers"]
            )
            self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
            self.context_options = dict()
        else:
            # while `context_options` is left to be used when CDP mode is enabled
            self.launch_options = dict()
            self.context_options = dict(
                _context_kwargs(
                    self.proxy,
                    self.locale,
                    tuple(self.extra_headers.items())
                    if self.extra_headers
                    else tuple(),
                    self.useragent,
                    self.stealth,
                )
            )
            self.context_options["extra_http_headers"] = dict(
                self.context_options["extra_http_headers"]
            )
            self.context_options["proxy"] = dict(self.context_options["proxy"]) or None

    def __create__(self):
        """Create a browser for this instance and context."""
        sync_context = sync_rebrowser_playwright
        if not self.stealth or self.real_chrome:
            # Because rebrowser_playwright doesn't play well with real browsers
            sync_context = sync_playwright

        self.playwright = sync_context().start()

        if self.cdp_url:  # pragma: no cover
            self.context = self.playwright.chromium.connect_over_cdp(
                endpoint_url=self.cdp_url
            ).new_context(**self.context_options)
        else:
            self.context = self.playwright.chromium.launch_persistent_context(
                user_data_dir="", **self.launch_options
            )

        if self.cookies:  # pragma: no cover
            self.context.add_cookies(self.cookies)

    def __enter__(self):
        self.__create__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):  # pragma: no cover
        """Close all resources"""
        if self._closed:
            return

        if self.context:
            self.context.close()
            self.context = None

        if self.playwright:
            self.playwright.stop()
            self.playwright = None

        self._closed = True

    def _get_or_create_page(self) -> PageInfo:  # pragma: no cover
        """Get an available page or create a new one"""
        # Try to get a ready page first
        page_info = self.page_pool.get_ready_page()
        if page_info:
            return page_info

        # Create a new page if under limit
        if self.page_pool.pages_count < self.max_pages:
            page = self.context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)
            if self.extra_headers:
                page.set_extra_http_headers(self.extra_headers)

            if self.disable_resources:
                page.route("**/*", intercept_route)

            if self.stealth:
                for script in _compiled_stealth_scripts():
                    page.add_init_script(script=script)

            return self.page_pool.add_page(page)

        # Wait for a page to become available
        max_wait = 30
        start_time = time()

        while time() - start_time < max_wait:
            page_info = self.page_pool.get_ready_page()
            if page_info:
                return page_info
            sleep(0.05)

        raise TimeoutError("No pages available within timeout period")

    def fetch(self, url: str) -> Response:
        """Opens the browser and performs your request based on your chosen options.

        :param url: The target URL.
        :return: A `Response` object.
        """
        if self._closed:  # pragma: no cover
            raise RuntimeError("Context manager has been closed")

        final_response = None
        referer = (
            generate_convincing_referer(url)
            if (self.google_search and "referer" not in self._headers_keys)
            else None
        )

        def handle_response(finished_response: SyncPlaywrightResponse):
            nonlocal final_response
            if (
                finished_response.request.resource_type == "document"
                and finished_response.request.is_navigation_request()
            ):
                final_response = finished_response

        page_info = self._get_or_create_page()
        page_info.mark_busy(url=url)

        try:  # pragma: no cover
            # Navigate to URL and wait for a specified state
            page_info.page.on("response", handle_response)
            first_response = page_info.page.goto(url, referer=referer)
            page_info.page.wait_for_load_state(state="domcontentloaded")

            if self.network_idle:
                page_info.page.wait_for_load_state("networkidle")

            if not first_response:
                raise RuntimeError(f"Failed to get response for {url}")

            if self.page_action is not None:
                try:
                    page_info.page = self.page_action(page_info.page)
                except Exception as e:  # pragma: no cover
                    log.error(f"Error executing page_action: {e}")

            if self.wait_selector:
                try:
                    waiter: Locator = page_info.page.locator(self.wait_selector)
                    waiter.first.wait_for(state=self.wait_selector_state)
                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
                    page_info.page.wait_for_load_state(state="load")
                    page_info.page.wait_for_load_state(state="domcontentloaded")
                    if self.network_idle:
                        page_info.page.wait_for_load_state("networkidle")
                except Exception as e:  # pragma: no cover
                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            page_info.page.wait_for_timeout(self.wait)

            # Create response object
            response = ResponseFactory.from_playwright_response(
                page_info.page, first_response, final_response, self.selector_config
            )

            # Mark the page as ready for next use
            page_info.mark_ready()

            return response

        except Exception as e:
            page_info.mark_error()
            raise e

    def get_pool_stats(self) -> Dict[str, int]:
        """Get statistics about the current page pool"""
        return {
            "total_pages": self.page_pool.pages_count,
            "ready_pages": self.page_pool.ready_count,
            "busy_pages": self.page_pool.busy_count,
            "max_pages": self.max_pages,
        }


class AsyncDynamicSession(DynamicSession):
    """An async browser session manager with page pooling. It uses a persistent browser context by default with a temporary user-profile directory."""

    def __init__(
        self,
        max_pages: int = 1,
        headless: bool = True,
        google_search: bool = True,
        hide_canvas: bool = False,
        disable_webgl: bool = False,
        real_chrome: bool = False,
        stealth: bool = False,
        wait: int | float = 0,
        page_action: Optional[Callable] = None,
        proxy: Optional[str | Dict[str, str]] = None,
        locale: str = "en-US",
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        cdp_url: Optional[str] = None,
        timeout: int | float = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        cookies: Optional[List[Dict]] = None,
        network_idle: bool = False,
        wait_selector_state: SelectorWaitStates = "attached",
        selector_config: Optional[Dict] = None,
    ):
        """A browser session manager with page pooling.

        :param headless: Run the browser in headless/hidden (default) or headful/visible mode.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost. Results vary, but it made requests ~25% faster in my tests on some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage, but be careful with this option, as it makes some websites never finish loading.
        :param useragent: Pass a user-agent string to be used. Otherwise, the fetcher will generate and use a real user agent for the same browser.
        :param cookies: Set cookies for the next request.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds used for all operations and waits through the page. The default is 30,000.
        :param wait: The time (in milliseconds) the fetcher waits after everything finishes before closing the page and returning the `Response` object.
        :param page_action: Added for automation. A function that takes the `page` object, performs the automation you need, then returns `page` again.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
        :param wait_selector_state: The state to wait for on the selector given with `wait_selector`. The default state is `attached`.
        :param stealth: Enables stealth mode; check the documentation to see what stealth mode currently does.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param google_search: Enabled by default; Scrapling will set the referer header as if this request came from a Google search for this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if both are used._
        :param proxy: The proxy to be used with requests; it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param max_pages: The maximum number of tabs opened at the same time. They are used in rotation through a PagePool.
        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
        """

        super().__init__(
            max_pages,
            headless,
            google_search,
            hide_canvas,
            disable_webgl,
            real_chrome,
            stealth,
            wait,
            page_action,
            proxy,
            locale,
            extra_headers,
            useragent,
            cdp_url,
            timeout,
            disable_resources,
            wait_selector,
            cookies,
            network_idle,
            wait_selector_state,
            selector_config,
        )

        self.playwright: Optional[AsyncPlaywright] = None
        self.context: Optional[AsyncBrowserContext] = None
        self._lock = Lock()
        self.__enter__ = None
        self.__exit__ = None

    async def __create__(self):
        """Create a browser for this instance and context."""
        async_context = async_rebrowser_playwright
        if not self.stealth or self.real_chrome:
            # Because rebrowser_playwright doesn't play well with real browsers
            async_context = async_playwright

        self.playwright: AsyncPlaywright = await async_context().start()

        if self.cdp_url:
            browser = await self.playwright.chromium.connect_over_cdp(
                endpoint_url=self.cdp_url
            )
            self.context: AsyncBrowserContext = await browser.new_context(
                **self.context_options
            )
        else:
            self.context: AsyncBrowserContext = (
                await self.playwright.chromium.launch_persistent_context(
                    user_data_dir="", **self.launch_options
                )
            )

        if self.cookies:
            await self.context.add_cookies(self.cookies)

    async def __aenter__(self):
        await self.__create__()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def close(self):
        """Close all resources"""
        if self._closed:  # pragma: no cover
            return

        if self.context:
            await self.context.close()
            self.context = None

        if self.playwright:
            await self.playwright.stop()
            self.playwright = None

        self._closed = True

    async def _get_or_create_page(self) -> PageInfo:
        """Get an available page or create a new one"""
        async with self._lock:
            # Try to get a ready page first
            page_info = self.page_pool.get_ready_page()
            if page_info:
                return page_info

            # Create a new page if under limit
            if self.page_pool.pages_count < self.max_pages:
                page = await self.context.new_page()
                page.set_default_navigation_timeout(self.timeout)
                page.set_default_timeout(self.timeout)
                if self.extra_headers:
                    await page.set_extra_http_headers(self.extra_headers)

                if self.disable_resources:
                    await page.route("**/*", async_intercept_route)

                if self.stealth:
                    for script in _compiled_stealth_scripts():
                        await page.add_init_script(script=script)

                return self.page_pool.add_page(page)

            # Wait for a page to become available
            max_wait = 30  # seconds
            start_time = time()

            while time() - start_time < max_wait:  # pragma: no cover
                page_info = self.page_pool.get_ready_page()
                if page_info:
                    return page_info
                await asyncio_sleep(0.05)

            raise TimeoutError("No pages available within timeout period")

    async def fetch(self, url: str) -> Response:
        """Opens the browser and performs your request based on your chosen options.

        :param url: The target URL.
        :return: A `Response` object.
        """
        if self._closed:  # pragma: no cover
            raise RuntimeError("Context manager has been closed")

        final_response = None
        referer = (
            generate_convincing_referer(url)
            if (self.google_search and "referer" not in self._headers_keys)
            else None
        )

        async def handle_response(finished_response: AsyncPlaywrightResponse):
            nonlocal final_response
            if (
                finished_response.request.resource_type == "document"
                and finished_response.request.is_navigation_request()
            ):
                final_response = finished_response

        page_info = await self._get_or_create_page()
        page_info.mark_busy(url=url)

        try:
            # Navigate to URL and wait for a specified state
            page_info.page.on("response", handle_response)
            first_response = await page_info.page.goto(url, referer=referer)
            await page_info.page.wait_for_load_state(state="domcontentloaded")

            if self.network_idle:
                await page_info.page.wait_for_load_state("networkidle")

            if not first_response:
                raise RuntimeError(f"Failed to get response for {url}")

            if self.page_action is not None:
                try:
                    page_info.page = await self.page_action(page_info.page)
                except Exception as e:
                    log.error(f"Error executing page_action: {e}")

            if self.wait_selector:
                try:
                    waiter: AsyncLocator = page_info.page.locator(self.wait_selector)
                    await waiter.first.wait_for(state=self.wait_selector_state)
                    # Wait again after waiting for the selector, helpful with protections like Cloudflare
                    await page_info.page.wait_for_load_state(state="load")
                    await page_info.page.wait_for_load_state(state="domcontentloaded")
                    if self.network_idle:
                        await page_info.page.wait_for_load_state("networkidle")
                except Exception as e:
                    log.error(f"Error waiting for selector {self.wait_selector}: {e}")

            await page_info.page.wait_for_timeout(self.wait)

            # Create response object
            response = await ResponseFactory.from_async_playwright_response(
                page_info.page, first_response, final_response, self.selector_config
            )

            # Mark the page as ready for next use
            page_info.mark_ready()

            return response

        except Exception as e:  # pragma: no cover
            page_info.mark_error()
            raise e
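For orientation, here is a minimal usage sketch of the two session managers added in this file. It assumes only the constructor and fetch() signatures shown in the hunk above; the import path mirrors the module's location in the diff, and the URL, selector, and max_pages values are illustrative placeholders, not documented defaults.

# A minimal sketch, assuming DynamicSession/AsyncDynamicSession keep the
# signatures shown in the diff above; the target URL and option values
# below are placeholders for illustration only.
import asyncio

from scrapling.engines._browsers._controllers import (
    DynamicSession,
    AsyncDynamicSession,
)


def sync_example():
    # Entering the context manager launches a persistent Chromium context;
    # exiting closes the context and stops the Playwright driver.
    with DynamicSession(headless=True, wait_selector="h1") as session:
        response = session.fetch("https://example.com")
        print(session.get_pool_stats())  # e.g. {'total_pages': 1, 'ready_pages': 1, ...}
        return response


async def async_example():
    # max_pages > 1 lets concurrent fetch() calls rotate over a pool of tabs
    # instead of serializing on a single page.
    async with AsyncDynamicSession(max_pages=3) as session:
        return await asyncio.gather(
            *(session.fetch("https://example.com") for _ in range(3))
        )

Note the design choice in the async subclass: it assigns None to the inherited __enter__/__exit__ hooks to signal that it is intended for use with async with only.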