scrapling 0.2.98__py3-none-any.whl → 0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +18 -31
- scrapling/cli.py +818 -20
- scrapling/core/_html_utils.py +348 -0
- scrapling/core/_types.py +34 -17
- scrapling/core/ai.py +611 -0
- scrapling/core/custom_types.py +183 -100
- scrapling/core/mixins.py +27 -19
- scrapling/core/shell.py +647 -0
- scrapling/core/{storage_adaptors.py → storage.py} +41 -33
- scrapling/core/translator.py +20 -26
- scrapling/core/utils.py +49 -54
- scrapling/engines/__init__.py +15 -6
- scrapling/engines/_browsers/__init__.py +2 -0
- scrapling/engines/_browsers/_camoufox.py +745 -0
- scrapling/engines/_browsers/_config_tools.py +130 -0
- scrapling/engines/_browsers/_controllers.py +630 -0
- scrapling/engines/_browsers/_page.py +93 -0
- scrapling/engines/_browsers/_validators.py +150 -0
- scrapling/engines/constants.py +101 -88
- scrapling/engines/static.py +667 -110
- scrapling/engines/toolbelt/__init__.py +20 -6
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
- scrapling/engines/toolbelt/convertor.py +254 -0
- scrapling/engines/toolbelt/custom.py +205 -186
- scrapling/engines/toolbelt/fingerprints.py +32 -46
- scrapling/engines/toolbelt/navigation.py +68 -39
- scrapling/fetchers.py +255 -260
- scrapling/parser.py +781 -449
- scrapling-0.3.dist-info/METADATA +409 -0
- scrapling-0.3.dist-info/RECORD +41 -0
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
- scrapling/defaults.py +0 -19
- scrapling/engines/camo.py +0 -299
- scrapling/engines/pw.py +0 -428
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
- scrapling-0.2.98.dist-info/METADATA +0 -867
- scrapling-0.2.98.dist-info/RECORD +0 -49
- tests/__init__.py +0 -1
- tests/fetchers/__init__.py +0 -1
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +0 -95
- tests/fetchers/async/test_httpx.py +0 -83
- tests/fetchers/async/test_playwright.py +0 -99
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +0 -68
- tests/fetchers/sync/test_httpx.py +0 -82
- tests/fetchers/sync/test_playwright.py +0 -87
- tests/fetchers/test_utils.py +0 -97
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +0 -111
- tests/parser/test_general.py +0 -330
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info/licenses}/LICENSE +0 -0
scrapling/core/ai.py
ADDED
@@ -0,0 +1,611 @@
from asyncio import gather

from mcp.server.fastmcp import FastMCP
from pydantic import BaseModel, Field

from scrapling.core.shell import Convertor
from scrapling.engines.toolbelt import Response as _ScraplingResponse
from scrapling.fetchers import (
    Fetcher,
    FetcherSession,
    DynamicFetcher,
    AsyncDynamicSession,
    StealthyFetcher,
    AsyncStealthySession,
)
from scrapling.core._types import (
    Optional,
    Tuple,
    extraction_types,
    Mapping,
    Dict,
    List,
    SelectorWaitStates,
    Generator,
)
from curl_cffi.requests import (
    BrowserTypeLiteral,
)


class ResponseModel(BaseModel):
    """Request's response information structure."""

    status: int = Field(description="The status code returned by the website.")
    content: list[str] = Field(
        description="The content as Markdown/HTML or the text content of the page."
    )
    url: str = Field(
        description="The URL given by the user that resulted in this response."
    )


def _ContentTranslator(
    content: Generator[str, None, None], page: _ScraplingResponse
) -> ResponseModel:
    """Convert a content generator to a list of ResponseModel objects."""
    return ResponseModel(
        status=page.status, content=[result for result in content], url=page.url
    )


class ScraplingMCPServer:
    _server = FastMCP(name="Scrapling")

    @staticmethod
    @_server.tool()
    def get(
        url: str,
        impersonate: Optional[BrowserTypeLiteral] = "chrome",
        extraction_type: extraction_types = "markdown",
        css_selector: Optional[str] = None,
        main_content_only: bool = True,
        params: Optional[Dict | List | Tuple] = None,
        headers: Optional[Mapping[str, Optional[str]]] = None,
        cookies: Optional[Dict[str, str] | list[tuple[str, str]]] = None,
        timeout: Optional[int | float] = 30,
        follow_redirects: bool = True,
        max_redirects: int = 30,
        retries: Optional[int] = 3,
        retry_delay: Optional[int] = 1,
        proxy: Optional[str] = None,
        proxy_auth: Optional[Tuple[str, str]] = None,
        auth: Optional[Tuple[str, str]] = None,
        verify: Optional[bool] = True,
        http3: Optional[bool] = False,
        stealthy_headers: Optional[bool] = True,
    ) -> ResponseModel:
        """Make GET HTTP request to a URL and return a structured output of the result.
        Note: This is only suitable for low-mid protection levels. For high-protection levels or websites that require JS loading, use the other tools directly.
        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

        :param url: The URL to request.
        :param impersonate: Browser version to impersonate its fingerprint. It's using the latest chrome version by default.
        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
            - Markdown will convert the page content to Markdown format.
            - HTML will return the raw HTML content of the page.
            - Text will return the text content of the page.
        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
        :param params: Query string parameters for the request.
        :param headers: Headers to include in the request.
        :param cookies: Cookies to use in the request.
        :param timeout: Number of seconds to wait before timing out.
        :param follow_redirects: Whether to follow redirects. Defaults to True.
        :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
        :param retries: Number of retry attempts. Defaults to 3.
        :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
        :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            Cannot be used together with the `proxies` parameter.
        :param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
        :param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
        :param verify: Whether to verify HTTPS certificates.
        :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
        :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
        """
        page = Fetcher.get(
            url,
            auth=auth,
            proxy=proxy,
            http3=http3,
            verify=verify,
            params=params,
            proxy_auth=proxy_auth,
            retry_delay=retry_delay,
            stealthy_headers=stealthy_headers,
            impersonate=impersonate,
            headers=headers,
            cookies=cookies,
            timeout=timeout,
            retries=retries,
            max_redirects=max_redirects,
            follow_redirects=follow_redirects,
        )
        return _ContentTranslator(
            Convertor._extract_content(
                page,
                css_selector=css_selector,
                extraction_type=extraction_type,
                main_content_only=main_content_only,
            ),
            page,
        )

    @staticmethod
    @_server.tool()
    async def bulk_get(
        urls: Tuple[str, ...],
        impersonate: Optional[BrowserTypeLiteral] = "chrome",
        extraction_type: extraction_types = "markdown",
        css_selector: Optional[str] = None,
        main_content_only: bool = True,
        params: Optional[Dict | List | Tuple] = None,
        headers: Optional[Mapping[str, Optional[str]]] = None,
        cookies: Optional[Dict[str, str] | list[tuple[str, str]]] = None,
        timeout: Optional[int | float] = 30,
        follow_redirects: bool = True,
        max_redirects: int = 30,
        retries: Optional[int] = 3,
        retry_delay: Optional[int] = 1,
        proxy: Optional[str] = None,
        proxy_auth: Optional[Tuple[str, str]] = None,
        auth: Optional[Tuple[str, str]] = None,
        verify: Optional[bool] = True,
        http3: Optional[bool] = False,
        stealthy_headers: Optional[bool] = True,
    ) -> List[ResponseModel]:
        """Make GET HTTP request to a group of URLs and for each URL, return a structured output of the result.
        Note: This is only suitable for low-mid protection levels. For high-protection levels or websites that require JS loading, use the other tools directly.
        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

        :param urls: A tuple of the URLs to request.
        :param impersonate: Browser version to impersonate its fingerprint. It's using the latest chrome version by default.
        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
            - Markdown will convert the page content to Markdown format.
            - HTML will return the raw HTML content of the page.
            - Text will return the text content of the page.
        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
        :param params: Query string parameters for the request.
        :param headers: Headers to include in the request.
        :param cookies: Cookies to use in the request.
        :param timeout: Number of seconds to wait before timing out.
        :param follow_redirects: Whether to follow redirects. Defaults to True.
        :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.
        :param retries: Number of retry attempts. Defaults to 3.
        :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.
        :param proxy: Proxy URL to use. Format: "http://username:password@localhost:8030".
            Cannot be used together with the `proxies` parameter.
        :param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).
        :param auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.
        :param verify: Whether to verify HTTPS certificates.
        :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.
        :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
        """
        async with FetcherSession() as session:
            tasks = [
                session.get(
                    url,
                    auth=auth,
                    proxy=proxy,
                    http3=http3,
                    verify=verify,
                    params=params,
                    headers=headers,
                    cookies=cookies,
                    timeout=timeout,
                    retries=retries,
                    proxy_auth=proxy_auth,
                    retry_delay=retry_delay,
                    impersonate=impersonate,
                    max_redirects=max_redirects,
                    follow_redirects=follow_redirects,
                    stealthy_headers=stealthy_headers,
                )
                for url in urls
            ]
            responses = await gather(*tasks)
            return [
                _ContentTranslator(
                    Convertor._extract_content(
                        page,
                        css_selector=css_selector,
                        extraction_type=extraction_type,
                        main_content_only=main_content_only,
                    ),
                    page,
                )
                for page in responses
            ]

    @staticmethod
    @_server.tool()
    async def fetch(
        url: str,
        extraction_type: extraction_types = "markdown",
        css_selector: Optional[str] = None,
        main_content_only: bool = True,
        headless: bool = False,
        google_search: bool = True,
        hide_canvas: bool = False,
        disable_webgl: bool = False,
        real_chrome: bool = False,
        stealth: bool = False,
        wait: int | float = 0,
        proxy: Optional[str | Dict[str, str]] = None,
        locale: str = "en-US",
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        cdp_url: Optional[str] = None,
        timeout: int | float = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        cookies: Optional[List[Dict]] = None,
        network_idle: bool = False,
        wait_selector_state: SelectorWaitStates = "attached",
    ) -> ResponseModel:
        """Use playwright to open a browser to fetch a URL and return a structured output of the result.
        Note: This is only suitable for low-mid protection levels.
        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

        :param url: The URL to request.
        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
            - Markdown will convert the page content to Markdown format.
            - HTML will return the raw HTML content of the page.
            - Text will return the text content of the page.
        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request. It should be in a dictionary format that Playwright accepts.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        """
        page = await DynamicFetcher.async_fetch(
            url,
            wait=wait,
            proxy=proxy,
            locale=locale,
            timeout=timeout,
            cookies=cookies,
            stealth=stealth,
            cdp_url=cdp_url,
            headless=headless,
            useragent=useragent,
            hide_canvas=hide_canvas,
            real_chrome=real_chrome,
            network_idle=network_idle,
            wait_selector=wait_selector,
            disable_webgl=disable_webgl,
            extra_headers=extra_headers,
            google_search=google_search,
            disable_resources=disable_resources,
            wait_selector_state=wait_selector_state,
        )
        return _ContentTranslator(
            Convertor._extract_content(
                page,
                css_selector=css_selector,
                extraction_type=extraction_type,
                main_content_only=main_content_only,
            ),
            page,
        )

    @staticmethod
    @_server.tool()
    async def bulk_fetch(
        urls: Tuple[str, ...],
        extraction_type: extraction_types = "markdown",
        css_selector: Optional[str] = None,
        main_content_only: bool = True,
        headless: bool = False,
        google_search: bool = True,
        hide_canvas: bool = False,
        disable_webgl: bool = False,
        real_chrome: bool = False,
        stealth: bool = False,
        wait: int | float = 0,
        proxy: Optional[str | Dict[str, str]] = None,
        locale: str = "en-US",
        extra_headers: Optional[Dict[str, str]] = None,
        useragent: Optional[str] = None,
        cdp_url: Optional[str] = None,
        timeout: int | float = 30000,
        disable_resources: bool = False,
        wait_selector: Optional[str] = None,
        cookies: Optional[List[Dict]] = None,
        network_idle: bool = False,
        wait_selector_state: SelectorWaitStates = "attached",
    ) -> List[ResponseModel]:
        """Use playwright to open a browser, then fetch a group of URLs at the same time, and for each page return a structured output of the result.
        Note: This is only suitable for low-mid protection levels.
        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

        :param urls: A tuple of the URLs to request.
        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
            - Markdown will convert the page content to Markdown format.
            - HTML will return the raw HTML content of the page.
            - Text will return the text content of the page.
        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
        :param cookies: Set cookies for the next request. It should be in a dictionary format that Playwright accepts.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
        :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        """
        async with AsyncDynamicSession(
            wait=wait,
            proxy=proxy,
            locale=locale,
            timeout=timeout,
            cookies=cookies,
            stealth=stealth,
            cdp_url=cdp_url,
            headless=headless,
            max_pages=len(urls),
            useragent=useragent,
            hide_canvas=hide_canvas,
            real_chrome=real_chrome,
            network_idle=network_idle,
            wait_selector=wait_selector,
            google_search=google_search,
            disable_webgl=disable_webgl,
            extra_headers=extra_headers,
            disable_resources=disable_resources,
            wait_selector_state=wait_selector_state,
        ) as session:
            tasks = [session.fetch(url) for url in urls]
            responses = await gather(*tasks)
            return [
                _ContentTranslator(
                    Convertor._extract_content(
                        page,
                        css_selector=css_selector,
                        extraction_type=extraction_type,
                        main_content_only=main_content_only,
                    ),
                    page,
                )
                for page in responses
            ]

    @staticmethod
    @_server.tool()
    async def stealthy_fetch(
        url: str,
        extraction_type: extraction_types = "markdown",
        css_selector: Optional[str] = None,
        main_content_only: bool = True,
        headless: bool = True,  # noqa: F821
        block_images: bool = False,
        disable_resources: bool = False,
        block_webrtc: bool = False,
        allow_webgl: bool = True,
        network_idle: bool = False,
        humanize: bool | float = True,
        solve_cloudflare: bool = False,
        wait: int | float = 0,
        timeout: int | float = 30000,
        wait_selector: Optional[str] = None,
        addons: Optional[List[str]] = None,
        wait_selector_state: SelectorWaitStates = "attached",
        cookies: Optional[List[Dict]] = None,
        google_search: bool = True,
        extra_headers: Optional[Dict[str, str]] = None,
        proxy: Optional[str | Dict[str, str]] = None,
        os_randomize: bool = False,
        disable_ads: bool = False,
        geoip: bool = False,
        additional_args: Optional[Dict] = None,
    ) -> ResponseModel:
        """Use Scrapling's version of the Camoufox browser to fetch a URL and return a structured output of the result.
        Note: This is best suitable for high protection levels. It's slower than the other tools.
        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

        :param url: The URL to request.
        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
            - Markdown will convert the page content to Markdown format.
            - HTML will return the raw HTML content of the page.
            - Text will return the text content of the page.
        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param block_images: Prevent the loading of images through Firefox preferences.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param block_webrtc: Blocks WebRTC entirely.
        :param cookies: Set cookies for the next request.
        :param addons: List of Firefox addons to use. Must be paths to extracted addons.
        :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
        :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
            It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
        """
        page = await StealthyFetcher.async_fetch(
            url,
            wait=wait,
            proxy=proxy,
            geoip=geoip,
            addons=addons,
            timeout=timeout,
            cookies=cookies,
            headless=headless,
            humanize=humanize,
            allow_webgl=allow_webgl,
            disable_ads=disable_ads,
            network_idle=network_idle,
            block_images=block_images,
            block_webrtc=block_webrtc,
            os_randomize=os_randomize,
            wait_selector=wait_selector,
            google_search=google_search,
            extra_headers=extra_headers,
            solve_cloudflare=solve_cloudflare,
            disable_resources=disable_resources,
            wait_selector_state=wait_selector_state,
            additional_args=additional_args,
        )
        return _ContentTranslator(
            Convertor._extract_content(
                page,
                css_selector=css_selector,
                extraction_type=extraction_type,
                main_content_only=main_content_only,
            ),
            page,
        )

    @staticmethod
    @_server.tool()
    async def bulk_stealthy_fetch(
        urls: Tuple[str, ...],
        extraction_type: extraction_types = "markdown",
        css_selector: Optional[str] = None,
        main_content_only: bool = True,
        headless: bool = True,  # noqa: F821
        block_images: bool = False,
        disable_resources: bool = False,
        block_webrtc: bool = False,
        allow_webgl: bool = True,
        network_idle: bool = False,
        humanize: bool | float = True,
        solve_cloudflare: bool = False,
        wait: int | float = 0,
        timeout: int | float = 30000,
        wait_selector: Optional[str] = None,
        addons: Optional[List[str]] = None,
        wait_selector_state: SelectorWaitStates = "attached",
        cookies: Optional[List[Dict]] = None,
        google_search: bool = True,
        extra_headers: Optional[Dict[str, str]] = None,
        proxy: Optional[str | Dict[str, str]] = None,
        os_randomize: bool = False,
        disable_ads: bool = False,
        geoip: bool = False,
        additional_args: Optional[Dict] = None,
    ) -> List[ResponseModel]:
        """Use Scrapling's version of the Camoufox browser to fetch a group of URLs at the same time, and for each page return a structured output of the result.
        Note: This is best suitable for high protection levels. It's slower than the other tools.
        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.

        :param urls: A tuple of the URLs to request.
        :param extraction_type: The type of content to extract from the page. Defaults to "markdown". Options are:
            - Markdown will convert the page content to Markdown format.
            - HTML will return the raw HTML content of the page.
            - Text will return the text content of the page.
        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.
        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.
        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
        :param block_images: Prevent the loading of images through Firefox preferences.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends, but it made requests ~25% faster in my tests for some websites.
            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
        :param block_webrtc: Blocks WebRTC entirely.
        :param cookies: Set cookies for the next request.
        :param addons: List of Firefox addons to use. Must be paths to extracted addons.
        :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
        :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
        :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
        :param wait_selector: Wait for a specific CSS selector to be in a specific state.
        :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
            It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
        :param additional_args: Additional arguments to be passed to Camoufox as additional settings, and it takes higher priority than Scrapling's settings.
        """
        async with AsyncStealthySession(
            wait=wait,
            proxy=proxy,
            geoip=geoip,
            addons=addons,
            timeout=timeout,
            cookies=cookies,
            headless=headless,
            humanize=humanize,
            max_pages=len(urls),
            allow_webgl=allow_webgl,
            disable_ads=disable_ads,
            block_images=block_images,
            block_webrtc=block_webrtc,
            network_idle=network_idle,
            os_randomize=os_randomize,
            wait_selector=wait_selector,
            google_search=google_search,
            extra_headers=extra_headers,
            solve_cloudflare=solve_cloudflare,
            disable_resources=disable_resources,
            wait_selector_state=wait_selector_state,
            additional_args=additional_args,
        ) as session:
            tasks = [session.fetch(url) for url in urls]
            responses = await gather(*tasks)
            return [
                _ContentTranslator(
                    Convertor._extract_content(
                        page,
                        css_selector=css_selector,
                        extraction_type=extraction_type,
                        main_content_only=main_content_only,
                    ),
                    page,
                )
                for page in responses
            ]

    def serve(self):
        """Serve the MCP server."""
        self._server.run(transport="stdio")
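For orientation, here is a minimal sketch of how the new module could be exercised end to end. The import path and the ScraplingMCPServer class come from the file above; running it as a standalone script with a `__main__` guard is an illustrative assumption, not something this diff ships (the wheel also changes scrapling/cli.py, which likely wires this in differently).

# Minimal sketch (assumed entry point, not part of the diff):
# start the Scrapling MCP server over the stdio transport.
from scrapling.core.ai import ScraplingMCPServer

if __name__ == "__main__":
    # serve() calls FastMCP.run(transport="stdio"), exposing the
    # get / bulk_get / fetch / bulk_fetch / stealthy_fetch / bulk_stealthy_fetch tools.
    ScraplingMCPServer().serve()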