scrapling 0.2.98__py3-none-any.whl → 0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +18 -31
- scrapling/cli.py +818 -20
- scrapling/core/_html_utils.py +348 -0
- scrapling/core/_types.py +34 -17
- scrapling/core/ai.py +611 -0
- scrapling/core/custom_types.py +183 -100
- scrapling/core/mixins.py +27 -19
- scrapling/core/shell.py +647 -0
- scrapling/core/{storage_adaptors.py → storage.py} +41 -33
- scrapling/core/translator.py +20 -26
- scrapling/core/utils.py +49 -54
- scrapling/engines/__init__.py +15 -6
- scrapling/engines/_browsers/__init__.py +2 -0
- scrapling/engines/_browsers/_camoufox.py +745 -0
- scrapling/engines/_browsers/_config_tools.py +130 -0
- scrapling/engines/_browsers/_controllers.py +630 -0
- scrapling/engines/_browsers/_page.py +93 -0
- scrapling/engines/_browsers/_validators.py +150 -0
- scrapling/engines/constants.py +101 -88
- scrapling/engines/static.py +667 -110
- scrapling/engines/toolbelt/__init__.py +20 -6
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
- scrapling/engines/toolbelt/convertor.py +254 -0
- scrapling/engines/toolbelt/custom.py +205 -186
- scrapling/engines/toolbelt/fingerprints.py +32 -46
- scrapling/engines/toolbelt/navigation.py +68 -39
- scrapling/fetchers.py +255 -260
- scrapling/parser.py +781 -449
- scrapling-0.3.dist-info/METADATA +409 -0
- scrapling-0.3.dist-info/RECORD +41 -0
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
- scrapling/defaults.py +0 -19
- scrapling/engines/camo.py +0 -299
- scrapling/engines/pw.py +0 -428
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
- scrapling-0.2.98.dist-info/METADATA +0 -867
- scrapling-0.2.98.dist-info/RECORD +0 -49
- tests/__init__.py +0 -1
- tests/fetchers/__init__.py +0 -1
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +0 -95
- tests/fetchers/async/test_httpx.py +0 -83
- tests/fetchers/async/test_playwright.py +0 -99
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +0 -68
- tests/fetchers/sync/test_httpx.py +0 -82
- tests/fetchers/sync/test_playwright.py +0 -87
- tests/fetchers/test_utils.py +0 -97
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +0 -111
- tests/parser/test_general.py +0 -330
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info/licenses}/LICENSE +0 -0
scrapling/cli.py
CHANGED
@@ -1,38 +1,836 @@
-import os
-import subprocess
-import sys
 from pathlib import Path
+from subprocess import check_output
+from sys import executable as python_executable
 
-import
+from scrapling.core.utils import log
+from scrapling.engines.toolbelt import Response
+from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable
+from scrapling.fetchers import Fetcher, DynamicFetcher, StealthyFetcher
+from scrapling.core.shell import Convertor, _CookieParser, _ParseHeaders
 
+from orjson import loads as json_loads, JSONDecodeError
+from click import command, option, Choice, group, argument
 
-
-
+__OUTPUT_FILE_HELP__ = "The output file path can be an HTML file, a Markdown file of the HTML content, or the text content itself. Use file extensions (`.html`/`.md`/`.txt`) respectively."
+__PACKAGE_DIR__ = Path(__file__).parent
 
 
-def
-print(f"Installing {
-_ =
+def __Execute(cmd: List[str], help_line: str) -> None:  # pragma: no cover
+    print(f"Installing {help_line}...")
+    _ = check_output(cmd, shell=False)  # nosec B603
     # I meant to not use try except here
 
 
-
-
-
-
-
-
-
-
-
+def __ParseJSONData(json_string: Optional[str] = None) -> Optional[Dict[str, Any]]:
+    """Parse JSON string into a Python object"""
+    if not json_string:
+        return None
+
+    try:
+        return json_loads(json_string)
+    except JSONDecodeError as e:  # pragma: no cover
+        raise ValueError(f"Invalid JSON data '{json_string}': {e}")
+
+
+def __Request_and_Save(
+    fetcher_func: Callable[..., Response],
+    url: str,
+    output_file: str,
+    css_selector: Optional[str] = None,
+    **kwargs,
+) -> None:
+    """Make a request using the specified fetcher function and save the result"""
+    # Handle relative paths - convert to an absolute path based on the current working directory
+    output_path = Path(output_file)
+    if not output_path.is_absolute():
+        output_path = Path.cwd() / output_file
+
+    response = fetcher_func(url, **kwargs)
+    Convertor.write_content_to_file(response, str(output_path), css_selector)
+    log.info(f"Content successfully saved to '{output_path}'")
+
+
+def __ParseExtractArguments(
+    headers: List[str], cookies: str, params: str, json: Optional[str] = None
+) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str], Optional[Dict[str, str]]]:
+    """Parse arguments for extract command"""
+    parsed_headers, parsed_cookies = _ParseHeaders(headers)
+    if cookies:
+        for key, value in _CookieParser(cookies):
+            try:
+                parsed_cookies[key] = value
+            except Exception as e:
+                raise ValueError(f"Could not parse cookies '{cookies}': {e}")
+
+    parsed_json = __ParseJSONData(json)
+    parsed_params = {}
+    for param in params:
+        if "=" in param:
+            key, value = param.split("=", 1)
+            parsed_params[key] = value
+
+    return parsed_headers, parsed_cookies, parsed_params, parsed_json
+
+
+def __BuildRequest(
+    headers: List[str], cookies: str, params: str, json: Optional[str] = None, **kwargs
+) -> Dict:
+    """Build a request object using the specified arguments"""
+    # Parse parameters
+    parsed_headers, parsed_cookies, parsed_params, parsed_json = (
+        __ParseExtractArguments(headers, cookies, params, json)
+    )
+    # Build request arguments
+    request_kwargs = {
+        "headers": parsed_headers if parsed_headers else None,
+        "cookies": parsed_cookies if parsed_cookies else None,
+    }
+    if parsed_json:
+        request_kwargs["json"] = parsed_json
+    if parsed_params:
+        request_kwargs["params"] = parsed_params
+    if "proxy" in kwargs:
+        request_kwargs["proxy"] = kwargs.pop("proxy")
+
+    return {**request_kwargs, **kwargs}
+
+
+@command(help="Install all Scrapling's Fetchers dependencies")
+@option(
+    "-f",
+    "--force",
+    "force",
+    is_flag=True,
+    default=False,
+    type=bool,
+    help="Force Scrapling to reinstall all Fetchers dependencies",
+)
+def install(force):  # pragma: no cover
+    if (
+        force
+        or not __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").exists()
+    ):
+        __Execute(
+            [python_executable, "-m", "playwright", "install", "chromium"],
+            "Playwright browsers",
+        )
+        __Execute(
+            [
+                python_executable,
+                "-m",
+                "playwright",
+                "install-deps",
+                "chromium",
+                "firefox",
+            ],
+            "Playwright dependencies",
+        )
+        __Execute(
+            [python_executable, "-m", "camoufox", "fetch", "--browserforge"],
+            "Camoufox browser and databases",
+        )
+        # if no errors raised by the above commands, then we add the below file
+        __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").touch()
     else:
-print(
+        print("The dependencies are already installed")
+
+
+@command(help="Run Scrapling's MCP server (Check the docs for more info).")
+def mcp():
+    from scrapling.core.ai import ScraplingMCPServer
+
+    ScraplingMCPServer().serve()
+
+
+@command(help="Interactive scraping console")
+@option(
+    "-c",
+    "--code",
+    "code",
+    is_flag=False,
+    default="",
+    type=str,
+    help="Evaluate the code in the shell, print the result and exit",
+)
+@option(
+    "-L",
+    "--loglevel",
+    "level",
+    is_flag=False,
+    default="debug",
+    type=Choice(
+        ["debug", "info", "warning", "error", "critical", "fatal"], case_sensitive=False
+    ),
+    help="Log level (default: DEBUG)",
+)
+def shell(code, level):
+    from scrapling.core.shell import CustomShell
+
+    console = CustomShell(code=code, log_level=level)
+    console.start()
+
+
+@group(
+    help="Fetch web pages using various fetchers and extract full/selected HTML content as HTML, Markdown, or extract text content."
+)
+def extract():
+    """Extract content from web pages and save to files"""
+    pass
+
+
+@extract.command(
+    help=f"Perform a GET request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
+)
+@argument("url", required=True)
+@argument("output_file", required=True)
+@option(
+    "--headers",
+    "-H",
+    multiple=True,
+    help='HTTP headers in format "Key: Value" (can be used multiple times)',
+)
+@option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
+@option(
+    "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
+)
+@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
+@option(
+    "--css-selector",
+    "-s",
+    help="CSS selector to extract specific content from the page. It returns all matches.",
+)
+@option(
+    "--params",
+    "-p",
+    multiple=True,
+    help='Query parameters in format "key=value" (can be used multiple times)',
+)
+@option(
+    "--follow-redirects/--no-follow-redirects",
+    default=True,
+    help="Whether to follow redirects (default: True)",
+)
+@option(
+    "--verify/--no-verify",
+    default=True,
+    help="Whether to verify SSL certificates (default: True)",
+)
+@option("--impersonate", help="Browser to impersonate (e.g., chrome, firefox).")
+@option(
+    "--stealthy-headers/--no-stealthy-headers",
+    default=True,
+    help="Use stealthy browser headers (default: True)",
+)
+def get(
+    url,
+    output_file,
+    headers,
+    cookies,
+    timeout,
+    proxy,
+    css_selector,
+    params,
+    follow_redirects,
+    verify,
+    impersonate,
+    stealthy_headers,
+):
+    """
+    Perform a GET request and save the content to a file.
+
+    :param url: Target URL for the request.
+    :param output_file: Output file path (.md for Markdown, .html for HTML).
+    :param headers: HTTP headers to include in the request.
+    :param cookies: Cookies to use in the request.
+    :param timeout: Number of seconds to wait before timing out.
+    :param proxy: Proxy URL to use. (Format: "http://username:password@localhost:8030")
+    :param css_selector: CSS selector to extract specific content.
+    :param params: Query string parameters for the request.
+    :param follow_redirects: Whether to follow redirects.
+    :param verify: Whether to verify HTTPS certificates.
+    :param impersonate: Browser version to impersonate.
+    :param stealthy_headers: If enabled, creates and adds real browser headers.
+    """
+
+    kwargs = __BuildRequest(
+        headers,
+        cookies,
+        params,
+        None,
+        timeout=timeout,
+        follow_redirects=follow_redirects,
+        verify=verify,
+        stealthy_headers=stealthy_headers,
+        impersonate=impersonate,
+        proxy=proxy,
+    )
+    __Request_and_Save(Fetcher.get, url, output_file, css_selector, **kwargs)
+
+
+@extract.command(
+    help=f"Perform a POST request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
+)
+@argument("url", required=True)
+@argument("output_file", required=True)
+@option(
+    "--data",
+    "-d",
+    help='Form data to include in the request body (as string, ex: "param1=value1&param2=value2")',
+)
+@option("--json", "-j", help="JSON data to include in the request body (as string)")
+@option(
+    "--headers",
+    "-H",
+    multiple=True,
+    help='HTTP headers in format "Key: Value" (can be used multiple times)',
+)
+@option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
+@option(
+    "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
+)
+@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
+@option(
+    "--css-selector",
+    "-s",
+    help="CSS selector to extract specific content from the page. It returns all matches.",
+)
+@option(
+    "--params",
+    "-p",
+    multiple=True,
+    help='Query parameters in format "key=value" (can be used multiple times)',
+)
+@option(
+    "--follow-redirects/--no-follow-redirects",
+    default=True,
+    help="Whether to follow redirects (default: True)",
+)
+@option(
+    "--verify/--no-verify",
+    default=True,
+    help="Whether to verify SSL certificates (default: True)",
+)
+@option("--impersonate", help="Browser to impersonate (e.g., chrome, firefox).")
+@option(
+    "--stealthy-headers/--no-stealthy-headers",
+    default=True,
+    help="Use stealthy browser headers (default: True)",
+)
+def post(
+    url,
+    output_file,
+    data,
+    json,
+    headers,
+    cookies,
+    timeout,
+    proxy,
+    css_selector,
+    params,
+    follow_redirects,
+    verify,
+    impersonate,
+    stealthy_headers,
+):
+    """
+    Perform a POST request and save the content to a file.
+
+    :param url: Target URL for the request.
+    :param output_file: Output file path (.md for Markdown, .html for HTML).
+    :param data: Form data to include in the request body. (as string, ex: "param1=value1&param2=value2")
+    :param json: A JSON serializable object to include in the body of the request.
+    :param headers: Headers to include in the request.
+    :param cookies: Cookies to use in the request.
+    :param timeout: Number of seconds to wait before timing out.
+    :param proxy: Proxy URL to use.
+    :param css_selector: CSS selector to extract specific content.
+    :param params: Query string parameters for the request.
+    :param follow_redirects: Whether to follow redirects.
+    :param verify: Whether to verify HTTPS certificates.
+    :param impersonate: Browser version to impersonate.
+    :param stealthy_headers: If enabled, creates and adds real browser headers.
+    """
+
+    kwargs = __BuildRequest(
+        headers,
+        cookies,
+        params,
+        json,
+        timeout=timeout,
+        follow_redirects=follow_redirects,
+        verify=verify,
+        stealthy_headers=stealthy_headers,
+        impersonate=impersonate,
+        proxy=proxy,
+        data=data,
+    )
+    __Request_and_Save(Fetcher.post, url, output_file, css_selector, **kwargs)
+
+
+@extract.command(
+    help=f"Perform a PUT request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
+)
+@argument("url", required=True)
+@argument("output_file", required=True)
+@option("--data", "-d", help="Form data to include in the request body")
+@option("--json", "-j", help="JSON data to include in the request body (as string)")
+@option(
+    "--headers",
+    "-H",
+    multiple=True,
+    help='HTTP headers in format "Key: Value" (can be used multiple times)',
+)
+@option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
+@option(
+    "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
+)
+@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
+@option(
+    "--css-selector",
+    "-s",
+    help="CSS selector to extract specific content from the page. It returns all matches.",
+)
+@option(
+    "--params",
+    "-p",
+    multiple=True,
+    help='Query parameters in format "key=value" (can be used multiple times)',
+)
+@option(
+    "--follow-redirects/--no-follow-redirects",
+    default=True,
+    help="Whether to follow redirects (default: True)",
+)
+@option(
+    "--verify/--no-verify",
+    default=True,
+    help="Whether to verify SSL certificates (default: True)",
+)
+@option("--impersonate", help="Browser to impersonate (e.g., chrome, firefox).")
+@option(
+    "--stealthy-headers/--no-stealthy-headers",
+    default=True,
+    help="Use stealthy browser headers (default: True)",
+)
+def put(
+    url,
+    output_file,
+    data,
+    json,
+    headers,
+    cookies,
+    timeout,
+    proxy,
+    css_selector,
+    params,
+    follow_redirects,
+    verify,
+    impersonate,
+    stealthy_headers,
+):
+    """
+    Perform a PUT request and save the content to a file.
+
+    :param url: Target URL for the request.
+    :param output_file: Output file path (.md for Markdown, .html for HTML).
+    :param data: Form data to include in the request body.
+    :param json: A JSON serializable object to include in the body of the request.
+    :param headers: Headers to include in the request.
+    :param cookies: Cookies to use in the request.
+    :param timeout: Number of seconds to wait before timing out.
+    :param proxy: Proxy URL to use.
+    :param css_selector: CSS selector to extract specific content.
+    :param params: Query string parameters for the request.
+    :param follow_redirects: Whether to follow redirects.
+    :param verify: Whether to verify HTTPS certificates.
+    :param impersonate: Browser version to impersonate.
+    :param stealthy_headers: If enabled, creates and adds real browser headers.
+    """
+
+    kwargs = __BuildRequest(
+        headers,
+        cookies,
+        params,
+        json,
+        timeout=timeout,
+        follow_redirects=follow_redirects,
+        verify=verify,
+        stealthy_headers=stealthy_headers,
+        impersonate=impersonate,
+        proxy=proxy,
+        data=data,
+    )
+    __Request_and_Save(Fetcher.put, url, output_file, css_selector, **kwargs)
+
+
+@extract.command(
+    help=f"Perform a DELETE request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
+)
+@argument("url", required=True)
+@argument("output_file", required=True)
+@option(
+    "--headers",
+    "-H",
+    multiple=True,
+    help='HTTP headers in format "Key: Value" (can be used multiple times)',
+)
+@option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
+@option(
+    "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
+)
+@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
+@option(
+    "--css-selector",
+    "-s",
+    help="CSS selector to extract specific content from the page. It returns all matches.",
+)
+@option(
+    "--params",
+    "-p",
+    multiple=True,
+    help='Query parameters in format "key=value" (can be used multiple times)',
+)
+@option(
+    "--follow-redirects/--no-follow-redirects",
+    default=True,
+    help="Whether to follow redirects (default: True)",
+)
+@option(
+    "--verify/--no-verify",
+    default=True,
+    help="Whether to verify SSL certificates (default: True)",
+)
+@option("--impersonate", help="Browser to impersonate (e.g., chrome, firefox).")
+@option(
+    "--stealthy-headers/--no-stealthy-headers",
+    default=True,
+    help="Use stealthy browser headers (default: True)",
+)
+def delete(
+    url,
+    output_file,
+    headers,
+    cookies,
+    timeout,
+    proxy,
+    css_selector,
+    params,
+    follow_redirects,
+    verify,
+    impersonate,
+    stealthy_headers,
+):
+    """
+    Perform a DELETE request and save the content to a file.
+
+    :param url: Target URL for the request.
+    :param output_file: Output file path (.md for Markdown, .html for HTML).
+    :param headers: Headers to include in the request.
+    :param cookies: Cookies to use in the request.
+    :param timeout: Number of seconds to wait before timing out.
+    :param proxy: Proxy URL to use.
+    :param css_selector: CSS selector to extract specific content.
+    :param params: Query string parameters for the request.
+    :param follow_redirects: Whether to follow redirects.
+    :param verify: Whether to verify HTTPS certificates.
+    :param impersonate: Browser version to impersonate.
+    :param stealthy_headers: If enabled, creates and adds real browser headers.
+    """
+
+    kwargs = __BuildRequest(
+        headers,
+        cookies,
+        params,
+        None,
+        timeout=timeout,
+        follow_redirects=follow_redirects,
+        verify=verify,
+        stealthy_headers=stealthy_headers,
+        impersonate=impersonate,
+        proxy=proxy,
+    )
+    __Request_and_Save(Fetcher.delete, url, output_file, css_selector, **kwargs)
+
+
+@extract.command(
+    help=f"Use DynamicFetcher to fetch content with browser automation.\n\n{__OUTPUT_FILE_HELP__}"
+)
+@argument("url", required=True)
+@argument("output_file", required=True)
+@option(
+    "--headless/--no-headless",
+    default=True,
+    help="Run browser in headless mode (default: True)",
+)
+@option(
+    "--disable-resources/--enable-resources",
+    default=False,
+    help="Drop unnecessary resources for speed boost (default: False)",
+)
+@option(
+    "--network-idle/--no-network-idle",
+    default=False,
+    help="Wait for network idle (default: False)",
+)
+@option(
+    "--timeout",
+    type=int,
+    default=30000,
+    help="Timeout in milliseconds (default: 30000)",
+)
+@option(
+    "--wait",
+    type=int,
+    default=0,
+    help="Additional wait time in milliseconds after page load (default: 0)",
+)
+@option(
+    "--css-selector",
+    "-s",
+    help="CSS selector to extract specific content from the page. It returns all matches.",
+)
+@option("--wait-selector", help="CSS selector to wait for before proceeding")
+@option("--locale", default="en-US", help="Browser locale (default: en-US)")
+@option(
+    "--stealth/--no-stealth", default=False, help="Enable stealth mode (default: False)"
+)
+@option(
+    "--hide-canvas/--show-canvas",
+    default=False,
+    help="Add noise to canvas operations (default: False)",
+)
+@option(
+    "--disable-webgl/--enable-webgl",
+    default=False,
+    help="Disable WebGL support (default: False)",
+)
+@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
+@option(
+    "--extra-headers",
+    "-H",
+    multiple=True,
+    help='Extra headers in format "Key: Value" (can be used multiple times)',
+)
+def fetch(
+    url,
+    output_file,
+    headless,
+    disable_resources,
+    network_idle,
+    timeout,
+    wait,
+    css_selector,
+    wait_selector,
+    locale,
+    stealth,
+    hide_canvas,
+    disable_webgl,
+    proxy,
+    extra_headers,
+):
+    """
+    Opens up a browser and fetch content using DynamicFetcher.
+
+    :param url: Target url.
+    :param output_file: Output file path (.md for Markdown, .html for HTML).
+    :param headless: Run the browser in headless/hidden or headful/visible mode.
+    :param disable_resources: Drop requests of unnecessary resources for a speed boost.
+    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page.
+    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before returning.
+    :param css_selector: CSS selector to extract specific content.
+    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+    :param locale: Set the locale for the browser.
+    :param stealth: Enables stealth mode.
+    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+    :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
+    :param proxy: The proxy to be used with requests.
+    :param extra_headers: Extra headers to add to the request.
+    """
+
+    # Parse parameters
+    parsed_headers, _ = _ParseHeaders(extra_headers, False)
+
+    # Build request arguments
+    kwargs = {
+        "headless": headless,
+        "disable_resources": disable_resources,
+        "network_idle": network_idle,
+        "timeout": timeout,
+        "locale": locale,
+        "stealth": stealth,
+        "hide_canvas": hide_canvas,
+        "disable_webgl": disable_webgl,
+    }
+
+    if wait > 0:
+        kwargs["wait"] = wait
+    if wait_selector:
+        kwargs["wait_selector"] = wait_selector
+    if proxy:
+        kwargs["proxy"] = proxy
+    if parsed_headers:
+        kwargs["extra_headers"] = parsed_headers
+
+    __Request_and_Save(DynamicFetcher.fetch, url, output_file, css_selector, **kwargs)
+
+
+@extract.command(
+    help=f"Use StealthyFetcher to fetch content with advanced stealth features.\n\n{__OUTPUT_FILE_HELP__}"
+)
+@argument("url", required=True)
+@argument("output_file", required=True)
+@option(
+    "--headless/--no-headless",
+    default=True,
+    help="Run browser in headless mode (default: True)",
+)
+@option(
+    "--block-images/--allow-images",
+    default=False,
+    help="Block image loading (default: False)",
+)
+@option(
+    "--disable-resources/--enable-resources",
+    default=False,
+    help="Drop unnecessary resources for speed boost (default: False)",
+)
+@option(
+    "--block-webrtc/--allow-webrtc",
+    default=False,
+    help="Block WebRTC entirely (default: False)",
+)
+@option(
+    "--humanize/--no-humanize",
+    default=False,
+    help="Humanize cursor movement (default: False)",
+)
+@option(
+    "--solve-cloudflare/--no-solve-cloudflare",
+    default=False,
+    help="Solve Cloudflare challenges (default: False)",
+)
+@option("--allow-webgl/--block-webgl", default=True, help="Allow WebGL (default: True)")
+@option(
+    "--network-idle/--no-network-idle",
+    default=False,
+    help="Wait for network idle (default: False)",
+)
+@option(
+    "--disable-ads/--allow-ads",
+    default=False,
+    help="Install uBlock Origin addon (default: False)",
+)
+@option(
+    "--timeout",
+    type=int,
+    default=30000,
+    help="Timeout in milliseconds (default: 30000)",
+)
+@option(
+    "--wait",
+    type=int,
+    default=0,
+    help="Additional wait time in milliseconds after page load (default: 0)",
+)
+@option(
+    "--css-selector",
+    "-s",
+    help="CSS selector to extract specific content from the page. It returns all matches.",
+)
+@option("--wait-selector", help="CSS selector to wait for before proceeding")
+@option(
+    "--geoip/--no-geoip",
+    default=False,
+    help="Use IP geolocation for timezone/locale (default: False)",
+)
+@option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
+@option(
+    "--extra-headers",
+    "-H",
+    multiple=True,
+    help='Extra headers in format "Key: Value" (can be used multiple times)',
+)
+def stealthy_fetch(
+    url,
+    output_file,
+    headless,
+    block_images,
+    disable_resources,
+    block_webrtc,
+    humanize,
+    solve_cloudflare,
+    allow_webgl,
+    network_idle,
+    disable_ads,
+    timeout,
+    wait,
+    css_selector,
+    wait_selector,
+    geoip,
+    proxy,
+    extra_headers,
+):
+    """
+    Opens up a browser with advanced stealth features and fetch content using StealthyFetcher.
+
+    :param url: Target url.
+    :param output_file: Output file path (.md for Markdown, .html for HTML).
+    :param headless: Run the browser in headless/hidden, or headful/visible mode.
+    :param block_images: Prevent the loading of images through Firefox preferences.
+    :param disable_resources: Drop requests of unnecessary resources for a speed boost.
+    :param block_webrtc: Blocks WebRTC entirely.
+    :param humanize: Humanize the cursor movement.
+    :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page.
+    :param allow_webgl: Allow WebGL (recommended to keep enabled).
+    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+    :param disable_ads: Install the uBlock Origin addon on the browser.
+    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page.
+    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before returning.
+    :param css_selector: CSS selector to extract specific content.
+    :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+    :param geoip: Automatically use IP's longitude, latitude, timezone, country, locale.
+    :param proxy: The proxy to be used with requests.
+    :param extra_headers: Extra headers to add to the request.
+    """
+
+    # Parse parameters
+    parsed_headers, _ = _ParseHeaders(extra_headers, False)
+
+    # Build request arguments
+    kwargs = {
+        "headless": headless,
+        "block_images": block_images,
+        "disable_resources": disable_resources,
+        "block_webrtc": block_webrtc,
+        "humanize": humanize,
+        "solve_cloudflare": solve_cloudflare,
+        "allow_webgl": allow_webgl,
+        "network_idle": network_idle,
+        "disable_ads": disable_ads,
+        "timeout": timeout,
+        "geoip": geoip,
+    }
+
+    if wait > 0:
+        kwargs["wait"] = wait
+    if wait_selector:
+        kwargs["wait_selector"] = wait_selector
+    if proxy:
+        kwargs["proxy"] = proxy
+    if parsed_headers:
+        kwargs["extra_headers"] = parsed_headers
+
+    __Request_and_Save(StealthyFetcher.fetch, url, output_file, css_selector, **kwargs)
 
 
-@
+@group()
 def main():
     pass
 
 
 # Adding commands
 main.add_command(install)
+main.add_command(shell)
+main.add_command(extract)
+main.add_command(mcp)