scrapling-0.2.99-py3-none-any.whl → scrapling-0.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. scrapling/__init__.py +18 -31
  2. scrapling/cli.py +818 -20
  3. scrapling/core/_html_utils.py +348 -0
  4. scrapling/core/_types.py +34 -17
  5. scrapling/core/ai.py +611 -0
  6. scrapling/core/custom_types.py +183 -100
  7. scrapling/core/mixins.py +27 -19
  8. scrapling/core/shell.py +647 -0
  9. scrapling/core/{storage_adaptors.py → storage.py} +41 -33
  10. scrapling/core/translator.py +20 -26
  11. scrapling/core/utils.py +49 -54
  12. scrapling/engines/__init__.py +15 -6
  13. scrapling/engines/_browsers/__init__.py +2 -0
  14. scrapling/engines/_browsers/_camoufox.py +745 -0
  15. scrapling/engines/_browsers/_config_tools.py +130 -0
  16. scrapling/engines/_browsers/_controllers.py +630 -0
  17. scrapling/engines/_browsers/_page.py +93 -0
  18. scrapling/engines/_browsers/_validators.py +150 -0
  19. scrapling/engines/constants.py +101 -88
  20. scrapling/engines/static.py +667 -110
  21. scrapling/engines/toolbelt/__init__.py +20 -6
  22. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  23. scrapling/engines/toolbelt/convertor.py +254 -0
  24. scrapling/engines/toolbelt/custom.py +158 -175
  25. scrapling/engines/toolbelt/fingerprints.py +32 -46
  26. scrapling/engines/toolbelt/navigation.py +68 -39
  27. scrapling/fetchers.py +227 -333
  28. scrapling/parser.py +781 -449
  29. scrapling-0.3.dist-info/METADATA +409 -0
  30. scrapling-0.3.dist-info/RECORD +41 -0
  31. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
  32. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
  33. scrapling/defaults.py +0 -25
  34. scrapling/engines/camo.py +0 -339
  35. scrapling/engines/pw.py +0 -465
  36. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  37. scrapling-0.2.99.dist-info/METADATA +0 -290
  38. scrapling-0.2.99.dist-info/RECORD +0 -49
  39. tests/__init__.py +0 -1
  40. tests/fetchers/__init__.py +0 -1
  41. tests/fetchers/async/__init__.py +0 -0
  42. tests/fetchers/async/test_camoufox.py +0 -97
  43. tests/fetchers/async/test_httpx.py +0 -85
  44. tests/fetchers/async/test_playwright.py +0 -101
  45. tests/fetchers/sync/__init__.py +0 -0
  46. tests/fetchers/sync/test_camoufox.py +0 -70
  47. tests/fetchers/sync/test_httpx.py +0 -84
  48. tests/fetchers/sync/test_playwright.py +0 -89
  49. tests/fetchers/test_utils.py +0 -97
  50. tests/parser/__init__.py +0 -0
  51. tests/parser/test_automatch.py +0 -111
  52. tests/parser/test_general.py +0 -330
  53. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
  54. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/licenses/LICENSE +0 -0
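The reworked CLI registers four top-level commands at the bottom of scrapling/cli.py (install, shell, extract, mcp), with the HTTP verbs and the two browser fetchers as subcommands of the extract group. As a quick orientation before the diff, here is a minimal sketch of driving that group in-process with click's test runner; the command names and flags are taken from the decorators in the diff, while the URL, output file, and selector are placeholders:

    from click.testing import CliRunner

    from scrapling.cli import main

    runner = CliRunner()

    # List the new top-level commands (install, shell, extract, mcp)
    print(runner.invoke(main, ["--help"]).output)

    # `extract get URL OUTPUT_FILE` performs the request and saves the result;
    # the .html/.md/.txt extension of OUTPUT_FILE picks the output format
    result = runner.invoke(
        main,
        ["extract", "get", "https://example.com", "page.md", "-s", "article"],
    )
    print(result.exit_code)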
scrapling/cli.py CHANGED
@@ -1,38 +1,836 @@
- import os
- import subprocess
- import sys
  from pathlib import Path
+ from subprocess import check_output
+ from sys import executable as python_executable

- import click
+ from scrapling.core.utils import log
+ from scrapling.engines.toolbelt import Response
+ from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable
+ from scrapling.fetchers import Fetcher, DynamicFetcher, StealthyFetcher
+ from scrapling.core.shell import Convertor, _CookieParser, _ParseHeaders

+ from orjson import loads as json_loads, JSONDecodeError
+ from click import command, option, Choice, group, argument

- def get_package_dir():
-     return Path(os.path.dirname(__file__))
+ __OUTPUT_FILE_HELP__ = "The output file path can be an HTML file, a Markdown file of the HTML content, or the text content itself. Use file extensions (`.html`/`.md`/`.txt`) respectively."
+ __PACKAGE_DIR__ = Path(__file__).parent


- def run_command(command, line):
-     print(f"Installing {line}...")
-     _ = subprocess.check_call(' '.join(command), shell=True)
+ def __Execute(cmd: List[str], help_line: str) -> None:  # pragma: no cover
+     print(f"Installing {help_line}...")
+     _ = check_output(cmd, shell=False)  # nosec B603
      # I meant to not use try except here


- @click.command(help="Install all Scrapling's Fetchers dependencies")
- @click.option('-f', '--force', 'force', is_flag=True, default=False, type=bool, help="Force Scrapling to reinstall all Fetchers dependencies")
- def install(force):
-     if force or not get_package_dir().joinpath(".scrapling_dependencies_installed").exists():
-         run_command([sys.executable, "-m", "playwright", "install", 'chromium'], 'Playwright browsers')
-         run_command([sys.executable, "-m", "playwright", "install-deps", 'chromium', 'firefox'], 'Playwright dependencies')
-         run_command([sys.executable, "-m", "camoufox", "fetch", '--browserforge'], 'Camoufox browser and databases')
-         # if no errors raised by above commands, then we add below file
-         get_package_dir().joinpath(".scrapling_dependencies_installed").touch()
+ def __ParseJSONData(json_string: Optional[str] = None) -> Optional[Dict[str, Any]]:
+     """Parse JSON string into a Python object"""
+     if not json_string:
+         return None
+
+     try:
+         return json_loads(json_string)
+     except JSONDecodeError as e:  # pragma: no cover
+         raise ValueError(f"Invalid JSON data '{json_string}': {e}")
+
+
+ def __Request_and_Save(
+     fetcher_func: Callable[..., Response],
+     url: str,
+     output_file: str,
+     css_selector: Optional[str] = None,
+     **kwargs,
+ ) -> None:
+     """Make a request using the specified fetcher function and save the result"""
+     # Handle relative paths - convert to an absolute path based on the current working directory
+     output_path = Path(output_file)
+     if not output_path.is_absolute():
+         output_path = Path.cwd() / output_file
+
+     response = fetcher_func(url, **kwargs)
+     Convertor.write_content_to_file(response, str(output_path), css_selector)
+     log.info(f"Content successfully saved to '{output_path}'")
+
+
+ def __ParseExtractArguments(
+     headers: List[str], cookies: str, params: str, json: Optional[str] = None
+ ) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str], Optional[Dict[str, str]]]:
+     """Parse arguments for extract command"""
+     parsed_headers, parsed_cookies = _ParseHeaders(headers)
+     if cookies:
+         for key, value in _CookieParser(cookies):
+             try:
+                 parsed_cookies[key] = value
+             except Exception as e:
+                 raise ValueError(f"Could not parse cookies '{cookies}': {e}")
+
+     parsed_json = __ParseJSONData(json)
+     parsed_params = {}
+     for param in params:
+         if "=" in param:
+             key, value = param.split("=", 1)
+             parsed_params[key] = value
+
+     return parsed_headers, parsed_cookies, parsed_params, parsed_json
+
+
+ def __BuildRequest(
+     headers: List[str], cookies: str, params: str, json: Optional[str] = None, **kwargs
+ ) -> Dict:
+     """Build a request object using the specified arguments"""
+     # Parse parameters
+     parsed_headers, parsed_cookies, parsed_params, parsed_json = (
+         __ParseExtractArguments(headers, cookies, params, json)
+     )
+     # Build request arguments
+     request_kwargs = {
+         "headers": parsed_headers if parsed_headers else None,
+         "cookies": parsed_cookies if parsed_cookies else None,
+     }
+     if parsed_json:
+         request_kwargs["json"] = parsed_json
+     if parsed_params:
+         request_kwargs["params"] = parsed_params
+     if "proxy" in kwargs:
+         request_kwargs["proxy"] = kwargs.pop("proxy")
+
+     return {**request_kwargs, **kwargs}
+
+
+ @command(help="Install all Scrapling's Fetchers dependencies")
+ @option(
+     "-f",
+     "--force",
+     "force",
+     is_flag=True,
+     default=False,
+     type=bool,
+     help="Force Scrapling to reinstall all Fetchers dependencies",
+ )
+ def install(force):  # pragma: no cover
+     if (
+         force
+         or not __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").exists()
+     ):
+         __Execute(
+             [python_executable, "-m", "playwright", "install", "chromium"],
+             "Playwright browsers",
+         )
+         __Execute(
+             [
+                 python_executable,
+                 "-m",
+                 "playwright",
+                 "install-deps",
+                 "chromium",
+                 "firefox",
+             ],
+             "Playwright dependencies",
+         )
+         __Execute(
+             [python_executable, "-m", "camoufox", "fetch", "--browserforge"],
+             "Camoufox browser and databases",
+         )
+         # if no errors raised by the above commands, then we add the below file
+         __PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").touch()
      else:
-         print('The dependencies are already installed')
+         print("The dependencies are already installed")
+
+
+ @command(help="Run Scrapling's MCP server (Check the docs for more info).")
+ def mcp():
+     from scrapling.core.ai import ScraplingMCPServer
+
+     ScraplingMCPServer().serve()
+
+
+ @command(help="Interactive scraping console")
+ @option(
+     "-c",
+     "--code",
+     "code",
+     is_flag=False,
+     default="",
+     type=str,
+     help="Evaluate the code in the shell, print the result and exit",
+ )
+ @option(
+     "-L",
+     "--loglevel",
+     "level",
+     is_flag=False,
+     default="debug",
+     type=Choice(
+         ["debug", "info", "warning", "error", "critical", "fatal"], case_sensitive=False
+     ),
+     help="Log level (default: DEBUG)",
+ )
+ def shell(code, level):
+     from scrapling.core.shell import CustomShell
+
+     console = CustomShell(code=code, log_level=level)
+     console.start()
+
+
+ @group(
+     help="Fetch web pages using various fetchers and extract full/selected HTML content as HTML, Markdown, or extract text content."
+ )
+ def extract():
+     """Extract content from web pages and save to files"""
+     pass
+
+
+ @extract.command(
+     help=f"Perform a GET request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
+ )
+ @argument("url", required=True)
+ @argument("output_file", required=True)
+ @option(
+     "--headers",
+     "-H",
+     multiple=True,
+     help='HTTP headers in format "Key: Value" (can be used multiple times)',
+ )
+ @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
+ @option(
+     "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
+ )
+ @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
+ @option(
+     "--css-selector",
+     "-s",
+     help="CSS selector to extract specific content from the page. It returns all matches.",
+ )
+ @option(
+     "--params",
+     "-p",
+     multiple=True,
+     help='Query parameters in format "key=value" (can be used multiple times)',
+ )
+ @option(
+     "--follow-redirects/--no-follow-redirects",
+     default=True,
+     help="Whether to follow redirects (default: True)",
+ )
+ @option(
+     "--verify/--no-verify",
+     default=True,
+     help="Whether to verify SSL certificates (default: True)",
+ )
+ @option("--impersonate", help="Browser to impersonate (e.g., chrome, firefox).")
+ @option(
+     "--stealthy-headers/--no-stealthy-headers",
+     default=True,
+     help="Use stealthy browser headers (default: True)",
+ )
+ def get(
+     url,
+     output_file,
+     headers,
+     cookies,
+     timeout,
+     proxy,
+     css_selector,
+     params,
+     follow_redirects,
+     verify,
+     impersonate,
+     stealthy_headers,
+ ):
+     """
+     Perform a GET request and save the content to a file.
+
+     :param url: Target URL for the request.
+     :param output_file: Output file path (.md for Markdown, .html for HTML).
+     :param headers: HTTP headers to include in the request.
+     :param cookies: Cookies to use in the request.
+     :param timeout: Number of seconds to wait before timing out.
+     :param proxy: Proxy URL to use. (Format: "http://username:password@localhost:8030")
+     :param css_selector: CSS selector to extract specific content.
+     :param params: Query string parameters for the request.
+     :param follow_redirects: Whether to follow redirects.
+     :param verify: Whether to verify HTTPS certificates.
+     :param impersonate: Browser version to impersonate.
+     :param stealthy_headers: If enabled, creates and adds real browser headers.
+     """
+
+     kwargs = __BuildRequest(
+         headers,
+         cookies,
+         params,
+         None,
+         timeout=timeout,
+         follow_redirects=follow_redirects,
+         verify=verify,
+         stealthy_headers=stealthy_headers,
+         impersonate=impersonate,
+         proxy=proxy,
+     )
+     __Request_and_Save(Fetcher.get, url, output_file, css_selector, **kwargs)
+
+
+ @extract.command(
+     help=f"Perform a POST request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
+ )
+ @argument("url", required=True)
+ @argument("output_file", required=True)
+ @option(
+     "--data",
+     "-d",
+     help='Form data to include in the request body (as string, ex: "param1=value1&param2=value2")',
+ )
+ @option("--json", "-j", help="JSON data to include in the request body (as string)")
+ @option(
+     "--headers",
+     "-H",
+     multiple=True,
+     help='HTTP headers in format "Key: Value" (can be used multiple times)',
+ )
+ @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
+ @option(
+     "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
+ )
+ @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
+ @option(
+     "--css-selector",
+     "-s",
+     help="CSS selector to extract specific content from the page. It returns all matches.",
+ )
+ @option(
+     "--params",
+     "-p",
+     multiple=True,
+     help='Query parameters in format "key=value" (can be used multiple times)',
+ )
+ @option(
+     "--follow-redirects/--no-follow-redirects",
+     default=True,
+     help="Whether to follow redirects (default: True)",
+ )
+ @option(
+     "--verify/--no-verify",
+     default=True,
+     help="Whether to verify SSL certificates (default: True)",
+ )
+ @option("--impersonate", help="Browser to impersonate (e.g., chrome, firefox).")
+ @option(
+     "--stealthy-headers/--no-stealthy-headers",
+     default=True,
+     help="Use stealthy browser headers (default: True)",
+ )
+ def post(
+     url,
+     output_file,
+     data,
+     json,
+     headers,
+     cookies,
+     timeout,
+     proxy,
+     css_selector,
+     params,
+     follow_redirects,
+     verify,
+     impersonate,
+     stealthy_headers,
+ ):
+     """
+     Perform a POST request and save the content to a file.
+
+     :param url: Target URL for the request.
+     :param output_file: Output file path (.md for Markdown, .html for HTML).
+     :param data: Form data to include in the request body. (as string, ex: "param1=value1&param2=value2")
+     :param json: A JSON serializable object to include in the body of the request.
+     :param headers: Headers to include in the request.
+     :param cookies: Cookies to use in the request.
+     :param timeout: Number of seconds to wait before timing out.
+     :param proxy: Proxy URL to use.
+     :param css_selector: CSS selector to extract specific content.
+     :param params: Query string parameters for the request.
+     :param follow_redirects: Whether to follow redirects.
+     :param verify: Whether to verify HTTPS certificates.
+     :param impersonate: Browser version to impersonate.
+     :param stealthy_headers: If enabled, creates and adds real browser headers.
+     """
+
+     kwargs = __BuildRequest(
+         headers,
+         cookies,
+         params,
+         json,
+         timeout=timeout,
+         follow_redirects=follow_redirects,
+         verify=verify,
+         stealthy_headers=stealthy_headers,
+         impersonate=impersonate,
+         proxy=proxy,
+         data=data,
+     )
+     __Request_and_Save(Fetcher.post, url, output_file, css_selector, **kwargs)
+
+
+ @extract.command(
+     help=f"Perform a PUT request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
+ )
+ @argument("url", required=True)
+ @argument("output_file", required=True)
+ @option("--data", "-d", help="Form data to include in the request body")
+ @option("--json", "-j", help="JSON data to include in the request body (as string)")
+ @option(
+     "--headers",
+     "-H",
+     multiple=True,
+     help='HTTP headers in format "Key: Value" (can be used multiple times)',
+ )
+ @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
+ @option(
+     "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
+ )
+ @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
+ @option(
+     "--css-selector",
+     "-s",
+     help="CSS selector to extract specific content from the page. It returns all matches.",
+ )
+ @option(
+     "--params",
+     "-p",
+     multiple=True,
+     help='Query parameters in format "key=value" (can be used multiple times)',
+ )
+ @option(
+     "--follow-redirects/--no-follow-redirects",
+     default=True,
+     help="Whether to follow redirects (default: True)",
+ )
+ @option(
+     "--verify/--no-verify",
+     default=True,
+     help="Whether to verify SSL certificates (default: True)",
+ )
+ @option("--impersonate", help="Browser to impersonate (e.g., chrome, firefox).")
+ @option(
+     "--stealthy-headers/--no-stealthy-headers",
+     default=True,
+     help="Use stealthy browser headers (default: True)",
+ )
+ def put(
+     url,
+     output_file,
+     data,
+     json,
+     headers,
+     cookies,
+     timeout,
+     proxy,
+     css_selector,
+     params,
+     follow_redirects,
+     verify,
+     impersonate,
+     stealthy_headers,
+ ):
+     """
+     Perform a PUT request and save the content to a file.
+
+     :param url: Target URL for the request.
+     :param output_file: Output file path (.md for Markdown, .html for HTML).
+     :param data: Form data to include in the request body.
+     :param json: A JSON serializable object to include in the body of the request.
+     :param headers: Headers to include in the request.
+     :param cookies: Cookies to use in the request.
+     :param timeout: Number of seconds to wait before timing out.
+     :param proxy: Proxy URL to use.
+     :param css_selector: CSS selector to extract specific content.
+     :param params: Query string parameters for the request.
+     :param follow_redirects: Whether to follow redirects.
+     :param verify: Whether to verify HTTPS certificates.
+     :param impersonate: Browser version to impersonate.
+     :param stealthy_headers: If enabled, creates and adds real browser headers.
+     """
+
+     kwargs = __BuildRequest(
+         headers,
+         cookies,
+         params,
+         json,
+         timeout=timeout,
+         follow_redirects=follow_redirects,
+         verify=verify,
+         stealthy_headers=stealthy_headers,
+         impersonate=impersonate,
+         proxy=proxy,
+         data=data,
+     )
+     __Request_and_Save(Fetcher.put, url, output_file, css_selector, **kwargs)
+
+
+ @extract.command(
+     help=f"Perform a DELETE request and save the content to a file.\n\n{__OUTPUT_FILE_HELP__}"
+ )
+ @argument("url", required=True)
+ @argument("output_file", required=True)
+ @option(
+     "--headers",
+     "-H",
+     multiple=True,
+     help='HTTP headers in format "Key: Value" (can be used multiple times)',
+ )
+ @option("--cookies", help='Cookies string in format "name1=value1; name2=value2"')
+ @option(
+     "--timeout", type=int, default=30, help="Request timeout in seconds (default: 30)"
+ )
+ @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
+ @option(
+     "--css-selector",
+     "-s",
+     help="CSS selector to extract specific content from the page. It returns all matches.",
+ )
+ @option(
+     "--params",
+     "-p",
+     multiple=True,
+     help='Query parameters in format "key=value" (can be used multiple times)',
+ )
+ @option(
+     "--follow-redirects/--no-follow-redirects",
+     default=True,
+     help="Whether to follow redirects (default: True)",
+ )
+ @option(
+     "--verify/--no-verify",
+     default=True,
+     help="Whether to verify SSL certificates (default: True)",
+ )
+ @option("--impersonate", help="Browser to impersonate (e.g., chrome, firefox).")
+ @option(
+     "--stealthy-headers/--no-stealthy-headers",
+     default=True,
+     help="Use stealthy browser headers (default: True)",
+ )
+ def delete(
+     url,
+     output_file,
+     headers,
+     cookies,
+     timeout,
+     proxy,
+     css_selector,
+     params,
+     follow_redirects,
+     verify,
+     impersonate,
+     stealthy_headers,
+ ):
+     """
+     Perform a DELETE request and save the content to a file.
+
+     :param url: Target URL for the request.
+     :param output_file: Output file path (.md for Markdown, .html for HTML).
+     :param headers: Headers to include in the request.
+     :param cookies: Cookies to use in the request.
+     :param timeout: Number of seconds to wait before timing out.
+     :param proxy: Proxy URL to use.
+     :param css_selector: CSS selector to extract specific content.
+     :param params: Query string parameters for the request.
+     :param follow_redirects: Whether to follow redirects.
+     :param verify: Whether to verify HTTPS certificates.
+     :param impersonate: Browser version to impersonate.
+     :param stealthy_headers: If enabled, creates and adds real browser headers.
+     """
+
+     kwargs = __BuildRequest(
+         headers,
+         cookies,
+         params,
+         None,
+         timeout=timeout,
+         follow_redirects=follow_redirects,
+         verify=verify,
+         stealthy_headers=stealthy_headers,
+         impersonate=impersonate,
+         proxy=proxy,
+     )
+     __Request_and_Save(Fetcher.delete, url, output_file, css_selector, **kwargs)
+
+
+ @extract.command(
+     help=f"Use DynamicFetcher to fetch content with browser automation.\n\n{__OUTPUT_FILE_HELP__}"
+ )
+ @argument("url", required=True)
+ @argument("output_file", required=True)
+ @option(
+     "--headless/--no-headless",
+     default=True,
+     help="Run browser in headless mode (default: True)",
+ )
+ @option(
+     "--disable-resources/--enable-resources",
+     default=False,
+     help="Drop unnecessary resources for speed boost (default: False)",
+ )
+ @option(
+     "--network-idle/--no-network-idle",
+     default=False,
+     help="Wait for network idle (default: False)",
+ )
+ @option(
+     "--timeout",
+     type=int,
+     default=30000,
+     help="Timeout in milliseconds (default: 30000)",
+ )
+ @option(
+     "--wait",
+     type=int,
+     default=0,
+     help="Additional wait time in milliseconds after page load (default: 0)",
+ )
+ @option(
+     "--css-selector",
+     "-s",
+     help="CSS selector to extract specific content from the page. It returns all matches.",
+ )
+ @option("--wait-selector", help="CSS selector to wait for before proceeding")
+ @option("--locale", default="en-US", help="Browser locale (default: en-US)")
+ @option(
+     "--stealth/--no-stealth", default=False, help="Enable stealth mode (default: False)"
+ )
+ @option(
+     "--hide-canvas/--show-canvas",
+     default=False,
+     help="Add noise to canvas operations (default: False)",
+ )
+ @option(
+     "--disable-webgl/--enable-webgl",
+     default=False,
+     help="Disable WebGL support (default: False)",
+ )
+ @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
+ @option(
+     "--extra-headers",
+     "-H",
+     multiple=True,
+     help='Extra headers in format "Key: Value" (can be used multiple times)',
+ )
+ def fetch(
+     url,
+     output_file,
+     headless,
+     disable_resources,
+     network_idle,
+     timeout,
+     wait,
+     css_selector,
+     wait_selector,
+     locale,
+     stealth,
+     hide_canvas,
+     disable_webgl,
+     proxy,
+     extra_headers,
+ ):
+     """
+     Opens up a browser and fetch content using DynamicFetcher.
+
+     :param url: Target url.
+     :param output_file: Output file path (.md for Markdown, .html for HTML).
+     :param headless: Run the browser in headless/hidden or headful/visible mode.
+     :param disable_resources: Drop requests of unnecessary resources for a speed boost.
+     :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+     :param timeout: The timeout in milliseconds that is used in all operations and waits through the page.
+     :param wait: The time (milliseconds) the fetcher will wait after everything finishes before returning.
+     :param css_selector: CSS selector to extract specific content.
+     :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+     :param locale: Set the locale for the browser.
+     :param stealth: Enables stealth mode.
+     :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+     :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
+     :param proxy: The proxy to be used with requests.
+     :param extra_headers: Extra headers to add to the request.
+     """
+
+     # Parse parameters
+     parsed_headers, _ = _ParseHeaders(extra_headers, False)
+
+     # Build request arguments
+     kwargs = {
+         "headless": headless,
+         "disable_resources": disable_resources,
+         "network_idle": network_idle,
+         "timeout": timeout,
+         "locale": locale,
+         "stealth": stealth,
+         "hide_canvas": hide_canvas,
+         "disable_webgl": disable_webgl,
+     }
+
+     if wait > 0:
+         kwargs["wait"] = wait
+     if wait_selector:
+         kwargs["wait_selector"] = wait_selector
+     if proxy:
+         kwargs["proxy"] = proxy
+     if parsed_headers:
+         kwargs["extra_headers"] = parsed_headers
+
+     __Request_and_Save(DynamicFetcher.fetch, url, output_file, css_selector, **kwargs)
+
+
+ @extract.command(
+     help=f"Use StealthyFetcher to fetch content with advanced stealth features.\n\n{__OUTPUT_FILE_HELP__}"
+ )
+ @argument("url", required=True)
+ @argument("output_file", required=True)
+ @option(
+     "--headless/--no-headless",
+     default=True,
+     help="Run browser in headless mode (default: True)",
+ )
+ @option(
+     "--block-images/--allow-images",
+     default=False,
+     help="Block image loading (default: False)",
+ )
+ @option(
+     "--disable-resources/--enable-resources",
+     default=False,
+     help="Drop unnecessary resources for speed boost (default: False)",
+ )
+ @option(
+     "--block-webrtc/--allow-webrtc",
+     default=False,
+     help="Block WebRTC entirely (default: False)",
+ )
+ @option(
+     "--humanize/--no-humanize",
+     default=False,
+     help="Humanize cursor movement (default: False)",
+ )
+ @option(
+     "--solve-cloudflare/--no-solve-cloudflare",
+     default=False,
+     help="Solve Cloudflare challenges (default: False)",
+ )
+ @option("--allow-webgl/--block-webgl", default=True, help="Allow WebGL (default: True)")
+ @option(
+     "--network-idle/--no-network-idle",
+     default=False,
+     help="Wait for network idle (default: False)",
+ )
+ @option(
+     "--disable-ads/--allow-ads",
+     default=False,
+     help="Install uBlock Origin addon (default: False)",
+ )
+ @option(
+     "--timeout",
+     type=int,
+     default=30000,
+     help="Timeout in milliseconds (default: 30000)",
+ )
+ @option(
+     "--wait",
+     type=int,
+     default=0,
+     help="Additional wait time in milliseconds after page load (default: 0)",
+ )
+ @option(
+     "--css-selector",
+     "-s",
+     help="CSS selector to extract specific content from the page. It returns all matches.",
+ )
+ @option("--wait-selector", help="CSS selector to wait for before proceeding")
+ @option(
+     "--geoip/--no-geoip",
+     default=False,
+     help="Use IP geolocation for timezone/locale (default: False)",
+ )
+ @option("--proxy", help='Proxy URL in format "http://username:password@host:port"')
+ @option(
+     "--extra-headers",
+     "-H",
+     multiple=True,
+     help='Extra headers in format "Key: Value" (can be used multiple times)',
+ )
+ def stealthy_fetch(
+     url,
+     output_file,
+     headless,
+     block_images,
+     disable_resources,
+     block_webrtc,
+     humanize,
+     solve_cloudflare,
+     allow_webgl,
+     network_idle,
+     disable_ads,
+     timeout,
+     wait,
+     css_selector,
+     wait_selector,
+     geoip,
+     proxy,
+     extra_headers,
+ ):
+     """
+     Opens up a browser with advanced stealth features and fetch content using StealthyFetcher.
+
+     :param url: Target url.
+     :param output_file: Output file path (.md for Markdown, .html for HTML).
+     :param headless: Run the browser in headless/hidden, or headful/visible mode.
+     :param block_images: Prevent the loading of images through Firefox preferences.
+     :param disable_resources: Drop requests of unnecessary resources for a speed boost.
+     :param block_webrtc: Blocks WebRTC entirely.
+     :param humanize: Humanize the cursor movement.
+     :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page.
+     :param allow_webgl: Allow WebGL (recommended to keep enabled).
+     :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+     :param disable_ads: Install the uBlock Origin addon on the browser.
+     :param timeout: The timeout in milliseconds that is used in all operations and waits through the page.
+     :param wait: The time (milliseconds) the fetcher will wait after everything finishes before returning.
+     :param css_selector: CSS selector to extract specific content.
+     :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+     :param geoip: Automatically use IP's longitude, latitude, timezone, country, locale.
+     :param proxy: The proxy to be used with requests.
+     :param extra_headers: Extra headers to add to the request.
+     """
+
+     # Parse parameters
+     parsed_headers, _ = _ParseHeaders(extra_headers, False)
+
+     # Build request arguments
+     kwargs = {
+         "headless": headless,
+         "block_images": block_images,
+         "disable_resources": disable_resources,
+         "block_webrtc": block_webrtc,
+         "humanize": humanize,
+         "solve_cloudflare": solve_cloudflare,
+         "allow_webgl": allow_webgl,
+         "network_idle": network_idle,
+         "disable_ads": disable_ads,
+         "timeout": timeout,
+         "geoip": geoip,
+     }
+
+     if wait > 0:
+         kwargs["wait"] = wait
+     if wait_selector:
+         kwargs["wait_selector"] = wait_selector
+     if proxy:
+         kwargs["proxy"] = proxy
+     if parsed_headers:
+         kwargs["extra_headers"] = parsed_headers
+
+     __Request_and_Save(StealthyFetcher.fetch, url, output_file, css_selector, **kwargs)


- @click.group()
+ @group()
  def main():
      pass


  # Adding commands
  main.add_command(install)
+ main.add_command(shell)
+ main.add_command(extract)
+ main.add_command(mcp)
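For reference, the `extract get` command added above reduces to roughly the following calls. This is a sketch using only names that appear in the diff (the keyword arguments to Fetcher.get mirror what __BuildRequest passes through for a plain GET); the URL, selector, and file name are placeholders:

    from pathlib import Path

    from scrapling.core.shell import Convertor
    from scrapling.fetchers import Fetcher

    # Same defaults the CLI applies for a plain GET (see __BuildRequest above)
    response = Fetcher.get(
        "https://example.com",  # placeholder URL
        timeout=30,
        follow_redirects=True,
        verify=True,
        stealthy_headers=True,
    )

    # Mirrors __Request_and_Save: relative output paths resolve against the
    # current working directory, and the file extension selects the conversion
    output_path = Path.cwd() / "page.md"
    Convertor.write_content_to_file(response, str(output_path), "article")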