scrapling 0.2.99__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. scrapling/__init__.py +18 -31
  2. scrapling/cli.py +818 -20
  3. scrapling/core/_html_utils.py +348 -0
  4. scrapling/core/_types.py +34 -17
  5. scrapling/core/ai.py +611 -0
  6. scrapling/core/custom_types.py +183 -100
  7. scrapling/core/mixins.py +27 -19
  8. scrapling/core/shell.py +647 -0
  9. scrapling/core/{storage_adaptors.py → storage.py} +41 -33
  10. scrapling/core/translator.py +20 -26
  11. scrapling/core/utils.py +49 -54
  12. scrapling/engines/__init__.py +15 -6
  13. scrapling/engines/_browsers/__init__.py +2 -0
  14. scrapling/engines/_browsers/_camoufox.py +759 -0
  15. scrapling/engines/_browsers/_config_tools.py +130 -0
  16. scrapling/engines/_browsers/_controllers.py +644 -0
  17. scrapling/engines/_browsers/_page.py +93 -0
  18. scrapling/engines/_browsers/_validators.py +170 -0
  19. scrapling/engines/constants.py +101 -88
  20. scrapling/engines/static.py +667 -110
  21. scrapling/engines/toolbelt/__init__.py +20 -6
  22. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  23. scrapling/engines/toolbelt/convertor.py +254 -0
  24. scrapling/engines/toolbelt/custom.py +158 -175
  25. scrapling/engines/toolbelt/fingerprints.py +32 -46
  26. scrapling/engines/toolbelt/navigation.py +68 -39
  27. scrapling/fetchers.py +239 -333
  28. scrapling/parser.py +781 -449
  29. scrapling-0.3.1.dist-info/METADATA +411 -0
  30. scrapling-0.3.1.dist-info/RECORD +41 -0
  31. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/WHEEL +1 -1
  32. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/top_level.txt +0 -1
  33. scrapling/defaults.py +0 -25
  34. scrapling/engines/camo.py +0 -339
  35. scrapling/engines/pw.py +0 -465
  36. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  37. scrapling-0.2.99.dist-info/METADATA +0 -290
  38. scrapling-0.2.99.dist-info/RECORD +0 -49
  39. tests/__init__.py +0 -1
  40. tests/fetchers/__init__.py +0 -1
  41. tests/fetchers/async/__init__.py +0 -0
  42. tests/fetchers/async/test_camoufox.py +0 -97
  43. tests/fetchers/async/test_httpx.py +0 -85
  44. tests/fetchers/async/test_playwright.py +0 -101
  45. tests/fetchers/sync/__init__.py +0 -0
  46. tests/fetchers/sync/test_camoufox.py +0 -70
  47. tests/fetchers/sync/test_httpx.py +0 -84
  48. tests/fetchers/sync/test_playwright.py +0 -89
  49. tests/fetchers/test_utils.py +0 -97
  50. tests/parser/__init__.py +0 -0
  51. tests/parser/test_automatch.py +0 -111
  52. tests/parser/test_general.py +0 -330
  53. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/entry_points.txt +0 -0
  54. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/licenses/LICENSE +0 -0
scrapling/core/shell.py (new file)
@@ -0,0 +1,647 @@
+ # -*- coding: utf-8 -*-
+ from re import sub as re_sub
+ from sys import stderr
+ from functools import wraps
+ from http import cookies as Cookie
+ from collections import namedtuple
+ from shlex import split as shlex_split
+ from tempfile import mkstemp as make_temp_file
+ from urllib.parse import urlparse, urlunparse, parse_qsl
+ from argparse import ArgumentParser, SUPPRESS
+ from webbrowser import open as open_in_browser
+ from logging import (
+     DEBUG,
+     INFO,
+     WARNING,
+     ERROR,
+     CRITICAL,
+     FATAL,
+     getLogger,
+     getLevelName,
+ )
+
+ from orjson import loads as json_loads, JSONDecodeError
+
+ from scrapling import __version__
+ from scrapling.core.custom_types import TextHandler
+ from scrapling.core.utils import log
+ from scrapling.parser import Selector, Selectors
+ from scrapling.core._types import (
+     List,
+     Optional,
+     Dict,
+     Tuple,
+     Any,
+     extraction_types,
+     Generator,
+ )
+ from scrapling.fetchers import (
+     Fetcher,
+     AsyncFetcher,
+     DynamicFetcher,
+     StealthyFetcher,
+     Response,
+ )
+
+
+ _known_logging_levels = {
+     "debug": DEBUG,
+     "info": INFO,
+     "warning": WARNING,
+     "error": ERROR,
+     "critical": CRITICAL,
+     "fatal": FATAL,
+ }
+
+
+ # Define the structure for parsed context - Simplified for Fetcher args
+ Request = namedtuple(
+     "Request",
+     [
+         "method",
+         "url",
+         "params",
+         "data",  # Can be str, bytes, or dict (for urlencoded)
+         "json_data",  # Python object (dict/list) for JSON payload
+         "headers",
+         "cookies",
+         "proxy",
+         "follow_redirects",  # Added for -L flag
+     ],
+ )
+
+
+ def _CookieParser(cookie_string):
+     # Errors will be handled on call so the log can be specified
+     cookie_parser = Cookie.SimpleCookie()
+     cookie_parser.load(cookie_string)
+     for key, morsel in cookie_parser.items():
+         yield key, morsel.value
+
+
+ def _ParseHeaders(
+     header_lines: List[str], parse_cookies: bool = True
+ ) -> Tuple[Dict[str, str], Dict[str, str]]:
+     """Parses headers into separate header and cookie dictionaries."""
+     header_dict = dict()
+     cookie_dict = dict()
+
+     for header_line in header_lines:
+         if ":" not in header_line:
+             if header_line.endswith(";"):
+                 header_key = header_line[:-1].strip()
+                 header_value = ""
+                 header_dict[header_key] = header_value
+             else:
+                 raise ValueError(
+                     f"Could not parse header without colon: '{header_line}'."
+                 )
+         else:
+             header_key, header_value = header_line.split(":", 1)
+             header_key = header_key.strip()
+             header_value = header_value.strip()
+
+             if parse_cookies:
+                 if header_key.lower() == "cookie":
+                     try:
+                         cookie_dict = {
+                             key: value for key, value in _CookieParser(header_value)
+                         }
+                     except Exception as e:  # pragma: no cover
+                         raise ValueError(
+                             f"Could not parse cookie string from header '{header_value}': {e}"
+                         )
+                 else:
+                     header_dict[header_key] = header_value
+             else:
+                 header_dict[header_key] = header_value
+
+     return header_dict, cookie_dict
+
+
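The two helpers above split DevTools-style header lines into separate header and cookie dictionaries, routing any `cookie:` header through `_CookieParser`. A minimal usage sketch (assuming the helpers are imported from `scrapling.core.shell`; the header values are made up for illustration):

    from scrapling.core.shell import _ParseHeaders

    headers, cookies = _ParseHeaders([
        "accept: text/html",
        "cookie: session=abc123; theme=dark",  # parsed into the cookie dict
    ])
    # headers -> {"accept": "text/html"}
    # cookies -> {"session": "abc123", "theme": "dark"}
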
+ # Suppress exit on error to handle parsing errors gracefully
+ class NoExitArgumentParser(ArgumentParser):  # pragma: no cover
+     def error(self, message):
+         log.error(f"Curl arguments parsing error: {message}")
+         raise ValueError(f"Curl arguments parsing error: {message}")
+
+     def exit(self, status=0, message=None):
+         if message:
+             log.error(f"Scrapling shell exited with status {status}: {message}")
+             self._print_message(message, stderr)
+         raise ValueError(
+             f"Scrapling shell exited with status {status}: {message or 'Unknown reason'}"
+         )
+
+
+ class CurlParser:
+     """Builds the argument parser for relevant curl flags from DevTools."""
+
+     def __init__(self):
+         # We will use argparse parser to parse the curl command directly instead of regex
+         # We will focus more on flags that will show up on curl commands copied from DevTools's network tab
+         _parser = NoExitArgumentParser(add_help=False)  # Disable default help
+         # Basic curl arguments
+         _parser.add_argument("curl_command_placeholder", nargs="?", help=SUPPRESS)
+         _parser.add_argument("url")
+         _parser.add_argument("-X", "--request", dest="method", default=None)
+         _parser.add_argument("-H", "--header", action="append", default=[])
+         _parser.add_argument(
+             "-A", "--user-agent", help="Will be parsed from -H if present"
+         )  # Note: DevTools usually includes this in -H
+
+         # Data arguments (prioritizing types common from DevTools)
+         _parser.add_argument("-d", "--data", default=None)
+         _parser.add_argument(
+             "--data-raw", default=None
+         )  # Often used by browsers for JSON body
+         _parser.add_argument("--data-binary", default=None)
+         # Keep urlencode for completeness, though less common from browser copy/paste
+         _parser.add_argument("--data-urlencode", action="append", default=[])
+         _parser.add_argument(
+             "-G", "--get", action="store_true"
+         )  # Use GET and put data in URL
+
+         _parser.add_argument(
+             "-b",
+             "--cookie",
+             default=None,
+             help="Send cookies from string/file (string format used by DevTools)",
+         )
+
+         # Proxy
+         _parser.add_argument("-x", "--proxy", default=None)
+         _parser.add_argument("-U", "--proxy-user", default=None)  # Basic proxy auth
+
+         # Connection/Security
+         _parser.add_argument("-k", "--insecure", action="store_true")
+         _parser.add_argument(
+             "--compressed", action="store_true"
+         )  # Very common from browsers
+
+         # Other flags often included but may not map directly to request args
+         _parser.add_argument("-i", "--include", action="store_true")
+         _parser.add_argument("-s", "--silent", action="store_true")
+         _parser.add_argument("-v", "--verbose", action="store_true")
+
+         self.parser: NoExitArgumentParser = _parser
+         self._supported_methods = ("get", "post", "put", "delete")
+
+     # --- Main Parsing Logic ---
+     def parse(self, curl_command: str) -> Optional[Request]:
+         """Parses the curl command string into a structured context for Fetcher."""
+
+         clean_command = curl_command.strip().lstrip("curl").strip().replace("\\\n", " ")
+
+         try:
+             tokens = shlex_split(
+                 clean_command
+             )  # Split the string using shell-like syntax
+         except ValueError as e:  # pragma: no cover
+             log.error(f"Could not split command line: {e}")
+             return None
+
+         try:
+             parsed_args, unknown = self.parser.parse_known_args(tokens)
+             if unknown:
+                 raise AttributeError(f"Unknown/Unsupported curl arguments: {unknown}")
+
+         except ValueError:  # pragma: no cover
+             return None
+
+         except AttributeError:
+             raise
+
+         except Exception as e:  # pragma: no cover
+             log.error(
+                 f"An unexpected error occurred during curl arguments parsing: {e}"
+             )
+             return None
+
+         # --- Determine Method ---
+         method = "get"  # Default
+         if parsed_args.get:  # `-G` forces GET
+             method = "get"
+
+         elif parsed_args.method:
+             method = parsed_args.method.strip().lower()
+
+         # Infer POST if data is present (unless overridden by -X or -G)
+         elif any(
+             [
+                 parsed_args.data,
+                 parsed_args.data_raw,
+                 parsed_args.data_binary,
+                 parsed_args.data_urlencode,
+             ]
+         ):
+             method = "post"
+
+         headers, cookies = _ParseHeaders(parsed_args.header)
+
+         if parsed_args.cookie:
+             # We are focusing on the string format from DevTools.
+             try:
+                 for key, value in _CookieParser(parsed_args.cookie):
+                     # Update the cookie dict, potentially overwriting cookies with the same name from -H 'cookie:'
+                     cookies[key] = value
+                 log.debug(f"Parsed cookies from -b argument: {list(cookies.keys())}")
+             except Exception as e:  # pragma: no cover
+                 log.error(
+                     f"Could not parse cookie string from -b '{parsed_args.cookie}': {e}"
+                 )
+
+         # --- Process Data Payload ---
+         params = dict()
+         data_payload: Optional[str | bytes | Dict] = None
+         json_payload: Optional[Any] = None
+
+         # DevTools often uses --data-raw for JSON bodies
+         # Precedence: --data-binary > --data-raw / -d > --data-urlencode
+         if parsed_args.data_binary is not None:  # pragma: no cover
+             try:
+                 data_payload = parsed_args.data_binary.encode("utf-8")
+                 log.debug("Using data from --data-binary as bytes.")
+             except Exception as e:
+                 log.warning(
+                     f"Could not encode binary data '{parsed_args.data_binary}' as bytes: {e}. Using raw string."
+                 )
+                 data_payload = parsed_args.data_binary  # Fallback to string
+
+         elif parsed_args.data_raw is not None:
+             data_payload = parsed_args.data_raw
+
+         elif parsed_args.data is not None:
+             data_payload = parsed_args.data
+
+         elif parsed_args.data_urlencode:  # pragma: no cover
+             # Combine and parse urlencoded data
+             combined_data = "&".join(parsed_args.data_urlencode)
+             try:
+                 data_payload = dict(parse_qsl(combined_data, keep_blank_values=True))
+             except Exception as e:
+                 log.warning(
+                     f"Could not parse urlencoded data '{combined_data}': {e}. Treating as raw string."
+                 )
+                 data_payload = combined_data
+
+         # Check if raw data looks like JSON, prefer 'json' param if so
+         if isinstance(data_payload, str):
+             try:
+                 maybe_json = json_loads(data_payload)
+                 if isinstance(maybe_json, (dict, list)):
+                     json_payload = maybe_json
+                     data_payload = None
+             except JSONDecodeError:
+                 pass  # Not JSON, keep it in data_payload
+
+         # Handle `-G`: Move data to params if the method is GET
+         if method == "get" and data_payload:  # pragma: no cover
+             if isinstance(data_payload, dict):  # From --data-urlencode likely
+                 params.update(data_payload)
+             elif isinstance(data_payload, str):
+                 try:
+                     params.update(dict(parse_qsl(data_payload, keep_blank_values=True)))
+                 except ValueError:
+                     log.warning(
+                         f"Could not parse data '{data_payload}' into GET parameters for -G."
+                     )
+
+             if params:
+                 data_payload = None  # Clear data as it's moved to params
+                 json_payload = None  # Should not have JSON body with -G
+
+         # --- Process Proxy ---
+         proxies: Optional[Dict[str, str]] = None
+         if parsed_args.proxy:
+             proxy_url = (
+                 f"http://{parsed_args.proxy}"
+                 if "://" not in parsed_args.proxy
+                 else parsed_args.proxy
+             )
+
+             if parsed_args.proxy_user:
+                 user_pass = parsed_args.proxy_user
+                 parts = urlparse(proxy_url)
+                 netloc_parts = parts.netloc.split("@")
+                 netloc = (
+                     f"{user_pass}@{netloc_parts[-1]}"
+                     if len(netloc_parts) > 1
+                     else f"{user_pass}@{parts.netloc}"
+                 )
+                 proxy_url = urlunparse(
+                     (
+                         parts.scheme,
+                         netloc,
+                         parts.path,
+                         parts.params,
+                         parts.query,
+                         parts.fragment,
+                     )
+                 )
+
+             # Standard proxy dict format
+             proxies = {"http": proxy_url, "https": proxy_url}
+             log.debug(f"Using proxy configuration: {proxies}")
+
+         # --- Final Context ---
+         return Request(
+             method=method,
+             url=parsed_args.url,
+             params=params,
+             data=data_payload,
+             json_data=json_payload,
+             headers=headers,
+             cookies=cookies,
+             proxy=proxies,
+             follow_redirects=True,  # Scrapling default is True
+         )
+
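Taken together, parse() maps a curl command copied from the browser onto the Request namedtuple defined earlier. A rough sketch of the expected result (the URL and payload are illustrative, not taken from the package):

    parser = CurlParser()
    request = parser.parse(
        "curl 'https://example.com/api' -H 'accept: application/json' "
        "--data-raw '{\"q\": \"test\"}' --compressed"
    )
    # Roughly: request.method == "post" (inferred from the data payload),
    # request.json_data == {"q": "test"},
    # request.headers == {"accept": "application/json"},
    # and request.follow_redirects is True.
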
+     def convert2fetcher(self, curl_command: Request | str) -> Optional[Response]:
+         if isinstance(curl_command, (Request, str)):
+             request = (
+                 self.parse(curl_command)
+                 if isinstance(curl_command, str)
+                 else curl_command
+             )
+
+             # Ensure request parsing was successful before proceeding
+             if request is None:  # pragma: no cover
+                 log.error("Failed to parse curl command, cannot convert to fetcher.")
+                 return None
+
+             request_args = request._asdict()
+             method = request_args.pop("method").strip().lower()
+             if method in self._supported_methods:
+                 request_args["json"] = request_args.pop("json_data")
+
+                 # Ensure data/json are removed for non-POST/PUT methods
+                 if method not in ("post", "put"):
+                     _ = request_args.pop("data", None)
+                     _ = request_args.pop("json", None)
+
+                 try:
+                     return getattr(Fetcher, method)(**request_args)
+                 except Exception as e:  # pragma: no cover
+                     log.error(f"Error calling Fetcher.{method}: {e}")
+                     return None
+             else:  # pragma: no cover
+                 log.error(
+                     f'Request method "{method}" isn\'t supported by Scrapling yet'
+                 )
+                 return None
+
+         else:  # pragma: no cover
+             log.error("Input must be a valid curl command string or a Request object.")
+             return None
+
+
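convert2fetcher() then feeds the parsed arguments straight into the matching Fetcher method. An illustrative sketch (it would issue a real GET request if executed; the URL is a placeholder):

    page = CurlParser().convert2fetcher("curl 'https://example.com' -H 'accept: text/html'")
    if page is not None:
        print(page.status)  # the returned Response doubles as a parsed page
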
+ def show_page_in_browser(page: Selector):  # pragma: no cover
+     if not page or not isinstance(page, Selector):
+         log.error("Input must be of type `Selector`")
+         return
+
+     try:
+         fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
+         with open(fd, "w", encoding="utf-8") as f:
+             f.write(page.body)
+
+         open_in_browser(f"file://{fname}")
+     except IOError as e:
+         log.error(f"Failed to write temporary file for viewing: {e}")
+     except Exception as e:
+         log.error(f"An unexpected error occurred while viewing the page: {e}")
+
+
+ class CustomShell:
+     """A custom IPython shell with minimal dependencies"""
+
+     def __init__(self, code, log_level="debug"):
+         self.code = code
+         self.page = None
+         self.pages = Selectors([])
+         self._curl_parser = CurlParser()
+         log_level = log_level.strip().lower()
+
+         if _known_logging_levels.get(log_level):
+             self.log_level = _known_logging_levels[log_level]
+         else:  # pragma: no cover
+             log.warning(f'Unknown log level "{log_level}", defaulting to "DEBUG"')
+             self.log_level = DEBUG
+
+         self.shell = None
+
+         # Initialize your application components
+         self.init_components()
+
+     def init_components(self):
+         """Initialize application components"""
+         # This is where you'd set up your application-specific objects
+         if self.log_level:
+             getLogger("scrapling").setLevel(self.log_level)
+
+         settings = Fetcher.display_config()
+         settings.pop("storage", None)
+         settings.pop("storage_args", None)
+         log.info(f"Scrapling {__version__} shell started")
+         log.info(f"Logging level is set to '{getLevelName(self.log_level)}'")
+         log.info(f"Fetchers' parsing settings: {settings}")
+
+     @staticmethod
+     def banner():
+         """Create a custom banner for the shell"""
+         return f"""
+ -> Available Scrapling objects:
+    - Fetcher/AsyncFetcher
+    - DynamicFetcher
+    - StealthyFetcher
+    - Selector
+
+ -> Useful shortcuts:
+    - {"get":<30} Shortcut for `Fetcher.get`
+    - {"post":<30} Shortcut for `Fetcher.post`
+    - {"put":<30} Shortcut for `Fetcher.put`
+    - {"delete":<30} Shortcut for `Fetcher.delete`
+    - {"fetch":<30} Shortcut for `DynamicFetcher.fetch`
+    - {"stealthy_fetch":<30} Shortcut for `StealthyFetcher.fetch`
+
+ -> Useful commands
+    - {"page / response":<30} The response object of the last page you fetched
+    - {"pages":<30} Selectors object of the last 5 response objects you fetched
+    - {"uncurl('curl_command')":<30} Convert curl command to a Request object. (Optimized to handle curl commands copied from DevTools network tab.)
+    - {"curl2fetcher('curl_command')":<30} Convert curl command and make the request with Fetcher. (Optimized to handle curl commands copied from DevTools network tab.)
+    - {"view(page)":<30} View page in a browser
+    - {"help()":<30} Show this help message (Shell help)
+
+ Type 'exit' or press Ctrl+D to exit.
+ """
+
+     def update_page(self, result):  # pragma: no cover
+         """Update the current page and add to pages history"""
+         self.page = result
+         if isinstance(result, (Response, Selector)):
+             self.pages.append(result)
+             if len(self.pages) > 5:
+                 self.pages.pop(0)  # Remove oldest item
+
+         # Update in IPython namespace too
+         if self.shell:
+             self.shell.user_ns["page"] = self.page
+             self.shell.user_ns["response"] = self.page
+             self.shell.user_ns["pages"] = self.pages
+
+         return result
+
+     def create_wrapper(self, func):
+         """Create a wrapper that preserves function signature but updates page"""
+
+         @wraps(func)
+         def wrapper(*args, **kwargs):
+             result = func(*args, **kwargs)
+             return self.update_page(result)
+
+         return wrapper
+
+     def get_namespace(self):
+         """Create a namespace with application-specific objects"""
+
+         # Create wrapped versions of fetch functions
+         get = self.create_wrapper(Fetcher.get)
+         post = self.create_wrapper(Fetcher.post)
+         put = self.create_wrapper(Fetcher.put)
+         delete = self.create_wrapper(Fetcher.delete)
+         dynamic_fetch = self.create_wrapper(DynamicFetcher.fetch)
+         stealthy_fetch = self.create_wrapper(StealthyFetcher.fetch)
+         curl2fetcher = self.create_wrapper(self._curl_parser.convert2fetcher)
+
+         # Create the namespace dictionary
+         return {
+             "get": get,
+             "post": post,
+             "put": put,
+             "delete": delete,
+             "Fetcher": Fetcher,
+             "AsyncFetcher": AsyncFetcher,
+             "fetch": dynamic_fetch,
+             "DynamicFetcher": DynamicFetcher,
+             "stealthy_fetch": stealthy_fetch,
+             "StealthyFetcher": StealthyFetcher,
+             "Selector": Selector,
+             "page": self.page,
+             "response": self.page,
+             "pages": self.pages,
+             "view": show_page_in_browser,
+             "uncurl": self._curl_parser.parse,
+             "curl2fetcher": curl2fetcher,
+             "help": self.show_help,
+         }
+
+     def show_help(self):  # pragma: no cover
+         """Show help information"""
+         print(self.banner())
+
+     def start(self):  # pragma: no cover
+         """Start the interactive shell"""
+         from IPython.terminal.embed import InteractiveShellEmbed
+
+         # Get our namespace with application objects
+         namespace = self.get_namespace()
+         ipython_shell = InteractiveShellEmbed(
+             banner1=self.banner(),
+             banner2="",
+             enable_tip=False,
+             exit_msg="Bye Bye",
+             user_ns=namespace,
+         )
+         self.shell = ipython_shell
+
+         # If a command was provided, execute it and exit
+         if self.code:
+             log.info(f"Executing provided code: {self.code}")
+             try:
+                 ipython_shell.run_cell(self.code, store_history=False)
+             except Exception as e:
+                 log.error(f"Error executing initial code: {e}")
+             return
+
+         ipython_shell()
+
+
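The `scrapling shell` CLI command presumably drives this class; a minimal sketch of starting it programmatically (assuming IPython is installed, since start() imports it lazily; passing code=None skips the one-shot execution path):

    shell = CustomShell(code=None, log_level="info")
    shell.start()  # drops into IPython with get/post/fetch/uncurl/view preloaded
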
+ class Convertor:
+     """Utils for the extract shell command"""
+
+     _extension_map: Dict[str, extraction_types] = {
+         "md": "markdown",
+         "html": "html",
+         "txt": "text",
+     }
+
+     @classmethod
+     def _convert_to_markdown(cls, body: TextHandler) -> str:
+         """Convert HTML content to Markdown"""
+         from markdownify import markdownify
+
+         return markdownify(body)
+
+     @classmethod
+     def _extract_content(
+         cls,
+         page: Selector,
+         extraction_type: extraction_types = "markdown",
+         css_selector: Optional[str] = None,
+         main_content_only: bool = False,
+     ) -> Generator[str, None, None]:
+         """Extract the content of a Selector"""
+         if not page or not isinstance(page, Selector):  # pragma: no cover
+             raise TypeError("Input must be of type `Selector`")
+         elif not extraction_type or extraction_type not in cls._extension_map.values():
+             raise ValueError(f"Unknown extraction type: {extraction_type}")
+         else:
+             if main_content_only:
+                 page = page.css_first("body") or page
+
+             pages = [page] if not css_selector else page.css(css_selector)
+             for page in pages:
+                 match extraction_type:
+                     case "markdown":
+                         yield cls._convert_to_markdown(page.body)
+                     case "html":
+                         yield page.body
+                     case "text":
+                         txt_content = page.get_all_text(strip=True)
+                         for s in (
+                             "\n",
+                             "\r",
+                             "\t",
+                             " ",
+                         ):
+                             # Remove consecutive white-spaces
+                             txt_content = re_sub(f"[{s}]+", s, txt_content)
+                         yield txt_content
+                 yield ""
+
+     @classmethod
+     def write_content_to_file(
+         cls, page: Selector, filename: str, css_selector: Optional[str] = None
+     ) -> None:
+         """Write a Selector's content to a file"""
+         if not page or not isinstance(page, Selector):  # pragma: no cover
+             raise TypeError("Input must be of type `Selector`")
+         elif not filename or not isinstance(filename, str) or not filename.strip():
+             raise ValueError("Filename must be provided")
+         elif not filename.endswith((".md", ".html", ".txt")):
+             raise ValueError(
+                 "Unknown file type: filename must end with '.md', '.html', or '.txt'"
+             )
+         else:
+             with open(filename, "w", encoding="utf-8") as f:
+                 extension = filename.split(".")[-1]
+                 f.write(
+                     "".join(
+                         cls._extract_content(
+                             page,
+                             cls._extension_map[extension],
+                             css_selector=css_selector,
+                         )
+                     )
+                 )
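Rounding this off, a rough end-to-end sketch of the Convertor helpers (the URL and CSS selector are placeholders; the markdown path additionally assumes the markdownify dependency is installed, and the fetched Response is expected to pass the Selector check above):

    page = Fetcher.get("https://example.com")
    Convertor.write_content_to_file(page, "example.md", css_selector="article")
    # or pull the whitespace-collapsed text directly:
    text = "".join(Convertor._extract_content(page, "text"))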