scrapling 0.2.98__py3-none-any.whl → 0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. scrapling/__init__.py +18 -31
  2. scrapling/cli.py +818 -20
  3. scrapling/core/_html_utils.py +348 -0
  4. scrapling/core/_types.py +34 -17
  5. scrapling/core/ai.py +611 -0
  6. scrapling/core/custom_types.py +183 -100
  7. scrapling/core/mixins.py +27 -19
  8. scrapling/core/shell.py +647 -0
  9. scrapling/core/{storage_adaptors.py → storage.py} +41 -33
  10. scrapling/core/translator.py +20 -26
  11. scrapling/core/utils.py +49 -54
  12. scrapling/engines/__init__.py +15 -6
  13. scrapling/engines/_browsers/__init__.py +2 -0
  14. scrapling/engines/_browsers/_camoufox.py +745 -0
  15. scrapling/engines/_browsers/_config_tools.py +130 -0
  16. scrapling/engines/_browsers/_controllers.py +630 -0
  17. scrapling/engines/_browsers/_page.py +93 -0
  18. scrapling/engines/_browsers/_validators.py +150 -0
  19. scrapling/engines/constants.py +101 -88
  20. scrapling/engines/static.py +667 -110
  21. scrapling/engines/toolbelt/__init__.py +20 -6
  22. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  23. scrapling/engines/toolbelt/convertor.py +254 -0
  24. scrapling/engines/toolbelt/custom.py +205 -186
  25. scrapling/engines/toolbelt/fingerprints.py +32 -46
  26. scrapling/engines/toolbelt/navigation.py +68 -39
  27. scrapling/fetchers.py +255 -260
  28. scrapling/parser.py +781 -449
  29. scrapling-0.3.dist-info/METADATA +409 -0
  30. scrapling-0.3.dist-info/RECORD +41 -0
  31. {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
  32. {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
  33. scrapling/defaults.py +0 -19
  34. scrapling/engines/camo.py +0 -299
  35. scrapling/engines/pw.py +0 -428
  36. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  37. scrapling-0.2.98.dist-info/METADATA +0 -867
  38. scrapling-0.2.98.dist-info/RECORD +0 -49
  39. tests/__init__.py +0 -1
  40. tests/fetchers/__init__.py +0 -1
  41. tests/fetchers/async/__init__.py +0 -0
  42. tests/fetchers/async/test_camoufox.py +0 -95
  43. tests/fetchers/async/test_httpx.py +0 -83
  44. tests/fetchers/async/test_playwright.py +0 -99
  45. tests/fetchers/sync/__init__.py +0 -0
  46. tests/fetchers/sync/test_camoufox.py +0 -68
  47. tests/fetchers/sync/test_httpx.py +0 -82
  48. tests/fetchers/sync/test_playwright.py +0 -87
  49. tests/fetchers/test_utils.py +0 -97
  50. tests/parser/__init__.py +0 -0
  51. tests/parser/test_automatch.py +0 -111
  52. tests/parser/test_general.py +0 -330
  53. {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
  54. {scrapling-0.2.98.dist-info → scrapling-0.3.dist-info/licenses}/LICENSE +0 -0
--- /dev/null
+++ b/scrapling/core/shell.py
@@ -0,0 +1,647 @@
+# -*- coding: utf-8 -*-
+from re import sub as re_sub
+from sys import stderr
+from functools import wraps
+from http import cookies as Cookie
+from collections import namedtuple
+from shlex import split as shlex_split
+from tempfile import mkstemp as make_temp_file
+from urllib.parse import urlparse, urlunparse, parse_qsl
+from argparse import ArgumentParser, SUPPRESS
+from webbrowser import open as open_in_browser
+from logging import (
+    DEBUG,
+    INFO,
+    WARNING,
+    ERROR,
+    CRITICAL,
+    FATAL,
+    getLogger,
+    getLevelName,
+)
+
+from IPython.terminal.embed import InteractiveShellEmbed
+from orjson import loads as json_loads, JSONDecodeError
+
+from scrapling import __version__
+from scrapling.core.custom_types import TextHandler
+from scrapling.core.utils import log
+from scrapling.parser import Selector, Selectors
+from scrapling.core._types import (
+    List,
+    Optional,
+    Dict,
+    Tuple,
+    Any,
+    extraction_types,
+    Generator,
+)
+from scrapling.fetchers import (
+    Fetcher,
+    AsyncFetcher,
+    DynamicFetcher,
+    StealthyFetcher,
+    Response,
+)
+
+
+_known_logging_levels = {
+    "debug": DEBUG,
+    "info": INFO,
+    "warning": WARNING,
+    "error": ERROR,
+    "critical": CRITICAL,
+    "fatal": FATAL,
+}
+
+
+# Define the structure for parsed context - Simplified for Fetcher args
+Request = namedtuple(
+    "Request",
+    [
+        "method",
+        "url",
+        "params",
+        "data",  # Can be str, bytes, or dict (for urlencoded)
+        "json_data",  # Python object (dict/list) for JSON payload
+        "headers",
+        "cookies",
+        "proxy",
+        "follow_redirects",  # Added for -L flag
+    ],
+)
+
+
+def _CookieParser(cookie_string):
+    # Errors will be handled on call so the log can be specified
+    cookie_parser = Cookie.SimpleCookie()
+    cookie_parser.load(cookie_string)
+    for key, morsel in cookie_parser.items():
+        yield key, morsel.value
+
+
+def _ParseHeaders(
+    header_lines: List[str], parse_cookies: bool = True
+) -> Tuple[Dict[str, str], Dict[str, str]]:
+    """Parses headers into separate header and cookie dictionaries."""
+    header_dict = dict()
+    cookie_dict = dict()
+
+    for header_line in header_lines:
+        if ":" not in header_line:
+            if header_line.endswith(";"):
+                header_key = header_line[:-1].strip()
+                header_value = ""
+                header_dict[header_key] = header_value
+            else:
+                raise ValueError(
+                    f"Could not parse header without colon: '{header_line}'."
+                )
+        else:
+            header_key, header_value = header_line.split(":", 1)
+            header_key = header_key.strip()
+            header_value = header_value.strip()
+
+            if parse_cookies:
+                if header_key.lower() == "cookie":
+                    try:
+                        cookie_dict = {
+                            key: value for key, value in _CookieParser(header_value)
+                        }
+                    except Exception as e:  # pragma: no cover
+                        raise ValueError(
+                            f"Could not parse cookie string from header '{header_value}': {e}"
+                        )
+                else:
+                    header_dict[header_key] = header_value
+            else:
+                header_dict[header_key] = header_value
+
+    return header_dict, cookie_dict
+
+
+# Suppress exit on error to handle parsing errors gracefully
+class NoExitArgumentParser(ArgumentParser):  # pragma: no cover
+    def error(self, message):
+        log.error(f"Curl arguments parsing error: {message}")
+        raise ValueError(f"Curl arguments parsing error: {message}")
+
+    def exit(self, status=0, message=None):
+        if message:
+            log.error(f"Scrapling shell exited with status {status}: {message}")
+            self._print_message(message, stderr)
+        raise ValueError(
+            f"Scrapling shell exited with status {status}: {message or 'Unknown reason'}"
+        )
+
+
+class CurlParser:
+    """Builds the argument parser for relevant curl flags from DevTools."""
+
+    def __init__(self):
+        # We will use argparse parser to parse the curl command directly instead of regex
+        # We will focus more on flags that will show up on curl commands copied from DevTools's network tab
+        _parser = NoExitArgumentParser(add_help=False)  # Disable default help
+        # Basic curl arguments
+        _parser.add_argument("curl_command_placeholder", nargs="?", help=SUPPRESS)
+        _parser.add_argument("url")
+        _parser.add_argument("-X", "--request", dest="method", default=None)
+        _parser.add_argument("-H", "--header", action="append", default=[])
+        _parser.add_argument(
+            "-A", "--user-agent", help="Will be parsed from -H if present"
+        )  # Note: DevTools usually includes this in -H
+
+        # Data arguments (prioritizing types common from DevTools)
+        _parser.add_argument("-d", "--data", default=None)
+        _parser.add_argument(
+            "--data-raw", default=None
+        )  # Often used by browsers for JSON body
+        _parser.add_argument("--data-binary", default=None)
+        # Keep urlencode for completeness, though less common from browser copy/paste
+        _parser.add_argument("--data-urlencode", action="append", default=[])
+        _parser.add_argument(
+            "-G", "--get", action="store_true"
+        )  # Use GET and put data in URL
+
+        _parser.add_argument(
+            "-b",
+            "--cookie",
+            default=None,
+            help="Send cookies from string/file (string format used by DevTools)",
+        )
+
+        # Proxy
+        _parser.add_argument("-x", "--proxy", default=None)
+        _parser.add_argument("-U", "--proxy-user", default=None)  # Basic proxy auth
+
+        # Connection/Security
+        _parser.add_argument("-k", "--insecure", action="store_true")
+        _parser.add_argument(
+            "--compressed", action="store_true"
+        )  # Very common from browsers
+
+        # Other flags often included but may not map directly to request args
+        _parser.add_argument("-i", "--include", action="store_true")
+        _parser.add_argument("-s", "--silent", action="store_true")
+        _parser.add_argument("-v", "--verbose", action="store_true")
+
+        self.parser: NoExitArgumentParser = _parser
+        self._supported_methods = ("get", "post", "put", "delete")
+
+    # --- Main Parsing Logic ---
+    def parse(self, curl_command: str) -> Optional[Request]:
+        """Parses the curl command string into a structured context for Fetcher."""
+
+        clean_command = curl_command.strip().lstrip("curl").strip().replace("\\\n", " ")
+
+        try:
+            tokens = shlex_split(
+                clean_command
+            )  # Split the string using shell-like syntax
+        except ValueError as e:  # pragma: no cover
+            log.error(f"Could not split command line: {e}")
+            return None
+
+        try:
+            parsed_args, unknown = self.parser.parse_known_args(tokens)
+            if unknown:
+                raise AttributeError(f"Unknown/Unsupported curl arguments: {unknown}")
+
+        except ValueError:  # pragma: no cover
+            return None
+
+        except AttributeError:
+            raise
+
+        except Exception as e:  # pragma: no cover
+            log.error(
+                f"An unexpected error occurred during curl arguments parsing: {e}"
+            )
+            return None
+
+        # --- Determine Method ---
+        method = "get"  # Default
+        if parsed_args.get:  # `-G` forces GET
+            method = "get"
+
+        elif parsed_args.method:
+            method = parsed_args.method.strip().lower()
+
+        # Infer POST if data is present (unless overridden by -X or -G)
+        elif any(
+            [
+                parsed_args.data,
+                parsed_args.data_raw,
+                parsed_args.data_binary,
+                parsed_args.data_urlencode,
+            ]
+        ):
+            method = "post"
+
+        headers, cookies = _ParseHeaders(parsed_args.header)
+
+        if parsed_args.cookie:
+            # We are focusing on the string format from DevTools.
+            try:
+                for key, value in _CookieParser(parsed_args.cookie):
+                    # Update the cookie dict, potentially overwriting cookies with the same name from -H 'cookie:'
+                    cookies[key] = value
+                log.debug(f"Parsed cookies from -b argument: {list(cookies.keys())}")
+            except Exception as e:  # pragma: no cover
+                log.error(
+                    f"Could not parse cookie string from -b '{parsed_args.cookie}': {e}"
+                )
+
+        # --- Process Data Payload ---
+        params = dict()
+        data_payload: Optional[str | bytes | Dict] = None
+        json_payload: Optional[Any] = None
+
+        # DevTools often uses --data-raw for JSON bodies
+        # Precedence: --data-binary > --data-raw / -d > --data-urlencode
+        if parsed_args.data_binary is not None:  # pragma: no cover
+            try:
+                data_payload = parsed_args.data_binary.encode("utf-8")
+                log.debug("Using data from --data-binary as bytes.")
+            except Exception as e:
+                log.warning(
+                    f"Could not encode binary data '{parsed_args.data_binary}' as bytes: {e}. Using raw string."
+                )
+                data_payload = parsed_args.data_binary  # Fallback to string
+
+        elif parsed_args.data_raw is not None:
+            data_payload = parsed_args.data_raw
+
+        elif parsed_args.data is not None:
+            data_payload = parsed_args.data
+
+        elif parsed_args.data_urlencode:  # pragma: no cover
+            # Combine and parse urlencoded data
+            combined_data = "&".join(parsed_args.data_urlencode)
+            try:
+                data_payload = dict(parse_qsl(combined_data, keep_blank_values=True))
+            except Exception as e:
+                log.warning(
+                    f"Could not parse urlencoded data '{combined_data}': {e}. Treating as raw string."
+                )
+                data_payload = combined_data
+
+        # Check if raw data looks like JSON, prefer 'json' param if so
+        if isinstance(data_payload, str):
+            try:
+                maybe_json = json_loads(data_payload)
+                if isinstance(maybe_json, (dict, list)):
+                    json_payload = maybe_json
+                    data_payload = None
+            except JSONDecodeError:
+                pass  # Not JSON, keep it in data_payload
+
+        # Handle `-G`: Move data to params if the method is GET
+        if method == "get" and data_payload:  # pragma: no cover
+            if isinstance(data_payload, dict):  # From --data-urlencode likely
+                params.update(data_payload)
+            elif isinstance(data_payload, str):
+                try:
+                    params.update(dict(parse_qsl(data_payload, keep_blank_values=True)))
+                except ValueError:
+                    log.warning(
+                        f"Could not parse data '{data_payload}' into GET parameters for -G."
+                    )
+
+            if params:
+                data_payload = None  # Clear data as it's moved to params
+                json_payload = None  # Should not have JSON body with -G
+
+        # --- Process Proxy ---
+        proxies: Optional[Dict[str, str]] = None
+        if parsed_args.proxy:
+            proxy_url = (
+                f"http://{parsed_args.proxy}"
+                if "://" not in parsed_args.proxy
+                else parsed_args.proxy
+            )
+
+            if parsed_args.proxy_user:
+                user_pass = parsed_args.proxy_user
+                parts = urlparse(proxy_url)
+                netloc_parts = parts.netloc.split("@")
+                netloc = (
+                    f"{user_pass}@{netloc_parts[-1]}"
+                    if len(netloc_parts) > 1
+                    else f"{user_pass}@{parts.netloc}"
+                )
+                proxy_url = urlunparse(
+                    (
+                        parts.scheme,
+                        netloc,
+                        parts.path,
+                        parts.params,
+                        parts.query,
+                        parts.fragment,
+                    )
+                )
+
+            # Standard proxy dict format
+            proxies = {"http": proxy_url, "https": proxy_url}
+            log.debug(f"Using proxy configuration: {proxies}")
+
+        # --- Final Context ---
+        return Request(
+            method=method,
+            url=parsed_args.url,
+            params=params,
+            data=data_payload,
+            json_data=json_payload,
+            headers=headers,
+            cookies=cookies,
+            proxy=proxies,
+            follow_redirects=True,  # Scrapling default is True
+        )
+
+    def convert2fetcher(self, curl_command: Request | str) -> Optional[Response]:
+        if isinstance(curl_command, (Request, str)):
+            request = (
+                self.parse(curl_command)
+                if isinstance(curl_command, str)
+                else curl_command
+            )
+
+            # Ensure request parsing was successful before proceeding
+            if request is None:  # pragma: no cover
+                log.error("Failed to parse curl command, cannot convert to fetcher.")
+                return None
+
+            request_args = request._asdict()
+            method = request_args.pop("method").strip().lower()
+            if method in self._supported_methods:
+                request_args["json"] = request_args.pop("json_data")
+
+                # Ensure data/json are removed for non-POST/PUT methods
+                if method not in ("post", "put"):
+                    _ = request_args.pop("data", None)
+                    _ = request_args.pop("json", None)
+
+                try:
+                    return getattr(Fetcher, method)(**request_args)
+                except Exception as e:  # pragma: no cover
+                    log.error(f"Error calling Fetcher.{method}: {e}")
+                    return None
+            else:  # pragma: no cover
+                log.error(
+                    f'Request method "{method}" isn\'t supported by Scrapling yet'
+                )
+                return None
+
+        else:  # pragma: no cover
+            log.error("Input must be a valid curl command string or a Request object.")
+
+        return None
+
+
+def show_page_in_browser(page: Selector):  # pragma: no cover
+    if not page or not isinstance(page, Selector):
+        log.error("Input must be of type `Selector`")
+        return
+
+    try:
+        fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
+        with open(fd, "w", encoding="utf-8") as f:
+            f.write(page.body)
+
+        open_in_browser(f"file://{fname}")
+    except IOError as e:
+        log.error(f"Failed to write temporary file for viewing: {e}")
+    except Exception as e:
+        log.error(f"An unexpected error occurred while viewing the page: {e}")
+
+
+class CustomShell:
+    """A custom IPython shell with minimal dependencies"""
+
+    def __init__(self, code, log_level="debug"):
+        self.code = code
+        self.page = None
+        self.pages = Selectors([])
+        self._curl_parser = CurlParser()
+        log_level = log_level.strip().lower()
+
+        if _known_logging_levels.get(log_level):
+            self.log_level = _known_logging_levels[log_level]
+        else:  # pragma: no cover
+            log.warning(f'Unknown log level "{log_level}", defaulting to "DEBUG"')
+            self.log_level = DEBUG
+
+        self.shell = None
+
+        # Initialize your application components
+        self.init_components()
+
+    def init_components(self):
+        """Initialize application components"""
+        # This is where you'd set up your application-specific objects
+        if self.log_level:
+            getLogger("scrapling").setLevel(self.log_level)
+
+        settings = Fetcher.display_config()
+        settings.pop("storage", None)
+        settings.pop("storage_args", None)
+        log.info(f"Scrapling {__version__} shell started")
+        log.info(f"Logging level is set to '{getLevelName(self.log_level)}'")
+        log.info(f"Fetchers' parsing settings: {settings}")
+
+    @staticmethod
+    def banner():
+        """Create a custom banner for the shell"""
+        return f"""
+-> Available Scrapling objects:
+   - Fetcher/AsyncFetcher
+   - DynamicFetcher
+   - StealthyFetcher
+   - Selector
+
+-> Useful shortcuts:
+   - {"get":<30} Shortcut for `Fetcher.get`
+   - {"post":<30} Shortcut for `Fetcher.post`
+   - {"put":<30} Shortcut for `Fetcher.put`
+   - {"delete":<30} Shortcut for `Fetcher.delete`
+   - {"fetch":<30} Shortcut for `DynamicFetcher.fetch`
+   - {"stealthy_fetch":<30} Shortcut for `StealthyFetcher.fetch`
+
+-> Useful commands
+   - {"page / response":<30} The response object of the last page you fetched
+   - {"pages":<30} Selectors object of the last 5 response objects you fetched
+   - {"uncurl('curl_command')":<30} Convert curl command to a Request object. (Optimized to handle curl commands copied from DevTools network tab.)
+   - {"curl2fetcher('curl_command')":<30} Convert curl command and make the request with Fetcher. (Optimized to handle curl commands copied from DevTools network tab.)
+   - {"view(page)":<30} View page in a browser
+   - {"help()":<30} Show this help message (Shell help)
+
+Type 'exit' or press Ctrl+D to exit.
+"""
+
+    def update_page(self, result):  # pragma: no cover
+        """Update the current page and add to pages history"""
+        self.page = result
+        if isinstance(result, (Response, Selector)):
+            self.pages.append(result)
+            if len(self.pages) > 5:
+                self.pages.pop(0)  # Remove oldest item
+
+        # Update in IPython namespace too
+        if self.shell:
+            self.shell.user_ns["page"] = self.page
+            self.shell.user_ns["response"] = self.page
+            self.shell.user_ns["pages"] = self.pages
+
+        return result
+
+    def create_wrapper(self, func):
+        """Create a wrapper that preserves function signature but updates page"""
+
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            result = func(*args, **kwargs)
+            return self.update_page(result)
+
+        return wrapper
+
+    def get_namespace(self):
+        """Create a namespace with application-specific objects"""
+
+        # Create wrapped versions of fetch functions
+        get = self.create_wrapper(Fetcher.get)
+        post = self.create_wrapper(Fetcher.post)
+        put = self.create_wrapper(Fetcher.put)
+        delete = self.create_wrapper(Fetcher.delete)
+        dynamic_fetch = self.create_wrapper(DynamicFetcher.fetch)
+        stealthy_fetch = self.create_wrapper(StealthyFetcher.fetch)
+        curl2fetcher = self.create_wrapper(self._curl_parser.convert2fetcher)
+
+        # Create the namespace dictionary
+        return {
+            "get": get,
+            "post": post,
+            "put": put,
+            "delete": delete,
+            "Fetcher": Fetcher,
+            "AsyncFetcher": AsyncFetcher,
+            "fetch": dynamic_fetch,
+            "DynamicFetcher": DynamicFetcher,
+            "stealthy_fetch": stealthy_fetch,
+            "StealthyFetcher": StealthyFetcher,
+            "Selector": Selector,
+            "page": self.page,
+            "response": self.page,
+            "pages": self.pages,
+            "view": show_page_in_browser,
+            "uncurl": self._curl_parser.parse,
+            "curl2fetcher": curl2fetcher,
+            "help": self.show_help,
+        }
+
+    def show_help(self):  # pragma: no cover
+        """Show help information"""
+        print(self.banner())
+
+    def start(self):  # pragma: no cover
+        """Start the interactive shell"""
+        # Get our namespace with application objects
+        namespace = self.get_namespace()
+        ipython_shell = InteractiveShellEmbed(
+            banner1=self.banner(),
+            banner2="",
+            enable_tip=False,
+            exit_msg="Bye Bye",
+            user_ns=namespace,
+        )
+        self.shell = ipython_shell
+
+        # If a command was provided, execute it and exit
+        if self.code:
+            log.info(f"Executing provided code: {self.code}")
+            try:
+                ipython_shell.run_cell(self.code, store_history=False)
+            except Exception as e:
+                log.error(f"Error executing initial code: {e}")
+            return
+
+        ipython_shell()
+
+
+class Convertor:
+    """Utils for the extract shell command"""
+
+    _extension_map: Dict[str, extraction_types] = {
+        "md": "markdown",
+        "html": "html",
+        "txt": "text",
+    }
+
+    @classmethod
+    def _convert_to_markdown(cls, body: TextHandler) -> str:
+        """Convert HTML content to Markdown"""
+        from markdownify import markdownify
+
+        return markdownify(body)
+
+    @classmethod
+    def _extract_content(
+        cls,
+        page: Selector,
+        extraction_type: extraction_types = "markdown",
+        css_selector: Optional[str] = None,
+        main_content_only: bool = False,
+    ) -> Generator[str, None, None]:
+        """Extract the content of a Selector"""
+        if not page or not isinstance(page, Selector):  # pragma: no cover
+            raise TypeError("Input must be of type `Selector`")
+        elif not extraction_type or extraction_type not in cls._extension_map.values():
+            raise ValueError(f"Unknown extraction type: {extraction_type}")
+        else:
+            if main_content_only:
+                page = page.css_first("body") or page
+
+            pages = [page] if not css_selector else page.css(css_selector)
+            for page in pages:
+                match extraction_type:
+                    case "markdown":
+                        yield cls._convert_to_markdown(page.body)
+                    case "html":
+                        yield page.body
+                    case "text":
+                        txt_content = page.get_all_text(strip=True)
+                        for s in (
+                            "\n",
+                            "\r",
+                            "\t",
+                            " ",
+                        ):
+                            # Remove consecutive white-spaces
+                            txt_content = re_sub(f"[{s}]+", s, txt_content)
+                        yield txt_content
+                yield ""
+
+    @classmethod
+    def write_content_to_file(
+        cls, page: Selector, filename: str, css_selector: Optional[str] = None
+    ) -> None:
+        """Write a Selector's content to a file"""
+        if not page or not isinstance(page, Selector):  # pragma: no cover
+            raise TypeError("Input must be of type `Selector`")
+        elif not filename or not isinstance(filename, str) or not filename.strip():
+            raise ValueError("Filename must be provided")
+        elif not filename.endswith((".md", ".html", ".txt")):
+            raise ValueError(
+                "Unknown file type: filename must end with '.md', '.html', or '.txt'"
+            )
+        else:
+            with open(filename, "w", encoding="utf-8") as f:
+                extension = filename.split(".")[-1]
+                f.write(
+                    "".join(
+                        cls._extract_content(
+                            page,
+                            cls._extension_map[extension],
+                            css_selector=css_selector,
+                        )
+                    )
+                )
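
The new module shown above centers on CurlParser, which turns a curl command copied from the browser DevTools network tab into the Request namedtuple, or runs it directly through Fetcher via convert2fetcher. The following is a minimal usage sketch, not part of the diff; the import path is assumed from the file list above (scrapling/core/shell.py), and the URL and curl command are illustrative only.

    from scrapling.core.shell import CurlParser

    parser = CurlParser()

    # Parse a DevTools-style curl command into the Request namedtuple defined above
    request = parser.parse(
        "curl 'https://example.com/api/items' -H 'accept: application/json' --compressed"
    )
    if request is not None:
        print(request.method, request.url, request.headers)

    # Or perform the request straight away and get a scrapling Response back:
    # response = parser.convert2fetcher("curl 'https://example.com/api/items'")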