scrapling 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. scrapling/__init__.py +1 -1
  2. scrapling/cli.py +38 -51
  3. scrapling/core/_html_utils.py +3 -9
  4. scrapling/core/ai.py +5 -13
  5. scrapling/core/custom_types.py +19 -61
  6. scrapling/core/mixins.py +6 -28
  7. scrapling/core/shell.py +49 -127
  8. scrapling/core/storage.py +2 -8
  9. scrapling/core/translator.py +8 -20
  10. scrapling/core/utils/__init__.py +10 -0
  11. scrapling/core/utils/_shell.py +48 -0
  12. scrapling/core/{utils.py → utils/_utils.py} +5 -21
  13. scrapling/engines/__init__.py +0 -16
  14. scrapling/engines/_browsers/_base.py +297 -0
  15. scrapling/engines/_browsers/_camoufox.py +219 -296
  16. scrapling/engines/_browsers/_config_tools.py +2 -1
  17. scrapling/engines/_browsers/_controllers.py +201 -281
  18. scrapling/engines/_browsers/_page.py +37 -15
  19. scrapling/engines/_browsers/_validators.py +9 -15
  20. scrapling/engines/constants.py +3 -6
  21. scrapling/engines/static.py +25 -75
  22. scrapling/engines/toolbelt/__init__.py +1 -20
  23. scrapling/engines/toolbelt/convertor.py +95 -86
  24. scrapling/engines/toolbelt/custom.py +7 -99
  25. scrapling/engines/toolbelt/fingerprints.py +1 -3
  26. scrapling/engines/toolbelt/navigation.py +4 -58
  27. scrapling/fetchers.py +29 -24
  28. scrapling/parser.py +45 -122
  29. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/METADATA +54 -46
  30. scrapling-0.3.3.dist-info/RECORD +44 -0
  31. scrapling-0.3.1.dist-info/RECORD +0 -41
  32. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/WHEEL +0 -0
  33. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/entry_points.txt +0 -0
  34. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/licenses/LICENSE +0 -0
  35. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/top_level.txt +0 -0
scrapling/core/shell.py CHANGED
@@ -2,7 +2,6 @@
  from re import sub as re_sub
  from sys import stderr
  from functools import wraps
- from http import cookies as Cookie
  from collections import namedtuple
  from shlex import split as shlex_split
  from tempfile import mkstemp as make_temp_file
@@ -23,25 +22,17 @@ from logging import (
  from orjson import loads as json_loads, JSONDecodeError

  from scrapling import __version__
- from scrapling.core.custom_types import TextHandler
- from scrapling.core.utils import log
  from scrapling.parser import Selector, Selectors
+ from scrapling.core.custom_types import TextHandler
+ from scrapling.engines.toolbelt.custom import Response
+ from scrapling.core.utils import log, _ParseHeaders, _CookieParser
  from scrapling.core._types import (
-     List,
      Optional,
      Dict,
-     Tuple,
      Any,
      extraction_types,
      Generator,
  )
- from scrapling.fetchers import (
-     Fetcher,
-     AsyncFetcher,
-     DynamicFetcher,
-     StealthyFetcher,
-     Response,
- )


  _known_logging_levels = {
@@ -71,54 +62,6 @@ Request = namedtuple(
  )


- def _CookieParser(cookie_string):
-     # Errors will be handled on call so the log can be specified
-     cookie_parser = Cookie.SimpleCookie()
-     cookie_parser.load(cookie_string)
-     for key, morsel in cookie_parser.items():
-         yield key, morsel.value
-
-
- def _ParseHeaders(
-     header_lines: List[str], parse_cookies: bool = True
- ) -> Tuple[Dict[str, str], Dict[str, str]]:
-     """Parses headers into separate header and cookie dictionaries."""
-     header_dict = dict()
-     cookie_dict = dict()
-
-     for header_line in header_lines:
-         if ":" not in header_line:
-             if header_line.endswith(";"):
-                 header_key = header_line[:-1].strip()
-                 header_value = ""
-                 header_dict[header_key] = header_value
-             else:
-                 raise ValueError(
-                     f"Could not parse header without colon: '{header_line}'."
-                 )
-         else:
-             header_key, header_value = header_line.split(":", 1)
-             header_key = header_key.strip()
-             header_value = header_value.strip()
-
-             if parse_cookies:
-                 if header_key.lower() == "cookie":
-                     try:
-                         cookie_dict = {
-                             key: value for key, value in _CookieParser(header_value)
-                         }
-                     except Exception as e:  # pragma: no cover
-                         raise ValueError(
-                             f"Could not parse cookie string from header '{header_value}': {e}"
-                         )
-                 else:
-                     header_dict[header_key] = header_value
-             else:
-                 header_dict[header_key] = header_value
-
-     return header_dict, cookie_dict
-
-
  # Suppress exit on error to handle parsing errors gracefully
  class NoExitArgumentParser(ArgumentParser):  # pragma: no cover
      def error(self, message):
@@ -129,15 +72,16 @@ class NoExitArgumentParser(ArgumentParser): # pragma: no cover
          if message:
              log.error(f"Scrapling shell exited with status {status}: {message}")
              self._print_message(message, stderr)
-             raise ValueError(
-                 f"Scrapling shell exited with status {status}: {message or 'Unknown reason'}"
-             )
+             raise ValueError(f"Scrapling shell exited with status {status}: {message or 'Unknown reason'}")


  class CurlParser:
      """Builds the argument parser for relevant curl flags from DevTools."""

      def __init__(self):
+         from scrapling.fetchers import Fetcher as __Fetcher
+
+         self.__fetcher = __Fetcher
          # We will use argparse parser to parse the curl command directly instead of regex
          # We will focus more on flags that will show up on curl commands copied from DevTools's network tab
          _parser = NoExitArgumentParser(add_help=False)  # Disable default help
@@ -152,15 +96,11 @@ class CurlParser:

          # Data arguments (prioritizing types common from DevTools)
          _parser.add_argument("-d", "--data", default=None)
-         _parser.add_argument(
-             "--data-raw", default=None
-         )  # Often used by browsers for JSON body
+         _parser.add_argument("--data-raw", default=None)  # Often used by browsers for JSON body
          _parser.add_argument("--data-binary", default=None)
          # Keep urlencode for completeness, though less common from browser copy/paste
          _parser.add_argument("--data-urlencode", action="append", default=[])
-         _parser.add_argument(
-             "-G", "--get", action="store_true"
-         )  # Use GET and put data in URL
+         _parser.add_argument("-G", "--get", action="store_true")  # Use GET and put data in URL

          _parser.add_argument(
              "-b",
@@ -175,9 +115,7 @@ class CurlParser:

          # Connection/Security
          _parser.add_argument("-k", "--insecure", action="store_true")
-         _parser.add_argument(
-             "--compressed", action="store_true"
-         )  # Very common from browsers
+         _parser.add_argument("--compressed", action="store_true")  # Very common from browsers

          # Other flags often included but may not map directly to request args
          _parser.add_argument("-i", "--include", action="store_true")
@@ -194,9 +132,7 @@ class CurlParser:
          clean_command = curl_command.strip().lstrip("curl").strip().replace("\\\n", " ")

          try:
-             tokens = shlex_split(
-                 clean_command
-             )  # Split the string using shell-like syntax
+             tokens = shlex_split(clean_command)  # Split the string using shell-like syntax
          except ValueError as e:  # pragma: no cover
              log.error(f"Could not split command line: {e}")
              return None
@@ -213,9 +149,7 @@
                  raise

          except Exception as e:  # pragma: no cover
-             log.error(
-                 f"An unexpected error occurred during curl arguments parsing: {e}"
-             )
+             log.error(f"An unexpected error occurred during curl arguments parsing: {e}")
              return None

          # --- Determine Method ---
@@ -247,9 +181,7 @@
                      cookies[key] = value
                  log.debug(f"Parsed cookies from -b argument: {list(cookies.keys())}")
              except Exception as e:  # pragma: no cover
-                 log.error(
-                     f"Could not parse cookie string from -b '{parsed_args.cookie}': {e}"
-                 )
+                 log.error(f"Could not parse cookie string from -b '{parsed_args.cookie}': {e}")

          # --- Process Data Payload ---
          params = dict()
@@ -280,9 +212,7 @@
              try:
                  data_payload = dict(parse_qsl(combined_data, keep_blank_values=True))
              except Exception as e:
-                 log.warning(
-                     f"Could not parse urlencoded data '{combined_data}': {e}. Treating as raw string."
-                 )
+                 log.warning(f"Could not parse urlencoded data '{combined_data}': {e}. Treating as raw string.")
                  data_payload = combined_data

          # Check if raw data looks like JSON, prefer 'json' param if so
@@ -303,9 +233,7 @@
              try:
                  params.update(dict(parse_qsl(data_payload, keep_blank_values=True)))
              except ValueError:
-                 log.warning(
-                     f"Could not parse data '{data_payload}' into GET parameters for -G."
-                 )
+                 log.warning(f"Could not parse data '{data_payload}' into GET parameters for -G.")

              if params:
                  data_payload = None  # Clear data as it's moved to params
@@ -314,21 +242,13 @@
          # --- Process Proxy ---
          proxies: Optional[Dict[str, str]] = None
          if parsed_args.proxy:
-             proxy_url = (
-                 f"http://{parsed_args.proxy}"
-                 if "://" not in parsed_args.proxy
-                 else parsed_args.proxy
-             )
+             proxy_url = f"http://{parsed_args.proxy}" if "://" not in parsed_args.proxy else parsed_args.proxy

              if parsed_args.proxy_user:
                  user_pass = parsed_args.proxy_user
                  parts = urlparse(proxy_url)
                  netloc_parts = parts.netloc.split("@")
-                 netloc = (
-                     f"{user_pass}@{netloc_parts[-1]}"
-                     if len(netloc_parts) > 1
-                     else f"{user_pass}@{parts.netloc}"
-                 )
+                 netloc = f"{user_pass}@{netloc_parts[-1]}" if len(netloc_parts) > 1 else f"{user_pass}@{parts.netloc}"
                  proxy_url = urlunparse(
                      (
                          parts.scheme,
@@ -359,11 +279,7 @@

      def convert2fetcher(self, curl_command: Request | str) -> Optional[Response]:
          if isinstance(curl_command, (Request, str)):
-             request = (
-                 self.parse(curl_command)
-                 if isinstance(curl_command, str)
-                 else curl_command
-             )
+             request = self.parse(curl_command) if isinstance(curl_command, str) else curl_command

              # Ensure request parsing was successful before proceeding
              if request is None:  # pragma: no cover
@@ -381,14 +297,12 @@
                  _ = request_args.pop("json", None)

                  try:
-                     return getattr(Fetcher, method)(**request_args)
+                     return getattr(self.__fetcher, method)(**request_args)
                  except Exception as e:  # pragma: no cover
                      log.error(f"Error calling Fetcher.{method}: {e}")
                      return None
              else:  # pragma: no cover
-                 log.error(
-                     f'Request method "{method}" isn\'t supported by Scrapling yet'
-                 )
+                 log.error(f'Request method "{method}" isn\'t supported by Scrapling yet')
                  return None

          else:  # pragma: no cover
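Note: after the edits above, shell.py no longer imports Fetcher at module level; CurlParser caches it in its own __init__ and convert2fetcher dispatches to it by HTTP method. A hedged usage sketch, not part of the diff (the URL and header are examples, a real network call is made, and page.status is assumed to carry the HTTP status code as in Scrapling's Response):

from scrapling.core.shell import CurlParser

parser = CurlParser()
# A command as copied from the browser DevTools "Copy as cURL" menu
curl_cmd = "curl 'https://httpbin.org/get' -H 'accept: application/json' --compressed"
request = parser.parse(curl_cmd)        # parsed into the Request namedtuple defined above
page = parser.convert2fetcher(request)  # dispatches to the lazily imported Fetcher
print(page.status if page is not None else "parsing or fetching failed")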
@@ -403,7 +317,7 @@ def show_page_in_browser(page: Selector): # pragma: no cover

      try:
          fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
-         with open(fd, "w", encoding="utf-8") as f:
+         with open(fd, "w", encoding=page.encoding) as f:
              f.write(page.body)

          open_in_browser(f"file://{fname}")
@@ -417,6 +331,19 @@ class CustomShell:
      """A custom IPython shell with minimal dependencies"""

      def __init__(self, code, log_level="debug"):
+         from IPython.terminal.embed import InteractiveShellEmbed as __InteractiveShellEmbed
+         from scrapling.fetchers import (
+             Fetcher as __Fetcher,
+             AsyncFetcher as __AsyncFetcher,
+             DynamicFetcher as __DynamicFetcher,
+             StealthyFetcher as __StealthyFetcher,
+         )
+
+         self.__InteractiveShellEmbed = __InteractiveShellEmbed
+         self.__Fetcher = __Fetcher
+         self.__AsyncFetcher = __AsyncFetcher
+         self.__DynamicFetcher = __DynamicFetcher
+         self.__StealthyFetcher = __StealthyFetcher
          self.code = code
          self.page = None
          self.pages = Selectors([])
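Note: the lines added to CustomShell.__init__ above (and the matching one in CurlParser.__init__) move the IPython and fetcher imports from module level into the constructor and keep references on the instance, so those modules are only loaded when a shell object is actually created. A minimal, self-contained sketch of that deferred-import pattern; the class and the imported module here are illustrative, not part of Scrapling:

class LazyClient:
    def __init__(self):
        # The import runs when an instance is created, not when this module is imported,
        # which helps with heavy dependencies and circular-import chains.
        from json import dumps as _dumps

        self._dumps = _dumps  # keep the imported callable on the instance

    def render(self, payload: dict) -> str:
        return self._dumps(payload)


print(LazyClient().render({"ok": True}))  # prints {"ok": true}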
@@ -440,7 +367,7 @@ class CustomShell:
          if self.log_level:
              getLogger("scrapling").setLevel(self.log_level)

-         settings = Fetcher.display_config()
+         settings = self.__Fetcher.display_config()
          settings.pop("storage", None)
          settings.pop("storage_args", None)
          log.info(f"Scrapling {__version__} shell started")
@@ -506,12 +433,12 @@ Type 'exit' or press Ctrl+D to exit.
          """Create a namespace with application-specific objects"""

          # Create wrapped versions of fetch functions
-         get = self.create_wrapper(Fetcher.get)
-         post = self.create_wrapper(Fetcher.post)
-         put = self.create_wrapper(Fetcher.put)
-         delete = self.create_wrapper(Fetcher.delete)
-         dynamic_fetch = self.create_wrapper(DynamicFetcher.fetch)
-         stealthy_fetch = self.create_wrapper(StealthyFetcher.fetch)
+         get = self.create_wrapper(self.__Fetcher.get)
+         post = self.create_wrapper(self.__Fetcher.post)
+         put = self.create_wrapper(self.__Fetcher.put)
+         delete = self.create_wrapper(self.__Fetcher.delete)
+         dynamic_fetch = self.create_wrapper(self.__DynamicFetcher.fetch)
+         stealthy_fetch = self.create_wrapper(self.__StealthyFetcher.fetch)
          curl2fetcher = self.create_wrapper(self._curl_parser.convert2fetcher)

          # Create the namespace dictionary
@@ -520,12 +447,12 @@ Type 'exit' or press Ctrl+D to exit.
              "post": post,
              "put": put,
              "delete": delete,
-             "Fetcher": Fetcher,
-             "AsyncFetcher": AsyncFetcher,
+             "Fetcher": self.__Fetcher,
+             "AsyncFetcher": self.__AsyncFetcher,
              "fetch": dynamic_fetch,
-             "DynamicFetcher": DynamicFetcher,
+             "DynamicFetcher": self.__DynamicFetcher,
              "stealthy_fetch": stealthy_fetch,
-             "StealthyFetcher": StealthyFetcher,
+             "StealthyFetcher": self.__StealthyFetcher,
              "Selector": Selector,
              "page": self.page,
              "response": self.page,
@@ -542,11 +469,10 @@ Type 'exit' or press Ctrl+D to exit.

      def start(self):  # pragma: no cover
          """Start the interactive shell"""
-         from IPython.terminal.embed import InteractiveShellEmbed

          # Get our namespace with application objects
          namespace = self.get_namespace()
-         ipython_shell = InteractiveShellEmbed(
+         ipython_shell = self.__InteractiveShellEmbed(
              banner1=self.banner(),
              banner2="",
              enable_tip=False,
@@ -621,20 +547,16 @@ class Convertor:
              yield ""

      @classmethod
-     def write_content_to_file(
-         cls, page: Selector, filename: str, css_selector: Optional[str] = None
-     ) -> None:
+     def write_content_to_file(cls, page: Selector, filename: str, css_selector: Optional[str] = None) -> None:
          """Write a Selector's content to a file"""
          if not page or not isinstance(page, Selector):  # pragma: no cover
              raise TypeError("Input must be of type `Selector`")
          elif not filename or not isinstance(filename, str) or not filename.strip():
              raise ValueError("Filename must be provided")
          elif not filename.endswith((".md", ".html", ".txt")):
-             raise ValueError(
-                 "Unknown file type: filename must end with '.md', '.html', or '.txt'"
-             )
+             raise ValueError("Unknown file type: filename must end with '.md', '.html', or '.txt'")
          else:
-             with open(filename, "w", encoding="utf-8") as f:
+             with open(filename, "w", encoding=page.encoding) as f:
                  extension = filename.split(".")[-1]
                  f.write(
                      "".join(
scrapling/core/storage.py CHANGED
@@ -27,11 +27,7 @@ class StorageSystemMixin(ABC): # pragma: no cover

          try:
              extracted = tld(self.url)
-             return (
-                 extracted.top_domain_under_public_suffix
-                 or extracted.domain
-                 or default_value
-             )
+             return extracted.top_domain_under_public_suffix or extracted.domain or default_value
          except AttributeError:
              return default_value

@@ -90,9 +86,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
          self.connection.execute("PRAGMA journal_mode=WAL")
          self.cursor = self.connection.cursor()
          self._setup_database()
-         log.debug(
-             f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")'
-         )
+         log.debug(f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")')

      def _setup_database(self) -> None:
          self.cursor.execute("""
scrapling/core/translator.py CHANGED
@@ -10,10 +10,10 @@ So you don't have to learn a new selectors/api method like what bs4 done with so

  from functools import lru_cache

- from cssselect import HTMLTranslator as OriginalHTMLTranslator
- from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
  from cssselect.xpath import ExpressionError
  from cssselect.xpath import XPathExpr as OriginalXPathExpr
+ from cssselect import HTMLTranslator as OriginalHTMLTranslator
+ from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement

  from scrapling.core._types import Any, Optional, Protocol, Self

@@ -89,9 +89,7 @@ class TranslatorMixin:
          xpath = super().xpath_element(selector)  # type: ignore[safe-super]
          return XPathExpr.from_xpath(xpath)

-     def xpath_pseudo_element(
-         self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement
-     ) -> OriginalXPathExpr:
+     def xpath_pseudo_element(self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement) -> OriginalXPathExpr:
          """
          Dispatch method that transforms XPath to support the pseudo-element.
          """
@@ -99,31 +97,21 @@ class TranslatorMixin:
              method_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
              method = getattr(self, method_name, None)
              if not method:  # pragma: no cover
-                 raise ExpressionError(
-                     f"The functional pseudo-element ::{pseudo_element.name}() is unknown"
-                 )
+                 raise ExpressionError(f"The functional pseudo-element ::{pseudo_element.name}() is unknown")
              xpath = method(xpath, pseudo_element)
          else:
-             method_name = (
-                 f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
-             )
+             method_name = f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
              method = getattr(self, method_name, None)
              if not method:  # pragma: no cover
-                 raise ExpressionError(
-                     f"The pseudo-element ::{pseudo_element} is unknown"
-                 )
+                 raise ExpressionError(f"The pseudo-element ::{pseudo_element} is unknown")
              xpath = method(xpath)
          return xpath

      @staticmethod
-     def xpath_attr_functional_pseudo_element(
-         xpath: OriginalXPathExpr, function: FunctionalPseudoElement
-     ) -> XPathExpr:
+     def xpath_attr_functional_pseudo_element(xpath: OriginalXPathExpr, function: FunctionalPseudoElement) -> XPathExpr:
          """Support selecting attribute values using ::attr() pseudo-element"""
          if function.argument_types() not in (["STRING"], ["IDENT"]):  # pragma: no cover
-             raise ExpressionError(
-                 f"Expected a single string or ident for ::attr(), got {function.arguments!r}"
-             )
+             raise ExpressionError(f"Expected a single string or ident for ::attr(), got {function.arguments!r}")
          return XPathExpr.from_xpath(xpath, attribute=function.arguments[0].value)

      @staticmethod
scrapling/core/utils/__init__.py ADDED
@@ -0,0 +1,10 @@
+ from ._utils import (
+     log,
+     __CONSECUTIVE_SPACES_REGEX__,
+     flatten,
+     _is_iterable,
+     _StorageTools,
+     clean_spaces,
+     html_forbidden,
+ )
+ from ._shell import _CookieParser, _ParseHeaders
scrapling/core/utils/_shell.py ADDED
@@ -0,0 +1,48 @@
+ from http import cookies as Cookie
+
+
+ from scrapling.core._types import (
+     List,
+     Dict,
+     Tuple,
+ )
+
+
+ def _CookieParser(cookie_string):
+     # Errors will be handled on call so the log can be specified
+     cookie_parser = Cookie.SimpleCookie()
+     cookie_parser.load(cookie_string)
+     for key, morsel in cookie_parser.items():
+         yield key, morsel.value
+
+
+ def _ParseHeaders(header_lines: List[str], parse_cookies: bool = True) -> Tuple[Dict[str, str], Dict[str, str]]:
+     """Parses headers into separate header and cookie dictionaries."""
+     header_dict = dict()
+     cookie_dict = dict()
+
+     for header_line in header_lines:
+         if ":" not in header_line:
+             if header_line.endswith(";"):
+                 header_key = header_line[:-1].strip()
+                 header_value = ""
+                 header_dict[header_key] = header_value
+             else:
+                 raise ValueError(f"Could not parse header without colon: '{header_line}'.")
+         else:
+             header_key, header_value = header_line.split(":", 1)
+             header_key = header_key.strip()
+             header_value = header_value.strip()
+
+             if parse_cookies:
+                 if header_key.lower() == "cookie":
+                     try:
+                         cookie_dict = {key: value for key, value in _CookieParser(header_value)}
+                     except Exception as e:  # pragma: no cover
+                         raise ValueError(f"Could not parse cookie string from header '{header_value}': {e}")
+                 else:
+                     header_dict[header_key] = header_value
+             else:
+                 header_dict[header_key] = header_value
+
+     return header_dict, cookie_dict
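Note: these helpers were moved here from scrapling/core/shell.py (see the deletion in that file above) and are re-exported by scrapling.core.utils. A quick illustration of their behaviour with made-up header values:

from scrapling.core.utils import _CookieParser, _ParseHeaders

headers, cookies = _ParseHeaders(
    ["accept: text/html", "cookie: session=abc123; theme=dark"],
    parse_cookies=True,
)
print(headers)                          # {'accept': 'text/html'}
print(cookies)                          # {'session': 'abc123', 'theme': 'dark'}
print(dict(_CookieParser("a=1; b=2")))  # {'a': '1', 'b': '2'}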
scrapling/core/{utils.py → utils/_utils.py} RENAMED
@@ -24,9 +24,7 @@ def setup_logger():
      logger = logging.getLogger("scrapling")
      logger.setLevel(logging.INFO)

-     formatter = logging.Formatter(
-         fmt="[%(asctime)s] %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
-     )
+     formatter = logging.Formatter(fmt="[%(asctime)s] %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S")

      console_handler = logging.StreamHandler()
      console_handler.setFormatter(formatter)
@@ -61,11 +59,7 @@ class _StorageTools:
      def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
          if not element.attrib:
              return {}
-         return {
-             k: v.strip()
-             for k, v in element.attrib.items()
-             if v and v.strip() and k not in forbidden
-         }
+         return {k: v.strip() for k, v in element.attrib.items() if v and v.strip() and k not in forbidden}

      @classmethod
      def element_to_dict(cls, element: html.HtmlElement) -> Dict:
@@ -85,17 +79,11 @@ class _StorageTools:
              }
          )

-         siblings = [
-             child.tag for child in parent.iterchildren() if child != element
-         ]
+         siblings = [child.tag for child in parent.iterchildren() if child != element]
          if siblings:
              result.update({"siblings": tuple(siblings)})

-         children = [
-             child.tag
-             for child in element.iterchildren()
-             if not isinstance(child, html_forbidden)
-         ]
+         children = [child.tag for child in element.iterchildren() if not isinstance(child, html_forbidden)]
          if children:
              result.update({"children": tuple(children)})

@@ -104,11 +92,7 @@
      @classmethod
      def _get_element_path(cls, element: html.HtmlElement):
          parent = element.getparent()
-         return tuple(
-             (element.tag,)
-             if parent is None
-             else (cls._get_element_path(parent) + (element.tag,))
-         )
+         return tuple((element.tag,) if parent is None else (cls._get_element_path(parent) + (element.tag,)))


  @lru_cache(128, typed=True)
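Note: the one-liner rewrite of _get_element_path keeps the same recursion, walking up through the parents and returning the tag path from the root down to the element. A small standalone check (lxml only; the markup is made up and _StorageTools is an internal helper):

from lxml import html

from scrapling.core.utils import _StorageTools

p = html.fromstring("<html><body><div><p id='x'>hi</p></div></body></html>").xpath("//p")[0]
print(_StorageTools._get_element_path(p))  # ('html', 'body', 'div', 'p')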
scrapling/engines/__init__.py CHANGED
@@ -1,16 +0,0 @@
- from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS, DEFAULT_FLAGS
- from .static import FetcherSession, FetcherClient, AsyncFetcherClient
- from ._browsers import (
-     DynamicSession,
-     AsyncDynamicSession,
-     StealthySession,
-     AsyncStealthySession,
- )
-
- __all__ = [
-     "FetcherSession",
-     "DynamicSession",
-     "AsyncDynamicSession",
-     "StealthySession",
-     "AsyncStealthySession",
- ]
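Note (assumption, not shown in this diff): with these re-exports removed from scrapling/engines/__init__.py, code that still needs the session classes would import them from the concrete modules the deleted lines pointed at, which the file list above shows are still shipped in 0.3.3:

from scrapling.engines.static import FetcherSession
from scrapling.engines._browsers import DynamicSession, AsyncDynamicSession, StealthySession, AsyncStealthySession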