scrapling-0.3-py3-none-any.whl → scrapling-0.3.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. scrapling/__init__.py +1 -1
  2. scrapling/cli.py +38 -51
  3. scrapling/core/_html_utils.py +3 -9
  4. scrapling/core/ai.py +5 -13
  5. scrapling/core/custom_types.py +19 -61
  6. scrapling/core/mixins.py +6 -28
  7. scrapling/core/shell.py +51 -129
  8. scrapling/core/storage.py +2 -8
  9. scrapling/core/translator.py +8 -20
  10. scrapling/core/utils/__init__.py +10 -0
  11. scrapling/core/utils/_shell.py +48 -0
  12. scrapling/core/{utils.py → utils/_utils.py} +5 -21
  13. scrapling/engines/__init__.py +0 -16
  14. scrapling/engines/_browsers/_base.py +297 -0
  15. scrapling/engines/_browsers/_camoufox.py +238 -293
  16. scrapling/engines/_browsers/_config_tools.py +2 -1
  17. scrapling/engines/_browsers/_controllers.py +220 -278
  18. scrapling/engines/_browsers/_page.py +37 -15
  19. scrapling/engines/_browsers/_validators.py +29 -15
  20. scrapling/engines/constants.py +3 -6
  21. scrapling/engines/static.py +25 -75
  22. scrapling/engines/toolbelt/__init__.py +1 -20
  23. scrapling/engines/toolbelt/convertor.py +95 -86
  24. scrapling/engines/toolbelt/custom.py +7 -99
  25. scrapling/engines/toolbelt/fingerprints.py +1 -3
  26. scrapling/engines/toolbelt/navigation.py +4 -58
  27. scrapling/fetchers.py +41 -24
  28. scrapling/parser.py +45 -122
  29. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/METADATA +57 -47
  30. scrapling-0.3.2.dist-info/RECORD +44 -0
  31. scrapling-0.3.dist-info/RECORD +0 -41
  32. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/WHEEL +0 -0
  33. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/entry_points.txt +0 -0
  34. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/licenses/LICENSE +0 -0
  35. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/top_level.txt +0 -0
scrapling/core/shell.py CHANGED
@@ -2,7 +2,6 @@
  from re import sub as re_sub
  from sys import stderr
  from functools import wraps
- from http import cookies as Cookie
  from collections import namedtuple
  from shlex import split as shlex_split
  from tempfile import mkstemp as make_temp_file
@@ -20,29 +19,20 @@ from logging import (
  getLevelName,
  )

- from IPython.terminal.embed import InteractiveShellEmbed
  from orjson import loads as json_loads, JSONDecodeError

  from scrapling import __version__
- from scrapling.core.custom_types import TextHandler
- from scrapling.core.utils import log
  from scrapling.parser import Selector, Selectors
+ from scrapling.core.custom_types import TextHandler
+ from scrapling.engines.toolbelt.custom import Response
+ from scrapling.core.utils import log, _ParseHeaders, _CookieParser
  from scrapling.core._types import (
- List,
  Optional,
  Dict,
- Tuple,
  Any,
  extraction_types,
  Generator,
  )
- from scrapling.fetchers import (
- Fetcher,
- AsyncFetcher,
- DynamicFetcher,
- StealthyFetcher,
- Response,
- )


  _known_logging_levels = {
@@ -72,54 +62,6 @@ Request = namedtuple(
  )


- def _CookieParser(cookie_string):
- # Errors will be handled on call so the log can be specified
- cookie_parser = Cookie.SimpleCookie()
- cookie_parser.load(cookie_string)
- for key, morsel in cookie_parser.items():
- yield key, morsel.value
-
-
- def _ParseHeaders(
- header_lines: List[str], parse_cookies: bool = True
- ) -> Tuple[Dict[str, str], Dict[str, str]]:
- """Parses headers into separate header and cookie dictionaries."""
- header_dict = dict()
- cookie_dict = dict()
-
- for header_line in header_lines:
- if ":" not in header_line:
- if header_line.endswith(";"):
- header_key = header_line[:-1].strip()
- header_value = ""
- header_dict[header_key] = header_value
- else:
- raise ValueError(
- f"Could not parse header without colon: '{header_line}'."
- )
- else:
- header_key, header_value = header_line.split(":", 1)
- header_key = header_key.strip()
- header_value = header_value.strip()
-
- if parse_cookies:
- if header_key.lower() == "cookie":
- try:
- cookie_dict = {
- key: value for key, value in _CookieParser(header_value)
- }
- except Exception as e: # pragma: no cover
- raise ValueError(
- f"Could not parse cookie string from header '{header_value}': {e}"
- )
- else:
- header_dict[header_key] = header_value
- else:
- header_dict[header_key] = header_value
-
- return header_dict, cookie_dict
-
-
  # Suppress exit on error to handle parsing errors gracefully
  class NoExitArgumentParser(ArgumentParser): # pragma: no cover
  def error(self, message):
@@ -130,15 +72,16 @@ class NoExitArgumentParser(ArgumentParser): # pragma: no cover
  if message:
  log.error(f"Scrapling shell exited with status {status}: {message}")
  self._print_message(message, stderr)
- raise ValueError(
- f"Scrapling shell exited with status {status}: {message or 'Unknown reason'}"
- )
+ raise ValueError(f"Scrapling shell exited with status {status}: {message or 'Unknown reason'}")


  class CurlParser:
  """Builds the argument parser for relevant curl flags from DevTools."""

  def __init__(self):
+ from scrapling.fetchers import Fetcher as __Fetcher
+
+ self.__fetcher = __Fetcher
  # We will use argparse parser to parse the curl command directly instead of regex
  # We will focus more on flags that will show up on curl commands copied from DevTools's network tab
  _parser = NoExitArgumentParser(add_help=False) # Disable default help
@@ -153,15 +96,11 @@ class CurlParser:

  # Data arguments (prioritizing types common from DevTools)
  _parser.add_argument("-d", "--data", default=None)
- _parser.add_argument(
- "--data-raw", default=None
- ) # Often used by browsers for JSON body
+ _parser.add_argument("--data-raw", default=None) # Often used by browsers for JSON body
  _parser.add_argument("--data-binary", default=None)
  # Keep urlencode for completeness, though less common from browser copy/paste
  _parser.add_argument("--data-urlencode", action="append", default=[])
- _parser.add_argument(
- "-G", "--get", action="store_true"
- ) # Use GET and put data in URL
+ _parser.add_argument("-G", "--get", action="store_true") # Use GET and put data in URL

  _parser.add_argument(
  "-b",
@@ -176,9 +115,7 @@

  # Connection/Security
  _parser.add_argument("-k", "--insecure", action="store_true")
- _parser.add_argument(
- "--compressed", action="store_true"
- ) # Very common from browsers
+ _parser.add_argument("--compressed", action="store_true") # Very common from browsers

  # Other flags often included but may not map directly to request args
  _parser.add_argument("-i", "--include", action="store_true")
@@ -195,9 +132,7 @@
  clean_command = curl_command.strip().lstrip("curl").strip().replace("\\\n", " ")

  try:
- tokens = shlex_split(
- clean_command
- ) # Split the string using shell-like syntax
+ tokens = shlex_split(clean_command) # Split the string using shell-like syntax
  except ValueError as e: # pragma: no cover
  log.error(f"Could not split command line: {e}")
  return None
@@ -214,9 +149,7 @@
  raise

  except Exception as e: # pragma: no cover
- log.error(
- f"An unexpected error occurred during curl arguments parsing: {e}"
- )
+ log.error(f"An unexpected error occurred during curl arguments parsing: {e}")
  return None

  # --- Determine Method ---
@@ -248,9 +181,7 @@
  cookies[key] = value
  log.debug(f"Parsed cookies from -b argument: {list(cookies.keys())}")
  except Exception as e: # pragma: no cover
- log.error(
- f"Could not parse cookie string from -b '{parsed_args.cookie}': {e}"
- )
+ log.error(f"Could not parse cookie string from -b '{parsed_args.cookie}': {e}")

  # --- Process Data Payload ---
  params = dict()
@@ -281,9 +212,7 @@
  try:
  data_payload = dict(parse_qsl(combined_data, keep_blank_values=True))
  except Exception as e:
- log.warning(
- f"Could not parse urlencoded data '{combined_data}': {e}. Treating as raw string."
- )
+ log.warning(f"Could not parse urlencoded data '{combined_data}': {e}. Treating as raw string.")
  data_payload = combined_data

  # Check if raw data looks like JSON, prefer 'json' param if so
@@ -304,9 +233,7 @@
  try:
  params.update(dict(parse_qsl(data_payload, keep_blank_values=True)))
  except ValueError:
- log.warning(
- f"Could not parse data '{data_payload}' into GET parameters for -G."
- )
+ log.warning(f"Could not parse data '{data_payload}' into GET parameters for -G.")

  if params:
  data_payload = None # Clear data as it's moved to params
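Note: the data-payload and -G handling above both lean on the standard library's `parse_qsl` with `keep_blank_values=True`. A quick standalone illustration of what that call produces (sample values are made up):

    from urllib.parse import parse_qsl

    combined_data = "q=scrapling&page=2&empty="
    print(dict(parse_qsl(combined_data, keep_blank_values=True)))
    # {'q': 'scrapling', 'page': '2', 'empty': ''}
    # With -G/--get this dict is merged into the URL parameters and the body is
    # cleared, exactly as the hunk above shows.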
@@ -315,21 +242,13 @@
  # --- Process Proxy ---
  proxies: Optional[Dict[str, str]] = None
  if parsed_args.proxy:
- proxy_url = (
- f"http://{parsed_args.proxy}"
- if "://" not in parsed_args.proxy
- else parsed_args.proxy
- )
+ proxy_url = f"http://{parsed_args.proxy}" if "://" not in parsed_args.proxy else parsed_args.proxy

  if parsed_args.proxy_user:
  user_pass = parsed_args.proxy_user
  parts = urlparse(proxy_url)
  netloc_parts = parts.netloc.split("@")
- netloc = (
- f"{user_pass}@{netloc_parts[-1]}"
- if len(netloc_parts) > 1
- else f"{user_pass}@{parts.netloc}"
- )
+ netloc = f"{user_pass}@{netloc_parts[-1]}" if len(netloc_parts) > 1 else f"{user_pass}@{parts.netloc}"
  proxy_url = urlunparse(
  (
  parts.scheme,
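Note: the proxy hunk above only collapses the ternaries onto one line; the logic is unchanged: prefix a scheme-less --proxy value with `http://` and splice the --proxy-user credentials into the netloc. A minimal standalone sketch of that rebuild using only the standard library (sample values are illustrative):

    from urllib.parse import urlparse, urlunparse

    proxy_url = "127.0.0.1:8080"          # e.g. the --proxy value
    user_pass = "alice:secret"            # e.g. the --proxy-user value
    proxy_url = f"http://{proxy_url}" if "://" not in proxy_url else proxy_url

    parts = urlparse(proxy_url)
    netloc = f"{user_pass}@{parts.netloc.split('@')[-1]}"  # drop any credentials already present
    proxy_url = urlunparse((parts.scheme, netloc, parts.path, parts.params, parts.query, parts.fragment))
    print(proxy_url)  # http://alice:secret@127.0.0.1:8080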
@@ -360,11 +279,7 @@

  def convert2fetcher(self, curl_command: Request | str) -> Optional[Response]:
  if isinstance(curl_command, (Request, str)):
- request = (
- self.parse(curl_command)
- if isinstance(curl_command, str)
- else curl_command
- )
+ request = self.parse(curl_command) if isinstance(curl_command, str) else curl_command

  # Ensure request parsing was successful before proceeding
  if request is None: # pragma: no cover
@@ -382,20 +297,17 @@
  _ = request_args.pop("json", None)

  try:
- return getattr(Fetcher, method)(**request_args)
+ return getattr(self.__fetcher, method)(**request_args)
  except Exception as e: # pragma: no cover
  log.error(f"Error calling Fetcher.{method}: {e}")
  return None
  else: # pragma: no cover
- log.error(
- f'Request method "{method}" isn\'t supported by Scrapling yet'
- )
+ log.error(f'Request method "{method}" isn\'t supported by Scrapling yet')
  return None

  else: # pragma: no cover
  log.error("Input must be a valid curl command string or a Request object.")
-
- return None
+ return None


  def show_page_in_browser(page: Selector): # pragma: no cover
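Note: `convert2fetcher` now dispatches through the lazily imported `self.__fetcher` instead of a module-level `Fetcher`. A hedged usage sketch, based only on the methods visible in this diff (the sample curl command is illustrative):

    from scrapling.core.shell import CurlParser

    parser = CurlParser()
    request = parser.parse("curl 'https://example.com/api' -H 'Accept: application/json' --compressed")
    response = parser.convert2fetcher(request)  # a raw command string is accepted as well
    # For a GET request this resolves internally to getattr(Fetcher, "get")(**request_args).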
@@ -405,7 +317,7 @@ def show_page_in_browser(page: Selector): # pragma: no cover

  try:
  fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
- with open(fd, "w", encoding="utf-8") as f:
+ with open(fd, "w", encoding=page.encoding) as f:
  f.write(page.body)

  open_in_browser(f"file://{fname}")
@@ -419,6 +331,19 @@ class CustomShell:
  """A custom IPython shell with minimal dependencies"""

  def __init__(self, code, log_level="debug"):
+ from IPython.terminal.embed import InteractiveShellEmbed as __InteractiveShellEmbed
+ from scrapling.fetchers import (
+ Fetcher as __Fetcher,
+ AsyncFetcher as __AsyncFetcher,
+ DynamicFetcher as __DynamicFetcher,
+ StealthyFetcher as __StealthyFetcher,
+ )
+
+ self.__InteractiveShellEmbed = __InteractiveShellEmbed
+ self.__Fetcher = __Fetcher
+ self.__AsyncFetcher = __AsyncFetcher
+ self.__DynamicFetcher = __DynamicFetcher
+ self.__StealthyFetcher = __StealthyFetcher
  self.code = code
  self.page = None
  self.pages = Selectors([])
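Note: both `CurlParser.__init__` and `CustomShell.__init__` now pull in IPython and the fetcher classes inside the constructor rather than at module import time, a pattern commonly used to break circular imports and keep `import scrapling.core.shell` lightweight. A generic sketch of the idea (module and class names below are illustrative, not Scrapling's API):

    class LazyShell:
        def __init__(self):
            # Imported only when an instance is created, not when this module loads.
            from some_heavy_module import Client  # hypothetical dependency

            self._client_cls = Client

        def run(self, url):
            return self._client_cls().fetch(url)  # hypothetical call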
@@ -442,7 +367,7 @@
  if self.log_level:
  getLogger("scrapling").setLevel(self.log_level)

- settings = Fetcher.display_config()
+ settings = self.__Fetcher.display_config()
  settings.pop("storage", None)
  settings.pop("storage_args", None)
  log.info(f"Scrapling {__version__} shell started")
@@ -508,12 +433,12 @@ Type 'exit' or press Ctrl+D to exit.
  """Create a namespace with application-specific objects"""

  # Create wrapped versions of fetch functions
- get = self.create_wrapper(Fetcher.get)
- post = self.create_wrapper(Fetcher.post)
- put = self.create_wrapper(Fetcher.put)
- delete = self.create_wrapper(Fetcher.delete)
- dynamic_fetch = self.create_wrapper(DynamicFetcher.fetch)
- stealthy_fetch = self.create_wrapper(StealthyFetcher.fetch)
+ get = self.create_wrapper(self.__Fetcher.get)
+ post = self.create_wrapper(self.__Fetcher.post)
+ put = self.create_wrapper(self.__Fetcher.put)
+ delete = self.create_wrapper(self.__Fetcher.delete)
+ dynamic_fetch = self.create_wrapper(self.__DynamicFetcher.fetch)
+ stealthy_fetch = self.create_wrapper(self.__StealthyFetcher.fetch)
  curl2fetcher = self.create_wrapper(self._curl_parser.convert2fetcher)

  # Create the namespace dictionary
@@ -522,12 +447,12 @@ Type 'exit' or press Ctrl+D to exit.
  "post": post,
  "put": put,
  "delete": delete,
- "Fetcher": Fetcher,
- "AsyncFetcher": AsyncFetcher,
+ "Fetcher": self.__Fetcher,
+ "AsyncFetcher": self.__AsyncFetcher,
  "fetch": dynamic_fetch,
- "DynamicFetcher": DynamicFetcher,
+ "DynamicFetcher": self.__DynamicFetcher,
  "stealthy_fetch": stealthy_fetch,
- "StealthyFetcher": StealthyFetcher,
+ "StealthyFetcher": self.__StealthyFetcher,
  "Selector": Selector,
  "page": self.page,
  "response": self.page,
@@ -544,9 +469,10 @@

  def start(self): # pragma: no cover
  """Start the interactive shell"""
+
  # Get our namespace with application objects
  namespace = self.get_namespace()
- ipython_shell = InteractiveShellEmbed(
+ ipython_shell = self.__InteractiveShellEmbed(
  banner1=self.banner(),
  banner2="",
  enable_tip=False,
@@ -621,20 +547,16 @@ class Convertor:
  yield ""

  @classmethod
- def write_content_to_file(
- cls, page: Selector, filename: str, css_selector: Optional[str] = None
- ) -> None:
+ def write_content_to_file(cls, page: Selector, filename: str, css_selector: Optional[str] = None) -> None:
  """Write a Selector's content to a file"""
  if not page or not isinstance(page, Selector): # pragma: no cover
  raise TypeError("Input must be of type `Selector`")
  elif not filename or not isinstance(filename, str) or not filename.strip():
  raise ValueError("Filename must be provided")
  elif not filename.endswith((".md", ".html", ".txt")):
- raise ValueError(
- "Unknown file type: filename must end with '.md', '.html', or '.txt'"
- )
+ raise ValueError("Unknown file type: filename must end with '.md', '.html', or '.txt'")
  else:
- with open(filename, "w", encoding="utf-8") as f:
+ with open(filename, "w", encoding=page.encoding) as f:
  extension = filename.split(".")[-1]
  f.write(
  "".join(
scrapling/core/storage.py CHANGED
@@ -27,11 +27,7 @@ class StorageSystemMixin(ABC): # pragma: no cover

  try:
  extracted = tld(self.url)
- return (
- extracted.top_domain_under_public_suffix
- or extracted.domain
- or default_value
- )
+ return extracted.top_domain_under_public_suffix or extracted.domain or default_value
  except AttributeError:
  return default_value

@@ -90,9 +86,7 @@ class SQLiteStorageSystem(StorageSystemMixin):
  self.connection.execute("PRAGMA journal_mode=WAL")
  self.cursor = self.connection.cursor()
  self._setup_database()
- log.debug(
- f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")'
- )
+ log.debug(f'Storage system loaded with arguments (storage_file="{storage_file}", url="{url}")')

  def _setup_database(self) -> None:
  self.cursor.execute("""
scrapling/core/translator.py CHANGED
@@ -10,10 +10,10 @@ So you don't have to learn a new selectors/api method like what bs4 done with so

  from functools import lru_cache

- from cssselect import HTMLTranslator as OriginalHTMLTranslator
- from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
  from cssselect.xpath import ExpressionError
  from cssselect.xpath import XPathExpr as OriginalXPathExpr
+ from cssselect import HTMLTranslator as OriginalHTMLTranslator
+ from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement

  from scrapling.core._types import Any, Optional, Protocol, Self

@@ -89,9 +89,7 @@ class TranslatorMixin:
  xpath = super().xpath_element(selector) # type: ignore[safe-super]
  return XPathExpr.from_xpath(xpath)

- def xpath_pseudo_element(
- self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement
- ) -> OriginalXPathExpr:
+ def xpath_pseudo_element(self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement) -> OriginalXPathExpr:
  """
  Dispatch method that transforms XPath to support the pseudo-element.
  """
@@ -99,31 +97,21 @@ class TranslatorMixin:
  method_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element"
  method = getattr(self, method_name, None)
  if not method: # pragma: no cover
- raise ExpressionError(
- f"The functional pseudo-element ::{pseudo_element.name}() is unknown"
- )
+ raise ExpressionError(f"The functional pseudo-element ::{pseudo_element.name}() is unknown")
  xpath = method(xpath, pseudo_element)
  else:
- method_name = (
- f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
- )
+ method_name = f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element"
  method = getattr(self, method_name, None)
  if not method: # pragma: no cover
- raise ExpressionError(
- f"The pseudo-element ::{pseudo_element} is unknown"
- )
+ raise ExpressionError(f"The pseudo-element ::{pseudo_element} is unknown")
  xpath = method(xpath)
  return xpath

  @staticmethod
- def xpath_attr_functional_pseudo_element(
- xpath: OriginalXPathExpr, function: FunctionalPseudoElement
- ) -> XPathExpr:
+ def xpath_attr_functional_pseudo_element(xpath: OriginalXPathExpr, function: FunctionalPseudoElement) -> XPathExpr:
  """Support selecting attribute values using ::attr() pseudo-element"""
  if function.argument_types() not in (["STRING"], ["IDENT"]): # pragma: no cover
- raise ExpressionError(
- f"Expected a single string or ident for ::attr(), got {function.arguments!r}"
- )
+ raise ExpressionError(f"Expected a single string or ident for ::attr(), got {function.arguments!r}")
  return XPathExpr.from_xpath(xpath, attribute=function.arguments[0].value)

  @staticmethod
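Note: the translator methods above are what carry CSS pseudo-elements such as `::attr()` through to XPath for Scrapling's selectors. A hedged usage sketch (the `Selector` constructor argument and `::text` support are assumptions, not taken from this diff):

    from scrapling.parser import Selector

    page = Selector("<a href='/about' title='About'>About us</a>")  # constructor args assumed
    hrefs = page.css("a::attr(href)")  # handled by xpath_attr_functional_pseudo_element above
    texts = page.css("a::text")        # handled by the simple pseudo-element dispatch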
scrapling/core/utils/__init__.py ADDED
@@ -0,0 +1,10 @@
+ from ._utils import (
+ log,
+ __CONSECUTIVE_SPACES_REGEX__,
+ flatten,
+ _is_iterable,
+ _StorageTools,
+ clean_spaces,
+ html_forbidden,
+ )
+ from ._shell import _CookieParser, _ParseHeaders
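Note: the new `scrapling/core/utils/__init__.py` simply re-exports the helpers from its two private submodules, so call sites keep a single import path; for example, the updated import in shell.py shown earlier:

    from scrapling.core.utils import log, _ParseHeaders, _CookieParser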
scrapling/core/utils/_shell.py ADDED
@@ -0,0 +1,48 @@
+ from http import cookies as Cookie
+
+
+ from scrapling.core._types import (
+ List,
+ Dict,
+ Tuple,
+ )
+
+
+ def _CookieParser(cookie_string):
+ # Errors will be handled on call so the log can be specified
+ cookie_parser = Cookie.SimpleCookie()
+ cookie_parser.load(cookie_string)
+ for key, morsel in cookie_parser.items():
+ yield key, morsel.value
+
+
+ def _ParseHeaders(header_lines: List[str], parse_cookies: bool = True) -> Tuple[Dict[str, str], Dict[str, str]]:
+ """Parses headers into separate header and cookie dictionaries."""
+ header_dict = dict()
+ cookie_dict = dict()
+
+ for header_line in header_lines:
+ if ":" not in header_line:
+ if header_line.endswith(";"):
+ header_key = header_line[:-1].strip()
+ header_value = ""
+ header_dict[header_key] = header_value
+ else:
+ raise ValueError(f"Could not parse header without colon: '{header_line}'.")
+ else:
+ header_key, header_value = header_line.split(":", 1)
+ header_key = header_key.strip()
+ header_value = header_value.strip()
+
+ if parse_cookies:
+ if header_key.lower() == "cookie":
+ try:
+ cookie_dict = {key: value for key, value in _CookieParser(header_value)}
+ except Exception as e: # pragma: no cover
+ raise ValueError(f"Could not parse cookie string from header '{header_value}': {e}")
+ else:
+ header_dict[header_key] = header_value
+ else:
+ header_dict[header_key] = header_value
+
+ return header_dict, cookie_dict
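Note: `_ParseHeaders` splits DevTools-style header lines into a headers dict and, when `parse_cookies=True`, a separate cookies dict extracted from any `Cookie:` header. A small worked example based on the code above (input values are made up):

    from scrapling.core.utils import _ParseHeaders

    headers, cookies = _ParseHeaders([
        "Accept: text/html",
        "Cookie: session=abc123; theme=dark",
    ])
    print(headers)  # {'Accept': 'text/html'}
    print(cookies)  # {'session': 'abc123', 'theme': 'dark'}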
scrapling/core/{utils.py → utils/_utils.py} RENAMED
@@ -24,9 +24,7 @@ def setup_logger():
  logger = logging.getLogger("scrapling")
  logger.setLevel(logging.INFO)

- formatter = logging.Formatter(
- fmt="[%(asctime)s] %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
- )
+ formatter = logging.Formatter(fmt="[%(asctime)s] %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S")

  console_handler = logging.StreamHandler()
  console_handler.setFormatter(formatter)
@@ -61,11 +59,7 @@ class _StorageTools:
  def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
  if not element.attrib:
  return {}
- return {
- k: v.strip()
- for k, v in element.attrib.items()
- if v and v.strip() and k not in forbidden
- }
+ return {k: v.strip() for k, v in element.attrib.items() if v and v.strip() and k not in forbidden}

  @classmethod
  def element_to_dict(cls, element: html.HtmlElement) -> Dict:
@@ -85,17 +79,11 @@ class _StorageTools:
  }
  )

- siblings = [
- child.tag for child in parent.iterchildren() if child != element
- ]
+ siblings = [child.tag for child in parent.iterchildren() if child != element]
  if siblings:
  result.update({"siblings": tuple(siblings)})

- children = [
- child.tag
- for child in element.iterchildren()
- if not isinstance(child, html_forbidden)
- ]
+ children = [child.tag for child in element.iterchildren() if not isinstance(child, html_forbidden)]
  if children:
  result.update({"children": tuple(children)})

@@ -104,11 +92,7 @@ class _StorageTools:
  @classmethod
  def _get_element_path(cls, element: html.HtmlElement):
  parent = element.getparent()
- return tuple(
- (element.tag,)
- if parent is None
- else (cls._get_element_path(parent) + (element.tag,))
- )
+ return tuple((element.tag,) if parent is None else (cls._get_element_path(parent) + (element.tag,)))


  @lru_cache(128, typed=True)
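Note: `_get_element_path` (condensed above) recursively collects tag names from the document root down to the element. A small worked example using lxml directly, with the import path taken from the new `utils/__init__.py` re-exports:

    from lxml import html
    from scrapling.core.utils import _StorageTools

    root = html.fromstring("<html><body><div><p>hi</p></div></body></html>")
    p = root.find(".//p")
    print(_StorageTools._get_element_path(p))  # ('html', 'body', 'div', 'p')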
scrapling/engines/__init__.py DELETED
@@ -1,16 +0,0 @@
- from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS, DEFAULT_FLAGS
- from .static import FetcherSession, FetcherClient, AsyncFetcherClient
- from ._browsers import (
- DynamicSession,
- AsyncDynamicSession,
- StealthySession,
- AsyncStealthySession,
- )
-
- __all__ = [
- "FetcherSession",
- "DynamicSession",
- "AsyncDynamicSession",
- "StealthySession",
- "AsyncStealthySession",
- ]