scrapling 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. scrapling/__init__.py +1 -1
  2. scrapling/cli.py +38 -51
  3. scrapling/core/_html_utils.py +3 -9
  4. scrapling/core/ai.py +5 -13
  5. scrapling/core/custom_types.py +19 -61
  6. scrapling/core/mixins.py +6 -28
  7. scrapling/core/shell.py +49 -127
  8. scrapling/core/storage.py +2 -8
  9. scrapling/core/translator.py +8 -20
  10. scrapling/core/utils/__init__.py +10 -0
  11. scrapling/core/utils/_shell.py +48 -0
  12. scrapling/core/{utils.py → utils/_utils.py} +5 -21
  13. scrapling/engines/__init__.py +0 -16
  14. scrapling/engines/_browsers/_base.py +297 -0
  15. scrapling/engines/_browsers/_camoufox.py +219 -296
  16. scrapling/engines/_browsers/_config_tools.py +2 -1
  17. scrapling/engines/_browsers/_controllers.py +201 -281
  18. scrapling/engines/_browsers/_page.py +37 -15
  19. scrapling/engines/_browsers/_validators.py +9 -15
  20. scrapling/engines/constants.py +3 -6
  21. scrapling/engines/static.py +25 -75
  22. scrapling/engines/toolbelt/__init__.py +1 -20
  23. scrapling/engines/toolbelt/convertor.py +95 -86
  24. scrapling/engines/toolbelt/custom.py +7 -99
  25. scrapling/engines/toolbelt/fingerprints.py +1 -3
  26. scrapling/engines/toolbelt/navigation.py +4 -58
  27. scrapling/fetchers.py +29 -24
  28. scrapling/parser.py +45 -122
  29. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/METADATA +54 -46
  30. scrapling-0.3.3.dist-info/RECORD +44 -0
  31. scrapling-0.3.1.dist-info/RECORD +0 -41
  32. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/WHEEL +0 -0
  33. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/entry_points.txt +0 -0
  34. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/licenses/LICENSE +0 -0
  35. {scrapling-0.3.1.dist-info → scrapling-0.3.3.dist-info}/top_level.txt +0 -0
scrapling/fetchers.py CHANGED
@@ -6,16 +6,18 @@ from scrapling.core._types import (
6
6
  SelectorWaitStates,
7
7
  Iterable,
8
8
  )
9
- from scrapling.engines import (
9
+ from scrapling.engines.static import (
10
10
  FetcherSession,
11
- StealthySession,
12
- AsyncStealthySession,
13
- DynamicSession,
14
- AsyncDynamicSession,
15
11
  FetcherClient as _FetcherClient,
16
12
  AsyncFetcherClient as _AsyncFetcherClient,
17
13
  )
18
- from scrapling.engines.toolbelt import BaseFetcher, Response
14
+ from scrapling.engines._browsers import (
15
+ DynamicSession,
16
+ StealthySession,
17
+ AsyncDynamicSession,
18
+ AsyncStealthySession,
19
+ )
20
+ from scrapling.engines.toolbelt.custom import BaseFetcher, Response
19
21
 
20
22
  __FetcherClientInstance__ = _FetcherClient()
21
23
  __AsyncFetcherClientInstance__ = _AsyncFetcherClient()
@@ -56,6 +58,7 @@ class StealthyFetcher(BaseFetcher):
56
58
  block_webrtc: bool = False,
57
59
  allow_webgl: bool = True,
58
60
  network_idle: bool = False,
61
+ load_dom: bool = True,
59
62
  humanize: bool | float = True,
60
63
  solve_cloudflare: bool = False,
61
64
  wait: int | float = 0,
@@ -92,11 +95,12 @@ class StealthyFetcher(BaseFetcher):
92
95
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
93
96
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
94
97
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
98
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
95
99
  :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
96
100
  :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
97
101
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
98
102
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
99
- :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
103
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
100
104
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
101
105
  :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
102
106
  :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
@@ -112,13 +116,10 @@ class StealthyFetcher(BaseFetcher):
112
116
  if not custom_config:
113
117
  custom_config = {}
114
118
  elif not isinstance(custom_config, dict):
115
- ValueError(
116
- f"The custom parser config must be of type dictionary, got {cls.__class__}"
117
- )
119
+ ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
118
120
 
119
121
  with StealthySession(
120
122
  wait=wait,
121
- max_pages=1,
122
123
  proxy=proxy,
123
124
  geoip=geoip,
124
125
  addons=addons,
@@ -126,6 +127,7 @@ class StealthyFetcher(BaseFetcher):
126
127
  cookies=cookies,
127
128
  headless=headless,
128
129
  humanize=humanize,
130
+ load_dom=load_dom,
129
131
  disable_ads=disable_ads,
130
132
  allow_webgl=allow_webgl,
131
133
  page_action=page_action,
@@ -155,6 +157,7 @@ class StealthyFetcher(BaseFetcher):
155
157
  block_webrtc: bool = False,
156
158
  allow_webgl: bool = True,
157
159
  network_idle: bool = False,
160
+ load_dom: bool = True,
158
161
  humanize: bool | float = True,
159
162
  solve_cloudflare: bool = False,
160
163
  wait: int | float = 0,
@@ -191,11 +194,12 @@ class StealthyFetcher(BaseFetcher):
191
194
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
192
195
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
193
196
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
197
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
194
198
  :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
195
199
  :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
196
200
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
197
201
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
198
- :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
202
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
199
203
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
200
204
  :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
201
205
  :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
@@ -211,9 +215,7 @@ class StealthyFetcher(BaseFetcher):
211
215
  if not custom_config:
212
216
  custom_config = {}
213
217
  elif not isinstance(custom_config, dict):
214
- ValueError(
215
- f"The custom parser config must be of type dictionary, got {cls.__class__}"
216
- )
218
+ ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
217
219
 
218
220
  async with AsyncStealthySession(
219
221
  wait=wait,
@@ -225,6 +227,7 @@ class StealthyFetcher(BaseFetcher):
225
227
  cookies=cookies,
226
228
  headless=headless,
227
229
  humanize=humanize,
230
+ load_dom=load_dom,
228
231
  disable_ads=disable_ads,
229
232
  allow_webgl=allow_webgl,
230
233
  page_action=page_action,
@@ -285,6 +288,7 @@ class DynamicFetcher(BaseFetcher):
285
288
  init_script: Optional[str] = None,
286
289
  cookies: Optional[Iterable[Dict]] = None,
287
290
  network_idle: bool = False,
291
+ load_dom: bool = True,
288
292
  wait_selector_state: SelectorWaitStates = "attached",
289
293
  custom_config: Optional[Dict] = None,
290
294
  ) -> Response:
@@ -298,9 +302,10 @@ class DynamicFetcher(BaseFetcher):
298
302
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
299
303
  :param cookies: Set cookies for the next request.
300
304
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
305
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
301
306
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
302
307
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
303
- :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
308
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
304
309
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
305
310
  :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
306
311
  :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
@@ -319,9 +324,7 @@ class DynamicFetcher(BaseFetcher):
319
324
  if not custom_config:
320
325
  custom_config = {}
321
326
  elif not isinstance(custom_config, dict):
322
- raise ValueError(
323
- f"The custom parser config must be of type dictionary, got {cls.__class__}"
324
- )
327
+ raise ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
325
328
 
326
329
  with DynamicSession(
327
330
  wait=wait,
@@ -332,6 +335,7 @@ class DynamicFetcher(BaseFetcher):
332
335
  cdp_url=cdp_url,
333
336
  cookies=cookies,
334
337
  headless=headless,
338
+ load_dom=load_dom,
335
339
  useragent=useragent,
336
340
  real_chrome=real_chrome,
337
341
  page_action=page_action,
@@ -371,6 +375,7 @@ class DynamicFetcher(BaseFetcher):
371
375
  init_script: Optional[str] = None,
372
376
  cookies: Optional[Iterable[Dict]] = None,
373
377
  network_idle: bool = False,
378
+ load_dom: bool = True,
374
379
  wait_selector_state: SelectorWaitStates = "attached",
375
380
  custom_config: Optional[Dict] = None,
376
381
  ) -> Response:
@@ -384,9 +389,10 @@ class DynamicFetcher(BaseFetcher):
384
389
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
385
390
  :param cookies: Set cookies for the next request.
386
391
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
392
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
387
393
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
388
394
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
389
- :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
395
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
390
396
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
391
397
  :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
392
398
  :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
@@ -405,12 +411,11 @@ class DynamicFetcher(BaseFetcher):
405
411
  if not custom_config:
406
412
  custom_config = {}
407
413
  elif not isinstance(custom_config, dict):
408
- raise ValueError(
409
- f"The custom parser config must be of type dictionary, got {cls.__class__}"
410
- )
414
+ raise ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
411
415
 
412
416
  async with AsyncDynamicSession(
413
417
  wait=wait,
418
+ max_pages=1,
414
419
  proxy=proxy,
415
420
  locale=locale,
416
421
  timeout=timeout,
@@ -418,8 +423,8 @@ class DynamicFetcher(BaseFetcher):
418
423
  cdp_url=cdp_url,
419
424
  cookies=cookies,
420
425
  headless=headless,
426
+ load_dom=load_dom,
421
427
  useragent=useragent,
422
- max_pages=1,
423
428
  real_chrome=real_chrome,
424
429
  page_action=page_action,
425
430
  hide_canvas=hide_canvas,
scrapling/parser.py CHANGED
@@ -1,12 +1,11 @@
1
- from pathlib import Path
2
1
  import re
2
+ from pathlib import Path
3
3
  from inspect import signature
4
- from difflib import SequenceMatcher
5
4
  from urllib.parse import urljoin
5
+ from difflib import SequenceMatcher
6
6
 
7
- from cssselect import SelectorError, SelectorSyntaxError
8
- from cssselect import parse as split_selectors
9
7
  from lxml.html import HtmlElement, HtmlMixin, HTMLParser
8
+ from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
10
9
  from lxml.etree import (
11
10
  XPath,
12
11
  tostring,
@@ -75,7 +74,7 @@ class Selector(SelectorsGeneration):
75
74
  self,
76
75
  content: Optional[str | bytes] = None,
77
76
  url: Optional[str] = None,
78
- encoding: str = "utf8",
77
+ encoding: str = "utf-8",
79
78
  huge_tree: bool = True,
80
79
  root: Optional[HtmlElement] = None,
81
80
  keep_comments: Optional[bool] = False,
@@ -110,22 +109,16 @@ class Selector(SelectorsGeneration):
110
109
  If empty, default values will be used.
111
110
  """
112
111
  if root is None and content is None:
113
- raise ValueError(
114
- "Selector class needs HTML content, or root arguments to work"
115
- )
112
+ raise ValueError("Selector class needs HTML content, or root arguments to work")
116
113
 
117
114
  self.__text = None
118
115
  if root is None:
119
116
  if isinstance(content, str):
120
- body = (
121
- content.strip().replace("\x00", "").encode(encoding) or b"<html/>"
122
- )
117
+ body = content.strip().replace("\x00", "").encode(encoding) or b"<html/>"
123
118
  elif isinstance(content, bytes):
124
- body = content.replace(b"\x00", b"").strip()
119
+ body = content.replace(b"\x00", b"")
125
120
  else:
126
- raise TypeError(
127
- f"content argument must be str or bytes, got {type(content)}"
128
- )
121
+ raise TypeError(f"content argument must be str or bytes, got {type(content)}")
129
122
 
130
123
  # https://lxml.de/api/lxml.etree.HTMLParser-class.html
131
124
  parser = HTMLParser(
@@ -139,8 +132,7 @@ class Selector(SelectorsGeneration):
139
132
  strip_cdata=(not keep_cdata),
140
133
  )
141
134
  self._root = fromstring(body, parser=parser, base_url=url)
142
-
143
- self._raw_body = body.decode()
135
+ self._raw_body = content
144
136
 
145
137
  else:
146
138
  # All HTML types inherit from HtmlMixin so this to check for all at once
@@ -165,16 +157,10 @@ class Selector(SelectorsGeneration):
165
157
  }
166
158
 
167
159
  if not hasattr(storage, "__wrapped__"):
168
- raise ValueError(
169
- "Storage class must be wrapped with lru_cache decorator, see docs for info"
170
- )
160
+ raise ValueError("Storage class must be wrapped with lru_cache decorator, see docs for info")
171
161
 
172
- if not issubclass(
173
- storage.__wrapped__, StorageSystemMixin
174
- ): # pragma: no cover
175
- raise ValueError(
176
- "Storage system must be inherited from class `StorageSystemMixin`"
177
- )
162
+ if not issubclass(storage.__wrapped__, StorageSystemMixin): # pragma: no cover
163
+ raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")
178
164
 
179
165
  self._storage = storage(**storage_args)
180
166
 
@@ -239,9 +225,7 @@ class Selector(SelectorsGeneration):
239
225
 
240
226
  def __element_convertor(self, element: HtmlElement) -> "Selector":
241
227
  """Used internally to convert a single HtmlElement to Selector directly without checks"""
242
- db_instance = (
243
- self._storage if (hasattr(self, "_storage") and self._storage) else None
244
- )
228
+ db_instance = self._storage if (hasattr(self, "_storage") and self._storage) else None
245
229
  return Selector(
246
230
  root=element,
247
231
  url=self.url,
@@ -355,18 +339,19 @@ class Selector(SelectorsGeneration):
355
339
  @property
356
340
  def html_content(self) -> TextHandler:
357
341
  """Return the inner HTML code of the element"""
358
- return TextHandler(
359
- tostring(self._root, encoding="unicode", method="html", with_tail=False)
360
- )
342
+ return TextHandler(tostring(self._root, encoding=self.encoding, method="html", with_tail=False))
361
343
 
362
- body = html_content
344
+ @property
345
+ def body(self):
346
+ """Return the raw body of the current `Selector` without any processing. Useful for binary and non-HTML requests."""
347
+ return self._raw_body
363
348
 
364
349
  def prettify(self) -> TextHandler:
365
350
  """Return a prettified version of the element's inner html-code"""
366
351
  return TextHandler(
367
352
  tostring(
368
353
  self._root,
369
- encoding="unicode",
354
+ encoding=self.encoding,
370
355
  pretty_print=True,
371
356
  method="html",
372
357
  with_tail=False,
@@ -404,9 +389,7 @@ class Selector(SelectorsGeneration):
404
389
  def siblings(self) -> "Selectors":
405
390
  """Return other children of the current element's parent or empty list otherwise"""
406
391
  if self.parent:
407
- return Selectors(
408
- child for child in self.parent.children if child._root != self._root
409
- )
392
+ return Selectors(child for child in self.parent.children if child._root != self._root)
410
393
  return Selectors()
411
394
 
412
395
  def iterancestors(self) -> Generator["Selector", None, None]:
@@ -519,9 +502,7 @@ class Selector(SelectorsGeneration):
519
502
  log.debug(f"Highest probability was {highest_probability}%")
520
503
  log.debug("Top 5 best matching elements are: ")
521
504
  for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
522
- log.debug(
523
- f"{percent} -> {self.__handle_elements(score_table[percent])}"
524
- )
505
+ log.debug(f"{percent} -> {self.__handle_elements(score_table[percent])}")
525
506
 
526
507
  if not selector_type:
527
508
  return score_table[highest_probability]
@@ -658,9 +639,7 @@ class Selector(SelectorsGeneration):
658
639
  SelectorError,
659
640
  SelectorSyntaxError,
660
641
  ) as e:
661
- raise SelectorSyntaxError(
662
- f"Invalid CSS selector '{selector}': {str(e)}"
663
- ) from e
642
+ raise SelectorSyntaxError(f"Invalid CSS selector '{selector}': {str(e)}") from e
664
643
 
665
644
  def xpath(
666
645
  self,
@@ -702,9 +681,7 @@ class Selector(SelectorsGeneration):
702
681
  elif self.__adaptive_enabled and auto_save:
703
682
  self.save(elements[0], identifier or selector)
704
683
 
705
- return self.__handle_elements(
706
- elements[0:1] if (_first_match and elements) else elements
707
- )
684
+ return self.__handle_elements(elements[0:1] if (_first_match and elements) else elements)
708
685
  elif self.__adaptive_enabled:
709
686
  if adaptive:
710
687
  element_data = self.retrieve(identifier or selector)
@@ -713,9 +690,7 @@ class Selector(SelectorsGeneration):
713
690
  if elements is not None and auto_save:
714
691
  self.save(elements[0], identifier or selector)
715
692
 
716
- return self.__handle_elements(
717
- elements[0:1] if (_first_match and elements) else elements
718
- )
693
+ return self.__handle_elements(elements[0:1] if (_first_match and elements) else elements)
719
694
  else:
720
695
  if adaptive:
721
696
  log.warning(
@@ -726,9 +701,7 @@ class Selector(SelectorsGeneration):
726
701
  "Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
727
702
  )
728
703
 
729
- return self.__handle_elements(
730
- elements[0:1] if (_first_match and elements) else elements
731
- )
704
+ return self.__handle_elements(elements[0:1] if (_first_match and elements) else elements)
732
705
 
733
706
  except (
734
707
  SelectorError,
@@ -751,9 +724,7 @@ class Selector(SelectorsGeneration):
751
724
  """
752
725
 
753
726
  if not args and not kwargs:
754
- raise TypeError(
755
- "You have to pass something to search with, like tag name(s), tag attributes, or both."
756
- )
727
+ raise TypeError("You have to pass something to search with, like tag name(s), tag attributes, or both.")
757
728
 
758
729
  attributes = dict()
759
730
  tags, patterns = set(), set()
@@ -766,18 +737,11 @@ class Selector(SelectorsGeneration):
766
737
 
767
738
  elif type(arg) in (list, tuple, set):
768
739
  if not all(map(lambda x: isinstance(x, str), arg)):
769
- raise TypeError(
770
- "Nested Iterables are not accepted, only iterables of tag names are accepted"
771
- )
740
+ raise TypeError("Nested Iterables are not accepted, only iterables of tag names are accepted")
772
741
  tags.update(set(arg))
773
742
 
774
743
  elif isinstance(arg, dict):
775
- if not all(
776
- [
777
- (isinstance(k, str) and isinstance(v, str))
778
- for k, v in arg.items()
779
- ]
780
- ):
744
+ if not all([(isinstance(k, str) and isinstance(v, str)) for k, v in arg.items()]):
781
745
  raise TypeError(
782
746
  "Nested dictionaries are not accepted, only string keys and string values are accepted"
783
747
  )
@@ -795,13 +759,9 @@ class Selector(SelectorsGeneration):
795
759
  )
796
760
 
797
761
  else:
798
- raise TypeError(
799
- f'Argument with type "{type(arg)}" is not accepted, please read the docs.'
800
- )
762
+ raise TypeError(f'Argument with type "{type(arg)}" is not accepted, please read the docs.')
801
763
 
802
- if not all(
803
- [(isinstance(k, str) and isinstance(v, str)) for k, v in kwargs.items()]
804
- ):
764
+ if not all([(isinstance(k, str) and isinstance(v, str)) for k, v in kwargs.items()]):
805
765
  raise TypeError("Only string values are accepted for arguments")
806
766
 
807
767
  for attribute_name, value in kwargs.items():
@@ -825,9 +785,7 @@ class Selector(SelectorsGeneration):
825
785
  if results:
826
786
  # From the results, get the ones that fulfill passed regex patterns
827
787
  for pattern in patterns:
828
- results = results.filter(
829
- lambda e: e.text.re(pattern, check_match=True)
830
- )
788
+ results = results.filter(lambda e: e.text.re(pattern, check_match=True))
831
789
 
832
790
  # From the results, get the ones that fulfill passed functions
833
791
  for function in functions:
@@ -858,9 +816,7 @@ class Selector(SelectorsGeneration):
858
816
  return element
859
817
  return None
860
818
 
861
- def __calculate_similarity_score(
862
- self, original: Dict, candidate: HtmlElement
863
- ) -> float:
819
+ def __calculate_similarity_score(self, original: Dict, candidate: HtmlElement) -> float:
864
820
  """Used internally to calculate a score that shows how a candidate element similar to the original one
865
821
 
866
822
  :param original: The original element in the form of the dictionary generated from `element_to_dict` function
@@ -877,15 +833,11 @@ class Selector(SelectorsGeneration):
877
833
  checks += 1
878
834
 
879
835
  if original["text"]:
880
- score += SequenceMatcher(
881
- None, original["text"], candidate.get("text") or ""
882
- ).ratio() # * 0.3 # 30%
836
+ score += SequenceMatcher(None, original["text"], candidate.get("text") or "").ratio() # * 0.3 # 30%
883
837
  checks += 1
884
838
 
885
839
  # if both don't have attributes, it still counts for something!
886
- score += self.__calculate_dict_diff(
887
- original["attributes"], candidate["attributes"]
888
- ) # * 0.3 # 30%
840
+ score += self.__calculate_dict_diff(original["attributes"], candidate["attributes"]) # * 0.3 # 30%
889
841
  checks += 1
890
842
 
891
843
  # Separate similarity test for class, id, href,... this will help in full structural changes
@@ -903,9 +855,7 @@ class Selector(SelectorsGeneration):
903
855
  ).ratio() # * 0.3 # 30%
904
856
  checks += 1
905
857
 
906
- score += SequenceMatcher(
907
- None, original["path"], candidate["path"]
908
- ).ratio() # * 0.1 # 10%
858
+ score += SequenceMatcher(None, original["path"], candidate["path"]).ratio() # * 0.1 # 10%
909
859
  checks += 1
910
860
 
911
861
  if original.get("parent_name"):
@@ -944,14 +894,8 @@ class Selector(SelectorsGeneration):
944
894
  @staticmethod
945
895
  def __calculate_dict_diff(dict1: Dict, dict2: Dict) -> float:
946
896
  """Used internally to calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries"""
947
- score = (
948
- SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio()
949
- * 0.5
950
- )
951
- score += (
952
- SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio()
953
- * 0.5
954
- )
897
+ score = SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio() * 0.5
898
+ score += SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio() * 0.5
955
899
  return score
956
900
 
957
901
  def save(self, element: Union["Selector", HtmlElement], identifier: str) -> None:
@@ -992,7 +936,7 @@ class Selector(SelectorsGeneration):
992
936
  # Operations on text functions
993
937
  def json(self) -> Dict:
994
938
  """Return JSON response if the response is jsonable otherwise throws error"""
995
- if self._raw_body:
939
+ if self._raw_body and isinstance(self._raw_body, str):
996
940
  return TextHandler(self._raw_body).json()
997
941
  elif self.text:
998
942
  return self.text.json()
@@ -1031,9 +975,7 @@ class Selector(SelectorsGeneration):
1031
975
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
1032
976
  :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
1033
977
  """
1034
- return self.text.re_first(
1035
- regex, default, replace_entities, clean_match, case_sensitive
1036
- )
978
+ return self.text.re_first(regex, default, replace_entities, clean_match, case_sensitive)
1037
979
 
1038
980
  @staticmethod
1039
981
  def __get_attributes(element: HtmlElement, ignore_attributes: List | Tuple) -> Dict:
@@ -1052,9 +994,7 @@ class Selector(SelectorsGeneration):
1052
994
  """Calculate a score of how much these elements are alike and return True
1053
995
  if the score is higher or equals the threshold"""
1054
996
  candidate_attributes = (
1055
- self.__get_attributes(candidate, ignore_attributes)
1056
- if ignore_attributes
1057
- else candidate.attrib
997
+ self.__get_attributes(candidate, ignore_attributes) if ignore_attributes else candidate.attrib
1058
998
  )
1059
999
  score, checks = 0, 0
1060
1000
 
@@ -1116,11 +1056,7 @@ class Selector(SelectorsGeneration):
1116
1056
  similar_elements = list()
1117
1057
 
1118
1058
  current_depth = len(list(root.iterancestors()))
1119
- target_attrs = (
1120
- self.__get_attributes(root, ignore_attributes)
1121
- if ignore_attributes
1122
- else root.attrib
1123
- )
1059
+ target_attrs = self.__get_attributes(root, ignore_attributes) if ignore_attributes else root.attrib
1124
1060
 
1125
1061
  path_parts = [self.tag]
1126
1062
  if (parent := root.getparent()) is not None:
@@ -1129,9 +1065,7 @@ class Selector(SelectorsGeneration):
1129
1065
  path_parts.insert(0, grandparent.tag)
1130
1066
 
1131
1067
  xpath_path = "//{}".format("/".join(path_parts))
1132
- potential_matches = root.xpath(
1133
- f"{xpath_path}[count(ancestor::*) = {current_depth}]"
1134
- )
1068
+ potential_matches = root.xpath(f"{xpath_path}[count(ancestor::*) = {current_depth}]")
1135
1069
 
1136
1070
  for potential_match in potential_matches:
1137
1071
  if potential_match != root and self.__are_alike(
@@ -1275,12 +1209,7 @@ class Selectors(List[Selector]):
1275
1209
 
1276
1210
  :return: `Selectors` class.
1277
1211
  """
1278
- results = [
1279
- n.xpath(
1280
- selector, identifier or selector, False, auto_save, percentage, **kwargs
1281
- )
1282
- for n in self
1283
- ]
1212
+ results = [n.xpath(selector, identifier or selector, False, auto_save, percentage, **kwargs) for n in self]
1284
1213
  return self.__class__(flatten(results))
1285
1214
 
1286
1215
  def css(
@@ -1308,10 +1237,7 @@ class Selectors(List[Selector]):
1308
1237
 
1309
1238
  :return: `Selectors` class.
1310
1239
  """
1311
- results = [
1312
- n.css(selector, identifier or selector, False, auto_save, percentage)
1313
- for n in self
1314
- ]
1240
+ results = [n.css(selector, identifier or selector, False, auto_save, percentage) for n in self]
1315
1241
  return self.__class__(flatten(results))
1316
1242
 
1317
1243
  def re(
@@ -1329,10 +1255,7 @@ class Selectors(List[Selector]):
1329
1255
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
1330
1256
  :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
1331
1257
  """
1332
- results = [
1333
- n.text.re(regex, replace_entities, clean_match, case_sensitive)
1334
- for n in self
1335
- ]
1258
+ results = [n.text.re(regex, replace_entities, clean_match, case_sensitive) for n in self]
1336
1259
  return TextHandlers(flatten(results))
1337
1260
 
1338
1261
  def re_first(