scrapling-0.3-py3-none-any.whl → scrapling-0.3.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. scrapling/__init__.py +1 -1
  2. scrapling/cli.py +38 -51
  3. scrapling/core/_html_utils.py +3 -9
  4. scrapling/core/ai.py +5 -13
  5. scrapling/core/custom_types.py +19 -61
  6. scrapling/core/mixins.py +6 -28
  7. scrapling/core/shell.py +51 -129
  8. scrapling/core/storage.py +2 -8
  9. scrapling/core/translator.py +8 -20
  10. scrapling/core/utils/__init__.py +10 -0
  11. scrapling/core/utils/_shell.py +48 -0
  12. scrapling/core/{utils.py → utils/_utils.py} +5 -21
  13. scrapling/engines/__init__.py +0 -16
  14. scrapling/engines/_browsers/_base.py +297 -0
  15. scrapling/engines/_browsers/_camoufox.py +238 -293
  16. scrapling/engines/_browsers/_config_tools.py +2 -1
  17. scrapling/engines/_browsers/_controllers.py +220 -278
  18. scrapling/engines/_browsers/_page.py +37 -15
  19. scrapling/engines/_browsers/_validators.py +29 -15
  20. scrapling/engines/constants.py +3 -6
  21. scrapling/engines/static.py +25 -75
  22. scrapling/engines/toolbelt/__init__.py +1 -20
  23. scrapling/engines/toolbelt/convertor.py +95 -86
  24. scrapling/engines/toolbelt/custom.py +7 -99
  25. scrapling/engines/toolbelt/fingerprints.py +1 -3
  26. scrapling/engines/toolbelt/navigation.py +4 -58
  27. scrapling/fetchers.py +41 -24
  28. scrapling/parser.py +45 -122
  29. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/METADATA +57 -47
  30. scrapling-0.3.2.dist-info/RECORD +44 -0
  31. scrapling-0.3.dist-info/RECORD +0 -41
  32. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/WHEEL +0 -0
  33. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/entry_points.txt +0 -0
  34. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/licenses/LICENSE +0 -0
  35. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/top_level.txt +0 -0
scrapling/fetchers.py CHANGED
@@ -6,16 +6,18 @@ from scrapling.core._types import (
  SelectorWaitStates,
  Iterable,
  )
- from scrapling.engines import (
+ from scrapling.engines.static import (
  FetcherSession,
- StealthySession,
- AsyncStealthySession,
- DynamicSession,
- AsyncDynamicSession,
  FetcherClient as _FetcherClient,
  AsyncFetcherClient as _AsyncFetcherClient,
  )
- from scrapling.engines.toolbelt import BaseFetcher, Response
+ from scrapling.engines._browsers import (
+ DynamicSession,
+ StealthySession,
+ AsyncDynamicSession,
+ AsyncStealthySession,
+ )
+ from scrapling.engines.toolbelt.custom import BaseFetcher, Response

  __FetcherClientInstance__ = _FetcherClient()
  __AsyncFetcherClientInstance__ = _AsyncFetcherClient()
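Note: the session classes now come from the internal `scrapling.engines._browsers` package rather than `scrapling.engines`. A minimal sketch of the import path that stays stable across this change, assuming the public re-exports in `scrapling.fetchers` are unchanged (which this file suggests):

```python
# Downstream code should import from the public module; the internal
# packages (scrapling.engines, scrapling.engines._browsers) were
# reorganized between 0.3 and 0.3.2 and are not a stable surface.
from scrapling.fetchers import StealthyFetcher, DynamicFetcher
```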
@@ -56,12 +58,14 @@ class StealthyFetcher(BaseFetcher):
  block_webrtc: bool = False,
  allow_webgl: bool = True,
  network_idle: bool = False,
+ load_dom: bool = True,
  humanize: bool | float = True,
  solve_cloudflare: bool = False,
  wait: int | float = 0,
  timeout: int | float = 30000,
  page_action: Optional[Callable] = None,
  wait_selector: Optional[str] = None,
+ init_script: Optional[str] = None,
  addons: Optional[List[str]] = None,
  wait_selector_state: SelectorWaitStates = "attached",
  cookies: Optional[List[Dict]] = None,
@@ -91,12 +95,14 @@ class StealthyFetcher(BaseFetcher):
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
  :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
  :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
- :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+ :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
  :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
  It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
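The two parameters added above (`load_dom`, `init_script`) are plain keyword arguments. A usage sketch, assuming the classmethod keeps its documented name `fetch`; the URL and script path are placeholders:

```python
from scrapling.fetchers import StealthyFetcher

page = StealthyFetcher.fetch(
    "https://example.com",       # placeholder URL
    load_dom=True,               # default: wait for page JavaScript to finish
    init_script="/tmp/init.js",  # placeholder absolute path, run on page creation
)
```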
@@ -110,13 +116,10 @@ class StealthyFetcher(BaseFetcher):
  if not custom_config:
  custom_config = {}
  elif not isinstance(custom_config, dict):
- ValueError(
- f"The custom parser config must be of type dictionary, got {cls.__class__}"
- )
+ ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")

  with StealthySession(
  wait=wait,
- max_pages=1,
  proxy=proxy,
  geoip=geoip,
  addons=addons,
@@ -124,9 +127,11 @@
  cookies=cookies,
  headless=headless,
  humanize=humanize,
+ load_dom=load_dom,
  disable_ads=disable_ads,
  allow_webgl=allow_webgl,
  page_action=page_action,
+ init_script=init_script,
  network_idle=network_idle,
  block_images=block_images,
  block_webrtc=block_webrtc,
@@ -152,12 +157,14 @@
  block_webrtc: bool = False,
  allow_webgl: bool = True,
  network_idle: bool = False,
+ load_dom: bool = True,
  humanize: bool | float = True,
  solve_cloudflare: bool = False,
  wait: int | float = 0,
  timeout: int | float = 30000,
  page_action: Optional[Callable] = None,
  wait_selector: Optional[str] = None,
+ init_script: Optional[str] = None,
  addons: Optional[List[str]] = None,
  wait_selector_state: SelectorWaitStates = "attached",
  cookies: Optional[List[Dict]] = None,
@@ -187,12 +194,14 @@
  :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
  :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
  :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
  :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
- :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+ :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
  :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
  It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
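The async variant gains the same keywords. A sketch, assuming the method keeps its pre-0.3.2 name `async_fetch`:

```python
import asyncio
from scrapling.fetchers import StealthyFetcher

async def main():
    # Same new keyword arguments as the synchronous fetch() above.
    page = await StealthyFetcher.async_fetch(
        "https://example.com",       # placeholder URL
        load_dom=True,
        init_script="/tmp/init.js",  # placeholder absolute path
    )

asyncio.run(main())
```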
@@ -206,9 +215,7 @@ class StealthyFetcher(BaseFetcher):
  if not custom_config:
  custom_config = {}
  elif not isinstance(custom_config, dict):
- ValueError(
- f"The custom parser config must be of type dictionary, got {cls.__class__}"
- )
+ ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")

  async with AsyncStealthySession(
  wait=wait,
@@ -220,9 +227,11 @@
  cookies=cookies,
  headless=headless,
  humanize=humanize,
+ load_dom=load_dom,
  disable_ads=disable_ads,
  allow_webgl=allow_webgl,
  page_action=page_action,
+ init_script=init_script,
  network_idle=network_idle,
  block_images=block_images,
  block_webrtc=block_webrtc,
@@ -276,8 +285,10 @@ class DynamicFetcher(BaseFetcher):
  timeout: int | float = 30000,
  disable_resources: bool = False,
  wait_selector: Optional[str] = None,
+ init_script: Optional[str] = None,
  cookies: Optional[Iterable[Dict]] = None,
  network_idle: bool = False,
+ load_dom: bool = True,
  wait_selector_state: SelectorWaitStates = "attached",
  custom_config: Optional[Dict] = None,
  ) -> Response:
@@ -291,10 +302,12 @@
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
  :param cookies: Set cookies for the next request.
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
- :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+ :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
  :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
  :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
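Per the docstring change above, `page_action` callbacks no longer need to return the `page` object. A sketch, assuming the classmethod name `fetch` from scrapling's documented API; the URL and selector are placeholders, and `page.mouse.wheel` is the Playwright page API:

```python
from scrapling.fetchers import DynamicFetcher

def scroll_down(page):
    # 0.3.2 contract: mutate the page as needed; no return value required.
    page.mouse.wheel(0, 1000)

page = DynamicFetcher.fetch(
    "https://example.com/products",  # placeholder URL
    page_action=scroll_down,
    wait_selector=".product-card",   # placeholder selector
    network_idle=True,
)
```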
@@ -311,9 +324,7 @@ class DynamicFetcher(BaseFetcher):
  if not custom_config:
  custom_config = {}
  elif not isinstance(custom_config, dict):
- raise ValueError(
- f"The custom parser config must be of type dictionary, got {cls.__class__}"
- )
+ raise ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")

  with DynamicSession(
  wait=wait,
@@ -324,10 +335,12 @@
  cdp_url=cdp_url,
  cookies=cookies,
  headless=headless,
+ load_dom=load_dom,
  useragent=useragent,
  real_chrome=real_chrome,
  page_action=page_action,
  hide_canvas=hide_canvas,
+ init_script=init_script,
  network_idle=network_idle,
  google_search=google_search,
  extra_headers=extra_headers,
@@ -359,8 +372,10 @@
  timeout: int | float = 30000,
  disable_resources: bool = False,
  wait_selector: Optional[str] = None,
+ init_script: Optional[str] = None,
  cookies: Optional[Iterable[Dict]] = None,
  network_idle: bool = False,
+ load_dom: bool = True,
  wait_selector_state: SelectorWaitStates = "attached",
  custom_config: Optional[Dict] = None,
  ) -> Response:
@@ -374,10 +389,12 @@
  :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
  :param cookies: Set cookies for the next request.
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+ :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
  :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
- :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+ :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.
  :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+ :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.
  :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
  :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
@@ -394,12 +411,11 @@
  if not custom_config:
  custom_config = {}
  elif not isinstance(custom_config, dict):
- raise ValueError(
- f"The custom parser config must be of type dictionary, got {cls.__class__}"
- )
+ raise ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")

  async with AsyncDynamicSession(
  wait=wait,
+ max_pages=1,
  proxy=proxy,
  locale=locale,
  timeout=timeout,
@@ -407,11 +423,12 @@
  cdp_url=cdp_url,
  cookies=cookies,
  headless=headless,
+ load_dom=load_dom,
  useragent=useragent,
- max_pages=1,
  real_chrome=real_chrome,
  page_action=page_action,
  hide_canvas=hide_canvas,
+ init_script=init_script,
  network_idle=network_idle,
  google_search=google_search,
  extra_headers=extra_headers,
scrapling/parser.py CHANGED
@@ -1,12 +1,11 @@
- from pathlib import Path
  import re
+ from pathlib import Path
  from inspect import signature
- from difflib import SequenceMatcher
  from urllib.parse import urljoin
+ from difflib import SequenceMatcher

- from cssselect import SelectorError, SelectorSyntaxError
- from cssselect import parse as split_selectors
  from lxml.html import HtmlElement, HtmlMixin, HTMLParser
+ from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
  from lxml.etree import (
  XPath,
  tostring,
@@ -75,7 +74,7 @@ class Selector(SelectorsGeneration):
  self,
  content: Optional[str | bytes] = None,
  url: Optional[str] = None,
- encoding: str = "utf8",
+ encoding: str = "utf-8",
  huge_tree: bool = True,
  root: Optional[HtmlElement] = None,
  keep_comments: Optional[bool] = False,
@@ -110,22 +109,16 @@
  If empty, default values will be used.
  """
  if root is None and content is None:
- raise ValueError(
- "Selector class needs HTML content, or root arguments to work"
- )
+ raise ValueError("Selector class needs HTML content, or root arguments to work")

  self.__text = None
  if root is None:
  if isinstance(content, str):
- body = (
- content.strip().replace("\x00", "").encode(encoding) or b"<html/>"
- )
+ body = content.strip().replace("\x00", "").encode(encoding) or b"<html/>"
  elif isinstance(content, bytes):
- body = content.replace(b"\x00", b"").strip()
+ body = content.replace(b"\x00", b"")
  else:
- raise TypeError(
- f"content argument must be str or bytes, got {type(content)}"
- )
+ raise TypeError(f"content argument must be str or bytes, got {type(content)}")

  # https://lxml.de/api/lxml.etree.HTMLParser-class.html
  parser = HTMLParser(
@@ -139,8 +132,7 @@
  strip_cdata=(not keep_cdata),
  )
  self._root = fromstring(body, parser=parser, base_url=url)
-
- self._raw_body = body.decode()
+ self._raw_body = content

  else:
  # All HTML types inherit from HtmlMixin so this to check for all at once
@@ -165,16 +157,10 @@
  }

  if not hasattr(storage, "__wrapped__"):
- raise ValueError(
- "Storage class must be wrapped with lru_cache decorator, see docs for info"
- )
+ raise ValueError("Storage class must be wrapped with lru_cache decorator, see docs for info")

- if not issubclass(
- storage.__wrapped__, StorageSystemMixin
- ): # pragma: no cover
- raise ValueError(
- "Storage system must be inherited from class `StorageSystemMixin`"
- )
+ if not issubclass(storage.__wrapped__, StorageSystemMixin): # pragma: no cover
+ raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")

  self._storage = storage(**storage_args)

@@ -239,9 +225,7 @@

  def __element_convertor(self, element: HtmlElement) -> "Selector":
  """Used internally to convert a single HtmlElement to Selector directly without checks"""
- db_instance = (
- self._storage if (hasattr(self, "_storage") and self._storage) else None
- )
+ db_instance = self._storage if (hasattr(self, "_storage") and self._storage) else None
  return Selector(
  root=element,
  url=self.url,
@@ -355,18 +339,19 @@
  @property
  def html_content(self) -> TextHandler:
  """Return the inner HTML code of the element"""
- return TextHandler(
- tostring(self._root, encoding="unicode", method="html", with_tail=False)
- )
+ return TextHandler(tostring(self._root, encoding=self.encoding, method="html", with_tail=False))

- body = html_content
+ @property
+ def body(self):
+ """Return the raw body of the current `Selector` without any processing. Useful for binary and non-HTML requests."""
+ return self._raw_body

  def prettify(self) -> TextHandler:
  """Return a prettified version of the element's inner html-code"""
  return TextHandler(
  tostring(
  self._root,
- encoding="unicode",
+ encoding=self.encoding,
  pretty_print=True,
  method="html",
  with_tail=False,
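`body` changes from an alias of `html_content` into a property returning the raw, unprocessed input (now kept in `_raw_body` exactly as passed in, per the earlier hunk). A sketch of the distinction, using a placeholder payload:

```python
from scrapling.parser import Selector

raw = b'{"ok": true}'       # a non-HTML payload
sel = Selector(content=raw)

sel.body          # the exact bytes given, untouched
sel.html_content  # lxml's re-serialized view of the parsed tree
```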
@@ -404,9 +389,7 @@ class Selector(SelectorsGeneration):
  def siblings(self) -> "Selectors":
  """Return other children of the current element's parent or empty list otherwise"""
  if self.parent:
- return Selectors(
- child for child in self.parent.children if child._root != self._root
- )
+ return Selectors(child for child in self.parent.children if child._root != self._root)
  return Selectors()

  def iterancestors(self) -> Generator["Selector", None, None]:
@@ -519,9 +502,7 @@
  log.debug(f"Highest probability was {highest_probability}%")
  log.debug("Top 5 best matching elements are: ")
  for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
- log.debug(
- f"{percent} -> {self.__handle_elements(score_table[percent])}"
- )
+ log.debug(f"{percent} -> {self.__handle_elements(score_table[percent])}")

  if not selector_type:
  return score_table[highest_probability]
@@ -658,9 +639,7 @@
  SelectorError,
  SelectorSyntaxError,
  ) as e:
- raise SelectorSyntaxError(
- f"Invalid CSS selector '{selector}': {str(e)}"
- ) from e
+ raise SelectorSyntaxError(f"Invalid CSS selector '{selector}': {str(e)}") from e

  def xpath(
  self,
@@ -702,9 +681,7 @@
  elif self.__adaptive_enabled and auto_save:
  self.save(elements[0], identifier or selector)

- return self.__handle_elements(
- elements[0:1] if (_first_match and elements) else elements
- )
+ return self.__handle_elements(elements[0:1] if (_first_match and elements) else elements)
  elif self.__adaptive_enabled:
  if adaptive:
  element_data = self.retrieve(identifier or selector)
@@ -713,9 +690,7 @@
  if elements is not None and auto_save:
  self.save(elements[0], identifier or selector)

- return self.__handle_elements(
- elements[0:1] if (_first_match and elements) else elements
- )
+ return self.__handle_elements(elements[0:1] if (_first_match and elements) else elements)
  else:
  if adaptive:
  log.warning(
@@ -726,9 +701,7 @@
  "Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
  )

- return self.__handle_elements(
- elements[0:1] if (_first_match and elements) else elements
- )
+ return self.__handle_elements(elements[0:1] if (_first_match and elements) else elements)

  except (
  SelectorError,
@@ -751,9 +724,7 @@
  """

  if not args and not kwargs:
- raise TypeError(
- "You have to pass something to search with, like tag name(s), tag attributes, or both."
- )
+ raise TypeError("You have to pass something to search with, like tag name(s), tag attributes, or both.")

  attributes = dict()
  tags, patterns = set(), set()
@@ -766,18 +737,11 @@

  elif type(arg) in (list, tuple, set):
  if not all(map(lambda x: isinstance(x, str), arg)):
- raise TypeError(
- "Nested Iterables are not accepted, only iterables of tag names are accepted"
- )
+ raise TypeError("Nested Iterables are not accepted, only iterables of tag names are accepted")
  tags.update(set(arg))

  elif isinstance(arg, dict):
- if not all(
- [
- (isinstance(k, str) and isinstance(v, str))
- for k, v in arg.items()
- ]
- ):
+ if not all([(isinstance(k, str) and isinstance(v, str)) for k, v in arg.items()]):
  raise TypeError(
  "Nested dictionaries are not accepted, only string keys and string values are accepted"
  )
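These checks enumerate what the search method accepts (scrapling's `find_all`, judging by the error messages): tag-name strings, flat iterables of strings, and `str -> str` attribute dicts, with regex patterns and filter functions handled in the surrounding branches. A usage sketch, where `page` is a hypothetical `Selector` and the values are placeholders:

```python
page.find_all("div", "section")             # several tag names
page.find_all(["a", "button"])              # a flat iterable of tag names
page.find_all("div", {"class": "product"})  # tag plus attribute dict
# Nested iterables or non-string keys/values raise TypeError, per the checks above.
```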
@@ -795,13 +759,9 @@ class Selector(SelectorsGeneration):
  )

  else:
- raise TypeError(
- f'Argument with type "{type(arg)}" is not accepted, please read the docs.'
- )
+ raise TypeError(f'Argument with type "{type(arg)}" is not accepted, please read the docs.')

- if not all(
- [(isinstance(k, str) and isinstance(v, str)) for k, v in kwargs.items()]
- ):
+ if not all([(isinstance(k, str) and isinstance(v, str)) for k, v in kwargs.items()]):
  raise TypeError("Only string values are accepted for arguments")

  for attribute_name, value in kwargs.items():
@@ -825,9 +785,7 @@
  if results:
  # From the results, get the ones that fulfill passed regex patterns
  for pattern in patterns:
- results = results.filter(
- lambda e: e.text.re(pattern, check_match=True)
- )
+ results = results.filter(lambda e: e.text.re(pattern, check_match=True))

  # From the results, get the ones that fulfill passed functions
  for function in functions:
@@ -858,9 +816,7 @@
  return element
  return None

- def __calculate_similarity_score(
- self, original: Dict, candidate: HtmlElement
- ) -> float:
+ def __calculate_similarity_score(self, original: Dict, candidate: HtmlElement) -> float:
  """Used internally to calculate a score that shows how a candidate element similar to the original one

  :param original: The original element in the form of the dictionary generated from `element_to_dict` function
@@ -877,15 +833,11 @@
  checks += 1

  if original["text"]:
- score += SequenceMatcher(
- None, original["text"], candidate.get("text") or ""
- ).ratio() # * 0.3 # 30%
+ score += SequenceMatcher(None, original["text"], candidate.get("text") or "").ratio() # * 0.3 # 30%
  checks += 1

  # if both don't have attributes, it still counts for something!
- score += self.__calculate_dict_diff(
- original["attributes"], candidate["attributes"]
- ) # * 0.3 # 30%
+ score += self.__calculate_dict_diff(original["attributes"], candidate["attributes"]) # * 0.3 # 30%
  checks += 1

  # Separate similarity test for class, id, href,... this will help in full structural changes
@@ -903,9 +855,7 @@
  ).ratio() # * 0.3 # 30%
  checks += 1

- score += SequenceMatcher(
- None, original["path"], candidate["path"]
- ).ratio() # * 0.1 # 10%
+ score += SequenceMatcher(None, original["path"], candidate["path"]).ratio() # * 0.1 # 10%
  checks += 1

  if original.get("parent_name"):
@@ -944,14 +894,8 @@
  @staticmethod
  def __calculate_dict_diff(dict1: Dict, dict2: Dict) -> float:
  """Used internally to calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries"""
- score = (
- SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio()
- * 0.5
- )
- score += (
- SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio()
- * 0.5
- )
+ score = SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio() * 0.5
+ score += SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio() * 0.5
  return score

  def save(self, element: Union["Selector", HtmlElement], identifier: str) -> None:
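The reformatted `__calculate_dict_diff` above averages two `SequenceMatcher` ratios, weighting key similarity and value similarity equally. A standalone sketch of the same computation, runnable with only the stdlib:

```python
from difflib import SequenceMatcher

def dict_diff(d1: dict, d2: dict) -> float:
    # 50% for how alike the attribute names are, 50% for the values,
    # mirroring the method above.
    score = SequenceMatcher(None, tuple(d1.keys()), tuple(d2.keys())).ratio() * 0.5
    score += SequenceMatcher(None, tuple(d1.values()), tuple(d2.values())).ratio() * 0.5
    return score

dict_diff({"class": "btn", "id": "go"}, {"class": "btn", "id": "stop"})  # 0.75
```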
@@ -992,7 +936,7 @@ class Selector(SelectorsGeneration):
  # Operations on text functions
  def json(self) -> Dict:
  """Return JSON response if the response is jsonable otherwise throws error"""
- if self._raw_body:
+ if self._raw_body and isinstance(self._raw_body, str):
  return TextHandler(self._raw_body).json()
  elif self.text:
  return self.text.json()
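The added `isinstance` guard matters because `_raw_body` can now hold `bytes` (see the `self._raw_body = content` change earlier); only a `str` body is parsed directly, with the element text as fallback. A sketch with a placeholder payload:

```python
from scrapling.parser import Selector

sel = Selector(content='{"price": 9.99}')  # str body is kept in _raw_body
sel.json()                                 # -> {'price': 9.99}
```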
@@ -1031,9 +975,7 @@ class Selector(SelectorsGeneration):
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
  :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
  """
- return self.text.re_first(
- regex, default, replace_entities, clean_match, case_sensitive
- )
+ return self.text.re_first(regex, default, replace_entities, clean_match, case_sensitive)

  @staticmethod
  def __get_attributes(element: HtmlElement, ignore_attributes: List | Tuple) -> Dict:
@@ -1052,9 +994,7 @@
  """Calculate a score of how much these elements are alike and return True
  if the score is higher or equals the threshold"""
  candidate_attributes = (
- self.__get_attributes(candidate, ignore_attributes)
- if ignore_attributes
- else candidate.attrib
+ self.__get_attributes(candidate, ignore_attributes) if ignore_attributes else candidate.attrib
  )
  score, checks = 0, 0

@@ -1116,11 +1056,7 @@
  similar_elements = list()

  current_depth = len(list(root.iterancestors()))
- target_attrs = (
- self.__get_attributes(root, ignore_attributes)
- if ignore_attributes
- else root.attrib
- )
+ target_attrs = self.__get_attributes(root, ignore_attributes) if ignore_attributes else root.attrib

  path_parts = [self.tag]
  if (parent := root.getparent()) is not None:
@@ -1129,9 +1065,7 @@
  path_parts.insert(0, grandparent.tag)

  xpath_path = "//{}".format("/".join(path_parts))
- potential_matches = root.xpath(
- f"{xpath_path}[count(ancestor::*) = {current_depth}]"
- )
+ potential_matches = root.xpath(f"{xpath_path}[count(ancestor::*) = {current_depth}]")

  for potential_match in potential_matches:
  if potential_match != root and self.__are_alike(
@@ -1275,12 +1209,7 @@ class Selectors(List[Selector]):

  :return: `Selectors` class.
  """
- results = [
- n.xpath(
- selector, identifier or selector, False, auto_save, percentage, **kwargs
- )
- for n in self
- ]
+ results = [n.xpath(selector, identifier or selector, False, auto_save, percentage, **kwargs) for n in self]
  return self.__class__(flatten(results))

  def css(
@@ -1308,10 +1237,7 @@

  :return: `Selectors` class.
  """
- results = [
- n.css(selector, identifier or selector, False, auto_save, percentage)
- for n in self
- ]
+ results = [n.css(selector, identifier or selector, False, auto_save, percentage) for n in self]
  return self.__class__(flatten(results))

  def re(
@@ -1329,10 +1255,7 @@
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
  :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
  """
- results = [
- n.text.re(regex, replace_entities, clean_match, case_sensitive)
- for n in self
- ]
+ results = [n.text.re(regex, replace_entities, clean_match, case_sensitive) for n in self]
  return TextHandlers(flatten(results))

  def re_first(