scrapling-0.2.99-py3-none-any.whl → scrapling-0.3.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. scrapling/__init__.py +18 -31
  2. scrapling/cli.py +818 -20
  3. scrapling/core/_html_utils.py +348 -0
  4. scrapling/core/_types.py +34 -17
  5. scrapling/core/ai.py +611 -0
  6. scrapling/core/custom_types.py +183 -100
  7. scrapling/core/mixins.py +27 -19
  8. scrapling/core/shell.py +647 -0
  9. scrapling/core/{storage_adaptors.py → storage.py} +41 -33
  10. scrapling/core/translator.py +20 -26
  11. scrapling/core/utils.py +49 -54
  12. scrapling/engines/__init__.py +15 -6
  13. scrapling/engines/_browsers/__init__.py +2 -0
  14. scrapling/engines/_browsers/_camoufox.py +759 -0
  15. scrapling/engines/_browsers/_config_tools.py +130 -0
  16. scrapling/engines/_browsers/_controllers.py +644 -0
  17. scrapling/engines/_browsers/_page.py +93 -0
  18. scrapling/engines/_browsers/_validators.py +170 -0
  19. scrapling/engines/constants.py +101 -88
  20. scrapling/engines/static.py +667 -110
  21. scrapling/engines/toolbelt/__init__.py +20 -6
  22. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  23. scrapling/engines/toolbelt/convertor.py +254 -0
  24. scrapling/engines/toolbelt/custom.py +158 -175
  25. scrapling/engines/toolbelt/fingerprints.py +32 -46
  26. scrapling/engines/toolbelt/navigation.py +68 -39
  27. scrapling/fetchers.py +239 -333
  28. scrapling/parser.py +781 -449
  29. scrapling-0.3.1.dist-info/METADATA +411 -0
  30. scrapling-0.3.1.dist-info/RECORD +41 -0
  31. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/WHEEL +1 -1
  32. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/top_level.txt +0 -1
  33. scrapling/defaults.py +0 -25
  34. scrapling/engines/camo.py +0 -339
  35. scrapling/engines/pw.py +0 -465
  36. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  37. scrapling-0.2.99.dist-info/METADATA +0 -290
  38. scrapling-0.2.99.dist-info/RECORD +0 -49
  39. tests/__init__.py +0 -1
  40. tests/fetchers/__init__.py +0 -1
  41. tests/fetchers/async/__init__.py +0 -0
  42. tests/fetchers/async/test_camoufox.py +0 -97
  43. tests/fetchers/async/test_httpx.py +0 -85
  44. tests/fetchers/async/test_playwright.py +0 -101
  45. tests/fetchers/sync/__init__.py +0 -0
  46. tests/fetchers/sync/test_camoufox.py +0 -70
  47. tests/fetchers/sync/test_httpx.py +0 -84
  48. tests/fetchers/sync/test_playwright.py +0 -89
  49. tests/fetchers/test_utils.py +0 -97
  50. tests/parser/__init__.py +0 -0
  51. tests/parser/test_automatch.py +0 -111
  52. tests/parser/test_general.py +0 -330
  53. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/entry_points.txt +0 -0
  54. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/licenses/LICENSE +0 -0
scrapling/parser.py CHANGED
@@ -1,123 +1,182 @@
1
- import inspect
2
- import os
1
+ from pathlib import Path
3
2
  import re
4
- import typing
3
+ from inspect import signature
5
4
  from difflib import SequenceMatcher
6
5
  from urllib.parse import urljoin
7
6
 
8
7
  from cssselect import SelectorError, SelectorSyntaxError
9
8
  from cssselect import parse as split_selectors
10
- from lxml import etree, html
11
-
12
- from scrapling.core._types import (Any, Callable, Dict, Generator, Iterable,
13
- List, Optional, Pattern, SupportsIndex,
14
- Tuple, Union)
15
- from scrapling.core.custom_types import (AttributesHandler, TextHandler,
16
- TextHandlers)
9
+ from lxml.html import HtmlElement, HtmlMixin, HTMLParser
10
+ from lxml.etree import (
11
+ XPath,
12
+ tostring,
13
+ fromstring,
14
+ XPathError,
15
+ XPathEvalError,
16
+ _ElementUnicodeResult,
17
+ )
18
+
19
+ from scrapling.core._types import (
20
+ Any,
21
+ Dict,
22
+ List,
23
+ Tuple,
24
+ Union,
25
+ Pattern,
26
+ Callable,
27
+ Optional,
28
+ Iterable,
29
+ overload,
30
+ Generator,
31
+ SupportsIndex,
32
+ )
33
+ from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
17
34
  from scrapling.core.mixins import SelectorsGeneration
18
- from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
19
- StorageSystemMixin, _StorageTools)
20
- from scrapling.core.translator import translator_instance
21
- from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
22
- is_jsonable, log)
23
-
24
-
25
- class Adaptor(SelectorsGeneration):
35
+ from scrapling.core.storage import (
36
+ SQLiteStorageSystem,
37
+ StorageSystemMixin,
38
+ _StorageTools,
39
+ )
40
+ from scrapling.core.translator import translator as _translator
41
+ from scrapling.core.utils import clean_spaces, flatten, html_forbidden, log
42
+
43
+ __DEFAULT_DB_FILE__ = str(Path(__file__).parent / "elements_storage.db")
44
+ # Attributes that are Python reserved words and can't be used directly
45
+ # Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
46
+ # https://www.w3schools.com/python/python_ref_keywords.asp
47
+ _whitelisted = {
48
+ "class_": "class",
49
+ "for_": "for",
50
+ }
51
+ # Pre-compiled selectors for efficiency
52
+ _find_all_elements = XPath(".//*")
53
+ _find_all_elements_with_spaces = XPath(
54
+ ".//*[normalize-space(text())]"
55
+ ) # This selector gets all elements with text content
56
+
57
+
58
+ class Selector(SelectorsGeneration):
26
59
  __slots__ = (
27
- 'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
28
- '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
29
- '__keep_cdata'
60
+ "url",
61
+ "encoding",
62
+ "__adaptive_enabled",
63
+ "_root",
64
+ "_storage",
65
+ "__keep_comments",
66
+ "__huge_tree_enabled",
67
+ "__attributes",
68
+ "__text",
69
+ "__tag",
70
+ "__keep_cdata",
71
+ "_raw_body",
30
72
  )
31
73
 
32
74
  def __init__(
33
- self,
34
- text: Optional[str] = None,
35
- url: Optional[str] = None,
36
- body: bytes = b"",
37
- encoding: str = "utf8",
38
- huge_tree: bool = True,
39
- root: Optional[html.HtmlElement] = None,
40
- keep_comments: Optional[bool] = False,
41
- keep_cdata: Optional[bool] = False,
42
- auto_match: Optional[bool] = False,
43
- storage: Any = SQLiteStorageSystem,
44
- storage_args: Optional[Dict] = None,
45
- **kwargs
75
+ self,
76
+ content: Optional[str | bytes] = None,
77
+ url: Optional[str] = None,
78
+ encoding: str = "utf8",
79
+ huge_tree: bool = True,
80
+ root: Optional[HtmlElement] = None,
81
+ keep_comments: Optional[bool] = False,
82
+ keep_cdata: Optional[bool] = False,
83
+ adaptive: Optional[bool] = False,
84
+ _storage: object = None,
85
+ storage: Any = SQLiteStorageSystem,
86
+ storage_args: Optional[Dict] = None,
87
+ **kwargs,
46
88
  ):
47
89
  """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
48
90
  with expressions in CSS, XPath, or with simply text. Check the docs for more info.
49
91
 
50
92
  Here we try to extend module ``lxml.html.HtmlElement`` while maintaining a simpler interface, We are not
51
- inheriting from the ``lxml.html.HtmlElement`` because it's not pickleable which makes a lot of reference jobs
93
+ inheriting from the ``lxml.html.HtmlElement`` because it's not pickleable, which makes a lot of reference jobs
52
94
  not possible. You can test it here and see code explodes with `AssertionError: invalid Element proxy at...`.
53
95
  It's an old issue with lxml, see `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`
54
96
 
55
- :param text: HTML body passed as text.
56
- :param url: allows storing a URL with the html data for retrieving later.
57
- :param body: HTML body as ``bytes`` object. It can be used instead of the ``text`` argument.
97
+ :param content: HTML content as either string or bytes.
98
+ :param url: It allows storing a URL with the HTML data for retrieving later.
58
99
  :param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`
59
100
  :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
60
- libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
61
- :param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
101
+ the libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
102
+ :param root: Used internally to pass etree objects instead of text/body arguments, it takes the highest priority.
62
103
  Don't use it unless you know what you are doing!
63
104
  :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
64
105
  :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
65
- :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
66
- priority over all auto-match related arguments/functions in the class.
67
- :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
106
+ :param adaptive: Globally turn off the adaptive feature in all functions, this argument takes higher
107
+ priority over all adaptive related arguments/functions in the class.
108
+ :param storage: The storage class to be passed for adaptive functionalities, see ``Docs`` for more info.
68
109
  :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
69
110
  If empty, default values will be used.
70
111
  """
71
- if root is None and not body and text is None:
72
- raise ValueError("Adaptor class needs text, body, or root arguments to work")
112
+ if root is None and content is None:
113
+ raise ValueError(
114
+ "Selector class needs HTML content, or root arguments to work"
115
+ )
73
116
 
74
- self.__text = ''
117
+ self.__text = None
75
118
  if root is None:
76
- if text is None:
77
- if not body or not isinstance(body, bytes):
78
- raise TypeError(f"body argument must be valid and of type bytes, got {body.__class__}")
79
-
80
- body = body.replace(b"\x00", b"").strip()
119
+ if isinstance(content, str):
120
+ body = (
121
+ content.strip().replace("\x00", "").encode(encoding) or b"<html/>"
122
+ )
123
+ elif isinstance(content, bytes):
124
+ body = content.replace(b"\x00", b"").strip()
81
125
  else:
82
- if not isinstance(text, str):
83
- raise TypeError(f"text argument must be of type str, got {text.__class__}")
84
-
85
- body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
126
+ raise TypeError(
127
+ f"content argument must be str or bytes, got {type(content)}"
128
+ )
86
129
 
87
130
  # https://lxml.de/api/lxml.etree.HTMLParser-class.html
88
- parser = html.HTMLParser(
89
- recover=True, remove_blank_text=True, remove_comments=(not keep_comments), encoding=encoding,
90
- compact=True, huge_tree=huge_tree, default_doctype=True, strip_cdata=(not keep_cdata),
131
+ parser = HTMLParser(
132
+ recover=True,
133
+ remove_blank_text=True,
134
+ remove_comments=(not keep_comments),
135
+ encoding=encoding,
136
+ compact=True,
137
+ huge_tree=huge_tree,
138
+ default_doctype=True,
139
+ strip_cdata=(not keep_cdata),
91
140
  )
92
- self._root = etree.fromstring(body, parser=parser, base_url=url)
93
- if is_jsonable(text or body.decode()):
94
- self.__text = TextHandler(text or body.decode())
141
+ self._root = fromstring(body, parser=parser, base_url=url)
142
+
143
+ self._raw_body = body.decode()
95
144
 
96
145
  else:
97
- # All html types inherits from HtmlMixin so this to check for all at once
98
- if not issubclass(type(root), html.HtmlMixin):
146
+ # All HTML types inherit from HtmlMixin so this to check for all at once
147
+ if not issubclass(type(root), HtmlMixin):
99
148
  raise TypeError(
100
149
  f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
101
150
  )
102
151
 
103
152
  self._root = root
153
+ self._raw_body = ""
104
154
 
105
- self.__auto_match_enabled = auto_match
106
-
107
- if self.__auto_match_enabled:
108
- if not storage_args:
109
- storage_args = {
110
- 'storage_file': os.path.join(os.path.dirname(__file__), 'elements_storage.db'),
111
- 'url': url
112
- }
155
+ self.__adaptive_enabled = adaptive
113
156
 
114
- if not hasattr(storage, '__wrapped__'):
115
- raise ValueError("Storage class must be wrapped with lru_cache decorator, see docs for info")
157
+ if self.__adaptive_enabled:
158
+ if _storage is not None:
159
+ self._storage = _storage
160
+ else:
161
+ if not storage_args:
162
+ storage_args = {
163
+ "storage_file": __DEFAULT_DB_FILE__,
164
+ "url": url,
165
+ }
166
+
167
+ if not hasattr(storage, "__wrapped__"):
168
+ raise ValueError(
169
+ "Storage class must be wrapped with lru_cache decorator, see docs for info"
170
+ )
116
171
 
117
- if not issubclass(storage.__wrapped__, StorageSystemMixin):
118
- raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")
172
+ if not issubclass(
173
+ storage.__wrapped__, StorageSystemMixin
174
+ ): # pragma: no cover
175
+ raise ValueError(
176
+ "Storage system must be inherited from class `StorageSystemMixin`"
177
+ )
119
178
 
120
- self._storage = storage(**storage_args)
179
+ self._storage = storage(**storage_args)
121
180
 
122
181
  self.__keep_comments = keep_comments
123
182
  self.__keep_cdata = keep_cdata
@@ -127,75 +186,112 @@ class Adaptor(SelectorsGeneration):
127
186
  # For selector stuff
128
187
  self.__attributes = None
129
188
  self.__tag = None
189
+
190
+ @property
191
+ def __response_data(self):
130
192
  # No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
131
- self.__response_data = {
132
- key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'history', 'headers', 'request_headers',)
133
- } if hasattr(self, 'status') else {}
193
+ if not hasattr(self, "_cached_response_data"):
194
+ self._cached_response_data = (
195
+ {
196
+ key: getattr(self, key)
197
+ for key in (
198
+ "status",
199
+ "reason",
200
+ "cookies",
201
+ "history",
202
+ "headers",
203
+ "request_headers",
204
+ )
205
+ }
206
+ if hasattr(self, "status")
207
+ else {}
208
+ )
209
+ return self._cached_response_data
210
+
211
+ def __getitem__(self, key: str) -> TextHandler:
212
+ return self.attrib[key]
134
213
 
135
- # Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
214
+ def __contains__(self, key: str) -> bool:
215
+ return key in self.attrib
216
+
217
+ # Node functionalities, I wanted to move to a separate Mixin class, but it had a slight impact on performance
136
218
  @staticmethod
137
- def _is_text_node(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> bool:
138
- """Return True if given element is a result of a string expression
219
+ def _is_text_node(
220
+ element: HtmlElement | _ElementUnicodeResult,
221
+ ) -> bool:
222
+ """Return True if the given element is a result of a string expression
139
223
  Examples:
140
- XPath -> '/text()', '/@attribute' etc...
141
- CSS3 -> '::text', '::attr(attrib)'...
224
+ XPath -> '/text()', '/@attribute', etc...
225
+ CSS3 -> '::text', '::attr(attrib)'...
142
226
  """
143
227
  # Faster than checking `element.is_attribute or element.is_text or element.is_tail`
144
- return issubclass(type(element), etree._ElementUnicodeResult)
228
+ return issubclass(type(element), _ElementUnicodeResult)
145
229
 
146
230
  @staticmethod
147
- def __content_convertor(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> TextHandler:
231
+ def __content_convertor(
232
+ element: HtmlElement | _ElementUnicodeResult,
233
+ ) -> TextHandler:
148
234
  """Used internally to convert a single element's text content to TextHandler directly without checks
149
235
 
150
- This single line has been isolated like this so when it's used with map we get that slight performance boost vs list comprehension
236
+ This single line has been isolated like this, so when it's used with `map` we get that slight performance boost vs. list comprehension
151
237
  """
152
- return TextHandler(str(element))
238
+ return TextHandler(element)
153
239
 
154
- def __element_convertor(self, element: html.HtmlElement) -> 'Adaptor':
155
- """Used internally to convert a single HtmlElement to Adaptor directly without checks"""
156
- return Adaptor(
240
+ def __element_convertor(self, element: HtmlElement) -> "Selector":
241
+ """Used internally to convert a single HtmlElement to Selector directly without checks"""
242
+ db_instance = (
243
+ self._storage if (hasattr(self, "_storage") and self._storage) else None
244
+ )
245
+ return Selector(
157
246
  root=element,
158
- text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
159
- url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
160
- keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
247
+ url=self.url,
248
+ encoding=self.encoding,
249
+ adaptive=self.__adaptive_enabled,
250
+ _storage=db_instance, # Reuse existing storage if it exists otherwise it won't be checked if `adaptive` is turned off
251
+ keep_comments=self.__keep_comments,
252
+ keep_cdata=self.__keep_cdata,
161
253
  huge_tree=self.__huge_tree_enabled,
162
- **self.__response_data
254
+ **self.__response_data,
163
255
  )
164
256
 
165
- def __handle_element(self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> Union[TextHandler, 'Adaptor', None]:
166
- """Used internally in all functions to convert a single element to type (Adaptor|TextHandler) when possible"""
257
+ def __handle_element(
258
+ self, element: HtmlElement | _ElementUnicodeResult
259
+ ) -> Optional[Union[TextHandler, "Selector"]]:
260
+ """Used internally in all functions to convert a single element to type (Selector|TextHandler) when possible"""
167
261
  if element is None:
168
262
  return None
169
263
  elif self._is_text_node(element):
170
- # etree._ElementUnicodeResult basically inherit from `str` so it's fine
264
+ # `_ElementUnicodeResult` basically inherit from `str` so it's fine
171
265
  return self.__content_convertor(element)
172
266
  else:
173
267
  return self.__element_convertor(element)
174
268
 
175
- def __handle_elements(self, result: List[Union[html.HtmlElement, etree._ElementUnicodeResult]]) -> Union['Adaptors', 'TextHandlers', List]:
176
- """Used internally in all functions to convert results to type (Adaptors|TextHandlers) in bulk when possible"""
177
- if not len(result): # Lxml will give a warning if I used something like `not result`
178
- return Adaptors([])
269
+ def __handle_elements(
270
+ self, result: List[HtmlElement | _ElementUnicodeResult]
271
+ ) -> Union["Selectors", "TextHandlers"]:
272
+ """Used internally in all functions to convert results to type (Selectors|TextHandlers) in bulk when possible"""
273
+ if not result:
274
+ return Selectors()
179
275
 
180
- # From within the code, this method will always get a list of the same type
181
- # so we will continue without checks for slight performance boost
276
+ # From within the code, this method will always get a list of the same type,
277
+ # so we will continue without checks for a slight performance boost
182
278
  if self._is_text_node(result[0]):
183
- return TextHandlers(list(map(self.__content_convertor, result)))
279
+ return TextHandlers(map(TextHandler, result))
184
280
 
185
- return Adaptors(list(map(self.__element_convertor, result)))
281
+ return Selectors(map(self.__element_convertor, result))
186
282
 
187
283
  def __getstate__(self) -> Any:
188
284
  # lxml don't like it :)
189
- raise TypeError("Can't pickle Adaptor objects")
285
+ raise TypeError("Can't pickle Selector objects")
190
286
 
191
287
  # The following four properties I made them into functions instead of variables directly
192
288
  # So they don't slow down the process of initializing many instances of the class and gets executed only
193
- # when the user need them for the first time for that specific element and gets cached for next times
289
+ # when the user needs them for the first time for that specific element and gets cached for next times
194
290
  # Doing that only made the library performance test sky rocked multiple times faster than before
195
291
  # because I was executing them on initialization before :))
196
292
  @property
197
293
  def tag(self) -> str:
198
- """Get tag name of the element"""
294
+ """Get the tag name of the element"""
199
295
  if not self.__tag:
200
296
  self.__tag = self._root.tag
201
297
  return self.__tag
@@ -203,13 +299,22 @@ class Adaptor(SelectorsGeneration):
203
299
  @property
204
300
  def text(self) -> TextHandler:
205
301
  """Get text content of the element"""
206
- if not self.__text:
207
- # If you want to escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
208
- # before extracting text then keep `keep_comments` set to False while initializing the first class
209
- self.__text = TextHandler(self._root.text)
302
+ if self.__text is None:
303
+ # If you want to escape lxml default behavior and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
304
+ # before extracting text, then keep `keep_comments` set to False while initializing the first class
305
+ self.__text = TextHandler(self._root.text or "")
210
306
  return self.__text
211
307
 
212
- def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:
308
+ def get_all_text(
309
+ self,
310
+ separator: str = "\n",
311
+ strip: bool = False,
312
+ ignore_tags: Tuple = (
313
+ "script",
314
+ "style",
315
+ ),
316
+ valid_values: bool = True,
317
+ ) -> TextHandler:
213
318
  """Get all child strings of this element, concatenated using the given separator.
214
319
 
215
320
  :param separator: Strings will be concatenated using this separator.
@@ -219,20 +324,25 @@ class Adaptor(SelectorsGeneration):
219
324
 
220
325
  :return: A TextHandler
221
326
  """
327
+ ignored_elements = set()
328
+ if ignore_tags:
329
+ for element in self._root.iter(*ignore_tags):
330
+ ignored_elements.add(element)
331
+ ignored_elements.update(set(_find_all_elements(element)))
332
+
222
333
  _all_strings = []
223
- for node in self._root.xpath('.//*'):
224
- if node.tag not in ignore_tags:
334
+ for node in self._root.iter():
335
+ if node not in ignored_elements:
225
336
  text = node.text
226
- if text and type(text) is str:
227
- if valid_values and text.strip():
228
- _all_strings.append(text if not strip else text.strip())
229
- else:
230
- _all_strings.append(text if not strip else text.strip())
337
+ if text and isinstance(text, str):
338
+ processed_text = text.strip() if strip else text
339
+ if not valid_values or processed_text.strip():
340
+ _all_strings.append(processed_text)
231
341
 
232
- return TextHandler(separator.join(_all_strings))
342
+ return TextHandler(separator).join(_all_strings)
233
343
 
234
344
  def urljoin(self, relative_url: str) -> str:
235
- """Join this Adaptor's url with a relative url to form an absolute full URL."""
345
+ """Join this Selector's url with a relative url to form an absolute full URL."""
236
346
  return urljoin(self.url, relative_url)
237
347
 
238
348
  @property
@@ -244,53 +354,67 @@ class Adaptor(SelectorsGeneration):
244
354
 
245
355
  @property
246
356
  def html_content(self) -> TextHandler:
247
- """Return the inner html code of the element"""
248
- return TextHandler(etree.tostring(self._root, encoding='unicode', method='html', with_tail=False))
357
+ """Return the inner HTML code of the element"""
358
+ return TextHandler(
359
+ tostring(self._root, encoding="unicode", method="html", with_tail=False)
360
+ )
249
361
 
250
362
  body = html_content
251
363
 
252
364
  def prettify(self) -> TextHandler:
253
365
  """Return a prettified version of the element's inner html-code"""
254
- return TextHandler(etree.tostring(self._root, encoding='unicode', pretty_print=True, method='html', with_tail=False))
366
+ return TextHandler(
367
+ tostring(
368
+ self._root,
369
+ encoding="unicode",
370
+ pretty_print=True,
371
+ method="html",
372
+ with_tail=False,
373
+ )
374
+ )
255
375
 
256
376
  def has_class(self, class_name: str) -> bool:
257
- """Check if element has a specific class
377
+ """Check if the element has a specific class
258
378
  :param class_name: The class name to check for
259
379
  :return: True if element has class with that name otherwise False
260
380
  """
261
381
  return class_name in self._root.classes
262
382
 
263
383
  @property
264
- def parent(self) -> Union['Adaptor', None]:
384
+ def parent(self) -> Optional["Selector"]:
265
385
  """Return the direct parent of the element or ``None`` otherwise"""
266
386
  return self.__handle_element(self._root.getparent())
267
387
 
268
388
  @property
269
- def below_elements(self) -> 'Adaptors[Adaptor]':
389
+ def below_elements(self) -> "Selectors":
270
390
  """Return all elements under the current element in the DOM tree"""
271
- below = self._root.xpath('.//*')
391
+ below = _find_all_elements(self._root)
272
392
  return self.__handle_elements(below)
273
393
 
274
394
  @property
275
- def children(self) -> 'Adaptors[Adaptor]':
395
+ def children(self) -> "Selectors":
276
396
  """Return the children elements of the current element or empty list otherwise"""
277
- return Adaptors([
278
- self.__element_convertor(child) for child in self._root.iterchildren() if type(child) not in html_forbidden
279
- ])
397
+ return Selectors(
398
+ self.__element_convertor(child)
399
+ for child in self._root.iterchildren()
400
+ if not isinstance(child, html_forbidden)
401
+ )
280
402
 
281
403
  @property
282
- def siblings(self) -> 'Adaptors[Adaptor]':
404
+ def siblings(self) -> "Selectors":
283
405
  """Return other children of the current element's parent or empty list otherwise"""
284
406
  if self.parent:
285
- return Adaptors([child for child in self.parent.children if child._root != self._root])
286
- return Adaptors([])
407
+ return Selectors(
408
+ child for child in self.parent.children if child._root != self._root
409
+ )
410
+ return Selectors()
287
411
 
288
- def iterancestors(self) -> Generator['Adaptor', None, None]:
289
- """Return a generator that loops over all ancestors of the element, starting with element's parent."""
412
+ def iterancestors(self) -> Generator["Selector", None, None]:
413
+ """Return a generator that loops over all ancestors of the element, starting with the element's parent."""
290
414
  for ancestor in self._root.iterancestors():
291
415
  yield self.__element_convertor(ancestor)
292
416
 
293
- def find_ancestor(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
417
+ def find_ancestor(self, func: Callable[["Selector"], bool]) -> Optional["Selector"]:
294
418
  """Loop over all ancestors of the element till one match the passed function
295
419
  :param func: A function that takes each ancestor as an argument and returns True/False
296
420
  :return: The first ancestor that match the function or ``None`` otherwise.
@@ -301,30 +425,28 @@ class Adaptor(SelectorsGeneration):
301
425
  return None
302
426
 
303
427
  @property
304
- def path(self) -> 'Adaptors[Adaptor]':
305
- """Returns list of type :class:`Adaptors` that contains the path leading to the current element from the root."""
428
+ def path(self) -> "Selectors":
429
+ """Returns a list of type `Selectors` that contains the path leading to the current element from the root."""
306
430
  lst = list(self.iterancestors())
307
- return Adaptors(lst)
431
+ return Selectors(lst)
308
432
 
309
433
  @property
310
- def next(self) -> Union['Adaptor', None]:
434
+ def next(self) -> Optional["Selector"]:
311
435
  """Returns the next element of the current element in the children of the parent or ``None`` otherwise."""
312
436
  next_element = self._root.getnext()
313
- if next_element is not None:
314
- while type(next_element) in html_forbidden:
315
- # Ignore html comments and unwanted types
316
- next_element = next_element.getnext()
437
+ while next_element is not None and isinstance(next_element, html_forbidden):
438
+ # Ignore HTML comments and unwanted types
439
+ next_element = next_element.getnext()
317
440
 
318
441
  return self.__handle_element(next_element)
319
442
 
320
443
  @property
321
- def previous(self) -> Union['Adaptor', None]:
444
+ def previous(self) -> Optional["Selector"]:
322
445
  """Returns the previous element of the current element in the children of the parent or ``None`` otherwise."""
323
446
  prev_element = self._root.getprevious()
324
- if prev_element is not None:
325
- while type(prev_element) in html_forbidden:
326
- # Ignore html comments and unwanted types
327
- prev_element = prev_element.getprevious()
447
+ while prev_element is not None and isinstance(prev_element, html_forbidden):
448
+ # Ignore HTML comments and unwanted types
449
+ prev_element = prev_element.getprevious()
328
450
 
329
451
  return self.__handle_element(prev_element)
330
452
 
@@ -346,41 +468,44 @@ class Adaptor(SelectorsGeneration):
346
468
  data = "<"
347
469
  content = clean_spaces(self.html_content)
348
470
  if len(content) > length_limit:
349
- content = content[:length_limit].strip() + '...'
471
+ content = content[:length_limit].strip() + "..."
350
472
  data += f"data='{content}'"
351
473
 
352
474
  if self.parent:
353
475
  parent_content = clean_spaces(self.parent.html_content)
354
476
  if len(parent_content) > length_limit:
355
- parent_content = parent_content[:length_limit].strip() + '...'
477
+ parent_content = parent_content[:length_limit].strip() + "..."
356
478
 
357
479
  data += f" parent='{parent_content}'"
358
480
 
359
481
  return data + ">"
360
482
 
361
- # From here we start the selecting functions
483
+ # From here we start with the selecting functions
362
484
  def relocate(
363
- self, element: Union[Dict, html.HtmlElement, 'Adaptor'], percentage: int = 0, adaptor_type: bool = False
364
- ) -> Union[List[Union[html.HtmlElement, None]], 'Adaptors']:
485
+ self,
486
+ element: Union[Dict, HtmlElement, "Selector"],
487
+ percentage: int = 0,
488
+ selector_type: bool = False,
489
+ ) -> Union[List[HtmlElement], "Selectors"]:
365
490
  """This function will search again for the element in the page tree, used automatically on page structure change
366
491
 
367
492
  :param element: The element we want to relocate in the tree
368
493
  :param percentage: The minimum percentage to accept and not going lower than that. Be aware that the percentage
369
- calculation depends solely on the page structure so don't play with this number unless you must know
494
+ calculation depends solely on the page structure, so don't play with this number unless you must know
370
495
  what you are doing!
371
- :param adaptor_type: If True, the return result will be converted to `Adaptors` object
372
- :return: List of pure HTML elements that got the highest matching score or 'Adaptors' object
496
+ :param selector_type: If True, the return result will be converted to `Selectors` object
497
+ :return: List of pure HTML elements that got the highest matching score or 'Selectors' object
373
498
  """
374
499
  score_table = {}
375
- # Note: `element` will be most likely always be a dictionary at this point.
500
+ # Note: `element` will most likely always be a dictionary at this point.
376
501
  if isinstance(element, self.__class__):
377
502
  element = element._root
378
503
 
379
- if issubclass(type(element), html.HtmlElement):
504
+ if issubclass(type(element), HtmlElement):
380
505
  element = _StorageTools.element_to_dict(element)
381
506
 
382
- for node in self._root.xpath('.//*'):
383
- # Collect all elements in the page then for each element get the matching score of it against the node.
507
+ for node in _find_all_elements(self._root):
508
+ # Collect all elements in the page, then for each element get the matching score of it against the node.
384
509
  # Hence: the code doesn't stop even if the score was 100%
385
510
  # because there might be another element(s) left in page with the same score
386
511
  score = self.__calculate_similarity_score(element, node)
@@ -390,230 +515,319 @@ class Adaptor(SelectorsGeneration):
390
515
  highest_probability = max(score_table.keys())
391
516
  if score_table[highest_probability] and highest_probability >= percentage:
392
517
  if log.getEffectiveLevel() < 20:
393
- # No need to execute this part if logging level is not debugging
394
- log.debug(f'Highest probability was {highest_probability}%')
395
- log.debug('Top 5 best matching elements are: ')
518
+ # No need to execute this part if the logging level is not debugging
519
+ log.debug(f"Highest probability was {highest_probability}%")
520
+ log.debug("Top 5 best matching elements are: ")
396
521
  for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
397
- log.debug(f'{percent} -> {self.__handle_elements(score_table[percent])}')
522
+ log.debug(
523
+ f"{percent} -> {self.__handle_elements(score_table[percent])}"
524
+ )
398
525
 
399
- if not adaptor_type:
526
+ if not selector_type:
400
527
  return score_table[highest_probability]
401
528
  return self.__handle_elements(score_table[highest_probability])
402
529
  return []
403
530
 
404
- def css_first(self, selector: str, identifier: str = '',
405
- auto_match: bool = False, auto_save: bool = False, percentage: int = 0
406
- ) -> Union['Adaptor', 'TextHandler', None]:
407
- """Search current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
531
+ def css_first(
532
+ self,
533
+ selector: str,
534
+ identifier: str = "",
535
+ adaptive: bool = False,
536
+ auto_save: bool = False,
537
+ percentage: int = 0,
538
+ ) -> Union["Selector", "TextHandler", None]:
539
+ """Search the current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
408
540
 
409
541
  **Important:
410
- It's recommended to use the identifier argument if you plan to use different selector later
542
+ It's recommended to use the identifier argument if you plan to use a different selector later
411
543
  and want to relocate the same element(s)**
412
544
 
413
545
  :param selector: The CSS3 selector to be used.
414
- :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
415
- :param identifier: A string that will be used to save/retrieve element's data in auto-matching
546
+ :param adaptive: If enabled, the function will try to relocate the element if it was 'saved' before
547
+ :param identifier: A string that will be used to save/retrieve the element's data in adaptive mode,
416
548
  otherwise the selector will be used.
417
- :param auto_save: Automatically save new elements for `auto_match` later
418
- :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
419
- Be aware that the percentage calculation depends solely on the page structure so don't play with this
549
+ :param auto_save: Automatically save new elements for `adaptive` later
550
+ :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
551
+ Be aware that the percentage calculation depends solely on the page structure, so don't play with this
420
552
  number unless you really know what you are doing!
421
553
  """
422
- for element in self.css(selector, identifier, auto_match, auto_save, percentage):
554
+ for element in self.css(
555
+ selector,
556
+ identifier,
557
+ adaptive,
558
+ auto_save,
559
+ percentage,
560
+ _scrapling_first_match=True,
561
+ ):
423
562
  return element
424
563
  return None
425
564
 
426
- def xpath_first(self, selector: str, identifier: str = '',
427
- auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
428
- ) -> Union['Adaptor', 'TextHandler', None]:
429
- """Search current tree with XPath selectors and return the first result if possible, otherwise return `None`
565
+ def xpath_first(
566
+ self,
567
+ selector: str,
568
+ identifier: str = "",
569
+ adaptive: bool = False,
570
+ auto_save: bool = False,
571
+ percentage: int = 0,
572
+ **kwargs: Any,
573
+ ) -> Union["Selector", "TextHandler", None]:
574
+ """Search the current tree with XPath selectors and return the first result if possible, otherwise return `None`
430
575
 
431
576
  **Important:
432
- It's recommended to use the identifier argument if you plan to use different selector later
577
+ It's recommended to use the identifier argument if you plan to use a different selector later
433
578
  and want to relocate the same element(s)**
434
579
 
435
580
  Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
436
581
 
437
582
  :param selector: The XPath selector to be used.
438
- :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
439
- :param identifier: A string that will be used to save/retrieve element's data in auto-matching
583
+ :param adaptive: If enabled, the function will try to relocate the element if it was 'saved' before
584
+ :param identifier: A string that will be used to save/retrieve the element's data in adaptive mode,
440
585
  otherwise the selector will be used.
441
- :param auto_save: Automatically save new elements for `auto_match` later
442
- :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
443
- Be aware that the percentage calculation depends solely on the page structure so don't play with this
586
+ :param auto_save: Automatically save new elements for `adaptive` later
587
+ :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
588
+ Be aware that the percentage calculation depends solely on the page structure, so don't play with this
444
589
  number unless you really know what you are doing!
445
590
  """
446
- for element in self.xpath(selector, identifier, auto_match, auto_save, percentage, **kwargs):
591
+ for element in self.xpath(
592
+ selector,
593
+ identifier,
594
+ adaptive,
595
+ auto_save,
596
+ percentage,
597
+ _scrapling_first_match=True,
598
+ **kwargs,
599
+ ):
447
600
  return element
448
601
  return None
449
602
 
450
- def css(self, selector: str, identifier: str = '',
451
- auto_match: bool = False, auto_save: bool = False, percentage: int = 0
452
- ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']:
453
- """Search current tree with CSS3 selectors
603
+ def css(
604
+ self,
605
+ selector: str,
606
+ identifier: str = "",
607
+ adaptive: bool = False,
608
+ auto_save: bool = False,
609
+ percentage: int = 0,
610
+ **kwargs: Any,
611
+ ) -> Union["Selectors", List, "TextHandlers"]:
612
+ """Search the current tree with CSS3 selectors
454
613
 
455
614
  **Important:
456
- It's recommended to use the identifier argument if you plan to use different selector later
615
+ It's recommended to use the identifier argument if you plan to use a different selector later
457
616
  and want to relocate the same element(s)**
458
617
 
459
618
  :param selector: The CSS3 selector to be used.
460
- :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
461
- :param identifier: A string that will be used to save/retrieve element's data in auto-matching
619
+ :param adaptive: If enabled, the function will try to relocate the element if it was 'saved' before
620
+ :param identifier: A string that will be used to save/retrieve the element's data in adaptive mode,
462
621
  otherwise the selector will be used.
463
- :param auto_save: Automatically save new elements for `auto_match` later
464
- :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
465
- Be aware that the percentage calculation depends solely on the page structure so don't play with this
622
+ :param auto_save: Automatically save new elements for `adaptive` later
623
+ :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
624
+ Be aware that the percentage calculation depends solely on the page structure, so don't play with this
466
625
  number unless you really know what you are doing!
467
626
 
468
- :return: List as :class:`Adaptors`
627
+ :return: `Selectors` class.
469
628
  """
470
629
  try:
471
- if not self.__auto_match_enabled or ',' not in selector:
630
+ if not self.__adaptive_enabled or "," not in selector:
472
631
  # No need to split selectors in this case, let's save some CPU cycles :)
473
- xpath_selector = translator_instance.css_to_xpath(selector)
474
- return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)
632
+ xpath_selector = _translator.css_to_xpath(selector)
633
+ return self.xpath(
634
+ xpath_selector,
635
+ identifier or selector,
636
+ adaptive,
637
+ auto_save,
638
+ percentage,
639
+ _scrapling_first_match=kwargs.pop("_scrapling_first_match", False),
640
+ )
475
641
 
476
642
  results = []
477
- if ',' in selector:
478
- for single_selector in split_selectors(selector):
479
- # I'm doing this only so the `save` function save data correctly for combined selectors
480
- # Like using the ',' to combine two different selectors that point to different elements.
481
- xpath_selector = translator_instance.css_to_xpath(single_selector.canonical())
482
- results += self.xpath(
483
- xpath_selector, identifier or single_selector.canonical(), auto_match, auto_save, percentage
484
- )
643
+ for single_selector in split_selectors(selector):
644
+ # I'm doing this only so the `save` function saves data correctly for combined selectors
645
+ # Like using the ',' to combine two different selectors that point to different elements.
646
+ xpath_selector = _translator.css_to_xpath(single_selector.canonical())
647
+ results += self.xpath(
648
+ xpath_selector,
649
+ identifier or single_selector.canonical(),
650
+ adaptive,
651
+ auto_save,
652
+ percentage,
653
+ _scrapling_first_match=kwargs.pop("_scrapling_first_match", False),
654
+ )
485
655
 
486
656
  return results
487
- except (SelectorError, SelectorSyntaxError,):
488
- raise SelectorSyntaxError(f"Invalid CSS selector: {selector}")
657
+ except (
658
+ SelectorError,
659
+ SelectorSyntaxError,
660
+ ) as e:
661
+ raise SelectorSyntaxError(
662
+ f"Invalid CSS selector '{selector}': {str(e)}"
663
+ ) from e
489
664
 
490
- def xpath(self, selector: str, identifier: str = '',
491
- auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
492
- ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']:
493
- """Search current tree with XPath selectors
665
+ def xpath(
666
+ self,
667
+ selector: str,
668
+ identifier: str = "",
669
+ adaptive: bool = False,
670
+ auto_save: bool = False,
671
+ percentage: int = 0,
672
+ **kwargs: Any,
673
+ ) -> Union["Selectors", "TextHandlers"]:
674
+ """Search the current tree with XPath selectors
494
675
 
495
676
  **Important:
496
- It's recommended to use the identifier argument if you plan to use different selector later
677
+ It's recommended to use the identifier argument if you plan to use a different selector later
497
678
  and want to relocate the same element(s)**
498
679
 
499
680
  Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
500
681
 
501
682
  :param selector: The XPath selector to be used.
502
- :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
503
- :param identifier: A string that will be used to save/retrieve element's data in auto-matching
683
+ :param adaptive: If enabled, the function will try to relocate the element if it was 'saved' before
684
+ :param identifier: A string that will be used to save/retrieve the element's data in adaptive mode,
504
685
  otherwise the selector will be used.
505
- :param auto_save: Automatically save new elements for `auto_match` later
506
- :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
507
- Be aware that the percentage calculation depends solely on the page structure so don't play with this
686
+ :param auto_save: Automatically save new elements for `adaptive` later
687
+ :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
688
+ Be aware that the percentage calculation depends solely on the page structure, so don't play with this
508
689
  number unless you really know what you are doing!
509
690
 
510
- :return: List as :class:`Adaptors`
691
+ :return: `Selectors` class.
511
692
  """
693
+ _first_match = kwargs.pop(
694
+ "_scrapling_first_match", False
695
+ ) # Used internally only to speed up `css_first` and `xpath_first`
512
696
  try:
513
- elements = self._root.xpath(selector, **kwargs)
514
-
515
- if elements:
516
- if auto_save:
517
- if not self.__auto_match_enabled:
518
- log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
519
- else:
520
- self.save(elements[0], identifier or selector)
521
-
522
- return self.__handle_elements(elements)
523
- elif self.__auto_match_enabled:
524
- if auto_match:
697
+ if elements := self._root.xpath(selector, **kwargs):
698
+ if not self.__adaptive_enabled and auto_save:
699
+ log.warning(
700
+ "Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
701
+ )
702
+ elif self.__adaptive_enabled and auto_save:
703
+ self.save(elements[0], identifier or selector)
704
+
705
+ return self.__handle_elements(
706
+ elements[0:1] if (_first_match and elements) else elements
707
+ )
708
+ elif self.__adaptive_enabled:
709
+ if adaptive:
525
710
  element_data = self.retrieve(identifier or selector)
526
711
  if element_data:
527
712
  elements = self.relocate(element_data, percentage)
528
713
  if elements is not None and auto_save:
529
714
  self.save(elements[0], identifier or selector)
530
715
 
531
- return self.__handle_elements(elements)
716
+ return self.__handle_elements(
717
+ elements[0:1] if (_first_match and elements) else elements
718
+ )
532
719
  else:
533
- if auto_match:
534
- log.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
720
+ if adaptive:
721
+ log.warning(
722
+ "Argument `adaptive` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
723
+ )
535
724
  elif auto_save:
536
- log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
537
-
538
- return self.__handle_elements(elements)
539
-
540
- except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
541
- raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
725
+ log.warning(
726
+ "Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
727
+ )
542
728
 
543
- def find_all(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> 'Adaptors':
544
- """Find elements by filters of your creations for ease..
729
+ return self.__handle_elements(
730
+ elements[0:1] if (_first_match and elements) else elements
731
+ )
545
732
 
546
- :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
733
+ except (
734
+ SelectorError,
735
+ SelectorSyntaxError,
736
+ XPathError,
737
+ XPathEvalError,
738
+ ) as e:
739
+ raise SelectorSyntaxError(f"Invalid XPath selector: {selector}") from e
740
+
741
+ def find_all(
742
+ self,
743
+ *args: str | Iterable[str] | Pattern | Callable | Dict[str, str],
744
+ **kwargs: str,
745
+ ) -> "Selectors":
746
+ """Find elements using filters of your own creation.
747
+
748
+ :param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
547
749
  :param kwargs: The attributes you want to filter elements by.
548
- :return: The `Adaptors` object of the elements or empty list
750
+ :return: The `Selectors` object of the elements or empty list
549
751
  """
550
- # Attributes that are Python reserved words and can't be used directly
551
- # Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
552
- # https://www.w3schools.com/python/python_ref_keywords.asp
553
- whitelisted = {
554
- 'class_': 'class',
555
- 'for_': 'for',
556
- }
557
752
 
558
753
  if not args and not kwargs:
559
- raise TypeError('You have to pass something to search with, like tag name(s), tag attributes, or both.')
754
+ raise TypeError(
755
+ "You have to pass something to search with, like tag name(s), tag attributes, or both."
756
+ )
560
757
 
561
758
  attributes = dict()
562
759
  tags, patterns = set(), set()
563
- results, functions, selectors = Adaptors([]), [], []
760
+ results, functions, selectors = Selectors(), [], []
564
761
 
565
762
  # Brace yourself for a wonderful journey!
566
763
  for arg in args:
567
- if type(arg) is str:
764
+ if isinstance(arg, str):
568
765
  tags.add(arg)
569
766
 
570
- elif type(arg) in [list, tuple, set]:
571
- if not all(map(lambda x: type(x) is str, arg)):
572
- raise TypeError('Nested Iterables are not accepted, only iterables of tag names are accepted')
767
+ elif type(arg) in (list, tuple, set):
768
+ if not all(map(lambda x: isinstance(x, str), arg)):
769
+ raise TypeError(
770
+ "Nested Iterables are not accepted, only iterables of tag names are accepted"
771
+ )
573
772
  tags.update(set(arg))
574
773
 
575
774
  elif isinstance(arg, dict):
576
- if not all([(type(k) is str and type(v) is str) for k, v in arg.items()]):
577
- raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
775
+ if not all(
776
+ [
777
+ (isinstance(k, str) and isinstance(v, str))
778
+ for k, v in arg.items()
779
+ ]
780
+ ):
781
+ raise TypeError(
782
+ "Nested dictionaries are not accepted, only string keys and string values are accepted"
783
+ )
578
784
  attributes.update(arg)
579
785
 
580
786
  elif isinstance(arg, re.Pattern):
581
787
  patterns.add(arg)
582
788
 
583
789
  elif callable(arg):
584
- if len(inspect.signature(arg).parameters) > 0:
790
+ if len(signature(arg).parameters) > 0:
585
791
  functions.append(arg)
586
792
  else:
587
- raise TypeError("Callable filter function must have at least one argument to take `Adaptor` objects.")
793
+ raise TypeError(
794
+ "Callable filter function must have at least one argument to take `Selector` objects."
795
+ )
588
796
 
589
797
  else:
590
- raise TypeError(f'Argument with type "{type(arg)}" is not accepted, please read the docs.')
798
+ raise TypeError(
799
+ f'Argument with type "{type(arg)}" is not accepted, please read the docs.'
800
+ )
591
801
 
592
- if not all([(type(k) is str and type(v) is str) for k, v in kwargs.items()]):
593
- raise TypeError('Only string values are accepted for arguments')
802
+ if not all(
803
+ [(isinstance(k, str) and isinstance(v, str)) for k, v in kwargs.items()]
804
+ ):
805
+ raise TypeError("Only string values are accepted for arguments")
594
806
 
595
807
  for attribute_name, value in kwargs.items():
596
808
  # Only replace names for kwargs, replacing them in dictionaries doesn't make sense
597
- attribute_name = whitelisted.get(attribute_name, attribute_name)
809
+ attribute_name = _whitelisted.get(attribute_name, attribute_name)
598
810
  attributes[attribute_name] = value
599
811
 
600
812
  # It's easier and faster to build a selector than traversing the tree
601
- tags = tags or ['*']
813
+ tags = tags or ["*"]
602
814
  for tag in tags:
603
815
  selector = tag
604
816
  for key, value in attributes.items():
605
- value = value.replace('"', r'\"') # Escape double quotes in user input
817
+ value = value.replace('"', r"\"") # Escape double quotes in user input
606
818
  # Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
607
819
  selector += '[{}="{}"]'.format(key, value)
608
- if selector != '*':
820
+ if selector != "*":
609
821
  selectors.append(selector)
610
822
 
611
823
  if selectors:
612
- results = self.css(', '.join(selectors))
824
+ results = self.css(", ".join(selectors))
613
825
  if results:
614
826
  # From the results, get the ones that fulfill passed regex patterns
615
827
  for pattern in patterns:
616
- results = results.filter(lambda e: e.text.re(pattern, check_match=True))
828
+ results = results.filter(
829
+ lambda e: e.text.re(pattern, check_match=True)
830
+ )
617
831
 
618
832
  # From the results, get the ones that fulfill passed functions
619
833
  for function in functions:
@@ -623,25 +837,31 @@ class Adaptor(SelectorsGeneration):
623
837
  for pattern in patterns:
624
838
  results = results.filter(lambda e: e.text.re(pattern, check_match=True))
625
839
 
626
- # Collect element if it fulfills passed function otherwise
840
+ # Keep an element only if it fulfills the passed functions
627
841
  for function in functions:
628
842
  results = results.filter(function)
629
843
 
630
844
  return results
631
845
 
632
- def find(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptor', None]:
633
- """Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
846
+ def find(
847
+ self,
848
+ *args: str | Iterable[str] | Pattern | Callable | Dict[str, str],
849
+ **kwargs: str,
850
+ ) -> Optional["Selector"]:
851
+ """Find elements using filters of your own creation, then return the first result; otherwise return `None`.
634
852
 
635
- :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
853
+ :param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
636
854
  :param kwargs: The attributes you want to filter elements by.
637
- :return: The `Adaptor` object of the element or `None` if the result didn't match
855
+ :return: The `Selector` object of the element or `None` if the result didn't match
638
856
  """
639
857
  for element in self.find_all(*args, **kwargs):
640
858
  return element
641
859
  return None
642
860
 
643
- def __calculate_similarity_score(self, original: Dict, candidate: html.HtmlElement) -> float:
644
- """Used internally to calculate a score that shows how candidate element similar to the original one
861
+ def __calculate_similarity_score(
862
+ self, original: Dict, candidate: HtmlElement
863
+ ) -> float:
864
+ """Used internally to calculate a score that shows how similar a candidate element is to the original one
645
865
 
646
866
  :param original: The original element in the form of the dictionary generated from `element_to_dict` function
647
867
  :param candidate: The element to compare with the original element.
@@ -653,53 +873,68 @@ class Adaptor(SelectorsGeneration):
653
873
  # Possible TODO:
654
874
  # Study the idea of giving weight to each test below so some are more important than others
655
875
  # Current results: With weights some websites had better score while it was worse for others
656
- score += 1 if original['tag'] == candidate['tag'] else 0 # * 0.3 # 30%
876
+ score += 1 if original["tag"] == candidate["tag"] else 0 # * 0.3 # 30%
657
877
  checks += 1
658
878
 
659
- if original['text']:
660
- score += SequenceMatcher(None, original['text'], candidate.get('text') or '').ratio() # * 0.3 # 30%
879
+ if original["text"]:
880
+ score += SequenceMatcher(
881
+ None, original["text"], candidate.get("text") or ""
882
+ ).ratio() # * 0.3 # 30%
661
883
  checks += 1
662
884
 
663
- # if both doesn't have attributes, it still count for something!
664
- score += self.__calculate_dict_diff(original['attributes'], candidate['attributes']) # * 0.3 # 30%
885
+ # If neither has attributes, it still counts for something!
886
+ score += self.__calculate_dict_diff(
887
+ original["attributes"], candidate["attributes"]
888
+ ) # * 0.3 # 30%
665
889
  checks += 1
666
890
 
667
891
  # Separate similarity test for class, id, href,... this will help in full structural changes
668
- for attrib in ('class', 'id', 'href', 'src',):
669
- if original['attributes'].get(attrib):
892
+ for attrib in (
893
+ "class",
894
+ "id",
895
+ "href",
896
+ "src",
897
+ ):
898
+ if original["attributes"].get(attrib):
670
899
  score += SequenceMatcher(
671
- None, original['attributes'][attrib], candidate['attributes'].get(attrib) or ''
900
+ None,
901
+ original["attributes"][attrib],
902
+ candidate["attributes"].get(attrib) or "",
672
903
  ).ratio() # * 0.3 # 30%
673
904
  checks += 1
674
905
 
675
- score += SequenceMatcher(None, original['path'], candidate['path']).ratio() # * 0.1 # 10%
906
+ score += SequenceMatcher(
907
+ None, original["path"], candidate["path"]
908
+ ).ratio() # * 0.1 # 10%
676
909
  checks += 1
677
910
 
678
- if original.get('parent_name'):
911
+ if original.get("parent_name"):
679
912
  # Then we start comparing parents' data
680
- if candidate.get('parent_name'):
913
+ if candidate.get("parent_name"):
681
914
  score += SequenceMatcher(
682
- None, original['parent_name'], candidate.get('parent_name') or ''
915
+ None, original["parent_name"], candidate.get("parent_name") or ""
683
916
  ).ratio() # * 0.2 # 20%
684
917
  checks += 1
685
918
 
686
919
  score += self.__calculate_dict_diff(
687
- original['parent_attribs'], candidate.get('parent_attribs') or {}
920
+ original["parent_attribs"], candidate.get("parent_attribs") or {}
688
921
  ) # * 0.2 # 20%
689
922
  checks += 1
690
923
 
691
- if original['parent_text']:
924
+ if original["parent_text"]:
692
925
  score += SequenceMatcher(
693
- None, original['parent_text'], candidate.get('parent_text') or ''
926
+ None,
927
+ original["parent_text"],
928
+ candidate.get("parent_text") or "",
694
929
  ).ratio() # * 0.1 # 10%
695
930
  checks += 1
696
931
  # else:
697
- # # The original element have a parent and this one not, this is not a good sign
932
+ # # The original element has a parent and this one not, this is not a good sign
698
933
  # score -= 0.1
699
934
 
700
- if original.get('siblings'):
935
+ if original.get("siblings"):
701
936
  score += SequenceMatcher(
702
- None, original['siblings'], candidate.get('siblings') or []
937
+ None, original["siblings"], candidate.get("siblings") or []
703
938
  ).ratio() # * 0.1 # 10%
704
939
  checks += 1
705
940
 
@@ -707,21 +942,26 @@ class Adaptor(SelectorsGeneration):
707
942
  return round((score / checks) * 100, 2)
708
943
 
709
944
  @staticmethod
710
- def __calculate_dict_diff(dict1: dict, dict2: dict) -> float:
711
- """Used internally calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries
712
- """
713
- score = SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio() * 0.5
714
- score += SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio() * 0.5
945
+ def __calculate_dict_diff(dict1: Dict, dict2: Dict) -> float:
946
+ """Used internally to calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries"""
947
+ score = (
948
+ SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio()
949
+ * 0.5
950
+ )
951
+ score += (
952
+ SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio()
953
+ * 0.5
954
+ )
715
955
  return score
716
956
 
717
- def save(self, element: Union['Adaptor', html.HtmlElement], identifier: str) -> None:
957
+ def save(self, element: Union["Selector", HtmlElement], identifier: str) -> None:
718
958
  """Saves the element's unique properties to the storage for retrieval and relocation later
719
959
 
720
- :param element: The element itself that we want to save to storage, it can be a `Adaptor` or pure `HtmlElement`
960
+ :param element: The element itself that we want to save to storage, it can be a `Selector` or a pure `HtmlElement`
721
961
  :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
722
962
  the docs for more info.
723
963
  """
724
- if self.__auto_match_enabled:
964
+ if self.__adaptive_enabled:
725
965
  if isinstance(element, self.__class__):
726
966
  element = element._root
727
967
 
@@ -731,154 +971,202 @@ class Adaptor(SelectorsGeneration):
731
971
  self._storage.save(element, identifier)
732
972
  else:
733
973
  log.critical(
734
- "Can't use Auto-match features while disabled globally, you have to start a new class instance."
974
+ "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
735
975
  )
736
976
 
737
- def retrieve(self, identifier: str) -> Optional[Dict]:
977
+ def retrieve(self, identifier: str) -> Optional[Dict[str, Any]]:
738
978
  """Using the identifier, we search the storage and return the unique properties of the element
739
979
 
740
980
  :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
741
981
  the docs for more info.
742
982
  :return: A dictionary of the unique properties
743
983
  """
744
- if self.__auto_match_enabled:
984
+ if self.__adaptive_enabled:
745
985
  return self._storage.retrieve(identifier)
746
986
 
747
987
  log.critical(
748
- "Can't use Auto-match features while disabled globally, you have to start a new class instance."
988
+ "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
749
989
  )
990
+ return None
750
991
 
751
992
  # Operations on text functions
752
993
  def json(self) -> Dict:
753
- """Return json response if the response is jsonable otherwise throws error"""
754
- if self.text:
994
+ """Return JSON response if the response is jsonable otherwise throws error"""
995
+ if self._raw_body:
996
+ return TextHandler(self._raw_body).json()
997
+ elif self.text:
755
998
  return self.text.json()
756
999
  else:
757
1000
  return self.get_all_text(strip=True).json()
758
1001
 
759
- def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
760
- clean_match: bool = False, case_sensitive: bool = True) -> TextHandlers:
1002
+ def re(
1003
+ self,
1004
+ regex: str | Pattern[str],
1005
+ replace_entities: bool = True,
1006
+ clean_match: bool = False,
1007
+ case_sensitive: bool = True,
1008
+ ) -> TextHandlers:
761
1009
  """Apply the given regex to the current text and return a list of strings with the matches.
762
1010
 
763
1011
  :param regex: Can be either a compiled regular expression or a string.
764
- :param replace_entities: if enabled character entity references are replaced by their corresponding character
1012
+ :param replace_entities: If enabled character entity references are replaced by their corresponding character
765
1013
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
766
- :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
1014
+ :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
767
1015
  """
768
1016
  return self.text.re(regex, replace_entities, clean_match, case_sensitive)
769
1017
 
770
- def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
771
- clean_match: bool = False, case_sensitive: bool = True) -> TextHandler:
1018
+ def re_first(
1019
+ self,
1020
+ regex: str | Pattern[str],
1021
+ default=None,
1022
+ replace_entities: bool = True,
1023
+ clean_match: bool = False,
1024
+ case_sensitive: bool = True,
1025
+ ) -> TextHandler:
772
1026
  """Apply the given regex to text and return the first match if found, otherwise return the default value.
773
1027
 
774
1028
  :param regex: Can be either a compiled regular expression or a string.
775
1029
  :param default: The default value to be returned if there is no match
776
1030
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
777
1031
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
778
- :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
1032
+ :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
779
1033
  """
780
- return self.text.re_first(regex, default, replace_entities, clean_match, case_sensitive)
1034
+ return self.text.re_first(
1035
+ regex, default, replace_entities, clean_match, case_sensitive
1036
+ )
1037
+
1038
+ @staticmethod
1039
+ def __get_attributes(element: HtmlElement, ignore_attributes: List | Tuple) -> Dict:
1040
+ """Return attributes dictionary without the ignored list"""
1041
+ return {k: v for k, v in element.attrib.items() if k not in ignore_attributes}
1042
+
1043
+ def __are_alike(
1044
+ self,
1045
+ original: HtmlElement,
1046
+ original_attributes: Dict,
1047
+ candidate: HtmlElement,
1048
+ ignore_attributes: List | Tuple,
1049
+ similarity_threshold: float,
1050
+ match_text: bool = False,
1051
+ ) -> bool:
1052
+ """Calculate a score of how much these elements are alike and return True
1053
+ if the score is higher or equals the threshold"""
1054
+ candidate_attributes = (
1055
+ self.__get_attributes(candidate, ignore_attributes)
1056
+ if ignore_attributes
1057
+ else candidate.attrib
1058
+ )
1059
+ score, checks = 0, 0
1060
+
1061
+ if original_attributes:
1062
+ score += sum(
1063
+ SequenceMatcher(None, v, candidate_attributes.get(k, "")).ratio()
1064
+ for k, v in original_attributes.items()
1065
+ )
1066
+ checks += len(candidate_attributes)
1067
+ else:
1068
+ if not candidate_attributes:
1069
+ # Both don't have attributes, this must mean something
1070
+ score += 1
1071
+ checks += 1
1072
+
1073
+ if match_text:
1074
+ score += SequenceMatcher(
1075
+ None,
1076
+ clean_spaces(original.text or ""),
1077
+ clean_spaces(candidate.text or ""),
1078
+ ).ratio()
1079
+ checks += 1
1080
+
1081
+ if checks:
1082
+ return round(score / checks, 2) >= similarity_threshold
1083
+ return False
781
1084
 
782
1085
  def find_similar(
783
- self,
784
- similarity_threshold: float = 0.2,
785
- ignore_attributes: Union[List, Tuple] = ('href', 'src',),
786
- match_text: bool = False
787
- ) -> Union['Adaptors[Adaptor]', List]:
1086
+ self,
1087
+ similarity_threshold: float = 0.2,
1088
+ ignore_attributes: List | Tuple = (
1089
+ "href",
1090
+ "src",
1091
+ ),
1092
+ match_text: bool = False,
1093
+ ) -> "Selectors":
788
1094
  """Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
789
- then return the ones that match the current element attributes with percentage higher than the input threshold.
1095
+ then return the ones that match the current element attributes with a percentage higher than the input threshold.
790
1096
 
791
1097
  This function is inspired by AutoScraper and made for cases where you, for example, found a product div inside
792
- a products-list container and want to find other products using that that element as a starting point EXCEPT
1098
+ a products-list container and want to find other products using that element as a starting point EXCEPT
793
1099
  this function works in any case without depending on the element type.
794
1100
 
795
- :param similarity_threshold: The percentage to use while comparing elements attributes.
1101
+ :param similarity_threshold: The percentage to use while comparing element attributes.
796
1102
  Note: Elements found before attributes matching/comparison will be sharing the same depth, same tag name,
797
- same parent tag name, and same grand parent tag name. So they are 99% likely to be correct unless your are
798
- extremely unlucky then attributes matching comes into play so basically don't play with this number unless
1103
+ same parent tag name, and same grand parent tag name. So they are 99% likely to be correct unless you are
1104
+ extremely unlucky, then attributes matching comes into play, so don't play with this number unless
799
1105
  you are getting the results you don't want.
800
- Also, if current element doesn't have attributes and the similar element as well, then it's a 100% match.
801
- :param ignore_attributes: Attribute names passed will be ignored while matching the attributes in last step.
802
- The default value is to ignore `href` and `src` as URLs can change a lot between elements so it's unreliable
803
- :param match_text: If True, elements text content will be taken into calculation while matching.
804
- Not recommended to use in normal cases but it depends.
1106
+ Also, if the current element doesn't have attributes and the similar element as well, then it's a 100% match.
1107
+ :param ignore_attributes: Attribute names passed will be ignored while matching the attributes in the last step.
1108
+ The default value is to ignore `href` and `src` as URLs can change a lot between elements, so it's unreliable
1109
+ :param match_text: If True, element text content will be taken into calculation while matching.
1110
+ Not recommended to use in normal cases, but it depends.
805
1111
 
806
- :return: A ``Adaptors`` container of ``Adaptor`` objects or empty list
1112
+ :return: A ``Selectors`` container of ``Selector`` objects or empty list
807
1113
  """
808
- def get_attributes(element: html.HtmlElement) -> Dict:
809
- """Return attributes dictionary without the ignored list"""
810
- return {k: v for k, v in element.attrib.items() if k not in ignore_attributes}
811
-
812
- def are_alike(original: html.HtmlElement, original_attributes: Dict, candidate: html.HtmlElement) -> bool:
813
- """Calculate a score of how much these elements are alike and return True
814
- if score is higher or equal the threshold"""
815
- candidate_attributes = get_attributes(candidate) if ignore_attributes else candidate.attrib
816
- score, checks = 0, 0
817
-
818
- if original_attributes:
819
- score += sum(
820
- SequenceMatcher(None, v, candidate_attributes.get(k, '')).ratio()
821
- for k, v in original_attributes.items()
822
- )
823
- checks += len(candidate_attributes)
824
- else:
825
- if not candidate_attributes:
826
- # Both doesn't have attributes, this must mean something
827
- score += 1
828
- checks += 1
1114
+ # We will use the elements' root from now on to get the speed boost of using Lxml directly
1115
+ root = self._root
1116
+ similar_elements = list()
829
1117
 
830
- if match_text:
831
- score += SequenceMatcher(
832
- None, clean_spaces(original.text or ''), clean_spaces(candidate.text or '')
833
- ).ratio()
834
- checks += 1
1118
+ current_depth = len(list(root.iterancestors()))
1119
+ target_attrs = (
1120
+ self.__get_attributes(root, ignore_attributes)
1121
+ if ignore_attributes
1122
+ else root.attrib
1123
+ )
835
1124
 
836
- if checks:
837
- return round(score / checks, 2) >= similarity_threshold
838
- return False
1125
+ path_parts = [self.tag]
1126
+ if (parent := root.getparent()) is not None:
1127
+ path_parts.insert(0, parent.tag)
1128
+ if (grandparent := parent.getparent()) is not None:
1129
+ path_parts.insert(0, grandparent.tag)
839
1130
 
840
- # We will use the elements root from now on to get the speed boost of using Lxml directly
841
- root = self._root
842
- current_depth = len(list(root.iterancestors()))
843
- target_attrs = get_attributes(root) if ignore_attributes else root.attrib
844
- similar_elements = list()
845
- # + root.xpath(f"//{self.tag}[count(ancestor::*) = {current_depth-1}]")
846
- parent = root.getparent()
847
- if parent is not None:
848
- grandparent = parent.getparent() # lol
849
- if grandparent is not None:
850
- potential_matches = root.xpath(
851
- f"//{grandparent.tag}/{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]"
852
- )
853
- else:
854
- potential_matches = root.xpath(f"//{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]")
855
- else:
856
- potential_matches = root.xpath(f"//{self.tag}[count(ancestor::*) = {current_depth}]")
1131
+ xpath_path = "//{}".format("/".join(path_parts))
1132
+ potential_matches = root.xpath(
1133
+ f"{xpath_path}[count(ancestor::*) = {current_depth}]"
1134
+ )
857
1135
 
858
1136
  for potential_match in potential_matches:
859
- if potential_match != root and are_alike(root, target_attrs, potential_match):
1137
+ if potential_match != root and self.__are_alike(
1138
+ root,
1139
+ target_attrs,
1140
+ potential_match,
1141
+ ignore_attributes,
1142
+ similarity_threshold,
1143
+ match_text,
1144
+ ):
860
1145
  similar_elements.append(potential_match)
861
1146
 
862
- return self.__handle_elements(similar_elements)
1147
+ return Selectors(map(self.__element_convertor, similar_elements))
863
1148
 
864
1149
  def find_by_text(
865
- self, text: str, first_match: bool = True, partial: bool = False,
866
- case_sensitive: bool = False, clean_match: bool = True
867
- ) -> Union['Adaptors[Adaptor]', 'Adaptor']:
1150
+ self,
1151
+ text: str,
1152
+ first_match: bool = True,
1153
+ partial: bool = False,
1154
+ case_sensitive: bool = False,
1155
+ clean_match: bool = True,
1156
+ ) -> Union["Selectors", "Selector"]:
868
1157
  """Find elements that its text content fully/partially matches input.
869
1158
  :param text: Text query to match
870
- :param first_match: Return first element that matches conditions, enabled by default
871
- :param partial: If enabled, function return elements that contains the input text
872
- :param case_sensitive: if enabled, letters case will be taken into consideration
1159
+ :param first_match: Returns the first element that matches conditions, enabled by default
1160
+ :param partial: If enabled, the function returns elements that contain the input text
1161
+ :param case_sensitive: if enabled, the letters case will be taken into consideration
873
1162
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
874
1163
  """
875
1164
 
876
- results = Adaptors([])
1165
+ results = Selectors()
877
1166
  if not case_sensitive:
878
1167
  text = text.lower()
879
1168
 
880
- # This selector gets all elements with text content
881
- for node in self.__handle_elements(self._root.xpath('.//*[normalize-space(text())]')):
1169
+ for node in self.__handle_elements(_find_all_elements_with_spaces(self._root)):
882
1170
  """Check if element matches given text otherwise, traverse the children tree and iterate"""
883
1171
  node_text = node.text
884
1172
  if clean_match:
@@ -903,21 +1191,29 @@ class Adaptor(SelectorsGeneration):
903
1191
  return results
904
1192
 
905
1193
  def find_by_regex(
906
- self, query: Union[str, Pattern[str]], first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
907
- ) -> Union['Adaptors[Adaptor]', 'Adaptor']:
1194
+ self,
1195
+ query: str | Pattern[str],
1196
+ first_match: bool = True,
1197
+ case_sensitive: bool = False,
1198
+ clean_match: bool = True,
1199
+ ) -> Union["Selectors", "Selector"]:
908
1200
  """Find elements that its text content matches the input regex pattern.
909
1201
  :param query: Regex query/pattern to match
910
- :param first_match: Return first element that matches conditions, enabled by default
911
- :param case_sensitive: if enabled, letters case will be taken into consideration in the regex
912
- :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
1202
+ :param first_match: Return the first element that matches conditions; enabled by default.
1203
+ :param case_sensitive: If enabled, the letters case will be taken into consideration in the regex.
1204
+ :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching.
913
1205
  """
914
- results = Adaptors([])
1206
+ results = Selectors()
915
1207
 
916
- # This selector gets all elements with text content
917
- for node in self.__handle_elements(self._root.xpath('.//*[normalize-space(text())]')):
1208
+ for node in self.__handle_elements(_find_all_elements_with_spaces(self._root)):
918
1209
  """Check if element matches given regex otherwise, traverse the children tree and iterate"""
919
1210
  node_text = node.text
920
- if node_text.re(query, check_match=True, clean_match=clean_match, case_sensitive=case_sensitive):
1211
+ if node_text.re(
1212
+ query,
1213
+ check_match=True,
1214
+ clean_match=clean_match,
1215
+ case_sensitive=case_sensitive,
1216
+ ):
921
1217
  results.append(node)
922
1218
 
923
1219
  if first_match and results:
@@ -929,21 +1225,22 @@ class Adaptor(SelectorsGeneration):
929
1225
  return results
930
1226
 
931
1227
 
932
- class Adaptors(List[Adaptor]):
1228
+ class Selectors(List[Selector]):
933
1229
  """
934
- The :class:`Adaptors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
1230
+ The `Selectors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
935
1231
  """
1232
+
936
1233
  __slots__ = ()
937
1234
 
938
- @typing.overload
939
- def __getitem__(self, pos: SupportsIndex) -> Adaptor:
1235
+ @overload
1236
+ def __getitem__(self, pos: SupportsIndex) -> Selector:
940
1237
  pass
941
1238
 
942
- @typing.overload
943
- def __getitem__(self, pos: slice) -> "Adaptors":
1239
+ @overload
1240
+ def __getitem__(self, pos: slice) -> "Selectors":
944
1241
  pass
945
1242
 
946
- def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[Adaptor, "Adaptors"]:
1243
+ def __getitem__(self, pos: SupportsIndex | slice) -> Union[Selector, "Selectors"]:
947
1244
  lst = super().__getitem__(pos)
948
1245
  if isinstance(pos, slice):
949
1246
  return self.__class__(lst)
@@ -951,74 +1248,101 @@ class Adaptors(List[Adaptor]):
951
1248
  return lst
952
1249
 
953
1250
  def xpath(
954
- self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0, **kwargs: Any
955
- ) -> "Adaptors[Adaptor]":
1251
+ self,
1252
+ selector: str,
1253
+ identifier: str = "",
1254
+ auto_save: bool = False,
1255
+ percentage: int = 0,
1256
+ **kwargs: Any,
1257
+ ) -> "Selectors":
956
1258
  """
957
1259
  Call the ``.xpath()`` method for each element in this list and return
958
- their results as another :class:`Adaptors`.
1260
+ their results as another `Selectors` class.
959
1261
 
960
1262
  **Important:
961
- It's recommended to use the identifier argument if you plan to use different selector later
1263
+ It's recommended to use the identifier argument if you plan to use a different selector later
962
1264
  and want to relocate the same element(s)**
963
1265
 
964
1266
  Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
965
1267
 
966
1268
  :param selector: The XPath selector to be used.
967
- :param identifier: A string that will be used to retrieve element's data in auto-matching
1269
+ :param identifier: A string that will be used to retrieve element's data in adaptive,
968
1270
  otherwise the selector will be used.
969
- :param auto_save: Automatically save new elements for `auto_match` later
970
- :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
971
- Be aware that the percentage calculation depends solely on the page structure so don't play with this
1271
+ :param auto_save: Automatically save new elements for `adaptive` later
1272
+ :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
1273
+ Be aware that the percentage calculation depends solely on the page structure, so don't play with this
972
1274
  number unless you must know what you are doing!
973
1275
 
974
- :return: List as :class:`Adaptors`
1276
+ :return: `Selectors` class.
975
1277
  """
976
1278
  results = [
977
- n.xpath(selector, identifier or selector, False, auto_save, percentage, **kwargs) for n in self
1279
+ n.xpath(
1280
+ selector, identifier or selector, False, auto_save, percentage, **kwargs
1281
+ )
1282
+ for n in self
978
1283
  ]
979
1284
  return self.__class__(flatten(results))
980
1285
 
981
- def css(self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0) -> "Adaptors[Adaptor]":
1286
+ def css(
1287
+ self,
1288
+ selector: str,
1289
+ identifier: str = "",
1290
+ auto_save: bool = False,
1291
+ percentage: int = 0,
1292
+ ) -> "Selectors":
982
1293
  """
983
1294
  Call the ``.css()`` method for each element in this list and return
984
- their results flattened as another :class:`Adaptors`.
1295
+ their results flattened as another `Selectors` class.
985
1296
 
986
1297
  **Important:
987
- It's recommended to use the identifier argument if you plan to use different selector later
1298
+ It's recommended to use the identifier argument if you plan to use a different selector later
988
1299
  and want to relocate the same element(s)**
989
1300
 
990
1301
  :param selector: The CSS3 selector to be used.
991
- :param identifier: A string that will be used to retrieve element's data in auto-matching
1302
+ :param identifier: A string that will be used to retrieve element's data in adaptive,
992
1303
  otherwise the selector will be used.
993
- :param auto_save: Automatically save new elements for `auto_match` later
994
- :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
995
- Be aware that the percentage calculation depends solely on the page structure so don't play with this
1304
+ :param auto_save: Automatically save new elements for `adaptive` later
1305
+ :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
1306
+ Be aware that the percentage calculation depends solely on the page structure, so don't play with this
996
1307
  number unless you must know what you are doing!
997
1308
 
998
- :return: List as :class:`Adaptors`
1309
+ :return: `Selectors` class.
999
1310
  """
1000
1311
  results = [
1001
- n.css(selector, identifier or selector, False, auto_save, percentage) for n in self
1312
+ n.css(selector, identifier or selector, False, auto_save, percentage)
1313
+ for n in self
1002
1314
  ]
1003
1315
  return self.__class__(flatten(results))
1004
1316
 
1005
- def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
1006
- clean_match: bool = False, case_sensitive: bool = True) -> TextHandlers[TextHandler]:
1317
+ def re(
1318
+ self,
1319
+ regex: str | Pattern,
1320
+ replace_entities: bool = True,
1321
+ clean_match: bool = False,
1322
+ case_sensitive: bool = True,
1323
+ ) -> TextHandlers:
1007
1324
  """Call the ``.re()`` method for each element in this list and return
1008
1325
  their results flattened as List of TextHandler.
1009
1326
 
1010
1327
  :param regex: Can be either a compiled regular expression or a string.
1011
- :param replace_entities: if enabled character entity references are replaced by their corresponding character
1328
+ :param replace_entities: If enabled character entity references are replaced by their corresponding character
1012
1329
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
1013
- :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
1330
+ :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
1014
1331
  """
1015
1332
  results = [
1016
- n.text.re(regex, replace_entities, clean_match, case_sensitive) for n in self
1333
+ n.text.re(regex, replace_entities, clean_match, case_sensitive)
1334
+ for n in self
1017
1335
  ]
1018
1336
  return TextHandlers(flatten(results))
1019
1337
 
1020
- def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
1021
- clean_match: bool = False, case_sensitive: bool = True) -> TextHandler:
1338
+ def re_first(
1339
+ self,
1340
+ regex: str | Pattern,
1341
+ default=None,
1342
+ replace_entities: bool = True,
1343
+ clean_match: bool = False,
1344
+ case_sensitive: bool = True,
1345
+ ) -> TextHandler:
1022
1346
  """Call the ``.re_first()`` method for each element in this list and return
1023
1347
  the first result or the default value otherwise.
1024
1348
 
@@ -1026,14 +1350,14 @@ class Adaptors(List[Adaptor]):
1026
1350
  :param default: The default value to be returned if there is no match
1027
1351
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
1028
1352
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
1029
- :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
1353
+ :param case_sensitive: if disabled, function will set the regex to ignore the letters case while compiling it
1030
1354
  """
1031
1355
  for n in self:
1032
1356
  for result in n.re(regex, replace_entities, clean_match, case_sensitive):
1033
1357
  return result
1034
1358
  return default
1035
1359
 
1036
- def search(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
1360
+ def search(self, func: Callable[["Selector"], bool]) -> Optional["Selector"]:
1037
1361
  """Loop over all current elements and return the first element that matches the passed function
1038
1362
  :param func: A function that takes each element as an argument and returns True/False
1039
1363
  :return: The first element that match the function or ``None`` otherwise.
@@ -1043,14 +1367,12 @@ class Adaptors(List[Adaptor]):
1043
1367
  return element
1044
1368
  return None
1045
1369
 
1046
- def filter(self, func: Callable[['Adaptor'], bool]) -> 'Adaptors[Adaptor]':
1370
+ def filter(self, func: Callable[["Selector"], bool]) -> "Selectors":
1047
1371
  """Filter current elements based on the passed function
1048
1372
  :param func: A function that takes each element as an argument and returns True/False
1049
- :return: The new `Adaptors` object or empty list otherwise.
1373
+ :return: The new `Selectors` object or empty list otherwise.
1050
1374
  """
1051
- return self.__class__([
1052
- element for element in self if func(element)
1053
- ])
1375
+ return self.__class__([element for element in self if func(element)])
1054
1376
 
1055
1377
  # For easy copy-paste from Scrapy/parsel code when needed :)
1056
1378
  def get(self, default=None):
@@ -1075,6 +1397,16 @@ class Adaptors(List[Adaptor]):
1075
1397
  """Returns the last item of the current list or `None` if the list is empty"""
1076
1398
  return self[-1] if len(self) > 0 else None
1077
1399
 
1078
- def __getstate__(self) -> Any:
1400
+ @property
1401
+ def length(self):
1402
+ """Returns the length of the current list"""
1403
+ return len(self)
1404
+
1405
+ def __getstate__(self) -> Any: # pragma: no cover
1079
1406
  # lxml don't like it :)
1080
- raise TypeError("Can't pickle Adaptors object")
1407
+ raise TypeError("Can't pickle Selectors object")
1408
+
1409
+
1410
+ # For backward compatibility
1411
+ Adaptor = Selector
1412
+ Adaptors = Selectors