scrapling 0.2.99__py3-none-any.whl → 0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. scrapling/__init__.py +18 -31
  2. scrapling/cli.py +818 -20
  3. scrapling/core/_html_utils.py +348 -0
  4. scrapling/core/_types.py +34 -17
  5. scrapling/core/ai.py +611 -0
  6. scrapling/core/custom_types.py +183 -100
  7. scrapling/core/mixins.py +27 -19
  8. scrapling/core/shell.py +647 -0
  9. scrapling/core/{storage_adaptors.py → storage.py} +41 -33
  10. scrapling/core/translator.py +20 -26
  11. scrapling/core/utils.py +49 -54
  12. scrapling/engines/__init__.py +15 -6
  13. scrapling/engines/_browsers/__init__.py +2 -0
  14. scrapling/engines/_browsers/_camoufox.py +745 -0
  15. scrapling/engines/_browsers/_config_tools.py +130 -0
  16. scrapling/engines/_browsers/_controllers.py +630 -0
  17. scrapling/engines/_browsers/_page.py +93 -0
  18. scrapling/engines/_browsers/_validators.py +150 -0
  19. scrapling/engines/constants.py +101 -88
  20. scrapling/engines/static.py +667 -110
  21. scrapling/engines/toolbelt/__init__.py +20 -6
  22. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  23. scrapling/engines/toolbelt/convertor.py +254 -0
  24. scrapling/engines/toolbelt/custom.py +158 -175
  25. scrapling/engines/toolbelt/fingerprints.py +32 -46
  26. scrapling/engines/toolbelt/navigation.py +68 -39
  27. scrapling/fetchers.py +227 -333
  28. scrapling/parser.py +781 -449
  29. scrapling-0.3.dist-info/METADATA +409 -0
  30. scrapling-0.3.dist-info/RECORD +41 -0
  31. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/WHEEL +1 -1
  32. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/top_level.txt +0 -1
  33. scrapling/defaults.py +0 -25
  34. scrapling/engines/camo.py +0 -339
  35. scrapling/engines/pw.py +0 -465
  36. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  37. scrapling-0.2.99.dist-info/METADATA +0 -290
  38. scrapling-0.2.99.dist-info/RECORD +0 -49
  39. tests/__init__.py +0 -1
  40. tests/fetchers/__init__.py +0 -1
  41. tests/fetchers/async/__init__.py +0 -0
  42. tests/fetchers/async/test_camoufox.py +0 -97
  43. tests/fetchers/async/test_httpx.py +0 -85
  44. tests/fetchers/async/test_playwright.py +0 -101
  45. tests/fetchers/sync/__init__.py +0 -0
  46. tests/fetchers/sync/test_camoufox.py +0 -70
  47. tests/fetchers/sync/test_httpx.py +0 -84
  48. tests/fetchers/sync/test_playwright.py +0 -89
  49. tests/fetchers/test_utils.py +0 -97
  50. tests/parser/__init__.py +0 -0
  51. tests/parser/test_automatch.py +0 -111
  52. tests/parser/test_general.py +0 -330
  53. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/entry_points.txt +0 -0
  54. {scrapling-0.2.99.dist-info → scrapling-0.3.dist-info}/licenses/LICENSE +0 -0
@@ -1,159 +1,198 @@
1
- import re
2
- import typing
3
1
  from collections.abc import Mapping
4
2
  from types import MappingProxyType
3
+ from re import compile as re_compile, UNICODE, IGNORECASE
5
4
 
6
5
  from orjson import dumps, loads
7
- from w3lib.html import replace_entities as _replace_entities
8
6
 
9
- from scrapling.core._types import (Dict, Iterable, List, Literal, Optional,
10
- Pattern, SupportsIndex, TypeVar, Union)
11
- from scrapling.core.utils import _is_iterable, flatten
7
+ from scrapling.core._types import (
8
+ cast,
9
+ Dict,
10
+ List,
11
+ Union,
12
+ overload,
13
+ TypeVar,
14
+ Literal,
15
+ Pattern,
16
+ Iterable,
17
+ Optional,
18
+ Generator,
19
+ SupportsIndex,
20
+ )
21
+ from scrapling.core.utils import _is_iterable, flatten, __CONSECUTIVE_SPACES_REGEX__
22
+ from scrapling.core._html_utils import _replace_entities
12
23
 
13
24
  # Define type variable for AttributeHandler value type
14
- _TextHandlerType = TypeVar('_TextHandlerType', bound='TextHandler')
25
+ _TextHandlerType = TypeVar("_TextHandlerType", bound="TextHandler")
26
+ __CLEANING_TABLE__ = str.maketrans("\t\r\n", " ")
15
27
 
16
28
 
17
29
  class TextHandler(str):
18
30
  """Extends standard Python string by adding more functionality"""
19
- __slots__ = ()
20
31
 
21
- def __new__(cls, string):
22
- return super().__new__(cls, str(string))
32
+ __slots__ = ()
23
33
 
24
- def __getitem__(self, key: Union[SupportsIndex, slice]) -> "TextHandler":
34
+ def __getitem__(
35
+ self, key: SupportsIndex | slice
36
+ ) -> "TextHandler": # pragma: no cover
25
37
  lst = super().__getitem__(key)
26
- return typing.cast(_TextHandlerType, TextHandler(lst))
38
+ return cast(_TextHandlerType, TextHandler(lst))
27
39
 
28
- def split(self, sep: str = None, maxsplit: SupportsIndex = -1) -> 'TextHandlers':
40
+ def split(
41
+ self, sep: str = None, maxsplit: SupportsIndex = -1
42
+ ) -> "TextHandlers": # pragma: no cover
29
43
  return TextHandlers(
30
- typing.cast(List[_TextHandlerType], [TextHandler(s) for s in super().split(sep, maxsplit)])
44
+ cast(
45
+ List[_TextHandlerType],
46
+ [TextHandler(s) for s in super().split(sep, maxsplit)],
47
+ )
31
48
  )
32
49
 
33
- def strip(self, chars: str = None) -> Union[str, 'TextHandler']:
50
+ def strip(self, chars: str = None) -> Union[str, "TextHandler"]: # pragma: no cover
34
51
  return TextHandler(super().strip(chars))
35
52
 
36
- def lstrip(self, chars: str = None) -> Union[str, 'TextHandler']:
53
+ def lstrip(
54
+ self, chars: str = None
55
+ ) -> Union[str, "TextHandler"]: # pragma: no cover
37
56
  return TextHandler(super().lstrip(chars))
38
57
 
39
- def rstrip(self, chars: str = None) -> Union[str, 'TextHandler']:
58
+ def rstrip(
59
+ self, chars: str = None
60
+ ) -> Union[str, "TextHandler"]: # pragma: no cover
40
61
  return TextHandler(super().rstrip(chars))
41
62
 
42
- def capitalize(self) -> Union[str, 'TextHandler']:
63
+ def capitalize(self) -> Union[str, "TextHandler"]: # pragma: no cover
43
64
  return TextHandler(super().capitalize())
44
65
 
45
- def casefold(self) -> Union[str, 'TextHandler']:
66
+ def casefold(self) -> Union[str, "TextHandler"]: # pragma: no cover
46
67
  return TextHandler(super().casefold())
47
68
 
48
- def center(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
69
+ def center(
70
+ self, width: SupportsIndex, fillchar: str = " "
71
+ ) -> Union[str, "TextHandler"]: # pragma: no cover
49
72
  return TextHandler(super().center(width, fillchar))
50
73
 
51
- def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, 'TextHandler']:
74
+ def expandtabs(
75
+ self, tabsize: SupportsIndex = 8
76
+ ) -> Union[str, "TextHandler"]: # pragma: no cover
52
77
  return TextHandler(super().expandtabs(tabsize))
53
78
 
54
- def format(self, *args: str, **kwargs: str) -> Union[str, 'TextHandler']:
79
+ def format(
80
+ self, *args: str, **kwargs: str
81
+ ) -> Union[str, "TextHandler"]: # pragma: no cover
55
82
  return TextHandler(super().format(*args, **kwargs))
56
83
 
57
- def format_map(self, mapping) -> Union[str, 'TextHandler']:
84
+ def format_map(self, mapping) -> Union[str, "TextHandler"]: # pragma: no cover
58
85
  return TextHandler(super().format_map(mapping))
59
86
 
60
- def join(self, iterable: Iterable[str]) -> Union[str, 'TextHandler']:
87
+ def join(
88
+ self, iterable: Iterable[str]
89
+ ) -> Union[str, "TextHandler"]: # pragma: no cover
61
90
  return TextHandler(super().join(iterable))
62
91
 
63
- def ljust(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
92
+ def ljust(
93
+ self, width: SupportsIndex, fillchar: str = " "
94
+ ) -> Union[str, "TextHandler"]: # pragma: no cover
64
95
  return TextHandler(super().ljust(width, fillchar))
65
96
 
66
- def rjust(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
97
+ def rjust(
98
+ self, width: SupportsIndex, fillchar: str = " "
99
+ ) -> Union[str, "TextHandler"]: # pragma: no cover
67
100
  return TextHandler(super().rjust(width, fillchar))
68
101
 
69
- def swapcase(self) -> Union[str, 'TextHandler']:
102
+ def swapcase(self) -> Union[str, "TextHandler"]: # pragma: no cover
70
103
  return TextHandler(super().swapcase())
71
104
 
72
- def title(self) -> Union[str, 'TextHandler']:
105
+ def title(self) -> Union[str, "TextHandler"]: # pragma: no cover
73
106
  return TextHandler(super().title())
74
107
 
75
- def translate(self, table) -> Union[str, 'TextHandler']:
108
+ def translate(self, table) -> Union[str, "TextHandler"]: # pragma: no cover
76
109
  return TextHandler(super().translate(table))
77
110
 
78
- def zfill(self, width: SupportsIndex) -> Union[str, 'TextHandler']:
111
+ def zfill(
112
+ self, width: SupportsIndex
113
+ ) -> Union[str, "TextHandler"]: # pragma: no cover
79
114
  return TextHandler(super().zfill(width))
80
115
 
81
- def replace(self, old: str, new: str, count: SupportsIndex = -1) -> Union[str, 'TextHandler']:
116
+ def replace(
117
+ self, old: str, new: str, count: SupportsIndex = -1
118
+ ) -> Union[str, "TextHandler"]:
82
119
  return TextHandler(super().replace(old, new, count))
83
120
 
84
- def upper(self) -> Union[str, 'TextHandler']:
121
+ def upper(self) -> Union[str, "TextHandler"]:
85
122
  return TextHandler(super().upper())
86
123
 
87
- def lower(self) -> Union[str, 'TextHandler']:
124
+ def lower(self) -> Union[str, "TextHandler"]:
88
125
  return TextHandler(super().lower())
126
+
89
127
  ##############
90
128
 
91
- def sort(self, reverse: bool = False) -> Union[str, 'TextHandler']:
129
+ def sort(self, reverse: bool = False) -> Union[str, "TextHandler"]:
92
130
  """Return a sorted version of the string"""
93
131
  return self.__class__("".join(sorted(self, reverse=reverse)))
94
132
 
95
- def clean(self) -> Union[str, 'TextHandler']:
133
+ def clean(self) -> Union[str, "TextHandler"]:
96
134
  """Return a new version of the string after removing all white spaces and consecutive spaces"""
97
- data = re.sub(r'[\t|\r|\n]', '', self)
98
- data = re.sub(' +', ' ', data)
99
- return self.__class__(data.strip())
135
+ data = self.translate(__CLEANING_TABLE__)
136
+ return self.__class__(__CONSECUTIVE_SPACES_REGEX__.sub(" ", data).strip())
100
137
 
101
138
  # For easy copy-paste from Scrapy/parsel code when needed :)
102
- def get(self, default=None):
139
+ def get(self, default=None): # pragma: no cover
103
140
  return self
104
141
 
105
- def get_all(self):
142
+ def get_all(self): # pragma: no cover
106
143
  return self
107
144
 
108
145
  extract = get_all
109
146
  extract_first = get
110
147
 
111
148
  def json(self) -> Dict:
112
- """Return json response if the response is jsonable otherwise throw error"""
113
- # Using str function as a workaround for orjson issue with subclasses of str
149
+ """Return JSON response if the response is jsonable otherwise throw error"""
150
+ # Using str function as a workaround for orjson issue with subclasses of str.
114
151
  # Check this out: https://github.com/ijl/orjson/issues/445
115
152
  return loads(str(self))
116
153
 
117
- @typing.overload
154
+ @overload
118
155
  def re(
119
156
  self,
120
- regex: Union[str, Pattern[str]],
157
+ regex: str | Pattern,
121
158
  check_match: Literal[True],
122
159
  replace_entities: bool = True,
123
160
  clean_match: bool = False,
124
161
  case_sensitive: bool = True,
125
- ) -> bool:
126
- ...
162
+ ) -> bool: ...
127
163
 
128
- @typing.overload
164
+ @overload
129
165
  def re(
130
166
  self,
131
- regex: Union[str, Pattern[str]],
167
+ regex: str | Pattern,
132
168
  replace_entities: bool = True,
133
169
  clean_match: bool = False,
134
170
  case_sensitive: bool = True,
135
171
  check_match: Literal[False] = False,
136
- ) -> "TextHandlers[TextHandler]":
137
- ...
172
+ ) -> "TextHandlers[TextHandler]": ...
138
173
 
139
174
  def re(
140
- self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
141
- case_sensitive: bool = True, check_match: bool = False
142
- ) -> Union["TextHandlers[TextHandler]", bool]:
175
+ self,
176
+ regex: str | Pattern,
177
+ replace_entities: bool = True,
178
+ clean_match: bool = False,
179
+ case_sensitive: bool = True,
180
+ check_match: bool = False,
181
+ ) -> Union["TextHandlers", bool]:
143
182
  """Apply the given regex to the current text and return a list of strings with the matches.
144
183
 
145
184
  :param regex: Can be either a compiled regular expression or a string.
146
- :param replace_entities: if enabled character entity references are replaced by their corresponding character
147
- :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
148
- :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
149
- :param check_match: used to quickly check if this regex matches or not without any operations on the results
185
+ :param replace_entities: If enabled character entity references are replaced by their corresponding character
186
+ :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching
187
+ :param case_sensitive: If disabled, function will set the regex to ignore the letters-case while compiling it
188
+ :param check_match: Used to quickly check if this regex matches or not without any operations on the results
150
189
 
151
190
  """
152
191
  if isinstance(regex, str):
153
192
  if case_sensitive:
154
- regex = re.compile(regex, re.UNICODE)
193
+ regex = re_compile(regex, UNICODE)
155
194
  else:
156
- regex = re.compile(regex, flags=re.UNICODE | re.IGNORECASE)
195
+ regex = re_compile(regex, flags=UNICODE | IGNORECASE)
157
196
 
158
197
  input_text = self.clean() if clean_match else self
159
198
  results = regex.findall(input_text)
@@ -164,22 +203,42 @@ class TextHandler(str):
164
203
  results = flatten(results)
165
204
 
166
205
  if not replace_entities:
167
- return TextHandlers(typing.cast(List[_TextHandlerType], [TextHandler(string) for string in results]))
206
+ return TextHandlers(
207
+ cast(
208
+ List[_TextHandlerType], [TextHandler(string) for string in results]
209
+ )
210
+ )
168
211
 
169
- return TextHandlers(typing.cast(List[_TextHandlerType], [TextHandler(_replace_entities(s)) for s in results]))
212
+ return TextHandlers(
213
+ cast(
214
+ List[_TextHandlerType],
215
+ [TextHandler(_replace_entities(s)) for s in results],
216
+ )
217
+ )
170
218
 
171
- def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
172
- clean_match: bool = False, case_sensitive: bool = True) -> "TextHandler":
219
+ def re_first(
220
+ self,
221
+ regex: str | Pattern,
222
+ default=None,
223
+ replace_entities: bool = True,
224
+ clean_match: bool = False,
225
+ case_sensitive: bool = True,
226
+ ) -> "TextHandler":
173
227
  """Apply the given regex to text and return the first match if found, otherwise return the default value.
174
228
 
175
229
  :param regex: Can be either a compiled regular expression or a string.
176
230
  :param default: The default value to be returned if there is no match
177
- :param replace_entities: if enabled character entity references are replaced by their corresponding character
178
- :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
179
- :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
231
+ :param replace_entities: If enabled character entity references are replaced by their corresponding character
232
+ :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching
233
+ :param case_sensitive: If disabled, function will set the regex to ignore the letters-case while compiling it
180
234
 
181
235
  """
182
- result = self.re(regex, replace_entities, clean_match=clean_match, case_sensitive=case_sensitive)
236
+ result = self.re(
237
+ regex,
238
+ replace_entities,
239
+ clean_match=clean_match,
240
+ case_sensitive=case_sensitive,
241
+ )
183
242
  return result[0] if result else default
184
243
 
185
244
 
@@ -187,48 +246,61 @@ class TextHandlers(List[TextHandler]):
187
246
  """
188
247
  The :class:`TextHandlers` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
189
248
  """
249
+
190
250
  __slots__ = ()
191
251
 
192
- @typing.overload
193
- def __getitem__(self, pos: SupportsIndex) -> TextHandler:
252
+ @overload
253
+ def __getitem__(self, pos: SupportsIndex) -> TextHandler: # pragma: no cover
194
254
  pass
195
255
 
196
- @typing.overload
197
- def __getitem__(self, pos: slice) -> "TextHandlers":
256
+ @overload
257
+ def __getitem__(self, pos: slice) -> "TextHandlers": # pragma: no cover
198
258
  pass
199
259
 
200
- def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[TextHandler, "TextHandlers"]:
260
+ def __getitem__(
261
+ self, pos: SupportsIndex | slice
262
+ ) -> Union[TextHandler, "TextHandlers"]:
201
263
  lst = super().__getitem__(pos)
202
264
  if isinstance(pos, slice):
203
- lst = [TextHandler(s) for s in lst]
204
- return TextHandlers(typing.cast(List[_TextHandlerType], lst))
205
- return typing.cast(_TextHandlerType, TextHandler(lst))
265
+ return TextHandlers(cast(List[_TextHandlerType], lst))
266
+ return cast(_TextHandlerType, TextHandler(lst))
206
267
 
207
- def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
208
- case_sensitive: bool = True) -> 'TextHandlers[TextHandler]':
268
+ def re(
269
+ self,
270
+ regex: str | Pattern,
271
+ replace_entities: bool = True,
272
+ clean_match: bool = False,
273
+ case_sensitive: bool = True,
274
+ ) -> "TextHandlers[TextHandler]":
209
275
  """Call the ``.re()`` method for each element in this list and return
210
276
  their results flattened as TextHandlers.
211
277
 
212
278
  :param regex: Can be either a compiled regular expression or a string.
213
- :param replace_entities: if enabled character entity references are replaced by their corresponding character
279
+ :param replace_entities: If enabled character entity references are replaced by their corresponding character
214
280
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
215
- :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
281
+ :param case_sensitive: if disabled, the function will set the regex to ignore the letters-case while compiling it
216
282
  """
217
283
  results = [
218
284
  n.re(regex, replace_entities, clean_match, case_sensitive) for n in self
219
285
  ]
220
286
  return TextHandlers(flatten(results))
221
287
 
222
- def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
223
- clean_match: bool = False, case_sensitive: bool = True) -> TextHandler:
288
+ def re_first(
289
+ self,
290
+ regex: str | Pattern,
291
+ default=None,
292
+ replace_entities: bool = True,
293
+ clean_match: bool = False,
294
+ case_sensitive: bool = True,
295
+ ) -> TextHandler: # pragma: no cover
224
296
  """Call the ``.re_first()`` method for each element in this list and return
225
297
  the first result or the default value otherwise.
226
298
 
227
299
  :param regex: Can be either a compiled regular expression or a string.
228
300
  :param default: The default value to be returned if there is no match
229
- :param replace_entities: if enabled character entity references are replaced by their corresponding character
230
- :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
231
- :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
301
+ :param replace_entities: If enabled character entity references are replaced by their corresponding character
302
+ :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching
303
+ :param case_sensitive: If disabled, function will set the regex to ignore the letters-case while compiling it
232
304
  """
233
305
  for n in self:
234
306
  for result in n.re(regex, replace_entities, clean_match, case_sensitive):
@@ -250,33 +322,44 @@ class TextHandlers(List[TextHandler]):
250
322
 
251
323
 
252
324
  class AttributesHandler(Mapping[str, _TextHandlerType]):
253
- """A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
254
- If standard dictionary is needed, just convert this class to dictionary with `dict` function
325
+ """A read-only mapping to use instead of the standard dictionary for the speed boost, but at the same time I use it to add more functionalities.
326
+ If the standard dictionary is needed, convert this class to a dictionary with the `dict` function
255
327
  """
256
- __slots__ = ('_data',)
328
+
329
+ __slots__ = ("_data",)
257
330
 
258
331
  def __init__(self, mapping=None, **kwargs):
259
- mapping = {
260
- key: TextHandler(value) if type(value) is str else value
261
- for key, value in mapping.items()
262
- } if mapping is not None else {}
332
+ mapping = (
333
+ {
334
+ key: TextHandler(value) if isinstance(value, str) else value
335
+ for key, value in mapping.items()
336
+ }
337
+ if mapping is not None
338
+ else {}
339
+ )
263
340
 
264
341
  if kwargs:
265
- mapping.update({
266
- key: TextHandler(value) if type(value) is str else value
267
- for key, value in kwargs.items()
268
- })
342
+ mapping.update(
343
+ {
344
+ key: TextHandler(value) if isinstance(value, str) else value
345
+ for key, value in kwargs.items()
346
+ }
347
+ )
269
348
 
270
349
  # Fastest read-only mapping type
271
350
  self._data = MappingProxyType(mapping)
272
351
 
273
- def get(self, key: str, default: Optional[str] = None) -> Union[_TextHandlerType, None]:
274
- """Acts like standard dictionary `.get()` method"""
352
+ def get(
353
+ self, key: str, default: Optional[str] = None
354
+ ) -> Optional[_TextHandlerType]:
355
+ """Acts like the standard dictionary `.get()` method"""
275
356
  return self._data.get(key, default)
276
357
 
277
- def search_values(self, keyword, partial=False):
278
- """Search current attributes by values and return dictionary of each matching item
279
- :param keyword: The keyword to search for in the attributes values
358
+ def search_values(
359
+ self, keyword: str, partial: bool = False
360
+ ) -> Generator["AttributesHandler", None, None]:
361
+ """Search current attributes by values and return a dictionary of each matching item
362
+ :param keyword: The keyword to search for in the attribute values
280
363
  :param partial: If True, the function will search if keyword in each value instead of perfect match
281
364
  """
282
365
  for key, value in self._data.items():
scrapling/core/mixins.py CHANGED
@@ -1,32 +1,37 @@
1
-
2
1
  class SelectorsGeneration:
3
- """Selectors generation functions
2
+ """
3
+ Functions for generating selectors
4
4
  Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
5
- Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591"""
5
+ Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
6
+ """
6
7
 
7
- def __general_selection(self, selection: str = 'css', full_path=False) -> str:
8
+ def __general_selection(
9
+ self, selection: str = "css", full_path: bool = False
10
+ ) -> str:
8
11
  """Generate a selector for the current element.
9
12
  :return: A string of the generated selector.
10
13
  """
11
14
  selectorPath = []
12
15
  target = self
13
- css = selection.lower() == 'css'
16
+ css = selection.lower() == "css"
14
17
  while target is not None:
15
18
  if target.parent:
16
- if target.attrib.get('id'):
19
+ if target.attrib.get("id"):
17
20
  # id is enough
18
21
  part = (
19
- f'#{target.attrib["id"]}' if css
22
+ f"#{target.attrib['id']}"
23
+ if css
20
24
  else f"[@id='{target.attrib['id']}']"
21
25
  )
22
26
  selectorPath.append(part)
23
27
  if not full_path:
24
28
  return (
25
- " > ".join(reversed(selectorPath)) if css
26
- else '//*' + "/".join(reversed(selectorPath))
29
+ " > ".join(reversed(selectorPath))
30
+ if css
31
+ else "//*" + "/".join(reversed(selectorPath))
27
32
  )
28
33
  else:
29
- part = f'{target.tag}'
34
+ part = f"{target.tag}"
30
35
  # We won't use classes anymore because I some websites share exact classes between elements
31
36
  # classes = target.attrib.get('class', '').split()
32
37
  # if classes and css:
@@ -41,23 +46,26 @@ class SelectorsGeneration:
41
46
 
42
47
  if counter[target.tag] > 1:
43
48
  part += (
44
- f":nth-of-type({counter[target.tag]})" if css
49
+ f":nth-of-type({counter[target.tag]})"
50
+ if css
45
51
  else f"[{counter[target.tag]}]"
46
52
  )
47
53
 
48
54
  selectorPath.append(part)
49
55
  target = target.parent
50
- if target is None or target.tag == 'html':
56
+ if target is None or target.tag == "html":
51
57
  return (
52
- " > ".join(reversed(selectorPath)) if css
53
- else '//' + "/".join(reversed(selectorPath))
58
+ " > ".join(reversed(selectorPath))
59
+ if css
60
+ else "//" + "/".join(reversed(selectorPath))
54
61
  )
55
62
  else:
56
63
  break
57
64
 
58
65
  return (
59
- " > ".join(reversed(selectorPath)) if css
60
- else '//' + "/".join(reversed(selectorPath))
66
+ " > ".join(reversed(selectorPath))
67
+ if css
68
+ else "//" + "/".join(reversed(selectorPath))
61
69
  )
62
70
 
63
71
  @property
@@ -76,14 +84,14 @@ class SelectorsGeneration:
76
84
 
77
85
  @property
78
86
  def generate_xpath_selector(self) -> str:
79
- """Generate a XPath selector for the current element
87
+ """Generate an XPath selector for the current element
80
88
  :return: A string of the generated selector.
81
89
  """
82
- return self.__general_selection('xpath')
90
+ return self.__general_selection("xpath")
83
91
 
84
92
  @property
85
93
  def generate_full_xpath_selector(self) -> str:
86
94
  """Generate a complete XPath selector for the current element
87
95
  :return: A string of the generated selector.
88
96
  """
89
- return self.__general_selection('xpath', full_path=True)
97
+ return self.__general_selection("xpath", full_path=True)