scrapling 0.2.99__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +18 -31
- scrapling/cli.py +818 -20
- scrapling/core/_html_utils.py +348 -0
- scrapling/core/_types.py +34 -17
- scrapling/core/ai.py +611 -0
- scrapling/core/custom_types.py +183 -100
- scrapling/core/mixins.py +27 -19
- scrapling/core/shell.py +647 -0
- scrapling/core/{storage_adaptors.py → storage.py} +41 -33
- scrapling/core/translator.py +20 -26
- scrapling/core/utils.py +49 -54
- scrapling/engines/__init__.py +15 -6
- scrapling/engines/_browsers/__init__.py +2 -0
- scrapling/engines/_browsers/_camoufox.py +759 -0
- scrapling/engines/_browsers/_config_tools.py +130 -0
- scrapling/engines/_browsers/_controllers.py +644 -0
- scrapling/engines/_browsers/_page.py +93 -0
- scrapling/engines/_browsers/_validators.py +170 -0
- scrapling/engines/constants.py +101 -88
- scrapling/engines/static.py +667 -110
- scrapling/engines/toolbelt/__init__.py +20 -6
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
- scrapling/engines/toolbelt/convertor.py +254 -0
- scrapling/engines/toolbelt/custom.py +158 -175
- scrapling/engines/toolbelt/fingerprints.py +32 -46
- scrapling/engines/toolbelt/navigation.py +68 -39
- scrapling/fetchers.py +239 -333
- scrapling/parser.py +781 -449
- scrapling-0.3.1.dist-info/METADATA +411 -0
- scrapling-0.3.1.dist-info/RECORD +41 -0
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/WHEEL +1 -1
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/top_level.txt +0 -1
- scrapling/defaults.py +0 -25
- scrapling/engines/camo.py +0 -339
- scrapling/engines/pw.py +0 -465
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
- scrapling-0.2.99.dist-info/METADATA +0 -290
- scrapling-0.2.99.dist-info/RECORD +0 -49
- tests/__init__.py +0 -1
- tests/fetchers/__init__.py +0 -1
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +0 -97
- tests/fetchers/async/test_httpx.py +0 -85
- tests/fetchers/async/test_playwright.py +0 -101
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +0 -70
- tests/fetchers/sync/test_httpx.py +0 -84
- tests/fetchers/sync/test_playwright.py +0 -89
- tests/fetchers/test_utils.py +0 -97
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +0 -111
- tests/parser/test_general.py +0 -330
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/licenses/LICENSE +0 -0
scrapling/parser.py
CHANGED
@@ -1,123 +1,182 @@
|
|
1
|
-
import
|
2
|
-
import os
|
1
|
+
from pathlib import Path
|
3
2
|
import re
|
4
|
-
import
|
3
|
+
from inspect import signature
|
5
4
|
from difflib import SequenceMatcher
|
6
5
|
from urllib.parse import urljoin
|
7
6
|
|
8
7
|
from cssselect import SelectorError, SelectorSyntaxError
|
9
8
|
from cssselect import parse as split_selectors
|
10
|
-
from lxml import
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
9
|
+
from lxml.html import HtmlElement, HtmlMixin, HTMLParser
|
10
|
+
from lxml.etree import (
|
11
|
+
XPath,
|
12
|
+
tostring,
|
13
|
+
fromstring,
|
14
|
+
XPathError,
|
15
|
+
XPathEvalError,
|
16
|
+
_ElementUnicodeResult,
|
17
|
+
)
|
18
|
+
|
19
|
+
from scrapling.core._types import (
|
20
|
+
Any,
|
21
|
+
Dict,
|
22
|
+
List,
|
23
|
+
Tuple,
|
24
|
+
Union,
|
25
|
+
Pattern,
|
26
|
+
Callable,
|
27
|
+
Optional,
|
28
|
+
Iterable,
|
29
|
+
overload,
|
30
|
+
Generator,
|
31
|
+
SupportsIndex,
|
32
|
+
)
|
33
|
+
from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
|
17
34
|
from scrapling.core.mixins import SelectorsGeneration
|
18
|
-
from scrapling.core.
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
35
|
+
from scrapling.core.storage import (
|
36
|
+
SQLiteStorageSystem,
|
37
|
+
StorageSystemMixin,
|
38
|
+
_StorageTools,
|
39
|
+
)
|
40
|
+
from scrapling.core.translator import translator as _translator
|
41
|
+
from scrapling.core.utils import clean_spaces, flatten, html_forbidden, log
|
42
|
+
|
43
|
+
__DEFAULT_DB_FILE__ = str(Path(__file__).parent / "elements_storage.db")
|
44
|
+
# Attributes that are Python reserved words and can't be used directly
|
45
|
+
# Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
|
46
|
+
# https://www.w3schools.com/python/python_ref_keywords.asp
|
47
|
+
_whitelisted = {
|
48
|
+
"class_": "class",
|
49
|
+
"for_": "for",
|
50
|
+
}
|
51
|
+
# Pre-compiled selectors for efficiency
|
52
|
+
_find_all_elements = XPath(".//*")
|
53
|
+
_find_all_elements_with_spaces = XPath(
|
54
|
+
".//*[normalize-space(text())]"
|
55
|
+
) # This selector gets all elements with text content
|
56
|
+
|
57
|
+
|
58
|
+
class Selector(SelectorsGeneration):
|
26
59
|
__slots__ = (
|
27
|
-
|
28
|
-
|
29
|
-
|
60
|
+
"url",
|
61
|
+
"encoding",
|
62
|
+
"__adaptive_enabled",
|
63
|
+
"_root",
|
64
|
+
"_storage",
|
65
|
+
"__keep_comments",
|
66
|
+
"__huge_tree_enabled",
|
67
|
+
"__attributes",
|
68
|
+
"__text",
|
69
|
+
"__tag",
|
70
|
+
"__keep_cdata",
|
71
|
+
"_raw_body",
|
30
72
|
)
|
31
73
|
|
32
74
|
def __init__(
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
75
|
+
self,
|
76
|
+
content: Optional[str | bytes] = None,
|
77
|
+
url: Optional[str] = None,
|
78
|
+
encoding: str = "utf8",
|
79
|
+
huge_tree: bool = True,
|
80
|
+
root: Optional[HtmlElement] = None,
|
81
|
+
keep_comments: Optional[bool] = False,
|
82
|
+
keep_cdata: Optional[bool] = False,
|
83
|
+
adaptive: Optional[bool] = False,
|
84
|
+
_storage: object = None,
|
85
|
+
storage: Any = SQLiteStorageSystem,
|
86
|
+
storage_args: Optional[Dict] = None,
|
87
|
+
**kwargs,
|
46
88
|
):
|
47
89
|
"""The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
|
48
90
|
with expressions in CSS, XPath, or with simply text. Check the docs for more info.
|
49
91
|
|
50
92
|
Here we try to extend the ``lxml.html.HtmlElement`` class while maintaining a simpler interface. We are not
|
51
|
-
inheriting from the ``lxml.html.HtmlElement`` because it's not pickleable which makes a lot of reference jobs
|
93
|
+
inheriting from the ``lxml.html.HtmlElement`` because it's not pickleable, which makes a lot of reference jobs
|
52
94
|
not possible. You can test it here and see the code explode with `AssertionError: invalid Element proxy at...`.
|
53
95
|
It's an old issue with lxml, see `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`
|
54
96
|
|
55
|
-
:param
|
56
|
-
:param url: allows storing a URL with the
|
57
|
-
:param body: HTML body as ``bytes`` object. It can be used instead of the ``text`` argument.
|
97
|
+
:param content: HTML content as either string or bytes.
|
98
|
+
:param url: It allows storing a URL with the HTML data for retrieving later.
|
58
99
|
:param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`
|
59
100
|
:param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
|
60
|
-
|
61
|
-
:param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
|
101
|
+
the libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
|
102
|
+
:param root: Used internally to pass etree objects instead of the ``content`` argument; it takes the highest priority.
|
62
103
|
Don't use it unless you know what you are doing!
|
63
104
|
:param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
|
64
105
|
:param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
|
65
|
-
:param
|
66
|
-
priority over all
|
67
|
-
:param storage: The storage class to be passed for
|
106
|
+
:param adaptive: Globally turn the adaptive feature on or off in all functions (off by default); this argument takes higher
|
107
|
+
priority over all adaptive-related arguments/functions in the class.
|
108
|
+
:param storage: The storage class to be passed for adaptive functionalities, see ``Docs`` for more info.
|
68
109
|
:param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
|
69
110
|
If empty, default values will be used.
|
70
111
|
"""
|
71
|
-
if root is None and
|
72
|
-
raise ValueError(
|
112
|
+
if root is None and content is None:
|
113
|
+
raise ValueError(
|
114
|
+
"Selector class needs HTML content, or root arguments to work"
|
115
|
+
)
|
73
116
|
|
74
|
-
self.__text =
|
117
|
+
self.__text = None
|
75
118
|
if root is None:
|
76
|
-
if
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
119
|
+
if isinstance(content, str):
|
120
|
+
body = (
|
121
|
+
content.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
122
|
+
)
|
123
|
+
elif isinstance(content, bytes):
|
124
|
+
body = content.replace(b"\x00", b"").strip()
|
81
125
|
else:
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
126
|
+
raise TypeError(
|
127
|
+
f"content argument must be str or bytes, got {type(content)}"
|
128
|
+
)
|
86
129
|
|
87
130
|
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
88
|
-
parser =
|
89
|
-
recover=True,
|
90
|
-
|
131
|
+
parser = HTMLParser(
|
132
|
+
recover=True,
|
133
|
+
remove_blank_text=True,
|
134
|
+
remove_comments=(not keep_comments),
|
135
|
+
encoding=encoding,
|
136
|
+
compact=True,
|
137
|
+
huge_tree=huge_tree,
|
138
|
+
default_doctype=True,
|
139
|
+
strip_cdata=(not keep_cdata),
|
91
140
|
)
|
92
|
-
self._root =
|
93
|
-
|
94
|
-
|
141
|
+
self._root = fromstring(body, parser=parser, base_url=url)
|
142
|
+
|
143
|
+
self._raw_body = body.decode()
|
95
144
|
|
96
145
|
else:
|
97
|
-
# All
|
98
|
-
if not issubclass(type(root),
|
146
|
+
# All HTML types inherit from HtmlMixin, so this checks for all of them at once
|
147
|
+
if not issubclass(type(root), HtmlMixin):
|
99
148
|
raise TypeError(
|
100
149
|
f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
|
101
150
|
)
|
102
151
|
|
103
152
|
self._root = root
|
153
|
+
self._raw_body = ""
|
104
154
|
|
105
|
-
self.
|
106
|
-
|
107
|
-
if self.__auto_match_enabled:
|
108
|
-
if not storage_args:
|
109
|
-
storage_args = {
|
110
|
-
'storage_file': os.path.join(os.path.dirname(__file__), 'elements_storage.db'),
|
111
|
-
'url': url
|
112
|
-
}
|
155
|
+
self.__adaptive_enabled = adaptive
|
113
156
|
|
114
|
-
|
115
|
-
|
157
|
+
if self.__adaptive_enabled:
|
158
|
+
if _storage is not None:
|
159
|
+
self._storage = _storage
|
160
|
+
else:
|
161
|
+
if not storage_args:
|
162
|
+
storage_args = {
|
163
|
+
"storage_file": __DEFAULT_DB_FILE__,
|
164
|
+
"url": url,
|
165
|
+
}
|
166
|
+
|
167
|
+
if not hasattr(storage, "__wrapped__"):
|
168
|
+
raise ValueError(
|
169
|
+
"Storage class must be wrapped with lru_cache decorator, see docs for info"
|
170
|
+
)
|
116
171
|
|
117
|
-
|
118
|
-
|
172
|
+
if not issubclass(
|
173
|
+
storage.__wrapped__, StorageSystemMixin
|
174
|
+
): # pragma: no cover
|
175
|
+
raise ValueError(
|
176
|
+
"Storage system must be inherited from class `StorageSystemMixin`"
|
177
|
+
)
|
119
178
|
|
120
|
-
|
179
|
+
self._storage = storage(**storage_args)
|
121
180
|
|
122
181
|
self.__keep_comments = keep_comments
|
123
182
|
self.__keep_cdata = keep_cdata
|
@@ -127,75 +186,112 @@ class Adaptor(SelectorsGeneration):
|
|
127
186
|
# For selector stuff
|
128
187
|
self.__attributes = None
|
129
188
|
self.__tag = None
|
189
|
+
|
190
|
+
@property
|
191
|
+
def __response_data(self):
|
130
192
|
# No need to check whether all response attributes exist, because if `status` exists, then the rest exist too (saves some CPU cycles for speed)
|
131
|
-
self
|
132
|
-
|
133
|
-
|
193
|
+
if not hasattr(self, "_cached_response_data"):
|
194
|
+
self._cached_response_data = (
|
195
|
+
{
|
196
|
+
key: getattr(self, key)
|
197
|
+
for key in (
|
198
|
+
"status",
|
199
|
+
"reason",
|
200
|
+
"cookies",
|
201
|
+
"history",
|
202
|
+
"headers",
|
203
|
+
"request_headers",
|
204
|
+
)
|
205
|
+
}
|
206
|
+
if hasattr(self, "status")
|
207
|
+
else {}
|
208
|
+
)
|
209
|
+
return self._cached_response_data
|
210
|
+
|
211
|
+
def __getitem__(self, key: str) -> TextHandler:
|
212
|
+
return self.attrib[key]
|
134
213
|
|
135
|
-
|
214
|
+
def __contains__(self, key: str) -> bool:
|
215
|
+
return key in self.attrib
|
216
|
+
|
217
|
+
# Node functionalities; I wanted to move them to a separate Mixin class, but that had a slight impact on performance
|
136
218
|
@staticmethod
|
137
|
-
def _is_text_node(
|
138
|
-
|
219
|
+
def _is_text_node(
|
220
|
+
element: HtmlElement | _ElementUnicodeResult,
|
221
|
+
) -> bool:
|
222
|
+
"""Return True if the given element is a result of a string expression
|
139
223
|
Examples:
|
140
|
-
XPath -> '/text()', '/@attribute' etc...
|
141
|
-
CSS3
|
224
|
+
XPath -> '/text()', '/@attribute', etc...
|
225
|
+
CSS3 -> '::text', '::attr(attrib)'...
|
142
226
|
"""
|
143
227
|
# Faster than checking `element.is_attribute or element.is_text or element.is_tail`
|
144
|
-
return issubclass(type(element),
|
228
|
+
return issubclass(type(element), _ElementUnicodeResult)
|
145
229
|
|
146
230
|
@staticmethod
|
147
|
-
def __content_convertor(
|
231
|
+
def __content_convertor(
|
232
|
+
element: HtmlElement | _ElementUnicodeResult,
|
233
|
+
) -> TextHandler:
|
148
234
|
"""Used internally to convert a single element's text content to TextHandler directly without checks
|
149
235
|
|
150
|
-
This single line has been isolated like this so when it's used with map we get that slight performance boost vs list comprehension
|
236
|
+
This single line has been isolated like this, so when it's used with `map` we get that slight performance boost vs. list comprehension
|
151
237
|
"""
|
152
|
-
return TextHandler(
|
238
|
+
return TextHandler(element)
|
153
239
|
|
154
|
-
def __element_convertor(self, element:
|
155
|
-
"""Used internally to convert a single HtmlElement to
|
156
|
-
|
240
|
+
def __element_convertor(self, element: HtmlElement) -> "Selector":
|
241
|
+
"""Used internally to convert a single HtmlElement to Selector directly without checks"""
|
242
|
+
db_instance = (
|
243
|
+
self._storage if (hasattr(self, "_storage") and self._storage) else None
|
244
|
+
)
|
245
|
+
return Selector(
|
157
246
|
root=element,
|
158
|
-
|
159
|
-
|
160
|
-
|
247
|
+
url=self.url,
|
248
|
+
encoding=self.encoding,
|
249
|
+
adaptive=self.__adaptive_enabled,
|
250
|
+
_storage=db_instance, # Reuse existing storage if it exists otherwise it won't be checked if `adaptive` is turned off
|
251
|
+
keep_comments=self.__keep_comments,
|
252
|
+
keep_cdata=self.__keep_cdata,
|
161
253
|
huge_tree=self.__huge_tree_enabled,
|
162
|
-
**self.__response_data
|
254
|
+
**self.__response_data,
|
163
255
|
)
|
164
256
|
|
165
|
-
def __handle_element(
|
166
|
-
|
257
|
+
def __handle_element(
|
258
|
+
self, element: HtmlElement | _ElementUnicodeResult
|
259
|
+
) -> Optional[Union[TextHandler, "Selector"]]:
|
260
|
+
"""Used internally in all functions to convert a single element to type (Selector|TextHandler) when possible"""
|
167
261
|
if element is None:
|
168
262
|
return None
|
169
263
|
elif self._is_text_node(element):
|
170
|
-
#
|
264
|
+
# `_ElementUnicodeResult` basically inherits from `str`, so it's fine
|
171
265
|
return self.__content_convertor(element)
|
172
266
|
else:
|
173
267
|
return self.__element_convertor(element)
|
174
268
|
|
175
|
-
def __handle_elements(
|
176
|
-
|
177
|
-
|
178
|
-
|
269
|
+
def __handle_elements(
|
270
|
+
self, result: List[HtmlElement | _ElementUnicodeResult]
|
271
|
+
) -> Union["Selectors", "TextHandlers"]:
|
272
|
+
"""Used internally in all functions to convert results to type (Selectors|TextHandlers) in bulk when possible"""
|
273
|
+
if not result:
|
274
|
+
return Selectors()
|
179
275
|
|
180
|
-
# From within the code, this method will always get a list of the same type
|
181
|
-
# so we will continue without checks for slight performance boost
|
276
|
+
# From within the code, this method will always get a list of the same type,
|
277
|
+
# so we will continue without checks for a slight performance boost
|
182
278
|
if self._is_text_node(result[0]):
|
183
|
-
return TextHandlers(
|
279
|
+
return TextHandlers(map(TextHandler, result))
|
184
280
|
|
185
|
-
return
|
281
|
+
return Selectors(map(self.__element_convertor, result))
|
186
282
|
|
187
283
|
def __getstate__(self) -> Any:
|
188
284
|
# lxml doesn't like it :)
|
189
|
-
raise TypeError("Can't pickle
|
285
|
+
raise TypeError("Can't pickle Selector objects")
|
190
286
|
|
191
287
|
# I made the following four properties into functions instead of plain variables
|
192
288
|
# So they don't slow down the process of initializing many instances of the class and get executed only
|
193
|
-
# when the user
|
289
|
+
# when the user needs them for the first time for that specific element, and get cached for later accesses
|
194
290
|
# Doing that alone made the library's performance tests skyrocket, running multiple times faster than before
|
195
291
|
# because I was executing them on initialization before :))
|
196
292
|
@property
|
197
293
|
def tag(self) -> str:
|
198
|
-
"""Get tag name of the element"""
|
294
|
+
"""Get the tag name of the element"""
|
199
295
|
if not self.__tag:
|
200
296
|
self.__tag = self._root.tag
|
201
297
|
return self.__tag
|
@@ -203,13 +299,22 @@ class Adaptor(SelectorsGeneration):
|
|
203
299
|
@property
|
204
300
|
def text(self) -> TextHandler:
|
205
301
|
"""Get text content of the element"""
|
206
|
-
if
|
207
|
-
# If you want to escape lxml default
|
208
|
-
# before extracting text then keep `keep_comments` set to False while initializing the first class
|
209
|
-
self.__text = TextHandler(self._root.text)
|
302
|
+
if self.__text is None:
|
303
|
+
# If you want to escape lxml default behavior and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
|
304
|
+
# before extracting text, then keep `keep_comments` set to False while initializing the first class
|
305
|
+
self.__text = TextHandler(self._root.text or "")
|
210
306
|
return self.__text
|
211
307
|
|
212
|
-
def get_all_text(
|
308
|
+
def get_all_text(
|
309
|
+
self,
|
310
|
+
separator: str = "\n",
|
311
|
+
strip: bool = False,
|
312
|
+
ignore_tags: Tuple = (
|
313
|
+
"script",
|
314
|
+
"style",
|
315
|
+
),
|
316
|
+
valid_values: bool = True,
|
317
|
+
) -> TextHandler:
|
213
318
|
"""Get all child strings of this element, concatenated using the given separator.
|
214
319
|
|
215
320
|
:param separator: Strings will be concatenated using this separator.
|
@@ -219,20 +324,25 @@ class Adaptor(SelectorsGeneration):
|
|
219
324
|
|
220
325
|
:return: A TextHandler
|
221
326
|
"""
|
327
|
+
ignored_elements = set()
|
328
|
+
if ignore_tags:
|
329
|
+
for element in self._root.iter(*ignore_tags):
|
330
|
+
ignored_elements.add(element)
|
331
|
+
ignored_elements.update(set(_find_all_elements(element)))
|
332
|
+
|
222
333
|
_all_strings = []
|
223
|
-
for node in self._root.
|
224
|
-
if node
|
334
|
+
for node in self._root.iter():
|
335
|
+
if node not in ignored_elements:
|
225
336
|
text = node.text
|
226
|
-
if text and
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
_all_strings.append(text if not strip else text.strip())
|
337
|
+
if text and isinstance(text, str):
|
338
|
+
processed_text = text.strip() if strip else text
|
339
|
+
if not valid_values or processed_text.strip():
|
340
|
+
_all_strings.append(processed_text)
|
231
341
|
|
232
|
-
return TextHandler(separator.join(_all_strings)
|
342
|
+
return TextHandler(separator).join(_all_strings)
|
233
343
|
|
234
344
|
def urljoin(self, relative_url: str) -> str:
|
235
|
-
"""Join this
|
345
|
+
"""Join this Selector's url with a relative url to form an absolute full URL."""
|
236
346
|
return urljoin(self.url, relative_url)
|
237
347
|
|
238
348
|
@property
|
@@ -244,53 +354,67 @@ class Adaptor(SelectorsGeneration):
|
|
244
354
|
|
245
355
|
@property
|
246
356
|
def html_content(self) -> TextHandler:
|
247
|
-
"""Return the inner
|
248
|
-
return TextHandler(
|
357
|
+
"""Return the inner HTML code of the element"""
|
358
|
+
return TextHandler(
|
359
|
+
tostring(self._root, encoding="unicode", method="html", with_tail=False)
|
360
|
+
)
|
249
361
|
|
250
362
|
body = html_content
|
251
363
|
|
252
364
|
def prettify(self) -> TextHandler:
|
253
365
|
"""Return a prettified version of the element's inner html-code"""
|
254
|
-
return TextHandler(
|
366
|
+
return TextHandler(
|
367
|
+
tostring(
|
368
|
+
self._root,
|
369
|
+
encoding="unicode",
|
370
|
+
pretty_print=True,
|
371
|
+
method="html",
|
372
|
+
with_tail=False,
|
373
|
+
)
|
374
|
+
)
|
255
375
|
|
256
376
|
def has_class(self, class_name: str) -> bool:
|
257
|
-
"""Check if element has a specific class
|
377
|
+
"""Check if the element has a specific class
|
258
378
|
:param class_name: The class name to check for
|
259
379
|
:return: True if the element has a class with that name, otherwise False
|
260
380
|
"""
|
261
381
|
return class_name in self._root.classes
|
262
382
|
|
263
383
|
@property
|
264
|
-
def parent(self) ->
|
384
|
+
def parent(self) -> Optional["Selector"]:
|
265
385
|
"""Return the direct parent of the element or ``None`` otherwise"""
|
266
386
|
return self.__handle_element(self._root.getparent())
|
267
387
|
|
268
388
|
@property
|
269
|
-
def below_elements(self) ->
|
389
|
+
def below_elements(self) -> "Selectors":
|
270
390
|
"""Return all elements under the current element in the DOM tree"""
|
271
|
-
below = self._root
|
391
|
+
below = _find_all_elements(self._root)
|
272
392
|
return self.__handle_elements(below)
|
273
393
|
|
274
394
|
@property
|
275
|
-
def children(self) ->
|
395
|
+
def children(self) -> "Selectors":
|
276
396
|
"""Return the children elements of the current element or empty list otherwise"""
|
277
|
-
return
|
278
|
-
self.__element_convertor(child)
|
279
|
-
|
397
|
+
return Selectors(
|
398
|
+
self.__element_convertor(child)
|
399
|
+
for child in self._root.iterchildren()
|
400
|
+
if not isinstance(child, html_forbidden)
|
401
|
+
)
|
280
402
|
|
281
403
|
@property
|
282
|
-
def siblings(self) ->
|
404
|
+
def siblings(self) -> "Selectors":
|
283
405
|
"""Return other children of the current element's parent or empty list otherwise"""
|
284
406
|
if self.parent:
|
285
|
-
return
|
286
|
-
|
407
|
+
return Selectors(
|
408
|
+
child for child in self.parent.children if child._root != self._root
|
409
|
+
)
|
410
|
+
return Selectors()
|
287
411
|
|
288
|
-
def iterancestors(self) -> Generator[
|
289
|
-
"""Return a generator that loops over all ancestors of the element, starting with element's parent."""
|
412
|
+
def iterancestors(self) -> Generator["Selector", None, None]:
|
413
|
+
"""Return a generator that loops over all ancestors of the element, starting with the element's parent."""
|
290
414
|
for ancestor in self._root.iterancestors():
|
291
415
|
yield self.__element_convertor(ancestor)
|
292
416
|
|
293
|
-
def find_ancestor(self, func: Callable[[
|
417
|
+
def find_ancestor(self, func: Callable[["Selector"], bool]) -> Optional["Selector"]:
|
294
418
|
"""Loop over all ancestors of the element till one match the passed function
|
295
419
|
:param func: A function that takes each ancestor as an argument and returns True/False
|
296
420
|
:return: The first ancestor that matches the function, or ``None`` otherwise.
|
@@ -301,30 +425,28 @@ class Adaptor(SelectorsGeneration):
|
|
301
425
|
return None
|
302
426
|
|
303
427
|
@property
|
304
|
-
def path(self) ->
|
305
|
-
"""Returns list of type
|
428
|
+
def path(self) -> "Selectors":
|
429
|
+
"""Returns a list of type `Selectors` that contains the path leading to the current element from the root."""
|
306
430
|
lst = list(self.iterancestors())
|
307
|
-
return
|
431
|
+
return Selectors(lst)
|
308
432
|
|
309
433
|
@property
|
310
|
-
def next(self) ->
|
434
|
+
def next(self) -> Optional["Selector"]:
|
311
435
|
"""Returns the next element of the current element in the children of the parent or ``None`` otherwise."""
|
312
436
|
next_element = self._root.getnext()
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
next_element = next_element.getnext()
|
437
|
+
while next_element is not None and isinstance(next_element, html_forbidden):
|
438
|
+
# Ignore HTML comments and unwanted types
|
439
|
+
next_element = next_element.getnext()
|
317
440
|
|
318
441
|
return self.__handle_element(next_element)
|
319
442
|
|
320
443
|
@property
|
321
|
-
def previous(self) ->
|
444
|
+
def previous(self) -> Optional["Selector"]:
|
322
445
|
"""Returns the previous element of the current element in the children of the parent or ``None`` otherwise."""
|
323
446
|
prev_element = self._root.getprevious()
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
prev_element = prev_element.getprevious()
|
447
|
+
while prev_element is not None and isinstance(prev_element, html_forbidden):
|
448
|
+
# Ignore HTML comments and unwanted types
|
449
|
+
prev_element = prev_element.getprevious()
|
328
450
|
|
329
451
|
return self.__handle_element(prev_element)
|
330
452
|
|
@@ -346,41 +468,44 @@ class Adaptor(SelectorsGeneration):
|
|
346
468
|
data = "<"
|
347
469
|
content = clean_spaces(self.html_content)
|
348
470
|
if len(content) > length_limit:
|
349
|
-
content = content[:length_limit].strip() +
|
471
|
+
content = content[:length_limit].strip() + "..."
|
350
472
|
data += f"data='{content}'"
|
351
473
|
|
352
474
|
if self.parent:
|
353
475
|
parent_content = clean_spaces(self.parent.html_content)
|
354
476
|
if len(parent_content) > length_limit:
|
355
|
-
parent_content = parent_content[:length_limit].strip() +
|
477
|
+
parent_content = parent_content[:length_limit].strip() + "..."
|
356
478
|
|
357
479
|
data += f" parent='{parent_content}'"
|
358
480
|
|
359
481
|
return data + ">"
|
360
482
|
|
361
|
-
# From here we start the selecting functions
|
483
|
+
# From here we start with the selecting functions
|
362
484
|
def relocate(
|
363
|
-
|
364
|
-
|
485
|
+
self,
|
486
|
+
element: Union[Dict, HtmlElement, "Selector"],
|
487
|
+
percentage: int = 0,
|
488
|
+
selector_type: bool = False,
|
489
|
+
) -> Union[List[HtmlElement], "Selectors"]:
|
365
490
|
"""This function will search again for the element in the page tree, used automatically on page structure change
|
366
491
|
|
367
492
|
:param element: The element we want to relocate in the tree
|
368
493
|
:param percentage: The minimum percentage to accept and not going lower than that. Be aware that the percentage
|
369
|
-
calculation depends solely on the page structure so don't play with this number unless you must know
|
494
|
+
calculation depends solely on the page structure, so don't play with this number unless you really know
|
370
495
|
what you are doing!
|
371
|
-
:param
|
372
|
-
:return: List of pure HTML elements that got the highest matching score or '
|
496
|
+
:param selector_type: If True, the return result will be converted to `Selectors` object
|
497
|
+
:return: List of pure HTML elements that got the highest matching score or 'Selectors' object
|
373
498
|
"""
|
374
499
|
score_table = {}
|
375
|
-
# Note: `element` will
|
500
|
+
# Note: `element` will most likely always be a dictionary at this point.
|
376
501
|
if isinstance(element, self.__class__):
|
377
502
|
element = element._root
|
378
503
|
|
379
|
-
if issubclass(type(element),
|
504
|
+
if issubclass(type(element), HtmlElement):
|
380
505
|
element = _StorageTools.element_to_dict(element)
|
381
506
|
|
382
|
-
for node in self._root
|
383
|
-
# Collect all elements in the page then for each element get the matching score of it against the node.
|
507
|
+
for node in _find_all_elements(self._root):
|
508
|
+
# Collect all elements in the page, then for each element get the matching score of it against the node.
|
384
509
|
# Hence: the code doesn't stop even if the score was 100%
|
385
510
|
# because there might be other element(s) left in the page with the same score
|
386
511
|
score = self.__calculate_similarity_score(element, node)
|
@@ -390,230 +515,319 @@ class Adaptor(SelectorsGeneration):
|
|
390
515
|
highest_probability = max(score_table.keys())
|
391
516
|
if score_table[highest_probability] and highest_probability >= percentage:
|
392
517
|
if log.getEffectiveLevel() < 20:
|
393
|
-
# No need to execute this part if logging level is not debugging
|
394
|
-
log.debug(f
|
395
|
-
log.debug(
|
518
|
+
# No need to execute this part if the logging level is not debugging
|
519
|
+
log.debug(f"Highest probability was {highest_probability}%")
|
520
|
+
log.debug("Top 5 best matching elements are: ")
|
396
521
|
for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
|
397
|
-
log.debug(
|
522
|
+
log.debug(
|
523
|
+
f"{percent} -> {self.__handle_elements(score_table[percent])}"
|
524
|
+
)
|
398
525
|
|
399
|
-
if not
|
526
|
+
if not selector_type:
|
400
527
|
return score_table[highest_probability]
|
401
528
|
return self.__handle_elements(score_table[highest_probability])
|
402
529
|
return []
|
403
530
|
|
404
|
-
def css_first(
|
405
|
-
|
406
|
-
|
407
|
-
|
531
|
+
def css_first(
|
532
|
+
self,
|
533
|
+
selector: str,
|
534
|
+
identifier: str = "",
|
535
|
+
adaptive: bool = False,
|
536
|
+
auto_save: bool = False,
|
537
|
+
percentage: int = 0,
|
538
|
+
) -> Union["Selector", "TextHandler", None]:
|
539
|
+
"""Search the current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
|
408
540
|
|
409
541
|
**Important:
|
410
|
-
It's recommended to use the identifier argument if you plan to use different selector later
|
542
|
+
It's recommended to use the identifier argument if you plan to use a different selector later
|
411
543
|
and want to relocate the same element(s)**
|
412
544
|
|
413
545
|
:param selector: The CSS3 selector to be used.
|
414
|
-
:param
|
415
|
-
:param identifier: A string that will be used to save/retrieve element's data in
|
546
|
+
:param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before
|
547
|
+
:param identifier: A string that will be used to save/retrieve element's data in adaptive,
|
416
548
|
otherwise the selector will be used.
|
417
|
-
:param auto_save: Automatically save new elements for `
|
418
|
-
:param percentage: The minimum percentage to accept while
|
419
|
-
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
549
|
+
:param auto_save: Automatically save new elements for `adaptive` later
|
550
|
+
:param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
|
551
|
+
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
420
552
|
number unless you really know what you are doing!
|
421
553
|
"""
|
422
|
-
for element in self.css(
|
554
|
+
for element in self.css(
|
555
|
+
selector,
|
556
|
+
identifier,
|
557
|
+
adaptive,
|
558
|
+
auto_save,
|
559
|
+
percentage,
|
560
|
+
_scrapling_first_match=True,
|
561
|
+
):
|
423
562
|
return element
|
424
563
|
return None
|
425
564
|
|
426
|
-
def xpath_first(
|
427
|
-
|
428
|
-
|
429
|
-
|
565
|
+
def xpath_first(
|
566
|
+
self,
|
567
|
+
selector: str,
|
568
|
+
identifier: str = "",
|
569
|
+
adaptive: bool = False,
|
570
|
+
auto_save: bool = False,
|
571
|
+
percentage: int = 0,
|
572
|
+
**kwargs: Any,
|
573
|
+
) -> Union["Selector", "TextHandler", None]:
|
574
|
+
"""Search the current tree with XPath selectors and return the first result if possible, otherwise return `None`
|
430
575
|
|
431
576
|
**Important:
|
432
|
-
It's recommended to use the identifier argument if you plan to use different selector later
|
577
|
+
It's recommended to use the identifier argument if you plan to use a different selector later
|
433
578
|
and want to relocate the same element(s)**
|
434
579
|
|
435
580
|
Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
|
436
581
|
|
437
582
|
:param selector: The XPath selector to be used.
|
438
|
-
:param
|
439
|
-
:param identifier: A string that will be used to save/retrieve element's data in
|
583
|
+
:param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before
|
584
|
+
:param identifier: A string that will be used to save/retrieve element's data in adaptive,
|
440
585
|
otherwise the selector will be used.
|
441
|
-
:param auto_save: Automatically save new elements for `
|
442
|
-
:param percentage: The minimum percentage to accept while
|
443
|
-
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
586
|
+
:param auto_save: Automatically save new elements for `adaptive` later
|
587
|
+
:param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
|
588
|
+
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
444
589
|
number unless you must know what you are doing!
|
445
590
|
"""
|
446
|
-
for element in self.xpath(
|
591
|
+
for element in self.xpath(
|
592
|
+
selector,
|
593
|
+
identifier,
|
594
|
+
adaptive,
|
595
|
+
auto_save,
|
596
|
+
percentage,
|
597
|
+
_scrapling_first_match=True,
|
598
|
+
**kwargs,
|
599
|
+
):
|
447
600
|
return element
|
448
601
|
return None
|
449
602
|
|
450
|
-
def css(
|
451
|
-
|
452
|
-
|
453
|
-
|
603
|
+
def css(
|
604
|
+
self,
|
605
|
+
selector: str,
|
606
|
+
identifier: str = "",
|
607
|
+
adaptive: bool = False,
|
608
|
+
auto_save: bool = False,
|
609
|
+
percentage: int = 0,
|
610
|
+
**kwargs: Any,
|
611
|
+
) -> Union["Selectors", List, "TextHandlers"]:
|
612
|
+
"""Search the current tree with CSS3 selectors
|
454
613
|
|
455
614
|
**Important:
|
456
|
-
It's recommended to use the identifier argument if you plan to use different selector later
|
615
|
+
It's recommended to use the identifier argument if you plan to use a different selector later
|
457
616
|
and want to relocate the same element(s)**
|
458
617
|
|
459
618
|
:param selector: The CSS3 selector to be used.
|
460
|
-
:param
|
461
|
-
:param identifier: A string that will be used to save/retrieve element's data in
|
619
|
+
:param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before
|
620
|
+
:param identifier: A string that will be used to save/retrieve element's data in adaptive,
|
462
621
|
otherwise the selector will be used.
|
463
|
-
:param auto_save: Automatically save new elements for `
|
464
|
-
:param percentage: The minimum percentage to accept while
|
465
|
-
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
622
|
+
:param auto_save: Automatically save new elements for `adaptive` later
|
623
|
+
:param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
|
624
|
+
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
466
625
|
number unless you must know what you are doing!
|
467
626
|
|
468
|
-
:return:
|
627
|
+
:return: `Selectors` class.
|
469
628
|
"""
|
470
629
|
try:
|
471
|
-
if not self.
|
630
|
+
if not self.__adaptive_enabled or "," not in selector:
|
472
631
|
# No need to split selectors in this case, let's save some CPU cycles :)
|
473
|
-
xpath_selector =
|
474
|
-
return self.xpath(
|
632
|
+
xpath_selector = _translator.css_to_xpath(selector)
|
633
|
+
return self.xpath(
|
634
|
+
xpath_selector,
|
635
|
+
identifier or selector,
|
636
|
+
adaptive,
|
637
|
+
auto_save,
|
638
|
+
percentage,
|
639
|
+
_scrapling_first_match=kwargs.pop("_scrapling_first_match", False),
|
640
|
+
)
|
475
641
|
|
476
642
|
results = []
|
477
|
-
|
478
|
-
for
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
643
|
+
for single_selector in split_selectors(selector):
|
644
|
+
# I'm doing this only so the `save` function saves data correctly for combined selectors
|
645
|
+
# Like using the ',' to combine two different selectors that point to different elements.
|
646
|
+
xpath_selector = _translator.css_to_xpath(single_selector.canonical())
|
647
|
+
results += self.xpath(
|
648
|
+
xpath_selector,
|
649
|
+
identifier or single_selector.canonical(),
|
650
|
+
adaptive,
|
651
|
+
auto_save,
|
652
|
+
percentage,
|
653
|
+
_scrapling_first_match=kwargs.pop("_scrapling_first_match", False),
|
654
|
+
)
|
485
655
|
|
486
656
|
return results
|
487
|
-
except (
|
488
|
-
|
657
|
+
except (
|
658
|
+
SelectorError,
|
659
|
+
SelectorSyntaxError,
|
660
|
+
) as e:
|
661
|
+
raise SelectorSyntaxError(
|
662
|
+
f"Invalid CSS selector '{selector}': {str(e)}"
|
663
|
+
) from e
|
489
664
|
|
490
|
-
def xpath(
|
491
|
-
|
492
|
-
|
493
|
-
|
665
|
+
def xpath(
|
666
|
+
self,
|
667
|
+
selector: str,
|
668
|
+
identifier: str = "",
|
669
|
+
adaptive: bool = False,
|
670
|
+
auto_save: bool = False,
|
671
|
+
percentage: int = 0,
|
672
|
+
**kwargs: Any,
|
673
|
+
) -> Union["Selectors", "TextHandlers"]:
|
674
|
+
"""Search the current tree with XPath selectors
|
494
675
|
|
495
676
|
**Important:
|
496
|
-
It's recommended to use the identifier argument if you plan to use different selector later
|
677
|
+
It's recommended to use the identifier argument if you plan to use a different selector later
|
497
678
|
and want to relocate the same element(s)**
|
498
679
|
|
499
680
|
Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
|
500
681
|
|
501
682
|
:param selector: The XPath selector to be used.
|
502
|
-
:param
|
503
|
-
:param identifier: A string that will be used to save/retrieve element's data in
|
683
|
+
:param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before
|
684
|
+
:param identifier: A string that will be used to save/retrieve element's data in adaptive,
|
504
685
|
otherwise the selector will be used.
|
505
|
-
:param auto_save: Automatically save new elements for `
|
506
|
-
:param percentage: The minimum percentage to accept while
|
507
|
-
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
686
|
+
:param auto_save: Automatically save new elements for `adaptive` later
|
687
|
+
:param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
|
688
|
+
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
508
689
|
number unless you must know what you are doing!
|
509
690
|
|
510
|
-
:return:
|
691
|
+
:return: `Selectors` class.
|
511
692
|
"""
|
693
|
+
_first_match = kwargs.pop(
|
694
|
+
"_scrapling_first_match", False
|
695
|
+
) # Used internally only to speed up `css_first` and `xpath_first`
|
512
696
|
try:
|
513
|
-
elements
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
697
|
+
if elements := self._root.xpath(selector, **kwargs):
|
698
|
+
if not self.__adaptive_enabled and auto_save:
|
699
|
+
log.warning(
|
700
|
+
"Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
|
701
|
+
)
|
702
|
+
elif self.__adaptive_enabled and auto_save:
|
703
|
+
self.save(elements[0], identifier or selector)
|
704
|
+
|
705
|
+
return self.__handle_elements(
|
706
|
+
elements[0:1] if (_first_match and elements) else elements
|
707
|
+
)
|
708
|
+
elif self.__adaptive_enabled:
|
709
|
+
if adaptive:
|
525
710
|
element_data = self.retrieve(identifier or selector)
|
526
711
|
if element_data:
|
527
712
|
elements = self.relocate(element_data, percentage)
|
528
713
|
if elements is not None and auto_save:
|
529
714
|
self.save(elements[0], identifier or selector)
|
530
715
|
|
531
|
-
return self.__handle_elements(
|
716
|
+
return self.__handle_elements(
|
717
|
+
elements[0:1] if (_first_match and elements) else elements
|
718
|
+
)
|
532
719
|
else:
|
533
|
-
if
|
534
|
-
log.warning(
|
720
|
+
if adaptive:
|
721
|
+
log.warning(
|
722
|
+
"Argument `adaptive` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
|
723
|
+
)
|
535
724
|
elif auto_save:
|
536
|
-
log.warning(
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
|
541
|
-
raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
|
725
|
+
log.warning(
|
726
|
+
"Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info."
|
727
|
+
)
|
542
728
|
|
543
|
-
|
544
|
-
|
729
|
+
return self.__handle_elements(
|
730
|
+
elements[0:1] if (_first_match and elements) else elements
|
731
|
+
)
|
545
732
|
|
546
|
-
|
733
|
+
except (
|
734
|
+
SelectorError,
|
735
|
+
SelectorSyntaxError,
|
736
|
+
XPathError,
|
737
|
+
XPathEvalError,
|
738
|
+
) as e:
|
739
|
+
raise SelectorSyntaxError(f"Invalid XPath selector: {selector}") from e
|
740
|
+
|
741
|
+
def find_all(
|
742
|
+
self,
|
743
|
+
*args: str | Iterable[str] | Pattern | Callable | Dict[str, str],
|
744
|
+
**kwargs: str,
|
745
|
+
) -> "Selectors":
|
746
|
+
"""Find elements by filters of your creations for ease.
|
747
|
+
|
748
|
+
:param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
547
749
|
:param kwargs: The attributes you want to filter elements based on it.
|
548
|
-
:return: The `
|
750
|
+
:return: The `Selectors` object of the elements or empty list
|
549
751
|
"""
|
550
|
-
# Attributes that are Python reserved words and can't be used directly
|
551
|
-
# Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
|
552
|
-
# https://www.w3schools.com/python/python_ref_keywords.asp
|
553
|
-
whitelisted = {
|
554
|
-
'class_': 'class',
|
555
|
-
'for_': 'for',
|
556
|
-
}
|
557
752
|
|
558
753
|
if not args and not kwargs:
|
559
|
-
raise TypeError(
|
754
|
+
raise TypeError(
|
755
|
+
"You have to pass something to search with, like tag name(s), tag attributes, or both."
|
756
|
+
)
|
560
757
|
|
561
758
|
attributes = dict()
|
562
759
|
tags, patterns = set(), set()
|
563
|
-
results, functions, selectors =
|
760
|
+
results, functions, selectors = Selectors(), [], []
|
564
761
|
|
565
762
|
# Brace yourself for a wonderful journey!
|
566
763
|
for arg in args:
|
567
|
-
if
|
764
|
+
if isinstance(arg, str):
|
568
765
|
tags.add(arg)
|
569
766
|
|
570
|
-
elif type(arg) in
|
571
|
-
if not all(map(lambda x:
|
572
|
-
raise TypeError(
|
767
|
+
elif type(arg) in (list, tuple, set):
|
768
|
+
if not all(map(lambda x: isinstance(x, str), arg)):
|
769
|
+
raise TypeError(
|
770
|
+
"Nested Iterables are not accepted, only iterables of tag names are accepted"
|
771
|
+
)
|
573
772
|
tags.update(set(arg))
|
574
773
|
|
575
774
|
elif isinstance(arg, dict):
|
576
|
-
if not all(
|
577
|
-
|
775
|
+
if not all(
|
776
|
+
[
|
777
|
+
(isinstance(k, str) and isinstance(v, str))
|
778
|
+
for k, v in arg.items()
|
779
|
+
]
|
780
|
+
):
|
781
|
+
raise TypeError(
|
782
|
+
"Nested dictionaries are not accepted, only string keys and string values are accepted"
|
783
|
+
)
|
578
784
|
attributes.update(arg)
|
579
785
|
|
580
786
|
elif isinstance(arg, re.Pattern):
|
581
787
|
patterns.add(arg)
|
582
788
|
|
583
789
|
elif callable(arg):
|
584
|
-
if len(
|
790
|
+
if len(signature(arg).parameters) > 0:
|
585
791
|
functions.append(arg)
|
586
792
|
else:
|
587
|
-
raise TypeError(
|
793
|
+
raise TypeError(
|
794
|
+
"Callable filter function must have at least one argument to take `Selector` objects."
|
795
|
+
)
|
588
796
|
|
589
797
|
else:
|
590
|
-
raise TypeError(
|
798
|
+
raise TypeError(
|
799
|
+
f'Argument with type "{type(arg)}" is not accepted, please read the docs.'
|
800
|
+
)
|
591
801
|
|
592
|
-
if not all(
|
593
|
-
|
802
|
+
if not all(
|
803
|
+
[(isinstance(k, str) and isinstance(v, str)) for k, v in kwargs.items()]
|
804
|
+
):
|
805
|
+
raise TypeError("Only string values are accepted for arguments")
|
594
806
|
|
595
807
|
for attribute_name, value in kwargs.items():
|
596
808
|
# Only replace names for kwargs, replacing them in dictionaries doesn't make sense
|
597
|
-
attribute_name =
|
809
|
+
attribute_name = _whitelisted.get(attribute_name, attribute_name)
|
598
810
|
attributes[attribute_name] = value
|
599
811
|
|
600
812
|
# It's easier and faster to build a selector than traversing the tree
|
601
|
-
tags = tags or [
|
813
|
+
tags = tags or ["*"]
|
602
814
|
for tag in tags:
|
603
815
|
selector = tag
|
604
816
|
for key, value in attributes.items():
|
605
|
-
value = value.replace('"', r
|
817
|
+
value = value.replace('"', r"\"") # Escape double quotes in user input
|
606
818
|
# Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
|
607
819
|
selector += '[{}="{}"]'.format(key, value)
|
608
|
-
if selector !=
|
820
|
+
if selector != "*":
|
609
821
|
selectors.append(selector)
|
610
822
|
|
611
823
|
if selectors:
|
612
|
-
results = self.css(
|
824
|
+
results = self.css(", ".join(selectors))
|
613
825
|
if results:
|
614
826
|
# From the results, get the ones that fulfill passed regex patterns
|
615
827
|
for pattern in patterns:
|
616
|
-
results = results.filter(
|
828
|
+
results = results.filter(
|
829
|
+
lambda e: e.text.re(pattern, check_match=True)
|
830
|
+
)
|
617
831
|
|
618
832
|
# From the results, get the ones that fulfill passed functions
|
619
833
|
for function in functions:
|
@@ -623,25 +837,31 @@ class Adaptor(SelectorsGeneration):
|
|
623
837
|
for pattern in patterns:
|
624
838
|
results = results.filter(lambda e: e.text.re(pattern, check_match=True))
|
625
839
|
|
626
|
-
# Collect element if it fulfills passed function otherwise
|
840
|
+
# Collect an element if it fulfills the passed function otherwise
|
627
841
|
for function in functions:
|
628
842
|
results = results.filter(function)
|
629
843
|
|
630
844
|
return results
|
631
845
|
|
632
|
-
def find(
|
633
|
-
|
846
|
+
def find(
|
847
|
+
self,
|
848
|
+
*args: str | Iterable[str] | Pattern | Callable | Dict[str, str],
|
849
|
+
**kwargs: str,
|
850
|
+
) -> Optional["Selector"]:
|
851
|
+
"""Find elements by filters of your creations for ease, then return the first result. Otherwise return `None`.
|
634
852
|
|
635
|
-
:param args: Tag name(s),
|
853
|
+
:param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
636
854
|
:param kwargs: The attributes you want to filter elements based on it.
|
637
|
-
:return: The `
|
855
|
+
:return: The `Selector` object of the element or `None` if the result didn't match
|
638
856
|
"""
|
639
857
|
for element in self.find_all(*args, **kwargs):
|
640
858
|
return element
|
641
859
|
return None
|
642
860
|
|
643
|
-
def __calculate_similarity_score(
|
644
|
-
|
861
|
+
def __calculate_similarity_score(
|
862
|
+
self, original: Dict, candidate: HtmlElement
|
863
|
+
) -> float:
|
864
|
+
"""Used internally to calculate a score that shows how a candidate element similar to the original one
|
645
865
|
|
646
866
|
:param original: The original element in the form of the dictionary generated from `element_to_dict` function
|
647
867
|
:param candidate: The element to compare with the original element.
|
@@ -653,53 +873,68 @@ class Adaptor(SelectorsGeneration):
|
|
653
873
|
# Possible TODO:
|
654
874
|
# Study the idea of giving weight to each test below so some are more important than others
|
655
875
|
# Current results: With weights some websites had better score while it was worse for others
|
656
|
-
score += 1 if original[
|
876
|
+
score += 1 if original["tag"] == candidate["tag"] else 0 # * 0.3 # 30%
|
657
877
|
checks += 1
|
658
878
|
|
659
|
-
if original[
|
660
|
-
score += SequenceMatcher(
|
879
|
+
if original["text"]:
|
880
|
+
score += SequenceMatcher(
|
881
|
+
None, original["text"], candidate.get("text") or ""
|
882
|
+
).ratio() # * 0.3 # 30%
|
661
883
|
checks += 1
|
662
884
|
|
663
|
-
# if both
|
664
|
-
score += self.__calculate_dict_diff(
|
885
|
+
# if both don't have attributes, it still counts for something!
|
886
|
+
score += self.__calculate_dict_diff(
|
887
|
+
original["attributes"], candidate["attributes"]
|
888
|
+
) # * 0.3 # 30%
|
665
889
|
checks += 1
|
666
890
|
|
667
891
|
# Separate similarity test for class, id, href,... this will help in full structural changes
|
668
|
-
for attrib in (
|
669
|
-
|
892
|
+
for attrib in (
|
893
|
+
"class",
|
894
|
+
"id",
|
895
|
+
"href",
|
896
|
+
"src",
|
897
|
+
):
|
898
|
+
if original["attributes"].get(attrib):
|
670
899
|
score += SequenceMatcher(
|
671
|
-
None,
|
900
|
+
None,
|
901
|
+
original["attributes"][attrib],
|
902
|
+
candidate["attributes"].get(attrib) or "",
|
672
903
|
).ratio() # * 0.3 # 30%
|
673
904
|
checks += 1
|
674
905
|
|
675
|
-
score += SequenceMatcher(
|
906
|
+
score += SequenceMatcher(
|
907
|
+
None, original["path"], candidate["path"]
|
908
|
+
).ratio() # * 0.1 # 10%
|
676
909
|
checks += 1
|
677
910
|
|
678
|
-
if original.get(
|
911
|
+
if original.get("parent_name"):
|
679
912
|
# Then we start comparing parents' data
|
680
|
-
if candidate.get(
|
913
|
+
if candidate.get("parent_name"):
|
681
914
|
score += SequenceMatcher(
|
682
|
-
None, original[
|
915
|
+
None, original["parent_name"], candidate.get("parent_name") or ""
|
683
916
|
).ratio() # * 0.2 # 20%
|
684
917
|
checks += 1
|
685
918
|
|
686
919
|
score += self.__calculate_dict_diff(
|
687
|
-
original[
|
920
|
+
original["parent_attribs"], candidate.get("parent_attribs") or {}
|
688
921
|
) # * 0.2 # 20%
|
689
922
|
checks += 1
|
690
923
|
|
691
|
-
if original[
|
924
|
+
if original["parent_text"]:
|
692
925
|
score += SequenceMatcher(
|
693
|
-
None,
|
926
|
+
None,
|
927
|
+
original["parent_text"],
|
928
|
+
candidate.get("parent_text") or "",
|
694
929
|
).ratio() # * 0.1 # 10%
|
695
930
|
checks += 1
|
696
931
|
# else:
|
697
|
-
# # The original element
|
932
|
+
# # The original element has a parent and this one not, this is not a good sign
|
698
933
|
# score -= 0.1
|
699
934
|
|
700
|
-
if original.get(
|
935
|
+
if original.get("siblings"):
|
701
936
|
score += SequenceMatcher(
|
702
|
-
None, original[
|
937
|
+
None, original["siblings"], candidate.get("siblings") or []
|
703
938
|
).ratio() # * 0.1 # 10%
|
704
939
|
checks += 1
|
705
940
|
|
@@ -707,21 +942,26 @@ class Adaptor(SelectorsGeneration):
|
|
707
942
|
return round((score / checks) * 100, 2)
|
708
943
|
|
709
944
|
@staticmethod
|
710
|
-
def __calculate_dict_diff(dict1:
|
711
|
-
"""Used internally calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries
|
712
|
-
|
713
|
-
|
714
|
-
|
945
|
+
def __calculate_dict_diff(dict1: Dict, dict2: Dict) -> float:
|
946
|
+
"""Used internally to calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries"""
|
947
|
+
score = (
|
948
|
+
SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio()
|
949
|
+
* 0.5
|
950
|
+
)
|
951
|
+
score += (
|
952
|
+
SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio()
|
953
|
+
* 0.5
|
954
|
+
)
|
715
955
|
return score
|
716
956
|
|
717
|
-
def save(self, element: Union[
|
957
|
+
def save(self, element: Union["Selector", HtmlElement], identifier: str) -> None:
|
718
958
|
"""Saves the element's unique properties to the storage for retrieval and relocation later
|
719
959
|
|
720
|
-
:param element: The element itself that we want to save to storage, it can be a `
|
960
|
+
:param element: The element itself that we want to save to storage, it can be a ` Selector ` or pure ` HtmlElement `
|
721
961
|
:param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
|
722
962
|
the docs for more info.
|
723
963
|
"""
|
724
|
-
if self.
|
964
|
+
if self.__adaptive_enabled:
|
725
965
|
if isinstance(element, self.__class__):
|
726
966
|
element = element._root
|
727
967
|
|
@@ -731,154 +971,202 @@ class Adaptor(SelectorsGeneration):
|
|
731
971
|
self._storage.save(element, identifier)
|
732
972
|
else:
|
733
973
|
log.critical(
|
734
|
-
"Can't use
|
974
|
+
"Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
|
735
975
|
)
|
736
976
|
|
737
|
-
def retrieve(self, identifier: str) -> Optional[Dict]:
|
977
|
+
def retrieve(self, identifier: str) -> Optional[Dict[str, Any]]:
|
738
978
|
"""Using the identifier, we search the storage and return the unique properties of the element
|
739
979
|
|
740
980
|
:param identifier: This is the identifier that will be used to retrieve the element from the storage. See
|
741
981
|
the docs for more info.
|
742
982
|
:return: A dictionary of the unique properties
|
743
983
|
"""
|
744
|
-
if self.
|
984
|
+
if self.__adaptive_enabled:
|
745
985
|
return self._storage.retrieve(identifier)
|
746
986
|
|
747
987
|
log.critical(
|
748
|
-
"Can't use
|
988
|
+
"Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
|
749
989
|
)
|
990
|
+
return None
|
750
991
|
|
751
992
|
# Operations on text functions
|
752
993
|
def json(self) -> Dict:
|
753
|
-
"""Return
|
754
|
-
if self.
|
994
|
+
"""Return JSON response if the response is jsonable otherwise throws error"""
|
995
|
+
if self._raw_body:
|
996
|
+
return TextHandler(self._raw_body).json()
|
997
|
+
elif self.text:
|
755
998
|
return self.text.json()
|
756
999
|
else:
|
757
1000
|
return self.get_all_text(strip=True).json()
|
758
1001
|
|
759
|
-
def re(
|
760
|
-
|
1002
|
+
def re(
|
1003
|
+
self,
|
1004
|
+
regex: str | Pattern[str],
|
1005
|
+
replace_entities: bool = True,
|
1006
|
+
clean_match: bool = False,
|
1007
|
+
case_sensitive: bool = True,
|
1008
|
+
) -> TextHandlers:
|
761
1009
|
"""Apply the given regex to the current text and return a list of strings with the matches.
|
762
1010
|
|
763
1011
|
:param regex: Can be either a compiled regular expression or a string.
|
764
|
-
:param replace_entities:
|
1012
|
+
:param replace_entities: If enabled character entity references are replaced by their corresponding character
|
765
1013
|
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
766
|
-
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
1014
|
+
:param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
|
767
1015
|
"""
|
768
1016
|
return self.text.re(regex, replace_entities, clean_match, case_sensitive)
|
769
1017
|
|
770
|
-
def re_first(
|
771
|
-
|
1018
|
+
def re_first(
|
1019
|
+
self,
|
1020
|
+
regex: str | Pattern[str],
|
1021
|
+
default=None,
|
1022
|
+
replace_entities: bool = True,
|
1023
|
+
clean_match: bool = False,
|
1024
|
+
case_sensitive: bool = True,
|
1025
|
+
) -> TextHandler:
|
772
1026
|
"""Apply the given regex to text and return the first match if found, otherwise return the default value.
|
773
1027
|
|
774
1028
|
:param regex: Can be either a compiled regular expression or a string.
|
775
1029
|
:param default: The default value to be returned if there is no match
|
776
1030
|
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
777
1031
|
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
778
|
-
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
1032
|
+
:param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
|
779
1033
|
"""
|
780
|
-
return self.text.re_first(
|
1034
|
+
return self.text.re_first(
|
1035
|
+
regex, default, replace_entities, clean_match, case_sensitive
|
1036
|
+
)
|
1037
|
+
|
1038
|
+
@staticmethod
|
1039
|
+
def __get_attributes(element: HtmlElement, ignore_attributes: List | Tuple) -> Dict:
|
1040
|
+
"""Return attributes dictionary without the ignored list"""
|
1041
|
+
return {k: v for k, v in element.attrib.items() if k not in ignore_attributes}
|
1042
|
+
|
1043
|
+
def __are_alike(
|
1044
|
+
self,
|
1045
|
+
original: HtmlElement,
|
1046
|
+
original_attributes: Dict,
|
1047
|
+
candidate: HtmlElement,
|
1048
|
+
ignore_attributes: List | Tuple,
|
1049
|
+
similarity_threshold: float,
|
1050
|
+
match_text: bool = False,
|
1051
|
+
) -> bool:
|
1052
|
+
"""Calculate a score of how much these elements are alike and return True
|
1053
|
+
if the score is higher or equals the threshold"""
|
1054
|
+
candidate_attributes = (
|
1055
|
+
self.__get_attributes(candidate, ignore_attributes)
|
1056
|
+
if ignore_attributes
|
1057
|
+
else candidate.attrib
|
1058
|
+
)
|
1059
|
+
score, checks = 0, 0
|
1060
|
+
|
1061
|
+
if original_attributes:
|
1062
|
+
score += sum(
|
1063
|
+
SequenceMatcher(None, v, candidate_attributes.get(k, "")).ratio()
|
1064
|
+
for k, v in original_attributes.items()
|
1065
|
+
)
|
1066
|
+
checks += len(candidate_attributes)
|
1067
|
+
else:
|
1068
|
+
if not candidate_attributes:
|
1069
|
+
# Both don't have attributes, this must mean something
|
1070
|
+
score += 1
|
1071
|
+
checks += 1
|
1072
|
+
|
1073
|
+
if match_text:
|
1074
|
+
score += SequenceMatcher(
|
1075
|
+
None,
|
1076
|
+
clean_spaces(original.text or ""),
|
1077
|
+
clean_spaces(candidate.text or ""),
|
1078
|
+
).ratio()
|
1079
|
+
checks += 1
|
1080
|
+
|
1081
|
+
if checks:
|
1082
|
+
return round(score / checks, 2) >= similarity_threshold
|
1083
|
+
return False
|
781
1084
|
|
782
1085
|
def find_similar(
|
783
|
-
|
784
|
-
|
785
|
-
|
786
|
-
|
787
|
-
|
1086
|
+
self,
|
1087
|
+
similarity_threshold: float = 0.2,
|
1088
|
+
ignore_attributes: List | Tuple = (
|
1089
|
+
"href",
|
1090
|
+
"src",
|
1091
|
+
),
|
1092
|
+
match_text: bool = False,
|
1093
|
+
) -> "Selectors":
|
788
1094
|
"""Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...
|
789
|
-
then return the ones that match the current element attributes with percentage higher than the input threshold.
|
1095
|
+
then return the ones that match the current element attributes with a percentage higher than the input threshold.
|
790
1096
|
|
791
1097
|
This function is inspired by AutoScraper and made for cases where you, for example, found a product div inside
|
792
|
-
a products-list container and want to find other products using that
|
1098
|
+
a products-list container and want to find other products using that element as a starting point EXCEPT
|
793
1099
|
this function works in any case without depending on the element type.
|
794
1100
|
|
795
|
-
:param similarity_threshold: The percentage to use while comparing
|
1101
|
+
:param similarity_threshold: The percentage to use while comparing element attributes.
|
796
1102
|
Note: Elements found before attributes matching/comparison will be sharing the same depth, same tag name,
|
797
|
-
same parent tag name, and same grand parent tag name. So they are 99% likely to be correct unless
|
798
|
-
extremely unlucky then attributes matching comes into play so
|
1103
|
+
same parent tag name, and same grand parent tag name. So they are 99% likely to be correct unless you are
|
1104
|
+
extremely unlucky, then attributes matching comes into play, so don't play with this number unless
|
799
1105
|
you are getting the results you don't want.
|
800
|
-
Also, if current element doesn't have attributes and the similar element as well, then it's a 100% match.
|
801
|
-
:param ignore_attributes: Attribute names passed will be ignored while matching the attributes in last step.
|
802
|
-
The default value is to ignore `href` and `src` as URLs can change a lot between elements so it's unreliable
|
803
|
-
:param match_text: If True,
|
804
|
-
Not recommended to use in normal cases but it depends.
|
1106
|
+
Also, if the current element doesn't have attributes and the similar element as well, then it's a 100% match.
|
1107
|
+
:param ignore_attributes: Attribute names passed will be ignored while matching the attributes in the last step.
|
1108
|
+
The default value is to ignore `href` and `src` as URLs can change a lot between elements, so it's unreliable
|
1109
|
+
:param match_text: If True, element text content will be taken into calculation while matching.
|
1110
|
+
Not recommended to use in normal cases, but it depends.
|
805
1111
|
|
806
|
-
:return: A ``
|
1112
|
+
:return: A ``Selectors`` container of ``Selector`` objects or empty list
|
807
1113
|
"""
|
808
|
-
|
809
|
-
|
810
|
-
|
811
|
-
|
812
|
-
def are_alike(original: html.HtmlElement, original_attributes: Dict, candidate: html.HtmlElement) -> bool:
|
813
|
-
"""Calculate a score of how much these elements are alike and return True
|
814
|
-
if score is higher or equal the threshold"""
|
815
|
-
candidate_attributes = get_attributes(candidate) if ignore_attributes else candidate.attrib
|
816
|
-
score, checks = 0, 0
|
817
|
-
|
818
|
-
if original_attributes:
|
819
|
-
score += sum(
|
820
|
-
SequenceMatcher(None, v, candidate_attributes.get(k, '')).ratio()
|
821
|
-
for k, v in original_attributes.items()
|
822
|
-
)
|
823
|
-
checks += len(candidate_attributes)
|
824
|
-
else:
|
825
|
-
if not candidate_attributes:
|
826
|
-
# Both doesn't have attributes, this must mean something
|
827
|
-
score += 1
|
828
|
-
checks += 1
|
1114
|
+
# We will use the elements' root from now on to get the speed boost of using Lxml directly
|
1115
|
+
root = self._root
|
1116
|
+
similar_elements = list()
|
829
1117
|
|
830
|
-
|
831
|
-
|
832
|
-
|
833
|
-
|
834
|
-
|
1118
|
+
current_depth = len(list(root.iterancestors()))
|
1119
|
+
target_attrs = (
|
1120
|
+
self.__get_attributes(root, ignore_attributes)
|
1121
|
+
if ignore_attributes
|
1122
|
+
else root.attrib
|
1123
|
+
)
|
835
1124
|
|
836
|
-
|
837
|
-
|
838
|
-
|
1125
|
+
path_parts = [self.tag]
|
1126
|
+
if (parent := root.getparent()) is not None:
|
1127
|
+
path_parts.insert(0, parent.tag)
|
1128
|
+
if (grandparent := parent.getparent()) is not None:
|
1129
|
+
path_parts.insert(0, grandparent.tag)
|
839
1130
|
|
840
|
-
|
841
|
-
|
842
|
-
|
843
|
-
|
844
|
-
similar_elements = list()
|
845
|
-
# + root.xpath(f"//{self.tag}[count(ancestor::*) = {current_depth-1}]")
|
846
|
-
parent = root.getparent()
|
847
|
-
if parent is not None:
|
848
|
-
grandparent = parent.getparent() # lol
|
849
|
-
if grandparent is not None:
|
850
|
-
potential_matches = root.xpath(
|
851
|
-
f"//{grandparent.tag}/{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]"
|
852
|
-
)
|
853
|
-
else:
|
854
|
-
potential_matches = root.xpath(f"//{parent.tag}/{self.tag}[count(ancestor::*) = {current_depth}]")
|
855
|
-
else:
|
856
|
-
potential_matches = root.xpath(f"//{self.tag}[count(ancestor::*) = {current_depth}]")
|
1131
|
+
xpath_path = "//{}".format("/".join(path_parts))
|
1132
|
+
potential_matches = root.xpath(
|
1133
|
+
f"{xpath_path}[count(ancestor::*) = {current_depth}]"
|
1134
|
+
)
|
857
1135
|
|
858
1136
|
for potential_match in potential_matches:
|
859
|
-
if potential_match != root and
|
1137
|
+
if potential_match != root and self.__are_alike(
|
1138
|
+
root,
|
1139
|
+
target_attrs,
|
1140
|
+
potential_match,
|
1141
|
+
ignore_attributes,
|
1142
|
+
similarity_threshold,
|
1143
|
+
match_text,
|
1144
|
+
):
|
860
1145
|
similar_elements.append(potential_match)
|
861
1146
|
|
862
|
-
return self.
|
1147
|
+
return Selectors(map(self.__element_convertor, similar_elements))
|
863
1148
|
|
864
1149
|
def find_by_text(
|
865
|
-
|
866
|
-
|
867
|
-
|
1150
|
+
self,
|
1151
|
+
text: str,
|
1152
|
+
first_match: bool = True,
|
1153
|
+
partial: bool = False,
|
1154
|
+
case_sensitive: bool = False,
|
1155
|
+
clean_match: bool = True,
|
1156
|
+
) -> Union["Selectors", "Selector"]:
|
868
1157
|
"""Find elements that its text content fully/partially matches input.
|
869
1158
|
:param text: Text query to match
|
870
|
-
:param first_match:
|
871
|
-
:param partial: If enabled, function
|
872
|
-
:param case_sensitive: if enabled, letters case will be taken into consideration
|
1159
|
+
:param first_match: Returns the first element that matches conditions, enabled by default
|
1160
|
+
:param partial: If enabled, the function returns elements that contain the input text
|
1161
|
+
:param case_sensitive: if enabled, the letters case will be taken into consideration
|
873
1162
|
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
874
1163
|
"""
|
875
1164
|
|
876
|
-
results =
|
1165
|
+
results = Selectors()
|
877
1166
|
if not case_sensitive:
|
878
1167
|
text = text.lower()
|
879
1168
|
|
880
|
-
|
881
|
-
for node in self.__handle_elements(self._root.xpath('.//*[normalize-space(text())]')):
|
1169
|
+
for node in self.__handle_elements(_find_all_elements_with_spaces(self._root)):
|
882
1170
|
"""Check if element matches given text otherwise, traverse the children tree and iterate"""
|
883
1171
|
node_text = node.text
|
884
1172
|
if clean_match:
|
@@ -903,21 +1191,29 @@ class Adaptor(SelectorsGeneration):
|
|
903
1191
|
return results
|
904
1192
|
|
905
1193
|
def find_by_regex(
|
906
|
-
|
907
|
-
|
1194
|
+
self,
|
1195
|
+
query: str | Pattern[str],
|
1196
|
+
first_match: bool = True,
|
1197
|
+
case_sensitive: bool = False,
|
1198
|
+
clean_match: bool = True,
|
1199
|
+
) -> Union["Selectors", "Selector"]:
|
908
1200
|
"""Find elements that its text content matches the input regex pattern.
|
909
1201
|
:param query: Regex query/pattern to match
|
910
|
-
:param first_match: Return first element that matches conditions
|
911
|
-
:param case_sensitive:
|
912
|
-
:param clean_match:
|
1202
|
+
:param first_match: Return the first element that matches conditions; enabled by default.
|
1203
|
+
:param case_sensitive: If enabled, the letters case will be taken into consideration in the regex.
|
1204
|
+
:param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching.
|
913
1205
|
"""
|
914
|
-
results =
|
1206
|
+
results = Selectors()
|
915
1207
|
|
916
|
-
|
917
|
-
for node in self.__handle_elements(self._root.xpath('.//*[normalize-space(text())]')):
|
1208
|
+
for node in self.__handle_elements(_find_all_elements_with_spaces(self._root)):
|
918
1209
|
"""Check if element matches given regex otherwise, traverse the children tree and iterate"""
|
919
1210
|
node_text = node.text
|
920
|
-
if node_text.re(
|
1211
|
+
if node_text.re(
|
1212
|
+
query,
|
1213
|
+
check_match=True,
|
1214
|
+
clean_match=clean_match,
|
1215
|
+
case_sensitive=case_sensitive,
|
1216
|
+
):
|
921
1217
|
results.append(node)
|
922
1218
|
|
923
1219
|
if first_match and results:
|
@@ -929,21 +1225,22 @@ class Adaptor(SelectorsGeneration):
|
|
929
1225
|
return results
|
930
1226
|
|
931
1227
|
|
932
|
-
class
|
1228
|
+
class Selectors(List[Selector]):
|
933
1229
|
"""
|
934
|
-
The
|
1230
|
+
The `Selectors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
|
935
1231
|
"""
|
1232
|
+
|
936
1233
|
__slots__ = ()
|
937
1234
|
|
938
|
-
@
|
939
|
-
def __getitem__(self, pos: SupportsIndex) ->
|
1235
|
+
@overload
|
1236
|
+
def __getitem__(self, pos: SupportsIndex) -> Selector:
|
940
1237
|
pass
|
941
1238
|
|
942
|
-
@
|
943
|
-
def __getitem__(self, pos: slice) -> "
|
1239
|
+
@overload
|
1240
|
+
def __getitem__(self, pos: slice) -> "Selectors":
|
944
1241
|
pass
|
945
1242
|
|
946
|
-
def __getitem__(self, pos:
|
1243
|
+
def __getitem__(self, pos: SupportsIndex | slice) -> Union[Selector, "Selectors"]:
|
947
1244
|
lst = super().__getitem__(pos)
|
948
1245
|
if isinstance(pos, slice):
|
949
1246
|
return self.__class__(lst)
|
@@ -951,74 +1248,101 @@ class Adaptors(List[Adaptor]):
|
|
951
1248
|
return lst
|
952
1249
|
|
953
1250
|
def xpath(
|
954
|
-
|
955
|
-
|
1251
|
+
self,
|
1252
|
+
selector: str,
|
1253
|
+
identifier: str = "",
|
1254
|
+
auto_save: bool = False,
|
1255
|
+
percentage: int = 0,
|
1256
|
+
**kwargs: Any,
|
1257
|
+
) -> "Selectors":
|
956
1258
|
"""
|
957
1259
|
Call the ``.xpath()`` method for each element in this list and return
|
958
|
-
their results as another
|
1260
|
+
their results as another `Selectors` class.
|
959
1261
|
|
960
1262
|
**Important:
|
961
|
-
It's recommended to use the identifier argument if you plan to use different selector later
|
1263
|
+
It's recommended to use the identifier argument if you plan to use a different selector later
|
962
1264
|
and want to relocate the same element(s)**
|
963
1265
|
|
964
1266
|
Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
|
965
1267
|
|
966
1268
|
:param selector: The XPath selector to be used.
|
967
|
-
:param identifier: A string that will be used to retrieve element's data in
|
1269
|
+
:param identifier: A string that will be used to retrieve element's data in adaptive,
|
968
1270
|
otherwise the selector will be used.
|
969
|
-
:param auto_save: Automatically save new elements for `
|
970
|
-
:param percentage: The minimum percentage to accept while
|
971
|
-
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
1271
|
+
:param auto_save: Automatically save new elements for `adaptive` later
|
1272
|
+
:param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
|
1273
|
+
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
972
1274
|
number unless you must know what you are doing!
|
973
1275
|
|
974
|
-
:return:
|
1276
|
+
:return: `Selectors` class.
|
975
1277
|
"""
|
976
1278
|
results = [
|
977
|
-
n.xpath(
|
1279
|
+
n.xpath(
|
1280
|
+
selector, identifier or selector, False, auto_save, percentage, **kwargs
|
1281
|
+
)
|
1282
|
+
for n in self
|
978
1283
|
]
|
979
1284
|
return self.__class__(flatten(results))
|
980
1285
|
|
981
|
-
def css(
|
1286
|
+
def css(
|
1287
|
+
self,
|
1288
|
+
selector: str,
|
1289
|
+
identifier: str = "",
|
1290
|
+
auto_save: bool = False,
|
1291
|
+
percentage: int = 0,
|
1292
|
+
) -> "Selectors":
|
982
1293
|
"""
|
983
1294
|
Call the ``.css()`` method for each element in this list and return
|
984
|
-
their results flattened as another
|
1295
|
+
their results flattened as another `Selectors` class.
|
985
1296
|
|
986
1297
|
**Important:
|
987
|
-
It's recommended to use the identifier argument if you plan to use different selector later
|
1298
|
+
It's recommended to use the identifier argument if you plan to use a different selector later
|
988
1299
|
and want to relocate the same element(s)**
|
989
1300
|
|
990
1301
|
:param selector: The CSS3 selector to be used.
|
991
|
-
:param identifier: A string that will be used to retrieve element's data in
|
1302
|
+
:param identifier: A string that will be used to retrieve element's data in adaptive,
|
992
1303
|
otherwise the selector will be used.
|
993
|
-
:param auto_save: Automatically save new elements for `
|
994
|
-
:param percentage: The minimum percentage to accept while
|
995
|
-
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
1304
|
+
:param auto_save: Automatically save new elements for `adaptive` later
|
1305
|
+
:param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.
|
1306
|
+
Be aware that the percentage calculation depends solely on the page structure, so don't play with this
|
996
1307
|
number unless you must know what you are doing!
|
997
1308
|
|
998
|
-
:return:
|
1309
|
+
:return: `Selectors` class.
|
999
1310
|
"""
|
1000
1311
|
results = [
|
1001
|
-
n.css(selector, identifier or selector, False, auto_save, percentage)
|
1312
|
+
n.css(selector, identifier or selector, False, auto_save, percentage)
|
1313
|
+
for n in self
|
1002
1314
|
]
|
1003
1315
|
return self.__class__(flatten(results))
|
1004
1316
|
|
1005
|
-
def re(
|
1006
|
-
|
1317
|
+
def re(
|
1318
|
+
self,
|
1319
|
+
regex: str | Pattern,
|
1320
|
+
replace_entities: bool = True,
|
1321
|
+
clean_match: bool = False,
|
1322
|
+
case_sensitive: bool = True,
|
1323
|
+
) -> TextHandlers:
|
1007
1324
|
"""Call the ``.re()`` method for each element in this list and return
|
1008
1325
|
their results flattened as List of TextHandler.
|
1009
1326
|
|
1010
1327
|
:param regex: Can be either a compiled regular expression or a string.
|
1011
|
-
:param replace_entities:
|
1328
|
+
:param replace_entities: If enabled character entity references are replaced by their corresponding character
|
1012
1329
|
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
1013
|
-
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
1330
|
+
:param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
|
1014
1331
|
"""
|
1015
1332
|
results = [
|
1016
|
-
n.text.re(regex, replace_entities, clean_match, case_sensitive)
|
1333
|
+
n.text.re(regex, replace_entities, clean_match, case_sensitive)
|
1334
|
+
for n in self
|
1017
1335
|
]
|
1018
1336
|
return TextHandlers(flatten(results))
|
1019
1337
|
|
1020
|
-
def re_first(
|
1021
|
-
|
1338
|
+
def re_first(
|
1339
|
+
self,
|
1340
|
+
regex: str | Pattern,
|
1341
|
+
default=None,
|
1342
|
+
replace_entities: bool = True,
|
1343
|
+
clean_match: bool = False,
|
1344
|
+
case_sensitive: bool = True,
|
1345
|
+
) -> TextHandler:
|
1022
1346
|
"""Call the ``.re_first()`` method for each element in this list and return
|
1023
1347
|
the first result or the default value otherwise.
|
1024
1348
|
|
@@ -1026,14 +1350,14 @@ class Adaptors(List[Adaptor]):
|
|
1026
1350
|
:param default: The default value to be returned if there is no match
|
1027
1351
|
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
1028
1352
|
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
1029
|
-
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
1353
|
+
:param case_sensitive: if disabled, function will set the regex to ignore the letters case while compiling it
|
1030
1354
|
"""
|
1031
1355
|
for n in self:
|
1032
1356
|
for result in n.re(regex, replace_entities, clean_match, case_sensitive):
|
1033
1357
|
return result
|
1034
1358
|
return default
|
1035
1359
|
|
1036
|
-
def search(self, func: Callable[[
|
1360
|
+
def search(self, func: Callable[["Selector"], bool]) -> Optional["Selector"]:
|
1037
1361
|
"""Loop over all current elements and return the first element that matches the passed function
|
1038
1362
|
:param func: A function that takes each element as an argument and returns True/False
|
1039
1363
|
:return: The first element that match the function or ``None`` otherwise.
|
@@ -1043,14 +1367,12 @@ class Adaptors(List[Adaptor]):
|
|
1043
1367
|
return element
|
1044
1368
|
return None
|
1045
1369
|
|
1046
|
-
def filter(self, func: Callable[[
|
1370
|
+
def filter(self, func: Callable[["Selector"], bool]) -> "Selectors":
|
1047
1371
|
"""Filter current elements based on the passed function
|
1048
1372
|
:param func: A function that takes each element as an argument and returns True/False
|
1049
|
-
:return: The new `
|
1373
|
+
:return: The new `Selectors` object or empty list otherwise.
|
1050
1374
|
"""
|
1051
|
-
return self.__class__([
|
1052
|
-
element for element in self if func(element)
|
1053
|
-
])
|
1375
|
+
return self.__class__([element for element in self if func(element)])
|
1054
1376
|
|
1055
1377
|
# For easy copy-paste from Scrapy/parsel code when needed :)
|
1056
1378
|
def get(self, default=None):
|
@@ -1075,6 +1397,16 @@ class Adaptors(List[Adaptor]):
|
|
1075
1397
|
"""Returns the last item of the current list or `None` if the list is empty"""
|
1076
1398
|
return self[-1] if len(self) > 0 else None
|
1077
1399
|
|
1078
|
-
|
1400
|
+
@property
|
1401
|
+
def length(self):
|
1402
|
+
"""Returns the length of the current list"""
|
1403
|
+
return len(self)
|
1404
|
+
|
1405
|
+
def __getstate__(self) -> Any: # pragma: no cover
|
1079
1406
|
# lxml don't like it :)
|
1080
|
-
raise TypeError("Can't pickle
|
1407
|
+
raise TypeError("Can't pickle Selectors object")
|
1408
|
+
|
1409
|
+
|
1410
|
+
# For backward compatibility
|
1411
|
+
Adaptor = Selector
|
1412
|
+
Adaptors = Selectors
|