scrapling 0.1__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- scrapling/__init__.py +10 -0
- scrapling/custom_types.py +146 -0
- scrapling/mixins.py +74 -0
- scrapling/parser.py +908 -0
- scrapling/storage_adaptors.py +149 -0
- scrapling/translator.py +153 -0
- scrapling/utils.py +164 -0
- scrapling-0.1.dist-info/LICENSE +28 -0
- scrapling-0.1.dist-info/METADATA +475 -0
- scrapling-0.1.dist-info/RECORD +12 -0
- scrapling-0.1.dist-info/WHEEL +5 -0
- scrapling-0.1.dist-info/top_level.txt +1 -0
scrapling/parser.py
ADDED
@@ -0,0 +1,908 @@
|
|
1
|
+
import os
|
2
|
+
from difflib import SequenceMatcher
|
3
|
+
from typing import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator
|
4
|
+
try:
|
5
|
+
from typing import SupportsIndex
|
6
|
+
except ImportError:
|
7
|
+
# 'SupportsIndex' got added in Python 3.8
|
8
|
+
SupportsIndex = None
|
9
|
+
|
10
|
+
from scrapling.translator import HTMLTranslator
|
11
|
+
from scrapling.mixins import SelectorsGeneration
|
12
|
+
from scrapling.custom_types import TextHandler, AttributesHandler
|
13
|
+
from scrapling.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
|
14
|
+
from scrapling.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
|
15
|
+
|
16
|
+
from lxml import etree, html
|
17
|
+
from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
|
18
|
+
|
19
|
+
|
20
|
+
class Adaptor(SelectorsGeneration):
|
21
|
+
__slots__ = (
|
22
|
+
'url', 'encoding', '__auto_match_enabled', '_root', '_storage', '__debug',
|
23
|
+
'__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
|
24
|
+
)
|
25
|
+
|
26
|
+
def __init__(
        self,
        text: Optional[str] = None,
        url: Optional[str] = None,
        body: bytes = b"",
        encoding: str = "utf8",
        huge_tree: bool = True,
        root: Optional[html.HtmlElement] = None,
        keep_comments: Optional[bool] = False,
        auto_match: Optional[bool] = False,
        storage: Any = SQLiteStorageSystem,
        storage_args: Optional[Dict] = None,
        debug: Optional[bool] = True,
):
    """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
    with expressions in CSS, XPath, or with simply text. Check the docs for more info.

    Here we try to extend module ``lxml.html.HtmlElement`` while maintaining a simpler interface. We are not
    inheriting from ``lxml.html.HtmlElement`` because it's not pickleable, which makes a lot of reference jobs
    not possible. You can test it and see the code explode with `AssertionError: invalid Element proxy at...`.
    It's an old issue with lxml, see `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`

    :param text: HTML body passed as text.
    :param url: allows storing a URL with the html data for retrieving later.
    :param body: HTML body as ``bytes`` object. It can be used instead of the ``text`` argument.
    :param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`
    :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
        the libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
    :param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
        Don't use it unless you know what you are doing!
    :param keep_comments: While parsing the HTML body, keep comments or drop them. Disabled by default, which
        means comments are dropped.
    :param auto_match: Globally enable the auto-match feature. When it is off, the auto-match related
        arguments/functions of the class are ignored, so this argument takes priority over all of them.
    :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
    :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
        If empty, default values will be used.
    :param debug: Enable debug mode
    :raises ValueError: If none of ``text``/``body``/``root`` is given, or the storage class is not acceptable.
    :raises TypeError: If ``text``, ``body`` or ``root`` is of the wrong type.
    """
    if root is None and not body and text is None:
        raise ValueError("Adaptor class needs text, body, or root arguments to work")

    if root is None:
        if text is None:
            if not body or not isinstance(body, bytes):
                raise TypeError(f"body argument must be valid and of type bytes, got {body.__class__}")

            # Same fallback as the `text` path: a body made only of whitespace/NUL bytes must not reach lxml empty
            body = body.replace(b"\x00", b"").strip() or b"<html/>"
        else:
            if not isinstance(text, str):
                raise TypeError(f"text argument must be of type str, got {text.__class__}")

            body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"

        parser = html.HTMLParser(
            # https://lxml.de/api/lxml.etree.HTMLParser-class.html
            # Fix: comments must be removed when the caller did NOT ask to keep them (the flag used to be inverted)
            recover=True, remove_blank_text=True, remove_comments=(not keep_comments), encoding=encoding,
            compact=True, huge_tree=huge_tree, default_doctype=True
        )
        self._root = etree.fromstring(body, parser=parser, base_url=url)

    else:
        # All html types inherit from HtmlMixin, so this checks for all of them at once
        if not issubclass(type(root), html.HtmlMixin):
            raise TypeError(
                f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
            )

        self._root = root

    setup_basic_logging(level='debug' if debug else 'info')
    self.__auto_match_enabled = auto_match

    if self.__auto_match_enabled:
        if not storage_args:
            storage_args = {
                'storage_file': os.path.join(os.path.dirname(__file__), 'elements_storage.db'),
                'url': url
            }

        if not hasattr(storage, '__wrapped__'):
            raise ValueError("Storage class must be wrapped with cache decorator, see docs for info")

        if not issubclass(storage.__wrapped__, StorageSystemMixin):
            raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")

        # NOTE: `_storage` is only assigned when auto-match is enabled
        self._storage = storage(**storage_args)

    self.__keep_comments = keep_comments
    self.__huge_tree_enabled = huge_tree
    self.encoding = encoding
    self.url = url
    # For selector stuff: lazily filled caches used by the `attrib`, `text` and `tag` properties
    self.__attributes = None
    self.__text = None
    self.__tag = None
    self.__debug = debug
|
122
|
+
|
123
|
+
# Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
|
124
|
+
@staticmethod
def _is_text_node(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> bool:
    """Tell whether the given object is the result of a string expression rather than an element.

    Examples of expressions that produce such results:
        Xpath -> '/text()', '/@attribute' etc...
        CSS3 -> '::text', '::attr(attrib)'...

    :param element: The object returned by a selection
    :return: True if it is an ``etree._ElementUnicodeResult`` (or a subclass of it), False otherwise
    """
    # Checking the concrete type is faster than `element.is_attribute or element.is_text or element.is_tail`
    element_type = type(element)
    return issubclass(element_type, etree._ElementUnicodeResult)
|
133
|
+
|
134
|
+
def __get_correct_result(
        self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]
) -> Union[TextHandler, html.HtmlElement, 'Adaptor', str]:
    """Convert a single raw selection result to the matching wrapper type when there is one.

    :param element: A raw result coming from lxml
    :return: ``TextHandler`` for string results, a new instance of this class for html elements,
        and the object itself for anything else
    """
    if self._is_text_node(element):
        # etree._ElementUnicodeResult basically inherits from `str`, so converting it is fine
        return TextHandler(str(element))

    if not issubclass(type(element), html.HtmlMixin):
        # Not something we know how to wrap, hand it back untouched
        return element

    # Wrap the element while carrying over the settings of the current instance
    return self.__class__(
        root=element, url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
        keep_comments=self.__keep_comments, huge_tree=self.__huge_tree_enabled, debug=self.__debug
    )
|
148
|
+
|
149
|
+
def __convert_results(
        self, result: Union[List[html.HtmlElement], html.HtmlElement]
) -> Union['Adaptors[Adaptor]', 'Adaptor', List, None]:
    """Convert raw results in bulk to type (Adaptor|Adaptors) when possible.

    :param result: A single raw result, a list of raw results, an ``Adaptors`` object or ``None``
    :return: ``None`` and empty lists are returned as ``None``/``[]``, ``Adaptors`` objects are returned as is,
        a list is converted item by item (and wrapped in ``Adaptors`` only if every item became an instance of
        this class), and any other single value goes through the single-result conversion
    """
    if result is None:
        return None

    if result == []:  # Lxml will give a warning if something like `not result` is used
        return []

    if isinstance(result, Adaptors):
        return result

    if type(result) is not list:
        return self.__get_correct_result(result)

    converted = list(map(self.__get_correct_result, result))
    for item in converted:
        if not isinstance(item, self.__class__):
            # Mixed content (text results, for example) stays a plain list
            return converted
    return Adaptors(converted)
|
168
|
+
|
169
|
+
def __getstate__(self) -> Any:
    """Refuse pickling: always raises ``TypeError``."""
    # lxml doesn't like being pickled :)
    error = TypeError("Can't pickle Adaptor objects")
    raise error
|
172
|
+
|
173
|
+
# The following four properties I made them into functions instead of variables directly
|
174
|
+
# So they don't slow down the process of initializing many instances of the class and gets executed only
|
175
|
+
# when the user need them for the first time for that specific element and gets cached for next times
|
176
|
+
# Doing just that made the library's performance tests skyrocket: they run multiple times faster than before
|
177
|
+
# because I was executing them on initialization before :))
|
178
|
+
@property
def tag(self) -> str:
    """Tag name of the element, read from the underlying element on first use and then cached"""
    cached = self.__tag
    if cached:
        return cached

    self.__tag = self._root.tag
    return self.__tag
|
184
|
+
|
185
|
+
@property
def text(self) -> TextHandler:
    """Text content of the element as a ``TextHandler``, built on first use and cached while it is non-empty"""
    cached = self.__text
    if cached:
        return cached

    self.__text = TextHandler(self._root.text)
    return self.__text
|
191
|
+
|
192
|
+
def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:
    """Collect the text of this element and of all its descendants, joined with the given separator.

    :param separator: Strings will be concatenated using this separator.
    :param strip: If True, strings will be stripped before being concatenated.
    :param ignore_tags: A tuple of all tag names you want to ignore
    :param valid_values: If enabled, elements with text-content that is empty or only whitespaces will be ignored

    :return: A TextHandler
    """
    collected = []

    # We use Lxml directly here for the speed boost. The tree is walked with an explicit stack;
    # children are pushed in reverse so that nodes are visited in document order.
    pending = [self._root]
    while pending:
        node = pending.pop()
        pending.extend(reversed(list(node.iterchildren())))

        if node.tag in ignore_tags:
            # The node's own text is skipped, its children are still visited
            continue

        content = node.text
        if not content or type(content) is not str:
            continue

        stripped = content.strip()
        if valid_values and not stripped:
            continue

        collected.append(stripped if strip else content)

    return TextHandler(separator.join(collected))
|
226
|
+
|
227
|
+
@property
def attrib(self) -> AttributesHandler:
    """Attributes of the element as an ``AttributesHandler``, built on first use and cached while non-empty"""
    cached = self.__attributes
    if cached:
        return cached

    self.__attributes = AttributesHandler(self._root.attrib)
    return self.__attributes
|
233
|
+
|
234
|
+
@property
def html_content(self) -> str:
    """Return the html code of the element (the element itself included), without its tail text"""
    serialized = etree.tostring(self._root, encoding='unicode', method='html', with_tail=False)
    return serialized

# `body` is an alias of the `html_content` property
body = html_content
|
240
|
+
|
241
|
+
def prettify(self) -> str:
    """Return a pretty-printed version of the element's html code, without its tail text"""
    pretty = etree.tostring(self._root, encoding='unicode', pretty_print=True, method='html', with_tail=False)
    return pretty
|
244
|
+
|
245
|
+
def has_class(self, class_name: str) -> bool:
    """Check whether the element carries a specific class.

    :param class_name: The class name to look for
    :return: True if the element has a class with that name, False otherwise
    """
    element_classes = self._root.classes
    return class_name in element_classes
|
251
|
+
|
252
|
+
@property
def parent(self) -> Union['Adaptor', None]:
    """The direct parent of the element in converted form, or ``None`` when there is no parent"""
    parent_element = self._root.getparent()
    return self.__convert_results(parent_element)
|
256
|
+
|
257
|
+
@property
def children(self) -> Union['Adaptors[Adaptor]', List]:
    """The direct children of the current element in converted form, or an empty list when there are none"""
    # Children whose type is listed in `html_forbidden` are left out
    allowed = [child for child in self._root.iterchildren() if type(child) not in html_forbidden]
    return self.__convert_results(allowed)
|
263
|
+
|
264
|
+
@property
def siblings(self) -> Union['Adaptors[Adaptor]', List]:
    """Return the other children of the current element's parent, or an empty list if there is no parent"""
    # `parent` builds a new wrapper on every access, so read it only once
    parent = self.parent
    if parent:
        # Keep every child of the parent except the one wrapping this very element
        return Adaptors([child for child in parent.children if child._root != self._root])
    return []
|
270
|
+
|
271
|
+
def iterancestors(self) -> Generator['Adaptor', None, None]:
    """Lazily yield every ancestor of the element in converted form, starting with the element's parent."""
    yield from map(self.__convert_results, self._root.iterancestors())
|
275
|
+
|
276
|
+
def find_ancestor(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
    """Walk up the ancestors of the element until one of them satisfies the passed function.

    :param func: A function that takes each ancestor as an argument and returns True/False
    :return: The first ancestor the function accepts, or ``None`` if no ancestor is accepted.
    """
    accepted = (ancestor for ancestor in self.iterancestors() if func(ancestor))
    for match in accepted:
        # The generator is lazy, so ancestors after the first match are never tested
        return match
    return None
|
285
|
+
|
286
|
+
@property
def path(self) -> 'Adaptors[Adaptor]':
    """Return an :class:`Adaptors` list holding all ancestors of the current element, in the order
    ``iterancestors`` yields them (the element's parent comes first)."""
    return Adaptors(list(self.iterancestors()))
|
291
|
+
|
292
|
+
@property
def next(self) -> Union['Adaptor', None]:
    """The element that follows the current one among the parent's children, or ``None`` if there is none."""
    sibling = self._root.getnext()
    # Ignore html comments and unwanted types by moving on to whatever follows them
    while sibling is not None and type(sibling) in html_forbidden:
        sibling = sibling.getnext()

    return self.__convert_results(sibling)
|
302
|
+
|
303
|
+
@property
def previous(self) -> Union['Adaptor', None]:
    """The element that precedes the current one among the parent's children, or ``None`` if there is none."""
    sibling = self._root.getprevious()
    # Ignore html comments and unwanted types by moving on to whatever precedes them
    while sibling is not None and type(sibling) in html_forbidden:
        sibling = sibling.getprevious()

    return self.__convert_results(sibling)
|
313
|
+
|
314
|
+
def __str__(self) -> str:
    """The string form of an element is its html code (same value as ``html_content``)"""
    rendered = self.html_content
    return rendered
|
316
|
+
|
317
|
+
def __repr__(self) -> str:
    """Short debugging representation: ``<data='...' parent='...'>``.

    Both the element's html and its parent's html (when there is a parent) are whitespace-cleaned and
    truncated to 40 characters followed by ``...``.
    """
    length_limit = 40

    def _shorten(markup: str) -> str:
        """Clean the spaces of the given html and cut it down to the length limit"""
        cleaned = clean_spaces(markup)
        if len(cleaned) > length_limit:
            return cleaned[:length_limit].strip() + '...'
        return cleaned

    data = "<" + f"data='{_shorten(self.html_content)}'"

    # `parent` builds a new wrapper on every access, so read it only once
    parent = self.parent
    if parent:
        data += f" parent='{_shorten(parent.html_content)}'"

    return data + ">"
|
333
|
+
|
334
|
+
# From here we start the selecting functions
|
335
|
+
def relocate(
        self, element: Union[Dict, html.HtmlElement, 'Adaptor'], percentage: int = 0, adaptor_type: bool = False
) -> Union[List[Union[html.HtmlElement, None]], 'Adaptors']:
    """Search the page tree again for the given element, used automatically on page structure change.

    Every node of the tree (starting from this element) is scored against ``element``; the nodes sharing
    the highest score are returned, provided that score reaches ``percentage``.

    :param element: The element we want to relocate in the tree
    :param percentage: The minimum percentage to accept and not going lower than that. Be aware that the percentage
        calculation depends solely on the page structure so don't play with this number unless you know
        what you are doing!
    :param adaptor_type: If True, the return result will be converted to `Adaptors` object
    :return: List of pure HTML elements that got the highest matching score or 'Adaptors' object,
        or an empty list when the best score is below ``percentage``
    """
    # Note: `element` will most likely always be a dictionary at this point.
    if isinstance(element, self.__class__):
        element = element._root

    if issubclass(type(element), html.HtmlElement):
        element = _StorageTools.element_to_dict(element)

    # TODO: Optimize the traverse logic a bit, maybe later
    # score -> nodes that got that score, filled in document order.
    # The walk never stops early, even on a 100% score, because other elements may share the same score.
    score_table = {}
    pending = [self._root]
    while pending:
        node = pending.pop()
        score = self.__calculate_similarity_score(element, node)
        score_table.setdefault(score, []).append(node)
        # Children are pushed reversed so they are popped (and scored) in document order
        pending.extend(reversed(list(node.iterchildren())))

    if not score_table:
        return []

    highest_probability = max(score_table)
    best_matches = score_table[highest_probability]
    if not (best_matches and highest_probability >= percentage):
        return []

    logging.debug(f'Highest probability was {highest_probability}%')
    logging.debug('Top 5 best matching elements are: ')
    for percent in sorted(score_table, reverse=True)[:5]:
        logging.debug(f'{percent} -> {self.__convert_results(score_table[percent])}')

    if adaptor_type:
        return self.__convert_results(best_matches)
    return best_matches
|
384
|
+
|
385
|
+
def css(self, selector: str, identifier: str = '',
        auto_match: bool = False, auto_save: bool = False, percentage: int = 0
        ) -> Union['Adaptors[Adaptor]', List]:
    """Search current tree with CSS3 selectors

    **Important:
    It's recommended to use the identifier argument if you plan to use different selector later
    and want to relocate the same element(s)**

    :param selector: The CSS3 selector to be used.
    :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
    :param identifier: A string that will be used to save/retrieve element's data in auto-matching
        otherwise the selector will be used.
    :param auto_save: Automatically save new elements for `auto_match` later
    :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
        Be aware that the percentage calculation depends solely on the page structure so don't play with this
        number unless you know what you are doing!

    :return: List as :class:`Adaptors`
    :raises SelectorSyntaxError: If the selector can't be parsed/translated
    """
    try:
        # Splitting is only worth it when auto-match is enabled AND several selectors are combined with ','.
        # In every other case the selector is translated as a whole, which saves some CPU cycles :)
        if not self.__auto_match_enabled or ',' not in selector:
            xpath_selector = HTMLTranslator().css_to_xpath(selector)
            return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)

        results = []
        for single_selector in split_selectors(selector):
            # Each part is handled on its own so the `save` function saves data correctly for combined
            # selectors, like using the ',' to combine two selectors that point to different elements.
            canonical = single_selector.canonical()
            xpath_selector = HTMLTranslator().css_to_xpath(canonical)
            results += self.xpath(xpath_selector, identifier or canonical, auto_match, auto_save, percentage)

        return self.__convert_results(results)
    except (SelectorError, SelectorSyntaxError,):
        raise SelectorSyntaxError(f"Invalid CSS selector: {selector}")
|
427
|
+
|
428
|
+
def xpath(self, selector: str, identifier: str = '',
          auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
          ) -> Union['Adaptors[Adaptor]', List]:
    """Search current tree with XPath selectors

    **Important:
    It's recommended to use the identifier argument if you plan to use different selector later
    and want to relocate the same element(s)**

    Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**

    :param selector: The XPath selector to be used.
    :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
    :param identifier: A string that will be used to save/retrieve element's data in auto-matching
        otherwise the selector will be used.
    :param auto_save: Automatically save new elements for `auto_match` later
    :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
        Be aware that the percentage calculation depends solely on the page structure so don't play with this
        number unless you know what you are doing!

    :return: List as :class:`Adaptors`
    :raises SelectorSyntaxError: If the selector is not a valid XPath expression
    """
    try:
        selected_elements = self._root.xpath(selector, **kwargs)

        if selected_elements:
            if auto_save:
                if self.__auto_match_enabled:
                    # Only the first selected element is saved under the identifier
                    self.save(selected_elements[0], identifier or selector)
                else:
                    logging.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")

            return self.__convert_results(selected_elements)

        # Nothing was selected: try to relocate a previously saved element if the caller asked for it
        if auto_match:
            if self.__auto_match_enabled:
                element_data = self.retrieve(identifier or selector)
                if element_data:
                    relocated = self.relocate(element_data, percentage)
                    # `relocate` returns an empty list (never None) when no candidate reaches `percentage`,
                    # so check for emptiness before touching the first item
                    if relocated and auto_save:
                        self.save(relocated[0], identifier or selector)

                    return self.__convert_results(relocated)
            else:
                logging.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")

        return self.__convert_results(selected_elements)

    except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
        raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
|
480
|
+
|
481
|
+
def __calculate_similarity_score(self, original: Dict, candidate: html.HtmlElement) -> float:
    """Calculate a score that shows how similar the candidate element is to the original one.

    The score is the average of several independent checks (tag, text, attributes, selected attributes,
    path, parent data, siblings); a check only counts when the original element has data for it.

    :param original: The original element in the form of the dictionary generated from `element_to_dict` function
    :param candidate: The element to compare with the original element.
    :return: A percentage score of how similar the candidate is to the original element
    """
    candidate = _StorageTools.element_to_dict(candidate)
    score, checks = 0, 0

    def _record(points) -> None:
        """Add the result of one check to the running totals"""
        nonlocal score, checks
        score += points
        checks += 1

    def _ratio(first, second) -> float:
        """Similarity ratio of two sequences"""
        return SequenceMatcher(None, first, second).ratio()

    # Possible TODO:
    # Study the idea of giving weight to each test below so some are more important than others
    # Current results: With weights some websites had better score while it was worse for others
    _record(1 if original['tag'] == candidate['tag'] else 0)

    if original['text']:
        _record(_ratio(original['text'], candidate.get('text') or ''))

    # If both don't have attributes, it still counts for something!
    _record(self.__calculate_dict_diff(original['attributes'], candidate['attributes']))

    # Separate similarity test for class, id, href,... this will help in full structural changes
    for attrib in ('class', 'id', 'href', 'src',):
        if original['attributes'].get(attrib):
            _record(_ratio(original['attributes'][attrib], candidate['attributes'].get(attrib) or ''))

    _record(_ratio(original['path'], candidate['path']))

    # Parents' data is only compared when both elements have a parent
    if original.get('parent_name') and candidate.get('parent_name'):
        _record(_ratio(original['parent_name'], candidate.get('parent_name') or ''))
        _record(self.__calculate_dict_diff(original['parent_attribs'], candidate.get('parent_attribs') or {}))

        if original['parent_text']:
            _record(_ratio(original['parent_text'], candidate.get('parent_text') or ''))

    if original.get('siblings'):
        _record(_ratio(original['siblings'], candidate.get('siblings') or []))

    # How % sure? let's see
    return round((score / checks) * 100, 2)
|
546
|
+
|
547
|
+
@staticmethod
def __calculate_dict_diff(dict1: dict, dict2: dict) -> float:
    """Calculate the similarity between two dictionaries, as SequenceMatcher doesn't accept dictionaries.

    The keys and the values are compared separately as tuples and each comparison makes up half of the result.

    :return: A score between 0 and 1
    """
    keys_ratio = SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio()
    values_ratio = SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio()
    return keys_ratio * 0.5 + values_ratio * 0.5
|
554
|
+
|
555
|
+
def save(self, element: Union['Adaptor', html.HtmlElement], identifier: str) -> None:
    """Save the element's unique properties to the storage for retrieval and relocation later.

    When auto-match wasn't enabled on initialization nothing is saved and a critical message is logged.

    :param element: The element itself that we want to save to storage, it can be a `Adaptor` or pure `HtmlElement`
    :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
        the docs for more info.
    """
    if not self.__auto_match_enabled:
        logging.critical(
            "Can't use Auto-match features with disabled globally, you have to start a new class instance."
        )
        return

    if isinstance(element, self.__class__):
        # Unwrap to the underlying element
        element = element._root

    if self._is_text_node(element):
        # Text results are saved through the element they came from
        element = element.getparent()

    self._storage.save(element, identifier)
|
574
|
+
|
575
|
+
def retrieve(self, identifier: str) -> Optional[Dict]:
    """Search the storage with the identifier and return the unique properties of the element.

    When auto-match wasn't enabled on initialization a critical message is logged and ``None`` is returned.

    :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
        the docs for more info.
    :return: A dictionary of the unique properties
    """
    if not self.__auto_match_enabled:
        logging.critical(
            "Can't use Auto-match features with disabled globally, you have to start a new class instance."
        )
        return None

    return self._storage.retrieve(identifier)
|
588
|
+
|
589
|
+
# Operations on text functions
|
590
|
+
def json(self) -> Dict:
    """Return the element's text parsed as json (delegates to the ``json`` method of the ``text`` property);
    throws an error if the text is not jsonable"""
    text_handler = self.text
    return text_handler.json()
|
593
|
+
|
594
|
+
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True) -> 'List[str]':
    """Run the given regex against the element's text and return a list of strings with the matches.

    :param regex: Can be either a compiled regular expression or a string.
    :param replace_entities: if enabled character entity references are replaced by their corresponding character
    :return: Whatever the ``re`` method of the ``text`` property returns for these arguments
    """
    text_handler = self.text
    return text_handler.re(regex, replace_entities)
|
601
|
+
|
602
|
+
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True):
    """Run the given regex against the element's text and return the first match, or the default value
    when there is no match.

    :param regex: Can be either a compiled regular expression or a string.
    :param default: The default value to be returned if there is no match
    :param replace_entities: if enabled character entity references are replaced by their corresponding character
    :return: Whatever the ``re_first`` method of the ``text`` property returns for these arguments
    """
    text_handler = self.text
    return text_handler.re_first(regex, default, replace_entities)
|
611
|
+
|
612
|
+
def find_similar(
        self,
        similarity_threshold: float = 0.2,
        ignore_attributes: Union[List, Tuple] = ('href', 'src',),
        match_text: bool = False
) -> Union['Adaptors[Adaptor]', List]:
    """Find elements that sit at the same tree depth as the current element, with the same tag name and the
    same parent/grandparent tag names, then keep the ones whose attributes are similar enough to this element.

    This function is inspired by AutoScraper and made for cases where you, for example, found a product div inside
    a products-list container and want to find the other products using that element as a starting point, EXCEPT
    this function works in any case without depending on the element type.

    :param similarity_threshold: The minimum similarity ratio (0 to 1) a candidate must reach to be returned.
        Note: candidates are selected before the attributes comparison by depth, tag name, parent tag name and
        grandparent tag name, so they are very likely to be correct already; don't change this number unless
        you are getting results you don't want.
        Also, if neither the current element nor the candidate has attributes, it's counted as a 100% match.
    :param ignore_attributes: Attribute names that are left out of the attributes comparison.
        The default ignores `href` and `src` as URLs can change a lot between elements, so they are unreliable.
    :param match_text: If True, the elements' text content is taken into account while matching.
        Not recommended in normal cases, but it depends.

    :return: A ``Adaptors`` container of ``Adaptor`` objects or empty list
    """
    def get_attributes(element: html.HtmlElement) -> Dict:
        """Return the element's attributes as a dictionary, without the ignored names"""
        return {k: v for k, v in element.attrib.items() if k not in ignore_attributes}

    def are_alike(original: html.HtmlElement, original_attributes: Dict, candidate: html.HtmlElement) -> bool:
        """Score how alike the two elements are and return True if the score reaches the threshold"""
        candidate_attributes = get_attributes(candidate) if ignore_attributes else candidate.attrib
        score, checks = 0, 0

        if original_attributes:
            score += sum(
                SequenceMatcher(None, v, candidate_attributes.get(k, '')).ratio()
                for k, v in original_attributes.items()
            )
            # Average over every attribute name present on either element. Dividing by the candidate's
            # attribute count alone inflated the score (even above 1.0) whenever the candidate was missing
            # some of the original's attributes; the union keeps the result identical when the candidate
            # carries all of the original's attributes and still penalizes extra ones.
            checks += len(set(original_attributes.keys()) | set(candidate_attributes.keys()))
        elif not candidate_attributes:
            # Neither element has attributes, which is itself a sign they are alike
            score += 1
            checks += 1

        if match_text:
            score += SequenceMatcher(
                None, clean_spaces(original.text or ''), clean_spaces(candidate.text or '')
            ).ratio()
            checks += 1

        if checks:
            return round(score / checks, 2) >= similarity_threshold
        # Nothing could be compared (e.g. only the candidate has attributes and text matching is off)
        return False

    # We will use the elements root from now on to get the speed boost of using Lxml directly
    root = self._root
    current_depth = sum(1 for _ in root.iterancestors())
    target_attrs = get_attributes(root) if ignore_attributes else root.attrib

    # Build the most specific path available: grandparent/parent/tag, parent/tag, or just the tag
    parent = root.getparent()
    if parent is None:
        path = f"//{self.tag}"
    else:
        grandparent = parent.getparent()
        if grandparent is not None:
            path = f"//{grandparent.tag}/{parent.tag}/{self.tag}"
        else:
            path = f"//{parent.tag}/{self.tag}"

    # The predicate restricts matches to elements with the same number of ancestors as the current one
    potential_matches = root.xpath(f"{path}[count(ancestor::*) = {current_depth}]")

    similar_elements = [
        potential_match for potential_match in potential_matches
        if potential_match != root and are_alike(root, target_attrs, potential_match)
    ]
    return self.__convert_results(similar_elements)
|
693
|
+
|
694
|
+
def find_by_text(
        self, text: str, first_match: bool = True, partial: bool = False,
        case_sensitive: bool = False, clean_match: bool = True
) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
    """Find elements whose text content fully/partially matches the input.

    The current element is checked first, then its descendants in document (pre-order) order.

    :param text: Text query to match
    :param first_match: Return first element that matches conditions, enabled by default
    :param partial: If enabled, function return elements that contains the input text
    :param case_sensitive: if enabled, letters case will be taken into consideration
    :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
    :return: The first matching element if ``first_match`` is enabled and something matched, otherwise
        the converted collection of all matching elements (empty if nothing matched)
    """
    if not case_sensitive:
        text = text.lower()

    results = []
    # Explicit stack instead of recursion: the search really stops at the first hit when `first_match`
    # is enabled (the recursive version kept checking the remaining siblings), and deeply nested
    # documents cannot exhaust the interpreter's recursion limit.
    pending = [self]
    while pending:
        node = pending.pop()
        node_text = node.text
        # if there's already no text in this node, dodge it to save CPU cycles and time
        if node_text:
            if clean_match:
                node_text = node_text.clean()

            if not case_sensitive:
                node_text = node_text.lower()

            matched = (text in node_text) if partial else (text == node_text)
            if matched:
                if first_match:
                    # we got an element so we should stop
                    return node
                results.append(node)

        # Push children reversed so that they are popped (visited) in their original order
        pending.extend(reversed(list(node.children)))

    return self.__convert_results(results)
|
741
|
+
|
742
|
+
def find_by_regex(
        self, query: str, first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
    """Find elements whose text content matches the input regex pattern.

    The current element is checked first, then its descendants in document (pre-order) order.

    :param query: Regex query to match
    :param first_match: Return first element that matches conditions, enabled by default
    :param case_sensitive: if enabled, letters case will be taken into consideration in the regex
    :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
    :return: The first matching element if ``first_match`` is enabled and something matched, otherwise
        the converted collection of all matching elements (empty if nothing matched)
    """
    results = []
    # Explicit stack instead of recursion: the search really stops at the first hit when `first_match`
    # is enabled (the recursive version kept checking the remaining siblings), and deeply nested
    # documents cannot exhaust the interpreter's recursion limit.
    pending = [self]
    while pending:
        node = pending.pop()
        node_text = node.text
        # if there's already no text in this node, dodge it to save CPU cycles and time
        if node_text:
            if node_text.re(query, check_match=True, clean_match=clean_match, case_sensitive=case_sensitive):
                if first_match:
                    # we got an element so we should stop
                    return node
                results.append(node)

        # Push children reversed so that they are popped (visited) in their original order
        pending.extend(reversed(list(node.children)))

    return self.__convert_results(results)
|
774
|
+
|
775
|
+
|
776
|
+
class Adaptors(List[Adaptor]):
    """
    The :class:`Adaptors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
    """
    __slots__ = ()

    def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[Adaptor, "Adaptors[Adaptor]"]:
        """Return the item at ``pos``; slicing returns a new instance of this class instead of a plain list."""
        lst = super().__getitem__(pos)
        if isinstance(pos, slice):
            return self.__class__(lst)
        return lst

    def xpath(
            self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0, **kwargs: Any
    ) -> Union["Adaptors[Adaptor]", List]:
        """
        Call the ``.xpath()`` method for each element in this list and return
        their results flattened as another :class:`Adaptors`.

        **Important:
        It's recommended to use the identifier argument if you plan to use different selector later
        and want to relocate the same element(s)**

         Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**

        :param selector: The XPath selector to be used.
        :param identifier: A string that will be used to retrieve element's data in auto-matching
         otherwise the selector will be used.
        :param auto_save: Automatically save new elements for `auto_match` later
        :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
         Be aware that the percentage calculation depends solely on the page structure so don't play with this
         number unless you must know what you are doing!

        :return: List as :class:`Adaptors`
        """
        # NOTE(review): the third positional argument (False) is presumably the per-element
        # auto-match flag of `Adaptor.xpath` - confirm against that method's signature.
        results = [
            n.xpath(selector, identifier or selector, False, auto_save, percentage, **kwargs) for n in self
        ]
        return self.__class__(flatten(results))

    def css(self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0) -> Union["Adaptors[Adaptor]", List]:
        """
        Call the ``.css()`` method for each element in this list and return
        their results flattened as another :class:`Adaptors`.

        **Important:
        It's recommended to use the identifier argument if you plan to use different selector later
        and want to relocate the same element(s)**

        :param selector: The CSS3 selector to be used.
        :param identifier: A string that will be used to retrieve element's data in auto-matching
         otherwise the selector will be used.
        :param auto_save: Automatically save new elements for `auto_match` later
        :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
         Be aware that the percentage calculation depends solely on the page structure so don't play with this
         number unless you must know what you are doing!

        :return: List as :class:`Adaptors`
        """
        # NOTE(review): the third positional argument (False) is presumably the per-element
        # auto-match flag of `Adaptor.css` - confirm against that method's signature.
        results = [
            n.css(selector, identifier or selector, False, auto_save, percentage) for n in self
        ]
        return self.__class__(flatten(results))

    def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True) -> 'List[str]':
        """Call the ``.re()`` method on the text of each element in this list and return
        their results flattened as one list of matched strings.

        :param regex: Can be either a compiled regular expression or a string.
        :param replace_entities: if enabled character entity references are replaced by their corresponding character
        """
        results = [
            n.text.re(regex, replace_entities) for n in self
        ]
        return flatten(results)

    def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True) -> List:
        """Call the ``.re_first()`` method on the text of each element in this list and return
        their results as a list, one entry per element.

        :param regex: Can be either a compiled regular expression or a string.
        :param default: The default value to be used for an element if there is no match in it
        :param replace_entities: if enabled character entity references are replaced by their corresponding character

        :return: A list holding, for each element, its first match or ``default``
        """
        # Each entry is a single value (a match or `default`), not a list of matches, so there is
        # nothing to flatten here: flattening would iterate over the strings themselves and would
        # choke on a non-iterable default such as None.
        return [
            n.text.re_first(regex, default, replace_entities) for n in self
        ]

    # NOTE: disabled experiment, kept for reference.
    # def __getattr__(self, name):
    #     if name in dir(self.__class__):
    #         return super().__getattribute__(name)
    #
    #     # Execute the method itself on each Adaptor
    #     results = []
    #     for item in self:
    #         results.append(getattr(item, name))
    #
    #     if all(callable(r) for r in results):
    #         def call_all(*args, **kwargs):
    #             final_results = [r(*args, **kwargs) for r in results]
    #             if all([isinstance(r, (Adaptor, Adaptors,)) for r in results]):
    #                 return self.__class__(final_results)
    #             return final_results
    #
    #         return call_all
    #     else:
    #         # Flatten the result if it's a single-item list containing a list
    #         if len(self) == 1 and isinstance(results[0], list):
    #             return self.__class__(results[0])
    #         return self.__class__(results)

    def get(self, default=None):
        """Returns the first item of the current list
        :param default: the default value to return if the current list is empty
        """
        return self[0] if len(self) > 0 else default

    @property
    def first(self):
        """Returns the first item of the current list or `None` if the list is empty"""
        return self.get()

    @property
    def last(self):
        """Returns the last item of the current list or `None` if the list is empty"""
        return self[-1] if len(self) > 0 else None

    def __getstate__(self) -> Any:
        """Refuse pickling by always raising ``TypeError``."""
        # lxml don't like it :)
        raise TypeError("Can't pickle Adaptors object")
|