scrapling 0.1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
scrapling/parser.py ADDED
@@ -0,0 +1,908 @@
1
+ import os
2
+ from difflib import SequenceMatcher
3
+ from typing import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator
4
+ try:
5
+ from typing import SupportsIndex
6
+ except ImportError:
7
+ # 'SupportsIndex' got added in Python 3.8
8
+ SupportsIndex = None
9
+
10
+ from scrapling.translator import HTMLTranslator
11
+ from scrapling.mixins import SelectorsGeneration
12
+ from scrapling.custom_types import TextHandler, AttributesHandler
13
+ from scrapling.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
14
+ from scrapling.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
15
+
16
+ from lxml import etree, html
17
+ from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
18
+
19
+
20
+ class Adaptor(SelectorsGeneration):
21
+ __slots__ = (
22
+ 'url', 'encoding', '__auto_match_enabled', '_root', '_storage', '__debug',
23
+ '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
24
+ )
25
+
26
def __init__(
        self,
        text: Optional[str] = None,
        url: Optional[str] = None,
        body: bytes = b"",
        encoding: str = "utf8",
        huge_tree: bool = True,
        root: Optional[html.HtmlElement] = None,
        keep_comments: Optional[bool] = False,
        auto_match: Optional[bool] = False,
        storage: Any = SQLiteStorageSystem,
        storage_args: Optional[Dict] = None,
        debug: Optional[bool] = True,
):
    """Wrap HTML input so it can be searched with CSS, XPath, or plain text.

    The class wraps an ``lxml.html.HtmlElement`` instead of inheriting from it because lxml elements
    are not pickleable (``AssertionError: invalid Element proxy at...``), an old lxml issue, see
    `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`

    :param text: HTML body passed as text.
    :param url: allows storing a URL with the html data for retrieving later.
    :param body: HTML body as ``bytes`` object. It can be used instead of the ``text`` argument.
    :param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`
    :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
        libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
    :param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
        Don't use it unless you know what you are doing!
    :param keep_comments: Keep HTML comments in the parsed tree. Disabled by default, so comments are dropped.
    :param auto_match: Master switch for the auto-match feature; when it is off, the auto-match related
        arguments of the other methods are ignored.
    :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
    :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
        If empty, default values will be used.
    :param debug: Enable debug mode
    :raises ValueError: If none of ``text``/``body``/``root`` is given, or the storage class is invalid.
    :raises TypeError: If ``text``, ``body`` or ``root`` is of the wrong type.
    """
    if root is None and not body and text is None:
        raise ValueError("Adaptor class needs text, body, or root arguments to work")

    if root is None:
        if text is None:
            if not body or not isinstance(body, bytes):
                raise TypeError(f"body argument must be valid and of type bytes, got {body.__class__}")

            # Same fallback as the text branch: never hand an empty document to lxml
            body = body.replace(b"\x00", b"").strip() or b"<html/>"
        else:
            if not isinstance(text, str):
                raise TypeError(f"text argument must be of type str, got {text.__class__}")

            body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"

        parser = html.HTMLParser(
            # https://lxml.de/api/lxml.etree.HTMLParser-class.html
            # Comments are removed only when the caller did NOT ask to keep them
            recover=True, remove_blank_text=True, remove_comments=not keep_comments, encoding=encoding,
            compact=True, huge_tree=huge_tree, default_doctype=True
        )
        self._root = etree.fromstring(body, parser=parser, base_url=url)

    else:
        # All html types inherits from HtmlMixin so this to check for all at once
        if not issubclass(type(root), html.HtmlMixin):
            raise TypeError(
                f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
            )

        self._root = root

    setup_basic_logging(level='debug' if debug else 'info')
    self.__auto_match_enabled = auto_match

    if self.__auto_match_enabled:
        if not storage_args:
            storage_args = {
                'storage_file': os.path.join(os.path.dirname(__file__), 'elements_storage.db'),
                'url': url
            }

        if not hasattr(storage, '__wrapped__'):
            raise ValueError("Storage class must be wrapped with cache decorator, see docs for info")

        if not issubclass(storage.__wrapped__, StorageSystemMixin):
            raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")

        self._storage = storage(**storage_args)

    self.__keep_comments = keep_comments
    self.__huge_tree_enabled = huge_tree
    self.encoding = encoding
    self.url = url
    # Lazily computed caches, filled by the `attrib`, `text` and `tag` properties
    self.__attributes = None
    self.__text = None
    self.__tag = None
    self.__debug = debug
122
+
123
+ # Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
124
@staticmethod
def _is_text_node(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> bool:
    """Tell whether ``element`` is the string result of an expression rather than a real element.

    Examples of expressions that produce such results:
        Xpath -> '/text()', '/@attribute' etc...
        CSS3 -> '::text', '::attr(attrib)'...
    """
    # A single type check is faster than `element.is_attribute or element.is_text or element.is_tail`
    element_type = type(element)
    return issubclass(element_type, etree._ElementUnicodeResult)
133
+
134
def __get_correct_result(
        self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]
) -> Union[TextHandler, html.HtmlElement, 'Adaptor', str]:
    """Convert one raw lxml result to ``TextHandler``/``Adaptor`` when possible, else return it untouched."""
    if self._is_text_node(element):
        # etree._ElementUnicodeResult is a `str` subclass, so a plain conversion is enough
        return TextHandler(str(element))

    if not issubclass(type(element), html.HtmlMixin):
        # Neither text nor an html element (e.g. a number from `count()`): hand it back as-is
        return element

    # Wrap the element in a new instance that inherits this instance's settings
    return self.__class__(
        root=element, url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
        keep_comments=self.__keep_comments, huge_tree=self.__huge_tree_enabled, debug=self.__debug
    )
148
+
149
def __convert_results(
        self, result: Union[List[html.HtmlElement], html.HtmlElement]
) -> Union['Adaptors[Adaptor]', 'Adaptor', List, None]:
    """Convert a raw result (single item or list) to ``Adaptor``/``Adaptors`` in bulk when possible.

    ``None`` and empty lists are passed through, an existing ``Adaptors`` is returned unchanged, and a
    list is promoted to ``Adaptors`` only when every converted item is an instance of this class.
    """
    if result is None:
        return None
    if result == []:  # Explicit comparison: lxml warns on truth-testing elements (`not result`)
        return []
    if isinstance(result, Adaptors):
        return result
    if type(result) is not list:
        return self.__get_correct_result(result)

    converted = [self.__get_correct_result(item) for item in result]
    own_class = self.__class__
    if any(not isinstance(item, own_class) for item in converted):
        # Mixed content (e.g. text results) stays a plain list
        return converted
    return Adaptors(converted)
168
+
169
def __getstate__(self) -> Any:
    """Refuse pickling: the wrapped lxml element can't be pickled.

    :raises TypeError: Always.
    """
    raise TypeError("Can't pickle Adaptor objects")
172
+
173
+ # The following four properties are implemented as lazy properties instead of plain attributes set directly,
174
+ # so they don't slow down the initialization of many instances of the class: each one is computed only
175
+ # when the user needs it for the first time for that specific element, then cached for later accesses.
176
+ # Doing that alone made the library's performance tests skyrocket to multiple times faster than before,
177
+ # because these values were previously computed on initialization :))
178
@property
def tag(self) -> str:
    """Tag name of the wrapped element, read from lxml on first access and then cached."""
    cached = self.__tag
    if cached:
        return cached
    self.__tag = self._root.tag
    return self.__tag
184
+
185
@property
def text(self) -> TextHandler:
    """Text content of the element (lxml's ``.text``) wrapped in a ``TextHandler``.

    The value is built on first access and cached. The cache test uses ``is None`` so that an empty
    text is cached too instead of being rebuilt on every access.
    """
    if self.__text is None:
        self.__text = TextHandler(self._root.text)
    return self.__text
191
+
192
def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:
    """Collect the ``.text`` of this element and all its descendants, joined with ``separator``.

    :param separator: Strings will be concatenated using this separator.
    :param strip: If True, strings will be stripped before being concatenated.
    :param ignore_tags: A tuple of all tag names you want to ignore
    :param valid_values: If enabled, elements with text-content that is empty or only whitespaces will be ignored

    :return: A TextHandler
    """
    collected = []

    # `iter()` walks the element and its descendants in document order, the same order as a
    # recursive pre-order traversal, without the risk of hitting the recursion limit on deep trees.
    # Using lxml directly here for the speed boost.
    for node in self._root.iter():
        if node.tag in ignore_tags:
            # Only this node's own text is skipped; its descendants are still visited
            continue

        content = node.text
        if not content or type(content) is not str:
            continue

        if valid_values and not content.strip():
            continue

        collected.append(content.strip() if strip else content)

    return TextHandler(separator.join(collected))
226
+
227
@property
def attrib(self) -> AttributesHandler:
    """Attributes of the element wrapped in an ``AttributesHandler``.

    The handler is built on first access and cached. The cache test uses ``is None`` so an element
    without attributes does not get a new handler on every access.
    """
    if self.__attributes is None:
        self.__attributes = AttributesHandler(self._root.attrib)
    return self.__attributes
233
+
234
@property
def html_content(self) -> str:
    """Serialize the element (its own tag included, trailing tail text excluded) as an HTML string"""
    serialized = etree.tostring(self._root, encoding='unicode', method='html', with_tail=False)
    return serialized

# `body` is an alias of `html_content`
body = html_content
240
+
241
def prettify(self) -> str:
    """Serialize the element as pretty-printed HTML (tail text excluded)"""
    pretty = etree.tostring(self._root, encoding='unicode', pretty_print=True, method='html', with_tail=False)
    return pretty
244
+
245
def has_class(self, class_name: str) -> bool:
    """Tell whether the element's class list contains ``class_name``

    :param class_name: The class name to check for
    :return: True if element has class with that name otherwise False
    """
    element_classes = self._root.classes
    return class_name in element_classes
251
+
252
@property
def parent(self) -> Union['Adaptor', None]:
    """The direct parent of the element as an ``Adaptor``, or ``None`` when there is no parent"""
    raw_parent = self._root.getparent()
    return self.__convert_results(raw_parent)
256
+
257
@property
def children(self) -> Union['Adaptors[Adaptor]', List]:
    """The direct children of the element, or an empty list when there are none.

    Nodes whose type is listed in ``html_forbidden`` are left out.
    """
    allowed_children = [node for node in self._root.iterchildren() if type(node) not in html_forbidden]
    return self.__convert_results(allowed_children)
263
+
264
@property
def siblings(self) -> Union['Adaptors[Adaptor]', List]:
    """The other children of this element's parent, or an empty list when there is no parent"""
    parent = self.parent
    if not parent:
        return []

    own_root = self._root
    return Adaptors([sibling for sibling in parent.children if sibling._root != own_root])
270
+
271
def iterancestors(self) -> Generator['Adaptor', None, None]:
    """Lazily yield every ancestor of the element as an ``Adaptor``, starting with the direct parent."""
    raw_ancestors = self._root.iterancestors()
    for raw_ancestor in raw_ancestors:
        yield self.__convert_results(raw_ancestor)
275
+
276
def find_ancestor(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
    """Walk up the ancestors and return the first one accepted by ``func``

    :param func: A function that takes each ancestor as an argument and returns True/False
    :return: The first ancestor that match the function or ``None`` otherwise.
    """
    # The generator is consumed lazily, so the search stops at the first accepted ancestor
    return next((ancestor for ancestor in self.iterancestors() if func(ancestor)), None)
285
+
286
@property
def path(self) -> 'Adaptors[Adaptor]':
    """All ancestors of the element as an :class:`Adaptors` list.

    The order is the one produced by ``iterancestors``: the direct parent comes first.
    """
    return Adaptors(list(self.iterancestors()))
291
+
292
@property
def next(self) -> Union['Adaptor', None]:
    """The next sibling element under the same parent, or ``None`` when there is none.

    Siblings whose type is listed in ``html_forbidden`` (html comments and other unwanted types)
    are skipped.
    """
    sibling = self._root.getnext()
    while sibling is not None and type(sibling) in html_forbidden:
        sibling = sibling.getnext()

    return self.__convert_results(sibling)
302
+
303
@property
def previous(self) -> Union['Adaptor', None]:
    """The previous sibling element under the same parent, or ``None`` when there is none.

    Siblings whose type is listed in ``html_forbidden`` (html comments and other unwanted types)
    are skipped.
    """
    sibling = self._root.getprevious()
    while sibling is not None and type(sibling) in html_forbidden:
        sibling = sibling.getprevious()

    return self.__convert_results(sibling)
313
+
314
def __str__(self) -> str:
    """The string form of the element is its serialized HTML (see ``html_content``)."""
    return self.html_content
316
+
317
def __repr__(self) -> str:
    """Short debug representation: truncated HTML of the element and, if it has one, of its parent."""
    length_limit = 40

    def _preview(markup: str) -> str:
        """Collapse whitespace and cut the markup to ``length_limit`` characters."""
        cleaned = clean_spaces(markup)
        if len(cleaned) > length_limit:
            return cleaned[:length_limit].strip() + '...'
        return cleaned

    content = _preview(self.html_content)
    data = "<" + f"data='{content}'"

    parent = self.parent
    if parent:
        parent_content = _preview(parent.html_content)
        data += f" parent='{parent_content}'"

    return data + ">"
333
+
334
+ # From here we start the selecting functions
335
def relocate(
        self, element: Union[Dict, html.HtmlElement, 'Adaptor'], percentage: int = 0, adaptor_type: bool = False
) -> Union[List[Union[html.HtmlElement, None]], 'Adaptors']:
    """Search the page tree again for ``element``; used automatically when the page structure changes.

    Every node of the tree is scored against the element and the nodes sharing the highest score are
    returned, provided that score reaches ``percentage``.

    :param element: The element we want to relocate in the tree
    :param percentage: The minimum percentage to accept and not going lower than that. Be aware that the percentage
        calculation depends solely on the page structure so don't play with this number unless you must know
        what you are doing!
    :param adaptor_type: If True, the return result will be converted to `Adaptors` object
    :return: List of pure HTML elements that got the highest matching score or 'Adaptors' object
    """
    # Note: `element` will be most likely always be a dictionary at this point.
    if isinstance(element, self.__class__):
        element = element._root

    if issubclass(type(element), html.HtmlElement):
        element = _StorageTools.element_to_dict(element)

    score_table = {}
    # `iter()` yields the root and all descendants in document order, i.e. the same order as a
    # recursive pre-order walk, but without the risk of a RecursionError on deeply nested pages.
    # The scan never stops early, even on a 100% score, because other elements may share that score.
    for node in self._root.iter():
        score = self.__calculate_similarity_score(element, node)
        score_table.setdefault(score, []).append(node)

    if not score_table:
        return []

    highest_probability = max(score_table.keys())
    best_matches = score_table[highest_probability]
    if not best_matches or not (highest_probability >= percentage):
        return []

    logging.debug(f'Highest probability was {highest_probability}%')
    logging.debug('Top 5 best matching elements are: ')
    for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
        logging.debug(f'{percent} -> {self.__convert_results(score_table[percent])}')

    if not adaptor_type:
        return best_matches
    return self.__convert_results(best_matches)
384
+
385
def css(self, selector: str, identifier: str = '',
        auto_match: bool = False, auto_save: bool = False, percentage: int = 0
        ) -> Union['Adaptors[Adaptor]', List]:
    """Search current tree with CSS3 selectors; the selector is translated to XPath and run by ``xpath``.

    **Important:
    It's recommended to use the identifier argument if you plan to use different selector later
    and want to relocate the same element(s)**

    :param selector: The CSS3 selector to be used.
    :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
    :param identifier: A string that will be used to save/retrieve element's data in auto-matching
        otherwise the selector will be used.
    :param auto_save: Automatically save new elements for `auto_match` later
    :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
        Be aware that the percentage calculation depends solely on the page structure so don't play with this
        number unless you must know what you are doing!

    :return: List as :class:`Adaptors`
    :raises SelectorSyntaxError: If the selector can't be translated or evaluated.
    """
    try:
        # Splitting is only needed for combined selectors while auto-match is on; every other
        # case is a single translation, which saves some CPU cycles.
        if not self.__auto_match_enabled or ',' not in selector:
            translated = HTMLTranslator().css_to_xpath(selector)
            return self.xpath(translated, identifier or selector, auto_match, auto_save, percentage)

        # Each part of a combined selector (parts joined by ',') is run on its own so the `save`
        # function stores data correctly for parts that point to different elements.
        combined = []
        for single_selector in split_selectors(selector):
            canonical = single_selector.canonical()
            translated = HTMLTranslator().css_to_xpath(canonical)
            combined += self.xpath(translated, identifier or canonical, auto_match, auto_save, percentage)

        return self.__convert_results(combined)
    except (SelectorError, SelectorSyntaxError,):
        raise SelectorSyntaxError(f"Invalid CSS selector: {selector}")
427
+
428
def xpath(self, selector: str, identifier: str = '',
          auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
          ) -> Union['Adaptors[Adaptor]', List]:
    """Search current tree with XPath selectors

    **Important:
    It's recommended to use the identifier argument if you plan to use different selector later
    and want to relocate the same element(s)**

    Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**

    :param selector: The XPath selector to be used.
    :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
    :param identifier: A string that will be used to save/retrieve element's data in auto-matching
        otherwise the selector will be used.
    :param auto_save: Automatically save new elements for `auto_match` later
    :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
        Be aware that the percentage calculation depends solely on the page structure so don't play with this
        number unless you must know what you are doing!

    :return: List as :class:`Adaptors`
    :raises SelectorSyntaxError: If the XPath expression is invalid.
    """
    try:
        selected_elements = self._root.xpath(selector, **kwargs)

        if selected_elements:
            if not self.__auto_match_enabled and auto_save:
                logging.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")

            elif self.__auto_match_enabled and auto_save:
                # Only the first selected element is stored under the identifier
                self.save(selected_elements[0], identifier or selector)

            return self.__convert_results(selected_elements)

        # Nothing selected: fall back to relocating a previously saved element, if allowed
        if self.__auto_match_enabled and auto_match:
            element_data = self.retrieve(identifier or selector)
            if element_data:
                relocated = self.relocate(element_data, percentage)
                # `relocate` returns an empty list when nothing scores high enough, so test for
                # emptiness (not just `None`) before reading the first element.
                if relocated and auto_save:
                    self.save(relocated[0], identifier or selector)

                return self.__convert_results(relocated)

            return self.__convert_results(selected_elements)

        if not self.__auto_match_enabled and auto_match:
            logging.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")

        return self.__convert_results(selected_elements)

    except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
        raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
480
+
481
def __calculate_similarity_score(self, original: Dict, candidate: html.HtmlElement) -> float:
    """Score how similar a candidate element is to the stored original one.

    Several checks each contribute a value between 0 and 1, and the result is the plain average
    of the checks that applied, expressed as a percentage.

    :param original: The original element in the form of the dictionary generated from `element_to_dict` function
    :param candidate: The element to compare with the original element.
    :return: A percentage score of how similar is the candidate to the original element
    """
    def _ratio(first, second) -> float:
        """Similarity ratio of two sequences as computed by ``SequenceMatcher``."""
        return SequenceMatcher(None, first, second).ratio()

    candidate = _StorageTools.element_to_dict(candidate)

    # Possible TODO:
    # Study the idea of giving weight to each test below so some are more important than others
    # Current results: With weights some websites had better score while it was worse for others
    scores = [1 if original['tag'] == candidate['tag'] else 0]

    if original['text']:
        scores.append(_ratio(original['text'], candidate.get('text') or ''))

    # If both don't have attributes, it still counts for something!
    scores.append(self.__calculate_dict_diff(original['attributes'], candidate['attributes']))

    # Separate similarity test for class, id, href,... this will help in full structural changes
    for attrib in ('class', 'id', 'href', 'src',):
        if original['attributes'].get(attrib):
            scores.append(_ratio(original['attributes'][attrib], candidate['attributes'].get(attrib) or ''))

    scores.append(_ratio(original['path'], candidate['path']))

    # Parents' data is compared only when both sides have a parent; a candidate without a parent
    # is currently not penalized.
    if original.get('parent_name') and candidate.get('parent_name'):
        scores.append(_ratio(original['parent_name'], candidate.get('parent_name') or ''))
        scores.append(
            self.__calculate_dict_diff(original['parent_attribs'], candidate.get('parent_attribs') or {})
        )
        if original['parent_text']:
            scores.append(_ratio(original['parent_text'], candidate.get('parent_text') or ''))

    if original.get('siblings'):
        scores.append(_ratio(original['siblings'], candidate.get('siblings') or []))

    # How % sure? let's see
    return round((sum(scores) / len(scores)) * 100, 2)
546
+
547
@staticmethod
def __calculate_dict_diff(dict1: dict, dict2: dict) -> float:
    """Similarity of two dictionaries, needed because SequenceMatcher doesn't accept dictionaries.

    Keys and values are compared separately as tuples and each comparison makes up half of the score.
    """
    keys_ratio = SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio()
    values_ratio = SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio()
    return keys_ratio * 0.5 + values_ratio * 0.5
554
+
555
def save(self, element: Union['Adaptor', html.HtmlElement], identifier: str) -> None:
    """Store the element's unique properties in the storage for retrieval and relocation later

    :param element: The element itself that we want to save to storage, it can be a `Adaptor` or pure `HtmlElement`
    :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See
        the docs for more info.
    """
    if not self.__auto_match_enabled:
        logging.critical(
            "Can't use Auto-match features with disabled globally, you have to start a new class instance."
        )
        return

    target = element._root if isinstance(element, self.__class__) else element

    # A text/attribute result is stored through the element it belongs to
    if self._is_text_node(target):
        target = target.getparent()

    self._storage.save(target, identifier)
574
+
575
def retrieve(self, identifier: str) -> Optional[Dict]:
    """Look up the storage by identifier and return the element's unique properties

    :param identifier: This is the identifier that will be used to retrieve the element from the storage. See
        the docs for more info.
    :return: A dictionary of the unique properties; ``None`` (after a critical log) when auto-match is disabled
    """
    if not self.__auto_match_enabled:
        logging.critical(
            "Can't use Auto-match features with disabled globally, you have to start a new class instance."
        )
        return None

    return self._storage.retrieve(identifier)
588
+
589
+ # Operations on text functions
590
def json(self) -> Dict:
    """Parse the element's text as JSON; the error raised by ``text.json()`` propagates otherwise"""
    element_text = self.text
    return element_text.json()
593
+
594
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True) -> 'List[str]':
    """Run the given regex over the element's text and return the list of matched strings.

    :param regex: Can be either a compiled regular expression or a string.
    :param replace_entities: if enabled character entity references are replaced by their corresponding character
    """
    element_text = self.text
    return element_text.re(regex, replace_entities)
601
+
602
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True):
    """Run the given regex over the element's text and return the first match, or ``default`` if none.

    :param regex: Can be either a compiled regular expression or a string.
    :param default: The default value to be returned if there is no match
    :param replace_entities: if enabled character entity references are replaced by their corresponding character

    """
    element_text = self.text
    return element_text.re_first(regex, default, replace_entities)
611
+
612
def find_similar(
        self,
        similarity_threshold: float = 0.2,
        ignore_attributes: Union[List, Tuple] = ('href', 'src',),
        match_text: bool = False
) -> Union['Adaptors[Adaptor]', List]:
    """Find elements at the same tree depth with the same tag name, parent tag and grandparent tag,
    then keep the ones whose attributes match the current element with a score of at least the threshold.

    This function is inspired by AutoScraper and made for cases where you, for example, found a product div inside
    a products-list container and want to find other products using that element as a starting point EXCEPT
    this function works in any case without depending on the element type.

    :param similarity_threshold: The percentage to use while comparing elements attributes.
        Note: Elements found before attributes matching/comparison will be sharing the same depth, same tag name,
        same parent tag name, and same grand parent tag name. So they are 99% likely to be correct unless you are
        extremely unlucky, then attributes matching comes into play, so basically don't play with this number unless
        you are getting the results you don't want.
        Also, if current element doesn't have attributes and the similar element as well, then it's a 100% match.
    :param ignore_attributes: Attribute names passed will be ignored while matching the attributes in last step.
        The default value is to ignore `href` and `src` as URLs can change a lot between elements so it's unreliable
    :param match_text: If True, elements text content will be taken into calculation while matching.
        Not recommended to use in normal cases but it depends.

    :return: A ``Adaptors`` container of ``Adaptor`` objects or empty list
    """
    def get_attributes(element: html.HtmlElement) -> Dict:
        """Return attributes dictionary without the ignored list"""
        return {name: value for name, value in element.attrib.items() if name not in ignore_attributes}

    def are_alike(original: html.HtmlElement, original_attributes: Dict, candidate: html.HtmlElement) -> bool:
        """Score how alike the two elements are and return True when the score reaches the threshold"""
        candidate_attributes = get_attributes(candidate) if ignore_attributes else candidate.attrib
        score, checks = 0, 0

        if original_attributes:
            for name, value in original_attributes.items():
                score += SequenceMatcher(None, value, candidate_attributes.get(name, '')).ratio()
            # The divisor is the number of the candidate's attributes
            checks += len(candidate_attributes)
        elif not candidate_attributes:
            # Neither element has attributes, this must mean something
            score += 1
            checks += 1

        if match_text:
            score += SequenceMatcher(
                None, clean_spaces(original.text or ''), clean_spaces(candidate.text or '')
            ).ratio()
            checks += 1

        if not checks:
            return False
        return round(score / checks, 2) >= similarity_threshold

    # From here on the raw lxml root is used directly for the speed boost
    root = self._root
    current_depth = len(list(root.iterancestors()))
    target_attrs = get_attributes(root) if ignore_attributes else root.attrib

    # Build the tag chain (grandparent/parent/tag) as far up as ancestors exist
    parent = root.getparent()
    if parent is None:
        location = f"//{self.tag}"
    else:
        grandparent = parent.getparent()
        if grandparent is None:
            location = f"//{parent.tag}/{self.tag}"
        else:
            location = f"//{grandparent.tag}/{parent.tag}/{self.tag}"

    potential_matches = root.xpath(f"{location}[count(ancestor::*) = {current_depth}]")

    similar_elements = [
        potential_match for potential_match in potential_matches
        if potential_match != root and are_alike(root, target_attrs, potential_match)
    ]

    return self.__convert_results(similar_elements)
693
+
694
def find_by_text(
        self, text: str, first_match: bool = True, partial: bool = False,
        case_sensitive: bool = False, clean_match: bool = True
) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
    """Find elements whose text content fully/partially matches the input.

    The search starts at this element and walks its descendants depth-first, in document order.

    :param text: Text query to match
    :param first_match: Return the first element that matches the conditions, enabled by default
    :param partial: If enabled, return elements whose text contains the input text
    :param case_sensitive: If enabled, letter case is taken into consideration
    :param clean_match: If enabled, each element's text is passed through ``.clean()`` before comparing
    :return: The first matching element when ``first_match`` is enabled and something matched,
        otherwise every match (possibly none) passed through ``__convert_results``
    """
    if not case_sensitive:
        text = text.lower()

    def _matches(node_text) -> bool:
        """Tell whether the (optionally normalised) element text satisfies the query"""
        # If there's no text in this node, dodge it to save the normalisation work
        if not node_text:
            return False
        if clean_match:
            node_text = node_text.clean()
        if not case_sensitive:
            node_text = node_text.lower()
        return text in node_text if partial else text == node_text

    results = []
    # An explicit stack is used instead of recursion so that the walk really stops at the first
    # hit (a `return` inside a recursive helper only leaves one frame, the callers keep looping
    # over the remaining siblings) and so that deep trees can't hit the recursion limit.
    pending = [self]
    while pending:
        node = pending.pop()
        if _matches(node.text):
            if first_match:
                return node
            results.append(node)
        # Children are pushed reversed so they get popped in document order
        pending.extend(reversed(list(node.children)))

    return self.__convert_results(results)
741
+
742
def find_by_regex(
        self, query: str, first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
    """Find elements whose text content matches the input regex pattern.

    The search starts at this element and walks its descendants depth-first, in document order.

    :param query: Regex query to match
    :param first_match: Return the first element that matches the conditions, enabled by default
    :param case_sensitive: If enabled, letter case is taken into consideration in the regex
    :param clean_match: Forwarded to the text's ``.re()`` call as ``clean_match``
    :return: The first matching element when ``first_match`` is enabled and something matched,
        otherwise every match (possibly none) passed through ``__convert_results``
    """
    results = []
    # An explicit stack is used instead of recursion so that the walk really stops at the first
    # hit (a `return` inside a recursive helper only leaves one frame, the callers keep looping
    # over the remaining siblings) and so that deep trees can't hit the recursion limit.
    pending = [self]
    while pending:
        node = pending.pop()
        node_text = node.text
        # If there's no text in this node, dodge it to save CPU cycles and time
        if node_text and node_text.re(
                query, check_match=True, clean_match=clean_match, case_sensitive=case_sensitive
        ):
            if first_match:
                return node
            results.append(node)
        # Children are pushed reversed so they get popped in document order
        pending.extend(reversed(list(node.children)))

    return self.__convert_results(results)
774
+
775
+
776
class Adaptors(List[Adaptor]):
    """
    The :class:`Adaptors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
    """
    __slots__ = ()

    def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[Adaptor, "Adaptors[Adaptor]"]:
        """Return a single item for an index, or a new :class:`Adaptors` for a slice"""
        lst = super().__getitem__(pos)
        if isinstance(pos, slice):
            # The builtin slicing returns a plain list, wrap it so the extra methods are kept
            return self.__class__(lst)
        return lst

    def xpath(
            self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0, **kwargs: Any
    ) -> Union["Adaptors[Adaptor]", List]:
        """
        Call the ``.xpath()`` method for each element in this list and return
        their results flattened as another :class:`Adaptors`.

        **Important:
        It's recommended to use the identifier argument if you plan to use different selector later
        and want to relocate the same element(s)**

         Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**

        :param selector: The XPath selector to be used.
        :param identifier: A string that will be used to retrieve element's data in auto-matching
         otherwise the selector will be used.
        :param auto_save: Automatically save new elements for `auto_match` later
        :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
         Be aware that the percentage calculation depends solely on the page structure so don't play with this
         number unless you must know what you are doing!

        :return: List as :class:`Adaptors`
        """
        results = [
            n.xpath(selector, identifier or selector, False, auto_save, percentage, **kwargs) for n in self
        ]
        return self.__class__(flatten(results))

    def css(
            self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0
    ) -> Union["Adaptors[Adaptor]", List]:
        """
        Call the ``.css()`` method for each element in this list and return
        their results flattened as another :class:`Adaptors`.

        **Important:
        It's recommended to use the identifier argument if you plan to use different selector later
        and want to relocate the same element(s)**

        :param selector: The CSS3 selector to be used.
        :param identifier: A string that will be used to retrieve element's data in auto-matching
         otherwise the selector will be used.
        :param auto_save: Automatically save new elements for `auto_match` later
        :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
         Be aware that the percentage calculation depends solely on the page structure so don't play with this
         number unless you must know what you are doing!

        :return: List as :class:`Adaptors`
        """
        results = [
            n.css(selector, identifier or selector, False, auto_save, percentage) for n in self
        ]
        return self.__class__(flatten(results))

    def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True) -> 'List[str]':
        """Call the ``.re()`` method on the text of each element in this list and return
        their results flattened as one list.

        :param regex: Can be either a compiled regular expression or a string.
        :param replace_entities: if enabled character entity references are replaced by their corresponding character
        """
        results = [
            n.text.re(regex, replace_entities) for n in self
        ]
        return flatten(results)

    def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True):
        """Call the ``.re_first()`` method on the text of each element in this list and return
        the outcomes as a list holding one entry per element.

        :param regex: Can be either a compiled regular expression or a string.
        :param default: The default value to be used for an element if there is no match
        :param replace_entities: if enabled character entity references are replaced by their corresponding character

        :return: List with the ``.re_first()`` outcome of every element, in the same order as the elements
        """
        # Every entry here is already a single value (a match or `default`), not a list, so the
        # results are returned as they are. NOTE(review): `flatten` presumably unpacks one level
        # of iterables; that would split each matched string into characters and fail on a
        # `None` default - confirm against `scrapling.utils.flatten`.
        return [
            n.text.re_first(regex, default, replace_entities) for n in self
        ]

    def get(self, default=None):
        """Returns the first item of the current list
        :param default: the default value to return if the current list is empty
        """
        return self[0] if len(self) > 0 else default

    @property
    def first(self):
        """Returns the first item of the current list or `None` if the list is empty"""
        return self.get()

    @property
    def last(self):
        """Returns the last item of the current list or `None` if the list is empty"""
        return self[-1] if len(self) > 0 else None

    def __getstate__(self) -> Any:
        """Always raises ``TypeError`` so the list can't be pickled"""
        # lxml don't like it :)
        raise TypeError("Can't pickle Adaptors object")