scrapling 0.2.92__py3-none-any.whl → 0.2.93__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- scrapling/__init__.py +1 -1
- scrapling/core/_types.py +2 -1
- scrapling/core/custom_types.py +91 -39
- scrapling/core/translator.py +1 -1
- scrapling/defaults.py +8 -5
- scrapling/engines/camo.py +6 -2
- scrapling/engines/pw.py +1 -1
- scrapling/fetchers.py +5 -5
- scrapling/parser.py +153 -189
- {scrapling-0.2.92.dist-info → scrapling-0.2.93.dist-info}/METADATA +58 -32
- {scrapling-0.2.92.dist-info → scrapling-0.2.93.dist-info}/RECORD +17 -17
- {scrapling-0.2.92.dist-info → scrapling-0.2.93.dist-info}/WHEEL +1 -1
- tests/fetchers/async/test_playwright.py +1 -1
- tests/fetchers/sync/test_playwright.py +1 -1
- {scrapling-0.2.92.dist-info → scrapling-0.2.93.dist-info}/LICENSE +0 -0
- {scrapling-0.2.92.dist-info → scrapling-0.2.93.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.92.dist-info → scrapling-0.2.93.dist-info}/top_level.txt +0 -0
scrapling/parser.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import inspect
|
2
2
|
import os
|
3
3
|
import re
|
4
|
+
import typing
|
4
5
|
from difflib import SequenceMatcher
|
5
6
|
from urllib.parse import urljoin
|
6
7
|
|
@@ -145,47 +146,46 @@ class Adaptor(SelectorsGeneration):
|
|
145
146
|
# Faster than checking `element.is_attribute or element.is_text or element.is_tail`
|
146
147
|
return issubclass(type(element), etree._ElementUnicodeResult)
|
147
148
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
149
|
+
@staticmethod
|
150
|
+
def __content_convertor(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> TextHandler:
|
151
|
+
"""Used internally to convert a single element's text content to TextHandler directly without checks
|
152
|
+
|
153
|
+
This one-liner is isolated in its own function so that, when it is passed to `map`, we get a slight performance boost over a list comprehension
|
154
|
+
"""
|
155
|
+
return TextHandler(str(element))
|
156
|
+
|
157
|
+
def __element_convertor(self, element: html.HtmlElement) -> 'Adaptor':
|
158
|
+
"""Used internally to convert a single HtmlElement to Adaptor directly without checks"""
|
159
|
+
return Adaptor(
|
160
|
+
root=element,
|
161
|
+
text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
|
162
|
+
url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
|
163
|
+
keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
|
164
|
+
huge_tree=self.__huge_tree_enabled,
|
165
|
+
**self.__response_data
|
166
|
+
)
|
167
|
+
|
168
|
+
def __handle_element(self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> Union[TextHandler, 'Adaptor', None]:
|
169
|
+
"""Used internally in all functions to convert a single element to type (Adaptor|TextHandler) when possible"""
|
170
|
+
if element is None:
|
171
|
+
return None
|
172
|
+
elif self._is_text_node(element):
|
153
173
|
# etree._ElementUnicodeResult basically inherits from `str` so it's fine
|
154
|
-
return
|
174
|
+
return self.__content_convertor(element)
|
155
175
|
else:
|
156
|
-
|
157
|
-
|
158
|
-
return Adaptor(
|
159
|
-
root=element,
|
160
|
-
text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
|
161
|
-
url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
|
162
|
-
keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
|
163
|
-
huge_tree=self.__huge_tree_enabled,
|
164
|
-
**self.__response_data
|
165
|
-
)
|
166
|
-
return element
|
176
|
+
return self.__element_convertor(element)
|
167
177
|
|
168
|
-
def
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
if result is None:
|
173
|
-
return None
|
174
|
-
elif result == []: # Lxml will give a warning if I used something like `not result`
|
175
|
-
return []
|
176
|
-
|
177
|
-
if isinstance(result, Adaptors):
|
178
|
-
return result
|
179
|
-
|
180
|
-
if type(result) is list:
|
181
|
-
results = [self.__get_correct_result(n) for n in result]
|
182
|
-
if all(isinstance(res, self.__class__) for res in results):
|
183
|
-
return Adaptors(results)
|
184
|
-
elif all(isinstance(res, TextHandler) for res in results):
|
185
|
-
return TextHandlers(results)
|
186
|
-
return results
|
178
|
+
def __handle_elements(self, result: List[Union[html.HtmlElement, etree._ElementUnicodeResult]]) -> Union['Adaptors', 'TextHandlers', List]:
|
179
|
+
"""Used internally in all functions to convert results to type (Adaptors|TextHandlers) in bulk when possible"""
|
180
|
+
if not len(result): # Lxml will give a warning if I used something like `not result`
|
181
|
+
return Adaptors([])
|
187
182
|
|
188
|
-
|
183
|
+
# From within the code, this method will always get a list of the same type
|
184
|
+
# so we will continue without checks for slight performance boost
|
185
|
+
if self._is_text_node(result[0]):
|
186
|
+
return TextHandlers(list(map(self.__content_convertor, result)))
|
187
|
+
|
188
|
+
return Adaptors(list(map(self.__element_convertor, result)))
|
189
189
|
|
190
190
|
def __getstate__(self) -> Any:
|
191
191
|
# lxml doesn't like it :)
|
@@ -223,29 +223,16 @@ class Adaptor(SelectorsGeneration):
|
|
223
223
|
:return: A TextHandler
|
224
224
|
"""
|
225
225
|
_all_strings = []
|
226
|
-
|
227
|
-
def _traverse(node: html.HtmlElement) -> None:
|
228
|
-
"""Traverse element children and get text content of each
|
229
|
-
|
230
|
-
:param node: Current node in the tree structure
|
231
|
-
:return:
|
232
|
-
"""
|
226
|
+
for node in self._root.xpath('.//*'):
|
233
227
|
if node.tag not in ignore_tags:
|
234
228
|
text = node.text
|
235
229
|
if text and type(text) is str:
|
236
|
-
if valid_values:
|
237
|
-
if text.strip()
|
238
|
-
_all_strings.append(text if not strip else text.strip())
|
230
|
+
if valid_values and text.strip():
|
231
|
+
_all_strings.append(text if not strip else text.strip())
|
239
232
|
else:
|
240
233
|
_all_strings.append(text if not strip else text.strip())
|
241
234
|
|
242
|
-
|
243
|
-
_traverse(branch)
|
244
|
-
|
245
|
-
# We will start using Lxml directly for the speed boost
|
246
|
-
_traverse(self._root)
|
247
|
-
|
248
|
-
return TextHandler(separator.join([s for s in _all_strings]))
|
235
|
+
return TextHandler(separator.join(_all_strings))
|
249
236
|
|
250
237
|
def urljoin(self, relative_url: str) -> str:
|
251
238
|
"""Join this Adaptor's url with a relative url to form an absolute full URL."""
|
@@ -259,18 +246,18 @@ class Adaptor(SelectorsGeneration):
|
|
259
246
|
return self.__attributes
|
260
247
|
|
261
248
|
@property
|
262
|
-
def html_content(self) ->
|
249
|
+
def html_content(self) -> TextHandler:
|
263
250
|
"""Return the inner html code of the element"""
|
264
|
-
return etree.tostring(self._root, encoding='unicode', method='html', with_tail=False)
|
251
|
+
return TextHandler(etree.tostring(self._root, encoding='unicode', method='html', with_tail=False))
|
265
252
|
|
266
253
|
@property
|
267
|
-
def body(self) ->
|
254
|
+
def body(self) -> TextHandler:
|
268
255
|
"""Return raw HTML code of the element/page without any processing when possible or return `Adaptor.html_content`"""
|
269
|
-
return self.__raw_body or self.html_content
|
256
|
+
return TextHandler(self.__raw_body) or self.html_content
|
270
257
|
|
271
|
-
def prettify(self) ->
|
258
|
+
def prettify(self) -> TextHandler:
|
272
259
|
"""Return a prettified version of the element's inner html-code"""
|
273
|
-
return etree.tostring(self._root, encoding='unicode', pretty_print=True, method='html', with_tail=False)
|
260
|
+
return TextHandler(etree.tostring(self._root, encoding='unicode', pretty_print=True, method='html', with_tail=False))
|
274
261
|
|
275
262
|
def has_class(self, class_name: str) -> bool:
|
276
263
|
"""Check if element has a specific class
|
@@ -282,26 +269,32 @@ class Adaptor(SelectorsGeneration):
|
|
282
269
|
@property
|
283
270
|
def parent(self) -> Union['Adaptor', None]:
|
284
271
|
"""Return the direct parent of the element or ``None`` otherwise"""
|
285
|
-
return self.
|
272
|
+
return self.__handle_element(self._root.getparent())
|
286
273
|
|
287
274
|
@property
|
288
|
-
def
|
275
|
+
def below_elements(self) -> 'Adaptors[Adaptor]':
|
276
|
+
"""Return all elements under the current element in the DOM tree"""
|
277
|
+
below = self._root.xpath('.//*')
|
278
|
+
return self.__handle_elements(below)
|
279
|
+
|
280
|
+
@property
|
281
|
+
def children(self) -> 'Adaptors[Adaptor]':
|
289
282
|
"""Return the children elements of the current element or empty list otherwise"""
|
290
|
-
return
|
291
|
-
child for child in self._root.iterchildren() if type(child) not in html_forbidden
|
292
|
-
)
|
283
|
+
return Adaptors([
|
284
|
+
self.__element_convertor(child) for child in self._root.iterchildren() if type(child) not in html_forbidden
|
285
|
+
])
|
293
286
|
|
294
287
|
@property
|
295
|
-
def siblings(self) ->
|
288
|
+
def siblings(self) -> 'Adaptors[Adaptor]':
|
296
289
|
"""Return other children of the current element's parent or empty list otherwise"""
|
297
290
|
if self.parent:
|
298
291
|
return Adaptors([child for child in self.parent.children if child._root != self._root])
|
299
|
-
return []
|
292
|
+
return Adaptors([])
|
300
293
|
|
301
294
|
def iterancestors(self) -> Generator['Adaptor', None, None]:
|
302
295
|
"""Return a generator that loops over all ancestors of the element, starting with element's parent."""
|
303
296
|
for ancestor in self._root.iterancestors():
|
304
|
-
yield self.
|
297
|
+
yield self.__element_convertor(ancestor)
|
305
298
|
|
306
299
|
def find_ancestor(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
|
307
300
|
"""Loop over all ancestors of the element till one match the passed function
|
@@ -328,7 +321,7 @@ class Adaptor(SelectorsGeneration):
|
|
328
321
|
# Ignore html comments and unwanted types
|
329
322
|
next_element = next_element.getnext()
|
330
323
|
|
331
|
-
return self.
|
324
|
+
return self.__handle_element(next_element)
|
332
325
|
|
333
326
|
@property
|
334
327
|
def previous(self) -> Union['Adaptor', None]:
|
@@ -339,7 +332,7 @@ class Adaptor(SelectorsGeneration):
|
|
339
332
|
# Ignore html comments and unwanted types
|
340
333
|
prev_element = prev_element.getprevious()
|
341
334
|
|
342
|
-
return self.
|
335
|
+
return self.__handle_element(prev_element)
|
343
336
|
|
344
337
|
# For easy copy-paste from Scrapy/parsel code when needed :)
|
345
338
|
def get(self, default=None):
|
@@ -392,34 +385,26 @@ class Adaptor(SelectorsGeneration):
|
|
392
385
|
if issubclass(type(element), html.HtmlElement):
|
393
386
|
element = _StorageTools.element_to_dict(element)
|
394
387
|
|
395
|
-
|
396
|
-
|
397
|
-
"""Get the matching score of the given element against the node then traverse the children
|
398
|
-
|
399
|
-
:param node: Current node in the tree structure
|
400
|
-
:param ele: The element we are searching for as dictionary
|
401
|
-
:return:
|
402
|
-
"""
|
388
|
+
for node in self._root.xpath('.//*'):
|
389
|
+
# Collect all elements in the page then for each element get the matching score of it against the node.
|
403
390
|
# Hence: the code doesn't stop even if the score was 100%
|
404
391
|
# because there might be another element(s) left in page with the same score
|
405
|
-
score = self.__calculate_similarity_score(
|
392
|
+
score = self.__calculate_similarity_score(element, node)
|
406
393
|
score_table.setdefault(score, []).append(node)
|
407
|
-
for branch in node.iterchildren():
|
408
|
-
_traverse(branch, ele)
|
409
|
-
|
410
|
-
# This will block until we traverse all children/branches
|
411
|
-
_traverse(self._root, element)
|
412
394
|
|
413
395
|
if score_table:
|
414
396
|
highest_probability = max(score_table.keys())
|
415
397
|
if score_table[highest_probability] and highest_probability >= percentage:
|
416
|
-
log.
|
417
|
-
|
418
|
-
|
419
|
-
log.debug(
|
398
|
+
if log.getEffectiveLevel() < 20:
|
399
|
+
# No need to execute this part if logging level is not debugging
|
400
|
+
log.debug(f'Highest probability was {highest_probability}%')
|
401
|
+
log.debug('Top 5 best matching elements are: ')
|
402
|
+
for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
|
403
|
+
log.debug(f'{percent} -> {self.__handle_elements(score_table[percent])}')
|
404
|
+
|
420
405
|
if not adaptor_type:
|
421
406
|
return score_table[highest_probability]
|
422
|
-
return self.
|
407
|
+
return self.__handle_elements(score_table[highest_probability])
|
423
408
|
return []
|
424
409
|
|
425
410
|
def css_first(self, selector: str, identifier: str = '',
|
@@ -439,8 +424,6 @@ class Adaptor(SelectorsGeneration):
|
|
439
424
|
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
440
425
|
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
441
426
|
number unless you really know what you are doing!
|
442
|
-
|
443
|
-
:return: List as :class:`Adaptors`
|
444
427
|
"""
|
445
428
|
for element in self.css(selector, identifier, auto_match, auto_save, percentage):
|
446
429
|
return element
|
@@ -465,8 +448,6 @@ class Adaptor(SelectorsGeneration):
|
|
465
448
|
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
466
449
|
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
467
450
|
number unless you really know what you are doing!
|
468
|
-
|
469
|
-
:return: List as :class:`Adaptors`
|
470
451
|
"""
|
471
452
|
for element in self.xpath(selector, identifier, auto_match, auto_save, percentage, **kwargs):
|
472
453
|
return element
|
@@ -493,7 +474,7 @@ class Adaptor(SelectorsGeneration):
|
|
493
474
|
:return: List as :class:`Adaptors`
|
494
475
|
"""
|
495
476
|
try:
|
496
|
-
if not self.__auto_match_enabled:
|
477
|
+
if not self.__auto_match_enabled or ',' not in selector:
|
497
478
|
# No need to split selectors in this case, let's save some CPU cycles :)
|
498
479
|
xpath_selector = HTMLTranslator().css_to_xpath(selector)
|
499
480
|
return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)
|
@@ -507,11 +488,8 @@ class Adaptor(SelectorsGeneration):
|
|
507
488
|
results += self.xpath(
|
508
489
|
xpath_selector, identifier or single_selector.canonical(), auto_match, auto_save, percentage
|
509
490
|
)
|
510
|
-
else:
|
511
|
-
xpath_selector = HTMLTranslator().css_to_xpath(selector)
|
512
|
-
return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)
|
513
491
|
|
514
|
-
return
|
492
|
+
return results
|
515
493
|
except (SelectorError, SelectorSyntaxError,):
|
516
494
|
raise SelectorSyntaxError(f"Invalid CSS selector: {selector}")
|
517
495
|
|
@@ -538,37 +516,37 @@ class Adaptor(SelectorsGeneration):
|
|
538
516
|
:return: List as :class:`Adaptors`
|
539
517
|
"""
|
540
518
|
try:
|
541
|
-
|
519
|
+
elements = self._root.xpath(selector, **kwargs)
|
542
520
|
|
543
|
-
if
|
544
|
-
if
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
521
|
+
if elements:
|
522
|
+
if auto_save:
|
523
|
+
if not self.__auto_match_enabled:
|
524
|
+
log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
|
525
|
+
else:
|
526
|
+
self.save(elements[0], identifier or selector)
|
549
527
|
|
550
|
-
return self.
|
551
|
-
|
552
|
-
if
|
528
|
+
return self.__handle_elements(elements)
|
529
|
+
elif self.__auto_match_enabled:
|
530
|
+
if auto_match:
|
553
531
|
element_data = self.retrieve(identifier or selector)
|
554
532
|
if element_data:
|
555
|
-
|
556
|
-
if
|
557
|
-
self.save(
|
558
|
-
|
559
|
-
return self.__convert_results(relocated)
|
560
|
-
else:
|
561
|
-
return self.__convert_results(selected_elements)
|
533
|
+
elements = self.relocate(element_data, percentage)
|
534
|
+
if elements is not None and auto_save:
|
535
|
+
self.save(elements[0], identifier or selector)
|
562
536
|
|
563
|
-
|
537
|
+
return self.__handle_elements(elements)
|
538
|
+
else:
|
539
|
+
if auto_match:
|
564
540
|
log.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
|
541
|
+
elif auto_save:
|
542
|
+
log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
|
565
543
|
|
566
|
-
return self.
|
544
|
+
return self.__handle_elements(elements)
|
567
545
|
|
568
546
|
except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
|
569
547
|
raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
|
570
548
|
|
571
|
-
def find_all(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) ->
|
549
|
+
def find_all(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> 'Adaptors':
|
572
550
|
"""Find elements by filters of your creations for ease..
|
573
551
|
|
574
552
|
:param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
@@ -588,15 +566,7 @@ class Adaptor(SelectorsGeneration):
|
|
588
566
|
|
589
567
|
attributes = dict()
|
590
568
|
tags, patterns = set(), set()
|
591
|
-
results, functions, selectors = [], [], []
|
592
|
-
|
593
|
-
def _search_tree(element: Adaptor, filter_function: Callable) -> None:
|
594
|
-
"""Collect element if it fulfills passed function otherwise, traverse the children tree and iterate"""
|
595
|
-
if filter_function(element):
|
596
|
-
results.append(element)
|
597
|
-
|
598
|
-
for branch in element.children:
|
599
|
-
_search_tree(branch, filter_function)
|
569
|
+
results, functions, selectors = Adaptors([]), [], []
|
600
570
|
|
601
571
|
# Brace yourself for a wonderful journey!
|
602
572
|
for arg in args:
|
@@ -608,12 +578,12 @@ class Adaptor(SelectorsGeneration):
|
|
608
578
|
raise TypeError('Nested Iterables are not accepted, only iterables of tag names are accepted')
|
609
579
|
tags.update(set(arg))
|
610
580
|
|
611
|
-
elif
|
581
|
+
elif isinstance(arg, dict):
|
612
582
|
if not all([(type(k) is str and type(v) is str) for k, v in arg.items()]):
|
613
583
|
raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
|
614
584
|
attributes.update(arg)
|
615
585
|
|
616
|
-
elif
|
586
|
+
elif isinstance(arg, re.Pattern):
|
617
587
|
patterns.add(arg)
|
618
588
|
|
619
589
|
elif callable(arg):
|
@@ -634,14 +604,14 @@ class Adaptor(SelectorsGeneration):
|
|
634
604
|
attributes[attribute_name] = value
|
635
605
|
|
636
606
|
# It's easier and faster to build a selector than traversing the tree
|
637
|
-
tags = tags or ['']
|
607
|
+
tags = tags or ['*']
|
638
608
|
for tag in tags:
|
639
609
|
selector = tag
|
640
610
|
for key, value in attributes.items():
|
641
611
|
value = value.replace('"', r'\"') # Escape double quotes in user input
|
642
612
|
# Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
|
643
613
|
selector += '[{}="{}"]'.format(key, value)
|
644
|
-
if selector:
|
614
|
+
if selector != '*':
|
645
615
|
selectors.append(selector)
|
646
616
|
|
647
617
|
if selectors:
|
@@ -655,14 +625,15 @@ class Adaptor(SelectorsGeneration):
|
|
655
625
|
for function in functions:
|
656
626
|
results = results.filter(function)
|
657
627
|
else:
|
628
|
+
results = results or self.below_elements
|
658
629
|
for pattern in patterns:
|
659
|
-
results.
|
630
|
+
results = results.filter(lambda e: e.text.re(pattern, check_match=True))
|
660
631
|
|
661
|
-
|
662
|
-
|
663
|
-
|
632
|
+
# Keep only the elements that fulfill every passed function
|
633
|
+
for function in functions:
|
634
|
+
results = results.filter(function)
|
664
635
|
|
665
|
-
return
|
636
|
+
return results
|
666
637
|
|
667
638
|
def find(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptor', None]:
|
668
639
|
"""Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
|
@@ -792,7 +763,7 @@ class Adaptor(SelectorsGeneration):
|
|
792
763
|
return self.get_all_text(strip=True).json()
|
793
764
|
|
794
765
|
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
|
795
|
-
clean_match: bool = False, case_sensitive: bool = False) ->
|
766
|
+
clean_match: bool = False, case_sensitive: bool = False) -> TextHandlers:
|
796
767
|
"""Apply the given regex to the current text and return a list of strings with the matches.
|
797
768
|
|
798
769
|
:param regex: Can be either a compiled regular expression or a string.
|
@@ -803,7 +774,7 @@ class Adaptor(SelectorsGeneration):
|
|
803
774
|
return self.text.re(regex, replace_entities, clean_match, case_sensitive)
|
804
775
|
|
805
776
|
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
|
806
|
-
clean_match: bool = False, case_sensitive: bool = False) ->
|
777
|
+
clean_match: bool = False, case_sensitive: bool = False) -> TextHandler:
|
807
778
|
"""Apply the given regex to text and return the first match if found, otherwise return the default value.
|
808
779
|
|
809
780
|
:param regex: Can be either a compiled regular expression or a string.
|
@@ -894,12 +865,12 @@ class Adaptor(SelectorsGeneration):
|
|
894
865
|
if potential_match != root and are_alike(root, target_attrs, potential_match):
|
895
866
|
similar_elements.append(potential_match)
|
896
867
|
|
897
|
-
return self.
|
868
|
+
return self.__handle_elements(similar_elements)
|
898
869
|
|
899
870
|
def find_by_text(
|
900
871
|
self, text: str, first_match: bool = True, partial: bool = False,
|
901
872
|
case_sensitive: bool = False, clean_match: bool = True
|
902
|
-
) -> Union['Adaptors[Adaptor]', 'Adaptor'
|
873
|
+
) -> Union['Adaptors[Adaptor]', 'Adaptor']:
|
903
874
|
"""Find elements that its text content fully/partially matches input.
|
904
875
|
:param text: Text query to match
|
905
876
|
:param first_match: Return first element that matches conditions, enabled by default
|
@@ -908,74 +879,60 @@ class Adaptor(SelectorsGeneration):
|
|
908
879
|
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
909
880
|
"""
|
910
881
|
|
911
|
-
results = []
|
882
|
+
results = Adaptors([])
|
912
883
|
if not case_sensitive:
|
913
884
|
text = text.lower()
|
914
885
|
|
915
|
-
|
886
|
+
# This selector gets all elements with text content
|
887
|
+
for node in self.__handle_elements(self._root.xpath('.//*[normalize-space(text())]')):
|
916
888
|
"""Check if element matches given text otherwise, traverse the children tree and iterate"""
|
917
889
|
node_text = node.text
|
918
|
-
|
919
|
-
|
920
|
-
if clean_match:
|
921
|
-
node_text = node_text.clean()
|
922
|
-
|
923
|
-
if not case_sensitive:
|
924
|
-
node_text = node_text.lower()
|
925
|
-
|
926
|
-
if partial:
|
927
|
-
if text in node_text:
|
928
|
-
results.append(node)
|
929
|
-
elif text == node_text:
|
930
|
-
results.append(node)
|
890
|
+
if clean_match:
|
891
|
+
node_text = node_text.clean()
|
931
892
|
|
932
|
-
if
|
933
|
-
|
934
|
-
return
|
893
|
+
if not case_sensitive:
|
894
|
+
node_text = node_text.lower()
|
935
895
|
|
936
|
-
|
937
|
-
|
896
|
+
if partial:
|
897
|
+
if text in node_text:
|
898
|
+
results.append(node)
|
899
|
+
elif text == node_text:
|
900
|
+
results.append(node)
|
938
901
|
|
939
|
-
|
940
|
-
|
902
|
+
if first_match and results:
|
903
|
+
# we got an element so we should stop
|
904
|
+
break
|
941
905
|
|
942
906
|
if first_match:
|
943
907
|
if results:
|
944
908
|
return results[0]
|
945
|
-
return
|
909
|
+
return results
|
946
910
|
|
947
911
|
def find_by_regex(
|
948
912
|
self, query: Union[str, Pattern[str]], first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
|
949
|
-
) -> Union['Adaptors[Adaptor]', 'Adaptor'
|
913
|
+
) -> Union['Adaptors[Adaptor]', 'Adaptor']:
|
950
914
|
"""Find elements that its text content matches the input regex pattern.
|
951
915
|
:param query: Regex query/pattern to match
|
952
916
|
:param first_match: Return first element that matches conditions, enabled by default
|
953
917
|
:param case_sensitive: if enabled, letters case will be taken into consideration in the regex
|
954
918
|
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
955
919
|
"""
|
956
|
-
results = []
|
920
|
+
results = Adaptors([])
|
957
921
|
|
958
|
-
|
922
|
+
# This selector gets all elements with text content
|
923
|
+
for node in self.__handle_elements(self._root.xpath('.//*[normalize-space(text())]')):
|
959
924
|
"""Check if element matches given regex otherwise, traverse the children tree and iterate"""
|
960
925
|
node_text = node.text
|
961
|
-
|
962
|
-
|
963
|
-
if node_text.re(query, check_match=True, clean_match=clean_match, case_sensitive=case_sensitive):
|
964
|
-
results.append(node)
|
926
|
+
if node_text.re(query, check_match=True, clean_match=clean_match, case_sensitive=case_sensitive):
|
927
|
+
results.append(node)
|
965
928
|
|
966
|
-
if
|
929
|
+
if first_match and results:
|
967
930
|
# we got an element so we should stop
|
968
|
-
|
969
|
-
|
970
|
-
for branch in node.children:
|
971
|
-
_traverse(branch)
|
972
|
-
|
973
|
-
# This will block until we traverse all children/branches
|
974
|
-
_traverse(self)
|
931
|
+
break
|
975
932
|
|
976
933
|
if results and first_match:
|
977
934
|
return results[0]
|
978
|
-
return
|
935
|
+
return results
|
979
936
|
|
980
937
|
|
981
938
|
class Adaptors(List[Adaptor]):
|
@@ -984,7 +941,15 @@ class Adaptors(List[Adaptor]):
|
|
984
941
|
"""
|
985
942
|
__slots__ = ()
|
986
943
|
|
987
|
-
|
944
|
+
@typing.overload
|
945
|
+
def __getitem__(self, pos: SupportsIndex) -> Adaptor:
|
946
|
+
pass
|
947
|
+
|
948
|
+
@typing.overload
|
949
|
+
def __getitem__(self, pos: slice) -> "Adaptors":
|
950
|
+
pass
|
951
|
+
|
952
|
+
def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[Adaptor, "Adaptors"]:
|
988
953
|
lst = super().__getitem__(pos)
|
989
954
|
if isinstance(pos, slice):
|
990
955
|
return self.__class__(lst)
|
@@ -993,7 +958,7 @@ class Adaptors(List[Adaptor]):
|
|
993
958
|
|
994
959
|
def xpath(
|
995
960
|
self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0, **kwargs: Any
|
996
|
-
) ->
|
961
|
+
) -> "Adaptors[Adaptor]":
|
997
962
|
"""
|
998
963
|
Call the ``.xpath()`` method for each element in this list and return
|
999
964
|
their results as another :class:`Adaptors`.
|
@@ -1019,7 +984,7 @@ class Adaptors(List[Adaptor]):
|
|
1019
984
|
]
|
1020
985
|
return self.__class__(flatten(results))
|
1021
986
|
|
1022
|
-
def css(self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0) ->
|
987
|
+
def css(self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0) -> "Adaptors[Adaptor]":
|
1023
988
|
"""
|
1024
989
|
Call the ``.css()`` method for each element in this list and return
|
1025
990
|
their results flattened as another :class:`Adaptors`.
|
@@ -1044,7 +1009,7 @@ class Adaptors(List[Adaptor]):
|
|
1044
1009
|
return self.__class__(flatten(results))
|
1045
1010
|
|
1046
1011
|
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
|
1047
|
-
clean_match: bool = False, case_sensitive: bool = False) ->
|
1012
|
+
clean_match: bool = False, case_sensitive: bool = False) -> TextHandlers[TextHandler]:
|
1048
1013
|
"""Call the ``.re()`` method for each element in this list and return
|
1049
1014
|
their results flattened as List of TextHandler.
|
1050
1015
|
|
@@ -1056,10 +1021,10 @@ class Adaptors(List[Adaptor]):
|
|
1056
1021
|
results = [
|
1057
1022
|
n.text.re(regex, replace_entities, clean_match, case_sensitive) for n in self
|
1058
1023
|
]
|
1059
|
-
return flatten(results)
|
1024
|
+
return TextHandlers(flatten(results))
|
1060
1025
|
|
1061
1026
|
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
|
1062
|
-
clean_match: bool = False, case_sensitive: bool = False) ->
|
1027
|
+
clean_match: bool = False, case_sensitive: bool = False) -> TextHandler:
|
1063
1028
|
"""Call the ``.re_first()`` method for each element in this list and return
|
1064
1029
|
the first result or the default value otherwise.
|
1065
1030
|
|
@@ -1084,15 +1049,14 @@ class Adaptors(List[Adaptor]):
|
|
1084
1049
|
return element
|
1085
1050
|
return None
|
1086
1051
|
|
1087
|
-
def filter(self, func: Callable[['Adaptor'], bool]) ->
|
1052
|
+
def filter(self, func: Callable[['Adaptor'], bool]) -> 'Adaptors[Adaptor]':
|
1088
1053
|
"""Filter current elements based on the passed function
|
1089
1054
|
:param func: A function that takes each element as an argument and returns True/False
|
1090
1055
|
:return: A new `Adaptors` object holding the matching elements (empty if none match).
|
1091
1056
|
"""
|
1092
|
-
|
1057
|
+
return self.__class__([
|
1093
1058
|
element for element in self if func(element)
|
1094
|
-
]
|
1095
|
-
return self.__class__(results) if results else results
|
1059
|
+
])
|
1096
1060
|
|
1097
1061
|
# For easy copy-paste from Scrapy/parsel code when needed :)
|
1098
1062
|
def get(self, default=None):
|