scrapling 0.2.92__py3-none-any.whl → 0.2.93__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/core/_types.py +2 -1
- scrapling/core/custom_types.py +91 -39
- scrapling/core/translator.py +1 -1
- scrapling/defaults.py +8 -5
- scrapling/engines/camo.py +6 -2
- scrapling/engines/pw.py +1 -1
- scrapling/fetchers.py +5 -5
- scrapling/parser.py +153 -189
- {scrapling-0.2.92.dist-info → scrapling-0.2.93.dist-info}/METADATA +58 -32
- {scrapling-0.2.92.dist-info → scrapling-0.2.93.dist-info}/RECORD +17 -17
- {scrapling-0.2.92.dist-info → scrapling-0.2.93.dist-info}/WHEEL +1 -1
- tests/fetchers/async/test_playwright.py +1 -1
- tests/fetchers/sync/test_playwright.py +1 -1
- {scrapling-0.2.92.dist-info → scrapling-0.2.93.dist-info}/LICENSE +0 -0
- {scrapling-0.2.92.dist-info → scrapling-0.2.93.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.92.dist-info → scrapling-0.2.93.dist-info}/top_level.txt +0 -0
scrapling/parser.py
CHANGED
@@ -1,6 +1,7 @@
 import inspect
 import os
 import re
+import typing
 from difflib import SequenceMatcher
 from urllib.parse import urljoin
 
@@ -145,47 +146,46 @@ class Adaptor(SelectorsGeneration):
         # Faster than checking `element.is_attribute or element.is_text or element.is_tail`
         return issubclass(type(element), etree._ElementUnicodeResult)
 
-
-
-
-
-
+    @staticmethod
+    def __content_convertor(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> TextHandler:
+        """Used internally to convert a single element's text content to TextHandler directly without checks
+
+        This single line has been isolated like this so when it's used with map we get that slight performance boost vs list comprehension
+        """
+        return TextHandler(str(element))
+
+    def __element_convertor(self, element: html.HtmlElement) -> 'Adaptor':
+        """Used internally to convert a single HtmlElement to Adaptor directly without checks"""
+        return Adaptor(
+            root=element,
+            text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
+            url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
+            keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
+            huge_tree=self.__huge_tree_enabled,
+            **self.__response_data
+        )
+
+    def __handle_element(self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> Union[TextHandler, 'Adaptor', None]:
+        """Used internally in all functions to convert a single element to type (Adaptor|TextHandler) when possible"""
+        if element is None:
+            return None
+        elif self._is_text_node(element):
             # etree._ElementUnicodeResult basically inherit from `str` so it's fine
-            return
+            return self.__content_convertor(element)
         else:
-
-
-            return Adaptor(
-                root=element,
-                text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
-                url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
-                keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
-                huge_tree=self.__huge_tree_enabled,
-                **self.__response_data
-            )
-            return element
+            return self.__element_convertor(element)
 
-    def
-
-
-
-        if result is None:
-            return None
-        elif result == []:  # Lxml will give a warning if I used something like `not result`
-            return []
-
-        if isinstance(result, Adaptors):
-            return result
-
-        if type(result) is list:
-            results = [self.__get_correct_result(n) for n in result]
-            if all(isinstance(res, self.__class__) for res in results):
-                return Adaptors(results)
-            elif all(isinstance(res, TextHandler) for res in results):
-                return TextHandlers(results)
-            return results
+    def __handle_elements(self, result: List[Union[html.HtmlElement, etree._ElementUnicodeResult]]) -> Union['Adaptors', 'TextHandlers', List]:
+        """Used internally in all functions to convert results to type (Adaptors|TextHandlers) in bulk when possible"""
+        if not len(result):  # Lxml will give a warning if I used something like `not result`
+            return Adaptors([])
 
-
+        # From within the code, this method will always get a list of the same type
+        # so we will continue without checks for slight performance boost
+        if self._is_text_node(result[0]):
+            return TextHandlers(list(map(self.__content_convertor, result)))
+
+        return Adaptors(list(map(self.__element_convertor, result)))
 
     def __getstate__(self) -> Any:
         # lxml don't like it :)
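Note on the hunk above: the old `__get_correct_result`/`__convert_results` pair is replaced by small single-purpose converters so whole result lists can be converted with `map`. The docstring's claim that `map` plus a dedicated function is slightly faster than a list comprehension is workload-dependent; a rough standalone sketch (standard library only, not package code) of how one might check it:

```python
# Hypothetical micro-benchmark for the map-vs-comprehension note above.
# Numbers vary by Python version and input size; treat this as a sketch only.
import timeit

items = list(range(10_000))

map_time = timeit.timeit(lambda: list(map(str, items)), number=200)
comp_time = timeit.timeit(lambda: [str(x) for x in items], number=200)

print(f"list(map(str, items)):   {map_time:.3f}s")
print(f"[str(x) for x in items]: {comp_time:.3f}s")
```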
@@ -223,29 +223,16 @@ class Adaptor(SelectorsGeneration):
         :return: A TextHandler
         """
         _all_strings = []
-
-        def _traverse(node: html.HtmlElement) -> None:
-            """Traverse element children and get text content of each
-
-            :param node: Current node in the tree structure
-            :return:
-            """
+        for node in self._root.xpath('.//*'):
             if node.tag not in ignore_tags:
                 text = node.text
                 if text and type(text) is str:
-                    if valid_values:
-                        if text.strip()
-                            _all_strings.append(text if not strip else text.strip())
+                    if valid_values and text.strip():
+                        _all_strings.append(text if not strip else text.strip())
                     else:
                         _all_strings.append(text if not strip else text.strip())
 
-
-            _traverse(branch)
-
-        # We will start using Lxml directly for the speed boost
-        _traverse(self._root)
-
-        return TextHandler(separator.join([s for s in _all_strings]))
+        return TextHandler(separator.join(_all_strings))
 
     def urljoin(self, relative_url: str) -> str:
         """Join this Adaptor's url with a relative url to form an absolute full URL."""
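The `get_all_text` rewrite above drops the recursive `_traverse` helper and lets lxml enumerate descendants directly with `self._root.xpath('.//*')`. A minimal, self-contained sketch of the same pattern in plain lxml (the sample HTML and ignored tags are made up for illustration):

```python
# Minimal sketch: collect text from all descendant elements with one XPath call
# instead of a recursive Python traversal. Requires lxml.
from lxml import html

tree = html.fromstring('<div><p>Hello</p><script>skip()</script><span> world </span></div>')
ignore_tags = {'script', 'style'}

parts = []
for node in tree.xpath('.//*'):  # every descendant element, in document order
    text = node.text
    if node.tag not in ignore_tags and text and text.strip():
        parts.append(text.strip())

print(' '.join(parts))  # -> "Hello world"
```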
@@ -259,18 +246,18 @@ class Adaptor(SelectorsGeneration):
         return self.__attributes
 
     @property
-    def html_content(self) ->
+    def html_content(self) -> TextHandler:
         """Return the inner html code of the element"""
-        return etree.tostring(self._root, encoding='unicode', method='html', with_tail=False)
+        return TextHandler(etree.tostring(self._root, encoding='unicode', method='html', with_tail=False))
 
     @property
-    def body(self) ->
+    def body(self) -> TextHandler:
         """Return raw HTML code of the element/page without any processing when possible or return `Adaptor.html_content`"""
-        return self.__raw_body or self.html_content
+        return TextHandler(self.__raw_body) or self.html_content
 
-    def prettify(self) ->
+    def prettify(self) -> TextHandler:
         """Return a prettified version of the element's inner html-code"""
-        return etree.tostring(self._root, encoding='unicode', pretty_print=True, method='html', with_tail=False)
+        return TextHandler(etree.tostring(self._root, encoding='unicode', pretty_print=True, method='html', with_tail=False))
 
     def has_class(self, class_name: str) -> bool:
         """Check if element has a specific class
@@ -282,26 +269,32 @@ class Adaptor(SelectorsGeneration):
     @property
     def parent(self) -> Union['Adaptor', None]:
         """Return the direct parent of the element or ``None`` otherwise"""
-        return self.
+        return self.__handle_element(self._root.getparent())
 
     @property
-    def
+    def below_elements(self) -> 'Adaptors[Adaptor]':
+        """Return all elements under the current element in the DOM tree"""
+        below = self._root.xpath('.//*')
+        return self.__handle_elements(below)
+
+    @property
+    def children(self) -> 'Adaptors[Adaptor]':
         """Return the children elements of the current element or empty list otherwise"""
-        return
-            child for child in self._root.iterchildren() if type(child) not in html_forbidden
-        )
+        return Adaptors([
+            self.__element_convertor(child) for child in self._root.iterchildren() if type(child) not in html_forbidden
+        ])
 
     @property
-    def siblings(self) ->
+    def siblings(self) -> 'Adaptors[Adaptor]':
         """Return other children of the current element's parent or empty list otherwise"""
         if self.parent:
             return Adaptors([child for child in self.parent.children if child._root != self._root])
-        return []
+        return Adaptors([])
 
     def iterancestors(self) -> Generator['Adaptor', None, None]:
         """Return a generator that loops over all ancestors of the element, starting with element's parent."""
         for ancestor in self._root.iterancestors():
-            yield self.
+            yield self.__element_convertor(ancestor)
 
     def find_ancestor(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
         """Loop over all ancestors of the element till one match the passed function
|
|
328
321
|
# Ignore html comments and unwanted types
|
329
322
|
next_element = next_element.getnext()
|
330
323
|
|
331
|
-
return self.
|
324
|
+
return self.__handle_element(next_element)
|
332
325
|
|
333
326
|
@property
|
334
327
|
def previous(self) -> Union['Adaptor', None]:
|
@@ -339,7 +332,7 @@ class Adaptor(SelectorsGeneration):
|
|
339
332
|
# Ignore html comments and unwanted types
|
340
333
|
prev_element = prev_element.getprevious()
|
341
334
|
|
342
|
-
return self.
|
335
|
+
return self.__handle_element(prev_element)
|
343
336
|
|
344
337
|
# For easy copy-paste from Scrapy/parsel code when needed :)
|
345
338
|
def get(self, default=None):
|
@@ -392,34 +385,26 @@ class Adaptor(SelectorsGeneration):
         if issubclass(type(element), html.HtmlElement):
             element = _StorageTools.element_to_dict(element)
 
-
-
-            """Get the matching score of the given element against the node then traverse the children
-
-            :param node: Current node in the tree structure
-            :param ele: The element we are searching for as dictionary
-            :return:
-            """
+        for node in self._root.xpath('.//*'):
+            # Collect all elements in the page then for each element get the matching score of it against the node.
             # Hence: the code doesn't stop even if the score was 100%
             # because there might be another element(s) left in page with the same score
-            score = self.__calculate_similarity_score(
+            score = self.__calculate_similarity_score(element, node)
             score_table.setdefault(score, []).append(node)
-            for branch in node.iterchildren():
-                _traverse(branch, ele)
-
-        # This will block until we traverse all children/branches
-        _traverse(self._root, element)
 
         if score_table:
             highest_probability = max(score_table.keys())
             if score_table[highest_probability] and highest_probability >= percentage:
-                log.
-
-
-                log.debug(
+                if log.getEffectiveLevel() < 20:
+                    # No need to execute this part if logging level is not debugging
+                    log.debug(f'Highest probability was {highest_probability}%')
+                    log.debug('Top 5 best matching elements are: ')
+                    for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
+                        log.debug(f'{percent} -> {self.__handle_elements(score_table[percent])}')
+
                 if not adaptor_type:
                     return score_table[highest_probability]
-                return self.
+                return self.__handle_elements(score_table[highest_probability])
         return []
 
     def css_first(self, selector: str, identifier: str = '',
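In the hunk above, building the debug summary is now gated on `log.getEffectiveLevel() < 20`. In the standard `logging` module `DEBUG` is 10 and `INFO` is 20, so the gated block only runs when debug logging is actually enabled and the string-formatting cost is skipped otherwise. A standalone sketch of the same guard (the logger name is illustrative):

```python
# Sketch: skip debug-only work unless the effective level is below INFO (20).
import logging

log = logging.getLogger("example")  # illustrative name, not the package's logger
logging.basicConfig(level=logging.INFO)

assert logging.DEBUG == 10 and logging.INFO == 20

if log.getEffectiveLevel() < 20:  # same numeric check used in the diff
    expensive = ", ".join(str(i) for i in range(1000))
    log.debug("Top candidates: %s", expensive)
# log.isEnabledFor(logging.DEBUG) is the more idiomatic spelling of the same test.
```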
@@ -439,8 +424,6 @@ class Adaptor(SelectorsGeneration):
         :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
             Be aware that the percentage calculation depends solely on the page structure so don't play with this
             number unless you must know what you are doing!
-
-        :return: List as :class:`Adaptors`
         """
         for element in self.css(selector, identifier, auto_match, auto_save, percentage):
             return element
@@ -465,8 +448,6 @@ class Adaptor(SelectorsGeneration):
         :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
             Be aware that the percentage calculation depends solely on the page structure so don't play with this
             number unless you must know what you are doing!
-
-        :return: List as :class:`Adaptors`
         """
         for element in self.xpath(selector, identifier, auto_match, auto_save, percentage, **kwargs):
             return element
@@ -493,7 +474,7 @@ class Adaptor(SelectorsGeneration):
         :return: List as :class:`Adaptors`
         """
         try:
-            if not self.__auto_match_enabled:
+            if not self.__auto_match_enabled or ',' not in selector:
                 # No need to split selectors in this case, let's save some CPU cycles :)
                 xpath_selector = HTMLTranslator().css_to_xpath(selector)
                 return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)
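The `css()` change above also skips selector splitting when there is no comma in the selector: a single selector already translates to one XPath expression, so per-sub-selector handling (needed to save or auto-match each one under its own identifier) only matters for grouped selectors. A sketch using the `cssselect` package directly, assuming scrapling's `HTMLTranslator` translates groups the same way:

```python
# Sketch: a comma-grouped CSS selector becomes a single XPath union expression,
# so splitting is only useful when each sub-selector must be tracked separately.
from cssselect import HTMLTranslator

single = HTMLTranslator().css_to_xpath('a.link')
grouped = HTMLTranslator().css_to_xpath('a.link, p.intro')

print(single)   # one descendant-or-self::a[...] expression
print(grouped)  # the two translated expressions joined with " | "
```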
@@ -507,11 +488,8 @@ class Adaptor(SelectorsGeneration):
                     results += self.xpath(
                         xpath_selector, identifier or single_selector.canonical(), auto_match, auto_save, percentage
                     )
-            else:
-                xpath_selector = HTMLTranslator().css_to_xpath(selector)
-                return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)
 
-            return
+            return results
         except (SelectorError, SelectorSyntaxError,):
             raise SelectorSyntaxError(f"Invalid CSS selector: {selector}")
 
@@ -538,37 +516,37 @@ class Adaptor(SelectorsGeneration):
         :return: List as :class:`Adaptors`
         """
         try:
-
+            elements = self._root.xpath(selector, **kwargs)
 
-            if
-            if
-
-
-
-
+            if elements:
+                if auto_save:
+                    if not self.__auto_match_enabled:
+                        log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
+                    else:
+                        self.save(elements[0], identifier or selector)
 
-            return self.
-
-            if
+                return self.__handle_elements(elements)
+            elif self.__auto_match_enabled:
+                if auto_match:
                     element_data = self.retrieve(identifier or selector)
                     if element_data:
-
-                        if
-                        self.save(
-
-                        return self.__convert_results(relocated)
-                    else:
-                        return self.__convert_results(selected_elements)
+                        elements = self.relocate(element_data, percentage)
+                        if elements is not None and auto_save:
+                            self.save(elements[0], identifier or selector)
 
-
+                    return self.__handle_elements(elements)
+            else:
+                if auto_match:
                     log.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
+                elif auto_save:
+                    log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
 
-            return self.
+            return self.__handle_elements(elements)
 
         except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
             raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
 
-    def find_all(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) ->
+    def find_all(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> 'Adaptors':
         """Find elements by filters of your creations for ease..
 
         :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
@@ -588,15 +566,7 @@ class Adaptor(SelectorsGeneration):
 
         attributes = dict()
         tags, patterns = set(), set()
-        results, functions, selectors = [], [], []
-
-        def _search_tree(element: Adaptor, filter_function: Callable) -> None:
-            """Collect element if it fulfills passed function otherwise, traverse the children tree and iterate"""
-            if filter_function(element):
-                results.append(element)
-
-            for branch in element.children:
-                _search_tree(branch, filter_function)
+        results, functions, selectors = Adaptors([]), [], []
 
         # Brace yourself for a wonderful journey!
         for arg in args:
@@ -608,12 +578,12 @@ class Adaptor(SelectorsGeneration):
                     raise TypeError('Nested Iterables are not accepted, only iterables of tag names are accepted')
                 tags.update(set(arg))
 
-            elif
+            elif isinstance(arg, dict):
                 if not all([(type(k) is str and type(v) is str) for k, v in arg.items()]):
                     raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
                 attributes.update(arg)
 
-            elif
+            elif isinstance(arg, re.Pattern):
                 patterns.add(arg)
 
             elif callable(arg):
@@ -634,14 +604,14 @@ class Adaptor(SelectorsGeneration):
                 attributes[attribute_name] = value
 
         # It's easier and faster to build a selector than traversing the tree
-        tags = tags or ['']
+        tags = tags or ['*']
        for tag in tags:
             selector = tag
             for key, value in attributes.items():
                 value = value.replace('"', r'\"')  # Escape double quotes in user input
                 # Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
                 selector += '[{}="{}"]'.format(key, value)
-            if selector:
+            if selector != '*':
                 selectors.append(selector)
 
         if selectors:
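With the hunks above, `find_all` now defaults the tag to the universal selector `'*'` and appends one `[key="value"]` part per attribute filter, registering the selector only when it ends up more specific than a bare `'*'`. A plain-Python sketch of that selector-building step (the attribute values are illustrative):

```python
# Sketch of the selector-building step shown above; values are illustrative only.
tags = set()          # no tag names were passed in...
tags = tags or ['*']  # ...so fall back to the universal selector
attributes = {'class': 'product', 'href*': '/p/'}

selectors = []
for tag in tags:
    selector = tag
    for key, value in attributes.items():
        value = value.replace('"', r'\"')  # escape double quotes in user input
        selector += '[{}="{}"]'.format(key, value)
    if selector != '*':  # a bare '*' means "no filters at all", so skip it
        selectors.append(selector)

print(selectors)  # ['*[class="product"][href*="/p/"]']
```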
@@ -655,14 +625,15 @@ class Adaptor(SelectorsGeneration):
             for function in functions:
                 results = results.filter(function)
         else:
+            results = results or self.below_elements
             for pattern in patterns:
-                results.
+                results = results.filter(lambda e: e.text.re(pattern, check_match=True))
 
-
-
-
+            # Collect element if it fulfills passed function otherwise
+            for function in functions:
+                results = results.filter(function)
 
-        return
+        return results
 
     def find(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptor', None]:
         """Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
@@ -792,7 +763,7 @@ class Adaptor(SelectorsGeneration):
         return self.get_all_text(strip=True).json()
 
     def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
-           clean_match: bool = False, case_sensitive: bool = False) ->
+           clean_match: bool = False, case_sensitive: bool = False) -> TextHandlers:
         """Apply the given regex to the current text and return a list of strings with the matches.
 
         :param regex: Can be either a compiled regular expression or a string.
@@ -803,7 +774,7 @@ class Adaptor(SelectorsGeneration):
         return self.text.re(regex, replace_entities, clean_match, case_sensitive)
 
     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                 clean_match: bool = False, case_sensitive: bool = False) ->
+                 clean_match: bool = False, case_sensitive: bool = False) -> TextHandler:
         """Apply the given regex to text and return the first match if found, otherwise return the default value.
 
         :param regex: Can be either a compiled regular expression or a string.
@@ -894,12 +865,12 @@ class Adaptor(SelectorsGeneration):
             if potential_match != root and are_alike(root, target_attrs, potential_match):
                 similar_elements.append(potential_match)
 
-        return self.
+        return self.__handle_elements(similar_elements)
 
     def find_by_text(
             self, text: str, first_match: bool = True, partial: bool = False,
             case_sensitive: bool = False, clean_match: bool = True
-    ) -> Union['Adaptors[Adaptor]', 'Adaptor'
+    ) -> Union['Adaptors[Adaptor]', 'Adaptor']:
         """Find elements that its text content fully/partially matches input.
         :param text: Text query to match
         :param first_match: Return first element that matches conditions, enabled by default
@@ -908,74 +879,60 @@ class Adaptor(SelectorsGeneration):
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
         """
 
-        results = []
+        results = Adaptors([])
         if not case_sensitive:
             text = text.lower()
 
-
+        # This selector gets all elements with text content
+        for node in self.__handle_elements(self._root.xpath('.//*[normalize-space(text())]')):
             """Check if element matches given text otherwise, traverse the children tree and iterate"""
             node_text = node.text
-
-
-            if clean_match:
-                node_text = node_text.clean()
-
-            if not case_sensitive:
-                node_text = node_text.lower()
-
-            if partial:
-                if text in node_text:
-                    results.append(node)
-            elif text == node_text:
-                results.append(node)
+            if clean_match:
+                node_text = node_text.clean()
 
-            if
-
-            return
+            if not case_sensitive:
+                node_text = node_text.lower()
 
-
-
+            if partial:
+                if text in node_text:
+                    results.append(node)
+            elif text == node_text:
+                results.append(node)
 
-
-
+            if first_match and results:
+                # we got an element so we should stop
+                break
 
         if first_match:
             if results:
                 return results[0]
-        return
+        return results
 
     def find_by_regex(
             self, query: Union[str, Pattern[str]], first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
-    ) -> Union['Adaptors[Adaptor]', 'Adaptor'
+    ) -> Union['Adaptors[Adaptor]', 'Adaptor']:
         """Find elements that its text content matches the input regex pattern.
         :param query: Regex query/pattern to match
         :param first_match: Return first element that matches conditions, enabled by default
         :param case_sensitive: if enabled, letters case will be taken into consideration in the regex
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
         """
-        results = []
+        results = Adaptors([])
 
-
+        # This selector gets all elements with text content
+        for node in self.__handle_elements(self._root.xpath('.//*[normalize-space(text())]')):
             """Check if element matches given regex otherwise, traverse the children tree and iterate"""
             node_text = node.text
-
-
-            if node_text.re(query, check_match=True, clean_match=clean_match, case_sensitive=case_sensitive):
-                results.append(node)
+            if node_text.re(query, check_match=True, clean_match=clean_match, case_sensitive=case_sensitive):
+                results.append(node)
 
-            if
+            if first_match and results:
                 # we got an element so we should stop
-
-
-            for branch in node.children:
-                _traverse(branch)
-
-        # This will block until we traverse all children/branches
-        _traverse(self)
+                break
 
         if results and first_match:
             return results[0]
-        return
+        return results
 
 
 class Adaptors(List[Adaptor]):
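Both `find_by_text` and `find_by_regex` above now pre-filter candidates with the XPath `.//*[normalize-space(text())]`, i.e. only elements whose own text node is non-blank, instead of recursing over every node. A standalone lxml sketch of what that expression selects (the markup is made up):

```python
# Sketch: './/*[normalize-space(text())]' keeps only elements whose own text
# node is non-blank, which is the pre-filter both finders above rely on.
from lxml import html

tree = html.fromstring('<div><p>  </p><p>match me</p><span><b>nested</b></span></div>')

with_text = tree.xpath('.//*[normalize-space(text())]')
print([(el.tag, el.text.strip()) for el in with_text])
# -> [('p', 'match me'), ('b', 'nested')]
```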
@@ -984,7 +941,15 @@ class Adaptors(List[Adaptor]):
     """
     __slots__ = ()
 
-
+    @typing.overload
+    def __getitem__(self, pos: SupportsIndex) -> Adaptor:
+        pass
+
+    @typing.overload
+    def __getitem__(self, pos: slice) -> "Adaptors":
+        pass
+
+    def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[Adaptor, "Adaptors"]:
         lst = super().__getitem__(pos)
         if isinstance(pos, slice):
             return self.__class__(lst)
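The `@typing.overload` stubs added above only affect static typing: they tell a type checker that indexing an `Adaptors` with an integer yields a single `Adaptor` while slicing yields another `Adaptors`, matching the runtime branch on `isinstance(pos, slice)`. A generic sketch of the same pattern on a made-up `list` subclass:

```python
# Generic sketch of the __getitem__ overload pattern, on a made-up list subclass.
from typing import List, SupportsIndex, Union, overload


class Bag(List[int]):
    @overload
    def __getitem__(self, pos: SupportsIndex) -> int: ...
    @overload
    def __getitem__(self, pos: slice) -> "Bag": ...

    def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[int, "Bag"]:
        item = super().__getitem__(pos)
        return Bag(item) if isinstance(pos, slice) else item


bag = Bag([1, 2, 3])
print(type(bag[0]).__name__)    # int -> checkers know bag[0] is an int
print(type(bag[0:2]).__name__)  # Bag -> and bag[0:2] is another Bag
```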
@@ -993,7 +958,7 @@ class Adaptors(List[Adaptor]):
 
     def xpath(
             self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0, **kwargs: Any
-    ) ->
+    ) -> "Adaptors[Adaptor]":
         """
         Call the ``.xpath()`` method for each element in this list and return
         their results as another :class:`Adaptors`.
@@ -1019,7 +984,7 @@ class Adaptors(List[Adaptor]):
         ]
         return self.__class__(flatten(results))
 
-    def css(self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0) ->
+    def css(self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0) -> "Adaptors[Adaptor]":
         """
         Call the ``.css()`` method for each element in this list and return
         their results flattened as another :class:`Adaptors`.
@@ -1044,7 +1009,7 @@ class Adaptors(List[Adaptor]):
         return self.__class__(flatten(results))
 
     def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
-           clean_match: bool = False, case_sensitive: bool = False) ->
+           clean_match: bool = False, case_sensitive: bool = False) -> TextHandlers[TextHandler]:
         """Call the ``.re()`` method for each element in this list and return
         their results flattened as List of TextHandler.
 
@@ -1056,10 +1021,10 @@ class Adaptors(List[Adaptor]):
         results = [
             n.text.re(regex, replace_entities, clean_match, case_sensitive) for n in self
         ]
-        return flatten(results)
+        return TextHandlers(flatten(results))
 
     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                 clean_match: bool = False, case_sensitive: bool = False) ->
+                 clean_match: bool = False, case_sensitive: bool = False) -> TextHandler:
         """Call the ``.re_first()`` method for each element in this list and return
         the first result or the default value otherwise.
 
@@ -1084,15 +1049,14 @@ class Adaptors(List[Adaptor]):
                 return element
         return None
 
-    def filter(self, func: Callable[['Adaptor'], bool]) ->
+    def filter(self, func: Callable[['Adaptor'], bool]) -> 'Adaptors[Adaptor]':
         """Filter current elements based on the passed function
         :param func: A function that takes each element as an argument and returns True/False
         :return: The new `Adaptors` object or empty list otherwise.
         """
-
+        return self.__class__([
             element for element in self if func(element)
-        ]
-        return self.__class__(results) if results else results
+        ])
 
     # For easy copy-paste from Scrapy/parsel code when needed :)
     def get(self, default=None):