scrapling 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +4 -3
- scrapling/core/__init__.py +0 -0
- scrapling/core/_types.py +25 -0
- scrapling/{custom_types.py → core/custom_types.py} +48 -3
- scrapling/{mixins.py → core/mixins.py} +22 -7
- scrapling/{storage_adaptors.py → core/storage_adaptors.py} +2 -2
- scrapling/{translator.py → core/translator.py} +2 -12
- scrapling/{utils.py → core/utils.py} +14 -61
- scrapling/engines/__init__.py +7 -0
- scrapling/engines/camo.py +128 -0
- scrapling/engines/constants.py +108 -0
- scrapling/engines/pw.py +237 -0
- scrapling/engines/static.py +112 -0
- scrapling/engines/toolbelt/__init__.py +19 -0
- scrapling/engines/toolbelt/custom.py +154 -0
- scrapling/engines/toolbelt/fingerprints.py +81 -0
- scrapling/engines/toolbelt/navigation.py +108 -0
- scrapling/fetchers.py +198 -0
- scrapling/parser.py +223 -70
- scrapling/py.typed +1 -0
- scrapling-0.2.1.dist-info/METADATA +835 -0
- scrapling-0.2.1.dist-info/RECORD +33 -0
- {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/WHEEL +1 -1
- {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/top_level.txt +1 -0
- tests/__init__.py +1 -0
- tests/fetchers/__init__.py +1 -0
- tests/fetchers/test_camoufox.py +62 -0
- tests/fetchers/test_httpx.py +67 -0
- tests/fetchers/test_playwright.py +74 -0
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +56 -0
- tests/parser/test_general.py +286 -0
- scrapling-0.1.2.dist-info/METADATA +0 -477
- scrapling-0.1.2.dist-info/RECORD +0 -12
- {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/LICENSE +0 -0
scrapling/parser.py
CHANGED
@@ -1,18 +1,14 @@
 import os
+import re
+import inspect
 from difflib import SequenceMatcher
-from typing import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator
-try:
-    from typing import SupportsIndex
-except ImportError:
-    # 'SupportsIndex' got added in Python 3.8
-    SupportsIndex = None
-
-from scrapling.translator import HTMLTranslator
-from scrapling.mixins import SelectorsGeneration
-from scrapling.custom_types import TextHandler, AttributesHandler
-from scrapling.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
-from scrapling.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
 
+from scrapling.core.translator import HTMLTranslator
+from scrapling.core.mixins import SelectorsGeneration
+from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
+from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
+from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden, is_jsonable
+from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
 from lxml import etree, html
 from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
 
@@ -32,7 +28,7 @@ class Adaptor(SelectorsGeneration):
                 huge_tree: bool = True,
                 root: Optional[html.HtmlElement] = None,
                 keep_comments: Optional[bool] = False,
-                auto_match: Optional[bool] =
+                auto_match: Optional[bool] = True,
                 storage: Any = SQLiteStorageSystem,
                 storage_args: Optional[Dict] = None,
                 debug: Optional[bool] = True,
@@ -64,6 +60,7 @@ class Adaptor(SelectorsGeneration):
         if root is None and not body and text is None:
             raise ValueError("Adaptor class needs text, body, or root arguments to work")
 
+        self.__text = None
         if root is None:
             if text is None:
                 if not body or not isinstance(body, bytes):
@@ -76,12 +73,14 @@ class Adaptor(SelectorsGeneration):
 
                 body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
 
+            # https://lxml.de/api/lxml.etree.HTMLParser-class.html
             parser = html.HTMLParser(
-                # https://lxml.de/api/lxml.etree.HTMLParser-class.html
                 recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
                 compact=True, huge_tree=huge_tree, default_doctype=True
             )
             self._root = etree.fromstring(body, parser=parser, base_url=url)
+            if is_jsonable(text or body.decode()):
+                self.__text = TextHandler(text or body.decode())
 
         else:
             # All html types inherits from HtmlMixin so this to check for all at once
@@ -116,7 +115,6 @@ class Adaptor(SelectorsGeneration):
         self.url = url
         # For selector stuff
         self.__attributes = None
-        self.__text = None
         self.__tag = None
         self.__debug = debug
 
@@ -125,7 +123,7 @@ class Adaptor(SelectorsGeneration):
     def _is_text_node(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> bool:
         """Return True if given element is a result of a string expression
         Examples:
-
+            XPath -> '/text()', '/@attribute' etc...
             CSS3 -> '::text', '::attr(attrib)'...
         """
         # Faster than checking `element.is_attribute or element.is_text or element.is_tail`
@@ -163,6 +161,8 @@ class Adaptor(SelectorsGeneration):
             results = [self.__get_correct_result(n) for n in result]
             if all(isinstance(res, self.__class__) for res in results):
                 return Adaptors(results)
+            elif all(isinstance(res, TextHandler) for res in results):
+                return TextHandlers(results)
             return results
 
         return self.__get_correct_result(result)
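
Illustrative usage (not part of the diff; the markup is made up): with this change a text-node selection should come back wrapped instead of as a plain list, so each item keeps the string helpers.

    from scrapling.parser import Adaptor

    page = Adaptor(text='<p>Price: 10</p><p>Price: 20</p>', auto_match=False)
    prices = page.css('p::text')              # expected to be returned as TextHandlers now
    first_digit = prices[0].re_first(r'\d+')  # each item is a TextHandler, so .re()/.re_first() are available
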
@@ -187,23 +187,9 @@ class Adaptor(SelectorsGeneration):
     def text(self) -> TextHandler:
         """Get text content of the element"""
         if not self.__text:
-
-
-
-                    # Escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
-                    # This issue is present in parsel/scrapy as well so no need to repeat it here so the user can run regex on the full text.
-                    code = self.html_content
-                    parser = html.HTMLParser(
-                        recover=True, remove_blank_text=True, remove_comments=True, encoding=self.encoding,
-                        compact=True, huge_tree=self.__huge_tree_enabled, default_doctype=True
-                    )
-                    fragment_root = html.fragment_fromstring(code, parser=parser)
-                    self.__text = TextHandler(fragment_root.text)
-                else:
-                    self.__text = TextHandler(self._root.text)
-            else:
-                # If user already chose to not keep comments then all is good
-                self.__text = TextHandler(self._root.text)
+            # If you want to escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
+            # before extracting text then keep `keep_comments` set to False while initializing the first class
+            self.__text = TextHandler(self._root.text)
         return self.__text
 
     def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:
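
Illustrative usage (not from the diff; markup is made up): comment stripping is now decided once at parse time via `keep_comments`, and `.text` simply reads the parsed node.

    from scrapling.parser import Adaptor

    page = Adaptor(text='<span>CONDITION: <!-- hidden -->Excellent</span>',
                   keep_comments=False, auto_match=False)  # comments removed while parsing (the default)
    condition = page.css_first('span').text                # the comment markup should not appear in the text
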
@@ -399,6 +385,56 @@ class Adaptor(SelectorsGeneration):
                 return self.__convert_results(score_table[highest_probability])
         return []
 
+    def css_first(self, selector: str, identifier: str = '',
+                  auto_match: bool = False, auto_save: bool = False, percentage: int = 0
+                  ) -> Union['Adaptor', 'TextHandler', None]:
+        """Search current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
+
+        **Important:
+        It's recommended to use the identifier argument if you plan to use different selector later
+        and want to relocate the same element(s)**
+
+        :param selector: The CSS3 selector to be used.
+        :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
+        :param identifier: A string that will be used to save/retrieve element's data in auto-matching
+         otherwise the selector will be used.
+        :param auto_save: Automatically save new elements for `auto_match` later
+        :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
+         Be aware that the percentage calculation depends solely on the page structure so don't play with this
+         number unless you must know what you are doing!
+
+        :return: List as :class:`Adaptors`
+        """
+        for element in self.css(selector, identifier, auto_match, auto_save, percentage):
+            return element
+        return None
+
+    def xpath_first(self, selector: str, identifier: str = '',
+                    auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
+                    ) -> Union['Adaptor', 'TextHandler', None]:
+        """Search current tree with XPath selectors and return the first result if possible, otherwise return `None`
+
+        **Important:
+        It's recommended to use the identifier argument if you plan to use different selector later
+        and want to relocate the same element(s)**
+
+        Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
+
+        :param selector: The XPath selector to be used.
+        :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
+        :param identifier: A string that will be used to save/retrieve element's data in auto-matching
+         otherwise the selector will be used.
+        :param auto_save: Automatically save new elements for `auto_match` later
+        :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
+         Be aware that the percentage calculation depends solely on the page structure so don't play with this
+         number unless you must know what you are doing!
+
+        :return: List as :class:`Adaptors`
+        """
+        for element in self.xpath(selector, identifier, auto_match, auto_save, percentage, **kwargs):
+            return element
+        return None
+
     def css(self, selector: str, identifier: str = '',
             auto_match: bool = False, auto_save: bool = False, percentage: int = 0
             ) -> Union['Adaptors[Adaptor]', List]:
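
Illustrative usage of the new first-match helpers (selectors and markup are made up, not from the diff):

    from scrapling.parser import Adaptor

    page = Adaptor(text='<div class="product"><h3>Box</h3><span class="price">15</span></div>',
                   auto_match=False)
    title = page.css_first('h3::text')                          # first match, or None when nothing matches
    price = page.xpath_first('//span[@class="price"]/text()')   # same idea for XPath
    missing = page.css_first('.does-not-exist')                 # returns None instead of an empty list
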
@@ -495,6 +531,113 @@ class Adaptor(SelectorsGeneration):
         except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
             raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
 
+    def find_all(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptors[Adaptor]', List]:
+        """Find elements by filters of your creations for ease..
+
+        :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
+        :param kwargs: The attributes you want to filter elements based on it.
+        :return: The `Adaptors` object of the elements or empty list
+        """
+        # Attributes that are Python reserved words and can't be used directly
+        # Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
+        # https://www.w3schools.com/python/python_ref_keywords.asp
+        whitelisted = {
+            'class_': 'class',
+            'for_': 'for',
+        }
+
+        if not args and not kwargs:
+            raise TypeError('You have to pass something to search with, like tag name(s), tag attributes, or both.')
+
+        attributes = dict()
+        tags, patterns = set(), set()
+        results, functions, selectors = [], [], []
+
+        def _search_tree(element: Adaptor, filter_function: Callable) -> None:
+            """Collect element if it fulfills passed function otherwise, traverse the children tree and iterate"""
+            if filter_function(element):
+                results.append(element)
+
+            for branch in element.children:
+                _search_tree(branch, filter_function)
+
+        # Brace yourself for a wonderful journey!
+        for arg in args:
+            if type(arg) is str:
+                tags.add(arg)
+
+            elif type(arg) in [list, tuple, set]:
+                if not all(map(lambda x: type(x) is str, arg)):
+                    raise TypeError('Nested Iterables are not accepted, only iterables of tag names are accepted')
+                tags.update(set(arg))
+
+            elif type(arg) is dict:
+                if not all([(type(k) is str and type(v) is str) for k, v in arg.items()]):
+                    raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
+                attributes.update(arg)
+
+            elif type(arg) is re.Pattern:
+                patterns.add(arg)
+
+            elif callable(arg):
+                if len(inspect.signature(arg).parameters) > 0:
+                    functions.append(arg)
+                else:
+                    raise TypeError("Callable filter function must have at least one argument to take `Adaptor` objects.")
+
+            else:
+                raise TypeError(f'Argument with type "{type(arg)}" is not accepted, please read the docs.')
+
+        if not all([(type(k) is str and type(v) is str) for k, v in kwargs.items()]):
+            raise TypeError('Only string values are accepted for arguments')
+
+        for attribute_name, value in kwargs.items():
+            # Only replace names for kwargs, replacing them in dictionaries doesn't make sense
+            attribute_name = whitelisted.get(attribute_name, attribute_name)
+            attributes[attribute_name] = value
+
+        # It's easier and faster to build a selector than traversing the tree
+        tags = tags or ['']
+        for tag in tags:
+            selector = tag
+            for key, value in attributes.items():
+                value = value.replace('"', r'\"')  # Escape double quotes in user input
+                # Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
+                selector += '[{}="{}"]'.format(key, value)
+            if selector:
+                selectors.append(selector)
+
+        if selectors:
+            results = self.css(', '.join(selectors))
+            if results:
+                # From the results, get the ones that fulfill passed regex patterns
+                for pattern in patterns:
+                    results = results.filter(lambda e: e.text.re(pattern, check_match=True))
+
+                # From the results, get the ones that fulfill passed functions
+                for function in functions:
+                    results = results.filter(function)
+        else:
+            for pattern in patterns:
+                results.extend(self.find_by_regex(pattern, first_match=False))
+
+            for result in (results or [self]):
+                for function in functions:
+                    _search_tree(result, function)
+
+        return self.__convert_results(results)
+
+    def find(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptor', None]:
+        """Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
+
+        :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
+        :param kwargs: The attributes you want to filter elements based on it.
+        :return: The `Adaptor` object of the element or `None` if the result didn't match
+        """
+        for element in self.find_all(*args, **kwargs):
+            return element
+        return None
+
     def __calculate_similarity_score(self, original: Dict, candidate: html.HtmlElement) -> float:
         """Used internally to calculate a score that shows how candidate element similar to the original one
 
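
Illustrative usage of the new `find_all`/`find` filters, following the argument handling above (markup and attribute values are made up):

    import re
    from scrapling.parser import Adaptor

    page = Adaptor(text='<a class="btn" href="/p/1">One</a><a class="btn" href="/x/2">Two</a>',
                   auto_match=False)

    buttons = page.find_all('a', class_='btn')         # class_ maps to the reserved attribute name `class`
    partial = page.find_all('a', {'href*': '/p/'})     # dict keys pass through, so attribute operators work
    by_text = page.find_all('a', re.compile(r'One'))   # compiled patterns filter on the elements' text
    first = page.find('a', class_='btn')                # same filters, first result or None
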
@@ -606,25 +749,33 @@ class Adaptor(SelectorsGeneration):
     # Operations on text functions
     def json(self) -> Dict:
         """Return json response if the response is jsonable otherwise throws error"""
-
+        if self.text:
+            return self.text.json()
+        else:
+            return self.get_all_text(strip=True).json()
 
-    def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True
+    def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
+           clean_match: bool = False, case_sensitive: bool = False) -> 'List[str]':
         """Apply the given regex to the current text and return a list of strings with the matches.
 
         :param regex: Can be either a compiled regular expression or a string.
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
+        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+        :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
         """
-        return self.text.re(regex, replace_entities)
+        return self.text.re(regex, replace_entities, clean_match, case_sensitive)
 
-    def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True
+    def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
+                 clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
         """Apply the given regex to text and return the first match if found, otherwise return the default value.
 
         :param regex: Can be either a compiled regular expression or a string.
         :param default: The default value to be returned if there is no match
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
-
+        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+        :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
         """
-        return self.text.re_first(regex, default, replace_entities)
+        return self.text.re_first(regex, default, replace_entities, clean_match, case_sensitive)
 
     def find_similar(
             self,
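
Illustrative usage of the updated text helpers (inputs are made up, not from the diff):

    from scrapling.parser import Adaptor

    api_page = Adaptor(text='{"count": 3}', auto_match=False)
    data = api_page.json()   # uses the text cached at parse time when it is valid JSON

    html_page = Adaptor(text='<p>ITEM:   gold   coin</p>', auto_match=False)
    hit = html_page.css_first('p').re_first(r'gold', clean_match=True)  # whitespace normalized before matching
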
@@ -757,10 +908,10 @@ class Adaptor(SelectorsGeneration):
         return self.__convert_results(results)
 
     def find_by_regex(
-            self, query: str, first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
+            self, query: Union[str, Pattern[str]], first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
     ) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
         """Find elements that its text content matches the input regex pattern.
-        :param query: Regex query to match
+        :param query: Regex query/pattern to match
         :param first_match: Return first element that matches conditions, enabled by default
         :param case_sensitive: if enabled, letters case will be taken into consideration in the regex
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
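
Illustrative usage (made-up markup): `find_by_regex` now also accepts a pre-compiled pattern, which is what `find_all` passes to it above.

    import re
    from scrapling.parser import Adaptor

    page = Adaptor(text='<p>Total: 42 USD</p>', auto_match=False)
    node = page.find_by_regex(re.compile(r'\d+ USD'), first_match=True)
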
@@ -855,54 +1006,56 @@ class Adaptors(List[Adaptor]):
         ]
         return self.__class__(flatten(results))
 
-    def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True
+    def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
+           clean_match: bool = False, case_sensitive: bool = False) -> 'List[str]':
         """Call the ``.re()`` method for each element in this list and return
         their results flattened as List of TextHandler.
 
         :param regex: Can be either a compiled regular expression or a string.
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
+        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+        :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
         """
         results = [
-            n.text.re(regex, replace_entities) for n in self
+            n.text.re(regex, replace_entities, clean_match, case_sensitive) for n in self
         ]
         return flatten(results)
 
-    def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True
+    def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
+                 clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
         """Call the ``.re_first()`` method for each element in this list and return
-
+        the first result or the default value otherwise.
 
         :param regex: Can be either a compiled regular expression or a string.
         :param default: The default value to be returned if there is no match
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
+        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+        :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
+        """
+        for n in self:
+            for result in n.re(regex, replace_entities, clean_match, case_sensitive):
+                return result
+        return default
+
+    def search(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
+        """Loop over all current elements and return the first element that matches the passed function
+        :param func: A function that takes each element as an argument and returns True/False
+        :return: The first element that match the function or ``None`` otherwise.
+        """
+        for element in self:
+            if func(element):
+                return element
+        return None
 
+    def filter(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptors', List]:
+        """Filter current elements based on the passed function
+        :param func: A function that takes each element as an argument and returns True/False
+        :return: The new `Adaptors` object or empty list otherwise.
         """
         results = [
-
+            element for element in self if func(element)
         ]
-        return
-
-    # def __getattr__(self, name):
-    #     if name in dir(self.__class__):
-    #         return super().__getattribute__(name)
-    #
-    #     # Execute the method itself on each Adaptor
-    #     results = []
-    #     for item in self:
-    #         results.append(getattr(item, name))
-    #
-    #     if all(callable(r) for r in results):
-    #         def call_all(*args, **kwargs):
-    #             final_results = [r(*args, **kwargs) for r in results]
-    #             if all([isinstance(r, (Adaptor, Adaptors,)) for r in results]):
-    #                 return self.__class__(final_results)
-    #             return final_results
-    #
-    #         return call_all
-    #     else:
-    #         # Flatten the result if it's a single-item list containing a list
-    #         if len(self) == 1 and isinstance(results[0], list):
-    #             return self.__class__(results[0])
-    #         return self.__class__(results)
+        return self.__class__(results) if results else results
 
     def get(self, default=None):
         """Returns the first item of the current list
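
Illustrative usage of the new list-level helpers (the predicate and markup are made up):

    from scrapling.parser import Adaptor

    page = Adaptor(text='<ul><li>5</li><li>12</li><li>40</li></ul>', auto_match=False)
    items = page.css('li')

    big = items.filter(lambda el: int(el.text) > 10)          # a new Adaptors list, or [] when nothing matches
    first_big = items.search(lambda el: int(el.text) > 10)    # first element passing the check, or None
    numbers = items.re(r'\d+', clean_match=True)              # .re()/.re_first() accept the new flags too
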
scrapling/py.typed
ADDED
@@ -0,0 +1 @@
+