scrapling 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- scrapling/__init__.py +4 -3
- scrapling/core/__init__.py +0 -0
- scrapling/core/_types.py +25 -0
- scrapling/{custom_types.py → core/custom_types.py} +48 -3
- scrapling/{mixins.py → core/mixins.py} +22 -7
- scrapling/{storage_adaptors.py → core/storage_adaptors.py} +2 -2
- scrapling/{translator.py → core/translator.py} +2 -12
- scrapling/{utils.py → core/utils.py} +14 -61
- scrapling/engines/__init__.py +7 -0
- scrapling/engines/camo.py +128 -0
- scrapling/engines/constants.py +108 -0
- scrapling/engines/pw.py +237 -0
- scrapling/engines/static.py +112 -0
- scrapling/engines/toolbelt/__init__.py +19 -0
- scrapling/engines/toolbelt/custom.py +154 -0
- scrapling/engines/toolbelt/fingerprints.py +81 -0
- scrapling/engines/toolbelt/navigation.py +108 -0
- scrapling/fetchers.py +198 -0
- scrapling/parser.py +223 -70
- scrapling/py.typed +1 -0
- scrapling-0.2.1.dist-info/METADATA +835 -0
- scrapling-0.2.1.dist-info/RECORD +33 -0
- {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/WHEEL +1 -1
- {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/top_level.txt +1 -0
- tests/__init__.py +1 -0
- tests/fetchers/__init__.py +1 -0
- tests/fetchers/test_camoufox.py +62 -0
- tests/fetchers/test_httpx.py +67 -0
- tests/fetchers/test_playwright.py +74 -0
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +56 -0
- tests/parser/test_general.py +286 -0
- scrapling-0.1.2.dist-info/METADATA +0 -477
- scrapling-0.1.2.dist-info/RECORD +0 -12
- {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/LICENSE +0 -0
scrapling/parser.py
CHANGED
@@ -1,18 +1,14 @@
|
|
1
1
|
import os
|
2
|
+
import re
|
3
|
+
import inspect
|
2
4
|
from difflib import SequenceMatcher
|
3
|
-
from typing import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator
|
4
|
-
try:
|
5
|
-
from typing import SupportsIndex
|
6
|
-
except ImportError:
|
7
|
-
# 'SupportsIndex' got added in Python 3.8
|
8
|
-
SupportsIndex = None
|
9
|
-
|
10
|
-
from scrapling.translator import HTMLTranslator
|
11
|
-
from scrapling.mixins import SelectorsGeneration
|
12
|
-
from scrapling.custom_types import TextHandler, AttributesHandler
|
13
|
-
from scrapling.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
|
14
|
-
from scrapling.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
|
15
5
|
|
6
|
+
from scrapling.core.translator import HTMLTranslator
|
7
|
+
from scrapling.core.mixins import SelectorsGeneration
|
8
|
+
from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
|
9
|
+
from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
|
10
|
+
from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden, is_jsonable
|
11
|
+
from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
|
16
12
|
from lxml import etree, html
|
17
13
|
from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
|
18
14
|
|
@@ -32,7 +28,7 @@ class Adaptor(SelectorsGeneration):
|
|
32
28
|
huge_tree: bool = True,
|
33
29
|
root: Optional[html.HtmlElement] = None,
|
34
30
|
keep_comments: Optional[bool] = False,
|
35
|
-
auto_match: Optional[bool] =
|
31
|
+
auto_match: Optional[bool] = True,
|
36
32
|
storage: Any = SQLiteStorageSystem,
|
37
33
|
storage_args: Optional[Dict] = None,
|
38
34
|
debug: Optional[bool] = True,
|
@@ -64,6 +60,7 @@ class Adaptor(SelectorsGeneration):
|
|
64
60
|
if root is None and not body and text is None:
|
65
61
|
raise ValueError("Adaptor class needs text, body, or root arguments to work")
|
66
62
|
|
63
|
+
self.__text = None
|
67
64
|
if root is None:
|
68
65
|
if text is None:
|
69
66
|
if not body or not isinstance(body, bytes):
|
@@ -76,12 +73,14 @@ class Adaptor(SelectorsGeneration):
|
|
76
73
|
|
77
74
|
body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
|
78
75
|
|
76
|
+
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
79
77
|
parser = html.HTMLParser(
|
80
|
-
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
|
81
78
|
recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
|
82
79
|
compact=True, huge_tree=huge_tree, default_doctype=True
|
83
80
|
)
|
84
81
|
self._root = etree.fromstring(body, parser=parser, base_url=url)
|
82
|
+
if is_jsonable(text or body.decode()):
|
83
|
+
self.__text = TextHandler(text or body.decode())
|
85
84
|
|
86
85
|
else:
|
87
86
|
# All html types inherit from HtmlMixin, so this checks for all of them at once
|
@@ -116,7 +115,6 @@ class Adaptor(SelectorsGeneration):
|
|
116
115
|
self.url = url
|
117
116
|
# For selector stuff
|
118
117
|
self.__attributes = None
|
119
|
-
self.__text = None
|
120
118
|
self.__tag = None
|
121
119
|
self.__debug = debug
|
122
120
|
|
@@ -125,7 +123,7 @@ class Adaptor(SelectorsGeneration):
|
|
125
123
|
def _is_text_node(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> bool:
|
126
124
|
"""Return True if given element is a result of a string expression
|
127
125
|
Examples:
|
128
|
-
|
126
|
+
XPath -> '/text()', '/@attribute' etc...
|
129
127
|
CSS3 -> '::text', '::attr(attrib)'...
|
130
128
|
"""
|
131
129
|
# Faster than checking `element.is_attribute or element.is_text or element.is_tail`
|
@@ -163,6 +161,8 @@ class Adaptor(SelectorsGeneration):
|
|
163
161
|
results = [self.__get_correct_result(n) for n in result]
|
164
162
|
if all(isinstance(res, self.__class__) for res in results):
|
165
163
|
return Adaptors(results)
|
164
|
+
elif all(isinstance(res, TextHandler) for res in results):
|
165
|
+
return TextHandlers(results)
|
166
166
|
return results
|
167
167
|
|
168
168
|
return self.__get_correct_result(result)
|
@@ -187,23 +187,9 @@ class Adaptor(SelectorsGeneration):
|
|
187
187
|
def text(self) -> TextHandler:
|
188
188
|
"""Get text content of the element"""
|
189
189
|
if not self.__text:
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
# Escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
|
194
|
-
# This issue is present in parsel/scrapy as well so no need to repeat it here so the user can run regex on the full text.
|
195
|
-
code = self.html_content
|
196
|
-
parser = html.HTMLParser(
|
197
|
-
recover=True, remove_blank_text=True, remove_comments=True, encoding=self.encoding,
|
198
|
-
compact=True, huge_tree=self.__huge_tree_enabled, default_doctype=True
|
199
|
-
)
|
200
|
-
fragment_root = html.fragment_fromstring(code, parser=parser)
|
201
|
-
self.__text = TextHandler(fragment_root.text)
|
202
|
-
else:
|
203
|
-
self.__text = TextHandler(self._root.text)
|
204
|
-
else:
|
205
|
-
# If user already chose to not keep comments then all is good
|
206
|
-
self.__text = TextHandler(self._root.text)
|
190
|
+
# If you want to escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
|
191
|
+
# before extracting text then keep `keep_comments` set to False while initializing the first class
|
192
|
+
self.__text = TextHandler(self._root.text)
|
207
193
|
return self.__text
|
208
194
|
|
209
195
|
def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:
|
@@ -399,6 +385,56 @@ class Adaptor(SelectorsGeneration):
|
|
399
385
|
return self.__convert_results(score_table[highest_probability])
|
400
386
|
return []
|
401
387
|
|
388
|
+
def css_first(self, selector: str, identifier: str = '',
|
389
|
+
auto_match: bool = False, auto_save: bool = False, percentage: int = 0
|
390
|
+
) -> Union['Adaptor', 'TextHandler', None]:
|
391
|
+
"""Search current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
|
392
|
+
|
393
|
+
**Important:
|
394
|
+
It's recommended to use the identifier argument if you plan to use different selector later
|
395
|
+
and want to relocate the same element(s)**
|
396
|
+
|
397
|
+
:param selector: The CSS3 selector to be used.
|
398
|
+
:param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
|
399
|
+
:param identifier: A string that will be used to save/retrieve element's data in auto-matching
|
400
|
+
otherwise the selector will be used.
|
401
|
+
:param auto_save: Automatically save new elements for `auto_match` later
|
402
|
+
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
403
|
+
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
404
|
+
number unless you know what you are doing!
|
405
|
+
|
406
|
+
:return: The first matching element (:class:`Adaptor` or :class:`TextHandler`), or `None` if nothing matched
|
407
|
+
"""
|
408
|
+
for element in self.css(selector, identifier, auto_match, auto_save, percentage):
|
409
|
+
return element
|
410
|
+
return None
|
411
|
+
|
412
|
+
def xpath_first(self, selector: str, identifier: str = '',
|
413
|
+
auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
|
414
|
+
) -> Union['Adaptor', 'TextHandler', None]:
|
415
|
+
"""Search current tree with XPath selectors and return the first result if possible, otherwise return `None`
|
416
|
+
|
417
|
+
**Important:
|
418
|
+
It's recommended to use the identifier argument if you plan to use different selector later
|
419
|
+
and want to relocate the same element(s)**
|
420
|
+
|
421
|
+
Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
|
422
|
+
|
423
|
+
:param selector: The XPath selector to be used.
|
424
|
+
:param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
|
425
|
+
:param identifier: A string that will be used to save/retrieve element's data in auto-matching
|
426
|
+
otherwise the selector will be used.
|
427
|
+
:param auto_save: Automatically save new elements for `auto_match` later
|
428
|
+
:param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
|
429
|
+
Be aware that the percentage calculation depends solely on the page structure so don't play with this
|
430
|
+
number unless you know what you are doing!
|
431
|
+
|
432
|
+
:return: The first matching element (:class:`Adaptor` or :class:`TextHandler`), or `None` if nothing matched
|
433
|
+
"""
|
434
|
+
for element in self.xpath(selector, identifier, auto_match, auto_save, percentage, **kwargs):
|
435
|
+
return element
|
436
|
+
return None
|
437
|
+
|
402
438
|
def css(self, selector: str, identifier: str = '',
|
403
439
|
auto_match: bool = False, auto_save: bool = False, percentage: int = 0
|
404
440
|
) -> Union['Adaptors[Adaptor]', List]:
|
@@ -495,6 +531,113 @@ class Adaptor(SelectorsGeneration):
|
|
495
531
|
except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
|
496
532
|
raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
|
497
533
|
|
534
|
+
def find_all(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptors[Adaptor]', List]:
|
535
|
+
"""Find elements using filters you define (tag names, attributes, regex patterns, or functions).
|
536
|
+
|
537
|
+
:param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
538
|
+
:param kwargs: The attributes you want to filter elements based on it.
|
539
|
+
:return: The `Adaptors` object of the elements or empty list
|
540
|
+
"""
|
541
|
+
# Attributes that are Python reserved words and can't be used directly
|
542
|
+
# Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
|
543
|
+
# https://www.w3schools.com/python/python_ref_keywords.asp
|
544
|
+
whitelisted = {
|
545
|
+
'class_': 'class',
|
546
|
+
'for_': 'for',
|
547
|
+
}
|
548
|
+
|
549
|
+
if not args and not kwargs:
|
550
|
+
raise TypeError('You have to pass something to search with, like tag name(s), tag attributes, or both.')
|
551
|
+
|
552
|
+
attributes = dict()
|
553
|
+
tags, patterns = set(), set()
|
554
|
+
results, functions, selectors = [], [], []
|
555
|
+
|
556
|
+
def _search_tree(element: Adaptor, filter_function: Callable) -> None:
|
557
|
+
"""Collect the element if it fulfills the passed function, then recurse into its children and repeat"""
|
558
|
+
if filter_function(element):
|
559
|
+
results.append(element)
|
560
|
+
|
561
|
+
for branch in element.children:
|
562
|
+
_search_tree(branch, filter_function)
|
563
|
+
|
564
|
+
# Brace yourself for a wonderful journey!
|
565
|
+
for arg in args:
|
566
|
+
if type(arg) is str:
|
567
|
+
tags.add(arg)
|
568
|
+
|
569
|
+
elif type(arg) in [list, tuple, set]:
|
570
|
+
if not all(map(lambda x: type(x) is str, arg)):
|
571
|
+
raise TypeError('Nested Iterables are not accepted, only iterables of tag names are accepted')
|
572
|
+
tags.update(set(arg))
|
573
|
+
|
574
|
+
elif type(arg) is dict:
|
575
|
+
if not all([(type(k) is str and type(v) is str) for k, v in arg.items()]):
|
576
|
+
raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
|
577
|
+
attributes.update(arg)
|
578
|
+
|
579
|
+
elif type(arg) is re.Pattern:
|
580
|
+
patterns.add(arg)
|
581
|
+
|
582
|
+
elif callable(arg):
|
583
|
+
if len(inspect.signature(arg).parameters) > 0:
|
584
|
+
functions.append(arg)
|
585
|
+
else:
|
586
|
+
raise TypeError("Callable filter function must have at least one argument to take `Adaptor` objects.")
|
587
|
+
|
588
|
+
else:
|
589
|
+
raise TypeError(f'Argument with type "{type(arg)}" is not accepted, please read the docs.')
|
590
|
+
|
591
|
+
if not all([(type(k) is str and type(v) is str) for k, v in kwargs.items()]):
|
592
|
+
raise TypeError('Only string values are accepted for arguments')
|
593
|
+
|
594
|
+
for attribute_name, value in kwargs.items():
|
595
|
+
# Only replace names for kwargs, replacing them in dictionaries doesn't make sense
|
596
|
+
attribute_name = whitelisted.get(attribute_name, attribute_name)
|
597
|
+
attributes[attribute_name] = value
|
598
|
+
|
599
|
+
# It's easier and faster to build a selector than traversing the tree
|
600
|
+
tags = tags or ['']
|
601
|
+
for tag in tags:
|
602
|
+
selector = tag
|
603
|
+
for key, value in attributes.items():
|
604
|
+
value = value.replace('"', r'\"') # Escape double quotes in user input
|
605
|
+
# Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
|
606
|
+
selector += '[{}="{}"]'.format(key, value)
|
607
|
+
if selector:
|
608
|
+
selectors.append(selector)
|
609
|
+
|
610
|
+
if selectors:
|
611
|
+
results = self.css(', '.join(selectors))
|
612
|
+
if results:
|
613
|
+
# From the results, get the ones that fulfill passed regex patterns
|
614
|
+
for pattern in patterns:
|
615
|
+
results = results.filter(lambda e: e.text.re(pattern, check_match=True))
|
616
|
+
|
617
|
+
# From the results, get the ones that fulfill passed functions
|
618
|
+
for function in functions:
|
619
|
+
results = results.filter(function)
|
620
|
+
else:
|
621
|
+
for pattern in patterns:
|
622
|
+
results.extend(self.find_by_regex(pattern, first_match=False))
|
623
|
+
|
624
|
+
for result in (results or [self]):
|
625
|
+
for function in functions:
|
626
|
+
_search_tree(result, function)
|
627
|
+
|
628
|
+
return self.__convert_results(results)
|
629
|
+
|
630
|
+
def find(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptor', None]:
|
631
|
+
"""Find elements using filters you define and return the first result, or `None` if nothing matched.
|
632
|
+
|
633
|
+
:param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
|
634
|
+
:param kwargs: The attributes you want to filter elements based on it.
|
635
|
+
:return: The `Adaptor` object of the element or `None` if the result didn't match
|
636
|
+
"""
|
637
|
+
for element in self.find_all(*args, **kwargs):
|
638
|
+
return element
|
639
|
+
return None
|
640
|
+
|
498
641
|
def __calculate_similarity_score(self, original: Dict, candidate: html.HtmlElement) -> float:
|
499
642
|
"""Used internally to calculate a score that shows how candidate element similar to the original one
|
500
643
|
|
@@ -606,25 +749,33 @@ class Adaptor(SelectorsGeneration):
|
|
606
749
|
# Operations on text functions
|
607
750
|
def json(self) -> Dict:
|
608
751
|
"""Return the JSON response if the response is jsonable, otherwise throw an error"""
|
609
|
-
|
752
|
+
if self.text:
|
753
|
+
return self.text.json()
|
754
|
+
else:
|
755
|
+
return self.get_all_text(strip=True).json()
|
610
756
|
|
611
|
-
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True
|
757
|
+
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
|
758
|
+
clean_match: bool = False, case_sensitive: bool = False) -> 'List[str]':
|
612
759
|
"""Apply the given regex to the current text and return a list of strings with the matches.
|
613
760
|
|
614
761
|
:param regex: Can be either a compiled regular expression or a string.
|
615
762
|
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
763
|
+
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
764
|
+
:param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
|
616
765
|
"""
|
617
|
-
return self.text.re(regex, replace_entities)
|
766
|
+
return self.text.re(regex, replace_entities, clean_match, case_sensitive)
|
618
767
|
|
619
|
-
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True
|
768
|
+
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
|
769
|
+
clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
|
620
770
|
"""Apply the given regex to text and return the first match if found, otherwise return the default value.
|
621
771
|
|
622
772
|
:param regex: Can be either a compiled regular expression or a string.
|
623
773
|
:param default: The default value to be returned if there is no match
|
624
774
|
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
625
|
-
|
775
|
+
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
776
|
+
:param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
|
626
777
|
"""
|
627
|
-
return self.text.re_first(regex, default, replace_entities)
|
778
|
+
return self.text.re_first(regex, default, replace_entities, clean_match, case_sensitive)
|
628
779
|
|
629
780
|
def find_similar(
|
630
781
|
self,
|
@@ -757,10 +908,10 @@ class Adaptor(SelectorsGeneration):
|
|
757
908
|
return self.__convert_results(results)
|
758
909
|
|
759
910
|
def find_by_regex(
|
760
|
-
self, query: str, first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
|
911
|
+
self, query: Union[str, Pattern[str]], first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
|
761
912
|
) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
|
762
913
|
"""Find elements whose text content matches the input regex pattern.
|
763
|
-
:param query: Regex query to match
|
914
|
+
:param query: Regex query/pattern to match
|
764
915
|
:param first_match: Return first element that matches conditions, enabled by default
|
765
916
|
:param case_sensitive: if enabled, letters case will be taken into consideration in the regex
|
766
917
|
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
@@ -855,54 +1006,56 @@ class Adaptors(List[Adaptor]):
|
|
855
1006
|
]
|
856
1007
|
return self.__class__(flatten(results))
|
857
1008
|
|
858
|
-
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True
|
1009
|
+
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
|
1010
|
+
clean_match: bool = False, case_sensitive: bool = False) -> 'List[str]':
|
859
1011
|
"""Call the ``.re()`` method for each element in this list and return
|
860
1012
|
their results flattened as List of TextHandler.
|
861
1013
|
|
862
1014
|
:param regex: Can be either a compiled regular expression or a string.
|
863
1015
|
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
1016
|
+
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
1017
|
+
:param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
|
864
1018
|
"""
|
865
1019
|
results = [
|
866
|
-
n.text.re(regex, replace_entities) for n in self
|
1020
|
+
n.text.re(regex, replace_entities, clean_match, case_sensitive) for n in self
|
867
1021
|
]
|
868
1022
|
return flatten(results)
|
869
1023
|
|
870
|
-
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True
|
1024
|
+
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
|
1025
|
+
clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
|
871
1026
|
"""Call the ``.re_first()`` method for each element in this list and return
|
872
|
-
|
1027
|
+
the first result or the default value otherwise.
|
873
1028
|
|
874
1029
|
:param regex: Can be either a compiled regular expression or a string.
|
875
1030
|
:param default: The default value to be returned if there is no match
|
876
1031
|
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
1032
|
+
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
1033
|
+
:param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
|
1034
|
+
"""
|
1035
|
+
for n in self:
|
1036
|
+
for result in n.re(regex, replace_entities, clean_match, case_sensitive):
|
1037
|
+
return result
|
1038
|
+
return default
|
1039
|
+
|
1040
|
+
def search(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
|
1041
|
+
"""Loop over all current elements and return the first element that matches the passed function
|
1042
|
+
:param func: A function that takes each element as an argument and returns True/False
|
1043
|
+
:return: The first element that matches the function or ``None`` otherwise.
|
1044
|
+
"""
|
1045
|
+
for element in self:
|
1046
|
+
if func(element):
|
1047
|
+
return element
|
1048
|
+
return None
|
877
1049
|
|
1050
|
+
def filter(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptors', List]:
|
1051
|
+
"""Filter current elements based on the passed function
|
1052
|
+
:param func: A function that takes each element as an argument and returns True/False
|
1053
|
+
:return: The new `Adaptors` object or empty list otherwise.
|
878
1054
|
"""
|
879
1055
|
results = [
|
880
|
-
|
1056
|
+
element for element in self if func(element)
|
881
1057
|
]
|
882
|
-
return
|
883
|
-
|
884
|
-
# def __getattr__(self, name):
|
885
|
-
# if name in dir(self.__class__):
|
886
|
-
# return super().__getattribute__(name)
|
887
|
-
#
|
888
|
-
# # Execute the method itself on each Adaptor
|
889
|
-
# results = []
|
890
|
-
# for item in self:
|
891
|
-
# results.append(getattr(item, name))
|
892
|
-
#
|
893
|
-
# if all(callable(r) for r in results):
|
894
|
-
# def call_all(*args, **kwargs):
|
895
|
-
# final_results = [r(*args, **kwargs) for r in results]
|
896
|
-
# if all([isinstance(r, (Adaptor, Adaptors,)) for r in results]):
|
897
|
-
# return self.__class__(final_results)
|
898
|
-
# return final_results
|
899
|
-
#
|
900
|
-
# return call_all
|
901
|
-
# else:
|
902
|
-
# # Flatten the result if it's a single-item list containing a list
|
903
|
-
# if len(self) == 1 and isinstance(results[0], list):
|
904
|
-
# return self.__class__(results[0])
|
905
|
-
# return self.__class__(results)
|
1058
|
+
return self.__class__(results) if results else results
|
906
1059
|
|
907
1060
|
def get(self, default=None):
|
908
1061
|
"""Returns the first item of the current list
|
scrapling/py.typed
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
|