scrapling 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +29 -19
- scrapling/cli.py +21 -4
- scrapling/core/_types.py +3 -2
- scrapling/core/ai.py +24 -15
- scrapling/core/custom_types.py +20 -27
- scrapling/core/mixins.py +15 -9
- scrapling/core/shell.py +6 -4
- scrapling/core/storage.py +7 -6
- scrapling/core/translator.py +13 -8
- scrapling/core/utils/__init__.py +0 -1
- scrapling/engines/_browsers/__init__.py +0 -2
- scrapling/engines/_browsers/_base.py +45 -21
- scrapling/engines/_browsers/_camoufox.py +98 -43
- scrapling/engines/_browsers/_config_tools.py +1 -1
- scrapling/engines/_browsers/_controllers.py +34 -13
- scrapling/engines/_browsers/_validators.py +31 -10
- scrapling/engines/constants.py +0 -15
- scrapling/engines/static.py +749 -336
- scrapling/engines/toolbelt/convertor.py +13 -15
- scrapling/engines/toolbelt/custom.py +6 -9
- scrapling/engines/toolbelt/fingerprints.py +17 -10
- scrapling/engines/toolbelt/navigation.py +11 -3
- scrapling/fetchers/__init__.py +46 -0
- scrapling/fetchers/chrome.py +210 -0
- scrapling/fetchers/firefox.py +212 -0
- scrapling/fetchers/requests.py +28 -0
- scrapling/parser.py +109 -84
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/METADATA +17 -16
- scrapling-0.3.7.dist-info/RECORD +47 -0
- scrapling/fetchers.py +0 -444
- scrapling-0.3.5.dist-info/RECORD +0 -44
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/WHEEL +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/top_level.txt +0 -0
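The headline structural change in this release is the removal of the monolithic scrapling/fetchers.py (444 lines) in favor of a scrapling/fetchers/ package (__init__.py, chrome.py, firefox.py, requests.py). The README excerpt diffed below keeps the same public import path, so a sketch like the following should be unaffected by the split; the URL is a hypothetical placeholder and the calls are hedged against the imports shown in the diffed README:

```python
# Minimal sketch, assuming the public import path survives the
# fetchers.py -> fetchers/ package split (the diffed README below
# uses these same imports).
from scrapling.fetchers import Fetcher, StealthyFetcher

page = Fetcher.get("https://example.com")            # plain HTTP fetch
page = StealthyFetcher.fetch("https://example.com")  # stealth fetch (modified Firefox)
print(page.css_first("title::text"))
```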
scrapling/parser.py
CHANGED
@@ -1,8 +1,8 @@
-import re
 from pathlib import Path
 from inspect import signature
 from urllib.parse import urljoin
 from difflib import SequenceMatcher
+from re import Pattern as re_Pattern
 
 from lxml.html import HtmlElement, HtmlMixin, HTMLParser
 from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
@@ -17,17 +17,21 @@ from lxml.etree import (
 
 from scrapling.core._types import (
     Any,
+    Set,
     Dict,
+    cast,
     List,
     Tuple,
     Union,
     Pattern,
     Callable,
+    Literal,
     Optional,
     Iterable,
     overload,
     Generator,
     SupportsIndex,
+    TYPE_CHECKING,
 )
 from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
 from scrapling.core.mixins import SelectorsGeneration
@@ -36,7 +40,7 @@ from scrapling.core.storage import (
     StorageSystemMixin,
     _StorageTools,
 )
-from scrapling.core.translator import
+from scrapling.core.translator import css_to_xpath as _css_to_xpath
 from scrapling.core.utils import clean_spaces, flatten, html_forbidden, log
 
 __DEFAULT_DB_FILE__ = str(Path(__file__).parent / "elements_storage.db")
@@ -70,20 +74,23 @@ class Selector(SelectorsGeneration):
         "_raw_body",
     )
 
+    if TYPE_CHECKING:
+        _storage: StorageSystemMixin
+
     def __init__(
         self,
         content: Optional[str | bytes] = None,
-        url:
+        url: str = "",
         encoding: str = "utf-8",
         huge_tree: bool = True,
         root: Optional[HtmlElement] = None,
         keep_comments: Optional[bool] = False,
         keep_cdata: Optional[bool] = False,
         adaptive: Optional[bool] = False,
-        _storage:
+        _storage: Optional[StorageSystemMixin] = None,
         storage: Any = SQLiteStorageSystem,
         storage_args: Optional[Dict] = None,
-        **
+        **_,
     ):
         """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
         with expressions in CSS, XPath, or with simply text. Check the docs for more info.
@@ -131,7 +138,7 @@ class Selector(SelectorsGeneration):
                 default_doctype=True,
                 strip_cdata=(not keep_cdata),
             )
-            self._root = fromstring(body, parser=parser, base_url=url)
+            self._root = cast(HtmlElement, fromstring(body, parser=parser, base_url=url or None))
             self._raw_body = content
 
         else:
@@ -141,7 +148,7 @@ class Selector(SelectorsGeneration):
                 f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
             )
 
-            self._root = root
+            self._root = cast(HtmlElement, root)
             self._raw_body = ""
 
         self.__adaptive_enabled = adaptive
@@ -238,6 +245,9 @@ class Selector(SelectorsGeneration):
             **self.__response_data,
         )
 
+    def __elements_convertor(self, elements: List[HtmlElement]) -> "Selectors":
+        return Selectors(map(self.__element_convertor, elements))
+
     def __handle_element(
         self, element: Optional[HtmlElement | _ElementUnicodeResult]
     ) -> Optional[Union[TextHandler, "Selector"]]:
@@ -262,7 +272,7 @@ class Selector(SelectorsGeneration):
         if self._is_text_node(result[0]):
             return TextHandlers(map(TextHandler, result))
 
-        return
+        return self.__elements_convertor(result)
 
     def __getstate__(self) -> Any:
         # lxml don't like it :)
@@ -323,7 +333,7 @@ class Selector(SelectorsGeneration):
             if not valid_values or processed_text.strip():
                 _all_strings.append(processed_text)
 
-        return TextHandler(separator).join(_all_strings)
+        return cast(TextHandler, TextHandler(separator).join(_all_strings))
 
     def urljoin(self, relative_url: str) -> str:
         """Join this Selector's url with a relative url to form an absolute full URL."""
@@ -341,7 +351,7 @@ class Selector(SelectorsGeneration):
         """Return the inner HTML code of the element"""
         content = tostring(self._root, encoding=self.encoding, method="html", with_tail=False)
         if isinstance(content, bytes):
-            content = content.decode(
+            content = content.strip().decode(self.encoding)
         return TextHandler(content)
 
     @property
@@ -359,7 +369,7 @@ class Selector(SelectorsGeneration):
             with_tail=False,
         )
         if isinstance(content, bytes):
-            content = content.decode(
+            content = content.strip().decode(self.encoding)
         return TextHandler(content)
 
     def has_class(self, class_name: str) -> bool:
@@ -372,13 +382,14 @@ class Selector(SelectorsGeneration):
     @property
     def parent(self) -> Optional["Selector"]:
         """Return the direct parent of the element or ``None`` otherwise"""
-
+        _parent = self._root.getparent()
+        return self.__element_convertor(_parent) if _parent is not None else None
 
     @property
     def below_elements(self) -> "Selectors":
         """Return all elements under the current element in the DOM tree"""
         below = _find_all_elements(self._root)
-        return self.
+        return self.__elements_convertor(below) if below is not None else Selectors()
 
     @property
     def children(self) -> "Selectors":
@@ -425,7 +436,7 @@ class Selector(SelectorsGeneration):
             # Ignore HTML comments and unwanted types
             next_element = next_element.getnext()
 
-        return self.
+        return self.__element_convertor(next_element) if next_element is not None else None
 
     @property
     def previous(self) -> Optional["Selector"]:
@@ -435,10 +446,10 @@ class Selector(SelectorsGeneration):
             # Ignore HTML comments and unwanted types
             prev_element = prev_element.getprevious()
 
-        return self.
+        return self.__element_convertor(prev_element) if prev_element is not None else None
 
     # For easy copy-paste from Scrapy/parsel code when needed :)
-    def get(self, default=None):
+    def get(self, default=None):  # pyright: ignore
         return self
 
     def get_all(self):
@@ -468,6 +479,16 @@ class Selector(SelectorsGeneration):
         return data + ">"
 
     # From here we start with the selecting functions
+    @overload
+    def relocate(
+        self, element: Union[Dict, HtmlElement, "Selector"], percentage: int, selector_type: Literal[True]
+    ) -> "Selectors": ...
+
+    @overload
+    def relocate(
+        self, element: Union[Dict, HtmlElement, "Selector"], percentage: int, selector_type: Literal[False] = False
+    ) -> List[HtmlElement]: ...
+
     def relocate(
         self,
         element: Union[Dict, HtmlElement, "Selector"],
@@ -506,11 +527,11 @@ class Selector(SelectorsGeneration):
             log.debug(f"Highest probability was {highest_probability}%")
             log.debug("Top 5 best matching elements are: ")
             for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
-                log.debug(f"{percent} -> {self.
+                log.debug(f"{percent} -> {self.__elements_convertor(score_table[percent])}")
 
             if not selector_type:
                 return score_table[highest_probability]
-            return self.
+            return self.__elements_convertor(score_table[highest_probability])
         return []
 
     def css_first(
@@ -593,7 +614,7 @@ class Selector(SelectorsGeneration):
         auto_save: bool = False,
         percentage: int = 0,
         **kwargs: Any,
-    ) -> Union["Selectors", List, "TextHandlers"]:
+    ) -> Union["Selectors", List[Any], "TextHandlers"]:
         """Search the current tree with CSS3 selectors
 
         **Important:
@@ -614,7 +635,7 @@ class Selector(SelectorsGeneration):
         try:
             if not self.__adaptive_enabled or "," not in selector:
                 # No need to split selectors in this case, let's save some CPU cycles :)
-                xpath_selector =
+                xpath_selector = _css_to_xpath(selector)
                 return self.xpath(
                     xpath_selector,
                     identifier or selector,
@@ -628,7 +649,7 @@ class Selector(SelectorsGeneration):
                 for single_selector in split_selectors(selector):
                     # I'm doing this only so the `save` function saves data correctly for combined selectors
                    # Like using the ',' to combine two different selectors that point to different elements.
-                    xpath_selector =
+                    xpath_selector = _css_to_xpath(single_selector.canonical())
                    results += self.xpath(
                        xpath_selector,
                        identifier or single_selector.canonical(),
@@ -731,7 +752,8 @@ class Selector(SelectorsGeneration):
             raise TypeError("You have to pass something to search with, like tag name(s), tag attributes, or both.")
 
         attributes = dict()
-        tags
+        tags: Set[str] = set()
+        patterns: Set[Pattern] = set()
         results, functions, selectors = Selectors(), [], []
 
         # Brace yourself for a wonderful journey!
@@ -740,6 +762,7 @@ class Selector(SelectorsGeneration):
                 tags.add(arg)
 
             elif type(arg) in (list, tuple, set):
+                arg = cast(Iterable, arg)  # Type narrowing for type checkers like pyright
                 if not all(map(lambda x: isinstance(x, str), arg)):
                     raise TypeError("Nested Iterables are not accepted, only iterables of tag names are accepted")
                 tags.update(set(arg))
@@ -751,7 +774,7 @@ class Selector(SelectorsGeneration):
                 )
                 attributes.update(arg)
 
-            elif isinstance(arg,
+            elif isinstance(arg, re_Pattern):
                 patterns.add(arg)
 
             elif callable(arg):
@@ -774,7 +797,7 @@ class Selector(SelectorsGeneration):
             attributes[attribute_name] = value
 
         # It's easier and faster to build a selector than traversing the tree
-        tags = tags or
+        tags = tags or set("*")
         for tag in tags:
             selector = tag
             for key, value in attributes.items():
@@ -785,7 +808,7 @@ class Selector(SelectorsGeneration):
             selectors.append(selector)
 
         if selectors:
-            results = self.css(", ".join(selectors))
+            results = cast(Selectors, self.css(", ".join(selectors)))
             if results:
                 # From the results, get the ones that fulfill passed regex patterns
                 for pattern in patterns:
@@ -828,20 +851,20 @@ class Selector(SelectorsGeneration):
         :return: A percentage score of how similar is the candidate to the original element
         """
         score, checks = 0, 0
-
+        data = _StorageTools.element_to_dict(candidate)
 
         # Possible TODO:
         # Study the idea of giving weight to each test below so some are more important than others
         # Current results: With weights some websites had better score while it was worse for others
-        score += 1 if original["tag"] ==
+        score += 1 if original["tag"] == data["tag"] else 0  # * 0.3  # 30%
         checks += 1
 
         if original["text"]:
-            score += SequenceMatcher(None, original["text"],
+            score += SequenceMatcher(None, original["text"], data.get("text") or "").ratio()  # * 0.3  # 30%
             checks += 1
 
         # if both don't have attributes, it still counts for something!
-        score += self.__calculate_dict_diff(original["attributes"],
+        score += self.__calculate_dict_diff(original["attributes"], data["attributes"])  # * 0.3  # 30%
         checks += 1
 
         # Separate similarity test for class, id, href,... this will help in full structural changes
@@ -855,23 +878,23 @@ class Selector(SelectorsGeneration):
                 score += SequenceMatcher(
                     None,
                     original["attributes"][attrib],
-
+                    data["attributes"].get(attrib) or "",
                 ).ratio()  # * 0.3  # 30%
                 checks += 1
 
-        score += SequenceMatcher(None, original["path"],
+        score += SequenceMatcher(None, original["path"], data["path"]).ratio()  # * 0.1  # 10%
         checks += 1
 
         if original.get("parent_name"):
             # Then we start comparing parents' data
-            if
+            if data.get("parent_name"):
                 score += SequenceMatcher(
-                    None, original["parent_name"],
+                    None, original["parent_name"], data.get("parent_name") or ""
                 ).ratio()  # * 0.2  # 20%
                 checks += 1
 
                 score += self.__calculate_dict_diff(
-                    original["parent_attribs"],
+                    original["parent_attribs"], data.get("parent_attribs") or {}
                 )  # * 0.2  # 20%
                 checks += 1
 
@@ -879,7 +902,7 @@ class Selector(SelectorsGeneration):
                 score += SequenceMatcher(
                     None,
                     original["parent_text"],
-
+                    data.get("parent_text") or "",
                 ).ratio()  # * 0.1  # 10%
                 checks += 1
             # else:
@@ -887,9 +910,7 @@ class Selector(SelectorsGeneration):
         #     score -= 0.1
 
         if original.get("siblings"):
-            score += SequenceMatcher(
-                None, original["siblings"], candidate.get("siblings") or []
-            ).ratio()  # * 0.1  # 10%
+            score += SequenceMatcher(None, original["siblings"], data.get("siblings") or []).ratio()  # * 0.1  # 10%
             checks += 1
 
         # How % sure? let's see
@@ -902,7 +923,7 @@ class Selector(SelectorsGeneration):
         score += SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio() * 0.5
         return score
 
-    def save(self, element:
+    def save(self, element: HtmlElement, identifier: str) -> None:
         """Saves the element's unique properties to the storage for retrieval and relocation later
 
         :param element: The element itself that we want to save to storage, it can be a ` Selector ` or pure ` HtmlElement `
@@ -910,15 +931,16 @@ class Selector(SelectorsGeneration):
         the docs for more info.
         """
         if self.__adaptive_enabled:
-
-
+            target = element
+            if isinstance(target, self.__class__):
+                target: HtmlElement = target._root
 
-            if self._is_text_node(
-
+            if self._is_text_node(target):
+                target: HtmlElement = target.getparent()
 
-            self._storage.save(
+            self._storage.save(target, identifier)
         else:
-
+            raise RuntimeError(
                 "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
             )
 
@@ -932,10 +954,9 @@ class Selector(SelectorsGeneration):
         if self.__adaptive_enabled:
             return self._storage.retrieve(identifier)
 
-
+        raise RuntimeError(
             "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
         )
-        return None
 
     # Operations on text functions
     def json(self) -> Dict:
@@ -1104,28 +1125,30 @@ class Selector(SelectorsGeneration):
         if not case_sensitive:
             text = text.lower()
 
-
-
-
-
-        node_text =
-
-
-
-
-
-
+        possible_targets = _find_all_elements_with_spaces(self._root)
+        if possible_targets:
+            for node in self.__elements_convertor(possible_targets):
+                """Check if element matches given text otherwise, traverse the children tree and iterate"""
+                node_text = node.text
+                if clean_match:
+                    node_text = node_text.clean()
+
+                if not case_sensitive:
+                    node_text = node_text.lower()
+
+                if partial:
+                    if text in node_text:
+                        results.append(node)
+                elif text == node_text:
                     results.append(node)
-                elif text == node_text:
-                    results.append(node)
 
-
-
-
+                if first_match and results:
+                    # we got an element so we should stop
+                    break
 
-
-
-
+        if first_match:
+            if results:
+                return results[0]
         return results
 
     def find_by_regex(
@@ -1143,23 +1166,25 @@ class Selector(SelectorsGeneration):
         """
         results = Selectors()
 
-
-
-
-
-
-
-
-
-
-
+        possible_targets = _find_all_elements_with_spaces(self._root)
+        if possible_targets:
+            for node in self.__elements_convertor(possible_targets):
+                """Check if element matches given regex otherwise, traverse the children tree and iterate"""
+                node_text = node.text
+                if node_text.re(
+                    query,
+                    check_match=True,
+                    clean_match=clean_match,
+                    case_sensitive=case_sensitive,
+                ):
+                    results.append(node)
 
-
-
-
+                if first_match and results:
+                    # we got an element so we should stop
+                    break
 
-
-
+        if results and first_match:
+            return results[0]
         return results
 
 
@@ -1181,9 +1206,9 @@ class Selectors(List[Selector]):
     def __getitem__(self, pos: SupportsIndex | slice) -> Union[Selector, "Selectors"]:
         lst = super().__getitem__(pos)
         if isinstance(pos, slice):
-            return self.__class__(lst)
+            return self.__class__(cast(List[Selector], lst))
         else:
-            return lst
+            return cast(Selector, lst)
 
     def xpath(
         self,
@@ -1265,7 +1290,7 @@ class Selectors(List[Selector]):
     def re_first(
         self,
         regex: str | Pattern,
-        default=None,
+        default: Any = None,
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
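Most of the parser.py churn above tightens typing (`cast` calls, `Literal` overloads for `relocate`) and routes element wrapping through the new `__elements_convertor` helper, while `save`/`retrieve` now raise `RuntimeError` instead of silently returning when adaptive mode is off. A rough, hedged sketch of the adaptive workflow those methods serve; the HTML and selector are hypothetical, and the keyword names (`adaptive`, `auto_save`, `percentage`) are taken from the diffed signatures:

```python
# Hedged sketch of the adaptive save/relocate flow these methods implement;
# content and selectors are hypothetical, keyword names come from the diff.
from scrapling.parser import Selector

page = Selector(content="<div id='price'>10</div>", adaptive=True)
page.css("#price", auto_save=True)  # persists the element's fingerprint to storage

# After a markup change, the saved fingerprint lets the parser relocate the element:
changed = Selector(content="<span class='price'>12</span>", adaptive=True)
matches = changed.css("#price", adaptive=True, percentage=70)
```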
{scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scrapling
-Version: 0.3.5
+Version: 0.3.7
 Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -64,7 +64,7 @@ Classifier: Typing :: Typed
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: lxml>=6.0.
+Requires-Dist: lxml>=6.0.2
 Requires-Dist: cssselect>=1.3.0
 Requires-Dist: orjson>=3.11.3
 Requires-Dist: tldextract>=5.3.0
@@ -77,7 +77,7 @@ Requires-Dist: camoufox>=0.4.11; extra == "fetchers"
 Requires-Dist: geoip2>=5.1.0; extra == "fetchers"
 Requires-Dist: msgspec>=0.19.0; extra == "fetchers"
 Provides-Extra: ai
-Requires-Dist: mcp>=1.
+Requires-Dist: mcp>=1.16.0; extra == "ai"
 Requires-Dist: markdownify>=1.2.0; extra == "ai"
 Requires-Dist: scrapling[fetchers]; extra == "ai"
 Provides-Extra: shell
@@ -139,7 +139,7 @@ Dynamic: license-file
 
 Scrapling isn't just another Web Scraping library. It's the first **adaptive** scraping library that learns from website changes and evolves with them. While other libraries break when websites update their structure, Scrapling automatically relocates your elements and keeps your scrapers running.
 
-Built for the modern Web, Scrapling
+Built for the modern Web, Scrapling features its own rapid parsing engine and fetchers to handle all Web Scraping challenges you face or will face. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.
 
 ```python
 >> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
@@ -162,7 +162,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
 <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
-<a href="https://www.
+<a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
 <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 
 <!-- /sponsors -->
@@ -176,7 +176,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 ### Advanced Websites Fetching with Session Support
 - **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use HTTP3.
 - **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium, real Chrome, and custom stealth mode.
-- **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` using a modified version of Firefox and fingerprint spoofing. Can bypass all
+- **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` using a modified version of Firefox and fingerprint spoofing. Can bypass all types of Cloudflare's Turnstile and Interstitial with automation easily.
 - **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.
 - **Async Support**: Complete async support across all fetchers and dedicated async session classes.
 
@@ -200,13 +200,7 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 - 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element.
 - 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel.
 - 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion.
-
-### New Session Architecture
-Scrapling 0.3 introduces a completely revamped session system:
-- **Persistent Sessions**: Maintain cookies, headers, and authentication across multiple requests
-- **Automatic Session Management**: Smart session lifecycle handling with proper cleanup
-- **Session Inheritance**: All fetchers support both one-off requests and persistent session usage
-- **Concurrent Session Support**: Run multiple isolated sessions simultaneously
+- 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed.
 
 ## Getting Started
 
@@ -324,11 +318,11 @@ scrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.
 ```
 
 > [!NOTE]
-> There are many additional features, but we want to keep this page
+> There are many additional features, but we want to keep this page concise, such as the MCP server and the interactive Web Scraping Shell. Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/)
 
 ## Performance Benchmarks
 
-Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3
+Scrapling isn't just powerful—it's also blazing fast, and the updates since version 0.3 have delivered exceptional performance improvements across all operations.
 
 ### Text Extraction Speed Test (5000 nested elements)
 
@@ -391,6 +385,13 @@ Starting with v0.3.2, this installation only includes the parser engine and its
 ```
 Don't forget that you need to install the browser dependencies with `scrapling install` after any of these extras (if you didn't already)
 
+### Docker
+You can also install a Docker image with all extras and browsers with the following command:
+```bash
+docker pull pyd4vinci/scrapling
+```
+This image is automatically built and pushed to Docker Hub through GitHub actions right here.
+
 ## Contributing
 
 We welcome contributions! Please read our [contributing guidelines](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) before getting started.
@@ -398,7 +399,7 @@ We welcome contributions! Please read our [contributing guidelines](https://gith
 ## Disclaimer
 
 > [!CAUTION]
-> This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international data scraping and privacy laws. The authors and contributors are not responsible for any misuse of this software. Always respect
+> This library is provided for educational and research purposes only. By using this library, you agree to comply with local and international data scraping and privacy laws. The authors and contributors are not responsible for any misuse of this software. Always respect the terms of service of websites and robots.txt files.
 
 ## License
 
scrapling-0.3.7.dist-info/RECORD
ADDED
@@ -0,0 +1,47 @@
+scrapling/__init__.py,sha256=ckdQrdwM2SRKBMcORUsCUgU6JWoUwGtrbC3U0OH5RN4,1522
+scrapling/cli.py,sha256=gbhfy2GCz_VqcWhBaNMK4wevayxNtLb72SQIUR9Ebik,26916
+scrapling/parser.py,sha256=bQ7_c3rHjnjJsWI-qqkvEVkVx4-NM-1SWYpQrcwbflQ,58837
+scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
+scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scrapling/core/_html_utils.py,sha256=ki47v54SsTL5-khi1jcLkJqAHqEq19cuex-dqzXdbEI,20328
+scrapling/core/_types.py,sha256=SwJpeZ6TSi20y0FWHo-BmfGlkHpbot6qaql01PskgHw,897
+scrapling/core/ai.py,sha256=xE0RXQxZzH62fCdFiNxcSWbdeuUZK3TlXd4hPkdOO80,36295
+scrapling/core/custom_types.py,sha256=JlaOKvtI28ZkJ5ylaXIKfqqhlOOhIsZDNBhTbLfyWPo,13423
+scrapling/core/mixins.py,sha256=Npw36VPmsHMrEZ5VXgBbLL1OyYcFqMUWkUB5oWATqtw,3522
+scrapling/core/shell.py,sha256=kx7_6zGRXAd9NulL0cyX4YVQMGf4Ij1MYUtceSSE9xk,22983
+scrapling/core/storage.py,sha256=eEAwl88bmAexXwnow86alV7TaGNf1an5_J7e1Mas7PU,6309
+scrapling/core/translator.py,sha256=5Wk1rn3mSXO-1ACYnrORjO7n9aP2f5-OAzT8MeNjv-M,5354
+scrapling/core/utils/__init__.py,sha256=zE2I4Zm355kdGjZBAAghFdFYQ-yRGvZbNqQuDP93-Ok,155
+scrapling/core/utils/_shell.py,sha256=zes71MmFTs7V9f0JFstaWcjQhKNZN6xvspu29YVQtRc,1707
+scrapling/core/utils/_utils.py,sha256=ATy-wwz00U-alOGH-NGK-VoPNr1qYmUwEoWuqAHjDkg,3143
+scrapling/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scrapling/engines/constants.py,sha256=aOIOFCjbtgxH3hehlPU_3EwlnjpdUHRFK342nDQy-Vc,3596
+scrapling/engines/static.py,sha256=3m86QAC1bnK9MD5Cjcs5u2Bu8zb51dzQBLK4Si1K5K8,50062
+scrapling/engines/_browsers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scrapling/engines/_browsers/_base.py,sha256=hSmzQ_UBI8WcgVZyVw2qppNOdR47xwjxypOULfBhkhQ,12546
+scrapling/engines/_browsers/_camoufox.py,sha256=6YikWmY_z38xl9pYW2LgVRRdCwXdCrgFiyyYhBSVQug,38593
+scrapling/engines/_browsers/_config_tools.py,sha256=Vbl-0G3E7_QsA6tZ6FrkUqUy33h--a2O8LveSvVF2y8,4617
+scrapling/engines/_browsers/_controllers.py,sha256=WppP9Tkl4KNCszVTjy3BQ12gyMCPfiv7mUdbjA_l0JY,28705
+scrapling/engines/_browsers/_page.py,sha256=1z-P6c97cTkULE-FVrsMY589e6eL_20Ae8pUe6vjggE,2206
+scrapling/engines/_browsers/_validators.py,sha256=Bpk6P5urruUKDrdrXSnkiBHQWJ-F0JpXvepzlXj6Gfk,8033
+scrapling/engines/toolbelt/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+scrapling/engines/toolbelt/convertor.py,sha256=tO-floNdsM6PmxaoRPJm9SK3rT7lyCQqwlkoeEe4t_0,13193
+scrapling/engines/toolbelt/custom.py,sha256=GWqKrciMfry_8Vc_0LlaTGNUX8XxVlPquQ9obohIPtY,7447
+scrapling/engines/toolbelt/fingerprints.py,sha256=3DaxNNLConjf_zDi97YswQ8cWgdA6Bq2mdR_l57Ul5E,2521
+scrapling/engines/toolbelt/navigation.py,sha256=VHQ5sMVI-5UtcSpK-_Pin0e16fRLRzW8lYu-MObCxkY,3858
+scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
+scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
+scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=clzuf7KYcvDWYaKKxT_bkAoCT2fGsOcUw47948CHjAc,267
+scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
+scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
+scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
+scrapling/fetchers/__init__.py,sha256=V2PSNzVPqtW7bdRrLygsaxHXqbu_7kdyI3byYr5AFbU,1687
+scrapling/fetchers/chrome.py,sha256=Ky8bxKkvcbT1gmgazdxrUmJ8qHQDa_dhXswi-wtVzNg,12728
+scrapling/fetchers/firefox.py,sha256=Ix_RVatrDOnC3qR_IzkzkD_PbKv66Jd5C5P58YaOUF4,13190
+scrapling/fetchers/requests.py,sha256=Y-ZXhm2Ui1Ugc5lvMgBDIBAmaoh3upjPlbJswdCnyok,978
+scrapling-0.3.7.dist-info/licenses/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.3.7.dist-info/METADATA,sha256=pcX6f6EBl28AL3O_n8bHO8I5_fyXIH2CvT2sgvvjUe8,22465
+scrapling-0.3.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+scrapling-0.3.7.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
+scrapling-0.3.7.dist-info/top_level.txt,sha256=Ud-yF-PC2U5HQ3nc5QwT7HSPdIpF1RuwQ_mYgBzHHIM,10
+scrapling-0.3.7.dist-info/RECORD,,
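Each RECORD row above follows the standard wheel format `path,sha256=<digest>,<size-in-bytes>`, where the digest is the urlsafe-base64-encoded SHA-256 of the file with trailing `=` padding stripped (per PEP 376/427). A small sketch of how such a digest can be recomputed for verification; the file path is a hypothetical local one:

```python
import base64
import hashlib
from pathlib import Path

def record_digest(path: str) -> str:
    """Return a RECORD-style hash entry: urlsafe base64 SHA-256, padding stripped."""
    raw = hashlib.sha256(Path(path).read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii")

# e.g., for an installed copy of the parser module (hypothetical path):
print(record_digest("site-packages/scrapling/parser.py"))
```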