scrapling 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/core/_types.py +3 -0
- scrapling/core/ai.py +2 -1
- scrapling/core/custom_types.py +20 -27
- scrapling/core/mixins.py +15 -9
- scrapling/core/shell.py +4 -3
- scrapling/core/storage.py +5 -5
- scrapling/core/translator.py +13 -8
- scrapling/engines/_browsers/_base.py +37 -14
- scrapling/engines/_browsers/_camoufox.py +76 -35
- scrapling/engines/_browsers/_config_tools.py +1 -1
- scrapling/engines/_browsers/_controllers.py +32 -11
- scrapling/engines/_browsers/_validators.py +31 -10
- scrapling/engines/static.py +678 -668
- scrapling/engines/toolbelt/convertor.py +13 -15
- scrapling/engines/toolbelt/custom.py +6 -9
- scrapling/engines/toolbelt/fingerprints.py +17 -10
- scrapling/engines/toolbelt/navigation.py +11 -3
- scrapling/fetchers/__init__.py +11 -1
- scrapling/fetchers/chrome.py +9 -4
- scrapling/fetchers/firefox.py +0 -4
- scrapling/parser.py +105 -80
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/METADATA +3 -4
- scrapling-0.3.7.dist-info/RECORD +47 -0
- scrapling-0.3.6.dist-info/RECORD +0 -47
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/WHEEL +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/top_level.txt +0 -0
scrapling/parser.py
CHANGED
```diff
@@ -17,17 +17,21 @@ from lxml.etree import (
 from scrapling.core._types import (
     Any,
+    Set,
     Dict,
+    cast,
     List,
     Tuple,
     Union,
     Pattern,
     Callable,
+    Literal,
     Optional,
     Iterable,
     overload,
     Generator,
     SupportsIndex,
+    TYPE_CHECKING,
 )
 from scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers
 from scrapling.core.mixins import SelectorsGeneration
@@ -36,7 +40,7 @@ from scrapling.core.storage import (
     StorageSystemMixin,
     _StorageTools,
 )
-from scrapling.core.translator import
+from scrapling.core.translator import css_to_xpath as _css_to_xpath
 from scrapling.core.utils import clean_spaces, flatten, html_forbidden, log

 __DEFAULT_DB_FILE__ = str(Path(__file__).parent / "elements_storage.db")
@@ -70,20 +74,23 @@ class Selector(SelectorsGeneration):
         "_raw_body",
     )

+    if TYPE_CHECKING:
+        _storage: StorageSystemMixin
+
     def __init__(
         self,
         content: Optional[str | bytes] = None,
-        url:
+        url: str = "",
         encoding: str = "utf-8",
         huge_tree: bool = True,
         root: Optional[HtmlElement] = None,
         keep_comments: Optional[bool] = False,
         keep_cdata: Optional[bool] = False,
         adaptive: Optional[bool] = False,
-        _storage:
+        _storage: Optional[StorageSystemMixin] = None,
         storage: Any = SQLiteStorageSystem,
         storage_args: Optional[Dict] = None,
-        **
+        **_,
     ):
         """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
         with expressions in CSS, XPath, or with simply text. Check the docs for more info.
```
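The signature cleanup above makes `url` an explicit `url: str = ""` and turns the catch-all into `**_`. As a quick illustration of the public surface this signature describes, here is a sketch with invented HTML; it assumes `Selector` is imported from the `scrapling.parser` module this diff edits:

```python
from scrapling.parser import Selector

# Invented input for illustration; any HTML string works.
page = Selector(
    content="<html><body><a href='/about'>About</a></body></html>",
    url="https://example.com",  # typed `str`, defaults to ""
)
print(page.css_first("a::attr(href)"))  # "/about"
print(page.urljoin("/about"))           # "https://example.com/about"
```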
```diff
@@ -131,7 +138,7 @@ class Selector(SelectorsGeneration):
                 default_doctype=True,
                 strip_cdata=(not keep_cdata),
             )
-            self._root = fromstring(body, parser=parser, base_url=url)
+            self._root = cast(HtmlElement, fromstring(body, parser=parser, base_url=url or None))
             self._raw_body = content

         else:
@@ -141,7 +148,7 @@ class Selector(SelectorsGeneration):
                     f"Root have to be a valid element of `html` module types to work, not of type {type(root)}"
                 )

-            self._root = root
+            self._root = cast(HtmlElement, root)
             self._raw_body = ""

         self.__adaptive_enabled = adaptive
@@ -238,6 +245,9 @@ class Selector(SelectorsGeneration):
             **self.__response_data,
         )

+    def __elements_convertor(self, elements: List[HtmlElement]) -> "Selectors":
+        return Selectors(map(self.__element_convertor, elements))
+
     def __handle_element(
         self, element: Optional[HtmlElement | _ElementUnicodeResult]
     ) -> Optional[Union[TextHandler, "Selector"]]:
@@ -262,7 +272,7 @@ class Selector(SelectorsGeneration):
         if self._is_text_node(result[0]):
             return TextHandlers(map(TextHandler, result))

-        return
+        return self.__elements_convertor(result)

     def __getstate__(self) -> Any:
         # lxml don't like it :)
@@ -323,7 +333,7 @@ class Selector(SelectorsGeneration):
             if not valid_values or processed_text.strip():
                 _all_strings.append(processed_text)

-        return TextHandler(separator).join(_all_strings)
+        return cast(TextHandler, TextHandler(separator).join(_all_strings))

     def urljoin(self, relative_url: str) -> str:
         """Join this Selector's url with a relative url to form an absolute full URL."""
@@ -372,13 +382,14 @@ class Selector(SelectorsGeneration):
     @property
     def parent(self) -> Optional["Selector"]:
         """Return the direct parent of the element or ``None`` otherwise"""
-
+        _parent = self._root.getparent()
+        return self.__element_convertor(_parent) if _parent is not None else None

     @property
     def below_elements(self) -> "Selectors":
         """Return all elements under the current element in the DOM tree"""
         below = _find_all_elements(self._root)
-        return self.
+        return self.__elements_convertor(below) if below is not None else Selectors()

     @property
     def children(self) -> "Selectors":
@@ -425,7 +436,7 @@ class Selector(SelectorsGeneration):
             # Ignore HTML comments and unwanted types
             next_element = next_element.getnext()

-        return self.
+        return self.__element_convertor(next_element) if next_element is not None else None

     @property
     def previous(self) -> Optional["Selector"]:
@@ -435,10 +446,10 @@ class Selector(SelectorsGeneration):
             # Ignore HTML comments and unwanted types
             prev_element = prev_element.getprevious()

-        return self.
+        return self.__element_convertor(prev_element) if prev_element is not None else None

     # For easy copy-paste from Scrapy/parsel code when needed :)
-    def get(self, default=None):
+    def get(self, default=None):  # pyright: ignore
         return self

     def get_all(self):
@@ -468,6 +479,16 @@ class Selector(SelectorsGeneration):
         return data + ">"

     # From here we start with the selecting functions
+    @overload
+    def relocate(
+        self, element: Union[Dict, HtmlElement, "Selector"], percentage: int, selector_type: Literal[True]
+    ) -> "Selectors": ...
+
+    @overload
+    def relocate(
+        self, element: Union[Dict, HtmlElement, "Selector"], percentage: int, selector_type: Literal[False] = False
+    ) -> List[HtmlElement]: ...
+
     def relocate(
         self,
         element: Union[Dict, HtmlElement, "Selector"],
```
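The new `@overload` pair is purely for static typing: passing `selector_type=True` narrows the return type to `Selectors`, while the `Literal[False]` default yields `List[HtmlElement]`. A hedged sketch of the effect; `relocate` is normally driven internally by the adaptive feature, so calling it directly like this is only for demonstration:

```python
from typing import reveal_type  # Python 3.11+; also prints the type note at runtime

from scrapling.parser import Selector

page = Selector(content="<div id='a'>hi</div>")
target = page.css_first("#a")  # a Selector is one of the accepted element types

found = page.relocate(target, percentage=50, selector_type=True)
reveal_type(found)  # a type checker now infers "Selectors"

raw = page.relocate(target, percentage=50)
reveal_type(raw)    # ...and "List[HtmlElement]" here
```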
```diff
@@ -506,11 +527,11 @@ class Selector(SelectorsGeneration):
             log.debug(f"Highest probability was {highest_probability}%")
             log.debug("Top 5 best matching elements are: ")
             for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
-                log.debug(f"{percent} -> {self.
+                log.debug(f"{percent} -> {self.__elements_convertor(score_table[percent])}")

             if not selector_type:
                 return score_table[highest_probability]
-            return self.
+            return self.__elements_convertor(score_table[highest_probability])
         return []

     def css_first(
@@ -593,7 +614,7 @@ class Selector(SelectorsGeneration):
         auto_save: bool = False,
         percentage: int = 0,
         **kwargs: Any,
-    ) -> Union["Selectors", List, "TextHandlers"]:
+    ) -> Union["Selectors", List[Any], "TextHandlers"]:
         """Search the current tree with CSS3 selectors

         **Important:
@@ -614,7 +635,7 @@ class Selector(SelectorsGeneration):
         try:
             if not self.__adaptive_enabled or "," not in selector:
                 # No need to split selectors in this case, let's save some CPU cycles :)
-                xpath_selector =
+                xpath_selector = _css_to_xpath(selector)
                 return self.xpath(
                     xpath_selector,
                     identifier or selector,
@@ -628,7 +649,7 @@ class Selector(SelectorsGeneration):
             for single_selector in split_selectors(selector):
                 # I'm doing this only so the `save` function saves data correctly for combined selectors
                 # Like using the ',' to combine two different selectors that point to different elements.
-                xpath_selector =
+                xpath_selector = _css_to_xpath(single_selector.canonical())
                 results += self.xpath(
                     xpath_selector,
                     identifier or single_selector.canonical(),
@@ -731,7 +752,8 @@ class Selector(SelectorsGeneration):
             raise TypeError("You have to pass something to search with, like tag name(s), tag attributes, or both.")

         attributes = dict()
-        tags
+        tags: Set[str] = set()
+        patterns: Set[Pattern] = set()
         results, functions, selectors = Selectors(), [], []

         # Brace yourself for a wonderful journey!
@@ -740,6 +762,7 @@ class Selector(SelectorsGeneration):
                 tags.add(arg)

             elif type(arg) in (list, tuple, set):
+                arg = cast(Iterable, arg)  # Type narrowing for type checkers like pyright
                 if not all(map(lambda x: isinstance(x, str), arg)):
                     raise TypeError("Nested Iterables are not accepted, only iterables of tag names are accepted")
                 tags.update(set(arg))
@@ -774,7 +797,7 @@ class Selector(SelectorsGeneration):
                 attributes[attribute_name] = value

         # It's easier and faster to build a selector than traversing the tree
-        tags = tags or
+        tags = tags or set("*")
         for tag in tags:
             selector = tag
             for key, value in attributes.items():
@@ -785,7 +808,7 @@ class Selector(SelectorsGeneration):
             selectors.append(selector)

         if selectors:
-            results = self.css(", ".join(selectors))
+            results = cast(Selectors, self.css(", ".join(selectors)))
             if results:
                 # From the results, get the ones that fulfill passed regex patterns
                 for pattern in patterns:
@@ -828,20 +851,20 @@ class Selector(SelectorsGeneration):
         :return: A percentage score of how similar is the candidate to the original element
         """
         score, checks = 0, 0
+        data = _StorageTools.element_to_dict(candidate)

         # Possible TODO:
         # Study the idea of giving weight to each test below so some are more important than others
         # Current results: With weights some websites had better score while it was worse for others
-        score += 1 if original["tag"] ==
+        score += 1 if original["tag"] == data["tag"] else 0  # * 0.3  # 30%
         checks += 1

         if original["text"]:
-            score += SequenceMatcher(None, original["text"],
+            score += SequenceMatcher(None, original["text"], data.get("text") or "").ratio()  # * 0.3  # 30%
             checks += 1

         # if both don't have attributes, it still counts for something!
-        score += self.__calculate_dict_diff(original["attributes"],
+        score += self.__calculate_dict_diff(original["attributes"], data["attributes"])  # * 0.3  # 30%
         checks += 1

         # Separate similarity test for class, id, href,... this will help in full structural changes
@@ -855,23 +878,23 @@ class Selector(SelectorsGeneration):
                 score += SequenceMatcher(
                     None,
                     original["attributes"][attrib],
-
+                    data["attributes"].get(attrib) or "",
                 ).ratio()  # * 0.3  # 30%
                 checks += 1

-        score += SequenceMatcher(None, original["path"],
+        score += SequenceMatcher(None, original["path"], data["path"]).ratio()  # * 0.1  # 10%
         checks += 1

         if original.get("parent_name"):
             # Then we start comparing parents' data
-            if
+            if data.get("parent_name"):
                 score += SequenceMatcher(
-                    None, original["parent_name"],
+                    None, original["parent_name"], data.get("parent_name") or ""
                 ).ratio()  # * 0.2  # 20%
                 checks += 1

                 score += self.__calculate_dict_diff(
-                    original["parent_attribs"],
+                    original["parent_attribs"], data.get("parent_attribs") or {}
                 )  # * 0.2  # 20%
                 checks += 1
@@ -879,7 +902,7 @@ class Selector(SelectorsGeneration):
                 score += SequenceMatcher(
                     None,
                     original["parent_text"],
-
+                    data.get("parent_text") or "",
                 ).ratio()  # * 0.1  # 10%
                 checks += 1
         # else:
@@ -887,9 +910,7 @@ class Selector(SelectorsGeneration):
         #     score -= 0.1

         if original.get("siblings"):
-            score += SequenceMatcher(
-                None, original["siblings"], candidate.get("siblings") or []
-            ).ratio()  # * 0.1  # 10%
+            score += SequenceMatcher(None, original["siblings"], data.get("siblings") or []).ratio()  # * 0.1  # 10%
             checks += 1

         # How % sure? let's see
```
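For readers skimming the scoring hunks above: each check contributes a value in [0, 1] (an exact tag match, `SequenceMatcher` ratios over text, attributes, and path), and the total is averaged over the number of checks into a percentage. A standalone illustration, not Scrapling code, with made-up strings; the library's exact rounding may differ:

```python
from difflib import SequenceMatcher

# Each check contributes 0..1; the commented-out "* 0.3" weights in the diff
# hint at a weighted variant that was tried and shelved.
score, checks = 0.0, 0

score += 1  # tag check: "a" == "a"
checks += 1

score += SequenceMatcher(None, "Add to cart", "Add to basket").ratio()  # text
checks += 1

score += SequenceMatcher(None, "html/body/div/a", "html/body/div/span/a").ratio()  # path
checks += 1

print(f"{(score / checks) * 100:.2f}% similar")  # averaged into a percentage
```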
```diff
@@ -902,7 +923,7 @@ class Selector(SelectorsGeneration):
         score += SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio() * 0.5
         return score

-    def save(self, element:
+    def save(self, element: HtmlElement, identifier: str) -> None:
         """Saves the element's unique properties to the storage for retrieval and relocation later

         :param element: The element itself that we want to save to storage, it can be a `Selector` or pure `HtmlElement`
@@ -910,15 +931,16 @@ class Selector(SelectorsGeneration):
             the docs for more info.
         """
         if self.__adaptive_enabled:
+            target = element
+            if isinstance(target, self.__class__):
+                target: HtmlElement = target._root

-            if self._is_text_node(
+            if self._is_text_node(target):
+                target: HtmlElement = target.getparent()

-            self._storage.save(
+            self._storage.save(target, identifier)
         else:
-
+            raise RuntimeError(
                 "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
             )
@@ -932,10 +954,9 @@ class Selector(SelectorsGeneration):
         if self.__adaptive_enabled:
             return self._storage.retrieve(identifier)

-
+        raise RuntimeError(
             "Can't use `adaptive` features while it's disabled globally, you have to start a new class instance."
         )
-        return None

     # Operations on text functions
     def json(self) -> Dict:
```
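`save` and `retrieve` now fail loudly when `adaptive` was never enabled; `retrieve` previously fell through past an unreachable `return None`. A sketch of the new contract, with invented HTML and identifier, assuming the public `retrieve(identifier)` method these hunks edit:

```python
from scrapling.parser import Selector

page = Selector(content="<div id='price'>9.99</div>")  # `adaptive` defaults to False

try:
    page.retrieve("price-element")
except RuntimeError as err:
    print(err)  # "Can't use `adaptive` features while it's disabled globally, ..."
```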
```diff
@@ -1104,28 +1125,30 @@ class Selector(SelectorsGeneration):
         if not case_sensitive:
             text = text.lower()

-        node_text =
-        elif text == node_text:
-            results.append(node)
+        possible_targets = _find_all_elements_with_spaces(self._root)
+        if possible_targets:
+            for node in self.__elements_convertor(possible_targets):
+                """Check if element matches given text otherwise, traverse the children tree and iterate"""
+                node_text = node.text
+                if clean_match:
+                    node_text = node_text.clean()
+
+                if not case_sensitive:
+                    node_text = node_text.lower()
+
+                if partial:
+                    if text in node_text:
+                        results.append(node)
+                elif text == node_text:
+                    results.append(node)
+
+                if first_match and results:
+                    # we got an element so we should stop
+                    break

+        if first_match:
+            if results:
+                return results[0]
         return results
@@ -1143,23 +1166,25 @@ class Selector(SelectorsGeneration):
         """
         results = Selectors()

+        possible_targets = _find_all_elements_with_spaces(self._root)
+        if possible_targets:
+            for node in self.__elements_convertor(possible_targets):
+                """Check if element matches given regex otherwise, traverse the children tree and iterate"""
+                node_text = node.text
+                if node_text.re(
+                    query,
+                    check_match=True,
+                    clean_match=clean_match,
+                    case_sensitive=case_sensitive,
+                ):
+                    results.append(node)
+
+                if first_match and results:
+                    # we got an element so we should stop
+                    break
+
+        if results and first_match:
+            return results[0]
         return results
```
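Externally, `find_by_text` and `find_by_regex` keep the same parameters; the rewrite walks a precomputed `_find_all_elements_with_spaces` list in a single loop rather than per-element traversal. A usage sketch with invented HTML:

```python
from scrapling.parser import Selector

page = Selector(content="<div><span>Hello World</span><span>Goodbye</span></div>")

# first_match=True returns the first hit instead of a Selectors list
hit = page.find_by_text("hello", partial=True, case_sensitive=False, first_match=True)
print(hit.text)  # "Hello World"

hits = page.find_by_regex(r"Good\w+", first_match=False)
print(len(hits))  # 1
```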
```diff
@@ -1181,9 +1206,9 @@ class Selectors(List[Selector]):
     def __getitem__(self, pos: SupportsIndex | slice) -> Union[Selector, "Selectors"]:
         lst = super().__getitem__(pos)
         if isinstance(pos, slice):
-            return self.__class__(lst)
+            return self.__class__(cast(List[Selector], lst))
         else:
-            return lst
+            return cast(Selector, lst)

     def xpath(
         self,
```
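The two `cast` calls are type-checker-only; at runtime `__getitem__` already returned a `Selectors` for slices and a plain `Selector` for integer indices, as this sketch assumes:

```python
from scrapling.parser import Selector, Selectors

page = Selector(content="<p>a</p><p>b</p><p>c</p>")
items = page.css("p")

assert isinstance(items, Selectors)
assert isinstance(items[0], Selector)    # indexing -> a single Selector
assert isinstance(items[:2], Selectors)  # slicing  -> a new Selectors
```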
```diff
@@ -1265,7 +1290,7 @@ class Selectors(List[Selector]):
     def re_first(
         self,
         regex: str | Pattern,
-        default=None,
+        default: Any = None,
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
```
{scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/METADATA
CHANGED
````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scrapling
-Version: 0.3.6
+Version: 0.3.7
 Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -77,7 +77,7 @@ Requires-Dist: camoufox>=0.4.11; extra == "fetchers"
 Requires-Dist: geoip2>=5.1.0; extra == "fetchers"
 Requires-Dist: msgspec>=0.19.0; extra == "fetchers"
 Provides-Extra: ai
-Requires-Dist: mcp>=1.
+Requires-Dist: mcp>=1.16.0; extra == "ai"
 Requires-Dist: markdownify>=1.2.0; extra == "ai"
 Requires-Dist: scrapling[fetchers]; extra == "ai"
 Provides-Extra: shell
@@ -162,7 +162,6 @@ Built for the modern Web, Scrapling features its own rapid parsing engine and fe
 <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
 <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
-<a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
 <a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
 <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>

@@ -389,7 +388,7 @@ Starting with v0.3.2, this installation only includes the parser engine and its
 ### Docker
 You can also install a Docker image with all extras and browsers with the following command:
 ```bash
-docker pull scrapling
+docker pull pyd4vinci/scrapling
 ```
 This image is automatically built and pushed to Docker Hub through GitHub actions right here.
````
scrapling-0.3.7.dist-info/RECORD
ADDED
```diff
@@ -0,0 +1,47 @@
+scrapling/__init__.py,sha256=ckdQrdwM2SRKBMcORUsCUgU6JWoUwGtrbC3U0OH5RN4,1522
+scrapling/cli.py,sha256=gbhfy2GCz_VqcWhBaNMK4wevayxNtLb72SQIUR9Ebik,26916
+scrapling/parser.py,sha256=bQ7_c3rHjnjJsWI-qqkvEVkVx4-NM-1SWYpQrcwbflQ,58837
+scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
+scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scrapling/core/_html_utils.py,sha256=ki47v54SsTL5-khi1jcLkJqAHqEq19cuex-dqzXdbEI,20328
+scrapling/core/_types.py,sha256=SwJpeZ6TSi20y0FWHo-BmfGlkHpbot6qaql01PskgHw,897
+scrapling/core/ai.py,sha256=xE0RXQxZzH62fCdFiNxcSWbdeuUZK3TlXd4hPkdOO80,36295
+scrapling/core/custom_types.py,sha256=JlaOKvtI28ZkJ5ylaXIKfqqhlOOhIsZDNBhTbLfyWPo,13423
+scrapling/core/mixins.py,sha256=Npw36VPmsHMrEZ5VXgBbLL1OyYcFqMUWkUB5oWATqtw,3522
+scrapling/core/shell.py,sha256=kx7_6zGRXAd9NulL0cyX4YVQMGf4Ij1MYUtceSSE9xk,22983
+scrapling/core/storage.py,sha256=eEAwl88bmAexXwnow86alV7TaGNf1an5_J7e1Mas7PU,6309
+scrapling/core/translator.py,sha256=5Wk1rn3mSXO-1ACYnrORjO7n9aP2f5-OAzT8MeNjv-M,5354
+scrapling/core/utils/__init__.py,sha256=zE2I4Zm355kdGjZBAAghFdFYQ-yRGvZbNqQuDP93-Ok,155
+scrapling/core/utils/_shell.py,sha256=zes71MmFTs7V9f0JFstaWcjQhKNZN6xvspu29YVQtRc,1707
+scrapling/core/utils/_utils.py,sha256=ATy-wwz00U-alOGH-NGK-VoPNr1qYmUwEoWuqAHjDkg,3143
+scrapling/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scrapling/engines/constants.py,sha256=aOIOFCjbtgxH3hehlPU_3EwlnjpdUHRFK342nDQy-Vc,3596
+scrapling/engines/static.py,sha256=3m86QAC1bnK9MD5Cjcs5u2Bu8zb51dzQBLK4Si1K5K8,50062
+scrapling/engines/_browsers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scrapling/engines/_browsers/_base.py,sha256=hSmzQ_UBI8WcgVZyVw2qppNOdR47xwjxypOULfBhkhQ,12546
+scrapling/engines/_browsers/_camoufox.py,sha256=6YikWmY_z38xl9pYW2LgVRRdCwXdCrgFiyyYhBSVQug,38593
+scrapling/engines/_browsers/_config_tools.py,sha256=Vbl-0G3E7_QsA6tZ6FrkUqUy33h--a2O8LveSvVF2y8,4617
+scrapling/engines/_browsers/_controllers.py,sha256=WppP9Tkl4KNCszVTjy3BQ12gyMCPfiv7mUdbjA_l0JY,28705
+scrapling/engines/_browsers/_page.py,sha256=1z-P6c97cTkULE-FVrsMY589e6eL_20Ae8pUe6vjggE,2206
+scrapling/engines/_browsers/_validators.py,sha256=Bpk6P5urruUKDrdrXSnkiBHQWJ-F0JpXvepzlXj6Gfk,8033
+scrapling/engines/toolbelt/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
+scrapling/engines/toolbelt/convertor.py,sha256=tO-floNdsM6PmxaoRPJm9SK3rT7lyCQqwlkoeEe4t_0,13193
+scrapling/engines/toolbelt/custom.py,sha256=GWqKrciMfry_8Vc_0LlaTGNUX8XxVlPquQ9obohIPtY,7447
+scrapling/engines/toolbelt/fingerprints.py,sha256=3DaxNNLConjf_zDi97YswQ8cWgdA6Bq2mdR_l57Ul5E,2521
+scrapling/engines/toolbelt/navigation.py,sha256=VHQ5sMVI-5UtcSpK-_Pin0e16fRLRzW8lYu-MObCxkY,3858
+scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
+scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
+scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=clzuf7KYcvDWYaKKxT_bkAoCT2fGsOcUw47948CHjAc,267
+scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
+scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
+scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
+scrapling/fetchers/__init__.py,sha256=V2PSNzVPqtW7bdRrLygsaxHXqbu_7kdyI3byYr5AFbU,1687
+scrapling/fetchers/chrome.py,sha256=Ky8bxKkvcbT1gmgazdxrUmJ8qHQDa_dhXswi-wtVzNg,12728
+scrapling/fetchers/firefox.py,sha256=Ix_RVatrDOnC3qR_IzkzkD_PbKv66Jd5C5P58YaOUF4,13190
+scrapling/fetchers/requests.py,sha256=Y-ZXhm2Ui1Ugc5lvMgBDIBAmaoh3upjPlbJswdCnyok,978
+scrapling-0.3.7.dist-info/licenses/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.3.7.dist-info/METADATA,sha256=pcX6f6EBl28AL3O_n8bHO8I5_fyXIH2CvT2sgvvjUe8,22465
+scrapling-0.3.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+scrapling-0.3.7.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
+scrapling-0.3.7.dist-info/top_level.txt,sha256=Ud-yF-PC2U5HQ3nc5QwT7HSPdIpF1RuwQ_mYgBzHHIM,10
+scrapling-0.3.7.dist-info/RECORD,,
```
scrapling-0.3.6.dist-info/RECORD
DELETED
```diff
@@ -1,47 +0,0 @@
-scrapling/__init__.py,sha256=SgtXuhY_5yl5li8w1FN7-UEZQnb5kQCThJ9E2vtUFq4,1522
-scrapling/cli.py,sha256=gbhfy2GCz_VqcWhBaNMK4wevayxNtLb72SQIUR9Ebik,26916
-scrapling/parser.py,sha256=kOwqhsAvMO2DxjfpuFHKCUq1i2ei_Q5cpeWdt6jvVfY,57572
-scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
-scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scrapling/core/_html_utils.py,sha256=ki47v54SsTL5-khi1jcLkJqAHqEq19cuex-dqzXdbEI,20328
-scrapling/core/_types.py,sha256=hZIkyLtEk5Sa2FxII0AA_G6EIUHfZ3s09fwWktj7nPg,859
-scrapling/core/ai.py,sha256=OcydJb69HI7FaWLm6ckNvyKyFMmN7bHoF3aHJQzMCR4,36275
-scrapling/core/custom_types.py,sha256=GlQZiVIMCyv8vOdDUlASPn85r_4nw0P9ggID9q1VkRA,13608
-scrapling/core/mixins.py,sha256=2iUVcN2XSAKGEvNmAM2Rr9axpZoxu0M2gIFEaFTO_Dg,3206
-scrapling/core/shell.py,sha256=Y7rkgVlrdyeMSs1CfxbENdne3SDpVbUwjI5A79yi8fQ,22932
-scrapling/core/storage.py,sha256=lswUJvqJZXyJUNyeoRMYm-faKxERCAzL-yhQ2zyeZDk,6303
-scrapling/core/translator.py,sha256=HLJngeRRw2M0eNe_f8AfQD64a49OECIEm5Df_WELVG4,5135
-scrapling/core/utils/__init__.py,sha256=zE2I4Zm355kdGjZBAAghFdFYQ-yRGvZbNqQuDP93-Ok,155
-scrapling/core/utils/_shell.py,sha256=zes71MmFTs7V9f0JFstaWcjQhKNZN6xvspu29YVQtRc,1707
-scrapling/core/utils/_utils.py,sha256=ATy-wwz00U-alOGH-NGK-VoPNr1qYmUwEoWuqAHjDkg,3143
-scrapling/engines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scrapling/engines/constants.py,sha256=aOIOFCjbtgxH3hehlPU_3EwlnjpdUHRFK342nDQy-Vc,3596
-scrapling/engines/static.py,sha256=tMwq2RrlHtI3RW-LJ_NPgyfj2ohIvoxZXYVSnPvf-_k,46233
-scrapling/engines/_browsers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scrapling/engines/_browsers/_base.py,sha256=Nn9rkrkUsHGSX6tFrSHbqTQdVqn_T2gdXhUW9FyWmz4,11306
-scrapling/engines/_browsers/_camoufox.py,sha256=wzoFZsJJ0or-3SE6rmWEac4Verwrr2JX7GQltPZYyj8,36308
-scrapling/engines/_browsers/_config_tools.py,sha256=mEPA5SGrWq0dl15cDOT6sOsm5NHMD0vI0fuPttGpw-U,4610
-scrapling/engines/_browsers/_controllers.py,sha256=mYnXnG5z_WkRRsBxa9F8F-7ByVlYqOi1nr852r-zoF4,27210
-scrapling/engines/_browsers/_page.py,sha256=1z-P6c97cTkULE-FVrsMY589e6eL_20Ae8pUe6vjggE,2206
-scrapling/engines/_browsers/_validators.py,sha256=jvJjXURN79aeR-ZFc_k5zf_3ClP18gM1qZA7dMXd_YI,7491
-scrapling/engines/toolbelt/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-scrapling/engines/toolbelt/convertor.py,sha256=e_rMcW8ScdfxKO-V5Mk61blVzwuDgd82CpRds0Z2tMQ,13102
-scrapling/engines/toolbelt/custom.py,sha256=uhMXa_LNcvvG3wZXBRKHXvqLqShMR9SHwc3bBv4UaQs,7664
-scrapling/engines/toolbelt/fingerprints.py,sha256=hCxKUTwo8sy7iN9wk8OA5vGo9XOn6E365zvC1C6zWDE,2212
-scrapling/engines/toolbelt/navigation.py,sha256=Ej23I1n9AjCwxva_yRXUQeefmYJgi7lgb2Wr_b8RNFs,3550
-scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
-scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
-scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=clzuf7KYcvDWYaKKxT_bkAoCT2fGsOcUw47948CHjAc,267
-scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
-scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
-scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
-scrapling/fetchers/__init__.py,sha256=GS_RjGXqyOyLsIbHswVDP7xlccDUQx4cIx6zUCtKvKg,1546
-scrapling/fetchers/chrome.py,sha256=T6l_6TXyiDGdUsHwDH5ZSo8r9JxgisoUPs8DNZfNzKg,12224
-scrapling/fetchers/firefox.py,sha256=aQarwVk81QOB7Pn-l_5pNZt6ee2O29jP65D8V2upYHo,13490
-scrapling/fetchers/requests.py,sha256=Y-ZXhm2Ui1Ugc5lvMgBDIBAmaoh3upjPlbJswdCnyok,978
-scrapling-0.3.6.dist-info/licenses/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
-scrapling-0.3.6.dist-info/METADATA,sha256=pkT1VyULwrTPHzhtters1DsdAh8TNPc54MewJAcaAPk,22699
-scrapling-0.3.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-scrapling-0.3.6.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
-scrapling-0.3.6.dist-info/top_level.txt,sha256=Ud-yF-PC2U5HQ3nc5QwT7HSPdIpF1RuwQ_mYgBzHHIM,10
-scrapling-0.3.6.dist-info/RECORD,,
```
{scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/WHEEL
File without changes
{scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/entry_points.txt
File without changes
{scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/licenses/LICENSE
File without changes
{scrapling-0.3.6.dist-info → scrapling-0.3.7.dist-info}/top_level.txt
File without changes