scrapling 0.2.92__py3-none-any.whl → 0.2.94__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/core/_types.py +2 -1
- scrapling/core/custom_types.py +97 -45
- scrapling/core/translator.py +1 -1
- scrapling/defaults.py +8 -5
- scrapling/engines/camo.py +46 -2
- scrapling/engines/pw.py +43 -1
- scrapling/engines/static.py +1 -0
- scrapling/engines/toolbelt/custom.py +2 -1
- scrapling/fetchers.py +5 -5
- scrapling/parser.py +158 -194
- {scrapling-0.2.92.dist-info → scrapling-0.2.94.dist-info}/METADATA +59 -33
- {scrapling-0.2.92.dist-info → scrapling-0.2.94.dist-info}/RECORD +19 -19
- {scrapling-0.2.92.dist-info → scrapling-0.2.94.dist-info}/WHEEL +1 -1
- tests/fetchers/async/test_playwright.py +1 -1
- tests/fetchers/sync/test_playwright.py +1 -1
- {scrapling-0.2.92.dist-info → scrapling-0.2.94.dist-info}/LICENSE +0 -0
- {scrapling-0.2.92.dist-info → scrapling-0.2.94.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.92.dist-info → scrapling-0.2.94.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
|
|
5
5
|
from scrapling.parser import Adaptor, Adaptors
|
6
6
|
|
7
7
|
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
8
|
-
__version__ = "0.2.
|
8
|
+
__version__ = "0.2.94"
|
9
9
|
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
10
10
|
|
11
11
|
|
scrapling/core/_types.py
CHANGED
@@ -3,7 +3,8 @@ Type definitions for type checking purposes.
|
|
3
3
|
"""
|
4
4
|
|
5
5
|
from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
|
6
|
-
List, Literal, Optional, Pattern, Tuple, Type,
|
6
|
+
List, Literal, Optional, Pattern, Tuple, Type, TypeVar,
|
7
|
+
Union)
|
7
8
|
|
8
9
|
SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
|
9
10
|
|
scrapling/core/custom_types.py
CHANGED
@@ -1,13 +1,18 @@
|
|
1
1
|
import re
|
2
|
+
import typing
|
2
3
|
from collections.abc import Mapping
|
3
4
|
from types import MappingProxyType
|
4
5
|
|
5
6
|
from orjson import dumps, loads
|
6
7
|
from w3lib.html import replace_entities as _replace_entities
|
7
8
|
|
8
|
-
from scrapling.core._types import Dict, List,
|
9
|
+
from scrapling.core._types import (Dict, Iterable, List, Literal, Optional,
|
10
|
+
Pattern, SupportsIndex, TypeVar, Union)
|
9
11
|
from scrapling.core.utils import _is_iterable, flatten
|
10
12
|
|
13
|
+
# Define type variable for AttributeHandler value type
|
14
|
+
_TextHandlerType = TypeVar('_TextHandlerType', bound='TextHandler')
|
15
|
+
|
11
16
|
|
12
17
|
class TextHandler(str):
|
13
18
|
"""Extends standard Python string by adding more functionality"""
|
@@ -18,72 +23,89 @@ class TextHandler(str):
|
|
18
23
|
return super().__new__(cls, string)
|
19
24
|
return super().__new__(cls, '')
|
20
25
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
26
|
+
@typing.overload
|
27
|
+
def __getitem__(self, key: SupportsIndex) -> 'TextHandler':
|
28
|
+
pass
|
29
|
+
|
30
|
+
@typing.overload
|
31
|
+
def __getitem__(self, key: slice) -> "TextHandlers":
|
32
|
+
pass
|
33
|
+
|
34
|
+
def __getitem__(self, key: Union[SupportsIndex, slice]) -> Union["TextHandler", "TextHandlers"]:
|
35
|
+
lst = super().__getitem__(key)
|
36
|
+
if isinstance(key, slice):
|
37
|
+
lst = [TextHandler(s) for s in lst]
|
38
|
+
return TextHandlers(typing.cast(List[_TextHandlerType], lst))
|
39
|
+
return typing.cast(_TextHandlerType, TextHandler(lst))
|
40
|
+
|
41
|
+
def split(self, sep: str = None, maxsplit: SupportsIndex = -1) -> 'TextHandlers':
|
42
|
+
return TextHandlers(
|
43
|
+
typing.cast(List[_TextHandlerType], [TextHandler(s) for s in super().split(sep, maxsplit)])
|
44
|
+
)
|
45
|
+
|
46
|
+
def strip(self, chars: str = None) -> Union[str, 'TextHandler']:
|
25
47
|
return TextHandler(super().strip(chars))
|
26
48
|
|
27
|
-
def lstrip(self, chars=None):
|
49
|
+
def lstrip(self, chars: str = None) -> Union[str, 'TextHandler']:
|
28
50
|
return TextHandler(super().lstrip(chars))
|
29
51
|
|
30
|
-
def rstrip(self, chars=None):
|
52
|
+
def rstrip(self, chars: str = None) -> Union[str, 'TextHandler']:
|
31
53
|
return TextHandler(super().rstrip(chars))
|
32
54
|
|
33
|
-
def capitalize(self):
|
55
|
+
def capitalize(self) -> Union[str, 'TextHandler']:
|
34
56
|
return TextHandler(super().capitalize())
|
35
57
|
|
36
|
-
def casefold(self):
|
58
|
+
def casefold(self) -> Union[str, 'TextHandler']:
|
37
59
|
return TextHandler(super().casefold())
|
38
60
|
|
39
|
-
def center(self, width, fillchar=' '):
|
61
|
+
def center(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
|
40
62
|
return TextHandler(super().center(width, fillchar))
|
41
63
|
|
42
|
-
def expandtabs(self, tabsize=8):
|
64
|
+
def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, 'TextHandler']:
|
43
65
|
return TextHandler(super().expandtabs(tabsize))
|
44
66
|
|
45
|
-
def format(self, *args, **kwargs):
|
67
|
+
def format(self, *args: str, **kwargs: str) -> Union[str, 'TextHandler']:
|
46
68
|
return TextHandler(super().format(*args, **kwargs))
|
47
69
|
|
48
|
-
def format_map(self, mapping):
|
70
|
+
def format_map(self, mapping) -> Union[str, 'TextHandler']:
|
49
71
|
return TextHandler(super().format_map(mapping))
|
50
72
|
|
51
|
-
def join(self, iterable):
|
73
|
+
def join(self, iterable: Iterable[str]) -> Union[str, 'TextHandler']:
|
52
74
|
return TextHandler(super().join(iterable))
|
53
75
|
|
54
|
-
def ljust(self, width, fillchar=' '):
|
76
|
+
def ljust(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
|
55
77
|
return TextHandler(super().ljust(width, fillchar))
|
56
78
|
|
57
|
-
def rjust(self, width, fillchar=' '):
|
79
|
+
def rjust(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
|
58
80
|
return TextHandler(super().rjust(width, fillchar))
|
59
81
|
|
60
|
-
def swapcase(self):
|
82
|
+
def swapcase(self) -> Union[str, 'TextHandler']:
|
61
83
|
return TextHandler(super().swapcase())
|
62
84
|
|
63
|
-
def title(self):
|
85
|
+
def title(self) -> Union[str, 'TextHandler']:
|
64
86
|
return TextHandler(super().title())
|
65
87
|
|
66
|
-
def translate(self, table):
|
88
|
+
def translate(self, table) -> Union[str, 'TextHandler']:
|
67
89
|
return TextHandler(super().translate(table))
|
68
90
|
|
69
|
-
def zfill(self, width):
|
91
|
+
def zfill(self, width: SupportsIndex) -> Union[str, 'TextHandler']:
|
70
92
|
return TextHandler(super().zfill(width))
|
71
93
|
|
72
|
-
def replace(self, old, new, count
|
94
|
+
def replace(self, old: str, new: str, count: SupportsIndex = -1) -> Union[str, 'TextHandler']:
|
73
95
|
return TextHandler(super().replace(old, new, count))
|
74
96
|
|
75
|
-
def upper(self):
|
97
|
+
def upper(self) -> Union[str, 'TextHandler']:
|
76
98
|
return TextHandler(super().upper())
|
77
99
|
|
78
|
-
def lower(self):
|
100
|
+
def lower(self) -> Union[str, 'TextHandler']:
|
79
101
|
return TextHandler(super().lower())
|
80
102
|
##############
|
81
103
|
|
82
|
-
def sort(self, reverse: bool = False) -> str:
|
104
|
+
def sort(self, reverse: bool = False) -> Union[str, 'TextHandler']:
|
83
105
|
"""Return a sorted version of the string"""
|
84
106
|
return self.__class__("".join(sorted(self, reverse=reverse)))
|
85
107
|
|
86
|
-
def clean(self) -> str:
|
108
|
+
def clean(self) -> Union[str, 'TextHandler']:
|
87
109
|
"""Return a new version of the string after removing all white spaces and consecutive spaces"""
|
88
110
|
data = re.sub(r'[\t|\r|\n]', '', self)
|
89
111
|
data = re.sub(' +', ' ', data)
|
@@ -105,21 +127,43 @@ class TextHandler(str):
|
|
105
127
|
# Check this out: https://github.com/ijl/orjson/issues/445
|
106
128
|
return loads(str(self))
|
107
129
|
|
130
|
+
@typing.overload
|
131
|
+
def re(
|
132
|
+
self,
|
133
|
+
regex: Union[str, Pattern[str]],
|
134
|
+
check_match: Literal[True],
|
135
|
+
replace_entities: bool = True,
|
136
|
+
clean_match: bool = False,
|
137
|
+
case_sensitive: bool = True,
|
138
|
+
) -> bool:
|
139
|
+
...
|
140
|
+
|
141
|
+
@typing.overload
|
142
|
+
def re(
|
143
|
+
self,
|
144
|
+
regex: Union[str, Pattern[str]],
|
145
|
+
replace_entities: bool = True,
|
146
|
+
clean_match: bool = False,
|
147
|
+
case_sensitive: bool = True,
|
148
|
+
check_match: Literal[False] = False,
|
149
|
+
) -> "TextHandlers[TextHandler]":
|
150
|
+
...
|
151
|
+
|
108
152
|
def re(
|
109
153
|
self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
|
110
|
-
case_sensitive: bool =
|
111
|
-
) -> Union[
|
154
|
+
case_sensitive: bool = True, check_match: bool = False
|
155
|
+
) -> Union["TextHandlers[TextHandler]", bool]:
|
112
156
|
"""Apply the given regex to the current text and return a list of strings with the matches.
|
113
157
|
|
114
158
|
:param regex: Can be either a compiled regular expression or a string.
|
115
159
|
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
116
160
|
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
117
|
-
:param case_sensitive: if
|
161
|
+
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
118
162
|
:param check_match: used to quickly check if this regex matches or not without any operations on the results
|
119
163
|
|
120
164
|
"""
|
121
165
|
if isinstance(regex, str):
|
122
|
-
if
|
166
|
+
if case_sensitive:
|
123
167
|
regex = re.compile(regex, re.UNICODE)
|
124
168
|
else:
|
125
169
|
regex = re.compile(regex, flags=re.UNICODE | re.IGNORECASE)
|
@@ -133,19 +177,19 @@ class TextHandler(str):
|
|
133
177
|
results = flatten(results)
|
134
178
|
|
135
179
|
if not replace_entities:
|
136
|
-
return [TextHandler(string) for string in results]
|
180
|
+
return TextHandlers(typing.cast(List[_TextHandlerType], [TextHandler(string) for string in results]))
|
137
181
|
|
138
|
-
return [TextHandler(_replace_entities(s)) for s in results]
|
182
|
+
return TextHandlers(typing.cast(List[_TextHandlerType], [TextHandler(_replace_entities(s)) for s in results]))
|
139
183
|
|
140
184
|
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
|
141
|
-
clean_match: bool = False, case_sensitive: bool =
|
185
|
+
clean_match: bool = False, case_sensitive: bool = True) -> "TextHandler":
|
142
186
|
"""Apply the given regex to text and return the first match if found, otherwise return the default value.
|
143
187
|
|
144
188
|
:param regex: Can be either a compiled regular expression or a string.
|
145
189
|
:param default: The default value to be returned if there is no match
|
146
190
|
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
147
191
|
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
148
|
-
:param case_sensitive: if
|
192
|
+
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
149
193
|
|
150
194
|
"""
|
151
195
|
result = self.re(regex, replace_entities, clean_match=clean_match, case_sensitive=case_sensitive)
|
@@ -158,30 +202,38 @@ class TextHandlers(List[TextHandler]):
|
|
158
202
|
"""
|
159
203
|
__slots__ = ()
|
160
204
|
|
161
|
-
|
205
|
+
@typing.overload
|
206
|
+
def __getitem__(self, pos: SupportsIndex) -> TextHandler:
|
207
|
+
pass
|
208
|
+
|
209
|
+
@typing.overload
|
210
|
+
def __getitem__(self, pos: slice) -> "TextHandlers":
|
211
|
+
pass
|
212
|
+
|
213
|
+
def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[TextHandler, "TextHandlers"]:
|
162
214
|
lst = super().__getitem__(pos)
|
163
215
|
if isinstance(pos, slice):
|
164
|
-
|
165
|
-
|
166
|
-
|
216
|
+
lst = [TextHandler(s) for s in lst]
|
217
|
+
return TextHandlers(typing.cast(List[_TextHandlerType], lst))
|
218
|
+
return typing.cast(_TextHandlerType, TextHandler(lst))
|
167
219
|
|
168
220
|
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
|
169
|
-
case_sensitive: bool =
|
221
|
+
case_sensitive: bool = True) -> 'TextHandlers[TextHandler]':
|
170
222
|
"""Call the ``.re()`` method for each element in this list and return
|
171
223
|
their results flattened as TextHandlers.
|
172
224
|
|
173
225
|
:param regex: Can be either a compiled regular expression or a string.
|
174
226
|
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
175
227
|
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
176
|
-
:param case_sensitive: if
|
228
|
+
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
177
229
|
"""
|
178
230
|
results = [
|
179
231
|
n.re(regex, replace_entities, clean_match, case_sensitive) for n in self
|
180
232
|
]
|
181
|
-
return flatten(results)
|
233
|
+
return TextHandlers(flatten(results))
|
182
234
|
|
183
235
|
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
|
184
|
-
clean_match: bool = False, case_sensitive: bool =
|
236
|
+
clean_match: bool = False, case_sensitive: bool = True) -> TextHandler:
|
185
237
|
"""Call the ``.re_first()`` method for each element in this list and return
|
186
238
|
the first result or the default value otherwise.
|
187
239
|
|
@@ -189,7 +241,7 @@ class TextHandlers(List[TextHandler]):
|
|
189
241
|
:param default: The default value to be returned if there is no match
|
190
242
|
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
191
243
|
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
192
|
-
:param case_sensitive: if
|
244
|
+
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
193
245
|
"""
|
194
246
|
for n in self:
|
195
247
|
for result in n.re(regex, replace_entities, clean_match, case_sensitive):
|
@@ -210,7 +262,7 @@ class TextHandlers(List[TextHandler]):
|
|
210
262
|
get_all = extract
|
211
263
|
|
212
264
|
|
213
|
-
class AttributesHandler(Mapping):
|
265
|
+
class AttributesHandler(Mapping[str, _TextHandlerType]):
|
214
266
|
"""A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
|
215
267
|
If standard dictionary is needed, just convert this class to dictionary with `dict` function
|
216
268
|
"""
|
@@ -231,7 +283,7 @@ class AttributesHandler(Mapping):
|
|
231
283
|
# Fastest read-only mapping type
|
232
284
|
self._data = MappingProxyType(mapping)
|
233
285
|
|
234
|
-
def get(self, key, default=None):
|
286
|
+
def get(self, key: str, default: Optional[str] = None) -> Union[_TextHandlerType, None]:
|
235
287
|
"""Acts like standard dictionary `.get()` method"""
|
236
288
|
return self._data.get(key, default)
|
237
289
|
|
@@ -253,7 +305,7 @@ class AttributesHandler(Mapping):
|
|
253
305
|
"""Convert current attributes to JSON string if the attributes are JSON serializable otherwise throws error"""
|
254
306
|
return dumps(dict(self._data))
|
255
307
|
|
256
|
-
def __getitem__(self, key):
|
308
|
+
def __getitem__(self, key: str) -> _TextHandlerType:
|
257
309
|
return self._data[key]
|
258
310
|
|
259
311
|
def __iter__(self):
|
scrapling/core/translator.py
CHANGED
@@ -139,6 +139,6 @@ class TranslatorMixin:
|
|
139
139
|
|
140
140
|
|
141
141
|
class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
|
142
|
-
@lru_cache(maxsize=
|
142
|
+
@lru_cache(maxsize=2048)
|
143
143
|
def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
|
144
144
|
return super().css_to_xpath(css, prefix)
|
scrapling/defaults.py
CHANGED
@@ -1,7 +1,10 @@
|
|
1
|
-
from .fetchers import AsyncFetcher
|
1
|
+
from .fetchers import AsyncFetcher as _AsyncFetcher
|
2
|
+
from .fetchers import Fetcher as _Fetcher
|
3
|
+
from .fetchers import PlayWrightFetcher as _PlayWrightFetcher
|
4
|
+
from .fetchers import StealthyFetcher as _StealthyFetcher
|
2
5
|
|
3
6
|
# If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
|
4
|
-
Fetcher =
|
5
|
-
AsyncFetcher =
|
6
|
-
StealthyFetcher =
|
7
|
-
PlayWrightFetcher =
|
7
|
+
Fetcher = _Fetcher()
|
8
|
+
AsyncFetcher = _AsyncFetcher()
|
9
|
+
StealthyFetcher = _StealthyFetcher()
|
10
|
+
PlayWrightFetcher = _PlayWrightFetcher()
|
scrapling/engines/camo.py
CHANGED
@@ -19,7 +19,7 @@ class CamoufoxEngine:
|
|
19
19
|
block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
|
20
20
|
timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
|
21
21
|
wait_selector_state: Optional[SelectorWaitStates] = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
|
22
|
-
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] =
|
22
|
+
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = False,
|
23
23
|
geoip: Optional[bool] = False,
|
24
24
|
adaptor_arguments: Dict = None,
|
25
25
|
):
|
@@ -36,7 +36,7 @@ class CamoufoxEngine:
|
|
36
36
|
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
37
37
|
:param allow_webgl: Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled.
|
38
38
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
39
|
-
:param disable_ads:
|
39
|
+
:param disable_ads: Disabled by default, this installs `uBlock Origin` addon on the browser if enabled.
|
40
40
|
:param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
|
41
41
|
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
|
42
42
|
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
|
@@ -95,6 +95,7 @@ class CamoufoxEngine:
|
|
95
95
|
with Camoufox(
|
96
96
|
geoip=self.geoip,
|
97
97
|
proxy=self.proxy,
|
98
|
+
enable_cache=True,
|
98
99
|
addons=self.addons,
|
99
100
|
exclude_addons=addons,
|
100
101
|
headless=self.headless,
|
@@ -140,6 +141,26 @@ class CamoufoxEngine:
|
|
140
141
|
# PlayWright API sometimes give empty status text for some reason!
|
141
142
|
status_text = final_response.status_text or StatusText.get(final_response.status)
|
142
143
|
|
144
|
+
history = []
|
145
|
+
current_request = first_response.request.redirected_from
|
146
|
+
while current_request:
|
147
|
+
current_response = current_request.response()
|
148
|
+
|
149
|
+
history.insert(0, Response(
|
150
|
+
url=current_request.url,
|
151
|
+
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
152
|
+
text='',
|
153
|
+
body=b'',
|
154
|
+
status=current_response.status if current_response else 301,
|
155
|
+
reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
|
156
|
+
encoding=current_response.headers.get('content-type', '') or 'utf-8',
|
157
|
+
cookies={},
|
158
|
+
headers=current_response.all_headers() if current_response else {},
|
159
|
+
request_headers=current_request.all_headers(),
|
160
|
+
**self.adaptor_arguments
|
161
|
+
))
|
162
|
+
current_request = current_request.redirected_from
|
163
|
+
|
143
164
|
response = Response(
|
144
165
|
url=page.url,
|
145
166
|
text=page.content(),
|
@@ -150,6 +171,7 @@ class CamoufoxEngine:
|
|
150
171
|
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
|
151
172
|
headers=first_response.all_headers(),
|
152
173
|
request_headers=first_response.request.all_headers(),
|
174
|
+
history=history,
|
153
175
|
**self.adaptor_arguments
|
154
176
|
)
|
155
177
|
page.close()
|
@@ -174,6 +196,7 @@ class CamoufoxEngine:
|
|
174
196
|
async with AsyncCamoufox(
|
175
197
|
geoip=self.geoip,
|
176
198
|
proxy=self.proxy,
|
199
|
+
enable_cache=True,
|
177
200
|
addons=self.addons,
|
178
201
|
exclude_addons=addons,
|
179
202
|
headless=self.headless,
|
@@ -219,6 +242,26 @@ class CamoufoxEngine:
|
|
219
242
|
# PlayWright API sometimes give empty status text for some reason!
|
220
243
|
status_text = final_response.status_text or StatusText.get(final_response.status)
|
221
244
|
|
245
|
+
history = []
|
246
|
+
current_request = first_response.request.redirected_from
|
247
|
+
while current_request:
|
248
|
+
current_response = await current_request.response()
|
249
|
+
|
250
|
+
history.insert(0, Response(
|
251
|
+
url=current_request.url,
|
252
|
+
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
253
|
+
text='',
|
254
|
+
body=b'',
|
255
|
+
status=current_response.status if current_response else 301,
|
256
|
+
reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
|
257
|
+
encoding=current_response.headers.get('content-type', '') or 'utf-8',
|
258
|
+
cookies={},
|
259
|
+
headers=await current_response.all_headers() if current_response else {},
|
260
|
+
request_headers=await current_request.all_headers(),
|
261
|
+
**self.adaptor_arguments
|
262
|
+
))
|
263
|
+
current_request = current_request.redirected_from
|
264
|
+
|
222
265
|
response = Response(
|
223
266
|
url=page.url,
|
224
267
|
text=await page.content(),
|
@@ -229,6 +272,7 @@ class CamoufoxEngine:
|
|
229
272
|
cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
|
230
273
|
headers=await first_response.all_headers(),
|
231
274
|
request_headers=await first_response.request.all_headers(),
|
275
|
+
history=history,
|
232
276
|
**self.adaptor_arguments
|
233
277
|
)
|
234
278
|
await page.close()
|
scrapling/engines/pw.py
CHANGED
@@ -105,7 +105,7 @@ class PlaywrightEngine:
|
|
105
105
|
"""
|
106
106
|
cdp_url = self.cdp_url
|
107
107
|
if self.nstbrowser_mode:
|
108
|
-
if self.nstbrowser_config and
|
108
|
+
if self.nstbrowser_config and isinstance(self.nstbrowser_config, dict):
|
109
109
|
config = self.nstbrowser_config
|
110
110
|
else:
|
111
111
|
query = NSTBROWSER_DEFAULT_QUERY.copy()
|
@@ -259,6 +259,26 @@ class PlaywrightEngine:
|
|
259
259
|
# PlayWright API sometimes give empty status text for some reason!
|
260
260
|
status_text = final_response.status_text or StatusText.get(final_response.status)
|
261
261
|
|
262
|
+
history = []
|
263
|
+
current_request = first_response.request.redirected_from
|
264
|
+
while current_request:
|
265
|
+
current_response = current_request.response()
|
266
|
+
|
267
|
+
history.insert(0, Response(
|
268
|
+
url=current_request.url,
|
269
|
+
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
270
|
+
text='',
|
271
|
+
body=b'',
|
272
|
+
status=current_response.status if current_response else 301,
|
273
|
+
reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
|
274
|
+
encoding=current_response.headers.get('content-type', '') or 'utf-8',
|
275
|
+
cookies={},
|
276
|
+
headers=current_response.all_headers() if current_response else {},
|
277
|
+
request_headers=current_request.all_headers(),
|
278
|
+
**self.adaptor_arguments
|
279
|
+
))
|
280
|
+
current_request = current_request.redirected_from
|
281
|
+
|
262
282
|
response = Response(
|
263
283
|
url=page.url,
|
264
284
|
text=page.content(),
|
@@ -269,6 +289,7 @@ class PlaywrightEngine:
|
|
269
289
|
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
|
270
290
|
headers=first_response.all_headers(),
|
271
291
|
request_headers=first_response.request.all_headers(),
|
292
|
+
history=history,
|
272
293
|
**self.adaptor_arguments
|
273
294
|
)
|
274
295
|
page.close()
|
@@ -345,6 +366,26 @@ class PlaywrightEngine:
|
|
345
366
|
# PlayWright API sometimes give empty status text for some reason!
|
346
367
|
status_text = final_response.status_text or StatusText.get(final_response.status)
|
347
368
|
|
369
|
+
history = []
|
370
|
+
current_request = first_response.request.redirected_from
|
371
|
+
while current_request:
|
372
|
+
current_response = await current_request.response()
|
373
|
+
|
374
|
+
history.insert(0, Response(
|
375
|
+
url=current_request.url,
|
376
|
+
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
377
|
+
text='',
|
378
|
+
body=b'',
|
379
|
+
status=current_response.status if current_response else 301,
|
380
|
+
reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
|
381
|
+
encoding=current_response.headers.get('content-type', '') or 'utf-8',
|
382
|
+
cookies={},
|
383
|
+
headers=await current_response.all_headers() if current_response else {},
|
384
|
+
request_headers=await current_request.all_headers(),
|
385
|
+
**self.adaptor_arguments
|
386
|
+
))
|
387
|
+
current_request = current_request.redirected_from
|
388
|
+
|
348
389
|
response = Response(
|
349
390
|
url=page.url,
|
350
391
|
text=await page.content(),
|
@@ -355,6 +396,7 @@ class PlaywrightEngine:
|
|
355
396
|
cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
|
356
397
|
headers=await first_response.all_headers(),
|
357
398
|
request_headers=await first_response.request.all_headers(),
|
399
|
+
history=history,
|
358
400
|
**self.adaptor_arguments
|
359
401
|
)
|
360
402
|
await page.close()
|
scrapling/engines/static.py
CHANGED
@@ -72,6 +72,7 @@ class StaticEngine:
|
|
72
72
|
headers=dict(response.headers),
|
73
73
|
request_headers=dict(response.request.headers),
|
74
74
|
method=response.request.method,
|
75
|
+
history=[self._prepare_response(redirection) for redirection in response.history],
|
75
76
|
**self.adaptor_arguments
|
76
77
|
)
|
77
78
|
|
@@ -85,13 +85,14 @@ class Response(Adaptor):
|
|
85
85
|
"""This class is returned by all engines as a way to unify response type between different libraries."""
|
86
86
|
|
87
87
|
def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict,
|
88
|
-
encoding: str = 'utf-8', method: str = 'GET', **adaptor_arguments: Dict):
|
88
|
+
encoding: str = 'utf-8', method: str = 'GET', history: List = None, **adaptor_arguments: Dict):
|
89
89
|
automatch_domain = adaptor_arguments.pop('automatch_domain', None)
|
90
90
|
self.status = status
|
91
91
|
self.reason = reason
|
92
92
|
self.cookies = cookies
|
93
93
|
self.headers = headers
|
94
94
|
self.request_headers = request_headers
|
95
|
+
self.history = history or []
|
95
96
|
encoding = ResponseEncoding.get_value(encoding, text)
|
96
97
|
super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
|
97
98
|
# For back-ward compatibility
|
scrapling/fetchers.py
CHANGED
@@ -143,7 +143,7 @@ class AsyncFetcher(Fetcher):
|
|
143
143
|
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
144
144
|
"""
|
145
145
|
adaptor_arguments = tuple(self.adaptor_arguments.items())
|
146
|
-
response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).
|
146
|
+
response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_put(**kwargs)
|
147
147
|
return response_object
|
148
148
|
|
149
149
|
async def delete(
|
@@ -177,7 +177,7 @@ class StealthyFetcher(BaseFetcher):
|
|
177
177
|
block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
|
178
178
|
timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
|
179
179
|
wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
|
180
|
-
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] =
|
180
|
+
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = False, geoip: Optional[bool] = False,
|
181
181
|
) -> Response:
|
182
182
|
"""
|
183
183
|
Opens up a browser and do your request based on your chosen options below.
|
@@ -191,7 +191,7 @@ class StealthyFetcher(BaseFetcher):
|
|
191
191
|
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
192
192
|
:param block_webrtc: Blocks WebRTC entirely.
|
193
193
|
:param addons: List of Firefox addons to use. Must be paths to extracted addons.
|
194
|
-
:param disable_ads:
|
194
|
+
:param disable_ads: Disabled by default, this installs `uBlock Origin` addon on the browser if enabled.
|
195
195
|
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
196
196
|
:param allow_webgl: Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled.
|
197
197
|
:param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address.
|
@@ -235,7 +235,7 @@ class StealthyFetcher(BaseFetcher):
|
|
235
235
|
block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
|
236
236
|
timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
|
237
237
|
wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
|
238
|
-
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] =
|
238
|
+
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = False, geoip: Optional[bool] = False,
|
239
239
|
) -> Response:
|
240
240
|
"""
|
241
241
|
Opens up a browser and do your request based on your chosen options below.
|
@@ -249,7 +249,7 @@ class StealthyFetcher(BaseFetcher):
|
|
249
249
|
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
250
250
|
:param block_webrtc: Blocks WebRTC entirely.
|
251
251
|
:param addons: List of Firefox addons to use. Must be paths to extracted addons.
|
252
|
-
:param disable_ads:
|
252
|
+
:param disable_ads: Disabled by default, this installs `uBlock Origin` addon on the browser if enabled.
|
253
253
|
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
254
254
|
:param allow_webgl: Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled.
|
255
255
|
:param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address.
|