scrapling 0.2.92__py3-none-any.whl → 0.2.94__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- scrapling/__init__.py +1 -1
- scrapling/core/_types.py +2 -1
- scrapling/core/custom_types.py +97 -45
- scrapling/core/translator.py +1 -1
- scrapling/defaults.py +8 -5
- scrapling/engines/camo.py +46 -2
- scrapling/engines/pw.py +43 -1
- scrapling/engines/static.py +1 -0
- scrapling/engines/toolbelt/custom.py +2 -1
- scrapling/fetchers.py +5 -5
- scrapling/parser.py +158 -194
- {scrapling-0.2.92.dist-info → scrapling-0.2.94.dist-info}/METADATA +59 -33
- {scrapling-0.2.92.dist-info → scrapling-0.2.94.dist-info}/RECORD +19 -19
- {scrapling-0.2.92.dist-info → scrapling-0.2.94.dist-info}/WHEEL +1 -1
- tests/fetchers/async/test_playwright.py +1 -1
- tests/fetchers/sync/test_playwright.py +1 -1
- {scrapling-0.2.92.dist-info → scrapling-0.2.94.dist-info}/LICENSE +0 -0
- {scrapling-0.2.92.dist-info → scrapling-0.2.94.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.92.dist-info → scrapling-0.2.94.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
|
|
5
5
|
from scrapling.parser import Adaptor, Adaptors
|
6
6
|
|
7
7
|
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
8
|
-
__version__ = "0.2.92"
|
8
|
+
__version__ = "0.2.94"
|
9
9
|
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
10
10
|
|
11
11
|
|
scrapling/core/_types.py
CHANGED
@@ -3,7 +3,8 @@ Type definitions for type checking purposes.
|
|
3
3
|
"""
|
4
4
|
|
5
5
|
from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
|
6
|
-
List, Literal, Optional, Pattern, Tuple, Type, Union)
|
6
|
+
List, Literal, Optional, Pattern, Tuple, Type, TypeVar,
|
7
|
+
Union)
|
7
8
|
|
8
9
|
SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
|
9
10
|
|
scrapling/core/custom_types.py
CHANGED
@@ -1,13 +1,18 @@
|
|
1
1
|
import re
|
2
|
+
import typing
|
2
3
|
from collections.abc import Mapping
|
3
4
|
from types import MappingProxyType
|
4
5
|
|
5
6
|
from orjson import dumps, loads
|
6
7
|
from w3lib.html import replace_entities as _replace_entities
|
7
8
|
|
8
|
-
from scrapling.core._types import Dict, List,
|
9
|
+
from scrapling.core._types import (Dict, Iterable, List, Literal, Optional,
|
10
|
+
Pattern, SupportsIndex, TypeVar, Union)
|
9
11
|
from scrapling.core.utils import _is_iterable, flatten
|
10
12
|
|
13
|
+
# Define type variable for AttributeHandler value type
|
14
|
+
_TextHandlerType = TypeVar('_TextHandlerType', bound='TextHandler')
|
15
|
+
|
11
16
|
|
12
17
|
class TextHandler(str):
|
13
18
|
"""Extends standard Python string by adding more functionality"""
|
@@ -18,72 +23,89 @@ class TextHandler(str):
|
|
18
23
|
return super().__new__(cls, string)
|
19
24
|
return super().__new__(cls, '')
|
20
25
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
26
|
+
@typing.overload
|
27
|
+
def __getitem__(self, key: SupportsIndex) -> 'TextHandler':
|
28
|
+
pass
|
29
|
+
|
30
|
+
@typing.overload
|
31
|
+
def __getitem__(self, key: slice) -> "TextHandlers":
|
32
|
+
pass
|
33
|
+
|
34
|
+
def __getitem__(self, key: Union[SupportsIndex, slice]) -> Union["TextHandler", "TextHandlers"]:
|
35
|
+
lst = super().__getitem__(key)
|
36
|
+
if isinstance(key, slice):
|
37
|
+
lst = [TextHandler(s) for s in lst]
|
38
|
+
return TextHandlers(typing.cast(List[_TextHandlerType], lst))
|
39
|
+
return typing.cast(_TextHandlerType, TextHandler(lst))
|
40
|
+
|
41
|
+
def split(self, sep: str = None, maxsplit: SupportsIndex = -1) -> 'TextHandlers':
|
42
|
+
return TextHandlers(
|
43
|
+
typing.cast(List[_TextHandlerType], [TextHandler(s) for s in super().split(sep, maxsplit)])
|
44
|
+
)
|
45
|
+
|
46
|
+
def strip(self, chars: str = None) -> Union[str, 'TextHandler']:
|
25
47
|
return TextHandler(super().strip(chars))
|
26
48
|
|
27
|
-
def lstrip(self, chars=None):
|
49
|
+
def lstrip(self, chars: str = None) -> Union[str, 'TextHandler']:
|
28
50
|
return TextHandler(super().lstrip(chars))
|
29
51
|
|
30
|
-
def rstrip(self, chars=None):
|
52
|
+
def rstrip(self, chars: str = None) -> Union[str, 'TextHandler']:
|
31
53
|
return TextHandler(super().rstrip(chars))
|
32
54
|
|
33
|
-
def capitalize(self):
|
55
|
+
def capitalize(self) -> Union[str, 'TextHandler']:
|
34
56
|
return TextHandler(super().capitalize())
|
35
57
|
|
36
|
-
def casefold(self):
|
58
|
+
def casefold(self) -> Union[str, 'TextHandler']:
|
37
59
|
return TextHandler(super().casefold())
|
38
60
|
|
39
|
-
def center(self, width, fillchar=' '):
|
61
|
+
def center(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
|
40
62
|
return TextHandler(super().center(width, fillchar))
|
41
63
|
|
42
|
-
def expandtabs(self, tabsize=8):
|
64
|
+
def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, 'TextHandler']:
|
43
65
|
return TextHandler(super().expandtabs(tabsize))
|
44
66
|
|
45
|
-
def format(self, *args, **kwargs):
|
67
|
+
def format(self, *args: str, **kwargs: str) -> Union[str, 'TextHandler']:
|
46
68
|
return TextHandler(super().format(*args, **kwargs))
|
47
69
|
|
48
|
-
def format_map(self, mapping):
|
70
|
+
def format_map(self, mapping) -> Union[str, 'TextHandler']:
|
49
71
|
return TextHandler(super().format_map(mapping))
|
50
72
|
|
51
|
-
def join(self, iterable):
|
73
|
+
def join(self, iterable: Iterable[str]) -> Union[str, 'TextHandler']:
|
52
74
|
return TextHandler(super().join(iterable))
|
53
75
|
|
54
|
-
def ljust(self, width, fillchar=' '):
|
76
|
+
def ljust(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
|
55
77
|
return TextHandler(super().ljust(width, fillchar))
|
56
78
|
|
57
|
-
def rjust(self, width, fillchar=' '):
|
79
|
+
def rjust(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
|
58
80
|
return TextHandler(super().rjust(width, fillchar))
|
59
81
|
|
60
|
-
def swapcase(self):
|
82
|
+
def swapcase(self) -> Union[str, 'TextHandler']:
|
61
83
|
return TextHandler(super().swapcase())
|
62
84
|
|
63
|
-
def title(self):
|
85
|
+
def title(self) -> Union[str, 'TextHandler']:
|
64
86
|
return TextHandler(super().title())
|
65
87
|
|
66
|
-
def translate(self, table):
|
88
|
+
def translate(self, table) -> Union[str, 'TextHandler']:
|
67
89
|
return TextHandler(super().translate(table))
|
68
90
|
|
69
|
-
def zfill(self, width):
|
91
|
+
def zfill(self, width: SupportsIndex) -> Union[str, 'TextHandler']:
|
70
92
|
return TextHandler(super().zfill(width))
|
71
93
|
|
72
|
-
def replace(self, old, new, count
|
94
|
+
def replace(self, old: str, new: str, count: SupportsIndex = -1) -> Union[str, 'TextHandler']:
|
73
95
|
return TextHandler(super().replace(old, new, count))
|
74
96
|
|
75
|
-
def upper(self):
|
97
|
+
def upper(self) -> Union[str, 'TextHandler']:
|
76
98
|
return TextHandler(super().upper())
|
77
99
|
|
78
|
-
def lower(self):
|
100
|
+
def lower(self) -> Union[str, 'TextHandler']:
|
79
101
|
return TextHandler(super().lower())
|
80
102
|
##############
|
81
103
|
|
82
|
-
def sort(self, reverse: bool = False) -> str:
|
104
|
+
def sort(self, reverse: bool = False) -> Union[str, 'TextHandler']:
|
83
105
|
"""Return a sorted version of the string"""
|
84
106
|
return self.__class__("".join(sorted(self, reverse=reverse)))
|
85
107
|
|
86
|
-
def clean(self) -> str:
|
108
|
+
def clean(self) -> Union[str, 'TextHandler']:
|
87
109
|
"""Return a new version of the string after removing all white spaces and consecutive spaces"""
|
88
110
|
data = re.sub(r'[\t|\r|\n]', '', self)
|
89
111
|
data = re.sub(' +', ' ', data)
|
@@ -105,21 +127,43 @@ class TextHandler(str):
|
|
105
127
|
# Check this out: https://github.com/ijl/orjson/issues/445
|
106
128
|
return loads(str(self))
|
107
129
|
|
130
|
+
@typing.overload
|
131
|
+
def re(
|
132
|
+
self,
|
133
|
+
regex: Union[str, Pattern[str]],
|
134
|
+
check_match: Literal[True],
|
135
|
+
replace_entities: bool = True,
|
136
|
+
clean_match: bool = False,
|
137
|
+
case_sensitive: bool = True,
|
138
|
+
) -> bool:
|
139
|
+
...
|
140
|
+
|
141
|
+
@typing.overload
|
142
|
+
def re(
|
143
|
+
self,
|
144
|
+
regex: Union[str, Pattern[str]],
|
145
|
+
replace_entities: bool = True,
|
146
|
+
clean_match: bool = False,
|
147
|
+
case_sensitive: bool = True,
|
148
|
+
check_match: Literal[False] = False,
|
149
|
+
) -> "TextHandlers[TextHandler]":
|
150
|
+
...
|
151
|
+
|
108
152
|
def re(
|
109
153
|
self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
|
110
|
-
case_sensitive: bool =
|
111
|
-
) -> Union[
|
154
|
+
case_sensitive: bool = True, check_match: bool = False
|
155
|
+
) -> Union["TextHandlers[TextHandler]", bool]:
|
112
156
|
"""Apply the given regex to the current text and return a list of strings with the matches.
|
113
157
|
|
114
158
|
:param regex: Can be either a compiled regular expression or a string.
|
115
159
|
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
116
160
|
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
117
|
-
:param case_sensitive: if
|
161
|
+
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
118
162
|
:param check_match: used to quickly check if this regex matches or not without any operations on the results
|
119
163
|
|
120
164
|
"""
|
121
165
|
if isinstance(regex, str):
|
122
|
-
if
|
166
|
+
if case_sensitive:
|
123
167
|
regex = re.compile(regex, re.UNICODE)
|
124
168
|
else:
|
125
169
|
regex = re.compile(regex, flags=re.UNICODE | re.IGNORECASE)
|
@@ -133,19 +177,19 @@ class TextHandler(str):
|
|
133
177
|
results = flatten(results)
|
134
178
|
|
135
179
|
if not replace_entities:
|
136
|
-
return [TextHandler(string) for string in results]
|
180
|
+
return TextHandlers(typing.cast(List[_TextHandlerType], [TextHandler(string) for string in results]))
|
137
181
|
|
138
|
-
return [TextHandler(_replace_entities(s)) for s in results]
|
182
|
+
return TextHandlers(typing.cast(List[_TextHandlerType], [TextHandler(_replace_entities(s)) for s in results]))
|
139
183
|
|
140
184
|
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
|
141
|
-
clean_match: bool = False, case_sensitive: bool =
|
185
|
+
clean_match: bool = False, case_sensitive: bool = True) -> "TextHandler":
|
142
186
|
"""Apply the given regex to text and return the first match if found, otherwise return the default value.
|
143
187
|
|
144
188
|
:param regex: Can be either a compiled regular expression or a string.
|
145
189
|
:param default: The default value to be returned if there is no match
|
146
190
|
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
147
191
|
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
148
|
-
:param case_sensitive: if
|
192
|
+
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
149
193
|
|
150
194
|
"""
|
151
195
|
result = self.re(regex, replace_entities, clean_match=clean_match, case_sensitive=case_sensitive)
|
@@ -158,30 +202,38 @@ class TextHandlers(List[TextHandler]):
|
|
158
202
|
"""
|
159
203
|
__slots__ = ()
|
160
204
|
|
161
|
-
|
205
|
+
@typing.overload
|
206
|
+
def __getitem__(self, pos: SupportsIndex) -> TextHandler:
|
207
|
+
pass
|
208
|
+
|
209
|
+
@typing.overload
|
210
|
+
def __getitem__(self, pos: slice) -> "TextHandlers":
|
211
|
+
pass
|
212
|
+
|
213
|
+
def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[TextHandler, "TextHandlers"]:
|
162
214
|
lst = super().__getitem__(pos)
|
163
215
|
if isinstance(pos, slice):
|
164
|
-
|
165
|
-
|
166
|
-
|
216
|
+
lst = [TextHandler(s) for s in lst]
|
217
|
+
return TextHandlers(typing.cast(List[_TextHandlerType], lst))
|
218
|
+
return typing.cast(_TextHandlerType, TextHandler(lst))
|
167
219
|
|
168
220
|
def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
|
169
|
-
case_sensitive: bool =
|
221
|
+
case_sensitive: bool = True) -> 'TextHandlers[TextHandler]':
|
170
222
|
"""Call the ``.re()`` method for each element in this list and return
|
171
223
|
their results flattened as TextHandlers.
|
172
224
|
|
173
225
|
:param regex: Can be either a compiled regular expression or a string.
|
174
226
|
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
175
227
|
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
176
|
-
:param case_sensitive: if
|
228
|
+
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
177
229
|
"""
|
178
230
|
results = [
|
179
231
|
n.re(regex, replace_entities, clean_match, case_sensitive) for n in self
|
180
232
|
]
|
181
|
-
return flatten(results)
|
233
|
+
return TextHandlers(flatten(results))
|
182
234
|
|
183
235
|
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
|
184
|
-
clean_match: bool = False, case_sensitive: bool =
|
236
|
+
clean_match: bool = False, case_sensitive: bool = True) -> TextHandler:
|
185
237
|
"""Call the ``.re_first()`` method for each element in this list and return
|
186
238
|
the first result or the default value otherwise.
|
187
239
|
|
@@ -189,7 +241,7 @@ class TextHandlers(List[TextHandler]):
|
|
189
241
|
:param default: The default value to be returned if there is no match
|
190
242
|
:param replace_entities: if enabled character entity references are replaced by their corresponding character
|
191
243
|
:param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
|
192
|
-
:param case_sensitive: if
|
244
|
+
:param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
|
193
245
|
"""
|
194
246
|
for n in self:
|
195
247
|
for result in n.re(regex, replace_entities, clean_match, case_sensitive):
|
@@ -210,7 +262,7 @@ class TextHandlers(List[TextHandler]):
|
|
210
262
|
get_all = extract
|
211
263
|
|
212
264
|
|
213
|
-
class AttributesHandler(Mapping):
|
265
|
+
class AttributesHandler(Mapping[str, _TextHandlerType]):
|
214
266
|
"""A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
|
215
267
|
If standard dictionary is needed, just convert this class to dictionary with `dict` function
|
216
268
|
"""
|
@@ -231,7 +283,7 @@ class AttributesHandler(Mapping):
|
|
231
283
|
# Fastest read-only mapping type
|
232
284
|
self._data = MappingProxyType(mapping)
|
233
285
|
|
234
|
-
def get(self, key, default=None):
|
286
|
+
def get(self, key: str, default: Optional[str] = None) -> Union[_TextHandlerType, None]:
|
235
287
|
"""Acts like standard dictionary `.get()` method"""
|
236
288
|
return self._data.get(key, default)
|
237
289
|
|
@@ -253,7 +305,7 @@ class AttributesHandler(Mapping):
|
|
253
305
|
"""Convert current attributes to JSON string if the attributes are JSON serializable otherwise throws error"""
|
254
306
|
return dumps(dict(self._data))
|
255
307
|
|
256
|
-
def __getitem__(self, key):
|
308
|
+
def __getitem__(self, key: str) -> _TextHandlerType:
|
257
309
|
return self._data[key]
|
258
310
|
|
259
311
|
def __iter__(self):
|
scrapling/core/translator.py
CHANGED
@@ -139,6 +139,6 @@ class TranslatorMixin:
|
|
139
139
|
|
140
140
|
|
141
141
|
class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
|
142
|
-
@lru_cache(maxsize=
|
142
|
+
@lru_cache(maxsize=2048)
|
143
143
|
def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
|
144
144
|
return super().css_to_xpath(css, prefix)
|
scrapling/defaults.py
CHANGED
@@ -1,7 +1,10 @@
|
|
1
|
-
from .fetchers import AsyncFetcher
|
1
|
+
from .fetchers import AsyncFetcher as _AsyncFetcher
|
2
|
+
from .fetchers import Fetcher as _Fetcher
|
3
|
+
from .fetchers import PlayWrightFetcher as _PlayWrightFetcher
|
4
|
+
from .fetchers import StealthyFetcher as _StealthyFetcher
|
2
5
|
|
3
6
|
# If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
|
4
|
-
Fetcher =
|
5
|
-
AsyncFetcher =
|
6
|
-
StealthyFetcher =
|
7
|
-
PlayWrightFetcher =
|
7
|
+
Fetcher = _Fetcher()
|
8
|
+
AsyncFetcher = _AsyncFetcher()
|
9
|
+
StealthyFetcher = _StealthyFetcher()
|
10
|
+
PlayWrightFetcher = _PlayWrightFetcher()
|
scrapling/engines/camo.py
CHANGED
@@ -19,7 +19,7 @@ class CamoufoxEngine:
|
|
19
19
|
block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
|
20
20
|
timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
|
21
21
|
wait_selector_state: Optional[SelectorWaitStates] = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
|
22
|
-
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] =
|
22
|
+
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = False,
|
23
23
|
geoip: Optional[bool] = False,
|
24
24
|
adaptor_arguments: Dict = None,
|
25
25
|
):
|
@@ -36,7 +36,7 @@ class CamoufoxEngine:
|
|
36
36
|
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
37
37
|
:param allow_webgl: Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled.
|
38
38
|
:param network_idle: Wait for the page until there are no network connections for at least 500 ms.
|
39
|
-
:param disable_ads:
|
39
|
+
:param disable_ads: Disabled by default, this installs `uBlock Origin` addon on the browser if enabled.
|
40
40
|
:param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
|
41
41
|
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
|
42
42
|
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
|
@@ -95,6 +95,7 @@ class CamoufoxEngine:
|
|
95
95
|
with Camoufox(
|
96
96
|
geoip=self.geoip,
|
97
97
|
proxy=self.proxy,
|
98
|
+
enable_cache=True,
|
98
99
|
addons=self.addons,
|
99
100
|
exclude_addons=addons,
|
100
101
|
headless=self.headless,
|
@@ -140,6 +141,26 @@ class CamoufoxEngine:
|
|
140
141
|
# PlayWright API sometimes give empty status text for some reason!
|
141
142
|
status_text = final_response.status_text or StatusText.get(final_response.status)
|
142
143
|
|
144
|
+
history = []
|
145
|
+
current_request = first_response.request.redirected_from
|
146
|
+
while current_request:
|
147
|
+
current_response = current_request.response()
|
148
|
+
|
149
|
+
history.insert(0, Response(
|
150
|
+
url=current_request.url,
|
151
|
+
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
152
|
+
text='',
|
153
|
+
body=b'',
|
154
|
+
status=current_response.status if current_response else 301,
|
155
|
+
reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
|
156
|
+
encoding=current_response.headers.get('content-type', '') or 'utf-8',
|
157
|
+
cookies={},
|
158
|
+
headers=current_response.all_headers() if current_response else {},
|
159
|
+
request_headers=current_request.all_headers(),
|
160
|
+
**self.adaptor_arguments
|
161
|
+
))
|
162
|
+
current_request = current_request.redirected_from
|
163
|
+
|
143
164
|
response = Response(
|
144
165
|
url=page.url,
|
145
166
|
text=page.content(),
|
@@ -150,6 +171,7 @@ class CamoufoxEngine:
|
|
150
171
|
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
|
151
172
|
headers=first_response.all_headers(),
|
152
173
|
request_headers=first_response.request.all_headers(),
|
174
|
+
history=history,
|
153
175
|
**self.adaptor_arguments
|
154
176
|
)
|
155
177
|
page.close()
|
@@ -174,6 +196,7 @@ class CamoufoxEngine:
|
|
174
196
|
async with AsyncCamoufox(
|
175
197
|
geoip=self.geoip,
|
176
198
|
proxy=self.proxy,
|
199
|
+
enable_cache=True,
|
177
200
|
addons=self.addons,
|
178
201
|
exclude_addons=addons,
|
179
202
|
headless=self.headless,
|
@@ -219,6 +242,26 @@ class CamoufoxEngine:
|
|
219
242
|
# PlayWright API sometimes give empty status text for some reason!
|
220
243
|
status_text = final_response.status_text or StatusText.get(final_response.status)
|
221
244
|
|
245
|
+
history = []
|
246
|
+
current_request = first_response.request.redirected_from
|
247
|
+
while current_request:
|
248
|
+
current_response = await current_request.response()
|
249
|
+
|
250
|
+
history.insert(0, Response(
|
251
|
+
url=current_request.url,
|
252
|
+
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
253
|
+
text='',
|
254
|
+
body=b'',
|
255
|
+
status=current_response.status if current_response else 301,
|
256
|
+
reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
|
257
|
+
encoding=current_response.headers.get('content-type', '') or 'utf-8',
|
258
|
+
cookies={},
|
259
|
+
headers=await current_response.all_headers() if current_response else {},
|
260
|
+
request_headers=await current_request.all_headers(),
|
261
|
+
**self.adaptor_arguments
|
262
|
+
))
|
263
|
+
current_request = current_request.redirected_from
|
264
|
+
|
222
265
|
response = Response(
|
223
266
|
url=page.url,
|
224
267
|
text=await page.content(),
|
@@ -229,6 +272,7 @@ class CamoufoxEngine:
|
|
229
272
|
cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
|
230
273
|
headers=await first_response.all_headers(),
|
231
274
|
request_headers=await first_response.request.all_headers(),
|
275
|
+
history=history,
|
232
276
|
**self.adaptor_arguments
|
233
277
|
)
|
234
278
|
await page.close()
|
scrapling/engines/pw.py
CHANGED
@@ -105,7 +105,7 @@ class PlaywrightEngine:
|
|
105
105
|
"""
|
106
106
|
cdp_url = self.cdp_url
|
107
107
|
if self.nstbrowser_mode:
|
108
|
-
if self.nstbrowser_config and
|
108
|
+
if self.nstbrowser_config and isinstance(self.nstbrowser_config, dict):
|
109
109
|
config = self.nstbrowser_config
|
110
110
|
else:
|
111
111
|
query = NSTBROWSER_DEFAULT_QUERY.copy()
|
@@ -259,6 +259,26 @@ class PlaywrightEngine:
|
|
259
259
|
# PlayWright API sometimes give empty status text for some reason!
|
260
260
|
status_text = final_response.status_text or StatusText.get(final_response.status)
|
261
261
|
|
262
|
+
history = []
|
263
|
+
current_request = first_response.request.redirected_from
|
264
|
+
while current_request:
|
265
|
+
current_response = current_request.response()
|
266
|
+
|
267
|
+
history.insert(0, Response(
|
268
|
+
url=current_request.url,
|
269
|
+
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
270
|
+
text='',
|
271
|
+
body=b'',
|
272
|
+
status=current_response.status if current_response else 301,
|
273
|
+
reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
|
274
|
+
encoding=current_response.headers.get('content-type', '') or 'utf-8',
|
275
|
+
cookies={},
|
276
|
+
headers=current_response.all_headers() if current_response else {},
|
277
|
+
request_headers=current_request.all_headers(),
|
278
|
+
**self.adaptor_arguments
|
279
|
+
))
|
280
|
+
current_request = current_request.redirected_from
|
281
|
+
|
262
282
|
response = Response(
|
263
283
|
url=page.url,
|
264
284
|
text=page.content(),
|
@@ -269,6 +289,7 @@ class PlaywrightEngine:
|
|
269
289
|
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
|
270
290
|
headers=first_response.all_headers(),
|
271
291
|
request_headers=first_response.request.all_headers(),
|
292
|
+
history=history,
|
272
293
|
**self.adaptor_arguments
|
273
294
|
)
|
274
295
|
page.close()
|
@@ -345,6 +366,26 @@ class PlaywrightEngine:
|
|
345
366
|
# PlayWright API sometimes give empty status text for some reason!
|
346
367
|
status_text = final_response.status_text or StatusText.get(final_response.status)
|
347
368
|
|
369
|
+
history = []
|
370
|
+
current_request = first_response.request.redirected_from
|
371
|
+
while current_request:
|
372
|
+
current_response = await current_request.response()
|
373
|
+
|
374
|
+
history.insert(0, Response(
|
375
|
+
url=current_request.url,
|
376
|
+
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
377
|
+
text='',
|
378
|
+
body=b'',
|
379
|
+
status=current_response.status if current_response else 301,
|
380
|
+
reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
|
381
|
+
encoding=current_response.headers.get('content-type', '') or 'utf-8',
|
382
|
+
cookies={},
|
383
|
+
headers=await current_response.all_headers() if current_response else {},
|
384
|
+
request_headers=await current_request.all_headers(),
|
385
|
+
**self.adaptor_arguments
|
386
|
+
))
|
387
|
+
current_request = current_request.redirected_from
|
388
|
+
|
348
389
|
response = Response(
|
349
390
|
url=page.url,
|
350
391
|
text=await page.content(),
|
@@ -355,6 +396,7 @@ class PlaywrightEngine:
|
|
355
396
|
cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
|
356
397
|
headers=await first_response.all_headers(),
|
357
398
|
request_headers=await first_response.request.all_headers(),
|
399
|
+
history=history,
|
358
400
|
**self.adaptor_arguments
|
359
401
|
)
|
360
402
|
await page.close()
|
scrapling/engines/static.py
CHANGED
@@ -72,6 +72,7 @@ class StaticEngine:
|
|
72
72
|
headers=dict(response.headers),
|
73
73
|
request_headers=dict(response.request.headers),
|
74
74
|
method=response.request.method,
|
75
|
+
history=[self._prepare_response(redirection) for redirection in response.history],
|
75
76
|
**self.adaptor_arguments
|
76
77
|
)
|
77
78
|
|
@@ -85,13 +85,14 @@ class Response(Adaptor):
|
|
85
85
|
"""This class is returned by all engines as a way to unify response type between different libraries."""
|
86
86
|
|
87
87
|
def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict,
|
88
|
-
encoding: str = 'utf-8', method: str = 'GET', **adaptor_arguments: Dict):
|
88
|
+
encoding: str = 'utf-8', method: str = 'GET', history: List = None, **adaptor_arguments: Dict):
|
89
89
|
automatch_domain = adaptor_arguments.pop('automatch_domain', None)
|
90
90
|
self.status = status
|
91
91
|
self.reason = reason
|
92
92
|
self.cookies = cookies
|
93
93
|
self.headers = headers
|
94
94
|
self.request_headers = request_headers
|
95
|
+
self.history = history or []
|
95
96
|
encoding = ResponseEncoding.get_value(encoding, text)
|
96
97
|
super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
|
97
98
|
# For back-ward compatibility
|
scrapling/fetchers.py
CHANGED
@@ -143,7 +143,7 @@ class AsyncFetcher(Fetcher):
|
|
143
143
|
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
144
144
|
"""
|
145
145
|
adaptor_arguments = tuple(self.adaptor_arguments.items())
|
146
|
-
response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).
|
146
|
+
response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_put(**kwargs)
|
147
147
|
return response_object
|
148
148
|
|
149
149
|
async def delete(
|
@@ -177,7 +177,7 @@ class StealthyFetcher(BaseFetcher):
|
|
177
177
|
block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
|
178
178
|
timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
|
179
179
|
wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
|
180
|
-
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] =
|
180
|
+
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = False, geoip: Optional[bool] = False,
|
181
181
|
) -> Response:
|
182
182
|
"""
|
183
183
|
Opens up a browser and do your request based on your chosen options below.
|
@@ -191,7 +191,7 @@ class StealthyFetcher(BaseFetcher):
|
|
191
191
|
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
192
192
|
:param block_webrtc: Blocks WebRTC entirely.
|
193
193
|
:param addons: List of Firefox addons to use. Must be paths to extracted addons.
|
194
|
-
:param disable_ads:
|
194
|
+
:param disable_ads: Disabled by default, this installs `uBlock Origin` addon on the browser if enabled.
|
195
195
|
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
196
196
|
:param allow_webgl: Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled.
|
197
197
|
:param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address.
|
@@ -235,7 +235,7 @@ class StealthyFetcher(BaseFetcher):
|
|
235
235
|
block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
|
236
236
|
timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
|
237
237
|
wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
|
238
|
-
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] =
|
238
|
+
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = False, geoip: Optional[bool] = False,
|
239
239
|
) -> Response:
|
240
240
|
"""
|
241
241
|
Opens up a browser and do your request based on your chosen options below.
|
@@ -249,7 +249,7 @@ class StealthyFetcher(BaseFetcher):
|
|
249
249
|
This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
|
250
250
|
:param block_webrtc: Blocks WebRTC entirely.
|
251
251
|
:param addons: List of Firefox addons to use. Must be paths to extracted addons.
|
252
|
-
:param disable_ads:
|
252
|
+
:param disable_ads: Disabled by default, this installs `uBlock Origin` addon on the browser if enabled.
|
253
253
|
:param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
|
254
254
|
:param allow_webgl: Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled.
|
255
255
|
:param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address.
|