scrapling 0.2.92__py3-none-any.whl → 0.2.94__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
scrapling/__init__.py CHANGED
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
5
5
  from scrapling.parser import Adaptor, Adaptors
6
6
 
7
7
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
8
- __version__ = "0.2.92"
8
+ __version__ = "0.2.94"
9
9
  __copyright__ = "Copyright (c) 2024 Karim Shoair"
10
10
 
11
11
 
scrapling/core/_types.py CHANGED
@@ -3,7 +3,8 @@ Type definitions for type checking purposes.
3
3
  """
4
4
 
5
5
  from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
6
- List, Literal, Optional, Pattern, Tuple, Type, Union)
6
+ List, Literal, Optional, Pattern, Tuple, Type, TypeVar,
7
+ Union)
7
8
 
8
9
  SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
9
10
 
@@ -1,13 +1,18 @@
1
1
  import re
2
+ import typing
2
3
  from collections.abc import Mapping
3
4
  from types import MappingProxyType
4
5
 
5
6
  from orjson import dumps, loads
6
7
  from w3lib.html import replace_entities as _replace_entities
7
8
 
8
- from scrapling.core._types import Dict, List, Pattern, SupportsIndex, Union
9
+ from scrapling.core._types import (Dict, Iterable, List, Literal, Optional,
10
+ Pattern, SupportsIndex, TypeVar, Union)
9
11
  from scrapling.core.utils import _is_iterable, flatten
10
12
 
13
+ # Define type variable for AttributeHandler value type
14
+ _TextHandlerType = TypeVar('_TextHandlerType', bound='TextHandler')
15
+
11
16
 
12
17
  class TextHandler(str):
13
18
  """Extends standard Python string by adding more functionality"""
@@ -18,72 +23,89 @@ class TextHandler(str):
18
23
  return super().__new__(cls, string)
19
24
  return super().__new__(cls, '')
20
25
 
21
- # Make methods from original `str` class return `TextHandler` instead of returning `str` again
22
- # Of course, this stupid workaround is only so we can keep the auto-completion working without issues in your IDE
23
- # and I made sonnet write it for me :)
24
- def strip(self, chars=None):
26
+ @typing.overload
27
+ def __getitem__(self, key: SupportsIndex) -> 'TextHandler':
28
+ pass
29
+
30
+ @typing.overload
31
+ def __getitem__(self, key: slice) -> "TextHandlers":
32
+ pass
33
+
34
+ def __getitem__(self, key: Union[SupportsIndex, slice]) -> Union["TextHandler", "TextHandlers"]:
35
+ lst = super().__getitem__(key)
36
+ if isinstance(key, slice):
37
+ lst = [TextHandler(s) for s in lst]
38
+ return TextHandlers(typing.cast(List[_TextHandlerType], lst))
39
+ return typing.cast(_TextHandlerType, TextHandler(lst))
40
+
41
+ def split(self, sep: str = None, maxsplit: SupportsIndex = -1) -> 'TextHandlers':
42
+ return TextHandlers(
43
+ typing.cast(List[_TextHandlerType], [TextHandler(s) for s in super().split(sep, maxsplit)])
44
+ )
45
+
46
+ def strip(self, chars: str = None) -> Union[str, 'TextHandler']:
25
47
  return TextHandler(super().strip(chars))
26
48
 
27
- def lstrip(self, chars=None):
49
+ def lstrip(self, chars: str = None) -> Union[str, 'TextHandler']:
28
50
  return TextHandler(super().lstrip(chars))
29
51
 
30
- def rstrip(self, chars=None):
52
+ def rstrip(self, chars: str = None) -> Union[str, 'TextHandler']:
31
53
  return TextHandler(super().rstrip(chars))
32
54
 
33
- def capitalize(self):
55
+ def capitalize(self) -> Union[str, 'TextHandler']:
34
56
  return TextHandler(super().capitalize())
35
57
 
36
- def casefold(self):
58
+ def casefold(self) -> Union[str, 'TextHandler']:
37
59
  return TextHandler(super().casefold())
38
60
 
39
- def center(self, width, fillchar=' '):
61
+ def center(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
40
62
  return TextHandler(super().center(width, fillchar))
41
63
 
42
- def expandtabs(self, tabsize=8):
64
+ def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, 'TextHandler']:
43
65
  return TextHandler(super().expandtabs(tabsize))
44
66
 
45
- def format(self, *args, **kwargs):
67
+ def format(self, *args: str, **kwargs: str) -> Union[str, 'TextHandler']:
46
68
  return TextHandler(super().format(*args, **kwargs))
47
69
 
48
- def format_map(self, mapping):
70
+ def format_map(self, mapping) -> Union[str, 'TextHandler']:
49
71
  return TextHandler(super().format_map(mapping))
50
72
 
51
- def join(self, iterable):
73
+ def join(self, iterable: Iterable[str]) -> Union[str, 'TextHandler']:
52
74
  return TextHandler(super().join(iterable))
53
75
 
54
- def ljust(self, width, fillchar=' '):
76
+ def ljust(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
55
77
  return TextHandler(super().ljust(width, fillchar))
56
78
 
57
- def rjust(self, width, fillchar=' '):
79
+ def rjust(self, width: SupportsIndex, fillchar: str = ' ') -> Union[str, 'TextHandler']:
58
80
  return TextHandler(super().rjust(width, fillchar))
59
81
 
60
- def swapcase(self):
82
+ def swapcase(self) -> Union[str, 'TextHandler']:
61
83
  return TextHandler(super().swapcase())
62
84
 
63
- def title(self):
85
+ def title(self) -> Union[str, 'TextHandler']:
64
86
  return TextHandler(super().title())
65
87
 
66
- def translate(self, table):
88
+ def translate(self, table) -> Union[str, 'TextHandler']:
67
89
  return TextHandler(super().translate(table))
68
90
 
69
- def zfill(self, width):
91
+ def zfill(self, width: SupportsIndex) -> Union[str, 'TextHandler']:
70
92
  return TextHandler(super().zfill(width))
71
93
 
72
- def replace(self, old, new, count=-1):
94
+ def replace(self, old: str, new: str, count: SupportsIndex = -1) -> Union[str, 'TextHandler']:
73
95
  return TextHandler(super().replace(old, new, count))
74
96
 
75
- def upper(self):
97
+ def upper(self) -> Union[str, 'TextHandler']:
76
98
  return TextHandler(super().upper())
77
99
 
78
- def lower(self):
100
+ def lower(self) -> Union[str, 'TextHandler']:
79
101
  return TextHandler(super().lower())
80
102
  ##############
81
103
 
82
- def sort(self, reverse: bool = False) -> str:
104
+ def sort(self, reverse: bool = False) -> Union[str, 'TextHandler']:
83
105
  """Return a sorted version of the string"""
84
106
  return self.__class__("".join(sorted(self, reverse=reverse)))
85
107
 
86
- def clean(self) -> str:
108
+ def clean(self) -> Union[str, 'TextHandler']:
87
109
  """Return a new version of the string after removing all white spaces and consecutive spaces"""
88
110
  data = re.sub(r'[\t|\r|\n]', '', self)
89
111
  data = re.sub(' +', ' ', data)
@@ -105,21 +127,43 @@ class TextHandler(str):
105
127
  # Check this out: https://github.com/ijl/orjson/issues/445
106
128
  return loads(str(self))
107
129
 
130
+ @typing.overload
131
+ def re(
132
+ self,
133
+ regex: Union[str, Pattern[str]],
134
+ check_match: Literal[True],
135
+ replace_entities: bool = True,
136
+ clean_match: bool = False,
137
+ case_sensitive: bool = True,
138
+ ) -> bool:
139
+ ...
140
+
141
+ @typing.overload
142
+ def re(
143
+ self,
144
+ regex: Union[str, Pattern[str]],
145
+ replace_entities: bool = True,
146
+ clean_match: bool = False,
147
+ case_sensitive: bool = True,
148
+ check_match: Literal[False] = False,
149
+ ) -> "TextHandlers[TextHandler]":
150
+ ...
151
+
108
152
  def re(
109
153
  self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
110
- case_sensitive: bool = False, check_match: bool = False
111
- ) -> Union[List[str], bool]:
154
+ case_sensitive: bool = True, check_match: bool = False
155
+ ) -> Union["TextHandlers[TextHandler]", bool]:
112
156
  """Apply the given regex to the current text and return a list of strings with the matches.
113
157
 
114
158
  :param regex: Can be either a compiled regular expression or a string.
115
159
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
116
160
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
117
- :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
161
+ :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
118
162
  :param check_match: used to quickly check if this regex matches or not without any operations on the results
119
163
 
120
164
  """
121
165
  if isinstance(regex, str):
122
- if not case_sensitive:
166
+ if case_sensitive:
123
167
  regex = re.compile(regex, re.UNICODE)
124
168
  else:
125
169
  regex = re.compile(regex, flags=re.UNICODE | re.IGNORECASE)
@@ -133,19 +177,19 @@ class TextHandler(str):
133
177
  results = flatten(results)
134
178
 
135
179
  if not replace_entities:
136
- return [TextHandler(string) for string in results]
180
+ return TextHandlers(typing.cast(List[_TextHandlerType], [TextHandler(string) for string in results]))
137
181
 
138
- return [TextHandler(_replace_entities(s)) for s in results]
182
+ return TextHandlers(typing.cast(List[_TextHandlerType], [TextHandler(_replace_entities(s)) for s in results]))
139
183
 
140
184
  def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
141
- clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
185
+ clean_match: bool = False, case_sensitive: bool = True) -> "TextHandler":
142
186
  """Apply the given regex to text and return the first match if found, otherwise return the default value.
143
187
 
144
188
  :param regex: Can be either a compiled regular expression or a string.
145
189
  :param default: The default value to be returned if there is no match
146
190
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
147
191
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
148
- :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
192
+ :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
149
193
 
150
194
  """
151
195
  result = self.re(regex, replace_entities, clean_match=clean_match, case_sensitive=case_sensitive)
@@ -158,30 +202,38 @@ class TextHandlers(List[TextHandler]):
158
202
  """
159
203
  __slots__ = ()
160
204
 
161
- def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[TextHandler, "TextHandlers[TextHandler]"]:
205
+ @typing.overload
206
+ def __getitem__(self, pos: SupportsIndex) -> TextHandler:
207
+ pass
208
+
209
+ @typing.overload
210
+ def __getitem__(self, pos: slice) -> "TextHandlers":
211
+ pass
212
+
213
+ def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[TextHandler, "TextHandlers"]:
162
214
  lst = super().__getitem__(pos)
163
215
  if isinstance(pos, slice):
164
- return self.__class__(lst)
165
- else:
166
- return lst
216
+ lst = [TextHandler(s) for s in lst]
217
+ return TextHandlers(typing.cast(List[_TextHandlerType], lst))
218
+ return typing.cast(_TextHandlerType, TextHandler(lst))
167
219
 
168
220
  def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
169
- case_sensitive: bool = False) -> 'List[str]':
221
+ case_sensitive: bool = True) -> 'TextHandlers[TextHandler]':
170
222
  """Call the ``.re()`` method for each element in this list and return
171
223
  their results flattened as TextHandlers.
172
224
 
173
225
  :param regex: Can be either a compiled regular expression or a string.
174
226
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
175
227
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
176
- :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
228
+ :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
177
229
  """
178
230
  results = [
179
231
  n.re(regex, replace_entities, clean_match, case_sensitive) for n in self
180
232
  ]
181
- return flatten(results)
233
+ return TextHandlers(flatten(results))
182
234
 
183
235
  def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
184
- clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
236
+ clean_match: bool = False, case_sensitive: bool = True) -> TextHandler:
185
237
  """Call the ``.re_first()`` method for each element in this list and return
186
238
  the first result or the default value otherwise.
187
239
 
@@ -189,7 +241,7 @@ class TextHandlers(List[TextHandler]):
189
241
  :param default: The default value to be returned if there is no match
190
242
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
191
243
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
192
- :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
244
+ :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
193
245
  """
194
246
  for n in self:
195
247
  for result in n.re(regex, replace_entities, clean_match, case_sensitive):
@@ -210,7 +262,7 @@ class TextHandlers(List[TextHandler]):
210
262
  get_all = extract
211
263
 
212
264
 
213
- class AttributesHandler(Mapping):
265
+ class AttributesHandler(Mapping[str, _TextHandlerType]):
214
266
  """A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
215
267
  If standard dictionary is needed, just convert this class to dictionary with `dict` function
216
268
  """
@@ -231,7 +283,7 @@ class AttributesHandler(Mapping):
231
283
  # Fastest read-only mapping type
232
284
  self._data = MappingProxyType(mapping)
233
285
 
234
- def get(self, key, default=None):
286
+ def get(self, key: str, default: Optional[str] = None) -> Union[_TextHandlerType, None]:
235
287
  """Acts like standard dictionary `.get()` method"""
236
288
  return self._data.get(key, default)
237
289
 
@@ -253,7 +305,7 @@ class AttributesHandler(Mapping):
253
305
  """Convert current attributes to JSON string if the attributes are JSON serializable otherwise throws error"""
254
306
  return dumps(dict(self._data))
255
307
 
256
- def __getitem__(self, key):
308
+ def __getitem__(self, key: str) -> _TextHandlerType:
257
309
  return self._data[key]
258
310
 
259
311
  def __iter__(self):
@@ -139,6 +139,6 @@ class TranslatorMixin:
139
139
 
140
140
 
141
141
  class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
142
- @lru_cache(maxsize=256)
142
+ @lru_cache(maxsize=2048)
143
143
  def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
144
144
  return super().css_to_xpath(css, prefix)
scrapling/defaults.py CHANGED
@@ -1,7 +1,10 @@
1
- from .fetchers import AsyncFetcher, Fetcher, PlayWrightFetcher, StealthyFetcher
1
+ from .fetchers import AsyncFetcher as _AsyncFetcher
2
+ from .fetchers import Fetcher as _Fetcher
3
+ from .fetchers import PlayWrightFetcher as _PlayWrightFetcher
4
+ from .fetchers import StealthyFetcher as _StealthyFetcher
2
5
 
3
6
  # If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
4
- Fetcher = Fetcher()
5
- AsyncFetcher = AsyncFetcher()
6
- StealthyFetcher = StealthyFetcher()
7
- PlayWrightFetcher = PlayWrightFetcher()
7
+ Fetcher = _Fetcher()
8
+ AsyncFetcher = _AsyncFetcher()
9
+ StealthyFetcher = _StealthyFetcher()
10
+ PlayWrightFetcher = _PlayWrightFetcher()
scrapling/engines/camo.py CHANGED
@@ -19,7 +19,7 @@ class CamoufoxEngine:
19
19
  block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
20
20
  timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
21
21
  wait_selector_state: Optional[SelectorWaitStates] = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
22
- proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
22
+ proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = False,
23
23
  geoip: Optional[bool] = False,
24
24
  adaptor_arguments: Dict = None,
25
25
  ):
@@ -36,7 +36,7 @@ class CamoufoxEngine:
36
36
  :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
37
37
  :param allow_webgl: Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled.
38
38
  :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
39
- :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
39
+ :param disable_ads: Disabled by default, this installs `uBlock Origin` addon on the browser if enabled.
40
40
  :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
41
41
  :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
42
42
  :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
@@ -95,6 +95,7 @@ class CamoufoxEngine:
95
95
  with Camoufox(
96
96
  geoip=self.geoip,
97
97
  proxy=self.proxy,
98
+ enable_cache=True,
98
99
  addons=self.addons,
99
100
  exclude_addons=addons,
100
101
  headless=self.headless,
@@ -140,6 +141,26 @@ class CamoufoxEngine:
140
141
  # PlayWright API sometimes give empty status text for some reason!
141
142
  status_text = final_response.status_text or StatusText.get(final_response.status)
142
143
 
144
+ history = []
145
+ current_request = first_response.request.redirected_from
146
+ while current_request:
147
+ current_response = current_request.response()
148
+
149
+ history.insert(0, Response(
150
+ url=current_request.url,
151
+ # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
152
+ text='',
153
+ body=b'',
154
+ status=current_response.status if current_response else 301,
155
+ reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
156
+ encoding=current_response.headers.get('content-type', '') or 'utf-8',
157
+ cookies={},
158
+ headers=current_response.all_headers() if current_response else {},
159
+ request_headers=current_request.all_headers(),
160
+ **self.adaptor_arguments
161
+ ))
162
+ current_request = current_request.redirected_from
163
+
143
164
  response = Response(
144
165
  url=page.url,
145
166
  text=page.content(),
@@ -150,6 +171,7 @@ class CamoufoxEngine:
150
171
  cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
151
172
  headers=first_response.all_headers(),
152
173
  request_headers=first_response.request.all_headers(),
174
+ history=history,
153
175
  **self.adaptor_arguments
154
176
  )
155
177
  page.close()
@@ -174,6 +196,7 @@ class CamoufoxEngine:
174
196
  async with AsyncCamoufox(
175
197
  geoip=self.geoip,
176
198
  proxy=self.proxy,
199
+ enable_cache=True,
177
200
  addons=self.addons,
178
201
  exclude_addons=addons,
179
202
  headless=self.headless,
@@ -219,6 +242,26 @@ class CamoufoxEngine:
219
242
  # PlayWright API sometimes give empty status text for some reason!
220
243
  status_text = final_response.status_text or StatusText.get(final_response.status)
221
244
 
245
+ history = []
246
+ current_request = first_response.request.redirected_from
247
+ while current_request:
248
+ current_response = await current_request.response()
249
+
250
+ history.insert(0, Response(
251
+ url=current_request.url,
252
+ # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
253
+ text='',
254
+ body=b'',
255
+ status=current_response.status if current_response else 301,
256
+ reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
257
+ encoding=current_response.headers.get('content-type', '') or 'utf-8',
258
+ cookies={},
259
+ headers=await current_response.all_headers() if current_response else {},
260
+ request_headers=await current_request.all_headers(),
261
+ **self.adaptor_arguments
262
+ ))
263
+ current_request = current_request.redirected_from
264
+
222
265
  response = Response(
223
266
  url=page.url,
224
267
  text=await page.content(),
@@ -229,6 +272,7 @@ class CamoufoxEngine:
229
272
  cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
230
273
  headers=await first_response.all_headers(),
231
274
  request_headers=await first_response.request.all_headers(),
275
+ history=history,
232
276
  **self.adaptor_arguments
233
277
  )
234
278
  await page.close()
scrapling/engines/pw.py CHANGED
@@ -105,7 +105,7 @@ class PlaywrightEngine:
105
105
  """
106
106
  cdp_url = self.cdp_url
107
107
  if self.nstbrowser_mode:
108
- if self.nstbrowser_config and type(self.nstbrowser_config) is Dict:
108
+ if self.nstbrowser_config and isinstance(self.nstbrowser_config, dict):
109
109
  config = self.nstbrowser_config
110
110
  else:
111
111
  query = NSTBROWSER_DEFAULT_QUERY.copy()
@@ -259,6 +259,26 @@ class PlaywrightEngine:
259
259
  # PlayWright API sometimes give empty status text for some reason!
260
260
  status_text = final_response.status_text or StatusText.get(final_response.status)
261
261
 
262
+ history = []
263
+ current_request = first_response.request.redirected_from
264
+ while current_request:
265
+ current_response = current_request.response()
266
+
267
+ history.insert(0, Response(
268
+ url=current_request.url,
269
+ # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
270
+ text='',
271
+ body=b'',
272
+ status=current_response.status if current_response else 301,
273
+ reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
274
+ encoding=current_response.headers.get('content-type', '') or 'utf-8',
275
+ cookies={},
276
+ headers=current_response.all_headers() if current_response else {},
277
+ request_headers=current_request.all_headers(),
278
+ **self.adaptor_arguments
279
+ ))
280
+ current_request = current_request.redirected_from
281
+
262
282
  response = Response(
263
283
  url=page.url,
264
284
  text=page.content(),
@@ -269,6 +289,7 @@ class PlaywrightEngine:
269
289
  cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
270
290
  headers=first_response.all_headers(),
271
291
  request_headers=first_response.request.all_headers(),
292
+ history=history,
272
293
  **self.adaptor_arguments
273
294
  )
274
295
  page.close()
@@ -345,6 +366,26 @@ class PlaywrightEngine:
345
366
  # PlayWright API sometimes give empty status text for some reason!
346
367
  status_text = final_response.status_text or StatusText.get(final_response.status)
347
368
 
369
+ history = []
370
+ current_request = first_response.request.redirected_from
371
+ while current_request:
372
+ current_response = await current_request.response()
373
+
374
+ history.insert(0, Response(
375
+ url=current_request.url,
376
+ # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
377
+ text='',
378
+ body=b'',
379
+ status=current_response.status if current_response else 301,
380
+ reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
381
+ encoding=current_response.headers.get('content-type', '') or 'utf-8',
382
+ cookies={},
383
+ headers=await current_response.all_headers() if current_response else {},
384
+ request_headers=await current_request.all_headers(),
385
+ **self.adaptor_arguments
386
+ ))
387
+ current_request = current_request.redirected_from
388
+
348
389
  response = Response(
349
390
  url=page.url,
350
391
  text=await page.content(),
@@ -355,6 +396,7 @@ class PlaywrightEngine:
355
396
  cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
356
397
  headers=await first_response.all_headers(),
357
398
  request_headers=await first_response.request.all_headers(),
399
+ history=history,
358
400
  **self.adaptor_arguments
359
401
  )
360
402
  await page.close()
@@ -72,6 +72,7 @@ class StaticEngine:
72
72
  headers=dict(response.headers),
73
73
  request_headers=dict(response.request.headers),
74
74
  method=response.request.method,
75
+ history=[self._prepare_response(redirection) for redirection in response.history],
75
76
  **self.adaptor_arguments
76
77
  )
77
78
 
@@ -85,13 +85,14 @@ class Response(Adaptor):
85
85
  """This class is returned by all engines as a way to unify response type between different libraries."""
86
86
 
87
87
  def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict,
88
- encoding: str = 'utf-8', method: str = 'GET', **adaptor_arguments: Dict):
88
+ encoding: str = 'utf-8', method: str = 'GET', history: List = None, **adaptor_arguments: Dict):
89
89
  automatch_domain = adaptor_arguments.pop('automatch_domain', None)
90
90
  self.status = status
91
91
  self.reason = reason
92
92
  self.cookies = cookies
93
93
  self.headers = headers
94
94
  self.request_headers = request_headers
95
+ self.history = history or []
95
96
  encoding = ResponseEncoding.get_value(encoding, text)
96
97
  super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
97
98
  # For back-ward compatibility
scrapling/fetchers.py CHANGED
@@ -143,7 +143,7 @@ class AsyncFetcher(Fetcher):
143
143
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
144
144
  """
145
145
  adaptor_arguments = tuple(self.adaptor_arguments.items())
146
- response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_post(**kwargs)
146
+ response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_put(**kwargs)
147
147
  return response_object
148
148
 
149
149
  async def delete(
@@ -177,7 +177,7 @@ class StealthyFetcher(BaseFetcher):
177
177
  block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
178
178
  timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
179
179
  wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
180
- proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
180
+ proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = False, geoip: Optional[bool] = False,
181
181
  ) -> Response:
182
182
  """
183
183
  Opens up a browser and do your request based on your chosen options below.
@@ -191,7 +191,7 @@ class StealthyFetcher(BaseFetcher):
191
191
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
192
192
  :param block_webrtc: Blocks WebRTC entirely.
193
193
  :param addons: List of Firefox addons to use. Must be paths to extracted addons.
194
- :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
194
+ :param disable_ads: Disabled by default, this installs `uBlock Origin` addon on the browser if enabled.
195
195
  :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
196
196
  :param allow_webgl: Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled.
197
197
  :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address.
@@ -235,7 +235,7 @@ class StealthyFetcher(BaseFetcher):
235
235
  block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
236
236
  timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
237
237
  wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
238
- proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
238
+ proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = False, geoip: Optional[bool] = False,
239
239
  ) -> Response:
240
240
  """
241
241
  Opens up a browser and do your request based on your chosen options below.
@@ -249,7 +249,7 @@ class StealthyFetcher(BaseFetcher):
249
249
  This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
250
250
  :param block_webrtc: Blocks WebRTC entirely.
251
251
  :param addons: List of Firefox addons to use. Must be paths to extracted addons.
252
- :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
252
+ :param disable_ads: Disabled by default, this installs `uBlock Origin` addon on the browser if enabled.
253
253
  :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
254
254
  :param allow_webgl: Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled.
255
255
  :param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address.