scrapling-0.2.93-py3-none-any.whl → scrapling-0.2.95-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/core/custom_types.py +11 -11
- scrapling/engines/camo.py +42 -2
- scrapling/engines/pw.py +42 -0
- scrapling/engines/static.py +10 -6
- scrapling/engines/toolbelt/custom.py +2 -1
- scrapling/parser.py +9 -9
- {scrapling-0.2.93.dist-info → scrapling-0.2.95.dist-info}/METADATA +6 -4
- {scrapling-0.2.93.dist-info → scrapling-0.2.95.dist-info}/RECORD +13 -13
- {scrapling-0.2.93.dist-info → scrapling-0.2.95.dist-info}/WHEEL +1 -1
- {scrapling-0.2.93.dist-info → scrapling-0.2.95.dist-info}/LICENSE +0 -0
- {scrapling-0.2.93.dist-info → scrapling-0.2.95.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.93.dist-info → scrapling-0.2.95.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
```diff
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
 from scrapling.parser import Adaptor, Adaptors
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.93"
+__version__ = "0.2.95"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
```
scrapling/core/custom_types.py
CHANGED
```diff
@@ -134,7 +134,7 @@ class TextHandler(str):
         check_match: Literal[True],
         replace_entities: bool = True,
         clean_match: bool = False,
-        case_sensitive: bool =
+        case_sensitive: bool = True,
     ) -> bool:
         ...
 
@@ -144,26 +144,26 @@ class TextHandler(str):
         regex: Union[str, Pattern[str]],
         replace_entities: bool = True,
         clean_match: bool = False,
-        case_sensitive: bool =
+        case_sensitive: bool = True,
         check_match: Literal[False] = False,
     ) -> "TextHandlers[TextHandler]":
         ...
 
     def re(
             self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
-            case_sensitive: bool =
+            case_sensitive: bool = True, check_match: bool = False
     ) -> Union["TextHandlers[TextHandler]", bool]:
         """Apply the given regex to the current text and return a list of strings with the matches.
 
         :param regex: Can be either a compiled regular expression or a string.
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         :param check_match: used to quickly check if this regex matches or not without any operations on the results
 
         """
         if isinstance(regex, str):
-            if
+            if case_sensitive:
                 regex = re.compile(regex, re.UNICODE)
             else:
                 regex = re.compile(regex, flags=re.UNICODE | re.IGNORECASE)
@@ -182,14 +182,14 @@ class TextHandler(str):
         return TextHandlers(typing.cast(List[_TextHandlerType], [TextHandler(_replace_entities(s)) for s in results]))
 
     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                 clean_match: bool = False, case_sensitive: bool =
+                 clean_match: bool = False, case_sensitive: bool = True) -> "TextHandler":
         """Apply the given regex to text and return the first match if found, otherwise return the default value.
 
         :param regex: Can be either a compiled regular expression or a string.
         :param default: The default value to be returned if there is no match
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
 
         """
         result = self.re(regex, replace_entities, clean_match=clean_match, case_sensitive=case_sensitive)
@@ -218,14 +218,14 @@ class TextHandlers(List[TextHandler]):
         return typing.cast(_TextHandlerType, TextHandler(lst))
 
     def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
-           case_sensitive: bool =
+           case_sensitive: bool = True) -> 'TextHandlers[TextHandler]':
         """Call the ``.re()`` method for each element in this list and return
         their results flattened as TextHandlers.
 
         :param regex: Can be either a compiled regular expression or a string.
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         """
         results = [
             n.re(regex, replace_entities, clean_match, case_sensitive) for n in self
@@ -233,7 +233,7 @@ class TextHandlers(List[TextHandler]):
         return TextHandlers(flatten(results))
 
     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                 clean_match: bool = False, case_sensitive: bool =
+                 clean_match: bool = False, case_sensitive: bool = True) -> TextHandler:
         """Call the ``.re_first()`` method for each element in this list and return
         the first result or the default value otherwise.
 
@@ -241,7 +241,7 @@ class TextHandlers(List[TextHandler]):
         :param default: The default value to be returned if there is no match
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         """
         for n in self:
             for result in n.re(regex, replace_entities, clean_match, case_sensitive):
```
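The practical effect: `.re()` and `.re_first()` now match case-sensitively unless told otherwise. A short usage sketch (not from the diff; it assumes `TextHandler` is importable from the module path shown above, and the sample text is made up):

```python
from scrapling.core.custom_types import TextHandler

text = TextHandler("Price: 20 USD, price: 15 usd")

# New default: the pattern is compiled with re.UNICODE only,
# so only the lowercase "price" matches.
text.re(r"price: \d+")                        # -> ['price: 15']

# Opting out compiles with re.UNICODE | re.IGNORECASE, matching both.
text.re(r"price: \d+", case_sensitive=False)  # -> ['Price: 20', 'price: 15']

# check_match=True short-circuits to a bool instead of building results.
text.re(r"USD", check_match=True)             # -> True
```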
scrapling/engines/camo.py
CHANGED
```diff
@@ -95,7 +95,6 @@ class CamoufoxEngine:
         with Camoufox(
             geoip=self.geoip,
             proxy=self.proxy,
-            disable_coop=True,
             enable_cache=True,
             addons=self.addons,
             exclude_addons=addons,
@@ -142,6 +141,26 @@ class CamoufoxEngine:
             # PlayWright API sometimes give empty status text for some reason!
             status_text = final_response.status_text or StatusText.get(final_response.status)
 
+            history = []
+            current_request = first_response.request.redirected_from
+            while current_request:
+                current_response = current_request.response()
+
+                history.insert(0, Response(
+                    url=current_request.url,
+                    # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+                    text='',
+                    body=b'',
+                    status=current_response.status if current_response else 301,
+                    reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+                    encoding=current_response.headers.get('content-type', '') or 'utf-8',
+                    cookies={},
+                    headers=current_response.all_headers() if current_response else {},
+                    request_headers=current_request.all_headers(),
+                    **self.adaptor_arguments
+                ))
+                current_request = current_request.redirected_from
+
             response = Response(
                 url=page.url,
                 text=page.content(),
@@ -152,6 +171,7 @@ class CamoufoxEngine:
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
                 headers=first_response.all_headers(),
                 request_headers=first_response.request.all_headers(),
+                history=history,
                 **self.adaptor_arguments
             )
             page.close()
@@ -176,7 +196,6 @@ class CamoufoxEngine:
         async with AsyncCamoufox(
             geoip=self.geoip,
             proxy=self.proxy,
-            disable_coop=True,
             enable_cache=True,
             addons=self.addons,
             exclude_addons=addons,
@@ -223,6 +242,26 @@ class CamoufoxEngine:
             # PlayWright API sometimes give empty status text for some reason!
             status_text = final_response.status_text or StatusText.get(final_response.status)
 
+            history = []
+            current_request = first_response.request.redirected_from
+            while current_request:
+                current_response = await current_request.response()
+
+                history.insert(0, Response(
+                    url=current_request.url,
+                    # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+                    text='',
+                    body=b'',
+                    status=current_response.status if current_response else 301,
+                    reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+                    encoding=current_response.headers.get('content-type', '') or 'utf-8',
+                    cookies={},
+                    headers=await current_response.all_headers() if current_response else {},
+                    request_headers=await current_request.all_headers(),
+                    **self.adaptor_arguments
+                ))
+                current_request = current_request.redirected_from
+
             response = Response(
                 url=page.url,
                 text=await page.content(),
@@ -233,6 +272,7 @@ class CamoufoxEngine:
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
                 headers=await first_response.all_headers(),
                 request_headers=await first_response.request.all_headers(),
+                history=history,
                 **self.adaptor_arguments
             )
             await page.close()
```
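Condensed, the redirect-tracking pattern both the sync and async methods now share looks like this (a sketch against Playwright's `Request.redirected_from` API; `final_response` stands in for the last response the page received):

```python
# Walk the redirect chain backwards from the final response; insert(0, ...)
# keeps the collected history ordered oldest hop first.
history = []
request = final_response.request.redirected_from
while request:
    hop = request.response()  # may be None if the hop never got a response
    history.insert(0, {
        'url': request.url,
        # Redirect responses expose no readable body in Playwright,
        # so only status and headers are recoverable per hop.
        'status': hop.status if hop else 301,
        'headers': hop.all_headers() if hop else {},
    })
    request = request.redirected_from
```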
scrapling/engines/pw.py
CHANGED
```diff
@@ -259,6 +259,26 @@ class PlaywrightEngine:
             # PlayWright API sometimes give empty status text for some reason!
             status_text = final_response.status_text or StatusText.get(final_response.status)
 
+            history = []
+            current_request = first_response.request.redirected_from
+            while current_request:
+                current_response = current_request.response()
+
+                history.insert(0, Response(
+                    url=current_request.url,
+                    # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+                    text='',
+                    body=b'',
+                    status=current_response.status if current_response else 301,
+                    reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+                    encoding=current_response.headers.get('content-type', '') or 'utf-8',
+                    cookies={},
+                    headers=current_response.all_headers() if current_response else {},
+                    request_headers=current_request.all_headers(),
+                    **self.adaptor_arguments
+                ))
+                current_request = current_request.redirected_from
+
             response = Response(
                 url=page.url,
                 text=page.content(),
@@ -269,6 +289,7 @@ class PlaywrightEngine:
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
                 headers=first_response.all_headers(),
                 request_headers=first_response.request.all_headers(),
+                history=history,
                 **self.adaptor_arguments
             )
             page.close()
@@ -345,6 +366,26 @@ class PlaywrightEngine:
             # PlayWright API sometimes give empty status text for some reason!
             status_text = final_response.status_text or StatusText.get(final_response.status)
 
+            history = []
+            current_request = first_response.request.redirected_from
+            while current_request:
+                current_response = await current_request.response()
+
+                history.insert(0, Response(
+                    url=current_request.url,
+                    # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+                    text='',
+                    body=b'',
+                    status=current_response.status if current_response else 301,
+                    reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+                    encoding=current_response.headers.get('content-type', '') or 'utf-8',
+                    cookies={},
+                    headers=await current_response.all_headers() if current_response else {},
+                    request_headers=await current_request.all_headers(),
+                    **self.adaptor_arguments
+                ))
+                current_request = current_request.redirected_from
+
             response = Response(
                 url=page.url,
                 text=await page.content(),
@@ -355,6 +396,7 @@ class PlaywrightEngine:
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
                 headers=await first_response.all_headers(),
                 request_headers=await first_response.request.all_headers(),
+                history=history,
                 **self.adaptor_arguments
             )
             await page.close()
```
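With both engines populating `history`, the redirect chain of a fetched page can be inspected directly. A usage sketch (the target URL is illustrative):

```python
from scrapling.fetchers import PlayWrightFetcher

page = PlayWrightFetcher().fetch('https://httpbin.org/redirect/2')
for hop in page.history:          # each hop is a Response with an empty body
    print(hop.status, '->', hop.url)
print(page.status, page.url)      # the final, non-redirect response
```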
scrapling/engines/static.py
CHANGED
```diff
@@ -42,16 +42,19 @@ class StaticEngine:
         :return: A dictionary of the new headers.
         """
         headers = headers or {}
-
-        # Validate headers
-        if not headers.get('user-agent') and not headers.get('User-Agent'):
-            headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
-            log.debug(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
+        headers_keys = set(map(str.lower, headers.keys()))
 
         if self.stealth:
             extra_headers = generate_headers(browser_mode=False)
+            # Don't overwrite user supplied headers
+            extra_headers = {key: value for key, value in extra_headers.items() if key.lower() not in headers_keys}
             headers.update(extra_headers)
-
+            if 'referer' not in headers_keys:
+                headers.update({'referer': generate_convincing_referer(self.url)})
+
+        elif 'user-agent' not in headers_keys:
+            headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
+            log.debug(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
 
         return headers
 
@@ -72,6 +75,7 @@ class StaticEngine:
             headers=dict(response.headers),
             request_headers=dict(response.request.headers),
             method=response.request.method,
+            history=[self._prepare_response(redirection) for redirection in response.history],
             **self.adaptor_arguments
         )
 
```
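In isolation, the merge rule this rewrite introduces behaves like the following sketch (header values are made up; scrapling's own `generate_headers` is not shown here):

```python
user_headers = {'User-Agent': 'my-bot/1.0'}
generated = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64)', 'Accept': 'text/html'}

# Case-insensitive key set of what the caller already supplied.
known = {key.lower() for key in user_headers}

# Generated headers only fill gaps; they never overwrite user values.
merged = {**user_headers,
          **{k: v for k, v in generated.items() if k.lower() not in known}}
assert merged == {'User-Agent': 'my-bot/1.0', 'Accept': 'text/html'}
```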
scrapling/engines/toolbelt/custom.py
CHANGED
```diff
@@ -85,13 +85,14 @@ class Response(Adaptor):
     """This class is returned by all engines as a way to unify response type between different libraries."""
 
     def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict,
-                 encoding: str = 'utf-8', method: str = 'GET', **adaptor_arguments: Dict):
+                 encoding: str = 'utf-8', method: str = 'GET', history: List = None, **adaptor_arguments: Dict):
         automatch_domain = adaptor_arguments.pop('automatch_domain', None)
         self.status = status
         self.reason = reason
         self.cookies = cookies
         self.headers = headers
         self.request_headers = request_headers
+        self.history = history or []
         encoding = ResponseEncoding.get_value(encoding, text)
         super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
         # For back-ward compatibility
```
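Because `history` defaults to `None` and is coerced to a list, existing call sites that omit it keep working. A minimal construction sketch (assuming the parser accepts a small inline document):

```python
from scrapling.engines.toolbelt.custom import Response

r = Response(
    url='https://example.com', text='<html><body></body></html>', body=b'',
    status=200, reason='OK', cookies={}, headers={}, request_headers={},
)
print(r.history)  # [] -- the omitted argument falls back to an empty list
```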
scrapling/parser.py
CHANGED
```diff
@@ -132,7 +132,7 @@ class Adaptor(SelectorsGeneration):
         self.__tag = None
         # No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
         self.__response_data = {
-            key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
+            key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'history', 'headers', 'request_headers',)
         } if hasattr(self, 'status') else {}
 
         # Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
@@ -763,25 +763,25 @@ class Adaptor(SelectorsGeneration):
         return self.get_all_text(strip=True).json()
 
     def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
-           clean_match: bool = False, case_sensitive: bool =
+           clean_match: bool = False, case_sensitive: bool = True) -> TextHandlers:
         """Apply the given regex to the current text and return a list of strings with the matches.
 
         :param regex: Can be either a compiled regular expression or a string.
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         """
         return self.text.re(regex, replace_entities, clean_match, case_sensitive)
 
     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                 clean_match: bool = False, case_sensitive: bool =
+                 clean_match: bool = False, case_sensitive: bool = True) -> TextHandler:
         """Apply the given regex to text and return the first match if found, otherwise return the default value.
 
         :param regex: Can be either a compiled regular expression or a string.
         :param default: The default value to be returned if there is no match
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         """
         return self.text.re_first(regex, default, replace_entities, clean_match, case_sensitive)
 
@@ -1009,14 +1009,14 @@ class Adaptors(List[Adaptor]):
         return self.__class__(flatten(results))
 
     def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
-           clean_match: bool = False, case_sensitive: bool =
+           clean_match: bool = False, case_sensitive: bool = True) -> TextHandlers[TextHandler]:
         """Call the ``.re()`` method for each element in this list and return
         their results flattened as List of TextHandler.
 
         :param regex: Can be either a compiled regular expression or a string.
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         """
         results = [
             n.text.re(regex, replace_entities, clean_match, case_sensitive) for n in self
@@ -1024,7 +1024,7 @@ class Adaptors(List[Adaptor]):
         return TextHandlers(flatten(results))
 
     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                 clean_match: bool = False, case_sensitive: bool =
+                 clean_match: bool = False, case_sensitive: bool = True) -> TextHandler:
         """Call the ``.re_first()`` method for each element in this list and return
         the first result or the default value otherwise.
 
@@ -1032,7 +1032,7 @@ class Adaptors(List[Adaptor]):
         :param default: The default value to be returned if there is no match
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         """
         for n in self:
             for result in n.re(regex, replace_entities, clean_match, case_sensitive):
```
{scrapling-0.2.93.dist-info → scrapling-0.2.95.dist-info}/METADATA
CHANGED
````diff
@@ -1,7 +1,7 @@
 Metadata-Version: 2.2
 Name: scrapling
-Version: 0.2.93
-Summary: Scrapling is
+Version: 0.2.95
+Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy again! In an internet filled with complications,
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
 Author-email: karim.shoair@pm.me
@@ -40,7 +40,7 @@ Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,socks,zstd]
 Requires-Dist: playwright>=1.49.1
 Requires-Dist: rebrowser-playwright>=1.49.1
-Requires-Dist: camoufox[geoip]>=0.4.
+Requires-Dist: camoufox[geoip]>=0.4.11
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier
@@ -267,7 +267,7 @@ then use it right away without initializing like:
 page = StealthyFetcher.fetch('https://example.com')
 ```
 
-Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, `history`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
 ### Fetcher
@@ -275,6 +275,8 @@ This class is built on top of [httpx](https://www.python-httpx.org/) with additi
 
 For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default. You can also set the number of retries with the argument `retries` for all methods and this will make httpx retry requests if it failed for any reason. The default number of retries for all `Fetcher` methods is 3.
 
+> Hence: All headers generated by `stealthy_headers` argument can be overwritten by you through the `headers` argument
+
 You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
 ```python
 >> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
````
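The new README note pairs with the `static.py` change above. A usage sketch (extra keyword arguments such as `headers` are forwarded to httpx; the target URL is illustrative):

```python
from scrapling.fetchers import Fetcher

page = Fetcher().get(
    'https://httpbin.org/headers',
    stealthy_headers=True,                     # generate browser-like headers
    headers={'User-Agent': 'my-crawler/2.0'},  # user value is kept, not overwritten
)
print(page.json()['headers']['User-Agent'])    # -> my-crawler/2.0
```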
{scrapling-0.2.93.dist-info → scrapling-0.2.95.dist-info}/RECORD
CHANGED
```diff
@@ -1,23 +1,23 @@
-scrapling/__init__.py,sha256=
+scrapling/__init__.py,sha256=7gthgq0LYOWqlly_w1GnesFh1WzPmkXwXJyjXB3JvVY,500
 scrapling/cli.py,sha256=njPdJKmbLFHeWjtSiGEm9ALBdSyfUp0IaJvxQL5C31Q,1125
 scrapling/defaults.py,sha256=sdXeZjXEX7PmCtaa0weK0nRrAUzqZukNNqipZ_sltYE,469
 scrapling/fetchers.py,sha256=qmiJ6S-bnPWvP48Z6rKxBnSuR-tdwHlJwlIsYxGxFM0,35405
-scrapling/parser.py,sha256=
+scrapling/parser.py,sha256=b_1eHxRwHRCidyvm3F6ST6qIYvVEVU6GhTTCI1LblVk,54330
 scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
 scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 scrapling/core/_types.py,sha256=dKVi_dUxdxNtTr7sj7ySkHXDfrsmjFTfpCQeO5tGuBY,670
-scrapling/core/custom_types.py,sha256=
+scrapling/core/custom_types.py,sha256=X5fNOS3E7BDkvoUxrRZpEoPlzbLMlibGhZVGbHb2E74,13393
 scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
 scrapling/core/storage_adaptors.py,sha256=l_ZYcdn1y69AcoPuRrPoaxqKysN62pMExrwJWYdu5MA,6220
 scrapling/core/translator.py,sha256=hFSc3mxG5pYhbwRgingeFbD_E73U799vCsvVv0uFEXw,5237
 scrapling/core/utils.py,sha256=03LzCDzmeK1TXPjIKVzHSUgSfhpe36XE8AwxlgxzJoU,3705
 scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
-scrapling/engines/camo.py,sha256=
+scrapling/engines/camo.py,sha256=SHMRnIrN6599upo5-G3fZQ10455xyB-bB_EsLMjBStA,16072
 scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
-scrapling/engines/pw.py,sha256=
-scrapling/engines/static.py,sha256=
+scrapling/engines/pw.py,sha256=LvS1jvTf3s7mfdeQo7_OyQ5zpiOzvBu5g88hOLlQBCQ,20856
+scrapling/engines/static.py,sha256=8v6RmdsSP6fAtWNXaJG24evHPsZ2oDiBl7yfkLrdARU,10635
 scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
-scrapling/engines/toolbelt/custom.py,sha256=
+scrapling/engines/toolbelt/custom.py,sha256=qgONLwpxUoEIAIQBF1RcakYu8cqAAmX8qdyaol5hfjA,12813
 scrapling/engines/toolbelt/fingerprints.py,sha256=ajEHdXHr7W4hw9KcNS7XlyxNBZu37p1bRj18TiICLzU,2929
 scrapling/engines/toolbelt/navigation.py,sha256=xEfZRJefuxOCGxQOSI2llS0du0Y2XmoIPdVGUSHOd7k,4567
 scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
@@ -41,9 +41,9 @@ tests/fetchers/sync/test_playwright.py,sha256=MEyDRaMyxDIWupG7f_xz0f0jd9Cpbd5rXC
 tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
 tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
-scrapling-0.2.93.dist-info/LICENSE,sha256=
-scrapling-0.2.93.dist-info/METADATA,sha256=
-scrapling-0.2.93.dist-info/WHEEL,sha256=
-scrapling-0.2.93.dist-info/entry_points.txt,sha256=
-scrapling-0.2.93.dist-info/top_level.txt,sha256=
-scrapling-0.2.93.dist-info/RECORD,,
+scrapling-0.2.95.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.95.dist-info/METADATA,sha256=PTTxxxijblkcumiCbowId3Xy5I64lF9DvH3nAMPhEHQ,69066
+scrapling-0.2.95.dist-info/WHEEL,sha256=nn6H5-ilmfVryoAQl3ZQ2l8SH5imPWFpm1A5FgEuFV4,91
+scrapling-0.2.95.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
+scrapling-0.2.95.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.95.dist-info/RECORD,,
```
{scrapling-0.2.93.dist-info → scrapling-0.2.95.dist-info}/LICENSE
File without changes

{scrapling-0.2.93.dist-info → scrapling-0.2.95.dist-info}/entry_points.txt
File without changes

{scrapling-0.2.93.dist-info → scrapling-0.2.95.dist-info}/top_level.txt
File without changes