scrapling 0.2.93__tar.gz → 0.2.94__tar.gz
- {scrapling-0.2.93/scrapling.egg-info → scrapling-0.2.94}/PKG-INFO +3 -3
- {scrapling-0.2.93 → scrapling-0.2.94}/README.md +1 -1
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/__init__.py +1 -1
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/core/custom_types.py +11 -11
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/camo.py +42 -2
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/pw.py +42 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/static.py +1 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/custom.py +2 -1
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/parser.py +9 -9
- {scrapling-0.2.93 → scrapling-0.2.94/scrapling.egg-info}/PKG-INFO +3 -3
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling.egg-info/requires.txt +1 -1
- {scrapling-0.2.93 → scrapling-0.2.94}/setup.cfg +1 -1
- {scrapling-0.2.93 → scrapling-0.2.94}/setup.py +2 -2
- {scrapling-0.2.93 → scrapling-0.2.94}/LICENSE +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/MANIFEST.in +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/cli.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/core/__init__.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/core/_types.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/core/mixins.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/core/storage_adaptors.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/core/translator.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/core/utils.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/defaults.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/constants.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/__init__.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/fingerprints.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/navigation.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/fetchers.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/py.typed +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling.egg-info/SOURCES.txt +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling.egg-info/entry_points.txt +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling.egg-info/top_level.txt +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/__init__.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/__init__.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/async/__init__.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/async/test_camoufox.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/async/test_httpx.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/async/test_playwright.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/sync/__init__.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/sync/test_camoufox.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/sync/test_httpx.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/sync/test_playwright.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/test_utils.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/parser/__init__.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/parser/test_automatch.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/parser/test_general.py +0 -0
{scrapling-0.2.93/scrapling.egg-info → scrapling-0.2.94}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: scrapling
-Version: 0.2.93
+Version: 0.2.94
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -40,7 +40,7 @@ Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,socks,zstd]
 Requires-Dist: playwright>=1.49.1
 Requires-Dist: rebrowser-playwright>=1.49.1
-Requires-Dist: camoufox[geoip]>=0.4.
+Requires-Dist: camoufox[geoip]>=0.4.11
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier
@@ -267,7 +267,7 @@ then use it right away without initializing like:
 page = StealthyFetcher.fetch('https://example.com')
 ```

-Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, `history`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
 ### Fetcher
{scrapling-0.2.93 → scrapling-0.2.94}/README.md
@@ -212,7 +212,7 @@ then use it right away without initializing like:
 page = StealthyFetcher.fetch('https://example.com')
 ```

-Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, `history`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
 ### Fetcher
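The newly documented `history` attribute holds one `Response` per redirect hop, oldest first. A minimal usage sketch (the redirecting URL is illustrative only, and `follow_redirects` is assumed to be the pass-through of the underlying httpx option):

```python
from scrapling import Fetcher

# Fetch a URL that redirects; every hop in the redirect chain is kept
# as its own Response object on the final response's `history` list.
page = Fetcher().get('https://httpbin.org/redirect/2', follow_redirects=True)

print(page.status)              # 200 -- the final response
for hop in page.history:        # redirect hops, oldest first
    print(hop.status, hop.url)
```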
{scrapling-0.2.93 → scrapling-0.2.94}/scrapling/__init__.py
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
 from scrapling.parser import Adaptor, Adaptors

 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.93"
+__version__ = "0.2.94"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"

{scrapling-0.2.93 → scrapling-0.2.94}/scrapling/core/custom_types.py
@@ -134,7 +134,7 @@ class TextHandler(str):
         check_match: Literal[True],
         replace_entities: bool = True,
         clean_match: bool = False,
-        case_sensitive: bool =
+        case_sensitive: bool = True,
     ) -> bool:
         ...

@@ -144,26 +144,26 @@ class TextHandler(str):
         regex: Union[str, Pattern[str]],
         replace_entities: bool = True,
         clean_match: bool = False,
-        case_sensitive: bool =
+        case_sensitive: bool = True,
         check_match: Literal[False] = False,
     ) -> "TextHandlers[TextHandler]":
         ...

     def re(
         self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
-        case_sensitive: bool =
+        case_sensitive: bool = True, check_match: bool = False
     ) -> Union["TextHandlers[TextHandler]", bool]:
         """Apply the given regex to the current text and return a list of strings with the matches.

         :param regex: Can be either a compiled regular expression or a string.
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         :param check_match: used to quickly check if this regex matches or not without any operations on the results

         """
         if isinstance(regex, str):
-            if
+            if case_sensitive:
                 regex = re.compile(regex, re.UNICODE)
             else:
                 regex = re.compile(regex, flags=re.UNICODE | re.IGNORECASE)
@@ -182,14 +182,14 @@ class TextHandler(str):
         return TextHandlers(typing.cast(List[_TextHandlerType], [TextHandler(_replace_entities(s)) for s in results]))

     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                 clean_match: bool = False, case_sensitive: bool =
+                 clean_match: bool = False, case_sensitive: bool = True) -> "TextHandler":
         """Apply the given regex to text and return the first match if found, otherwise return the default value.

         :param regex: Can be either a compiled regular expression or a string.
         :param default: The default value to be returned if there is no match
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it

         """
         result = self.re(regex, replace_entities, clean_match=clean_match, case_sensitive=case_sensitive)
@@ -218,14 +218,14 @@ class TextHandlers(List[TextHandler]):
         return typing.cast(_TextHandlerType, TextHandler(lst))

     def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
-           case_sensitive: bool =
+           case_sensitive: bool = True) -> 'TextHandlers[TextHandler]':
         """Call the ``.re()`` method for each element in this list and return
         their results flattened as TextHandlers.

         :param regex: Can be either a compiled regular expression or a string.
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         """
         results = [
             n.re(regex, replace_entities, clean_match, case_sensitive) for n in self
@@ -233,7 +233,7 @@ class TextHandlers(List[TextHandler]):
         return TextHandlers(flatten(results))

     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                 clean_match: bool = False, case_sensitive: bool =
+                 clean_match: bool = False, case_sensitive: bool = True) -> TextHandler:
         """Call the ``.re_first()`` method for each element in this list and return
         the first result or the default value otherwise.

@@ -241,7 +241,7 @@ class TextHandlers(List[TextHandler]):
         :param default: The default value to be returned if there is no match
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         """
         for n in self:
             for result in n.re(regex, replace_entities, clean_match, case_sensitive):
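Taken together, these hunks flip the default: string patterns passed to `.re()`/`.re_first()` now compile case-sensitively, and `case_sensitive=False` opts into `re.IGNORECASE`. A standalone sketch of the compile branch as it reads after the change:

```python
import re
from typing import Pattern, Union

def compile_pattern(regex: Union[str, Pattern[str]], case_sensitive: bool = True) -> Pattern[str]:
    # Mirrors the branch in TextHandler.re() above: plain compile by default,
    # re.IGNORECASE only when the caller passes case_sensitive=False.
    if isinstance(regex, str):
        if case_sensitive:
            return re.compile(regex, re.UNICODE)
        return re.compile(regex, flags=re.UNICODE | re.IGNORECASE)
    return regex  # already compiled; flags are left untouched

assert compile_pattern('foo').search('FOO') is None
assert compile_pattern('foo', case_sensitive=False).search('FOO') is not None
```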
{scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/camo.py
@@ -95,7 +95,6 @@ class CamoufoxEngine:
 with Camoufox(
     geoip=self.geoip,
     proxy=self.proxy,
-    disable_coop=True,
     enable_cache=True,
     addons=self.addons,
     exclude_addons=addons,
@@ -142,6 +141,26 @@ class CamoufoxEngine:
 # PlayWright API sometimes give empty status text for some reason!
 status_text = final_response.status_text or StatusText.get(final_response.status)

+history = []
+current_request = first_response.request.redirected_from
+while current_request:
+    current_response = current_request.response()
+
+    history.insert(0, Response(
+        url=current_request.url,
+        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+        text='',
+        body=b'',
+        status=current_response.status if current_response else 301,
+        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+        encoding=current_response.headers.get('content-type', '') or 'utf-8',
+        cookies={},
+        headers=current_response.all_headers() if current_response else {},
+        request_headers=current_request.all_headers(),
+        **self.adaptor_arguments
+    ))
+    current_request = current_request.redirected_from
+
 response = Response(
     url=page.url,
     text=page.content(),
@@ -152,6 +171,7 @@ class CamoufoxEngine:
     cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
     headers=first_response.all_headers(),
     request_headers=first_response.request.all_headers(),
+    history=history,
     **self.adaptor_arguments
 )
 page.close()
@@ -176,7 +196,6 @@ class CamoufoxEngine:
 async with AsyncCamoufox(
     geoip=self.geoip,
     proxy=self.proxy,
-    disable_coop=True,
     enable_cache=True,
     addons=self.addons,
     exclude_addons=addons,
@@ -223,6 +242,26 @@ class CamoufoxEngine:
 # PlayWright API sometimes give empty status text for some reason!
 status_text = final_response.status_text or StatusText.get(final_response.status)

+history = []
+current_request = first_response.request.redirected_from
+while current_request:
+    current_response = await current_request.response()
+
+    history.insert(0, Response(
+        url=current_request.url,
+        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+        text='',
+        body=b'',
+        status=current_response.status if current_response else 301,
+        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+        encoding=current_response.headers.get('content-type', '') or 'utf-8',
+        cookies={},
+        headers=await current_response.all_headers() if current_response else {},
+        request_headers=await current_request.all_headers(),
+        **self.adaptor_arguments
+    ))
+    current_request = current_request.redirected_from
+
 response = Response(
     url=page.url,
     text=await page.content(),
@@ -233,6 +272,7 @@ class CamoufoxEngine:
     cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
     headers=await first_response.all_headers(),
     request_headers=await first_response.request.all_headers(),
+    history=history,
     **self.adaptor_arguments
 )
 await page.close()
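Both the sync and async variants recover the redirect chain the same way: Playwright links each request to its predecessor through `redirected_from`, so the loop walks backwards from the final request and prepends with `insert(0, ...)` to leave `history` oldest-first, falling back to a generic 301 when a hop's response is unavailable. A minimal standalone sketch of that traversal pattern (using a stand-in linked node type, not Playwright's classes):

```python
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Hop:                        # stand-in for Playwright's Request object
    url: str
    redirected_from: Optional["Hop"] = None

# Chain built the way Playwright links it: the final request points backwards.
first = Hop("http://a.example")
middle = Hop("http://b.example", redirected_from=first)
final = Hop("http://c.example", redirected_from=middle)

history: List[str] = []
node = final.redirected_from
while node:
    history.insert(0, node.url)   # prepend, so the oldest hop ends up first
    node = node.redirected_from

assert history == ["http://a.example", "http://b.example"]
```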
{scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/pw.py
@@ -259,6 +259,26 @@ class PlaywrightEngine:
 # PlayWright API sometimes give empty status text for some reason!
 status_text = final_response.status_text or StatusText.get(final_response.status)

+history = []
+current_request = first_response.request.redirected_from
+while current_request:
+    current_response = current_request.response()
+
+    history.insert(0, Response(
+        url=current_request.url,
+        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+        text='',
+        body=b'',
+        status=current_response.status if current_response else 301,
+        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+        encoding=current_response.headers.get('content-type', '') or 'utf-8',
+        cookies={},
+        headers=current_response.all_headers() if current_response else {},
+        request_headers=current_request.all_headers(),
+        **self.adaptor_arguments
+    ))
+    current_request = current_request.redirected_from
+
 response = Response(
     url=page.url,
     text=page.content(),
@@ -269,6 +289,7 @@ class PlaywrightEngine:
     cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
     headers=first_response.all_headers(),
     request_headers=first_response.request.all_headers(),
+    history=history,
     **self.adaptor_arguments
 )
 page.close()
@@ -345,6 +366,26 @@ class PlaywrightEngine:
 # PlayWright API sometimes give empty status text for some reason!
 status_text = final_response.status_text or StatusText.get(final_response.status)

+history = []
+current_request = first_response.request.redirected_from
+while current_request:
+    current_response = await current_request.response()
+
+    history.insert(0, Response(
+        url=current_request.url,
+        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+        text='',
+        body=b'',
+        status=current_response.status if current_response else 301,
+        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+        encoding=current_response.headers.get('content-type', '') or 'utf-8',
+        cookies={},
+        headers=await current_response.all_headers() if current_response else {},
+        request_headers=await current_request.all_headers(),
+        **self.adaptor_arguments
+    ))
+    current_request = current_request.redirected_from
+
 response = Response(
     url=page.url,
     text=await page.content(),
@@ -355,6 +396,7 @@ class PlaywrightEngine:
     cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
     headers=await first_response.all_headers(),
     request_headers=await first_response.request.all_headers(),
+    history=history,
     **self.adaptor_arguments
 )
 await page.close()
{scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/static.py
@@ -72,6 +72,7 @@ class StaticEngine:
     headers=dict(response.headers),
     request_headers=dict(response.request.headers),
     method=response.request.method,
+    history=[self._prepare_response(redirection) for redirection in response.history],
     **self.adaptor_arguments
 )

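Unlike the browser engines, the static engine gets the chain for free: httpx already records intermediate redirect responses on `response.history` (oldest first), so each hop is simply mapped through the engine's `_prepare_response`. A quick illustration of the httpx behavior this builds on (the redirecting endpoint is illustrative only):

```python
import httpx

# httpx keeps each intermediate redirect response on `history`, oldest first.
response = httpx.get("https://httpbin.org/redirect/2", follow_redirects=True)

print(response.status_code)          # 200 -- the final response
for hop in response.history:
    print(hop.status_code, hop.url)  # e.g. 302 for each intermediate hop
```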
{scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/custom.py
@@ -85,13 +85,14 @@ class Response(Adaptor):
     """This class is returned by all engines as a way to unify response type between different libraries."""

     def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict,
-                 encoding: str = 'utf-8', method: str = 'GET', **adaptor_arguments: Dict):
+                 encoding: str = 'utf-8', method: str = 'GET', history: List = None, **adaptor_arguments: Dict):
         automatch_domain = adaptor_arguments.pop('automatch_domain', None)
         self.status = status
         self.reason = reason
         self.cookies = cookies
         self.headers = headers
         self.request_headers = request_headers
+        self.history = history or []
         encoding = ResponseEncoding.get_value(encoding, text)
         super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
         # For back-ward compatibility
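Note the `history: List = None` default paired with `self.history = history or []` — the usual Python idiom for avoiding a mutable default argument shared across calls. A tiny illustration of the pitfall it sidesteps:

```python
def bad(item, bucket=[]):        # one list object shared by *every* call
    bucket.append(item)
    return bucket

def good(item, bucket=None):     # fresh list per call, as Response.__init__ does
    bucket = bucket or []
    bucket.append(item)
    return bucket

assert bad(1) == [1] and bad(2) == [1, 2]   # surprise: state leaks between calls
assert good(1) == [1] and good(2) == [2]
```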
{scrapling-0.2.93 → scrapling-0.2.94}/scrapling/parser.py
@@ -132,7 +132,7 @@ class Adaptor(SelectorsGeneration):
         self.__tag = None
         # No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
         self.__response_data = {
-            key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
+            key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'history', 'headers', 'request_headers',)
         } if hasattr(self, 'status') else {}

         # Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
@@ -763,25 +763,25 @@ class Adaptor(SelectorsGeneration):
         return self.get_all_text(strip=True).json()

     def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
-           clean_match: bool = False, case_sensitive: bool =
+           clean_match: bool = False, case_sensitive: bool = True) -> TextHandlers:
         """Apply the given regex to the current text and return a list of strings with the matches.

         :param regex: Can be either a compiled regular expression or a string.
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         """
         return self.text.re(regex, replace_entities, clean_match, case_sensitive)

     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                 clean_match: bool = False, case_sensitive: bool =
+                 clean_match: bool = False, case_sensitive: bool = True) -> TextHandler:
         """Apply the given regex to text and return the first match if found, otherwise return the default value.

         :param regex: Can be either a compiled regular expression or a string.
         :param default: The default value to be returned if there is no match
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         """
         return self.text.re_first(regex, default, replace_entities, clean_match, case_sensitive)

@@ -1009,14 +1009,14 @@ class Adaptors(List[Adaptor]):
         return self.__class__(flatten(results))

     def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
-           clean_match: bool = False, case_sensitive: bool =
+           clean_match: bool = False, case_sensitive: bool = True) -> TextHandlers[TextHandler]:
         """Call the ``.re()`` method for each element in this list and return
         their results flattened as List of TextHandler.

         :param regex: Can be either a compiled regular expression or a string.
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         """
         results = [
             n.text.re(regex, replace_entities, clean_match, case_sensitive) for n in self
@@ -1024,7 +1024,7 @@ class Adaptors(List[Adaptor]):
         return TextHandlers(flatten(results))

     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                 clean_match: bool = False, case_sensitive: bool =
+                 clean_match: bool = False, case_sensitive: bool = True) -> TextHandler:
         """Call the ``.re_first()`` method for each element in this list and return
         the first result or the default value otherwise.

@@ -1032,7 +1032,7 @@ class Adaptors(List[Adaptor]):
         :param default: The default value to be returned if there is no match
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         """
         for n in self:
             for result in n.re(regex, replace_entities, clean_match, case_sensitive):
{scrapling-0.2.93 → scrapling-0.2.94/scrapling.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: scrapling
-Version: 0.2.93
+Version: 0.2.94
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -40,7 +40,7 @@ Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,socks,zstd]
 Requires-Dist: playwright>=1.49.1
 Requires-Dist: rebrowser-playwright>=1.49.1
-Requires-Dist: camoufox[geoip]>=0.4.
+Requires-Dist: camoufox[geoip]>=0.4.11
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier
@@ -267,7 +267,7 @@ then use it right away without initializing like:
 page = StealthyFetcher.fetch('https://example.com')
 ```

-Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, `history`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
 ### Fetcher
{scrapling-0.2.93 → scrapling-0.2.94}/setup.py
@@ -6,7 +6,7 @@ with open("README.md", "r", encoding="utf-8") as fh:

 setup(
     name="scrapling",
-    version="0.2.93",
+    version="0.2.94",
     description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
 impressive speed improvements over many popular scraping tools.""",
@@ -61,7 +61,7 @@ setup(
     'httpx[brotli,zstd, socks]',
     'playwright>=1.49.1',
     'rebrowser-playwright>=1.49.1',
-    'camoufox[geoip]>=0.4.
+    'camoufox[geoip]>=0.4.11'
 ],
 python_requires=">=3.9",
 url="https://github.com/D4Vinci/Scrapling",