scrapling 0.2.9.tar.gz → 0.2.91.tar.gz
- {scrapling-0.2.9/scrapling.egg-info → scrapling-0.2.91}/PKG-INFO +2 -3
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/__init__.py +1 -1
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/core/_types.py +2 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/camo.py +44 -18
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/pw.py +47 -18
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/custom.py +1 -5
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/fetchers.py +7 -7
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/parser.py +1 -1
- {scrapling-0.2.9 → scrapling-0.2.91/scrapling.egg-info}/PKG-INFO +2 -3
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling.egg-info/requires.txt +1 -1
- {scrapling-0.2.9 → scrapling-0.2.91}/setup.cfg +1 -1
- {scrapling-0.2.9 → scrapling-0.2.91}/setup.py +2 -3
- {scrapling-0.2.9 → scrapling-0.2.91}/LICENSE +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/MANIFEST.in +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/README.md +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/core/__init__.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/core/custom_types.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/core/mixins.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/core/storage_adaptors.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/core/translator.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/core/utils.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/defaults.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/constants.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/static.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/__init__.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/fingerprints.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/navigation.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/py.typed +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling.egg-info/SOURCES.txt +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/scrapling.egg-info/top_level.txt +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/tests/__init__.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/__init__.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/async/__init__.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/async/test_camoufox.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/async/test_httpx.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/async/test_playwright.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/sync/__init__.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/sync/test_camoufox.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/sync/test_httpx.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/sync/test_playwright.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/test_utils.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/tests/parser/__init__.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/tests/parser/test_automatch.py +0 -0
- {scrapling-0.2.9 → scrapling-0.2.91}/tests/parser/test_general.py +0 -0
{scrapling-0.2.9/scrapling.egg-info → scrapling-0.2.91}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.9
+Version: 0.2.91
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -21,7 +21,6 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
@@ -38,7 +37,7 @@ Requires-Dist: cssselect>=1.2
 Requires-Dist: w3lib
 Requires-Dist: orjson>=3
 Requires-Dist: tldextract
-Requires-Dist: httpx[brotli,zstd]
+Requires-Dist: httpx[brotli,socks,zstd]
 Requires-Dist: playwright>=1.49.1
 Requires-Dist: rebrowser-playwright>=1.49.1
 Requires-Dist: camoufox[geoip]>=0.4.9
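The dependency change worth noting here is the new `socks` extra on httpx, which pulls in SOCKS-proxy support. A minimal sketch of what that unlocks, with a hypothetical local proxy URL:

```python
# Needs the new extra: pip install "httpx[brotli,socks,zstd]"
import httpx

# Hypothetical SOCKS endpoint (e.g. a local Tor client); socks5:// URLs
# only work once the `socks` extra is installed.
proxy = "socks5://127.0.0.1:9050"

# `proxy=` requires httpx >= 0.26; older releases used `proxies=`.
with httpx.Client(proxy=proxy) as client:
    r = client.get("https://example.com")
    print(r.status_code)
```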
{scrapling-0.2.9 → scrapling-0.2.91}/scrapling/__init__.py

@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
 from scrapling.parser import Adaptor, Adaptors
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.9"
+__version__ = "0.2.91"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
{scrapling-0.2.9 → scrapling-0.2.91}/scrapling/core/_types.py

@@ -5,6 +5,8 @@ Type definitions for type checking purposes.
 from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
                     List, Literal, Optional, Pattern, Tuple, Type, Union)
 
+SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
+
 try:
     from typing import Protocol
 except ImportError:
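`SelectorWaitStates` matches the `state` values Playwright's `wait_for_selector` accepts, so a mistyped state becomes a static type error instead of a runtime surprise. A small self-contained sketch of how such a `Literal` alias behaves under a type checker:

```python
from typing import Literal

SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]

def wait_for(selector: str, state: SelectorWaitStates = "attached") -> None:
    # A real caller would forward this to page.wait_for_selector(selector, state=state)
    print(f"waiting until {selector!r} is {state}")

wait_for("#content", "visible")    # fine
# wait_for("#content", "visable")  # rejected by mypy/pyright: not a valid Literal value
```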
{scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/camo.py

@@ -3,7 +3,7 @@ from camoufox.async_api import AsyncCamoufox
 from camoufox.sync_api import Camoufox
 
 from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
-                                   Union)
+                                   SelectorWaitStates, Union)
 from scrapling.core.utils import log
 from scrapling.engines.toolbelt import (Response, StatusText,
                                         async_intercept_route,
@@ -18,7 +18,7 @@ class CamoufoxEngine:
             self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
-            wait_selector_state:
+            wait_selector_state: Optional[SelectorWaitStates] = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
             geoip: Optional[bool] = False,
             adaptor_arguments: Dict = None,
@@ -84,6 +84,14 @@ class CamoufoxEngine:
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         addons = [] if self.disable_ads else [DefaultAddons.UBO]
+        # Store the final response
+        final_response = None
+
+        def handle_response(finished_response):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         with Camoufox(
             geoip=self.geoip,
             proxy=self.proxy,
@@ -100,13 +108,15 @@ class CamoufoxEngine:
             page = browser.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
             if self.disable_resources:
                 page.route("**/*", intercept_route)
 
             if self.extra_headers:
                 page.set_extra_http_headers(self.extra_headers)
 
-
+            first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
@@ -123,21 +133,24 @@ class CamoufoxEngine:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
+            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding =
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text =
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=
+                url=final_response.url,
                 text=page.content(),
-                body=
-                status=
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=
-                request_headers=
+                headers=final_response.all_headers(),
+                request_headers=final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -151,6 +164,14 @@ class CamoufoxEngine:
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         addons = [] if self.disable_ads else [DefaultAddons.UBO]
+        # Store the final response
+        final_response = None
+
+        async def handle_response(finished_response):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         async with AsyncCamoufox(
             geoip=self.geoip,
             proxy=self.proxy,
@@ -167,13 +188,15 @@ class CamoufoxEngine:
             page = await browser.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
             if self.disable_resources:
                 await page.route("**/*", async_intercept_route)
 
             if self.extra_headers:
                 await page.set_extra_http_headers(self.extra_headers)
 
-
+            first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             await page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
@@ -190,21 +213,24 @@ class CamoufoxEngine:
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
 
+            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding =
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text =
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=
+                url=final_response.url,
                 text=await page.content(),
-                body=
-                status=
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await
-                request_headers=await
+                headers=await final_response.all_headers(),
+                request_headers=await final_response.request.all_headers(),
                 **self.adaptor_arguments
            )
             await page.close()
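Both the sync and async Camoufox paths now share one idea: subscribe to every response with `page.on("response", ...)`, remember the last one whose request has `resource_type == "document"`, and fall back to the `goto()` return value if none was captured. That way redirects still yield the final page's real status, headers, and body. A standalone sketch of the pattern against plain Playwright (the URL is a placeholder):

```python
from playwright.sync_api import sync_playwright

final_response = None

def handle_response(finished_response):
    # Keep only main-document responses; images, scripts, and XHR are skipped.
    global final_response
    if finished_response.request.resource_type == "document":
        final_response = finished_response

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.on("response", handle_response)
    first_response = page.goto("https://example.com")  # placeholder URL
    page.wait_for_load_state("domcontentloaded")

    # Same fallback the diff uses: if no document response was captured,
    # use the navigation response returned by goto().
    final_response = final_response or first_response
    print(final_response.status, final_response.url)
    print(len(final_response.body()), "bytes of raw body")
    browser.close()
```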
{scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/pw.py

@@ -1,6 +1,7 @@
 import json
 
-from scrapling.core._types import Callable, Dict, Optional,
+from scrapling.core._types import (Callable, Dict, Optional,
+                                   SelectorWaitStates, Union)
 from scrapling.core.utils import log, lru_cache
 from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
                                          NSTBROWSER_DEFAULT_QUERY)
@@ -23,7 +24,7 @@ class PlaywrightEngine:
             page_action: Callable = None,
             wait_selector: Optional[str] = None,
             locale: Optional[str] = 'en-US',
-            wait_selector_state:
+            wait_selector_state: SelectorWaitStates = 'attached',
             stealth: Optional[bool] = False,
             real_chrome: Optional[bool] = False,
             hide_canvas: Optional[bool] = False,
@@ -193,12 +194,21 @@ class PlaywrightEngine:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        from playwright.sync_api import Response as PlaywrightResponse
         if not self.stealth or self.real_chrome:
             # Because rebrowser_playwright doesn't play well with real browsers
             from playwright.sync_api import sync_playwright
         else:
             from rebrowser_playwright.sync_api import sync_playwright
 
+        # Store the final response
+        final_response = None
+
+        def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         with sync_playwright() as p:
             # Creating the browser
             if self.cdp_url:
@@ -212,6 +222,8 @@ class PlaywrightEngine:
             page = context.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
 
             if self.extra_headers:
                 page.set_extra_http_headers(self.extra_headers)
@@ -223,7 +235,7 @@ class PlaywrightEngine:
                 for script in self.__stealth_scripts():
                     page.add_init_script(path=script)
 
-
+            first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
@@ -240,21 +252,24 @@ class PlaywrightEngine:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
+            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding =
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text =
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=
+                url=final_response.url,
                 text=page.content(),
-                body=
-                status=
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=
-                request_headers=
+                headers=final_response.all_headers(),
+                request_headers=final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -266,12 +281,21 @@ class PlaywrightEngine:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        from playwright.async_api import Response as PlaywrightResponse
         if not self.stealth or self.real_chrome:
             # Because rebrowser_playwright doesn't play well with real browsers
             from playwright.async_api import async_playwright
         else:
             from rebrowser_playwright.async_api import async_playwright
 
+        # Store the final response
+        final_response = None
+
+        async def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         async with async_playwright() as p:
             # Creating the browser
             if self.cdp_url:
@@ -285,6 +309,8 @@ class PlaywrightEngine:
             page = await context.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
 
             if self.extra_headers:
                 await page.set_extra_http_headers(self.extra_headers)
@@ -296,7 +322,7 @@ class PlaywrightEngine:
                 for script in self.__stealth_scripts():
                     await page.add_init_script(path=script)
 
-
+            first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             await page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
@@ -313,21 +339,24 @@ class PlaywrightEngine:
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
 
+            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding =
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text =
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=
+                url=final_response.url,
                 text=await page.content(),
-                body=
-                status=
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await
-                request_headers=await
+                headers=await final_response.all_headers(),
+                request_headers=await final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             await page.close()
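As in the Camoufox engine, `encoding` is set to the raw `Content-Type` header value (or `'utf-8'`) and, per the in-code comment, parsed later inside `Response`. A stand-in sketch of that parsing step using only the standard library; this illustrates the idea and is not Scrapling's actual `ResponseEncoding` code:

```python
from email.message import Message

def charset_from_content_type(content_type: str, default: str = "utf-8") -> str:
    """Extract the charset parameter from a raw Content-Type header value."""
    msg = Message()
    msg["Content-Type"] = content_type
    return msg.get_content_charset() or default

print(charset_from_content_type("text/html; charset=ISO-8859-1"))  # iso-8859-1
print(charset_from_content_type("text/html"))                      # utf-8 (no charset param)
print(charset_from_content_type("utf-8"))                          # utf-8 (fallback value passed through)
```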
{scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/custom.py

@@ -84,8 +84,6 @@ class ResponseEncoding:
 class Response(Adaptor):
     """This class is returned by all engines as a way to unify response type between different libraries."""
 
-    _is_response_result_logged = False  # Class-level flag, initialized to False
-
     def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict,
                  encoding: str = 'utf-8', method: str = 'GET', **adaptor_arguments: Dict):
         automatch_domain = adaptor_arguments.pop('automatch_domain', None)
@@ -99,9 +97,7 @@ class Response(Adaptor):
         # For back-ward compatibility
         self.adaptor = self
         # For easier debugging while working from a Python shell
-
-        log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
-        Response._is_response_result_logged = True
+        log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
 
     # def __repr__(self):
     #     return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
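Dropping `_is_response_result_logged` means every constructed `Response` now emits its `Fetched (...)` line, not just the first one. If that proves noisy, standard `logging` filtering applies; a sketch, assuming the library registers its logger under the `scrapling` name:

```python
import logging

# Suppress the per-request INFO line while keeping warnings and errors.
# The "scrapling" logger name is an assumption about the library's setup.
logging.getLogger("scrapling").setLevel(logging.WARNING)
```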
{scrapling-0.2.9 → scrapling-0.2.91}/scrapling/fetchers.py

@@ -1,5 +1,5 @@
 from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
-                                   Union)
+                                   SelectorWaitStates, Union)
 from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
                                check_if_engine_usable)
 from scrapling.engines.toolbelt import BaseFetcher, Response
@@ -176,8 +176,8 @@ class StealthyFetcher(BaseFetcher):
             self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
-            wait_selector_state:
-            os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
+            wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
@@ -234,8 +234,8 @@ class StealthyFetcher(BaseFetcher):
             self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
-            wait_selector_state:
-            os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
+            wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
@@ -308,7 +308,7 @@ class PlayWrightFetcher(BaseFetcher):
     def fetch(
             self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
             useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state:
+            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
             hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
@@ -368,7 +368,7 @@ class PlayWrightFetcher(BaseFetcher):
     async def async_fetch(
             self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
             useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state:
+            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
             hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
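A usage sketch exercising the retyped parameters, built from the `StealthyFetcher.fetch` signature shown in these hunks (the URL and selector are placeholders, and the default-constructed fetcher glosses over any constructor options):

```python
from scrapling.fetchers import StealthyFetcher

fetcher = StealthyFetcher()
page = fetcher.fetch(
    "https://example.com",            # placeholder URL
    wait_selector="#content",         # placeholder selector
    wait_selector_state="visible",    # must be one of the SelectorWaitStates values
    proxy="socks5://127.0.0.1:9050",  # hypothetical proxy; str or dict per the signature
)
print(page.status, page.url)
```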
{scrapling-0.2.9 → scrapling-0.2.91}/scrapling/parser.py

@@ -155,7 +155,7 @@ class Adaptor(SelectorsGeneration):
         else:
             if issubclass(type(element), html.HtmlMixin):
 
-                return
+                return Adaptor(
                     root=element,
                     text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
                     url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
{scrapling-0.2.9 → scrapling-0.2.91/scrapling.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.9
+Version: 0.2.91
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -21,7 +21,6 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
@@ -38,7 +37,7 @@ Requires-Dist: cssselect>=1.2
 Requires-Dist: w3lib
 Requires-Dist: orjson>=3
 Requires-Dist: tldextract
-Requires-Dist: httpx[brotli,zstd]
+Requires-Dist: httpx[brotli,socks,zstd]
 Requires-Dist: playwright>=1.49.1
 Requires-Dist: rebrowser-playwright>=1.49.1
 Requires-Dist: camoufox[geoip]>=0.4.9
{scrapling-0.2.9 → scrapling-0.2.91}/setup.py

@@ -6,7 +6,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setup(
     name="scrapling",
-    version="0.2.9",
+    version="0.2.91",
     description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
 impressive speed improvements over many popular scraping tools.""",
@@ -37,7 +37,6 @@ setup(
         "Topic :: Software Development :: Libraries :: Python Modules",
         "Programming Language :: Python :: 3",
         "Programming Language :: Python :: 3 :: Only",
-        "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
@@ -54,7 +53,7 @@ setup(
         "w3lib",
         "orjson>=3",
         "tldextract",
-        'httpx[brotli,zstd]',
+        'httpx[brotli,zstd, socks]',
         'playwright>=1.49.1',
         'rebrowser-playwright>=1.49.1',
         'camoufox[geoip]>=0.4.9'