scrapling 0.2.9__py3-none-any.whl → 0.2.91__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- scrapling/__init__.py +1 -1
- scrapling/core/_types.py +2 -0
- scrapling/engines/camo.py +44 -18
- scrapling/engines/pw.py +47 -18
- scrapling/engines/toolbelt/custom.py +1 -5
- scrapling/fetchers.py +7 -7
- scrapling/parser.py +1 -1
- {scrapling-0.2.9.dist-info → scrapling-0.2.91.dist-info}/METADATA +2 -3
- {scrapling-0.2.9.dist-info → scrapling-0.2.91.dist-info}/RECORD +12 -12
- {scrapling-0.2.9.dist-info → scrapling-0.2.91.dist-info}/LICENSE +0 -0
- {scrapling-0.2.9.dist-info → scrapling-0.2.91.dist-info}/WHEEL +0 -0
- {scrapling-0.2.9.dist-info → scrapling-0.2.91.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
|
|
5
5
|
from scrapling.parser import Adaptor, Adaptors
|
6
6
|
|
7
7
|
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
8
|
-
__version__ = "0.2.9"
|
8
|
+
__version__ = "0.2.91"
|
9
9
|
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
10
10
|
|
11
11
|
|
scrapling/core/_types.py
CHANGED
@@ -5,6 +5,8 @@ Type definitions for type checking purposes.
|
|
5
5
|
from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
|
6
6
|
List, Literal, Optional, Pattern, Tuple, Type, Union)
|
7
7
|
|
8
|
+
SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
|
9
|
+
|
8
10
|
try:
|
9
11
|
from typing import Protocol
|
10
12
|
except ImportError:
|
scrapling/engines/camo.py
CHANGED
@@ -3,7 +3,7 @@ from camoufox.async_api import AsyncCamoufox
|
|
3
3
|
from camoufox.sync_api import Camoufox
|
4
4
|
|
5
5
|
from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
|
6
|
-
Union)
|
6
|
+
SelectorWaitStates, Union)
|
7
7
|
from scrapling.core.utils import log
|
8
8
|
from scrapling.engines.toolbelt import (Response, StatusText,
|
9
9
|
async_intercept_route,
|
@@ -18,7 +18,7 @@ class CamoufoxEngine:
|
|
18
18
|
self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
|
19
19
|
block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
|
20
20
|
timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
|
21
|
-
wait_selector_state:
|
21
|
+
wait_selector_state: Optional[SelectorWaitStates] = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
|
22
22
|
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
|
23
23
|
geoip: Optional[bool] = False,
|
24
24
|
adaptor_arguments: Dict = None,
|
@@ -84,6 +84,14 @@ class CamoufoxEngine:
|
|
84
84
|
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
85
85
|
"""
|
86
86
|
addons = [] if self.disable_ads else [DefaultAddons.UBO]
|
87
|
+
# Store the final response
|
88
|
+
final_response = None
|
89
|
+
|
90
|
+
def handle_response(finished_response):
|
91
|
+
nonlocal final_response
|
92
|
+
if finished_response.request.resource_type == "document":
|
93
|
+
final_response = finished_response
|
94
|
+
|
87
95
|
with Camoufox(
|
88
96
|
geoip=self.geoip,
|
89
97
|
proxy=self.proxy,
|
@@ -100,13 +108,15 @@ class CamoufoxEngine:
|
|
100
108
|
page = browser.new_page()
|
101
109
|
page.set_default_navigation_timeout(self.timeout)
|
102
110
|
page.set_default_timeout(self.timeout)
|
111
|
+
# Listen for all responses
|
112
|
+
page.on("response", handle_response)
|
103
113
|
if self.disable_resources:
|
104
114
|
page.route("**/*", intercept_route)
|
105
115
|
|
106
116
|
if self.extra_headers:
|
107
117
|
page.set_extra_http_headers(self.extra_headers)
|
108
118
|
|
109
|
-
|
119
|
+
first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
|
110
120
|
page.wait_for_load_state(state="domcontentloaded")
|
111
121
|
if self.network_idle:
|
112
122
|
page.wait_for_load_state('networkidle')
|
@@ -123,21 +133,24 @@ class CamoufoxEngine:
|
|
123
133
|
if self.network_idle:
|
124
134
|
page.wait_for_load_state('networkidle')
|
125
135
|
|
136
|
+
response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
|
137
|
+
# In case we didn't catch a document type somehow
|
138
|
+
final_response = final_response if final_response else first_response
|
126
139
|
# This will be parsed inside `Response`
|
127
|
-
encoding =
|
140
|
+
encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
|
128
141
|
# PlayWright API sometimes give empty status text for some reason!
|
129
|
-
status_text =
|
142
|
+
status_text = final_response.status_text or StatusText.get(final_response.status)
|
130
143
|
|
131
144
|
response = Response(
|
132
|
-
url=
|
145
|
+
url=final_response.url,
|
133
146
|
text=page.content(),
|
134
|
-
body=
|
135
|
-
status=
|
147
|
+
body=response_bytes,
|
148
|
+
status=final_response.status,
|
136
149
|
reason=status_text,
|
137
150
|
encoding=encoding,
|
138
151
|
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
|
139
|
-
headers=
|
140
|
-
request_headers=
|
152
|
+
headers=final_response.all_headers(),
|
153
|
+
request_headers=final_response.request.all_headers(),
|
141
154
|
**self.adaptor_arguments
|
142
155
|
)
|
143
156
|
page.close()
|
@@ -151,6 +164,14 @@ class CamoufoxEngine:
|
|
151
164
|
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
152
165
|
"""
|
153
166
|
addons = [] if self.disable_ads else [DefaultAddons.UBO]
|
167
|
+
# Store the final response
|
168
|
+
final_response = None
|
169
|
+
|
170
|
+
async def handle_response(finished_response):
|
171
|
+
nonlocal final_response
|
172
|
+
if finished_response.request.resource_type == "document":
|
173
|
+
final_response = finished_response
|
174
|
+
|
154
175
|
async with AsyncCamoufox(
|
155
176
|
geoip=self.geoip,
|
156
177
|
proxy=self.proxy,
|
@@ -167,13 +188,15 @@ class CamoufoxEngine:
|
|
167
188
|
page = await browser.new_page()
|
168
189
|
page.set_default_navigation_timeout(self.timeout)
|
169
190
|
page.set_default_timeout(self.timeout)
|
191
|
+
# Listen for all responses
|
192
|
+
page.on("response", handle_response)
|
170
193
|
if self.disable_resources:
|
171
194
|
await page.route("**/*", async_intercept_route)
|
172
195
|
|
173
196
|
if self.extra_headers:
|
174
197
|
await page.set_extra_http_headers(self.extra_headers)
|
175
198
|
|
176
|
-
|
199
|
+
first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
|
177
200
|
await page.wait_for_load_state(state="domcontentloaded")
|
178
201
|
if self.network_idle:
|
179
202
|
await page.wait_for_load_state('networkidle')
|
@@ -190,21 +213,24 @@ class CamoufoxEngine:
|
|
190
213
|
if self.network_idle:
|
191
214
|
await page.wait_for_load_state('networkidle')
|
192
215
|
|
216
|
+
response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
|
217
|
+
# In case we didn't catch a document type somehow
|
218
|
+
final_response = final_response if final_response else first_response
|
193
219
|
# This will be parsed inside `Response`
|
194
|
-
encoding =
|
220
|
+
encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
|
195
221
|
# PlayWright API sometimes give empty status text for some reason!
|
196
|
-
status_text =
|
222
|
+
status_text = final_response.status_text or StatusText.get(final_response.status)
|
197
223
|
|
198
224
|
response = Response(
|
199
|
-
url=
|
225
|
+
url=final_response.url,
|
200
226
|
text=await page.content(),
|
201
|
-
body=
|
202
|
-
status=
|
227
|
+
body=response_bytes,
|
228
|
+
status=final_response.status,
|
203
229
|
reason=status_text,
|
204
230
|
encoding=encoding,
|
205
231
|
cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
|
206
|
-
headers=await
|
207
|
-
request_headers=await
|
232
|
+
headers=await final_response.all_headers(),
|
233
|
+
request_headers=await final_response.request.all_headers(),
|
208
234
|
**self.adaptor_arguments
|
209
235
|
)
|
210
236
|
await page.close()
|
scrapling/engines/pw.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import json
|
2
2
|
|
3
|
-
from scrapling.core._types import Callable, Dict, Optional,
|
3
|
+
from scrapling.core._types import (Callable, Dict, Optional,
|
4
|
+
SelectorWaitStates, Union)
|
4
5
|
from scrapling.core.utils import log, lru_cache
|
5
6
|
from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
|
6
7
|
NSTBROWSER_DEFAULT_QUERY)
|
@@ -23,7 +24,7 @@ class PlaywrightEngine:
|
|
23
24
|
page_action: Callable = None,
|
24
25
|
wait_selector: Optional[str] = None,
|
25
26
|
locale: Optional[str] = 'en-US',
|
26
|
-
wait_selector_state:
|
27
|
+
wait_selector_state: SelectorWaitStates = 'attached',
|
27
28
|
stealth: Optional[bool] = False,
|
28
29
|
real_chrome: Optional[bool] = False,
|
29
30
|
hide_canvas: Optional[bool] = False,
|
@@ -193,12 +194,21 @@ class PlaywrightEngine:
|
|
193
194
|
:param url: Target url.
|
194
195
|
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
195
196
|
"""
|
197
|
+
from playwright.sync_api import Response as PlaywrightResponse
|
196
198
|
if not self.stealth or self.real_chrome:
|
197
199
|
# Because rebrowser_playwright doesn't play well with real browsers
|
198
200
|
from playwright.sync_api import sync_playwright
|
199
201
|
else:
|
200
202
|
from rebrowser_playwright.sync_api import sync_playwright
|
201
203
|
|
204
|
+
# Store the final response
|
205
|
+
final_response = None
|
206
|
+
|
207
|
+
def handle_response(finished_response: PlaywrightResponse):
|
208
|
+
nonlocal final_response
|
209
|
+
if finished_response.request.resource_type == "document":
|
210
|
+
final_response = finished_response
|
211
|
+
|
202
212
|
with sync_playwright() as p:
|
203
213
|
# Creating the browser
|
204
214
|
if self.cdp_url:
|
@@ -212,6 +222,8 @@ class PlaywrightEngine:
|
|
212
222
|
page = context.new_page()
|
213
223
|
page.set_default_navigation_timeout(self.timeout)
|
214
224
|
page.set_default_timeout(self.timeout)
|
225
|
+
# Listen for all responses
|
226
|
+
page.on("response", handle_response)
|
215
227
|
|
216
228
|
if self.extra_headers:
|
217
229
|
page.set_extra_http_headers(self.extra_headers)
|
@@ -223,7 +235,7 @@ class PlaywrightEngine:
|
|
223
235
|
for script in self.__stealth_scripts():
|
224
236
|
page.add_init_script(path=script)
|
225
237
|
|
226
|
-
|
238
|
+
first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
|
227
239
|
page.wait_for_load_state(state="domcontentloaded")
|
228
240
|
if self.network_idle:
|
229
241
|
page.wait_for_load_state('networkidle')
|
@@ -240,21 +252,24 @@ class PlaywrightEngine:
|
|
240
252
|
if self.network_idle:
|
241
253
|
page.wait_for_load_state('networkidle')
|
242
254
|
|
255
|
+
response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
|
256
|
+
# In case we didn't catch a document type somehow
|
257
|
+
final_response = final_response if final_response else first_response
|
243
258
|
# This will be parsed inside `Response`
|
244
|
-
encoding =
|
259
|
+
encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
|
245
260
|
# PlayWright API sometimes give empty status text for some reason!
|
246
|
-
status_text =
|
261
|
+
status_text = final_response.status_text or StatusText.get(final_response.status)
|
247
262
|
|
248
263
|
response = Response(
|
249
|
-
url=
|
264
|
+
url=final_response.url,
|
250
265
|
text=page.content(),
|
251
|
-
body=
|
252
|
-
status=
|
266
|
+
body=response_bytes,
|
267
|
+
status=final_response.status,
|
253
268
|
reason=status_text,
|
254
269
|
encoding=encoding,
|
255
270
|
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
|
256
|
-
headers=
|
257
|
-
request_headers=
|
271
|
+
headers=final_response.all_headers(),
|
272
|
+
request_headers=final_response.request.all_headers(),
|
258
273
|
**self.adaptor_arguments
|
259
274
|
)
|
260
275
|
page.close()
|
@@ -266,12 +281,21 @@ class PlaywrightEngine:
|
|
266
281
|
:param url: Target url.
|
267
282
|
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
268
283
|
"""
|
284
|
+
from playwright.async_api import Response as PlaywrightResponse
|
269
285
|
if not self.stealth or self.real_chrome:
|
270
286
|
# Because rebrowser_playwright doesn't play well with real browsers
|
271
287
|
from playwright.async_api import async_playwright
|
272
288
|
else:
|
273
289
|
from rebrowser_playwright.async_api import async_playwright
|
274
290
|
|
291
|
+
# Store the final response
|
292
|
+
final_response = None
|
293
|
+
|
294
|
+
async def handle_response(finished_response: PlaywrightResponse):
|
295
|
+
nonlocal final_response
|
296
|
+
if finished_response.request.resource_type == "document":
|
297
|
+
final_response = finished_response
|
298
|
+
|
275
299
|
async with async_playwright() as p:
|
276
300
|
# Creating the browser
|
277
301
|
if self.cdp_url:
|
@@ -285,6 +309,8 @@ class PlaywrightEngine:
|
|
285
309
|
page = await context.new_page()
|
286
310
|
page.set_default_navigation_timeout(self.timeout)
|
287
311
|
page.set_default_timeout(self.timeout)
|
312
|
+
# Listen for all responses
|
313
|
+
page.on("response", handle_response)
|
288
314
|
|
289
315
|
if self.extra_headers:
|
290
316
|
await page.set_extra_http_headers(self.extra_headers)
|
@@ -296,7 +322,7 @@ class PlaywrightEngine:
|
|
296
322
|
for script in self.__stealth_scripts():
|
297
323
|
await page.add_init_script(path=script)
|
298
324
|
|
299
|
-
|
325
|
+
first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
|
300
326
|
await page.wait_for_load_state(state="domcontentloaded")
|
301
327
|
if self.network_idle:
|
302
328
|
await page.wait_for_load_state('networkidle')
|
@@ -313,21 +339,24 @@ class PlaywrightEngine:
|
|
313
339
|
if self.network_idle:
|
314
340
|
await page.wait_for_load_state('networkidle')
|
315
341
|
|
342
|
+
response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
|
343
|
+
# In case we didn't catch a document type somehow
|
344
|
+
final_response = final_response if final_response else first_response
|
316
345
|
# This will be parsed inside `Response`
|
317
|
-
encoding =
|
346
|
+
encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
|
318
347
|
# PlayWright API sometimes give empty status text for some reason!
|
319
|
-
status_text =
|
348
|
+
status_text = final_response.status_text or StatusText.get(final_response.status)
|
320
349
|
|
321
350
|
response = Response(
|
322
|
-
url=
|
351
|
+
url=final_response.url,
|
323
352
|
text=await page.content(),
|
324
|
-
body=
|
325
|
-
status=
|
353
|
+
body=response_bytes,
|
354
|
+
status=final_response.status,
|
326
355
|
reason=status_text,
|
327
356
|
encoding=encoding,
|
328
357
|
cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
|
329
|
-
headers=await
|
330
|
-
request_headers=await
|
358
|
+
headers=await final_response.all_headers(),
|
359
|
+
request_headers=await final_response.request.all_headers(),
|
331
360
|
**self.adaptor_arguments
|
332
361
|
)
|
333
362
|
await page.close()
|
scrapling/engines/toolbelt/custom.py
CHANGED
@@ -84,8 +84,6 @@ class ResponseEncoding:
|
|
84
84
|
class Response(Adaptor):
|
85
85
|
"""This class is returned by all engines as a way to unify response type between different libraries."""
|
86
86
|
|
87
|
-
_is_response_result_logged = False # Class-level flag, initialized to False
|
88
|
-
|
89
87
|
def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict,
|
90
88
|
encoding: str = 'utf-8', method: str = 'GET', **adaptor_arguments: Dict):
|
91
89
|
automatch_domain = adaptor_arguments.pop('automatch_domain', None)
|
@@ -99,9 +97,7 @@ class Response(Adaptor):
|
|
99
97
|
# For back-ward compatibility
|
100
98
|
self.adaptor = self
|
101
99
|
# For easier debugging while working from a Python shell
|
102
|
-
|
103
|
-
log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
|
104
|
-
Response._is_response_result_logged = True
|
100
|
+
log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
|
105
101
|
|
106
102
|
# def __repr__(self):
|
107
103
|
# return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
|
scrapling/fetchers.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
|
2
|
-
Union)
|
2
|
+
SelectorWaitStates, Union)
|
3
3
|
from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
|
4
4
|
check_if_engine_usable)
|
5
5
|
from scrapling.engines.toolbelt import BaseFetcher, Response
|
@@ -176,8 +176,8 @@ class StealthyFetcher(BaseFetcher):
|
|
176
176
|
self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
|
177
177
|
block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
|
178
178
|
timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
|
179
|
-
wait_selector_state:
|
180
|
-
os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
|
179
|
+
wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
|
180
|
+
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
|
181
181
|
) -> Response:
|
182
182
|
"""
|
183
183
|
Opens up a browser and do your request based on your chosen options below.
|
@@ -234,8 +234,8 @@ class StealthyFetcher(BaseFetcher):
|
|
234
234
|
self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
|
235
235
|
block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
|
236
236
|
timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
|
237
|
-
wait_selector_state:
|
238
|
-
os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
|
237
|
+
wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
|
238
|
+
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
|
239
239
|
) -> Response:
|
240
240
|
"""
|
241
241
|
Opens up a browser and do your request based on your chosen options below.
|
@@ -308,7 +308,7 @@ class PlayWrightFetcher(BaseFetcher):
|
|
308
308
|
def fetch(
|
309
309
|
self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
|
310
310
|
useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
|
311
|
-
page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state:
|
311
|
+
page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
|
312
312
|
hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
|
313
313
|
proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
|
314
314
|
stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
|
@@ -368,7 +368,7 @@ class PlayWrightFetcher(BaseFetcher):
|
|
368
368
|
async def async_fetch(
|
369
369
|
self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
|
370
370
|
useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
|
371
|
-
page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state:
|
371
|
+
page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
|
372
372
|
hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
|
373
373
|
proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
|
374
374
|
stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
|
scrapling/parser.py
CHANGED
@@ -155,7 +155,7 @@ class Adaptor(SelectorsGeneration):
|
|
155
155
|
else:
|
156
156
|
if issubclass(type(element), html.HtmlMixin):
|
157
157
|
|
158
|
-
return
|
158
|
+
return Adaptor(
|
159
159
|
root=element,
|
160
160
|
text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
|
161
161
|
url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
|
{scrapling-0.2.9.dist-info → scrapling-0.2.91.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: scrapling
|
3
|
-
Version: 0.2.9
|
3
|
+
Version: 0.2.91
|
4
4
|
Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
|
5
5
|
Home-page: https://github.com/D4Vinci/Scrapling
|
6
6
|
Author: Karim Shoair
|
@@ -21,7 +21,6 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
21
21
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
22
22
|
Classifier: Programming Language :: Python :: 3
|
23
23
|
Classifier: Programming Language :: Python :: 3 :: Only
|
24
|
-
Classifier: Programming Language :: Python :: 3.8
|
25
24
|
Classifier: Programming Language :: Python :: 3.9
|
26
25
|
Classifier: Programming Language :: Python :: 3.10
|
27
26
|
Classifier: Programming Language :: Python :: 3.11
|
@@ -38,7 +37,7 @@ Requires-Dist: cssselect>=1.2
|
|
38
37
|
Requires-Dist: w3lib
|
39
38
|
Requires-Dist: orjson>=3
|
40
39
|
Requires-Dist: tldextract
|
41
|
-
Requires-Dist: httpx[brotli,zstd]
|
40
|
+
Requires-Dist: httpx[brotli,socks,zstd]
|
42
41
|
Requires-Dist: playwright>=1.49.1
|
43
42
|
Requires-Dist: rebrowser-playwright>=1.49.1
|
44
43
|
Requires-Dist: camoufox[geoip]>=0.4.9
|
{scrapling-0.2.9.dist-info → scrapling-0.2.91.dist-info}/RECORD
CHANGED
@@ -1,22 +1,22 @@
|
|
1
|
-
scrapling/__init__.py,sha256=
|
1
|
+
scrapling/__init__.py,sha256=pfbhEm1kcriA9pFR3JUUFEE3v4_ykB35SYbeHKzFxHw,500
|
2
2
|
scrapling/defaults.py,sha256=tJAOMB-PMd3aLZz3j_yr6haBxxaklAvWdS_hP-GFFdU,331
|
3
|
-
scrapling/fetchers.py,sha256=
|
4
|
-
scrapling/parser.py,sha256=
|
3
|
+
scrapling/fetchers.py,sha256=K3MKBqKDOXItJNwxFY2fe1C21Vz6QSd91fFtN98Mpg4,35402
|
4
|
+
scrapling/parser.py,sha256=Fl9cdbR58GuoPbWN5hZI6ToPSl0_rQFXMskTdzpoxWs,55208
|
5
5
|
scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
|
6
6
|
scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
scrapling/core/_types.py,sha256=
|
7
|
+
scrapling/core/_types.py,sha256=OcsP1WeQEOlEVo9OzTrLQfgZZfXuJ0civVs31SynwGA,641
|
8
8
|
scrapling/core/custom_types.py,sha256=ZRzpoT6qQ4vU_ejhLXa7WYuYLGl5HwAjLPe01xdhuvM,10808
|
9
9
|
scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
|
10
10
|
scrapling/core/storage_adaptors.py,sha256=l_ZYcdn1y69AcoPuRrPoaxqKysN62pMExrwJWYdu5MA,6220
|
11
11
|
scrapling/core/translator.py,sha256=ojDmNi5pFZE6Ke-AiSsTilXiPRdR8yhX3o-uVGMkap8,5236
|
12
12
|
scrapling/core/utils.py,sha256=03LzCDzmeK1TXPjIKVzHSUgSfhpe36XE8AwxlgxzJoU,3705
|
13
13
|
scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
|
14
|
-
scrapling/engines/camo.py,sha256=
|
14
|
+
scrapling/engines/camo.py,sha256=g12IVIPy4Uyp_jngtu8Qcvy7PSMHjURAHUGXdM58Kks,13778
|
15
15
|
scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
|
16
|
-
scrapling/engines/pw.py,sha256=
|
16
|
+
scrapling/engines/pw.py,sha256=Eq4_oQA5eX666chiNpXsBqhWONzleniyXjKdmCpXj_Y,18630
|
17
17
|
scrapling/engines/static.py,sha256=7SVEfeigCPfwC1ukx0zIFFe96Bo5fox6qOq2IWrP6P8,10319
|
18
18
|
scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
|
19
|
-
scrapling/engines/toolbelt/custom.py,sha256=
|
19
|
+
scrapling/engines/toolbelt/custom.py,sha256=d3qyeCg_qHm1RRE7yv5hyU9b17Y7YDPGBOVhEH1CAT0,12754
|
20
20
|
scrapling/engines/toolbelt/fingerprints.py,sha256=ajEHdXHr7W4hw9KcNS7XlyxNBZu37p1bRj18TiICLzU,2929
|
21
21
|
scrapling/engines/toolbelt/navigation.py,sha256=xEfZRJefuxOCGxQOSI2llS0du0Y2XmoIPdVGUSHOd7k,4567
|
22
22
|
scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
|
@@ -40,8 +40,8 @@ tests/fetchers/sync/test_playwright.py,sha256=5eZdPwk3JGeaO7GuExv_QsByLyWDE9joxn
|
|
40
40
|
tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
41
41
|
tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
|
42
42
|
tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
|
43
|
-
scrapling-0.2.
|
44
|
-
scrapling-0.2.
|
45
|
-
scrapling-0.2.
|
46
|
-
scrapling-0.2.
|
47
|
-
scrapling-0.2.
|
43
|
+
scrapling-0.2.91.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
|
44
|
+
scrapling-0.2.91.dist-info/METADATA,sha256=ajc8n5Hjl--ZdGXwHxmfMEWyCMgbw1waZNovoPFxrUc,68339
|
45
|
+
scrapling-0.2.91.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
46
|
+
scrapling-0.2.91.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
|
47
|
+
scrapling-0.2.91.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|