scrapling 0.2.9__py3-none-any.whl → 0.2.91__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/core/_types.py +2 -0
- scrapling/engines/camo.py +44 -18
- scrapling/engines/pw.py +47 -18
- scrapling/engines/toolbelt/custom.py +1 -5
- scrapling/fetchers.py +7 -7
- scrapling/parser.py +1 -1
- {scrapling-0.2.9.dist-info → scrapling-0.2.91.dist-info}/METADATA +2 -3
- {scrapling-0.2.9.dist-info → scrapling-0.2.91.dist-info}/RECORD +12 -12
- {scrapling-0.2.9.dist-info → scrapling-0.2.91.dist-info}/LICENSE +0 -0
- {scrapling-0.2.9.dist-info → scrapling-0.2.91.dist-info}/WHEEL +0 -0
- {scrapling-0.2.9.dist-info → scrapling-0.2.91.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
 from scrapling.parser import Adaptor, Adaptors
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.9"
+__version__ = "0.2.91"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
scrapling/core/_types.py
CHANGED
@@ -5,6 +5,8 @@ Type definitions for type checking purposes.
 from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
                     List, Literal, Optional, Pattern, Tuple, Type, Union)
 
+SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
+
 try:
     from typing import Protocol
 except ImportError:
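
The new alias enumerates the four element states Playwright's `wait_for_selector` accepts, so every `wait_selector_state` parameter below is checked statically instead of accepting any string. A minimal sketch of what the `Literal` buys; `wait_until_ready` is an illustrative helper, not part of Scrapling:

```python
# Sketch: a Literal alias turns a misspelled wait state into a static
# type error. `wait_until_ready` is illustrative, not Scrapling API.
from typing import Literal

SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]


def wait_until_ready(selector: str, state: SelectorWaitStates = "attached") -> None:
    # A real implementation would delegate to page.wait_for_selector(selector, state=state)
    print(f"waiting for {selector!r} to become {state}")


wait_until_ready("#content", "visible")  # OK
wait_until_ready("#content", "visble")   # mypy/pyright reject this; it still runs
```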
scrapling/engines/camo.py
CHANGED
@@ -3,7 +3,7 @@ from camoufox.async_api import AsyncCamoufox
 from camoufox.sync_api import Camoufox
 
 from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
-                                   Union)
+                                   SelectorWaitStates, Union)
 from scrapling.core.utils import log
 from scrapling.engines.toolbelt import (Response, StatusText,
                                         async_intercept_route,
@@ -18,7 +18,7 @@ class CamoufoxEngine:
             self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
-            wait_selector_state:
+            wait_selector_state: Optional[SelectorWaitStates] = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
             geoip: Optional[bool] = False,
             adaptor_arguments: Dict = None,
@@ -84,6 +84,14 @@ class CamoufoxEngine:
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         addons = [] if self.disable_ads else [DefaultAddons.UBO]
+        # Store the final response
+        final_response = None
+
+        def handle_response(finished_response):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         with Camoufox(
                 geoip=self.geoip,
                 proxy=self.proxy,
@@ -100,13 +108,15 @@ class CamoufoxEngine:
             page = browser.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
             if self.disable_resources:
                 page.route("**/*", intercept_route)
 
             if self.extra_headers:
                 page.set_extra_http_headers(self.extra_headers)
 
-
+            first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
@@ -123,21 +133,24 @@ class CamoufoxEngine:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
+            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding =
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text =
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=
+                url=final_response.url,
                 text=page.content(),
-                body=
-                status=
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=
-                request_headers=
+                headers=final_response.all_headers(),
+                request_headers=final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -151,6 +164,14 @@ class CamoufoxEngine:
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         addons = [] if self.disable_ads else [DefaultAddons.UBO]
+        # Store the final response
+        final_response = None
+
+        async def handle_response(finished_response):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         async with AsyncCamoufox(
                 geoip=self.geoip,
                 proxy=self.proxy,
@@ -167,13 +188,15 @@ class CamoufoxEngine:
             page = await browser.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
             if self.disable_resources:
                 await page.route("**/*", async_intercept_route)
 
             if self.extra_headers:
                 await page.set_extra_http_headers(self.extra_headers)
 
-
+            first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             await page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
@@ -190,21 +213,24 @@ class CamoufoxEngine:
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
 
+            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding =
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text =
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=
+                url=final_response.url,
                 text=await page.content(),
-                body=
-                status=
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await
-                request_headers=await
+                headers=await final_response.all_headers(),
+                request_headers=await final_response.request.all_headers(),
                 **self.adaptor_arguments
            )
            await page.close()
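
The listener added in both methods exists because `page.goto()` resolves with the response of the initial navigation; if the page then replaces itself (meta refresh, a JavaScript redirect), only a `"response"` listener sees the later document response. A standalone sketch of the same capture pattern with plain sync Playwright; the target URL is a placeholder:

```python
# Standalone sketch of the final-response capture pattern above,
# using plain (sync) Playwright. The URL is a placeholder.
from playwright.sync_api import sync_playwright

final_response = None


def handle_response(finished_response):
    global final_response
    # Keep only top-level document responses; ignore images/scripts/XHR.
    if finished_response.request.resource_type == "document":
        final_response = finished_response


with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.on("response", handle_response)
    first_response = page.goto("https://example.com/")
    page.wait_for_load_state("domcontentloaded")
    # Fall back to the goto() result if no document response was seen.
    result = final_response or first_response
    print(result.url, result.status)
    browser.close()
```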
scrapling/engines/pw.py
CHANGED
@@ -1,6 +1,7 @@
 import json
 
-from scrapling.core._types import Callable, Dict, Optional, Union
+from scrapling.core._types import (Callable, Dict, Optional,
+                                   SelectorWaitStates, Union)
 from scrapling.core.utils import log, lru_cache
 from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
                                          NSTBROWSER_DEFAULT_QUERY)
@@ -23,7 +24,7 @@ class PlaywrightEngine:
             page_action: Callable = None,
             wait_selector: Optional[str] = None,
             locale: Optional[str] = 'en-US',
-            wait_selector_state:
+            wait_selector_state: SelectorWaitStates = 'attached',
             stealth: Optional[bool] = False,
             real_chrome: Optional[bool] = False,
             hide_canvas: Optional[bool] = False,
@@ -193,12 +194,21 @@ class PlaywrightEngine:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        from playwright.sync_api import Response as PlaywrightResponse
         if not self.stealth or self.real_chrome:
             # Because rebrowser_playwright doesn't play well with real browsers
             from playwright.sync_api import sync_playwright
         else:
             from rebrowser_playwright.sync_api import sync_playwright
 
+        # Store the final response
+        final_response = None
+
+        def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         with sync_playwright() as p:
             # Creating the browser
             if self.cdp_url:
@@ -212,6 +222,8 @@ class PlaywrightEngine:
             page = context.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
 
             if self.extra_headers:
                 page.set_extra_http_headers(self.extra_headers)
@@ -223,7 +235,7 @@ class PlaywrightEngine:
             for script in self.__stealth_scripts():
                 page.add_init_script(path=script)
 
-
+            first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
@@ -240,21 +252,24 @@ class PlaywrightEngine:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
+            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding =
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text =
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=
+                url=final_response.url,
                 text=page.content(),
-                body=
-                status=
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=
-                request_headers=
+                headers=final_response.all_headers(),
+                request_headers=final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -266,12 +281,21 @@ class PlaywrightEngine:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        from playwright.async_api import Response as PlaywrightResponse
         if not self.stealth or self.real_chrome:
             # Because rebrowser_playwright doesn't play well with real browsers
             from playwright.async_api import async_playwright
         else:
             from rebrowser_playwright.async_api import async_playwright
 
+        # Store the final response
+        final_response = None
+
+        async def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         async with async_playwright() as p:
             # Creating the browser
             if self.cdp_url:
@@ -285,6 +309,8 @@ class PlaywrightEngine:
             page = await context.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
 
             if self.extra_headers:
                 await page.set_extra_http_headers(self.extra_headers)
@@ -296,7 +322,7 @@ class PlaywrightEngine:
             for script in self.__stealth_scripts():
                 await page.add_init_script(path=script)
 
-
+            first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             await page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
@@ -313,21 +339,24 @@ class PlaywrightEngine:
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
 
+            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding =
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text =
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=
+                url=final_response.url,
                 text=await page.content(),
-                body=
-                status=
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await
-                request_headers=await
+                headers=await final_response.all_headers(),
+                request_headers=await final_response.request.all_headers(),
                 **self.adaptor_arguments
            )
            await page.close()
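
From the caller's side, the typed parameter reaches `PlayWrightFetcher.fetch` in fetchers.py below. A usage sketch assuming only the signature shown in this diff; the URL and selector are placeholders:

```python
# Usage sketch for the typed wait_selector_state parameter, assuming
# only the PlayWrightFetcher.fetch signature shown in this diff.
from scrapling.fetchers import PlayWrightFetcher

fetcher = PlayWrightFetcher()
page = fetcher.fetch(
    "https://example.com/",         # placeholder URL
    wait_selector="h1",             # placeholder selector
    wait_selector_state="visible",  # must be one of the four SelectorWaitStates
    network_idle=True,
)
print(page.status, page.reason)
```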
scrapling/engines/toolbelt/custom.py
CHANGED
@@ -84,8 +84,6 @@ class ResponseEncoding:
 class Response(Adaptor):
     """This class is returned by all engines as a way to unify response type between different libraries."""
 
-    _is_response_result_logged = False  # Class-level flag, initialized to False
-
     def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict,
                  encoding: str = 'utf-8', method: str = 'GET', **adaptor_arguments: Dict):
         automatch_domain = adaptor_arguments.pop('automatch_domain', None)
@@ -99,9 +97,7 @@ class Response(Adaptor):
         # For back-ward compatibility
         self.adaptor = self
         # For easier debugging while working from a Python shell
-        if not Response._is_response_result_logged:
-            log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
-            Response._is_response_result_logged = True
+        log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
 
         # def __repr__(self):
         #     return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
scrapling/fetchers.py
CHANGED
@@ -1,5 +1,5 @@
 from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
-                                   Union)
+                                   SelectorWaitStates, Union)
 from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
                                check_if_engine_usable)
 from scrapling.engines.toolbelt import BaseFetcher, Response
@@ -176,8 +176,8 @@ class StealthyFetcher(BaseFetcher):
             self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
-            wait_selector_state:
-            os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
+            wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
@@ -234,8 +234,8 @@ class StealthyFetcher(BaseFetcher):
             self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
-            wait_selector_state:
-            os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
+            wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
@@ -308,7 +308,7 @@ class PlayWrightFetcher(BaseFetcher):
     def fetch(
             self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
             useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state:
+            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
             hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
@@ -368,7 +368,7 @@ class PlayWrightFetcher(BaseFetcher):
     async def async_fetch(
             self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
             useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state:
+            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
             hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
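
The rewrapped `StealthyFetcher` signatures also spell out `google_search`, `extra_headers`, and `proxy` on their own lines. A usage sketch assuming only the parameters listed above; the proxy address and header value are placeholders:

```python
# Usage sketch for StealthyFetcher.fetch, assuming only the parameters
# listed in this diff. Proxy and header values are placeholders.
from scrapling.fetchers import StealthyFetcher

fetcher = StealthyFetcher()
page = fetcher.fetch(
    "https://example.com/",
    proxy="http://user:pass@127.0.0.1:8080",              # placeholder proxy
    google_search=True,                                   # send a convincing Google referer
    extra_headers={"Accept-Language": "en-US,en;q=0.9"},
    wait_selector="h1",
    wait_selector_state="attached",
)
print(page.status)
```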
scrapling/parser.py
CHANGED
@@ -155,7 +155,7 @@ class Adaptor(SelectorsGeneration):
         else:
             if issubclass(type(element), html.HtmlMixin):
 
-                return
+                return Adaptor(
                     root=element,
                     text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
                     url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
{scrapling-0.2.9.dist-info → scrapling-0.2.91.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.9
+Version: 0.2.91
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -21,7 +21,6 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
@@ -38,7 +37,7 @@ Requires-Dist: cssselect>=1.2
 Requires-Dist: w3lib
 Requires-Dist: orjson>=3
 Requires-Dist: tldextract
-Requires-Dist: httpx[brotli,zstd]
+Requires-Dist: httpx[brotli,socks,zstd]
 Requires-Dist: playwright>=1.49.1
 Requires-Dist: rebrowser-playwright>=1.49.1
 Requires-Dist: camoufox[geoip]>=0.4.9
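
The only dependency change is the added `socks` extra on httpx, which pulls in the SOCKS backend so httpx-based fetches can run through SOCKS proxies. A minimal illustration with httpx directly; the proxy address is a placeholder:

```python
# Minimal illustration of what the httpx[socks] extra enables;
# the proxy address is a placeholder.
import httpx

with httpx.Client(proxy="socks5://127.0.0.1:1080") as client:
    response = client.get("https://example.com/")
    print(response.status_code)
```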
{scrapling-0.2.9.dist-info → scrapling-0.2.91.dist-info}/RECORD
CHANGED
@@ -1,22 +1,22 @@
-scrapling/__init__.py,sha256=
+scrapling/__init__.py,sha256=pfbhEm1kcriA9pFR3JUUFEE3v4_ykB35SYbeHKzFxHw,500
 scrapling/defaults.py,sha256=tJAOMB-PMd3aLZz3j_yr6haBxxaklAvWdS_hP-GFFdU,331
-scrapling/fetchers.py,sha256=
-scrapling/parser.py,sha256=
+scrapling/fetchers.py,sha256=K3MKBqKDOXItJNwxFY2fe1C21Vz6QSd91fFtN98Mpg4,35402
+scrapling/parser.py,sha256=Fl9cdbR58GuoPbWN5hZI6ToPSl0_rQFXMskTdzpoxWs,55208
 scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
 scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scrapling/core/_types.py,sha256=
+scrapling/core/_types.py,sha256=OcsP1WeQEOlEVo9OzTrLQfgZZfXuJ0civVs31SynwGA,641
 scrapling/core/custom_types.py,sha256=ZRzpoT6qQ4vU_ejhLXa7WYuYLGl5HwAjLPe01xdhuvM,10808
 scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
 scrapling/core/storage_adaptors.py,sha256=l_ZYcdn1y69AcoPuRrPoaxqKysN62pMExrwJWYdu5MA,6220
 scrapling/core/translator.py,sha256=ojDmNi5pFZE6Ke-AiSsTilXiPRdR8yhX3o-uVGMkap8,5236
 scrapling/core/utils.py,sha256=03LzCDzmeK1TXPjIKVzHSUgSfhpe36XE8AwxlgxzJoU,3705
 scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
-scrapling/engines/camo.py,sha256=
+scrapling/engines/camo.py,sha256=g12IVIPy4Uyp_jngtu8Qcvy7PSMHjURAHUGXdM58Kks,13778
 scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
-scrapling/engines/pw.py,sha256=
+scrapling/engines/pw.py,sha256=Eq4_oQA5eX666chiNpXsBqhWONzleniyXjKdmCpXj_Y,18630
 scrapling/engines/static.py,sha256=7SVEfeigCPfwC1ukx0zIFFe96Bo5fox6qOq2IWrP6P8,10319
 scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
-scrapling/engines/toolbelt/custom.py,sha256=
+scrapling/engines/toolbelt/custom.py,sha256=d3qyeCg_qHm1RRE7yv5hyU9b17Y7YDPGBOVhEH1CAT0,12754
 scrapling/engines/toolbelt/fingerprints.py,sha256=ajEHdXHr7W4hw9KcNS7XlyxNBZu37p1bRj18TiICLzU,2929
 scrapling/engines/toolbelt/navigation.py,sha256=xEfZRJefuxOCGxQOSI2llS0du0Y2XmoIPdVGUSHOd7k,4567
 scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
@@ -40,8 +40,8 @@ tests/fetchers/sync/test_playwright.py,sha256=5eZdPwk3JGeaO7GuExv_QsByLyWDE9joxn
 tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
 tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
-scrapling-0.2.9.dist-info/LICENSE,sha256=
-scrapling-0.2.9.dist-info/METADATA,sha256=
-scrapling-0.2.9.dist-info/WHEEL,sha256=
-scrapling-0.2.9.dist-info/top_level.txt,sha256=
-scrapling-0.2.9.dist-info/RECORD,,
+scrapling-0.2.91.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.91.dist-info/METADATA,sha256=ajc8n5Hjl--ZdGXwHxmfMEWyCMgbw1waZNovoPFxrUc,68339
+scrapling-0.2.91.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+scrapling-0.2.91.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.91.dist-info/RECORD,,
{scrapling-0.2.9.dist-info → scrapling-0.2.91.dist-info}/LICENSE
File without changes
{scrapling-0.2.9.dist-info → scrapling-0.2.91.dist-info}/WHEEL
File without changes
{scrapling-0.2.9.dist-info → scrapling-0.2.91.dist-info}/top_level.txt
File without changes