scrapling-0.2.9-py3-none-any.whl → scrapling-0.2.92-py3-none-any.whl
- scrapling/__init__.py +1 -1
- scrapling/cli.py +37 -0
- scrapling/core/_types.py +2 -0
- scrapling/engines/camo.py +40 -16
- scrapling/engines/pw.py +43 -16
- scrapling/engines/toolbelt/custom.py +1 -5
- scrapling/fetchers.py +7 -7
- scrapling/parser.py +3 -3
- {scrapling-0.2.9.dist-info → scrapling-0.2.92.dist-info}/METADATA +8 -42
- {scrapling-0.2.9.dist-info → scrapling-0.2.92.dist-info}/RECORD +14 -12
- scrapling-0.2.92.dist-info/entry_points.txt +2 -0
- {scrapling-0.2.9.dist-info → scrapling-0.2.92.dist-info}/LICENSE +0 -0
- {scrapling-0.2.9.dist-info → scrapling-0.2.92.dist-info}/WHEEL +0 -0
- {scrapling-0.2.9.dist-info → scrapling-0.2.92.dist-info}/top_level.txt +0 -0
scrapling/__init__.py CHANGED
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
 from scrapling.parser import Adaptor, Adaptors

 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.9"
+__version__ = "0.2.92"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"

scrapling/cli.py ADDED
@@ -0,0 +1,37 @@
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+import click
+
+
+def get_package_dir():
+    return Path(os.path.dirname(__file__))
+
+
+def run_command(command, line):
+    print(f"Installing {line}...")
+    _ = subprocess.check_call(command, shell=True)
+    # I meant to not use try except here
+
+
+@click.command(help="Install all Scrapling's Fetchers dependencies")
+def install():
+    if not get_package_dir().joinpath(".scrapling_dependencies_installed").exists():
+        run_command([sys.executable, "-m", "playwright", "install", 'chromium'], 'Playwright browsers')
+        run_command([sys.executable, "-m", "playwright", "install-deps", 'chromium', 'firefox'], 'Playwright dependencies')
+        run_command([sys.executable, "-m", "camoufox", "fetch", '--browserforge'], 'Camoufox browser and databases')
+        # if no errors raised by above commands, then we add below file
+        get_package_dir().joinpath(".scrapling_dependencies_installed").touch()
+    else:
+        print('The dependencies are already installed')
+
+
+@click.group()
+def main():
+    pass
+
+
+# Adding commands
+main.add_command(install)
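The new `scrapling-0.2.92.dist-info/entry_points.txt` (49 bytes per the RECORD below) most likely registers `main` as the `scrapling` console script, which is what makes the `scrapling install` command from the README work. As a quick way to poke at the new group without installing the script, click's bundled test runner can invoke it directly; a minimal sketch, not part of the release:

```python
# Exercise the new CLI group programmatically with click's test runner.
# Invoking "install" for real would download browsers, so we only ask
# for the help text, which lists the registered `install` command.
from click.testing import CliRunner

from scrapling.cli import main

runner = CliRunner()
result = runner.invoke(main, ["--help"])
print(result.output)
```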
scrapling/core/_types.py CHANGED
@@ -5,6 +5,8 @@ Type definitions for type checking purposes.
 from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
                     List, Literal, Optional, Pattern, Tuple, Type, Union)

+SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
+
 try:
     from typing import Protocol
 except ImportError:
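`SelectorWaitStates` gives every fetcher signature below one shared alias for the four wait states instead of repeating the `Literal` inline. A small sketch of what the alias buys at type-check time; `open_page` is a hypothetical function, not part of the package:

```python
# Hypothetical function showing how the shared Literal alias constrains
# call sites when checked with mypy/pyright.
from scrapling.core._types import SelectorWaitStates


def open_page(wait_selector_state: SelectorWaitStates = "attached") -> None:
    ...


open_page("visible")  # accepted: one of the four literal values
open_page("loaded")   # rejected by the type checker: not a valid literal
```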
scrapling/engines/camo.py CHANGED
@@ -3,7 +3,7 @@ from camoufox.async_api import AsyncCamoufox
 from camoufox.sync_api import Camoufox

 from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
-                                   Union)
+                                   SelectorWaitStates, Union)
 from scrapling.core.utils import log
 from scrapling.engines.toolbelt import (Response, StatusText,
                                         async_intercept_route,
@@ -18,7 +18,7 @@ class CamoufoxEngine:
             self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
-            wait_selector_state:
+            wait_selector_state: Optional[SelectorWaitStates] = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
             geoip: Optional[bool] = False,
             adaptor_arguments: Dict = None,
@@ -84,6 +84,14 @@ class CamoufoxEngine:
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         addons = [] if self.disable_ads else [DefaultAddons.UBO]
+        # Store the final response
+        final_response = None
+
+        def handle_response(finished_response):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
+                final_response = finished_response
+
         with Camoufox(
             geoip=self.geoip,
             proxy=self.proxy,
@@ -100,13 +108,15 @@ class CamoufoxEngine:
             page = browser.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
             if self.disable_resources:
                 page.route("**/*", intercept_route)

             if self.extra_headers:
                 page.set_extra_http_headers(self.extra_headers)

-
+            first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
@@ -123,21 +133,23 @@ class CamoufoxEngine:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')

+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding =
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text =
+            status_text = final_response.status_text or StatusText.get(final_response.status)

             response = Response(
-                url=
+                url=page.url,
                 text=page.content(),
                 body=page.content().encode('utf-8'),
-                status=
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=
-                request_headers=
+                headers=first_response.all_headers(),
+                request_headers=first_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -151,6 +163,14 @@ class CamoufoxEngine:
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         addons = [] if self.disable_ads else [DefaultAddons.UBO]
+        # Store the final response
+        final_response = None
+
+        async def handle_response(finished_response):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
+                final_response = finished_response
+
         async with AsyncCamoufox(
             geoip=self.geoip,
             proxy=self.proxy,
@@ -167,13 +187,15 @@ class CamoufoxEngine:
             page = await browser.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
             if self.disable_resources:
                 await page.route("**/*", async_intercept_route)

             if self.extra_headers:
                 await page.set_extra_http_headers(self.extra_headers)

-
+            first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             await page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
@@ -190,21 +212,23 @@ class CamoufoxEngine:
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')

+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding =
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text =
+            status_text = final_response.status_text or StatusText.get(final_response.status)

             response = Response(
-                url=
+                url=page.url,
                 text=await page.content(),
                 body=(await page.content()).encode('utf-8'),
-                status=
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await
-                request_headers=await
+                headers=await first_response.all_headers(),
+                request_headers=await first_response.request.all_headers(),
                 **self.adaptor_arguments
            )
            await page.close()
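The pattern added to both fetch variants is the interesting part of this release: subscribe to every response, keep only main-frame document responses, and fall back to the `goto()` return value, so a redirect chain reports the status of its final hop rather than the first response. A condensed, standalone sketch of the same idea using plain Playwright instead of Camoufox (the URL is a placeholder):

```python
# Standalone sketch of the final-response capture pattern used above.
from playwright.sync_api import sync_playwright

final_response = None


def handle_response(finished_response):
    global final_response
    # Only the page document itself (including the last hop of a redirect
    # chain) is a navigation request with resource_type "document";
    # images, scripts, XHR, etc. are filtered out here.
    if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
        final_response = finished_response


with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.on("response", handle_response)
    first_response = page.goto("https://example.com")
    page.wait_for_load_state("domcontentloaded")
    final_response = final_response or first_response  # same fallback as above
    print(final_response.status, final_response.url)
    browser.close()
```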
scrapling/engines/pw.py CHANGED
@@ -1,6 +1,7 @@
 import json

-from scrapling.core._types import Callable, Dict, Optional,
+from scrapling.core._types import (Callable, Dict, Optional,
+                                   SelectorWaitStates, Union)
 from scrapling.core.utils import log, lru_cache
 from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
                                          NSTBROWSER_DEFAULT_QUERY)
@@ -23,7 +24,7 @@ class PlaywrightEngine:
                  page_action: Callable = None,
                  wait_selector: Optional[str] = None,
                  locale: Optional[str] = 'en-US',
-                 wait_selector_state:
+                 wait_selector_state: SelectorWaitStates = 'attached',
                  stealth: Optional[bool] = False,
                  real_chrome: Optional[bool] = False,
                  hide_canvas: Optional[bool] = False,
@@ -193,12 +194,21 @@ class PlaywrightEngine:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        from playwright.sync_api import Response as PlaywrightResponse
         if not self.stealth or self.real_chrome:
             # Because rebrowser_playwright doesn't play well with real browsers
             from playwright.sync_api import sync_playwright
         else:
             from rebrowser_playwright.sync_api import sync_playwright

+        # Store the final response
+        final_response = None
+
+        def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
+                final_response = finished_response
+
         with sync_playwright() as p:
             # Creating the browser
             if self.cdp_url:
@@ -212,6 +222,8 @@ class PlaywrightEngine:
             page = context.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)

             if self.extra_headers:
                 page.set_extra_http_headers(self.extra_headers)
@@ -223,7 +235,7 @@ class PlaywrightEngine:
             for script in self.__stealth_scripts():
                 page.add_init_script(path=script)

-
+            first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
@@ -240,21 +252,23 @@ class PlaywrightEngine:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')

+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding =
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text =
+            status_text = final_response.status_text or StatusText.get(final_response.status)

             response = Response(
-                url=
+                url=page.url,
                 text=page.content(),
                 body=page.content().encode('utf-8'),
-                status=
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=
-                request_headers=
+                headers=first_response.all_headers(),
+                request_headers=first_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -266,12 +280,21 @@ class PlaywrightEngine:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        from playwright.async_api import Response as PlaywrightResponse
         if not self.stealth or self.real_chrome:
             # Because rebrowser_playwright doesn't play well with real browsers
             from playwright.async_api import async_playwright
         else:
             from rebrowser_playwright.async_api import async_playwright

+        # Store the final response
+        final_response = None
+
+        async def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
+                final_response = finished_response
+
         async with async_playwright() as p:
             # Creating the browser
             if self.cdp_url:
@@ -285,6 +308,8 @@ class PlaywrightEngine:
             page = await context.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)

             if self.extra_headers:
                 await page.set_extra_http_headers(self.extra_headers)
@@ -296,7 +321,7 @@ class PlaywrightEngine:
             for script in self.__stealth_scripts():
                 await page.add_init_script(path=script)

-
+            first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             await page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
@@ -313,21 +338,23 @@ class PlaywrightEngine:
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')

+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding =
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text =
+            status_text = final_response.status_text or StatusText.get(final_response.status)

             response = Response(
-                url=
+                url=page.url,
                 text=await page.content(),
                 body=(await page.content()).encode('utf-8'),
-                status=
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await
-                request_headers=await
+                headers=await first_response.all_headers(),
+                request_headers=await first_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             await page.close()
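The four values in `SelectorWaitStates` are not arbitrary: they mirror the `state` argument of Playwright's own `wait_for_selector`, which is where `wait_selector_state` is ultimately forwarded. A minimal sketch straight against Playwright (URL and selector are placeholders):

```python
# The `state` values Playwright accepts are exactly
# "attached", "detached", "hidden", and "visible".
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto("https://example.com")
    page.wait_for_selector("h1", state="visible")  # blocks until the element is rendered
    browser.close()
```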
scrapling/engines/toolbelt/custom.py CHANGED
@@ -84,8 +84,6 @@ class ResponseEncoding:
 class Response(Adaptor):
     """This class is returned by all engines as a way to unify response type between different libraries."""

-    _is_response_result_logged = False  # Class-level flag, initialized to False
-
     def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict,
                  encoding: str = 'utf-8', method: str = 'GET', **adaptor_arguments: Dict):
         automatch_domain = adaptor_arguments.pop('automatch_domain', None)
@@ -99,9 +97,7 @@ class Response(Adaptor):
         # For back-ward compatibility
         self.adaptor = self
         # For easier debugging while working from a Python shell
-
-        log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
-        Response._is_response_result_logged = True
+        log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')

         # def __repr__(self):
         #     return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
scrapling/fetchers.py CHANGED
@@ -1,5 +1,5 @@
 from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
-                                   Union)
+                                   SelectorWaitStates, Union)
 from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
                                check_if_engine_usable)
 from scrapling.engines.toolbelt import BaseFetcher, Response
@@ -176,8 +176,8 @@ class StealthyFetcher(BaseFetcher):
             self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
-            wait_selector_state:
-            os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
+            wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
@@ -234,8 +234,8 @@ class StealthyFetcher(BaseFetcher):
             self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
-            wait_selector_state:
-            os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
+            wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
@@ -308,7 +308,7 @@ class PlayWrightFetcher(BaseFetcher):
     def fetch(
             self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
             useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state:
+            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
             hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
@@ -368,7 +368,7 @@ class PlayWrightFetcher(BaseFetcher):
     async def async_fetch(
             self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
             useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state:
+            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
             hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
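A hedged usage sketch of the updated `StealthyFetcher.fetch` signature; the URL is a placeholder, and depending on your setup you may prefer the ready-made instance exported from `scrapling.defaults` over instantiating the class yourself:

```python
from scrapling.fetchers import StealthyFetcher

page = StealthyFetcher().fetch(
    "https://example.com",
    wait_selector="h1",
    wait_selector_state="visible",  # now checked against SelectorWaitStates
    google_search=True,             # sends a convincing Google referer
)
print(page.status, page.reason)
```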
scrapling/parser.py CHANGED
@@ -155,7 +155,7 @@ class Adaptor(SelectorsGeneration):
         else:
             if issubclass(type(element), html.HtmlMixin):

-                return
+                return Adaptor(
                     root=element,
                     text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
                     url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
@@ -474,7 +474,7 @@ class Adaptor(SelectorsGeneration):

     def css(self, selector: str, identifier: str = '',
             auto_match: bool = False, auto_save: bool = False, percentage: int = 0
-            ) -> Union['Adaptors[Adaptor]', List]:
+            ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']:
         """Search current tree with CSS3 selectors

         **Important:
@@ -517,7 +517,7 @@ class Adaptor(SelectorsGeneration):

     def xpath(self, selector: str, identifier: str = '',
               auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
-              ) -> Union['Adaptors[Adaptor]', List]:
+              ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']:
         """Search current tree with XPath selectors

         **Important:
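The widened return annotations reflect that selecting text nodes (for example with the `::text` pseudo-element) yields `TextHandlers` rather than `Adaptors`. An illustrative sketch with a toy document:

```python
from scrapling import Adaptor

page = Adaptor("<html><body><h1>Hello</h1></body></html>")
elements = page.css("h1")      # Adaptors[Adaptor]: element results
texts = page.css("h1::text")   # TextHandlers[TextHandler]: text results
```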
{scrapling-0.2.9.dist-info → scrapling-0.2.92.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.9
+Version: 0.2.92
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -21,7 +21,6 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
@@ -35,10 +34,11 @@ License-File: LICENSE
 Requires-Dist: requests>=2.3
 Requires-Dist: lxml>=4.5
 Requires-Dist: cssselect>=1.2
+Requires-Dist: click
 Requires-Dist: w3lib
 Requires-Dist: orjson>=3
 Requires-Dist: tldextract
-Requires-Dist: httpx[brotli,zstd]
+Requires-Dist: httpx[brotli,socks,zstd]
 Requires-Dist: playwright>=1.49.1
 Requires-Dist: rebrowser-playwright>=1.49.1
 Requires-Dist: camoufox[geoip]>=0.4.9
@@ -212,52 +212,18 @@ Scrapling can find elements with more methods and it returns full element `Adapt
 > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.

 ## Installation
-Scrapling is a breeze to get started with
+Scrapling is a breeze to get started with; Starting from version 0.2.9, we require at least Python 3.9 to work.
 ```bash
 pip3 install scrapling
 ```
-
-<details><summary>Windows OS</summary>
-
-```bash
-camoufox fetch --browserforge
-```
-</details>
-<details><summary>MacOS</summary>
-
-```bash
-python3 -m camoufox fetch --browserforge
-```
-</details>
-<details><summary>Linux</summary>
-
+Then run this command to install browsers' dependencies needed to use Fetcher classes
 ```bash
-
-```
-On a fresh installation of Linux, you may also need the following Firefox dependencies:
-- Debian-based distros
-```bash
-sudo apt install -y libgtk-3-0 libx11-xcb1 libasound2
-```
-- Arch-based distros
-```bash
-sudo pacman -S gtk3 libx11 libxcb cairo libasound alsa-lib
-```
-</details>
-
-<small> See the official <a href="https://camoufox.com/python/installation/#download-the-browser">Camoufox documentation</a> for more info on installation</small>
-
-- If you are going to use the `PlayWrightFetcher` options, then install Playwright's Chromium browser with:
-```commandline
-playwright install chromium
-```
-- If you are going to use normal requests only with the `Fetcher` class then update the fingerprints files with:
-```commandline
-python -m browserforge update
+scrapling install
 ```
+If you have any installation issues, please open an issue.

 ## Fetching Websites
-Fetchers are
+Fetchers are interfaces built on top of other libraries with added features that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.

 ### Features
 You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
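The `httpx` extra change is small but user-visible: the added `socks` extra pulls in SOCKS proxy support for httpx-backed requests. A minimal httpx-level illustration (the proxy address is a placeholder):

```python
import httpx

# Requires httpx[socks]; `proxy=` is the httpx >= 0.26 spelling,
# older versions used `proxies=` instead.
client = httpx.Client(proxy="socks5://127.0.0.1:9050")
response = client.get("https://example.com")
print(response.status_code)
```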
{scrapling-0.2.9.dist-info → scrapling-0.2.92.dist-info}/RECORD CHANGED
@@ -1,22 +1,23 @@
-scrapling/__init__.py,sha256=
+scrapling/__init__.py,sha256=0iEOX168f4gLFpReEUemMOhTske8AS2o0UQHJWXn-4o,500
+scrapling/cli.py,sha256=njPdJKmbLFHeWjtSiGEm9ALBdSyfUp0IaJvxQL5C31Q,1125
 scrapling/defaults.py,sha256=tJAOMB-PMd3aLZz3j_yr6haBxxaklAvWdS_hP-GFFdU,331
-scrapling/fetchers.py,sha256=
-scrapling/parser.py,sha256=
+scrapling/fetchers.py,sha256=K3MKBqKDOXItJNwxFY2fe1C21Vz6QSd91fFtN98Mpg4,35402
+scrapling/parser.py,sha256=sT1gh5pnbjpUzFt8K9DGD6x60zKQcAtzmyf8DgiNDCI,55266
 scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
 scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scrapling/core/_types.py,sha256=
+scrapling/core/_types.py,sha256=OcsP1WeQEOlEVo9OzTrLQfgZZfXuJ0civVs31SynwGA,641
 scrapling/core/custom_types.py,sha256=ZRzpoT6qQ4vU_ejhLXa7WYuYLGl5HwAjLPe01xdhuvM,10808
 scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
 scrapling/core/storage_adaptors.py,sha256=l_ZYcdn1y69AcoPuRrPoaxqKysN62pMExrwJWYdu5MA,6220
 scrapling/core/translator.py,sha256=ojDmNi5pFZE6Ke-AiSsTilXiPRdR8yhX3o-uVGMkap8,5236
 scrapling/core/utils.py,sha256=03LzCDzmeK1TXPjIKVzHSUgSfhpe36XE8AwxlgxzJoU,3705
 scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
-scrapling/engines/camo.py,sha256=
+scrapling/engines/camo.py,sha256=wJRfaIU0w_hDSlrP2AdpjBU6NNEKw0wSnVbqUoxt1Gk,13682
 scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
-scrapling/engines/pw.py,sha256=
+scrapling/engines/pw.py,sha256=MCYE5rDx55D2VOIeUNLl44ROXnyFRfku_u2FOcXjqEQ,18534
 scrapling/engines/static.py,sha256=7SVEfeigCPfwC1ukx0zIFFe96Bo5fox6qOq2IWrP6P8,10319
 scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
-scrapling/engines/toolbelt/custom.py,sha256=
+scrapling/engines/toolbelt/custom.py,sha256=d3qyeCg_qHm1RRE7yv5hyU9b17Y7YDPGBOVhEH1CAT0,12754
 scrapling/engines/toolbelt/fingerprints.py,sha256=ajEHdXHr7W4hw9KcNS7XlyxNBZu37p1bRj18TiICLzU,2929
 scrapling/engines/toolbelt/navigation.py,sha256=xEfZRJefuxOCGxQOSI2llS0du0Y2XmoIPdVGUSHOd7k,4567
 scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
@@ -40,8 +41,9 @@ tests/fetchers/sync/test_playwright.py,sha256=5eZdPwk3JGeaO7GuExv_QsByLyWDE9joxn
 tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
 tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
+scrapling-0.2.92.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.92.dist-info/METADATA,sha256=2I-HK-xEkVFFyQBio8NAKR0eQEBB-dLHFuvb5eluCEQ,67415
+scrapling-0.2.92.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+scrapling-0.2.92.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
+scrapling-0.2.92.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.92.dist-info/RECORD,,
{scrapling-0.2.9.dist-info → scrapling-0.2.92.dist-info}/LICENSE: file without changes
{scrapling-0.2.9.dist-info → scrapling-0.2.92.dist-info}/WHEEL: file without changes
{scrapling-0.2.9.dist-info → scrapling-0.2.92.dist-info}/top_level.txt: file without changes