scrapling-0.2.91-py3-none-any.whl → scrapling-0.2.92-py3-none-any.whl
- scrapling/__init__.py +1 -1
- scrapling/cli.py +37 -0
- scrapling/engines/camo.py +10 -12
- scrapling/engines/pw.py +10 -12
- scrapling/parser.py +2 -2
- {scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/METADATA +7 -40
- {scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/RECORD +11 -9
- scrapling-0.2.92.dist-info/entry_points.txt +2 -0
- {scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/LICENSE +0 -0
- {scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/WHEEL +0 -0
- {scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
 from scrapling.parser import Adaptor, Adaptors

 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.91"
+__version__ = "0.2.92"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"

scrapling/cli.py
ADDED
@@ -0,0 +1,37 @@
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+import click
+
+
+def get_package_dir():
+    return Path(os.path.dirname(__file__))
+
+
+def run_command(command, line):
+    print(f"Installing {line}...")
+    _ = subprocess.check_call(command, shell=True)
+    # I meant to not use try except here
+
+
+@click.command(help="Install all Scrapling's Fetchers dependencies")
+def install():
+    if not get_package_dir().joinpath(".scrapling_dependencies_installed").exists():
+        run_command([sys.executable, "-m", "playwright", "install", 'chromium'], 'Playwright browsers')
+        run_command([sys.executable, "-m", "playwright", "install-deps", 'chromium', 'firefox'], 'Playwright dependencies')
+        run_command([sys.executable, "-m", "camoufox", "fetch", '--browserforge'], 'Camoufox browser and databases')
+        # if no errors raised by above commands, then we add below file
+        get_package_dir().joinpath(".scrapling_dependencies_installed").touch()
+    else:
+        print('The dependencies are already installed')
+
+
+@click.group()
+def main():
+    pass
+
+
+# Adding commands
+main.add_command(install)
scrapling/engines/camo.py
CHANGED
@@ -89,7 +89,7 @@ class CamoufoxEngine:

         def handle_response(finished_response):
             nonlocal final_response
-            if finished_response.request.resource_type == "document":
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                 final_response = finished_response

         with Camoufox(
@@ -133,7 +133,6 @@
             if self.network_idle:
                 page.wait_for_load_state('networkidle')

-            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
@@ -142,15 +141,15 @@
             status_text = final_response.status_text or StatusText.get(final_response.status)

             response = Response(
-                url=…
+                url=page.url,
                 text=page.content(),
-                body=…
+                body=page.content().encode('utf-8'),
                 status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=…
-                request_headers=…
+                headers=first_response.all_headers(),
+                request_headers=first_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -169,7 +168,7 @@

         async def handle_response(finished_response):
             nonlocal final_response
-            if finished_response.request.resource_type == "document":
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                 final_response = finished_response

         async with AsyncCamoufox(
@@ -213,7 +212,6 @@
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')

-            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
@@ -222,15 +220,15 @@
             status_text = final_response.status_text or StatusText.get(final_response.status)

             response = Response(
-                url=…
+                url=page.url,
                 text=await page.content(),
-                body=…
+                body=(await page.content()).encode('utf-8'),
                 status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await …
-                request_headers=await …
+                headers=await first_response.all_headers(),
+                request_headers=await first_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             await page.close()
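Editor's note: the behavioral change above is the extra `is_navigation_request()` check, so a response is only kept as the page's final response when its request is a document that actually drives a navigation. A rough standalone sketch of the same filter against plain Playwright rather than Camoufox (the URL is only a placeholder):

```python
# Standalone sketch of the tightened filter, using plain Playwright's sync API
# (Camoufox wraps the same API). Placeholder URL, not taken from the diff.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    captured = {}

    def handle_response(response):
        request = response.request
        # Keep only responses whose request is a document-type navigation request
        if request.resource_type == "document" and request.is_navigation_request():
            captured["final"] = response

    page.on("response", handle_response)
    page.goto("https://example.com")
    print(captured["final"].status, captured["final"].url)
    browser.close()
```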
scrapling/engines/pw.py
CHANGED
@@ -206,7 +206,7 @@ class PlaywrightEngine:

         def handle_response(finished_response: PlaywrightResponse):
             nonlocal final_response
-            if finished_response.request.resource_type == "document":
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                 final_response = finished_response

         with sync_playwright() as p:
@@ -252,7 +252,6 @@
             if self.network_idle:
                 page.wait_for_load_state('networkidle')

-            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
@@ -261,15 +260,15 @@
             status_text = final_response.status_text or StatusText.get(final_response.status)

             response = Response(
-                url=…
+                url=page.url,
                 text=page.content(),
-                body=…
+                body=page.content().encode('utf-8'),
                 status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=…
-                request_headers=…
+                headers=first_response.all_headers(),
+                request_headers=first_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -293,7 +292,7 @@

         async def handle_response(finished_response: PlaywrightResponse):
             nonlocal final_response
-            if finished_response.request.resource_type == "document":
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                 final_response = finished_response

         async with async_playwright() as p:
@@ -339,7 +338,6 @@
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')

-            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
@@ -348,15 +346,15 @@
             status_text = final_response.status_text or StatusText.get(final_response.status)

             response = Response(
-                url=…
+                url=page.url,
                 text=await page.content(),
-                body=…
+                body=(await page.content()).encode('utf-8'),
                 status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await …
-                request_headers=await …
+                headers=await first_response.all_headers(),
+                request_headers=await first_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             await page.close()
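Editor's note: in both engines the removed `response_bytes` line means `Response.body` is no longer the raw network payload but the DOM serialized after the page rendered, so `text` and `body` now describe the same content. A rough illustration of the difference with plain Playwright (placeholder URL, not from the diff):

```python
# Illustration: raw network bytes vs the DOM serialized after scripts ran.
# The diff switches Response.body to the latter.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    page = p.chromium.launch().new_page()
    response = page.goto("https://example.com")
    network_body = response.body()                   # bytes as delivered over the wire
    rendered_body = page.content().encode("utf-8")   # current DOM after rendering
    print(len(network_body), len(rendered_body))
```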
scrapling/parser.py
CHANGED
@@ -474,7 +474,7 @@ class Adaptor(SelectorsGeneration):

     def css(self, selector: str, identifier: str = '',
             auto_match: bool = False, auto_save: bool = False, percentage: int = 0
-            ) -> Union['Adaptors[Adaptor]', List]:
+            ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']:
         """Search current tree with CSS3 selectors

         **Important:
@@ -517,7 +517,7 @@ class Adaptor(SelectorsGeneration):

     def xpath(self, selector: str, identifier: str = '',
              auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
-             ) -> Union['Adaptors[Adaptor]', List]:
+             ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']:
         """Search current tree with XPath selectors

         **Important:
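Editor's note: the widened return annotation reflects that `css`/`xpath` can return text handlers when the selector targets text rather than elements. A hedged usage sketch, assuming `Adaptor` accepts raw HTML via its `text` parameter and supports the `::text` pseudo-element as described in the project README:

```python
# Hedged sketch: element selection vs text selection, matching the widened
# return annotation. Assumes Adaptor(text=...) and the ::text pseudo-element.
from scrapling import Adaptor

page = Adaptor(text="<html><body><p>Hello</p></body></html>")
elements = page.css("p")       # -> Adaptors[Adaptor]
texts = page.css("p::text")    # -> TextHandlers[TextHandler]
print(elements, texts)
```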
{scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.91
+Version: 0.2.92
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -34,6 +34,7 @@ License-File: LICENSE
 Requires-Dist: requests>=2.3
 Requires-Dist: lxml>=4.5
 Requires-Dist: cssselect>=1.2
+Requires-Dist: click
 Requires-Dist: w3lib
 Requires-Dist: orjson>=3
 Requires-Dist: tldextract
@@ -211,52 +212,18 @@ Scrapling can find elements with more methods and it returns full element `Adapt
 > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.

 ## Installation
-Scrapling is a breeze to get started with
+Scrapling is a breeze to get started with; Starting from version 0.2.9, we require at least Python 3.9 to work.
 ```bash
 pip3 install scrapling
 ```
-
-<details><summary>Windows OS</summary>
-
-```bash
-camoufox fetch --browserforge
-```
-</details>
-<details><summary>MacOS</summary>
-
-```bash
-python3 -m camoufox fetch --browserforge
-```
-</details>
-<details><summary>Linux</summary>
-
+Then run this command to install browsers' dependencies needed to use Fetcher classes
 ```bash
-…
-```
-On a fresh installation of Linux, you may also need the following Firefox dependencies:
-- Debian-based distros
-```bash
-sudo apt install -y libgtk-3-0 libx11-xcb1 libasound2
-```
-- Arch-based distros
-```bash
-sudo pacman -S gtk3 libx11 libxcb cairo libasound alsa-lib
-```
-</details>
-
-<small> See the official <a href="https://camoufox.com/python/installation/#download-the-browser">Camoufox documentation</a> for more info on installation</small>
-
-- If you are going to use the `PlayWrightFetcher` options, then install Playwright's Chromium browser with:
-```commandline
-playwright install chromium
-```
-- If you are going to use normal requests only with the `Fetcher` class then update the fingerprints files with:
-```commandline
-python -m browserforge update
+scrapling install
 ```
+If you have any installation issues, please open an issue.

 ## Fetching Websites
-Fetchers are …
+Fetchers are interfaces built on top of other libraries with added features that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.

 ### Features
 You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
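Editor's note: as the new cli.py above shows, `scrapling install` records success by touching a marker file inside the package directory and skips reinstalling on later runs. A small sketch (not part of the release) for checking that marker, with the file name taken from cli.py:

```python
# Sketch: check whether `scrapling install` has already completed by looking
# for the marker file the installer touches.
from pathlib import Path

import scrapling

marker = Path(scrapling.__file__).parent / ".scrapling_dependencies_installed"
print("Fetcher dependencies installed:", marker.exists())
```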
{scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/RECORD
CHANGED
@@ -1,7 +1,8 @@
-scrapling/__init__.py,sha256=…
+scrapling/__init__.py,sha256=0iEOX168f4gLFpReEUemMOhTske8AS2o0UQHJWXn-4o,500
+scrapling/cli.py,sha256=njPdJKmbLFHeWjtSiGEm9ALBdSyfUp0IaJvxQL5C31Q,1125
 scrapling/defaults.py,sha256=tJAOMB-PMd3aLZz3j_yr6haBxxaklAvWdS_hP-GFFdU,331
 scrapling/fetchers.py,sha256=K3MKBqKDOXItJNwxFY2fe1C21Vz6QSd91fFtN98Mpg4,35402
-scrapling/parser.py,sha256=…
+scrapling/parser.py,sha256=sT1gh5pnbjpUzFt8K9DGD6x60zKQcAtzmyf8DgiNDCI,55266
 scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
 scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 scrapling/core/_types.py,sha256=OcsP1WeQEOlEVo9OzTrLQfgZZfXuJ0civVs31SynwGA,641
@@ -11,9 +12,9 @@ scrapling/core/storage_adaptors.py,sha256=l_ZYcdn1y69AcoPuRrPoaxqKysN62pMExrwJWY
 scrapling/core/translator.py,sha256=ojDmNi5pFZE6Ke-AiSsTilXiPRdR8yhX3o-uVGMkap8,5236
 scrapling/core/utils.py,sha256=03LzCDzmeK1TXPjIKVzHSUgSfhpe36XE8AwxlgxzJoU,3705
 scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
-scrapling/engines/camo.py,sha256=…
+scrapling/engines/camo.py,sha256=wJRfaIU0w_hDSlrP2AdpjBU6NNEKw0wSnVbqUoxt1Gk,13682
 scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
-scrapling/engines/pw.py,sha256=…
+scrapling/engines/pw.py,sha256=MCYE5rDx55D2VOIeUNLl44ROXnyFRfku_u2FOcXjqEQ,18534
 scrapling/engines/static.py,sha256=7SVEfeigCPfwC1ukx0zIFFe96Bo5fox6qOq2IWrP6P8,10319
 scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
 scrapling/engines/toolbelt/custom.py,sha256=d3qyeCg_qHm1RRE7yv5hyU9b17Y7YDPGBOVhEH1CAT0,12754
@@ -40,8 +41,9 @@ tests/fetchers/sync/test_playwright.py,sha256=5eZdPwk3JGeaO7GuExv_QsByLyWDE9joxn
 tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
 tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
-scrapling-0.2.…
-scrapling-0.2.…
-scrapling-0.2.…
-scrapling-0.2.…
-scrapling-0.2.…
+scrapling-0.2.92.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.92.dist-info/METADATA,sha256=2I-HK-xEkVFFyQBio8NAKR0eQEBB-dLHFuvb5eluCEQ,67415
+scrapling-0.2.92.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+scrapling-0.2.92.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
+scrapling-0.2.92.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.92.dist-info/RECORD,,
{scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/LICENSE
File without changes
{scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/WHEEL
File without changes
{scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/top_level.txt
File without changes