scrapling 0.2.91-py3-none-any.whl → 0.2.92-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/cli.py +37 -0
- scrapling/engines/camo.py +10 -12
- scrapling/engines/pw.py +10 -12
- scrapling/parser.py +2 -2
- {scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/METADATA +7 -40
- {scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/RECORD +11 -9
- scrapling-0.2.92.dist-info/entry_points.txt +2 -0
- {scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/LICENSE +0 -0
- {scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/WHEEL +0 -0
- {scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
 from scrapling.parser import Adaptor, Adaptors
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.91"
+__version__ = "0.2.92"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
scrapling/cli.py
ADDED
@@ -0,0 +1,37 @@
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+import click
+
+
+def get_package_dir():
+    return Path(os.path.dirname(__file__))
+
+
+def run_command(command, line):
+    print(f"Installing {line}...")
+    _ = subprocess.check_call(command, shell=True)
+    # I meant to not use try except here
+
+
+@click.command(help="Install all Scrapling's Fetchers dependencies")
+def install():
+    if not get_package_dir().joinpath(".scrapling_dependencies_installed").exists():
+        run_command([sys.executable, "-m", "playwright", "install", 'chromium'], 'Playwright browsers')
+        run_command([sys.executable, "-m", "playwright", "install-deps", 'chromium', 'firefox'], 'Playwright dependencies')
+        run_command([sys.executable, "-m", "camoufox", "fetch", '--browserforge'], 'Camoufox browser and databases')
+        # if no errors raised by above commands, then we add below file
+        get_package_dir().joinpath(".scrapling_dependencies_installed").touch()
+    else:
+        print('The dependencies are already installed')
+
+
+@click.group()
+def main():
+    pass
+
+
+# Adding commands
+main.add_command(install)
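The new `install` command hangs off the `main` click group, and the added `entry_points.txt` (49 bytes per the RECORD below) presumably wires a `scrapling` console script to `scrapling.cli:main`, so the installer runs as `scrapling install`. Below is a minimal sketch of exercising the command in-process with click's test runner; the import path comes from this diff, everything else is illustrative:

```python
# Minimal sketch (not part of the diff): drive the new CLI through click's
# test runner instead of the `scrapling install` console script.
# Note: a real invocation downloads the Playwright and Camoufox browsers.
from click.testing import CliRunner

from scrapling.cli import main

runner = CliRunner()
result = runner.invoke(main, ["install"])
print(result.exit_code, result.output)
```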
scrapling/engines/camo.py
CHANGED
@@ -89,7 +89,7 @@ class CamoufoxEngine:
 
         def handle_response(finished_response):
             nonlocal final_response
-            if finished_response.request.resource_type == "document":
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                 final_response = finished_response
 
         with Camoufox(
@@ -133,7 +133,6 @@ class CamoufoxEngine:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
-            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
@@ -142,15 +141,15 @@ class CamoufoxEngine:
             status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=
+                url=page.url,
                 text=page.content(),
-                body=
+                body=page.content().encode('utf-8'),
                 status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=
-                request_headers=
+                headers=first_response.all_headers(),
+                request_headers=first_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -169,7 +168,7 @@ class CamoufoxEngine:
 
         async def handle_response(finished_response):
             nonlocal final_response
-            if finished_response.request.resource_type == "document":
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                 final_response = finished_response
 
         async with AsyncCamoufox(
@@ -213,7 +212,6 @@ class CamoufoxEngine:
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
 
-            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
@@ -222,15 +220,15 @@ class CamoufoxEngine:
             status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=
+                url=page.url,
                 text=await page.content(),
-                body=
+                body=(await page.content()).encode('utf-8'),
                 status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await
-                request_headers=await
+                headers=await first_response.all_headers(),
+                request_headers=await first_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             await page.close()
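Both fetch paths now keep a response only when it is the document of a navigation request, so iframe documents can no longer overwrite `final_response`. The following is a standalone Playwright sketch of the same capture rule, assuming only that Playwright and a Chromium build are installed; it is an illustration, not Scrapling's engine code:

```python
# Hypothetical standalone illustration of the stricter capture rule.
from playwright.sync_api import sync_playwright

final_response = None

def handle_response(finished_response):
    global final_response
    # "document" alone also matches iframe documents; the added
    # is_navigation_request() check keeps only the page navigation itself.
    if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
        final_response = finished_response

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.on("response", handle_response)
    page.goto("https://example.com")
    print(final_response.status if final_response else "no navigation document captured")
    browser.close()
```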
scrapling/engines/pw.py
CHANGED
@@ -206,7 +206,7 @@ class PlaywrightEngine:
 
         def handle_response(finished_response: PlaywrightResponse):
             nonlocal final_response
-            if finished_response.request.resource_type == "document":
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                 final_response = finished_response
 
         with sync_playwright() as p:
@@ -252,7 +252,6 @@ class PlaywrightEngine:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
-            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
@@ -261,15 +260,15 @@ class PlaywrightEngine:
             status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=
+                url=page.url,
                 text=page.content(),
-                body=
+                body=page.content().encode('utf-8'),
                 status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=
-                request_headers=
+                headers=first_response.all_headers(),
+                request_headers=first_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -293,7 +292,7 @@ class PlaywrightEngine:
 
         async def handle_response(finished_response: PlaywrightResponse):
             nonlocal final_response
-            if finished_response.request.resource_type == "document":
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                 final_response = finished_response
 
         async with async_playwright() as p:
@@ -339,7 +338,6 @@ class PlaywrightEngine:
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
 
-            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
@@ -348,15 +346,15 @@ class PlaywrightEngine:
             status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=
+                url=page.url,
                 text=await page.content(),
-                body=
+                body=(await page.content()).encode('utf-8'),
                 status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await
-                request_headers=await
+                headers=await first_response.all_headers(),
+                request_headers=await first_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             await page.close()
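With `response_bytes` removed, both engines now derive `body` from the serialized DOM (`page.content()`) rather than the raw bytes of the captured response. A short hypothetical Playwright snippet showing the difference between the two sources, assuming Playwright with Chromium installed:

```python
# Hypothetical illustration: rendered DOM versus raw network bytes.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    response = page.goto("https://example.com")
    rendered = page.content().encode("utf-8")  # what 0.2.92 stores as the body
    raw = response.body()                      # what the removed line used
    print(len(rendered), len(raw))
    browser.close()
```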
scrapling/parser.py
CHANGED
@@ -474,7 +474,7 @@ class Adaptor(SelectorsGeneration):
 
     def css(self, selector: str, identifier: str = '',
             auto_match: bool = False, auto_save: bool = False, percentage: int = 0
-            ) -> Union['Adaptors[Adaptor]', List]:
+            ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']:
         """Search current tree with CSS3 selectors
 
         **Important:
@@ -517,7 +517,7 @@ class Adaptor(SelectorsGeneration):
 
     def xpath(self, selector: str, identifier: str = '',
               auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
-              ) -> Union['Adaptors[Adaptor]', List]:
+              ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']:
         """Search current tree with XPath selectors
 
         **Important:
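The widened annotation documents that `css`/`xpath` can return text results as well as element results. A hedged usage sketch follows; the `::text` pseudo-element and the `Fetcher().get()` call follow Scrapling's documented API, and the target site is only an example:

```python
# Hedged sketch of the two result shapes the new annotation covers.
from scrapling import Fetcher

page = Fetcher().get("https://quotes.toscrape.com/")
elements = page.css(".quote")           # element results -> Adaptors[Adaptor]
texts = page.css(".quote .text::text")  # text results -> TextHandlers[TextHandler]
print(type(elements).__name__, type(texts).__name__)
```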
{scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.91
+Version: 0.2.92
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -34,6 +34,7 @@ License-File: LICENSE
 Requires-Dist: requests>=2.3
 Requires-Dist: lxml>=4.5
 Requires-Dist: cssselect>=1.2
+Requires-Dist: click
 Requires-Dist: w3lib
 Requires-Dist: orjson>=3
 Requires-Dist: tldextract
@@ -211,52 +212,18 @@ Scrapling can find elements with more methods and it returns full element `Adapt
 > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.
 
 ## Installation
-Scrapling is a breeze to get started with
+Scrapling is a breeze to get started with; Starting from version 0.2.9, we require at least Python 3.9 to work.
 ```bash
 pip3 install scrapling
 ```
-
-<details><summary>Windows OS</summary>
-
-```bash
-camoufox fetch --browserforge
-```
-</details>
-<details><summary>MacOS</summary>
-
-```bash
-python3 -m camoufox fetch --browserforge
-```
-</details>
-<details><summary>Linux</summary>
-
+Then run this command to install browsers' dependencies needed to use Fetcher classes
 ```bash
-
-```
-On a fresh installation of Linux, you may also need the following Firefox dependencies:
-- Debian-based distros
-```bash
-sudo apt install -y libgtk-3-0 libx11-xcb1 libasound2
-```
-- Arch-based distros
-```bash
-sudo pacman -S gtk3 libx11 libxcb cairo libasound alsa-lib
-```
-</details>
-
-<small> See the official <a href="https://camoufox.com/python/installation/#download-the-browser">Camoufox documentation</a> for more info on installation</small>
-
-- If you are going to use the `PlayWrightFetcher` options, then install Playwright's Chromium browser with:
-```commandline
-playwright install chromium
-```
-- If you are going to use normal requests only with the `Fetcher` class then update the fingerprints files with:
-```commandline
-python -m browserforge update
+scrapling install
 ```
+If you have any installation issues, please open an issue.
 
 ## Fetching Websites
-Fetchers are
+Fetchers are interfaces built on top of other libraries with added features that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
 
 ### Features
 You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
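The rewritten "Fetching Websites" paragraph contrasts the fetcher flow with the older manual flow. A hedged sketch of both follows, using the documented `Adaptor` and `Fetcher` interfaces; `requests` appears only to stand in for "fetch the page as you wanted it":

```python
# Hedged sketch of the two flows the README paragraph describes.
import requests

from scrapling import Adaptor, Fetcher

url = "https://quotes.toscrape.com/"

# Manual flow: fetch the page yourself, then wrap the HTML in an Adaptor.
page = Adaptor(requests.get(url).text, url=url)

# Fetcher flow: one call performs the request and returns a queryable response.
page = Fetcher().get(url)
print(page.css(".quote .text::text")[0])
```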
{scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/RECORD
CHANGED
@@ -1,7 +1,8 @@
-scrapling/__init__.py,sha256=
+scrapling/__init__.py,sha256=0iEOX168f4gLFpReEUemMOhTske8AS2o0UQHJWXn-4o,500
+scrapling/cli.py,sha256=njPdJKmbLFHeWjtSiGEm9ALBdSyfUp0IaJvxQL5C31Q,1125
 scrapling/defaults.py,sha256=tJAOMB-PMd3aLZz3j_yr6haBxxaklAvWdS_hP-GFFdU,331
 scrapling/fetchers.py,sha256=K3MKBqKDOXItJNwxFY2fe1C21Vz6QSd91fFtN98Mpg4,35402
-scrapling/parser.py,sha256=
+scrapling/parser.py,sha256=sT1gh5pnbjpUzFt8K9DGD6x60zKQcAtzmyf8DgiNDCI,55266
 scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
 scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 scrapling/core/_types.py,sha256=OcsP1WeQEOlEVo9OzTrLQfgZZfXuJ0civVs31SynwGA,641
@@ -11,9 +12,9 @@ scrapling/core/storage_adaptors.py,sha256=l_ZYcdn1y69AcoPuRrPoaxqKysN62pMExrwJWY
 scrapling/core/translator.py,sha256=ojDmNi5pFZE6Ke-AiSsTilXiPRdR8yhX3o-uVGMkap8,5236
 scrapling/core/utils.py,sha256=03LzCDzmeK1TXPjIKVzHSUgSfhpe36XE8AwxlgxzJoU,3705
 scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
-scrapling/engines/camo.py,sha256=
+scrapling/engines/camo.py,sha256=wJRfaIU0w_hDSlrP2AdpjBU6NNEKw0wSnVbqUoxt1Gk,13682
 scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
-scrapling/engines/pw.py,sha256=
+scrapling/engines/pw.py,sha256=MCYE5rDx55D2VOIeUNLl44ROXnyFRfku_u2FOcXjqEQ,18534
 scrapling/engines/static.py,sha256=7SVEfeigCPfwC1ukx0zIFFe96Bo5fox6qOq2IWrP6P8,10319
 scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
 scrapling/engines/toolbelt/custom.py,sha256=d3qyeCg_qHm1RRE7yv5hyU9b17Y7YDPGBOVhEH1CAT0,12754
@@ -40,8 +41,9 @@ tests/fetchers/sync/test_playwright.py,sha256=5eZdPwk3JGeaO7GuExv_QsByLyWDE9joxn
 tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
 tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
+scrapling-0.2.92.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.92.dist-info/METADATA,sha256=2I-HK-xEkVFFyQBio8NAKR0eQEBB-dLHFuvb5eluCEQ,67415
+scrapling-0.2.92.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+scrapling-0.2.92.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
+scrapling-0.2.92.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.92.dist-info/RECORD,,
{scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/LICENSE
File without changes
{scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/WHEEL
File without changes
{scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/top_level.txt
File without changes