scrapling 0.2.91__py3-none-any.whl → 0.2.92__py3-none-any.whl

scrapling/__init__.py CHANGED
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
 from scrapling.parser import Adaptor, Adaptors
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.91"
+__version__ = "0.2.92"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
scrapling/cli.py ADDED
@@ -0,0 +1,37 @@
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+import click
+
+
+def get_package_dir():
+    return Path(os.path.dirname(__file__))
+
+
+def run_command(command, line):
+    print(f"Installing {line}...")
+    _ = subprocess.check_call(command, shell=True)
+    # I meant to not use try except here
+
+
+@click.command(help="Install all Scrapling's Fetchers dependencies")
+def install():
+    if not get_package_dir().joinpath(".scrapling_dependencies_installed").exists():
+        run_command([sys.executable, "-m", "playwright", "install", 'chromium'], 'Playwright browsers')
+        run_command([sys.executable, "-m", "playwright", "install-deps", 'chromium', 'firefox'], 'Playwright dependencies')
+        run_command([sys.executable, "-m", "camoufox", "fetch", '--browserforge'], 'Camoufox browser and databases')
+        # if no errors raised by above commands, then we add below file
+        get_package_dir().joinpath(".scrapling_dependencies_installed").touch()
+    else:
+        print('The dependencies are already installed')
+
+
+@click.group()
+def main():
+    pass
+
+
+# Adding commands
+main.add_command(install)
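
The new `cli.py` wires a single `install` command into a click group and uses a marker file to make the install idempotent. Below is a minimal sketch of exercising that group programmatically with click's test runner, assuming scrapling 0.2.92 is installed; `--help` is passed so nothing is actually downloaded, since a real run shells out to Playwright and Camoufox:

```python
# Sketch: drive the new click group without touching the shell.
# Passing --help avoids triggering the actual dependency downloads.
from click.testing import CliRunner

from scrapling.cli import main

runner = CliRunner()
result = runner.invoke(main, ["install", "--help"])
print(result.output)  # prints the command's help text from the decorator above
```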
scrapling/engines/camo.py CHANGED
@@ -89,7 +89,7 @@ class CamoufoxEngine:
 
         def handle_response(finished_response):
             nonlocal final_response
-            if finished_response.request.resource_type == "document":
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                 final_response = finished_response
 
         with Camoufox(
@@ -133,7 +133,6 @@ class CamoufoxEngine:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
-            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
@@ -142,15 +141,15 @@ class CamoufoxEngine:
             status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=final_response.url,
+                url=page.url,
                 text=page.content(),
-                body=response_bytes,
+                body=page.content().encode('utf-8'),
                 status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=final_response.all_headers(),
-                request_headers=final_response.request.all_headers(),
+                headers=first_response.all_headers(),
+                request_headers=first_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -169,7 +168,7 @@ class CamoufoxEngine:
 
         async def handle_response(finished_response):
             nonlocal final_response
-            if finished_response.request.resource_type == "document":
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                 final_response = finished_response
 
         async with AsyncCamoufox(
@@ -213,7 +212,6 @@ class CamoufoxEngine:
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
 
-            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
@@ -222,15 +220,15 @@ class CamoufoxEngine:
             status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=final_response.url,
+                url=page.url,
                 text=await page.content(),
-                body=response_bytes,
+                body=(await page.content()).encode('utf-8'),
                 status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await final_response.all_headers(),
-                request_headers=await final_response.request.all_headers(),
+                headers=await first_response.all_headers(),
+                request_headers=await first_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             await page.close()
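
The key behavioral change above is the extra `is_navigation_request()` guard: iframe documents also produce responses with `resource_type == "document"`, so filtering on the navigation flag keeps only the top-level document. Here is a minimal sketch of the same pattern using plain Playwright's sync API (the URL is illustrative; Camoufox drives a Firefox build through the same Playwright `Request` interface):

```python
# Sketch of the navigation-only response filter introduced in this diff.
from playwright.sync_api import sync_playwright

final_response = None

def handle_response(finished_response):
    global final_response
    # Iframe documents also report resource_type == "document";
    # is_navigation_request() narrows it to the top-level navigation.
    if (finished_response.request.resource_type == "document"
            and finished_response.request.is_navigation_request()):
        final_response = finished_response

with sync_playwright() as p:
    browser = p.firefox.launch()
    page = browser.new_page()
    page.on("response", handle_response)
    page.goto("https://example.com")  # illustrative URL
    browser.close()
```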
scrapling/engines/pw.py CHANGED
@@ -206,7 +206,7 @@ class PlaywrightEngine:
 
         def handle_response(finished_response: PlaywrightResponse):
             nonlocal final_response
-            if finished_response.request.resource_type == "document":
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                 final_response = finished_response
 
         with sync_playwright() as p:
@@ -252,7 +252,6 @@ class PlaywrightEngine:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
-            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
@@ -261,15 +260,15 @@ class PlaywrightEngine:
             status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=final_response.url,
+                url=page.url,
                 text=page.content(),
-                body=response_bytes,
+                body=page.content().encode('utf-8'),
                 status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=final_response.all_headers(),
-                request_headers=final_response.request.all_headers(),
+                headers=first_response.all_headers(),
+                request_headers=first_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -293,7 +292,7 @@ class PlaywrightEngine:
 
         async def handle_response(finished_response: PlaywrightResponse):
             nonlocal final_response
-            if finished_response.request.resource_type == "document":
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                 final_response = finished_response
 
         async with async_playwright() as p:
@@ -339,7 +338,6 @@ class PlaywrightEngine:
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
 
-            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
@@ -348,15 +346,15 @@ class PlaywrightEngine:
             status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=final_response.url,
+                url=page.url,
                 text=await page.content(),
-                body=response_bytes,
+                body=(await page.content()).encode('utf-8'),
                 status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await final_response.all_headers(),
-                request_headers=await final_response.request.all_headers(),
+                headers=await first_response.all_headers(),
+                request_headers=await first_response.request.all_headers(),
                 **self.adaptor_arguments
            )
            await page.close()
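
The removed `response_bytes` line changes what `body` means: instead of the raw bytes served over the network, it is now a UTF-8 encoding of the live DOM snapshot (`page.content()`), so `body` reflects the page after scripts ran. A short sketch of the difference, assuming Playwright with its browsers installed and an illustrative URL:

```python
# Sketch: network bytes vs. DOM snapshot (what 0.2.92's `body` now holds).
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    response = page.goto("https://example.com")  # illustrative URL
    raw_bytes = response.body()                  # bytes as served over the wire
    dom_bytes = page.content().encode('utf-8')   # serialized DOM after JS ran
    print(raw_bytes == dom_bytes)                # usually False on dynamic pages
    browser.close()
```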
scrapling/parser.py CHANGED
@@ -474,7 +474,7 @@ class Adaptor(SelectorsGeneration):
 
     def css(self, selector: str, identifier: str = '',
             auto_match: bool = False, auto_save: bool = False, percentage: int = 0
-            ) -> Union['Adaptors[Adaptor]', List]:
+            ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']:
         """Search current tree with CSS3 selectors
 
         **Important:
@@ -517,7 +517,7 @@ class Adaptor(SelectorsGeneration):
 
     def xpath(self, selector: str, identifier: str = '',
               auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
-              ) -> Union['Adaptors[Adaptor]', List]:
+              ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']:
         """Search current tree with XPath selectors
 
         **Important:
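
The widened annotations reflect that `css`/`xpath` can return text nodes, not just elements. A hedged sketch, assuming `Adaptor` accepts raw HTML via its `text` argument and supports the parsel-style `::text` pseudo-element as Scrapling's docs describe:

```python
# Sketch: selecting text nodes yields TextHandlers, matching the new annotation.
from scrapling import Adaptor

page = Adaptor(text='<html><body><p>Hello</p></body></html>')
elements = page.css('p')       # Adaptors[Adaptor]: element nodes
texts = page.css('p::text')    # TextHandlers[TextHandler]: text nodes
print(texts)
```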
{scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.91
+Version: 0.2.92
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -34,6 +34,7 @@ License-File: LICENSE
 Requires-Dist: requests>=2.3
 Requires-Dist: lxml>=4.5
 Requires-Dist: cssselect>=1.2
+Requires-Dist: click
 Requires-Dist: w3lib
 Requires-Dist: orjson>=3
 Requires-Dist: tldextract
@@ -211,52 +212,18 @@ Scrapling can find elements with more methods and it returns full element `Adapt
 > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.
 
 ## Installation
-Scrapling is a breeze to get started with - Starting from version 0.2.9, we require at least Python 3.9 to work.
+Scrapling is a breeze to get started with; Starting from version 0.2.9, we require at least Python 3.9 to work.
 ```bash
 pip3 install scrapling
 ```
-- For using the `StealthyFetcher`, go to the command line and download the browser with
-<details><summary>Windows OS</summary>
-
-```bash
-camoufox fetch --browserforge
-```
-</details>
-<details><summary>MacOS</summary>
-
-```bash
-python3 -m camoufox fetch --browserforge
-```
-</details>
-<details><summary>Linux</summary>
-
+Then run this command to install browsers' dependencies needed to use Fetcher classes
 ```bash
-python -m camoufox fetch --browserforge
-```
-On a fresh installation of Linux, you may also need the following Firefox dependencies:
-- Debian-based distros
-```bash
-sudo apt install -y libgtk-3-0 libx11-xcb1 libasound2
-```
-- Arch-based distros
-```bash
-sudo pacman -S gtk3 libx11 libxcb cairo libasound alsa-lib
-```
-</details>
-
-<small> See the official <a href="https://camoufox.com/python/installation/#download-the-browser">Camoufox documentation</a> for more info on installation</small>
-
-- If you are going to use the `PlayWrightFetcher` options, then install Playwright's Chromium browser with:
-```commandline
-playwright install chromium
-```
-- If you are going to use normal requests only with the `Fetcher` class then update the fingerprints files with:
-```commandline
-python -m browserforge update
+scrapling install
 ```
+If you have any installation issues, please open an issue.
 
 ## Fetching Websites
-Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+Fetchers are interfaces built on top of other libraries with added features that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
 
 ### Features
 You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
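
A hedged sketch of the single-request flow the rewritten "Fetching Websites" paragraph describes, following the usage pattern in Scrapling's README (the URL is illustrative):

```python
# Sketch: one call fetches the page and returns an Adaptor-backed Response.
from scrapling import Fetcher

page = Fetcher().get('https://example.com')  # illustrative URL
print(page.status)                # HTTP status from the underlying request
print(page.css('title::text'))    # query the parsed page immediately
```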
{scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/RECORD RENAMED
@@ -1,7 +1,8 @@
-scrapling/__init__.py,sha256=pfbhEm1kcriA9pFR3JUUFEE3v4_ykB35SYbeHKzFxHw,500
+scrapling/__init__.py,sha256=0iEOX168f4gLFpReEUemMOhTske8AS2o0UQHJWXn-4o,500
+scrapling/cli.py,sha256=njPdJKmbLFHeWjtSiGEm9ALBdSyfUp0IaJvxQL5C31Q,1125
 scrapling/defaults.py,sha256=tJAOMB-PMd3aLZz3j_yr6haBxxaklAvWdS_hP-GFFdU,331
 scrapling/fetchers.py,sha256=K3MKBqKDOXItJNwxFY2fe1C21Vz6QSd91fFtN98Mpg4,35402
-scrapling/parser.py,sha256=Fl9cdbR58GuoPbWN5hZI6ToPSl0_rQFXMskTdzpoxWs,55208
+scrapling/parser.py,sha256=sT1gh5pnbjpUzFt8K9DGD6x60zKQcAtzmyf8DgiNDCI,55266
 scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
 scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 scrapling/core/_types.py,sha256=OcsP1WeQEOlEVo9OzTrLQfgZZfXuJ0civVs31SynwGA,641
@@ -11,9 +12,9 @@ scrapling/core/storage_adaptors.py,sha256=l_ZYcdn1y69AcoPuRrPoaxqKysN62pMExrwJWY
 scrapling/core/translator.py,sha256=ojDmNi5pFZE6Ke-AiSsTilXiPRdR8yhX3o-uVGMkap8,5236
 scrapling/core/utils.py,sha256=03LzCDzmeK1TXPjIKVzHSUgSfhpe36XE8AwxlgxzJoU,3705
 scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
-scrapling/engines/camo.py,sha256=g12IVIPy4Uyp_jngtu8Qcvy7PSMHjURAHUGXdM58Kks,13778
+scrapling/engines/camo.py,sha256=wJRfaIU0w_hDSlrP2AdpjBU6NNEKw0wSnVbqUoxt1Gk,13682
 scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
-scrapling/engines/pw.py,sha256=Eq4_oQA5eX666chiNpXsBqhWONzleniyXjKdmCpXj_Y,18630
+scrapling/engines/pw.py,sha256=MCYE5rDx55D2VOIeUNLl44ROXnyFRfku_u2FOcXjqEQ,18534
 scrapling/engines/static.py,sha256=7SVEfeigCPfwC1ukx0zIFFe96Bo5fox6qOq2IWrP6P8,10319
 scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
 scrapling/engines/toolbelt/custom.py,sha256=d3qyeCg_qHm1RRE7yv5hyU9b17Y7YDPGBOVhEH1CAT0,12754
@@ -40,8 +41,9 @@ tests/fetchers/sync/test_playwright.py,sha256=5eZdPwk3JGeaO7GuExv_QsByLyWDE9joxn
 tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
 tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
-scrapling-0.2.91.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
-scrapling-0.2.91.dist-info/METADATA,sha256=ajc8n5Hjl--ZdGXwHxmfMEWyCMgbw1waZNovoPFxrUc,68339
-scrapling-0.2.91.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-scrapling-0.2.91.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
-scrapling-0.2.91.dist-info/RECORD,,
+scrapling-0.2.92.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.92.dist-info/METADATA,sha256=2I-HK-xEkVFFyQBio8NAKR0eQEBB-dLHFuvb5eluCEQ,67415
+scrapling-0.2.92.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+scrapling-0.2.92.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
+scrapling-0.2.92.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.92.dist-info/RECORD,,
scrapling-0.2.92.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+[console_scripts]
+scrapling = scrapling.cli:main
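
This new `entry_points.txt` is what makes the `scrapling` command available after installation: `scrapling = scrapling.cli:main` tells the installer to generate a console script that imports `scrapling.cli` and calls its `main` group. A sketch of resolving it at runtime (the `group=` selection on stdlib `importlib.metadata` requires Python 3.10+):

```python
# Sketch: resolve the console_scripts entry the same way an installer does.
from importlib.metadata import entry_points

eps = entry_points(group="console_scripts")              # Python 3.10+ API
scrapling_ep = next(ep for ep in eps if ep.name == "scrapling")
main = scrapling_ep.load()                               # -> scrapling.cli:main
```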