scrapling-0.2.91-py3-none-any.whl → scrapling-0.2.92-py3-none-any.whl
- scrapling/__init__.py +1 -1
- scrapling/cli.py +37 -0
- scrapling/engines/camo.py +10 -12
- scrapling/engines/pw.py +10 -12
- scrapling/parser.py +2 -2
- {scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/METADATA +7 -40
- {scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/RECORD +11 -9
- scrapling-0.2.92.dist-info/entry_points.txt +2 -0
- {scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/LICENSE +0 -0
- {scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/WHEEL +0 -0
- {scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
 from scrapling.parser import Adaptor, Adaptors

 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.91"
+__version__ = "0.2.92"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"

scrapling/cli.py
ADDED
@@ -0,0 +1,37 @@
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+import click
+
+
+def get_package_dir():
+    return Path(os.path.dirname(__file__))
+
+
+def run_command(command, line):
+    print(f"Installing {line}...")
+    _ = subprocess.check_call(command, shell=True)
+    # I meant to not use try except here
+
+
+@click.command(help="Install all Scrapling's Fetchers dependencies")
+def install():
+    if not get_package_dir().joinpath(".scrapling_dependencies_installed").exists():
+        run_command([sys.executable, "-m", "playwright", "install", 'chromium'], 'Playwright browsers')
+        run_command([sys.executable, "-m", "playwright", "install-deps", 'chromium', 'firefox'], 'Playwright dependencies')
+        run_command([sys.executable, "-m", "camoufox", "fetch", '--browserforge'], 'Camoufox browser and databases')
+        # if no errors raised by above commands, then we add below file
+        get_package_dir().joinpath(".scrapling_dependencies_installed").touch()
+    else:
+        print('The dependencies are already installed')
+
+
+@click.group()
+def main():
+    pass
+
+
+# Adding commands
+main.add_command(install)
scrapling/engines/camo.py
CHANGED
@@ -89,7 +89,7 @@ class CamoufoxEngine:

         def handle_response(finished_response):
             nonlocal final_response
-            if finished_response.request.resource_type == "document":
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                 final_response = finished_response

         with Camoufox(
@@ -133,7 +133,6 @@
             if self.network_idle:
                 page.wait_for_load_state('networkidle')

-            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
@@ -142,15 +141,15 @@
             status_text = final_response.status_text or StatusText.get(final_response.status)

             response = Response(
-                url=…
+                url=page.url,
                 text=page.content(),
-                body=…
+                body=page.content().encode('utf-8'),
                 status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=…
-                request_headers=…
+                headers=first_response.all_headers(),
+                request_headers=first_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -169,7 +168,7 @@

         async def handle_response(finished_response):
             nonlocal final_response
-            if finished_response.request.resource_type == "document":
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                 final_response = finished_response

         async with AsyncCamoufox(
@@ -213,7 +212,6 @@
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')

-            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
@@ -222,15 +220,15 @@
             status_text = final_response.status_text or StatusText.get(final_response.status)

             response = Response(
-                url=…
+                url=page.url,
                 text=await page.content(),
-                body=…
+                body=(await page.content()).encode('utf-8'),
                 status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await …
-                request_headers=await …
+                headers=await first_response.all_headers(),
+                request_headers=await first_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             await page.close()
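Editor's note: the behavioral change above is the extra `is_navigation_request()` check, so a response is only kept as the page's final response when its request is a document that actually drives a navigation. A rough standalone sketch of the same filter against plain Playwright rather than Camoufox (the URL is only a placeholder):

```python
# Standalone sketch of the tightened filter, using plain Playwright's sync API
# (Camoufox wraps the same API). Placeholder URL, not taken from the diff.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    captured = {}

    def handle_response(response):
        request = response.request
        # Keep only responses whose request is a document-type navigation request
        if request.resource_type == "document" and request.is_navigation_request():
            captured["final"] = response

    page.on("response", handle_response)
    page.goto("https://example.com")
    print(captured["final"].status, captured["final"].url)
    browser.close()
```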
scrapling/engines/pw.py
CHANGED
@@ -206,7 +206,7 @@ class PlaywrightEngine:

         def handle_response(finished_response: PlaywrightResponse):
             nonlocal final_response
-            if finished_response.request.resource_type == "document":
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                 final_response = finished_response

         with sync_playwright() as p:
@@ -252,7 +252,6 @@
             if self.network_idle:
                 page.wait_for_load_state('networkidle')

-            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
@@ -261,15 +260,15 @@
             status_text = final_response.status_text or StatusText.get(final_response.status)

             response = Response(
-                url=…
+                url=page.url,
                 text=page.content(),
-                body=…
+                body=page.content().encode('utf-8'),
                 status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=…
-                request_headers=…
+                headers=first_response.all_headers(),
+                request_headers=first_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -293,7 +292,7 @@

         async def handle_response(finished_response: PlaywrightResponse):
             nonlocal final_response
-            if finished_response.request.resource_type == "document":
+            if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
                 final_response = finished_response

         async with async_playwright() as p:
@@ -339,7 +338,6 @@
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')

-            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
             # In case we didn't catch a document type somehow
             final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
@@ -348,15 +346,15 @@
             status_text = final_response.status_text or StatusText.get(final_response.status)

             response = Response(
-                url=…
+                url=page.url,
                 text=await page.content(),
-                body=…
+                body=(await page.content()).encode('utf-8'),
                 status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await …
-                request_headers=await …
+                headers=await first_response.all_headers(),
+                request_headers=await first_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             await page.close()
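Editor's note: in both engines the removed `response_bytes` line means `Response.body` is no longer the raw network payload but the DOM serialized after the page rendered, so `text` and `body` now describe the same content. A rough illustration of the difference with plain Playwright (placeholder URL, not from the diff):

```python
# Illustration: raw network bytes vs the DOM serialized after scripts ran.
# The diff switches Response.body to the latter.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    page = p.chromium.launch().new_page()
    response = page.goto("https://example.com")
    network_body = response.body()                   # bytes as delivered over the wire
    rendered_body = page.content().encode("utf-8")   # current DOM after rendering
    print(len(network_body), len(rendered_body))
```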
scrapling/parser.py
CHANGED
@@ -474,7 +474,7 @@ class Adaptor(SelectorsGeneration):

     def css(self, selector: str, identifier: str = '',
             auto_match: bool = False, auto_save: bool = False, percentage: int = 0
-            ) -> Union['Adaptors[Adaptor]', List]:
+            ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']:
         """Search current tree with CSS3 selectors

         **Important:
@@ -517,7 +517,7 @@ class Adaptor(SelectorsGeneration):

     def xpath(self, selector: str, identifier: str = '',
              auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
-             ) -> Union['Adaptors[Adaptor]', List]:
+             ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']:
         """Search current tree with XPath selectors

         **Important:
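Editor's note: the widened return annotation reflects that `css`/`xpath` can return text handlers when the selector targets text rather than elements. A hedged usage sketch, assuming `Adaptor` accepts raw HTML via its `text` parameter and supports the `::text` pseudo-element as described in the project README:

```python
# Hedged sketch: element selection vs text selection, matching the widened
# return annotation. Assumes Adaptor(text=...) and the ::text pseudo-element.
from scrapling import Adaptor

page = Adaptor(text="<html><body><p>Hello</p></body></html>")
elements = page.css("p")       # -> Adaptors[Adaptor]
texts = page.css("p::text")    # -> TextHandlers[TextHandler]
print(elements, texts)
```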
{scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.91
+Version: 0.2.92
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -34,6 +34,7 @@ License-File: LICENSE
 Requires-Dist: requests>=2.3
 Requires-Dist: lxml>=4.5
 Requires-Dist: cssselect>=1.2
+Requires-Dist: click
 Requires-Dist: w3lib
 Requires-Dist: orjson>=3
 Requires-Dist: tldextract
@@ -211,52 +212,18 @@ Scrapling can find elements with more methods and it returns full element `Adapt
 > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.

 ## Installation
-Scrapling is a breeze to get started with
+Scrapling is a breeze to get started with; Starting from version 0.2.9, we require at least Python 3.9 to work.
 ```bash
 pip3 install scrapling
 ```
-
-<details><summary>Windows OS</summary>
-
-```bash
-camoufox fetch --browserforge
-```
-</details>
-<details><summary>MacOS</summary>
-
-```bash
-python3 -m camoufox fetch --browserforge
-```
-</details>
-<details><summary>Linux</summary>
-
+Then run this command to install browsers' dependencies needed to use Fetcher classes
 ```bash
-…
-```
-On a fresh installation of Linux, you may also need the following Firefox dependencies:
-- Debian-based distros
-```bash
-sudo apt install -y libgtk-3-0 libx11-xcb1 libasound2
-```
-- Arch-based distros
-```bash
-sudo pacman -S gtk3 libx11 libxcb cairo libasound alsa-lib
-```
-</details>
-
-<small> See the official <a href="https://camoufox.com/python/installation/#download-the-browser">Camoufox documentation</a> for more info on installation</small>
-
-- If you are going to use the `PlayWrightFetcher` options, then install Playwright's Chromium browser with:
-```commandline
-playwright install chromium
-```
-- If you are going to use normal requests only with the `Fetcher` class then update the fingerprints files with:
-```commandline
-python -m browserforge update
+scrapling install
 ```
+If you have any installation issues, please open an issue.

 ## Fetching Websites
-Fetchers are …
+Fetchers are interfaces built on top of other libraries with added features that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.

 ### Features
 You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
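Editor's note: as the new cli.py above shows, `scrapling install` records success by touching a marker file inside the package directory and skips reinstalling on later runs. A small sketch (not part of the release) for checking that marker, with the file name taken from cli.py:

```python
# Sketch: check whether `scrapling install` has already completed by looking
# for the marker file the installer touches.
from pathlib import Path

import scrapling

marker = Path(scrapling.__file__).parent / ".scrapling_dependencies_installed"
print("Fetcher dependencies installed:", marker.exists())
```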
{scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/RECORD
CHANGED
@@ -1,7 +1,8 @@
-scrapling/__init__.py,sha256=…
+scrapling/__init__.py,sha256=0iEOX168f4gLFpReEUemMOhTske8AS2o0UQHJWXn-4o,500
+scrapling/cli.py,sha256=njPdJKmbLFHeWjtSiGEm9ALBdSyfUp0IaJvxQL5C31Q,1125
 scrapling/defaults.py,sha256=tJAOMB-PMd3aLZz3j_yr6haBxxaklAvWdS_hP-GFFdU,331
 scrapling/fetchers.py,sha256=K3MKBqKDOXItJNwxFY2fe1C21Vz6QSd91fFtN98Mpg4,35402
-scrapling/parser.py,sha256=…
+scrapling/parser.py,sha256=sT1gh5pnbjpUzFt8K9DGD6x60zKQcAtzmyf8DgiNDCI,55266
 scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
 scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 scrapling/core/_types.py,sha256=OcsP1WeQEOlEVo9OzTrLQfgZZfXuJ0civVs31SynwGA,641
@@ -11,9 +12,9 @@ scrapling/core/storage_adaptors.py,sha256=l_ZYcdn1y69AcoPuRrPoaxqKysN62pMExrwJWY
 scrapling/core/translator.py,sha256=ojDmNi5pFZE6Ke-AiSsTilXiPRdR8yhX3o-uVGMkap8,5236
 scrapling/core/utils.py,sha256=03LzCDzmeK1TXPjIKVzHSUgSfhpe36XE8AwxlgxzJoU,3705
 scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
-scrapling/engines/camo.py,sha256=…
+scrapling/engines/camo.py,sha256=wJRfaIU0w_hDSlrP2AdpjBU6NNEKw0wSnVbqUoxt1Gk,13682
 scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
-scrapling/engines/pw.py,sha256=…
+scrapling/engines/pw.py,sha256=MCYE5rDx55D2VOIeUNLl44ROXnyFRfku_u2FOcXjqEQ,18534
 scrapling/engines/static.py,sha256=7SVEfeigCPfwC1ukx0zIFFe96Bo5fox6qOq2IWrP6P8,10319
 scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
 scrapling/engines/toolbelt/custom.py,sha256=d3qyeCg_qHm1RRE7yv5hyU9b17Y7YDPGBOVhEH1CAT0,12754
@@ -40,8 +41,9 @@ tests/fetchers/sync/test_playwright.py,sha256=5eZdPwk3JGeaO7GuExv_QsByLyWDE9joxn
 tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
 tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
-scrapling-0.2.…
-scrapling-0.2.…
-scrapling-0.2.…
-scrapling-0.2.…
-scrapling-0.2.…
+scrapling-0.2.92.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.92.dist-info/METADATA,sha256=2I-HK-xEkVFFyQBio8NAKR0eQEBB-dLHFuvb5eluCEQ,67415
+scrapling-0.2.92.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+scrapling-0.2.92.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
+scrapling-0.2.92.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.92.dist-info/RECORD,,
{scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/LICENSE
File without changes
{scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/WHEEL
File without changes
{scrapling-0.2.91.dist-info → scrapling-0.2.92.dist-info}/top_level.txt
File without changes