scrapling 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/engines/pw.py +15 -9
- scrapling/engines/toolbelt/fingerprints.py +1 -1
- scrapling/fetchers.py +7 -5
- {scrapling-0.2.5.dist-info → scrapling-0.2.6.dist-info}/METADATA +6 -3
- {scrapling-0.2.5.dist-info → scrapling-0.2.6.dist-info}/RECORD +9 -9
- {scrapling-0.2.5.dist-info → scrapling-0.2.6.dist-info}/LICENSE +0 -0
- {scrapling-0.2.5.dist-info → scrapling-0.2.6.dist-info}/WHEEL +0 -0
- {scrapling-0.2.5.dist-info → scrapling-0.2.6.dist-info}/top_level.txt +0 -0
    
        scrapling/__init__.py
    CHANGED
    
    | @@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors | |
| 4 4 | 
             
            from scrapling.core.custom_types import TextHandler, AttributesHandler
         | 
| 5 5 |  | 
| 6 6 | 
             
            __author__ = "Karim Shoair (karim.shoair@pm.me)"
         | 
| 7 | 
            -
            __version__ = "0.2.5" | 
| 7 | 
            +
            __version__ = "0.2.6"
         | 
| 8 8 | 
             
            __copyright__ = "Copyright (c) 2024 Karim Shoair"
         | 
| 9 9 |  | 
| 10 10 |  | 
    
        scrapling/engines/pw.py
    CHANGED
    
    | @@ -27,11 +27,12 @@ class PlaywrightEngine: | |
| 27 27 | 
             
                        page_action: Callable = do_nothing,
         | 
| 28 28 | 
             
                        wait_selector: Optional[str] = None,
         | 
| 29 29 | 
             
                        wait_selector_state: Optional[str] = 'attached',
         | 
| 30 | 
            -
                        stealth: bool = False,
         | 
| 31 | 
            -
                         | 
| 32 | 
            -
                         | 
| 30 | 
            +
                        stealth: Optional[bool] = False,
         | 
| 31 | 
            +
                        real_chrome: Optional[bool] = False,
         | 
| 32 | 
            +
                        hide_canvas: Optional[bool] = False,
         | 
| 33 | 
            +
                        disable_webgl: Optional[bool] = False,
         | 
| 33 34 | 
             
                        cdp_url: Optional[str] = None,
         | 
| 34 | 
            -
                        nstbrowser_mode: bool = False,
         | 
| 35 | 
            +
                        nstbrowser_mode: Optional[bool] = False,
         | 
| 35 36 | 
             
                        nstbrowser_config: Optional[Dict] = None,
         | 
| 36 37 | 
             
                        google_search: Optional[bool] = True,
         | 
| 37 38 | 
             
                        extra_headers: Optional[Dict[str, str]] = None,
         | 
| @@ -51,6 +52,7 @@ class PlaywrightEngine: | |
| 51 52 | 
             
                    :param wait_selector: Wait for a specific css selector to be in a specific state.
         | 
| 52 53 | 
             
                    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         | 
| 53 54 | 
             
                    :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
         | 
| 55 | 
            +
                    :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
         | 
| 54 56 | 
             
                    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
         | 
| 55 57 | 
             
                    :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
         | 
| 56 58 | 
             
                    :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
         | 
| @@ -67,6 +69,7 @@ class PlaywrightEngine: | |
| 67 69 | 
             
                    self.stealth = bool(stealth)
         | 
| 68 70 | 
             
                    self.hide_canvas = bool(hide_canvas)
         | 
| 69 71 | 
             
                    self.disable_webgl = bool(disable_webgl)
         | 
| 72 | 
            +
                    self.real_chrome = bool(real_chrome)
         | 
| 70 73 | 
             
                    self.google_search = bool(google_search)
         | 
| 71 74 | 
             
                    self.extra_headers = extra_headers or {}
         | 
| 72 75 | 
             
                    self.proxy = construct_proxy_dict(proxy)
         | 
| @@ -119,7 +122,8 @@ class PlaywrightEngine: | |
| 119 122 | 
             
                    :param url: Target url.
         | 
| 120 123 | 
             
                    :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         | 
| 121 124 | 
             
                    """
         | 
| 122 | 
            -
                    if not self.stealth:
         | 
| 125 | 
            +
                    if not self.stealth or self.real_chrome:
         | 
| 126 | 
            +
                        # Because rebrowser_playwright doesn't play well with real browsers
         | 
| 123 127 | 
             
                        from playwright.sync_api import sync_playwright
         | 
| 124 128 | 
             
                    else:
         | 
| 125 129 | 
             
                        from rebrowser_playwright.sync_api import sync_playwright
         | 
| @@ -130,8 +134,8 @@ class PlaywrightEngine: | |
| 130 134 | 
             
                            extra_headers = {}
         | 
| 131 135 | 
             
                            useragent = self.useragent
         | 
| 132 136 | 
             
                        else:
         | 
| 133 | 
            -
                            extra_headers =  | 
| 134 | 
            -
                            useragent =  | 
| 137 | 
            +
                            extra_headers = {}
         | 
| 138 | 
            +
                            useragent = generate_headers(browser_mode=True).get('User-Agent')
         | 
| 135 139 |  | 
| 136 140 | 
             
                        # Prepare the flags before diving
         | 
| 137 141 | 
             
                        flags = DEFAULT_STEALTH_FLAGS
         | 
| @@ -146,9 +150,11 @@ class PlaywrightEngine: | |
| 146 150 | 
             
                            browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
         | 
| 147 151 | 
             
                        else:
         | 
| 148 152 | 
             
                            if self.stealth:
         | 
| 149 | 
            -
                                browser = p.chromium.launch( | 
| 153 | 
            +
                                browser = p.chromium.launch(
         | 
| 154 | 
            +
                                    headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
         | 
| 155 | 
            +
                                )
         | 
| 150 156 | 
             
                            else:
         | 
| 151 | 
            -
                                browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'])
         | 
| 157 | 
            +
                                browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'], channel='chrome' if self.real_chrome else 'chromium')
         | 
| 152 158 |  | 
| 153 159 | 
             
                        # Creating the context
         | 
| 154 160 | 
             
                        if self.stealth:
         | 
| @@ -67,7 +67,7 @@ def generate_headers(browser_mode: bool = False) -> Dict: | |
| 67 67 | 
             
                    # So we don't raise any inconsistency red flags while websites fingerprinting us
         | 
| 68 68 | 
             
                    os_name = get_os_name()
         | 
| 69 69 | 
             
                    return HeaderGenerator(
         | 
| 70 | 
            -
                        browser=[Browser(name='chrome', min_version= | 
| 70 | 
            +
                        browser=[Browser(name='chrome', min_version=130)],
         | 
| 71 71 | 
             
                        os=os_name,  # None is ignored
         | 
| 72 72 | 
             
                        device='desktop'
         | 
| 73 73 | 
             
                    ).generate()
         | 
    
        scrapling/fetchers.py
    CHANGED
    
    | @@ -138,7 +138,7 @@ class PlayWrightFetcher(BaseFetcher): | |
| 138 138 | 
             
                            2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
         | 
| 139 139 | 
             
                            3) Using custom flags on launch to hide Playwright even more and make it faster.
         | 
| 140 140 | 
             
                            4) Generates real browser's headers of the same type and same user OS then append it to the request.
         | 
| 141 | 
            -
                    - Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
         | 
| 141 | 
            +
                    - Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
         | 
| 142 142 | 
             
                    - NSTBrowser's docker browserless option by passing the CDP URL and enabling `nstbrowser_mode` option.
         | 
| 143 143 |  | 
| 144 144 | 
             
                > Note that these are the main options with PlayWright but it can be mixed together.
         | 
| @@ -146,12 +146,12 @@ class PlayWrightFetcher(BaseFetcher): | |
| 146 146 | 
             
                def fetch(
         | 
| 147 147 | 
             
                        self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
         | 
| 148 148 | 
             
                        useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
         | 
| 149 | 
            -
                        page_action: Callable = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
         | 
| 150 | 
            -
                        hide_canvas: bool =  | 
| 149 | 
            +
                        page_action: Optional[Callable] = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
         | 
| 150 | 
            +
                        hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
         | 
| 151 151 | 
             
                        proxy: Optional[Union[str, Dict[str, str]]] = None,
         | 
| 152 | 
            -
                        stealth: bool = False,
         | 
| 152 | 
            +
                        stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
         | 
| 153 153 | 
             
                        cdp_url: Optional[str] = None,
         | 
| 154 | 
            -
                        nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
         | 
| 154 | 
            +
                        nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None,
         | 
| 155 155 | 
             
                ) -> Response:
         | 
| 156 156 | 
             
                    """Opens up a browser and do your request based on your chosen options below.
         | 
| 157 157 |  | 
| @@ -167,6 +167,7 @@ class PlayWrightFetcher(BaseFetcher): | |
| 167 167 | 
             
                    :param wait_selector: Wait for a specific css selector to be in a specific state.
         | 
| 168 168 | 
             
                    :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         | 
| 169 169 | 
             
                    :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
         | 
| 170 | 
            +
                    :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
         | 
| 170 171 | 
             
                    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
         | 
| 171 172 | 
             
                    :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
         | 
| 172 173 | 
             
                    :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
         | 
| @@ -184,6 +185,7 @@ class PlayWrightFetcher(BaseFetcher): | |
| 184 185 | 
             
                        cdp_url=cdp_url,
         | 
| 185 186 | 
             
                        headless=headless,
         | 
| 186 187 | 
             
                        useragent=useragent,
         | 
| 188 | 
            +
                        real_chrome=real_chrome,
         | 
| 187 189 | 
             
                        page_action=page_action,
         | 
| 188 190 | 
             
                        hide_canvas=hide_canvas,
         | 
| 189 191 | 
             
                        network_idle=network_idle,
         | 
| @@ -1,6 +1,6 @@ | |
| 1 1 | 
             
            Metadata-Version: 2.1
         | 
| 2 2 | 
             
            Name: scrapling
         | 
| 3 | 
            -
            Version: 0.2.5 | 
| 3 | 
            +
            Version: 0.2.6
         | 
| 4 4 | 
             
            Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It 
         | 
| 5 5 | 
             
            Home-page: https://github.com/D4Vinci/Scrapling
         | 
| 6 6 | 
             
            Author: Karim Shoair
         | 
| @@ -39,7 +39,7 @@ Requires-Dist: w3lib | |
| 39 39 | 
             
            Requires-Dist: orjson>=3
         | 
| 40 40 | 
             
            Requires-Dist: tldextract
         | 
| 41 41 | 
             
            Requires-Dist: httpx[brotli,zstd]
         | 
| 42 | 
            -
            Requires-Dist: playwright
         | 
| 42 | 
            +
            Requires-Dist: playwright==1.48
         | 
| 43 43 | 
             
            Requires-Dist: rebrowser-playwright
         | 
| 44 44 | 
             
            Requires-Dist: camoufox>=0.3.10
         | 
| 45 45 | 
             
            Requires-Dist: browserforge
         | 
| @@ -336,9 +336,11 @@ Using this Fetcher class, you can make requests with: | |
| 336 336 | 
             
                 * Mimics some of the real browsers' properties by injecting several JS files and using custom options.
         | 
| 337 337 | 
             
                 * Using custom flags on launch to hide Playwright even more and make it faster.
         | 
| 338 338 | 
             
                 * Generates real browser's headers of the same type and same user OS then append it to the request's headers.
         | 
| 339 | 
            -
              3) Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
         | 
| 339 | 
            +
              3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
         | 
| 340 340 | 
             
              4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.
         | 
| 341 341 |  | 
| 342 | 
            +
            > Hence using the `real_chrome` argument requires that you have chrome browser installed on your device
         | 
| 343 | 
            +
             | 
| 342 344 | 
             
            Add that to a lot of controlling/hiding options as you will see in the arguments list below.
         | 
| 343 345 |  | 
| 344 346 | 
             
            <details><summary><strong>Expand this for the complete list of arguments</strong></summary>
         | 
| @@ -360,6 +362,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments | |
| 360 362 | 
             
            |     hide_canvas     | Add random noise to canvas operations to prevent fingerprinting.                                                                                                                                                                                                                                                                                                                                                |    ✔️    |
         | 
| 361 363 | 
             
            |    disable_webgl    | Disables WebGL and WebGL 2.0 support entirely.                                                                                                                                                                                                                                                                                                                                                                  |    ✔️    |
         | 
| 362 364 | 
             
            |       stealth       | Enables stealth mode, always check the documentation to see what stealth mode does currently.                                                                                                                                                                                                                                                                                                                   |    ✔️    |
         | 
| 365 | 
            +
            |     real_chrome     | If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.                                                                                                                                                                                                                                                                            |    ✔️    |
         | 
| 363 366 | 
             
            |       cdp_url       | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.                                                                                                                                                                                                                                                                                           |    ✔️    |
         | 
| 364 367 | 
             
            |   nstbrowser_mode   | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.**                                                                                                                                                                                                                                                                                                      |    ✔️    |
         | 
| 365 368 | 
             
            |  nstbrowser_config  | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._                                                                                                                                                                                                                                                        |    ✔️    |
         | 
| @@ -1,6 +1,6 @@ | |
| 1 | 
            -
            scrapling/__init__.py,sha256= | 
| 1 | 
            +
            scrapling/__init__.py,sha256=NnIpEZcBGs5Pu2TjqPCacC7N6LN37SbnniBU1AhgdXs,435
         | 
| 2 2 | 
             
            scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
         | 
| 3 | 
            -
            scrapling/fetchers.py,sha256 | 
| 3 | 
            +
            scrapling/fetchers.py,sha256=-gc-Yo1MjF_4cdJ-5rxZqNC0owxFXTFoEBj08BFEYPs,16361
         | 
| 4 4 | 
             
            scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
         | 
| 5 5 | 
             
            scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
         | 
| 6 6 | 
             
            scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         | 
| @@ -13,11 +13,11 @@ scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792 | |
| 13 13 | 
             
            scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
         | 
| 14 14 | 
             
            scrapling/engines/camo.py,sha256=dXkdfFmf3M09RXAvaZ8CE5khsblC3Wd7_6jWfu8XO6I,7618
         | 
| 15 15 | 
             
            scrapling/engines/constants.py,sha256=jSDA6lgbvEIB8z2m2SFzCKkvFEZnp28Mondy2__FpkM,3721
         | 
| 16 | 
            -
            scrapling/engines/pw.py,sha256= | 
| 16 | 
            +
            scrapling/engines/pw.py,sha256=gMWJAZYpJbFK-GiyRrpVrMjyMqSSetE6hf8kmf0zR2o,12729
         | 
| 17 17 | 
             
            scrapling/engines/static.py,sha256=wzBsoOHPpN5JV1izQSSSarPBNWB-wo0BDWNFuin6ID8,7109
         | 
| 18 18 | 
             
            scrapling/engines/toolbelt/__init__.py,sha256=BbxfC0depVOV3i3BnBnyfjHtLcZrDbhz6c5rTRczZUc,383
         | 
| 19 19 | 
             
            scrapling/engines/toolbelt/custom.py,sha256=ELr3_FwUqNI27E98kz-50OA5a6hQQtoIYrZoLKsvUpM,12551
         | 
| 20 | 
            -
            scrapling/engines/toolbelt/fingerprints.py,sha256= | 
| 20 | 
            +
            scrapling/engines/toolbelt/fingerprints.py,sha256=T9HQejHzAnHsD5EIXvrYVC5siiG5q2gOOXVIIANmzMc,2917
         | 
| 21 21 | 
             
            scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
         | 
| 22 22 | 
             
            scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
         | 
| 23 23 | 
             
            scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
         | 
| @@ -35,8 +35,8 @@ tests/fetchers/test_utils.py,sha256=FPPJkBrqgYxdGeWwapH8Vj8zyfYVLiTE1qSLu8eBWik, | |
| 35 35 | 
             
            tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         | 
| 36 36 | 
             
            tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
         | 
| 37 37 | 
             
            tests/parser/test_general.py,sha256=qaiVzpvqESfdXYFat6QrpnMkevPYgCzIcTZK5FwdC0s,11783
         | 
| 38 | 
            -
            scrapling-0.2. | 
| 39 | 
            -
            scrapling-0.2. | 
| 40 | 
            -
            scrapling-0.2. | 
| 41 | 
            -
            scrapling-0.2. | 
| 42 | 
            -
            scrapling-0.2. | 
| 38 | 
            +
            scrapling-0.2.6.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
         | 
| 39 | 
            +
            scrapling-0.2.6.dist-info/METADATA,sha256=cFOu2nlkXDsjyjkIt9kDu1nKKvS14xYH2LT4_VNH5j0,65362
         | 
| 40 | 
            +
            scrapling-0.2.6.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
         | 
| 41 | 
            +
            scrapling-0.2.6.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
         | 
| 42 | 
            +
            scrapling-0.2.6.dist-info/RECORD,,
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         |