scrapling 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl
- scrapling/__init__.py +5 -4
- scrapling/core/_types.py +2 -3
- scrapling/core/custom_types.py +93 -11
- scrapling/core/storage_adaptors.py +9 -10
- scrapling/core/translator.py +6 -7
- scrapling/core/utils.py +35 -30
- scrapling/defaults.py +2 -1
- scrapling/engines/__init__.py +2 -2
- scrapling/engines/camo.py +96 -26
- scrapling/engines/constants.py +4 -4
- scrapling/engines/pw.py +166 -96
- scrapling/engines/static.py +94 -50
- scrapling/engines/toolbelt/__init__.py +6 -20
- scrapling/engines/toolbelt/custom.py +22 -23
- scrapling/engines/toolbelt/fingerprints.py +7 -7
- scrapling/engines/toolbelt/navigation.py +25 -12
- scrapling/fetchers.py +233 -17
- scrapling/parser.py +63 -28
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/METADATA +41 -25
- scrapling-0.2.9.dist-info/RECORD +47 -0
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +95 -0
- tests/fetchers/async/test_httpx.py +83 -0
- tests/fetchers/async/test_playwright.py +99 -0
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +68 -0
- tests/fetchers/sync/test_httpx.py +82 -0
- tests/fetchers/sync/test_playwright.py +87 -0
- tests/fetchers/test_utils.py +90 -122
- tests/parser/test_automatch.py +64 -9
- tests/parser/test_general.py +263 -219
- scrapling-0.2.7.dist-info/RECORD +0 -42
- tests/fetchers/test_camoufox.py +0 -64
- tests/fetchers/test_httpx.py +0 -67
- tests/fetchers/test_playwright.py +0 -76
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/LICENSE +0 -0
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/WHEEL +0 -0
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/top_level.txt +0 -0
scrapling/parser.py
CHANGED
@@ -1,22 +1,31 @@
+import inspect
 import os
 import re
-import inspect
 from difflib import SequenceMatcher
+from urllib.parse import urljoin

-from
-from
-from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
-from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
-from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden, is_jsonable
-from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
+from cssselect import SelectorError, SelectorSyntaxError
+from cssselect import parse as split_selectors
 from lxml import etree, html
-
+
+from scrapling.core._types import (Any, Callable, Dict, Generator, Iterable,
+                                   List, Optional, Pattern, SupportsIndex,
+                                   Tuple, Union)
+from scrapling.core.custom_types import (AttributesHandler, TextHandler,
+                                         TextHandlers)
+from scrapling.core.mixins import SelectorsGeneration
+from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
+                                             StorageSystemMixin, _StorageTools)
+from scrapling.core.translator import HTMLTranslator
+from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
+                                  is_jsonable, log)


 class Adaptor(SelectorsGeneration):
     __slots__ = (
-        'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
+        'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
         '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
+        '__keep_cdata', '__raw_body'
     )

     def __init__(
@@ -28,10 +37,10 @@ class Adaptor(SelectorsGeneration):
             huge_tree: bool = True,
             root: Optional[html.HtmlElement] = None,
             keep_comments: Optional[bool] = False,
+            keep_cdata: Optional[bool] = False,
             auto_match: Optional[bool] = True,
             storage: Any = SQLiteStorageSystem,
             storage_args: Optional[Dict] = None,
-            debug: Optional[bool] = True,
             **kwargs
     ):
         """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
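The two hunks above add a `keep_cdata` flag and drop the old `debug` flag from `Adaptor.__init__`. A minimal usage sketch of the new signature; the sample HTML string is made up purely for illustration:

```python
from scrapling.parser import Adaptor

# Illustrative markup only; any HTML string (or a bytes `body`) works here.
sample = "<html><body><p>Hello <![CDATA[raw data]]> world</p></body></html>"

# `keep_cdata` is new in 0.2.9; the old `debug` argument is gone.
page = Adaptor(text=sample, keep_comments=False, keep_cdata=True, auto_match=False)
print(page.css_first('p'))
```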
@@ -51,33 +60,36 @@ class Adaptor(SelectorsGeneration):
         :param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
             Don't use it unless you know what you are doing!
         :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
+        :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
         :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
             priority over all auto-match related arguments/functions in the class.
         :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
         :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
             If empty, default values will be used.
-        :param debug: Enable debug mode
         """
         if root is None and not body and text is None:
             raise ValueError("Adaptor class needs text, body, or root arguments to work")

         self.__text = None
+        self.__raw_body = ''
         if root is None:
             if text is None:
                 if not body or not isinstance(body, bytes):
                     raise TypeError(f"body argument must be valid and of type bytes, got {body.__class__}")

                 body = body.replace(b"\x00", b"").strip()
+                self.__raw_body = body.replace(b"\x00", b"").strip().decode()
             else:
                 if not isinstance(text, str):
                     raise TypeError(f"text argument must be of type str, got {text.__class__}")

                 body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
+                self.__raw_body = text.strip()

             # https://lxml.de/api/lxml.etree.HTMLParser-class.html
             parser = html.HTMLParser(
-                recover=True, remove_blank_text=True, remove_comments=(keep_comments
-                compact=True, huge_tree=huge_tree, default_doctype=True
+                recover=True, remove_blank_text=True, remove_comments=(not keep_comments), encoding=encoding,
+                compact=True, huge_tree=huge_tree, default_doctype=True, strip_cdata=(not keep_cdata),
             )
             self._root = etree.fromstring(body, parser=parser, base_url=url)
             if is_jsonable(text or body.decode()):
@@ -92,7 +104,6 @@ class Adaptor(SelectorsGeneration):

             self._root = root

-        setup_basic_logging(level='debug' if debug else 'info')
         self.__auto_match_enabled = auto_match

         if self.__auto_match_enabled:
@@ -103,7 +114,7 @@ class Adaptor(SelectorsGeneration):
             }

             if not hasattr(storage, '__wrapped__'):
-                raise ValueError("Storage class must be wrapped with
+                raise ValueError("Storage class must be wrapped with lru_cache decorator, see docs for info")

             if not issubclass(storage.__wrapped__, StorageSystemMixin):
                 raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")
@@ -111,13 +122,13 @@ class Adaptor(SelectorsGeneration):
             self._storage = storage(**storage_args)

         self.__keep_comments = keep_comments
+        self.__keep_cdata = keep_cdata
         self.__huge_tree_enabled = huge_tree
         self.encoding = encoding
         self.url = url
         # For selector stuff
         self.__attributes = None
         self.__tag = None
-        self.__debug = debug
         # No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
         self.__response_data = {
             key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
@@ -148,8 +159,8 @@ class Adaptor(SelectorsGeneration):
                 root=element,
                 text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
                 url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
-                keep_comments=
-                huge_tree=self.__huge_tree_enabled,
+                keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
+                huge_tree=self.__huge_tree_enabled,
                 **self.__response_data
             )
             return element
@@ -236,6 +247,10 @@ class Adaptor(SelectorsGeneration):

         return TextHandler(separator.join([s for s in _all_strings]))

+    def urljoin(self, relative_url: str) -> str:
+        """Join this Adaptor's url with a relative url to form an absolute full URL."""
+        return urljoin(self.url, relative_url)
+
     @property
     def attrib(self) -> AttributesHandler:
         """Get attributes of the element"""
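The new `urljoin` helper simply delegates to `urllib.parse.urljoin` with the page's own `url`, which is handy for resolving relative `href`/`src` values. A short sketch, assuming `page` is an `Adaptor`/`Response` built from `https://books.toscrape.com/` as in the README example further down in this diff:

```python
# The link's href is relative, e.g. 'catalogue/tipping-the-velvet_999/index.html'
link = page.css_first('h3 > a')
print(page.urljoin(link.attrib['href']))
# -> 'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html'
```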
@@ -248,7 +263,10 @@ class Adaptor(SelectorsGeneration):
         """Return the inner html code of the element"""
         return etree.tostring(self._root, encoding='unicode', method='html', with_tail=False)

-
+    @property
+    def body(self) -> str:
+        """Return raw HTML code of the element/page without any processing when possible or return `Adaptor.html_content`"""
+        return self.__raw_body or self.html_content

     def prettify(self) -> str:
         """Return a prettified version of the element's inner html-code"""
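The new `body` property exposes the raw markup that was passed in (captured in `__raw_body` during `__init__`) and falls back to the lxml-serialized `html_content` when the object was built from an element root. A small sketch, with a made-up byte string:

```python
from scrapling.parser import Adaptor

raw = b"<html><body><span>kept   as-is</span></body></html>"
page = Adaptor(body=raw, auto_match=False)

print(page.body)          # the original markup, decoded, without lxml re-serialization
print(page.html_content)  # the same document as re-serialized by lxml
```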
@@ -323,6 +341,16 @@ class Adaptor(SelectorsGeneration):

         return self.__convert_results(prev_element)

+    # For easy copy-paste from Scrapy/parsel code when needed :)
+    def get(self, default=None):
+        return self
+
+    def get_all(self):
+        return self
+
+    extract = get_all
+    extract_first = get
+
     def __str__(self) -> str:
         return self.html_content

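These no-op aliases let snippets written for Scrapy/parsel (`.get()`, `.extract()`, `.extract_first()`) be pasted onto a single `Adaptor` element without edits: on one element they just return the element itself, while the list variants on `Adaptors` (added further down in this diff) return the first item or the list. A quick sketch, assuming `page` is any parsed `Adaptor`:

```python
element = page.css_first('title')
assert element.get() is element            # aliases on a single element return the element itself
assert element.extract_first() is element
```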
@@ -385,10 +413,10 @@ class Adaptor(SelectorsGeneration):
         if score_table:
             highest_probability = max(score_table.keys())
             if score_table[highest_probability] and highest_probability >= percentage:
-
-
+                log.debug(f'Highest probability was {highest_probability}%')
+                log.debug('Top 5 best matching elements are: ')
                 for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
-
+                    log.debug(f'{percent} -> {self.__convert_results(score_table[percent])}')
                 if not adaptor_type:
                     return score_table[highest_probability]
                 return self.__convert_results(score_table[highest_probability])
@@ -514,7 +542,7 @@ class Adaptor(SelectorsGeneration):

         if selected_elements:
             if not self.__auto_match_enabled and auto_save:
-
+                log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")

             elif self.__auto_match_enabled and auto_save:
                 self.save(selected_elements[0], identifier or selector)
@@ -533,7 +561,7 @@ class Adaptor(SelectorsGeneration):
             return self.__convert_results(selected_elements)

         elif not self.__auto_match_enabled and auto_match:
-
+            log.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")

         return self.__convert_results(selected_elements)

@@ -737,8 +765,8 @@ class Adaptor(SelectorsGeneration):

             self._storage.save(element, identifier)
         else:
-
-            "Can't use Auto-match features
+            log.critical(
+                "Can't use Auto-match features while disabled globally, you have to start a new class instance."
             )

     def retrieve(self, identifier: str) -> Optional[Dict]:
@@ -751,8 +779,8 @@ class Adaptor(SelectorsGeneration):
         if self.__auto_match_enabled:
             return self._storage.retrieve(identifier)

-
-        "Can't use Auto-match features
+        log.critical(
+            "Can't use Auto-match features while disabled globally, you have to start a new class instance."
         )

     # Operations on text functions
@@ -1066,12 +1094,19 @@ class Adaptors(List[Adaptor]):
         ]
         return self.__class__(results) if results else results

+    # For easy copy-paste from Scrapy/parsel code when needed :)
     def get(self, default=None):
         """Returns the first item of the current list
         :param default: the default value to return if the current list is empty
         """
         return self[0] if len(self) > 0 else default

+    def extract(self):
+        return self
+
+    extract_first = get
+    get_all = extract
+
     @property
     def first(self):
         """Returns the first item of the current list or `None` if the list is empty"""
{scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/METADATA
CHANGED
@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.
-Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
+Version: 0.2.9
+Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
 Author-email: karim.shoair@pm.me
@@ -29,7 +29,7 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Typing :: Typed
-Requires-Python: >=3.
+Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests>=2.3
@@ -39,10 +39,9 @@ Requires-Dist: w3lib
 Requires-Dist: orjson>=3
 Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,zstd]
-Requires-Dist: playwright
-Requires-Dist: rebrowser-playwright
-Requires-Dist: camoufox>=0.
-Requires-Dist: browserforge
+Requires-Dist: playwright>=1.49.1
+Requires-Dist: rebrowser-playwright>=1.49.1
+Requires-Dist: camoufox[geoip]>=0.4.9

 # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
 [](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://pepy.tech/project/scrapling)
@@ -52,7 +51,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
 Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.

 ```python
->> from scrapling.
+>> from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
 # Fetch websites' source under the radar!
 >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
 >> print(page.status)
@@ -81,7 +80,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha

 ## Table of content
 * [Key Features](#key-features)
-* [Fetch websites as you prefer](#fetch-websites-as-you-prefer)
+* [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
 * [Adaptive Scraping](#adaptive-scraping)
 * [Performance](#performance)
 * [Developing Experience](#developing-experience)
@@ -122,7 +121,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha

 ## Key Features

-### Fetch websites as you prefer
+### Fetch websites as you prefer with async support
 - **HTTP requests**: Stealthy and fast HTTP requests with `Fetcher`
 - **Stealthy fetcher**: Annoying anti-bot protection? No problem! Scrapling can bypass almost all of them with `StealthyFetcher` with default configuration!
 - **Your preferred browser**: Use your real browser with CDP, [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless, PlayWright with stealth mode, or even vanilla PlayWright - All is possible with `PlayWrightFetcher`!
@@ -213,7 +212,7 @@ Scrapling can find elements with more methods and it returns full element `Adapt
 > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.

 ## Installation
-Scrapling is a breeze to get started with - Starting from version 0.2, we require at least Python 3.
+Scrapling is a breeze to get started with - Starting from version 0.2.9, we require at least Python 3.9 to work.
 ```bash
 pip3 install scrapling
 ```
@@ -258,47 +257,58 @@ python -m browserforge update
 ```

 ## Fetching Websites
-Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you
+Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.

 ### Features
-You might be
+You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
-
+All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.

 If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
 ```python
-from scrapling.
+from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
 ```
 then use it right away without initializing like:
 ```python
 page = StealthyFetcher.fetch('https://example.com')
 ```

-Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
 ### Fetcher
 This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.

-For all methods, you have `
+For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default. You can also set the number of retries with the argument `retries` for all methods and this will make httpx retry requests if it failed for any reason. The default number of retries for all `Fetcher` methods is 3.

 You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
 ```python
->> page = Fetcher().get('https://httpbin.org/get',
+>> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
 >> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
 >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
 >> page = Fetcher().delete('https://httpbin.org/delete')
 ```
+For Async requests, you will just replace the import like below:
+```python
+>> from scrapling import AsyncFetcher
+>> page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
+>> page = await AsyncFetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
+>> page = await AsyncFetcher().put('https://httpbin.org/put', data={'key': 'value'})
+>> page = await AsyncFetcher().delete('https://httpbin.org/delete')
+```
 ### StealthyFetcher
-This class is built on top of [Camoufox](https://github.com/daijro/camoufox)
+This class is built on top of [Camoufox](https://github.com/daijro/camoufox), bypassing most anti-bot protections by default. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
 ```python
 >> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection') # Running headless by default
 >> page.status == 200
 True
+>> page = await StealthyFetcher().async_fetch('https://www.browserscan.net/bot-detection') # the async version of fetch
+>> page.status == 200
+True
 ```
-> Note: all requests done by this fetcher
+> Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

 <details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>

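The hunk above documents the new `AsyncFetcher` and the `async_fetch` variants. A minimal sketch of driving the async API from a plain script; the target URL is just the httpbin example reused from the README:

```python
import asyncio

from scrapling import AsyncFetcher


async def main():
    # Same keyword arguments as the sync Fetcher methods shown above.
    page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
    print(page.status)


asyncio.run(main())
```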
@@ -314,7 +324,8 @@ True
 | page_action | Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again. | ✔️ |
 | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
 | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
-| allow_webgl |
+| allow_webgl | Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled. | ✔️ |
+| geoip | Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address. It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region. | ✔️ |
 | disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
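The table above adds the `geoip` option to `StealthyFetcher`. A hedged sketch of enabling it on its own; the table recommends pairing it with a proxy, but only the options listed above are confirmed by this diff, so the proxy wiring is left out:

```python
from scrapling import StealthyFetcher

# geoip=True derives timezone, locale, language, and WebRTC/geolocation spoofing from the exit IP.
page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection', geoip=True)
print(page.status)
```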
@@ -333,8 +344,11 @@ This class is built on top of [Playwright](https://playwright.dev/python/) which
 >> page = PlayWrightFetcher().fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
 >> page.css_first("#search a::attr(href)")
 'https://github.com/D4Vinci/Scrapling'
+>> page = await PlayWrightFetcher().async_fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # the async version of fetch
+>> page.css_first("#search a::attr(href)")
+'https://github.com/D4Vinci/Scrapling'
 ```
-> Note: all requests done by this fetcher
+> Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

 Using this Fetcher class, you can make requests with:
 1) Vanilla Playwright without any modifications other than the ones you chose.
@@ -346,7 +360,7 @@ Using this Fetcher class, you can make requests with:
 3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
 4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.

-> Hence using the `real_chrome` argument requires that you have
+> Hence using the `real_chrome` argument requires that you have Chrome browser installed on your device

 Add that to a lot of controlling/hiding options as you will see in the arguments list below.

@@ -369,7 +383,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
 | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
 | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
 | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
-| real_chrome | If you have
+| real_chrome | If you have Chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
 | locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
 | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
 | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
@@ -437,6 +451,9 @@ You can select elements by their text content in multiple ways, here's a full ex
 >>> page.find_by_text('Tipping the Velvet') # Find the first element whose text fully matches this text
 <data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>

+>>> page.urljoin(page.find_by_text('Tipping the Velvet').attrib['href']) # We use `page.urljoin` to return the full URL from the relative `href`
+'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html'
+
 >>> page.find_by_text('Tipping the Velvet', first_match=False) # Get all matches if there are more
 [<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>]

@@ -850,7 +867,6 @@ This project includes code adapted from:

 ## Known Issues
 - In the auto-matching save process, the unique properties of the first element from the selection results are the only ones that get saved. So if the selector you are using selects different elements on the page that are in different locations, auto-matching will probably return to you the first element only when you relocate it later. This doesn't include combined CSS selectors (Using commas to combine more than one selector for example) as these selectors get separated and each selector gets executed alone.
-- Currently, Scrapling is not compatible with async/await.

 ---
 <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
scrapling-0.2.9.dist-info/RECORD
ADDED
@@ -0,0 +1,47 @@
+scrapling/__init__.py,sha256=4adit4xM1Io6mBz-VnnSHcPCQxIYhvDmDVMhbXu8VF4,499
+scrapling/defaults.py,sha256=tJAOMB-PMd3aLZz3j_yr6haBxxaklAvWdS_hP-GFFdU,331
+scrapling/fetchers.py,sha256=I_N32DMjCzNCMmrkGYoX480x1Eh5Lka6cMJ-EcSfszk,35342
+scrapling/parser.py,sha256=NKwOsGR6TB7XC9lMkA418_DRWE6pyUqK0XtmTAA51ic,55215
+scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
+scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scrapling/core/_types.py,sha256=__HJ2JTk5vx5eg_7HAJmDjaHrMDIaoxNG8fadLLyKV8,566
+scrapling/core/custom_types.py,sha256=ZRzpoT6qQ4vU_ejhLXa7WYuYLGl5HwAjLPe01xdhuvM,10808
+scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
+scrapling/core/storage_adaptors.py,sha256=l_ZYcdn1y69AcoPuRrPoaxqKysN62pMExrwJWYdu5MA,6220
+scrapling/core/translator.py,sha256=ojDmNi5pFZE6Ke-AiSsTilXiPRdR8yhX3o-uVGMkap8,5236
+scrapling/core/utils.py,sha256=03LzCDzmeK1TXPjIKVzHSUgSfhpe36XE8AwxlgxzJoU,3705
+scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
+scrapling/engines/camo.py,sha256=L5jRNUgJSAY5hE8KCD-tz4SFrx7ZjowJoWpHrl7havI,12359
+scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
+scrapling/engines/pw.py,sha256=0vCDaodve_WcOdbGqBdyRwMECPZmQ0eGLQikh4WHKFc,17011
+scrapling/engines/static.py,sha256=7SVEfeigCPfwC1ukx0zIFFe96Bo5fox6qOq2IWrP6P8,10319
+scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
+scrapling/engines/toolbelt/custom.py,sha256=FbWTUC0Z8NTmTLFDiiCchs4W0_Q40lz2ONnhInRNuvA,12947
+scrapling/engines/toolbelt/fingerprints.py,sha256=ajEHdXHr7W4hw9KcNS7XlyxNBZu37p1bRj18TiICLzU,2929
+scrapling/engines/toolbelt/navigation.py,sha256=xEfZRJefuxOCGxQOSI2llS0du0Y2XmoIPdVGUSHOd7k,4567
+scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
+scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
+scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
+scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=3RP1AE_XZRvpupeV_i-WSNVqRxyUy0qd8rQV8j_4j3U,221
+scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
+scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
+scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
+tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
+tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
+tests/fetchers/test_utils.py,sha256=ANFu-4FFhtyGFGIwJksUO2M2tTTcKU2M_t6F2aav8lM,4967
+tests/fetchers/async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tests/fetchers/async/test_camoufox.py,sha256=BANJ0TVqEdsjkYlsyU-q_spfaMsqTLOBQU8LUDurL9I,3685
+tests/fetchers/async/test_httpx.py,sha256=6WgsvqV1-rYTjZ9na5x-wt49C3Ur9D99HXBFbewO0gc,3888
+tests/fetchers/async/test_playwright.py,sha256=zzSYnfRksjNep_YipTiYAB9eQaIo3fssKLrsGzXEakw,4068
+tests/fetchers/sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tests/fetchers/sync/test_camoufox.py,sha256=IcDXPAWSSJnYT6psDFKSbCeym5n7hCrMPYQEghaOX3A,3165
+tests/fetchers/sync/test_httpx.py,sha256=xItYWjnDOIswKJzua2tDq8Oy43nTeFl0O1bci7lzGmg,3615
+tests/fetchers/sync/test_playwright.py,sha256=5eZdPwk3JGeaO7GuExv_QsByLyWDE9joxnmprW0WO6Q,3780
+tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
+tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
+scrapling-0.2.9.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.9.dist-info/METADATA,sha256=Wg6lcRo_5LcyotrB1ZXagT5-gToAyRmtNKsq6TJoNk4,68382
+scrapling-0.2.9.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+scrapling-0.2.9.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.9.dist-info/RECORD,,
File without changes
tests/fetchers/async/test_camoufox.py
ADDED
@@ -0,0 +1,95 @@
+import pytest
+import pytest_httpbin
+
+from scrapling import StealthyFetcher
+
+
+@pytest_httpbin.use_class_based_httpbin
+@pytest.mark.asyncio
+class TestStealthyFetcher:
+    @pytest.fixture(scope="class")
+    def fetcher(self):
+        return StealthyFetcher(auto_match=False)
+
+    @pytest.fixture(scope="class")
+    def urls(self, httpbin):
+        url = httpbin.url
+        return {
+            'status_200': f'{url}/status/200',
+            'status_404': f'{url}/status/404',
+            'status_501': f'{url}/status/501',
+            'basic_url': f'{url}/get',
+            'html_url': f'{url}/html',
+            'delayed_url': f'{url}/delay/10',  # 10 Seconds delay response
+            'cookies_url': f"{url}/cookies/set/test/value"
+        }
+
+    async def test_basic_fetch(self, fetcher, urls):
+        """Test doing basic fetch request with multiple statuses"""
+        assert (await fetcher.async_fetch(urls['status_200'])).status == 200
+        assert (await fetcher.async_fetch(urls['status_404'])).status == 404
+        assert (await fetcher.async_fetch(urls['status_501'])).status == 501
+
+    async def test_networkidle(self, fetcher, urls):
+        """Test if waiting for `networkidle` make page does not finish loading or not"""
+        assert (await fetcher.async_fetch(urls['basic_url'], network_idle=True)).status == 200
+
+    async def test_blocking_resources(self, fetcher, urls):
+        """Test if blocking resources make page does not finish loading or not"""
+        assert (await fetcher.async_fetch(urls['basic_url'], block_images=True)).status == 200
+        assert (await fetcher.async_fetch(urls['basic_url'], disable_resources=True)).status == 200
+
+    async def test_waiting_selector(self, fetcher, urls):
+        """Test if waiting for a selector make page does not finish loading or not"""
+        assert (await fetcher.async_fetch(urls['html_url'], wait_selector='h1')).status == 200
+        assert (await fetcher.async_fetch(
+            urls['html_url'],
+            wait_selector='h1',
+            wait_selector_state='visible'
+        )).status == 200
+
+    async def test_cookies_loading(self, fetcher, urls):
+        """Test if cookies are set after the request"""
+        response = await fetcher.async_fetch(urls['cookies_url'])
+        assert response.cookies == {'test': 'value'}
+
+    async def test_automation(self, fetcher, urls):
+        """Test if automation break the code or not"""
+
+        async def scroll_page(page):
+            await page.mouse.wheel(10, 0)
+            await page.mouse.move(100, 400)
+            await page.mouse.up()
+            return page
+
+        assert (await fetcher.async_fetch(urls['html_url'], page_action=scroll_page)).status == 200
+
+    async def test_properties(self, fetcher, urls):
+        """Test if different arguments breaks the code or not"""
+        assert (await fetcher.async_fetch(
+            urls['html_url'],
+            block_webrtc=True,
+            allow_webgl=True
+        )).status == 200
+
+        assert (await fetcher.async_fetch(
+            urls['html_url'],
+            block_webrtc=False,
+            allow_webgl=True
+        )).status == 200
+
+        assert (await fetcher.async_fetch(
+            urls['html_url'],
+            block_webrtc=True,
+            allow_webgl=False
+        )).status == 200
+
+        assert (await fetcher.async_fetch(
+            urls['html_url'],
+            extra_headers={'ayo': ''},
+            os_randomize=True
+        )).status == 200
+
+    async def test_infinite_timeout(self, fetcher, urls):
+        """Test if infinite timeout breaks the code or not"""
+        assert (await fetcher.async_fetch(urls['delayed_url'], timeout=None)).status == 200