scrapling-0.2-py3-none-any.whl → scrapling-0.2.2-py3-none-any.whl
- scrapling/__init__.py +1 -1
- scrapling/core/utils.py +13 -1
- scrapling/defaults.py +6 -0
- scrapling/engines/camo.py +17 -10
- scrapling/engines/pw.py +9 -4
- scrapling/engines/static.py +11 -11
- scrapling/engines/toolbelt/__init__.py +1 -0
- scrapling/engines/toolbelt/custom.py +16 -31
- scrapling/engines/toolbelt/navigation.py +34 -0
- scrapling/fetchers.py +21 -13
- scrapling/parser.py +19 -22
- scrapling/py.typed +1 -0
- {scrapling-0.2.dist-info → scrapling-0.2.2.dist-info}/METADATA +58 -19
- {scrapling-0.2.dist-info → scrapling-0.2.2.dist-info}/RECORD +17 -15
- {scrapling-0.2.dist-info → scrapling-0.2.2.dist-info}/WHEEL +1 -1
- {scrapling-0.2.dist-info → scrapling-0.2.2.dist-info}/LICENSE +0 -0
- {scrapling-0.2.dist-info → scrapling-0.2.2.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
 from scrapling.core.custom_types import TextHandler, AttributesHandler
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2"
+__version__ = "0.2.2"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
scrapling/core/utils.py
CHANGED
@@ -4,8 +4,9 @@ from itertools import chain
 # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
 from functools import lru_cache as cache  # functools.cache is available on Python 3.9+ only so let's keep lru_cache
 
-from scrapling.core._types import Dict, Iterable, Any
+from scrapling.core._types import Dict, Iterable, Any, Union
 
+import orjson
 from lxml import html
 
 html_forbidden = {html.HtmlComment, }
@@ -18,6 +19,17 @@ logging.basicConfig(
 )
 
 
+def is_jsonable(content: Union[bytes, str]) -> bool:
+    if type(content) is bytes:
+        content = content.decode()
+
+    try:
+        _ = orjson.loads(content)
+        return True
+    except orjson.JSONDecodeError:
+        return False
+
+
 @cache(None, typed=True)
 def setup_basic_logging(level: str = 'debug'):
     levels = {
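A quick sketch of how the new helper behaves, assuming `orjson` is installed as the import above requires: it simply reports whether a string or bytes payload parses as JSON.

```python
from scrapling.core.utils import is_jsonable

print(is_jsonable(b'{"title": "Scrapling"}'))  # True - valid JSON bytes decode and parse
print(is_jsonable('<html></html>'))            # False - orjson raises JSONDecodeError internally
```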
scrapling/defaults.py
ADDED
@@ -0,0 +1,6 @@
+from .fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
+
+# If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
+Fetcher = Fetcher()
+StealthyFetcher = StealthyFetcher()
+PlayWrightFetcher = PlayWrightFetcher()
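Since `scrapling/defaults.py` rebinds each class name to a ready-made instance, you can fetch without instantiating anything yourself. A minimal sketch (the URLs are placeholders):

```python
from scrapling.defaults import Fetcher, StealthyFetcher

# These names are already instances (see defaults.py above), so call their methods directly
page = Fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
print(page.status)

# The stealthy variant exposes `fetch()` instead of the HTTP verbs
page = StealthyFetcher.fetch('https://quotes.toscrape.com/')
print(page.status)
```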
scrapling/engines/camo.py
CHANGED
@@ -7,6 +7,7 @@ from scrapling.engines.toolbelt import (
     get_os_name,
     intercept_route,
     check_type_validity,
+    construct_proxy_dict,
     generate_convincing_referer,
 )
 
@@ -18,7 +19,8 @@ class CamoufoxEngine:
         self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
         block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
         timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
-        wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+        wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+        proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, adaptor_arguments: Dict = None
     ):
         """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
 
@@ -33,12 +35,14 @@ class CamoufoxEngine:
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
         :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
         """
         self.headless = headless
@@ -48,7 +52,9 @@ class CamoufoxEngine:
         self.allow_webgl = bool(allow_webgl)
         self.network_idle = bool(network_idle)
         self.google_search = bool(google_search)
+        self.os_randomize = bool(os_randomize)
         self.extra_headers = extra_headers or {}
+        self.proxy = construct_proxy_dict(proxy)
         self.addons = addons or []
         self.humanize = humanize
         self.timeout = check_type_validity(timeout, [int, float], 30000)
@@ -66,17 +72,18 @@ class CamoufoxEngine:
         """Opens up the browser and do your request based on your chosen options.
 
         :param url: Target url.
-        :return: A Response object
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         with Camoufox(
-            block_images=self.block_images,  # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
-            os=get_os_name(),
-            block_webrtc=self.block_webrtc,
-            allow_webgl=self.allow_webgl,
+            proxy=self.proxy,
             addons=self.addons,
+            headless=self.headless,
             humanize=self.humanize,
-            i_know_what_im_doing=True,  # To turn warnings off with user configurations
+            i_know_what_im_doing=True,  # To turn warnings off with the user configurations
+            allow_webgl=self.allow_webgl,
+            block_webrtc=self.block_webrtc,
+            block_images=self.block_images,  # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
+            os=None if self.os_randomize else get_os_name(),
         ) as browser:
             page = browser.new_page()
             page.set_default_navigation_timeout(self.timeout)
@@ -107,14 +114,14 @@ class CamoufoxEngine:
             response = Response(
                 url=res.url,
                 text=page.content(),
+                body=res.body(),
                 status=res.status,
                 reason=res.status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
                 headers=res.all_headers(),
                 request_headers=res.request.all_headers(),
+                **self.adaptor_arguments
             )
             page.close()
 
scrapling/engines/pw.py
CHANGED
@@ -9,8 +9,9 @@ from scrapling.engines.toolbelt import (
     js_bypass_path,
     intercept_route,
     generate_headers,
-    check_type_validity,
     construct_cdp_url,
+    check_type_validity,
+    construct_proxy_dict,
     generate_convincing_referer,
 )
 
@@ -33,6 +34,7 @@ class PlaywrightEngine:
             nstbrowser_config: Optional[Dict] = None,
             google_search: Optional[bool] = True,
             extra_headers: Optional[Dict[str, str]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None,
             adaptor_arguments: Dict = None
     ):
         """An engine that utilizes PlayWright library, check the `PlayWrightFetcher` class for more documentation.
@@ -54,6 +56,7 @@ class PlaywrightEngine:
         :param nstbrowser_mode: Enables NSTBrowser mode, it have to be used with `cdp_url` argument or it will get completely ignored.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
         :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
         """
@@ -65,6 +68,7 @@ class PlaywrightEngine:
         self.disable_webgl = bool(disable_webgl)
         self.google_search = bool(google_search)
         self.extra_headers = extra_headers or {}
+        self.proxy = construct_proxy_dict(proxy)
         self.cdp_url = cdp_url
         self.useragent = useragent
         self.timeout = check_type_validity(timeout, [int, float], 30000)
@@ -112,7 +116,7 @@ class PlaywrightEngine:
         """Opens up the browser and do your request based on your chosen options.
 
         :param url: Target url.
-        :return: A Response object
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         if not self.stealth:
             from playwright.sync_api import sync_playwright
@@ -151,6 +155,7 @@ class PlaywrightEngine:
                 locale='en-US',
                 is_mobile=False,
                 has_touch=False,
+                proxy=self.proxy,
                 color_scheme='dark',  # Bypasses the 'prefersLightColor' check in creepjs
                 user_agent=useragent,
                 device_scale_factor=2,
@@ -219,14 +224,14 @@ class PlaywrightEngine:
             response = Response(
                 url=res.url,
                 text=page.content(),
+                body=res.body(),
                 status=res.status,
                 reason=res.status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
                 headers=res.all_headers(),
                 request_headers=res.request.all_headers(),
+                **self.adaptor_arguments
             )
             page.close()
             return response
scrapling/engines/static.py
CHANGED
@@ -48,19 +48,19 @@ class StaticEngine:
         """Takes httpx response and generates `Response` object from it.
 
         :param response: httpx response object
-        :return: A Response object
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         return Response(
             url=str(response.url),
             text=response.text,
+            body=response.content,
             status=response.status_code,
             reason=response.reason_phrase,
             encoding=response.encoding or 'utf-8',
             cookies=dict(response.cookies),
             headers=dict(response.headers),
             request_headers=dict(response.request.headers),
+            **self.adaptor_arguments
         )
 
     def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
@@ -69,9 +69,9 @@ class StaticEngine:
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
-        :return: A Response object
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.
+        headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
         request = httpx.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
         return self._prepare_response(request)
 
@@ -81,9 +81,9 @@ class StaticEngine:
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
-        :return: A Response object
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.
+        headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
         request = httpx.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
         return self._prepare_response(request)
 
@@ -93,9 +93,9 @@ class StaticEngine:
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
-        :return: A Response object
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-        headers = self._headers_job(kwargs.
+        headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
         request = httpx.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
         return self._prepare_response(request)
 
@@ -105,8 +105,8 @@ class StaticEngine:
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
-        :return: A Response object
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.
+        headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
         request = httpx.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
         return self._prepare_response(request)
 
scrapling/engines/toolbelt/custom.py
CHANGED
@@ -3,49 +3,34 @@ Functions related to custom types or type checking
 """
 import inspect
 import logging
-from dataclasses import dataclass, field
 
 from scrapling.core.utils import setup_basic_logging
 from scrapling.parser import Adaptor, SQLiteStorageSystem
 from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
 
 
-class Response:
+class Response(Adaptor):
     """This class is returned by all engines as a way to unify response type between different libraries."""
-
-    text: str
-    def
-        automatch_domain = self.adaptor_arguments.pop('automatch_domain', None)
-        if self.text:
-            # For playwright that will be the response after all JS executed
-            return Adaptor(text=self.text, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
-        elif self.content:
-            # For playwright, that's after all JS is loaded but not all of them executed, because playwright doesn't offer something like page.content()
-            # To get response Bytes after the load states
-            # Reference: https://playwright.dev/python/docs/api/class-page
-            return Adaptor(body=self.content, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
-        return None
-
-    def __repr__(self):
-        return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
+
+    def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict, encoding: str = 'utf-8', **adaptor_arguments: Dict):
+        automatch_domain = adaptor_arguments.pop('automatch_domain', None)
+        self.status = status
+        self.reason = reason
+        self.cookies = cookies
+        self.headers = headers
+        self.request_headers = request_headers
+        super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
+        # For back-ward compatibility
+        self.adaptor = self
+
+    # def __repr__(self):
+    #     return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
 
 
 class BaseFetcher:
     def __init__(
         self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
-        storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] =
+        storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = False,
         automatch_domain: Optional[str] = None,
     ):
         """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
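With `Response` now subclassing `Adaptor`, the object a fetcher returns carries both the HTTP metadata and the parsing API; the old `.adaptor` attribute is kept only for backward compatibility. A small sketch of that unification (URL and selector are placeholders):

```python
from scrapling import Fetcher

page = Fetcher().get('https://quotes.toscrape.com/')

# HTTP metadata added by Response...
print(page.status, page.reason, page.headers.get('content-type'))

# ...and Adaptor's selection methods, all on the same object
print(page.css_first('.quote .text::text'))

# Backward compatibility: `page.adaptor` is now just `page` itself
assert page.adaptor is page
```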
scrapling/engines/toolbelt/navigation.py
CHANGED
@@ -25,6 +25,40 @@ def intercept_route(route: Route) -> Union[Route, None]:
     return route.continue_()
 
 
+def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict, None]:
+    """Validate a proxy and return it in the acceptable format for Playwright
+    Reference: https://playwright.dev/python/docs/network#http-proxy
+
+    :param proxy_string: A string or a dictionary representation of the proxy.
+    :return:
+    """
+    if proxy_string:
+        if isinstance(proxy_string, str):
+            proxy = urlparse(proxy_string)
+            try:
+                return {
+                    'server': f'{proxy.scheme}://{proxy.hostname}:{proxy.port}',
+                    'username': proxy.username or '',
+                    'password': proxy.password or '',
+                }
+            except ValueError:
+                # Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
+                raise TypeError(f'The proxy argument\'s string is in invalid format!')
+
+        elif isinstance(proxy_string, dict):
+            valid_keys = ('server', 'username', 'password', )
+            if all(key in valid_keys for key in proxy_string.keys()) and not any(key not in valid_keys for key in proxy_string.keys()):
+                return proxy_string
+            else:
+                raise TypeError(f'A proxy dictionary must have only these keys: {valid_keys}')
+
+        else:
+            raise TypeError(f'Invalid type of proxy ({type(proxy_string)}), the proxy argument must be a string or a dictionary!')
+
+    # The default value for proxy in Playwright's source is `None`
+    return None
+
+
 def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
     """Takes a CDP URL, reconstruct it to check it's valid, then adds encoded parameters if exists
 
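Based on the helper above (importable from `scrapling.engines.toolbelt`, as the engine imports show), both accepted proxy formats normalize to Playwright's dictionary shape. A short sketch, with host, port, and credentials as placeholders:

```python
from scrapling.engines.toolbelt import construct_proxy_dict

# A proxy URL string is parsed with urllib and rebuilt as a Playwright proxy dict
print(construct_proxy_dict('http://username:password@localhost:8030'))
# {'server': 'http://localhost:8030', 'username': 'username', 'password': 'password'}

# A dictionary passes through untouched, but only these three keys are allowed
print(construct_proxy_dict({'server': 'http://localhost:8030', 'username': '', 'password': ''}))

# An empty value returns None (Playwright's own default); any other type raises TypeError
print(construct_proxy_dict(None))  # None
```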
scrapling/fetchers.py
CHANGED
@@ -17,7 +17,7 @@ class Fetcher(BaseFetcher):
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
-        :return: A Response object
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).get(url, stealthy_headers, **kwargs)
         return response_object
@@ -30,7 +30,7 @@ class Fetcher(BaseFetcher):
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
-        :return: A Response object
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).post(url, stealthy_headers, **kwargs)
         return response_object
@@ -43,7 +43,7 @@ class Fetcher(BaseFetcher):
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
-        :return: A Response object
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, stealthy_headers, **kwargs)
         return response_object
@@ -56,7 +56,7 @@ class Fetcher(BaseFetcher):
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
-        :return: A Response object
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).delete(url, stealthy_headers, **kwargs)
         return response_object
@@ -72,7 +72,8 @@ class StealthyFetcher(BaseFetcher):
         self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
         block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
         timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
-        wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None
+        wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
+        os_randomize: Optional[bool] = None
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
@@ -88,29 +89,33 @@ class StealthyFetcher(BaseFetcher):
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
         :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
-        :
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         engine = CamoufoxEngine(
+            proxy=proxy,
+            addons=addons,
             timeout=timeout,
             headless=headless,
-            page_action=page_action,
-            block_images=block_images,
-            block_webrtc=block_webrtc,
-            addons=addons,
             humanize=humanize,
             allow_webgl=allow_webgl,
+            page_action=page_action,
            network_idle=network_idle,
+            block_images=block_images,
+            block_webrtc=block_webrtc,
+            os_randomize=os_randomize,
             wait_selector=wait_selector,
-            wait_selector_state=wait_selector_state,
             google_search=google_search,
             extra_headers=extra_headers,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
             adaptor_arguments=self.adaptor_arguments,
         )
         return engine.fetch(url)
@@ -136,6 +141,7 @@ class PlayWrightFetcher(BaseFetcher):
             useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
             page_action: Callable = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
             hide_canvas: bool = True, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
+            proxy: Optional[Union[str, Dict[str, str]]] = None,
             stealth: bool = False,
             cdp_url: Optional[str] = None,
             nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
@@ -157,12 +163,14 @@ class PlayWrightFetcher(BaseFetcher):
         :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
         :param nstbrowser_mode: Enables NSTBrowser mode, it have to be used with `cdp_url` argument or it will get completely ignored.
         :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
-        :return: A Response object
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         engine = PlaywrightEngine(
+            proxy=proxy,
             timeout=timeout,
             stealth=stealth,
             cdp_url=cdp_url,
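A usage sketch of the fetcher arguments introduced above; the target URL and proxy endpoint are placeholders:

```python
from scrapling import StealthyFetcher, PlayWrightFetcher

# `proxy` takes a URL string or a dict limited to 'server', 'username', and 'password';
# `os_randomize` makes Camoufox pick random OS fingerprints instead of matching the host OS.
page = StealthyFetcher().fetch(
    'https://example.com',
    proxy='http://username:password@localhost:8030',
    os_randomize=True,
)
print(page.status)

# PlayWrightFetcher gained the same `proxy` argument
page = PlayWrightFetcher().fetch(
    'https://example.com',
    proxy={'server': 'http://localhost:8030', 'username': '', 'password': ''},
)
print(page.status)
```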
scrapling/parser.py
CHANGED
@@ -7,7 +7,7 @@ from scrapling.core.translator import HTMLTranslator
 from scrapling.core.mixins import SelectorsGeneration
 from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
 from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
-from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
+from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden, is_jsonable
 from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
 from lxml import etree, html
 from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
@@ -32,6 +32,7 @@ class Adaptor(SelectorsGeneration):
             storage: Any = SQLiteStorageSystem,
             storage_args: Optional[Dict] = None,
             debug: Optional[bool] = True,
+            **kwargs
     ):
         """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
         with expressions in CSS, XPath, or with simply text. Check the docs for more info.
@@ -60,6 +61,7 @@ class Adaptor(SelectorsGeneration):
         if root is None and not body and text is None:
             raise ValueError("Adaptor class needs text, body, or root arguments to work")
 
+        self.__text = None
         if root is None:
             if text is None:
                 if not body or not isinstance(body, bytes):
@@ -72,12 +74,14 @@ class Adaptor(SelectorsGeneration):
 
                 body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
 
+            # https://lxml.de/api/lxml.etree.HTMLParser-class.html
             parser = html.HTMLParser(
-                # https://lxml.de/api/lxml.etree.HTMLParser-class.html
                 recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
                 compact=True, huge_tree=huge_tree, default_doctype=True
             )
             self._root = etree.fromstring(body, parser=parser, base_url=url)
+            if is_jsonable(text or body.decode()):
+                self.__text = TextHandler(text or body.decode())
 
         else:
             # All html types inherits from HtmlMixin so this to check for all at once
@@ -112,9 +116,12 @@ class Adaptor(SelectorsGeneration):
         self.url = url
         # For selector stuff
         self.__attributes = None
-        self.__text = None
         self.__tag = None
         self.__debug = debug
+        # No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
+        self.__response_data = {
+            key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
+        } if hasattr(self, 'status') else {}
 
     # Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
     @staticmethod
@@ -136,10 +143,14 @@ class Adaptor(SelectorsGeneration):
             return TextHandler(str(element))
         else:
             if issubclass(type(element), html.HtmlMixin):
+
                 return self.__class__(
-                    root=element,
+                    root=element,
+                    text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
+                    url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
                     keep_comments=True,  # if the comments are already removed in initialization, no need to try to delete them in sub-elements
-                    huge_tree=self.__huge_tree_enabled, debug=self.__debug
+                    huge_tree=self.__huge_tree_enabled, debug=self.__debug,
+                    **self.__response_data
                 )
             return element
 
@@ -185,23 +196,9 @@ class Adaptor(SelectorsGeneration):
     def text(self) -> TextHandler:
         """Get text content of the element"""
         if not self.__text:
-            # Escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
-            # This issue is present in parsel/scrapy as well so no need to repeat it here so the user can run regex on the full text.
-            code = self.html_content
-            parser = html.HTMLParser(
-                recover=True, remove_blank_text=True, remove_comments=True, encoding=self.encoding,
-                compact=True, huge_tree=self.__huge_tree_enabled, default_doctype=True
-            )
-            fragment_root = html.fragment_fromstring(code, parser=parser)
-            self.__text = TextHandler(fragment_root.text)
-            else:
-                self.__text = TextHandler(self._root.text)
-        else:
-            # If user already chose to not keep comments then all is good
-            self.__text = TextHandler(self._root.text)
+            # If you want to escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
+            # before extracting text then keep `keep_comments` set to False while initializing the first class
+            self.__text = TextHandler(self._root.text)
         return self.__text
 
     def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:
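One practical effect of the `is_jsonable` hook above: when the fetched body is valid JSON, the raw text is kept on the `Adaptor`, so `.text` returns the full JSON payload rather than lxml-extracted text. A hedged sketch, assuming the endpoint returns a JSON body (the URL is only an illustration):

```python
import json
from scrapling import Fetcher

page = Fetcher().get('https://httpbin.org/json')

# Because the body parsed as JSON, `.text` still holds the original payload
data = json.loads(str(page.text))
print(page.status, type(data))
```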
scrapling/py.typed
ADDED
@@ -0,0 +1 @@
+
{scrapling-0.2.dist-info → scrapling-0.2.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2
+Version: 0.2.2
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -41,7 +41,7 @@ Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,zstd]
 Requires-Dist: playwright
 Requires-Dist: rebrowser-playwright
-Requires-Dist: camoufox >=0.3.
+Requires-Dist: camoufox >=0.3.9
 Requires-Dist: browserforge
 
 # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
@@ -52,17 +52,33 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
 Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.
 
 ```python
->> from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
+>> from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
 # Fetch websites' source under the radar!
->>
->> print(
+>> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
+>> print(page.status)
 200
->> page = fetcher.adaptor
 >> products = page.css('.product', auto_save=True)  # Scrape data that survives website design changes!
 >> # Later, if the website structure changes, pass `auto_match=True`
 >> products = page.css('.product', auto_match=True)  # and Scrapling still finds them!
 ```
 
+# Sponsors
+
+[Evomi](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling) is your Swiss Quality Proxy Provider, starting at **$0.49/GB**
+
+- 👩💻 **$0.49 per GB Residential Proxies**: Our price is unbeatable
+- 👩💻 **24/7 Expert Support**: We will join your Slack Channel
+- 🌍 **Global Presence**: Available in 150+ Countries
+- ⚡ **Low Latency**
+- 🔒 **Swiss Quality and Privacy**
+- 🎁 **Free Trial**
+- 🛡️ **99.9% Uptime**
+- 🤝 **Special IP Pool selection**: Optimize for fast, quality or quantity of ips
+- 🔧 **Easy Integration**: Compatible with most software and programming languages
+
+[](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
+---
+
 ## Table of content
 * [Key Features](#key-features)
   * [Fetch websites as you prefer](#fetch-websites-as-you-prefer)
@@ -95,7 +111,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
   * [Can Scrapling replace code built on top of BeautifulSoup4?](#can-scrapling-replace-code-built-on-top-of-beautifulsoup4)
   * [Can Scrapling replace code built on top of AutoScraper?](#can-scrapling-replace-code-built-on-top-of-autoscraper)
   * [Is Scrapling thread-safe?](#is-scrapling-thread-safe)
-* [Sponsors](#sponsors)
+* [More Sponsors!](#more-sponsors)
 * [Contributing](#contributing)
 * [Disclaimer for Scrapling Project](#disclaimer-for-scrapling-project)
 * [License](#license)
@@ -136,7 +152,7 @@ from scrapling import Fetcher
 fetcher = Fetcher(auto_match=False)
 
 # Fetch a web page and create an Adaptor instance
-page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
+page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
 # Get all strings in the full page
 page.get_all_text(ignore_tags=('script', 'style'))
 
@@ -241,11 +257,22 @@ python -m browserforge update
 ```
 
 ## Fetching Websites Features
-All fetcher-type classes are imported in the same way
+You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
 And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
+
+If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
+```python
+from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
+```
+then use it right away without initializing like:
+```python
+page = StealthyFetcher.fetch('https://example.com')
+```
+
+Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
 ### Fetcher
@@ -265,6 +292,8 @@ This class is built on top of [Camoufox](https://github.com/daijro/camoufox) whi
 >> page.status == 200
 True
 ```
+> Note: all requests done by this fetcher is waiting by default for all JS to be fully loaded and executed so you don't have to :)
+
 <details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>
 
 | Argument | Description | Optional |
@@ -283,6 +312,8 @@ True
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
 | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
+| proxy | The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only. | ✔️ |
+| os_randomize | If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS. | ✔️ |
 | wait_selector_state | The state to wait for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
 
 </details>
@@ -293,9 +324,11 @@ This list isn't final so expect a lot more additions and flexibility to be added
 This class is built on top of [Playwright](https://playwright.dev/python/) which currently provides 4 main run options but they can be mixed as you want.
 ```python
 >> page = PlayWrightFetcher().fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True)  # Vanilla Playwright option
->> page.
+>> page.css_first("#search a::attr(href)")
 'https://github.com/D4Vinci/Scrapling'
 ```
+> Note: all requests done by this fetcher is waiting by default for all JS to be fully loaded and executed so you don't have to :)
+
 Using this Fetcher class, you can make requests with:
 1) Vanilla Playwright without any modifications other than the ones you chose.
 2) Stealthy Playwright with the stealth mode I wrote for it. It's still a WIP but it bypasses many online tests like [Sannysoft's](https://bot.sannysoft.com/).</br> Some of the things this fetcher's stealth mode does include:
@@ -323,6 +356,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
 | wait_selector_state | The state to wait for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
 | google_search | Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name. | ✔️ |
 | extra_headers | A dictionary of extra headers to add to the request. The referer set by the `google_search` argument takes priority over the referer set here if used together. | ✔️ |
+| proxy | The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only. | ✔️ |
 | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
 | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
 | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
@@ -387,7 +421,7 @@ You can search for a specific ancestor of an element that satisfies a function,
 ### Content-based Selection & Finding Similar Elements
 You can select elements by their text content in multiple ways, here's a full example on another website:
 ```python
->>> page = Fetcher().get('https://books.toscrape.com/index.html')
+>>> page = Fetcher().get('https://books.toscrape.com/index.html')
 
 >>> page.find_by_text('Tipping the Velvet')  # Find the first element whose text fully matches this text
 <data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>
@@ -507,11 +541,11 @@ Now let's test the same selector in both versions
 >> old_url = "https://web.archive.org/web/20100102003420/http://stackoverflow.com/"
 >> new_url = "https://stackoverflow.com/"
 >>
->> page = Fetcher(automatch_domain='stackoverflow.com').get(old_url, timeout=30)
+>> page = Fetcher(automatch_domain='stackoverflow.com').get(old_url, timeout=30)
 >> element1 = page.css_first(selector, auto_save=True)
 >>
 >> # Same selector but used in the updated website
->> page = Fetcher(automatch_domain="stackoverflow.com").get(new_url)
+>> page = Fetcher(automatch_domain="stackoverflow.com").get(new_url)
 >> element2 = page.css_first(selector, auto_match=True)
 >>
 >> if element1.text == element2.text:
@@ -523,7 +557,7 @@ Note that I used a new argument called `automatch_domain`, this is because for S
 In a real-world scenario, the code will be the same except it will use the same URL for both requests so you won't need to use the `automatch_domain` argument. This is the closest example I can give to real-world cases so I hope it didn't confuse you :)
 
 **Notes:**
-1. For the two examples above I used one time the `Adaptor` class and the second time the `Fetcher` class just to show you that you can create the `Adaptor` object by yourself if you have the source or fetch the source using any `Fetcher` class then it will create the `Adaptor` object for you
+1. For the two examples above I used one time the `Adaptor` class and the second time the `Fetcher` class just to show you that you can create the `Adaptor` object by yourself if you have the source or fetch the source using any `Fetcher` class then it will create the `Adaptor` object for you.
 2. Passing the `auto_save` argument with the `auto_match` argument set to `False` while initializing the Adaptor/Fetcher object will only result in ignoring the `auto_save` argument value and the following warning message
 ```text
 Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.
@@ -564,7 +598,7 @@ Examples to clear any confusion :)
 
 ```python
 >> from scrapling import Fetcher
->> page = Fetcher().get('https://quotes.toscrape.com/')
+>> page = Fetcher().get('https://quotes.toscrape.com/')
 # Find all elements with tag name `div`.
 >> page.find_all('div')
 [<data='<div class="container"> <div class="row...' parent='<body> <div class="container"> <div clas...'>,
@@ -727,7 +761,10 @@ There are a lot of deep details skipped here to make this as short as possible s
 
 Note that implementing your storage system can be complex as there are some strict rules such as inheriting from the same abstract class, following the singleton design pattern used in other classes, and more. So make sure to read the docs first.
 
-
+> [!IMPORTANT]
+> A website is needed to provide detailed library documentation.<br/>
+> I'm trying to rush creating the website, researching new ideas, and adding more features/tests/benchmarks but time is tight with too many spinning plates between work, personal life, and working on Scrapling. I have been working on Scrapling for months for free after all.<br/><br/>
+> If you like `Scrapling` and want it to keep improving then this is a friendly reminder that you can help by supporting me through the [sponsor button](https://github.com/sponsors/D4Vinci).
 
 ## ⚡ Enlightening Questions and FAQs
 This section addresses common questions about Scrapling, please read this section before opening an issue.
@@ -741,8 +778,8 @@ This section addresses common questions about Scrapling, please read this sectio
 
 Together both are used to retrieve the element's unique properties from the database later.
 4. Now later when you enable the `auto_match` parameter for both the Adaptor instance and the method call. The element properties are retrieved and Scrapling loops over all elements in the page and compares each one's unique properties to the unique properties we already have for this element and a score is calculated for each one.
-5.
-6. The score for each element is stored in the table, and
+5. Comparing elements is not exact but more about finding how similar these values are, so everything is taken into consideration, even the values' order, like the order in which the element class names were written before and the order in which the same element class names are written now.
+6. The score for each element is stored in the table, and the element(s) with the highest combined similarity scores are returned.
 
 ### How does the auto-matching work if I didn't pass a URL while initializing the Adaptor object?
 Not a big problem as it depends on your usage. The word `default` will be used in place of the URL field while saving the element's unique properties. So this will only be an issue if you used the same identifier later for a different website that you didn't pass the URL parameter while initializing it as well. The save process will overwrite the previous data and auto-matching uses the latest saved properties only.
@@ -773,8 +810,10 @@ Of course, you can find elements by text/regex, find similar elements in a more
 ### Is Scrapling thread-safe?
 Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its state.
 
-## Sponsors
+## More Sponsors!
 [](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
+<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+
 
 ## Contributing
 Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
{scrapling-0.2.dist-info → scrapling-0.2.2.dist-info}/RECORD
CHANGED
@@ -1,22 +1,24 @@
-scrapling/__init__.py,sha256=
-scrapling/
-scrapling/
+scrapling/__init__.py,sha256=lpRuPRo5y_KrUeY78qgX5H_C2dWFV33VqrTX0OafHO8,435
+scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
+scrapling/fetchers.py,sha256=_6mL7XSTZE1fHXBqbxE2bBHnlQP1lH-4MCiQHQd5hQs,16017
+scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
+scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
 scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 scrapling/core/_types.py,sha256=nD2ZY_fitLohx3MfDmqoKJ9ZShrnRhQ8-d1SU1zEGAY,552
 scrapling/core/custom_types.py,sha256=-gMNOiByewoAUqFVrDp822V51rcWNlWVUOB6yGUL648,8403
 scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
 scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHAu1l2Q,6218
 scrapling/core/translator.py,sha256=oU-dQCkNQOccZPrXbPW_VSgC5ll10Bb89C3ezW2lI0o,5228
-scrapling/core/utils.py,sha256=
+scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
 scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
-scrapling/engines/camo.py,sha256=
+scrapling/engines/camo.py,sha256=41vp2Nh51kKuOSZ1PijsIpROpQZgFfUPybVbEX8pEXk,7530
 scrapling/engines/constants.py,sha256=jSDA6lgbvEIB8z2m2SFzCKkvFEZnp28Mondy2__FpkM,3721
-scrapling/engines/pw.py,sha256=
-scrapling/engines/static.py,sha256=
-scrapling/engines/toolbelt/__init__.py,sha256=
-scrapling/engines/toolbelt/custom.py,sha256=
+scrapling/engines/pw.py,sha256=l5MrSW_WNBKAxAlyxbt09ka_lEGo61XKuaOgWpYmvHk,12102
+scrapling/engines/static.py,sha256=Wsp6_-soZUQJT6kHoKPkLOdHU9J50chLdYxDmQjO4FQ,7101
+scrapling/engines/toolbelt/__init__.py,sha256=BnBp34aDeohYgqdysEAAWnGZgA02YlExkc5FJLetMSo,367
+scrapling/engines/toolbelt/custom.py,sha256=8lvGHWIZoOotSTF97KgPb3CbJquel2QFx8rP8Hf2sQ4,7469
 scrapling/engines/toolbelt/fingerprints.py,sha256=kkVtZKSt2ukc0CV0g6QUvSWR0Yx5p8Mv8xiqACAsMBo,2917
-scrapling/engines/toolbelt/navigation.py,sha256=
+scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
 tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
 tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
 tests/fetchers/test_camoufox.py,sha256=XPTCDZ9sj_GpCzXyvzKF_uZWhEYX6J_jh_BLeMEl8yY,2874
@@ -25,8 +27,8 @@ tests/fetchers/test_playwright.py,sha256=YOWn89urd9NwoCHfTFj8fY4xYrRY2BeszTt5Q-T
 tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
 tests/parser/test_general.py,sha256=NfTuGLgAm-LH0dVV0pvbRcYSNI-wSu05rdnuRzmB0m4,11664
-scrapling-0.2.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
-scrapling-0.2.dist-info/METADATA,sha256=
-scrapling-0.2.dist-info/WHEEL,sha256=
-scrapling-0.2.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
-scrapling-0.2.dist-info/RECORD,,
+scrapling-0.2.2.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.2.dist-info/METADATA,sha256=gk7fij0BkRwA51dJlCbARlx_FW9_U9v9ptk3Mc5-YKQ,64784
+scrapling-0.2.2.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+scrapling-0.2.2.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.2.dist-info/RECORD,,
{scrapling-0.2.dist-info → scrapling-0.2.2.dist-info}/LICENSE
File without changes
{scrapling-0.2.dist-info → scrapling-0.2.2.dist-info}/top_level.txt
File without changes