scrapling 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl

scrapling/__init__.py CHANGED
@@ -1,10 +1,11 @@
  # Declare top-level shortcuts
- from scrapling.fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher, CustomFetcher
+ from scrapling.core.custom_types import AttributesHandler, TextHandler
+ from scrapling.fetchers import (CustomFetcher, Fetcher, PlayWrightFetcher,
+                                 StealthyFetcher)
  from scrapling.parser import Adaptor, Adaptors
- from scrapling.core.custom_types import TextHandler, AttributesHandler

  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.2.6"
+ __version__ = "0.2.8"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"
scrapling/core/_types.py CHANGED
@@ -2,9 +2,8 @@
  Type definitions for type checking purposes.
  """

- from typing import (
-     Dict, Optional, Union, Callable, Any, List, Tuple, Pattern, Generator, Iterable, Type, TYPE_CHECKING, Literal
- )
+ from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
+                     List, Literal, Optional, Pattern, Tuple, Type, Union)

  try:
      from typing import Protocol
@@ -1,13 +1,13 @@
  import re
- from types import MappingProxyType
  from collections.abc import Mapping
+ from types import MappingProxyType

- from scrapling.core.utils import _is_iterable, flatten
- from scrapling.core._types import Dict, List, Union, Pattern, SupportsIndex
-
- from orjson import loads, dumps
+ from orjson import dumps, loads
  from w3lib.html import replace_entities as _replace_entities

+ from scrapling.core._types import Dict, List, Pattern, SupportsIndex, Union
+ from scrapling.core.utils import _is_iterable, flatten
+

  class TextHandler(str):
      """Extends standard Python string by adding more functionality"""
@@ -1,16 +1,16 @@
- import orjson
- import sqlite3
  import logging
+ import sqlite3
  import threading
- from hashlib import sha256
  from abc import ABC, abstractmethod
+ from hashlib import sha256

- from scrapling.core._types import Dict, Optional, Union
- from scrapling.core.utils import _StorageTools, cache
-
+ import orjson
  from lxml import html
  from tldextract import extract as tld

+ from scrapling.core._types import Dict, Optional, Union
+ from scrapling.core.utils import _StorageTools, cache
+

  class StorageSystemMixin(ABC):
      # If you want to make your own storage system, you have to inherit from this
@@ -10,15 +10,14 @@ So you don't have to learn a new selectors/api method like what bs4 done with so

  import re

- from w3lib.html import HTML5_WHITESPACE
- from scrapling.core.utils import cache
- from scrapling.core._types import Any, Optional, Protocol, Self
-
- from cssselect.xpath import ExpressionError
- from cssselect.xpath import XPathExpr as OriginalXPathExpr
  from cssselect import HTMLTranslator as OriginalHTMLTranslator
  from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
+ from cssselect.xpath import ExpressionError
+ from cssselect.xpath import XPathExpr as OriginalXPathExpr
+ from w3lib.html import HTML5_WHITESPACE

+ from scrapling.core._types import Any, Optional, Protocol, Self
+ from scrapling.core.utils import cache

  regex = f"[{HTML5_WHITESPACE}]+"
  replace_html5_whitespaces = re.compile(regex).sub
scrapling/core/utils.py CHANGED
@@ -1,22 +1,25 @@
- import re
  import logging
+ import re
  from itertools import chain
- # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
- from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache
-
- from scrapling.core._types import Dict, Iterable, Any, Union

  import orjson
  from lxml import html

+ from scrapling.core._types import Any, Dict, Iterable, Union
+
+ # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
+ # functools.cache is available on Python 3.9+ only so let's keep lru_cache
+ from functools import lru_cache as cache # isort:skip
+
+
  html_forbidden = {html.HtmlComment, }
  logging.basicConfig(
-     level=logging.ERROR,
-     format='%(asctime)s - %(levelname)s - %(message)s',
-     handlers=[
-         logging.StreamHandler()
-     ]
- )
+    level=logging.ERROR,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler()
+    ]
+ )


  def is_jsonable(content: Union[bytes, str]) -> bool:
@@ -94,7 +97,7 @@ class _StorageTools:
          parent = element.getparent()
          return tuple(
              (element.tag,) if parent is None else (
-                 cls._get_element_path(parent) + (element.tag,)
+                     cls._get_element_path(parent) + (element.tag,)
              )
          )

scrapling/defaults.py CHANGED
@@ -1,4 +1,4 @@
- from .fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
+ from .fetchers import Fetcher, PlayWrightFetcher, StealthyFetcher

  # If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
  Fetcher = Fetcher()
@@ -1,7 +1,7 @@
  from .camo import CamoufoxEngine
- from .static import StaticEngine
- from .pw import PlaywrightEngine
  from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS
+ from .pw import PlaywrightEngine
+ from .static import StaticEngine
  from .toolbelt import check_if_engine_usable

  __all__ = ['CamoufoxEngine', 'PlaywrightEngine']
scrapling/engines/camo.py CHANGED
@@ -1,19 +1,16 @@
  import logging
- from scrapling.core._types import Union, Callable, Optional, Dict, List, Literal
-
- from scrapling.engines.toolbelt import (
-     Response,
-     do_nothing,
-     StatusText,
-     get_os_name,
-     intercept_route,
-     check_type_validity,
-     construct_proxy_dict,
-     generate_convincing_referer,
- )

+ from camoufox import DefaultAddons
  from camoufox.sync_api import Camoufox

+ from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
+                                    Union)
+ from scrapling.engines.toolbelt import (Response, StatusText,
+                                         check_type_validity,
+                                         construct_proxy_dict, do_nothing,
+                                         generate_convincing_referer,
+                                         get_os_name, intercept_route)
+

  class CamoufoxEngine:
      def __init__(
@@ -21,7 +18,8 @@ class CamoufoxEngine:
          block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
          timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
          wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
-         proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, adaptor_arguments: Dict = None
+         proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
+         adaptor_arguments: Dict = None,
  ):
      """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.

@@ -36,6 +34,7 @@ class CamoufoxEngine:
      :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
      :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
      :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+     :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
      :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
      :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
      :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
@@ -54,6 +53,7 @@ class CamoufoxEngine:
      self.network_idle = bool(network_idle)
      self.google_search = bool(google_search)
      self.os_randomize = bool(os_randomize)
+     self.disable_ads = bool(disable_ads)
      self.extra_headers = extra_headers or {}
      self.proxy = construct_proxy_dict(proxy)
      self.addons = addons or []
@@ -75,9 +75,11 @@ class CamoufoxEngine:
      :param url: Target url.
      :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
      """
+     addons = [] if self.disable_ads else [DefaultAddons.UBO]
      with Camoufox(
          proxy=self.proxy,
          addons=self.addons,
+         exclude_addons=addons,
          headless=self.headless,
          humanize=self.humanize,
          i_know_what_im_doing=True, # To turn warnings off with the user configurations
@@ -105,6 +107,11 @@ class CamoufoxEngine:
      if self.wait_selector and type(self.wait_selector) is str:
          waiter = page.locator(self.wait_selector)
          waiter.first.wait_for(state=self.wait_selector_state)
+         # Wait again after waiting for the selector, helpful with protections like Cloudflare
+         page.wait_for_load_state(state="load")
+         page.wait_for_load_state(state="domcontentloaded")
+         if self.network_idle:
+             page.wait_for_load_state('networkidle')

      # This will be parsed inside `Response`
      encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding
@@ -44,7 +44,7 @@ DEFAULT_STEALTH_FLAGS = [
      '--disable-default-apps',
      '--disable-print-preview',
      '--disable-dev-shm-usage',
-     '--disable-popup-blocking',
+     # '--disable-popup-blocking',
      '--metrics-recording-only',
      '--disable-crash-reporter',
      '--disable-partial-raster',
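A minimal usage sketch of the new `disable_ads` option through `StealthyFetcher` (the URL is a placeholder). With the default `disable_ads=True` the bundled uBlock Origin addon stays installed; passing `False` excludes it via Camoufox's `exclude_addons`:

```python
from scrapling import StealthyFetcher

# disable_ads defaults to True (uBlock Origin stays installed and blocks ads);
# pass False to have the engine exclude that addon from the Camoufox browser.
page = StealthyFetcher().fetch('https://example.com', disable_ads=False)
print(page.status)
```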
scrapling/engines/pw.py CHANGED
@@ -1,20 +1,15 @@
  import json
  import logging
- from scrapling.core._types import Union, Callable, Optional, List, Dict
-
- from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAULT_QUERY
- from scrapling.engines.toolbelt import (
-     Response,
-     do_nothing,
-     StatusText,
-     js_bypass_path,
-     intercept_route,
-     generate_headers,
-     construct_cdp_url,
-     check_type_validity,
-     construct_proxy_dict,
-     generate_convincing_referer,
- )
+
+ from scrapling.core._types import Callable, Dict, List, Optional, Union
+ from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
+                                          NSTBROWSER_DEFAULT_QUERY)
+ from scrapling.engines.toolbelt import (Response, StatusText,
+                                         check_type_validity, construct_cdp_url,
+                                         construct_proxy_dict, do_nothing,
+                                         generate_convincing_referer,
+                                         generate_headers, intercept_route,
+                                         js_bypass_path)


  class PlaywrightEngine:
@@ -26,6 +21,7 @@ class PlaywrightEngine:
          timeout: Optional[float] = 30000,
          page_action: Callable = do_nothing,
          wait_selector: Optional[str] = None,
+         locale: Optional[str] = 'en-US',
          wait_selector_state: Optional[str] = 'attached',
          stealth: Optional[bool] = False,
          real_chrome: Optional[bool] = False,
@@ -50,6 +46,7 @@ class PlaywrightEngine:
      :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
      :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
      :param wait_selector: Wait for a specific css selector to be in a specific state.
+     :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
      :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
      :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
      :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
@@ -64,6 +61,7 @@ class PlaywrightEngine:
      :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
      """
      self.headless = headless
+     self.locale = check_type_validity(locale, [str], 'en-US', param_name='locale')
      self.disable_resources = disable_resources
      self.network_idle = bool(network_idle)
      self.stealth = bool(stealth)
@@ -87,6 +85,14 @@ class PlaywrightEngine:
      self.nstbrowser_mode = bool(nstbrowser_mode)
      self.nstbrowser_config = nstbrowser_config
      self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
+     self.harmful_default_args = [
+         # This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
+         '--enable-automation',
+         '--disable-popup-blocking',
+         # '--disable-component-update',
+         # '--disable-default-apps',
+         # '--disable-extensions',
+     ]

  def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
      """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
@@ -151,15 +157,15 @@ class PlaywrightEngine:
      else:
          if self.stealth:
              browser = p.chromium.launch(
-                 headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
+                 headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
              )
          else:
-             browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'], channel='chrome' if self.real_chrome else 'chromium')
+             browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')

      # Creating the context
      if self.stealth:
          context = browser.new_context(
-             locale='en-US',
+             locale=self.locale,
              is_mobile=False,
              has_touch=False,
              proxy=self.proxy,
@@ -176,6 +182,8 @@ class PlaywrightEngine:
          )
      else:
          context = browser.new_context(
+             locale=self.locale,
+             proxy=self.proxy,
              color_scheme='dark',
              user_agent=useragent,
              device_scale_factor=2,
@@ -221,6 +229,11 @@ class PlaywrightEngine:
      if self.wait_selector and type(self.wait_selector) is str:
          waiter = page.locator(self.wait_selector)
          waiter.first.wait_for(state=self.wait_selector_state)
+         # Wait again after waiting for the selector, helpful with protections like Cloudflare
+         page.wait_for_load_state(state="load")
+         page.wait_for_load_state(state="domcontentloaded")
+         if self.network_idle:
+             page.wait_for_load_state('networkidle')

      # This will be parsed inside `Response`
      encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding
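A minimal usage sketch of the new `locale` argument through `PlayWrightFetcher` (the URL is a placeholder); the value is type-checked and forwarded to the Playwright browser context:

```python
from scrapling import PlayWrightFetcher

# locale defaults to 'en-US' and ends up in browser.new_context(locale=...)
page = PlayWrightFetcher().fetch('https://example.com', locale='de-DE')
print(page.status)
```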
@@ -1,11 +1,12 @@
  import logging

- from scrapling.core._types import Union, Optional, Dict
- from .toolbelt import Response, generate_convincing_referer, generate_headers
-
  import httpx
  from httpx._models import Response as httpxResponse

+ from scrapling.core._types import Dict, Optional, Union
+
+ from .toolbelt import Response, generate_convincing_referer, generate_headers
+

  class StaticEngine:
      def __init__(self, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = None, adaptor_arguments: Dict = None):
@@ -63,54 +64,66 @@ class StaticEngine:
          **self.adaptor_arguments
      )

-     def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def get(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
          """Make basic HTTP GET request for you but with some added flavors.

          :param url: Target url.
          :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
              create a referer header as if this request had came from Google's search of this URL's domain.
+         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
          :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
          headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-         request = httpx.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         with httpx.Client(proxy=proxy) as client:
+             request = client.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
          return self._prepare_response(request)

-     def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def post(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
          """Make basic HTTP POST request for you but with some added flavors.

          :param url: Target url.
          :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
              create a referer header as if this request had came from Google's search of this URL's domain.
+         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
          :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
          headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-         request = httpx.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         with httpx.Client(proxy=proxy) as client:
+             request = client.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
          return self._prepare_response(request)

-     def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def delete(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
          """Make basic HTTP DELETE request for you but with some added flavors.

          :param url: Target url.
          :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
              create a referer header as if this request had came from Google's search of this URL's domain.
+         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
          :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
          headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-         request = httpx.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         with httpx.Client(proxy=proxy) as client:
+             request = client.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
          return self._prepare_response(request)

-     def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def put(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
          """Make basic HTTP PUT request for you but with some added flavors.

          :param url: Target url.
          :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
              create a referer header as if this request had came from Google's search of this URL's domain.
+         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
          :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.

          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
          headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-         request = httpx.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         with httpx.Client(proxy=proxy) as client:
+             request = client.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
          return self._prepare_response(request)
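A brief, hedged usage sketch of the new per-request `proxy` argument as exposed on the top-level `Fetcher` (URL and credentials are placeholders); each call now routes through a dedicated `httpx.Client(proxy=...)`:

```python
from scrapling import Fetcher

# The proxy string covers both HTTP and HTTPS traffic for this single request.
page = Fetcher().get('https://httpbin.org/ip', proxy='http://username:password@localhost:8030')
print(page.status)
```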
@@ -1,20 +1,6 @@
- from .fingerprints import (
-     get_os_name,
-     generate_headers,
-     generate_convincing_referer,
- )
- from .custom import (
-     Response,
-     do_nothing,
-     StatusText,
-     BaseFetcher,
-     get_variable_name,
-     check_type_validity,
-     check_if_engine_usable,
- )
- from .navigation import (
-     js_bypass_path,
-     intercept_route,
-     construct_cdp_url,
-     construct_proxy_dict,
- )
+ from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
+                      check_type_validity, do_nothing, get_variable_name)
+ from .fingerprints import (generate_convincing_referer, generate_headers,
+                            get_os_name)
+ from .navigation import (construct_cdp_url, construct_proxy_dict,
+                          intercept_route, js_bypass_path)
@@ -5,10 +5,11 @@ import inspect
  import logging
  from email.message import Message

+ from scrapling.core._types import (Any, Callable, Dict, List, Optional, Tuple,
+                                    Type, Union)
  from scrapling.core.custom_types import MappingProxyType
+ from scrapling.core.utils import cache, setup_basic_logging
  from scrapling.parser import Adaptor, SQLiteStorageSystem
- from scrapling.core.utils import setup_basic_logging, cache
- from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable, Tuple


  class ResponseEncoding:
@@ -39,7 +40,7 @@ class ResponseEncoding:

      @classmethod
      @cache(maxsize=None)
-     def get_value(cls, content_type: Optional[str]) -> str:
+     def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
          """Determine the appropriate character encoding from a content-type header.

          The encoding is determined by these rules in order:
@@ -50,26 +51,30 @@ class ResponseEncoding:
          5. Default to UTF-8 if nothing else matches

          :param content_type: Content-Type header value or None
+         :param text: A text to test the encoding on it
          :return: String naming the character encoding
          """
          if not content_type:
              return cls.__DEFAULT_ENCODING

          try:
+             encoding = None
              content_type, params = cls.__parse_content_type(content_type)

              # First check for explicit charset parameter
              if "charset" in params:
                  encoding = params["charset"].strip("'\"")
-                 "test".encode(encoding) # Validate encoding
-                 return encoding

              # Apply content-type specific rules
-             if content_type in cls.__ISO_8859_1_CONTENT_TYPES:
-                 return "ISO-8859-1"
+             elif content_type in cls.__ISO_8859_1_CONTENT_TYPES:
+                 encoding = "ISO-8859-1"
+
+             elif content_type == "application/json":
+                 encoding = cls.__DEFAULT_ENCODING

-             if content_type == "application/json":
-                 return cls.__DEFAULT_ENCODING
+             if encoding:
+                 _ = text.encode(encoding) # Validate encoding and validate it can encode the given text
+                 return encoding

              return cls.__DEFAULT_ENCODING

@@ -87,7 +92,7 @@ class Response(Adaptor):
          self.cookies = cookies
          self.headers = headers
          self.request_headers = request_headers
-         encoding = ResponseEncoding.get_value(encoding)
+         encoding = ResponseEncoding.get_value(encoding, text)
          super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
          # For back-ward compatibility
          self.adaptor = self
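A short, hedged illustration of the reworked resolution order in `ResponseEncoding.get_value` (the import path is inferred from this diff; exact defaults depend on the class internals): an explicit charset wins when it can encode the supplied text, otherwise the content-type rules and finally the UTF-8 default apply.

```python
from scrapling.engines.toolbelt.custom import ResponseEncoding

# Explicit charset parameter wins, provided it can encode the sample text.
print(ResponseEncoding.get_value('text/html; charset=windows-1252', text='café'))
# JSON bodies resolve to the class default encoding (UTF-8).
print(ResponseEncoding.get_value('application/json'))
```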
@@ -4,12 +4,12 @@ Functions related to generating headers and fingerprints generally

  import platform

- from scrapling.core.utils import cache
- from scrapling.core._types import Union, Dict
-
+ from browserforge.fingerprints import Fingerprint, FingerprintGenerator
+ from browserforge.headers import Browser, HeaderGenerator
  from tldextract import extract
- from browserforge.headers import HeaderGenerator, Browser
- from browserforge.fingerprints import FingerprintGenerator, Fingerprint
+
+ from scrapling.core._types import Dict, Union
+ from scrapling.core.utils import cache


  @cache(None, typed=True)
@@ -2,16 +2,16 @@
  Functions related to files and URLs
  """

- import os
  import logging
- from urllib.parse import urlparse, urlencode
+ import os
+ from urllib.parse import urlencode, urlparse
+
+ from playwright.sync_api import Route

+ from scrapling.core._types import Dict, Optional, Union
  from scrapling.core.utils import cache
- from scrapling.core._types import Union, Dict, Optional
  from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES

- from playwright.sync_api import Route
-

  def intercept_route(route: Route) -> Union[Route, None]:
      """This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
@@ -43,7 +43,7 @@ def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict
          }
      except ValueError:
          # Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
-         raise TypeError(f'The proxy argument\'s string is in invalid format!')
+         raise TypeError('The proxy argument\'s string is in invalid format!')

      elif isinstance(proxy_string, dict):
          valid_keys = ('server', 'username', 'password', )
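A short sketch of the two proxy formats `construct_proxy_dict` accepts when passed through the browser-based fetchers (all values are placeholders):

```python
from scrapling import StealthyFetcher

# Either a single proxy URL string...
page = StealthyFetcher().fetch('https://example.com', proxy='http://username:password@localhost:8030')

# ...or a dict limited to the keys construct_proxy_dict validates.
page = StealthyFetcher().fetch(
    'https://example.com',
    proxy={'server': 'http://localhost:8030', 'username': 'username', 'password': 'password'},
)
```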
scrapling/fetchers.py CHANGED
@@ -1,7 +1,8 @@
- from scrapling.core._types import Dict, Optional, Union, Callable, List, Literal
-
- from scrapling.engines.toolbelt import Response, BaseFetcher, do_nothing
- from scrapling.engines import CamoufoxEngine, PlaywrightEngine, StaticEngine, check_if_engine_usable
+ from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
+                                    Union)
+ from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
+                                check_if_engine_usable)
+ from scrapling.engines.toolbelt import BaseFetcher, Response, do_nothing


  class Fetcher(BaseFetcher):
@@ -9,7 +10,7 @@ class Fetcher(BaseFetcher):

      Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
      """
-     def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
          """Make basic HTTP GET request for you but with some added flavors.

          :param url: Target url.
@@ -17,13 +18,14 @@ class Fetcher(BaseFetcher):
          :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
          :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
              create a referer header as if this request had came from Google's search of this URL's domain.
+         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
          :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
-         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).get(url, stealthy_headers, **kwargs)
+         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).get(url, proxy, stealthy_headers, **kwargs)
          return response_object

-     def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
          """Make basic HTTP POST request for you but with some added flavors.

          :param url: Target url.
@@ -31,13 +33,14 @@ class Fetcher(BaseFetcher):
          :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
          :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
              create a referer header as if this request came from Google's search of this URL's domain.
+         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
          :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
-         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).post(url, stealthy_headers, **kwargs)
+         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).post(url, proxy, stealthy_headers, **kwargs)
          return response_object

-     def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
          """Make basic HTTP PUT request for you but with some added flavors.

          :param url: Target url
@@ -45,14 +48,15 @@ class Fetcher(BaseFetcher):
          :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
          :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
              create a referer header as if this request came from Google's search of this URL's domain.
+         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
          :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.

          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
-         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, stealthy_headers, **kwargs)
+         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, proxy, stealthy_headers, **kwargs)
          return response_object

-     def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
          """Make basic HTTP DELETE request for you but with some added flavors.

          :param url: Target url
@@ -60,10 +64,11 @@ class Fetcher(BaseFetcher):
          :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
          :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
              create a referer header as if this request came from Google's search of this URL's domain.
+         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
          :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
-         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).delete(url, stealthy_headers, **kwargs)
+         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).delete(url, proxy, stealthy_headers, **kwargs)
          return response_object

@@ -78,7 +83,7 @@ class StealthyFetcher(BaseFetcher):
          block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
          timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
          wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
-         os_randomize: Optional[bool] = None
+         os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
  ) -> Response:
      """
      Opens up a browser and do your request based on your chosen options below.
@@ -92,6 +97,7 @@ class StealthyFetcher(BaseFetcher):
          This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
      :param block_webrtc: Blocks WebRTC entirely.
      :param addons: List of Firefox addons to use. Must be paths to extracted addons.
+     :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
      :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
      :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
      :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -111,6 +117,7 @@ class StealthyFetcher(BaseFetcher):
          timeout=timeout,
          headless=headless,
          humanize=humanize,
+         disable_ads=disable_ads,
          allow_webgl=allow_webgl,
          page_action=page_action,
          network_idle=network_idle,
@@ -148,7 +155,7 @@ class PlayWrightFetcher(BaseFetcher):
          useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
          page_action: Optional[Callable] = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
          hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
-         proxy: Optional[Union[str, Dict[str, str]]] = None,
+         proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
          stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
          cdp_url: Optional[str] = None,
          nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None,
@@ -163,6 +170,7 @@ class PlayWrightFetcher(BaseFetcher):
      :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
      :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
      :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+     :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
      :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
      :param wait_selector: Wait for a specific css selector to be in a specific state.
      :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
@@ -180,6 +188,7 @@ class PlayWrightFetcher(BaseFetcher):
      """
      engine = PlaywrightEngine(
          proxy=proxy,
+         locale=locale,
          timeout=timeout,
          stealth=stealth,
          cdp_url=cdp_url,
scrapling/parser.py CHANGED
@@ -1,16 +1,23 @@
+ import inspect
  import os
  import re
- import inspect
  from difflib import SequenceMatcher

- from scrapling.core.translator import HTMLTranslator
- from scrapling.core.mixins import SelectorsGeneration
- from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
- from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
- from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden, is_jsonable
- from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
+ from cssselect import SelectorError, SelectorSyntaxError
+ from cssselect import parse as split_selectors
  from lxml import etree, html
- from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
+
+ from scrapling.core._types import (Any, Callable, Dict, Generator, Iterable,
+                                    List, Optional, Pattern, SupportsIndex,
+                                    Tuple, Union)
+ from scrapling.core.custom_types import (AttributesHandler, TextHandler,
+                                          TextHandlers)
+ from scrapling.core.mixins import SelectorsGeneration
+ from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
+                                              StorageSystemMixin, _StorageTools)
+ from scrapling.core.translator import HTMLTranslator
+ from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
+                                   is_jsonable, logging, setup_basic_logging)


  class Adaptor(SelectorsGeneration):
@@ -1,7 +1,7 @@
  Metadata-Version: 2.1
  Name: scrapling
- Version: 0.2.6
- Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
+ Version: 0.2.8
+ Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
  Author-email: karim.shoair@pm.me
@@ -41,7 +41,7 @@ Requires-Dist: tldextract
  Requires-Dist: httpx[brotli,zstd]
  Requires-Dist: playwright==1.48
  Requires-Dist: rebrowser-playwright
- Requires-Dist: camoufox>=0.3.10
+ Requires-Dist: camoufox>=0.4.4
  Requires-Dist: browserforge

  # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
@@ -52,7 +52,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
  Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.

  ```python
- >> from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
+ >> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
  # Fetch websites' source under the radar!
  >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
  >> print(page.status)
@@ -90,10 +90,11 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
    * [Text Extraction Speed Test (5000 nested elements).](#text-extraction-speed-test-5000-nested-elements)
    * [Extraction By Text Speed Test](#extraction-by-text-speed-test)
  * [Installation](#installation)
- * [Fetching Websites Features](#fetching-websites-features)
-   * [Fetcher](#fetcher)
-   * [StealthyFetcher](#stealthyfetcher)
-   * [PlayWrightFetcher](#playwrightfetcher)
+ * [Fetching Websites](#fetching-websites)
+   * [Features](#features)
+   * [Fetcher class](#fetcher)
+   * [StealthyFetcher class](#stealthyfetcher)
+   * [PlayWrightFetcher class](#playwrightfetcher)
  * [Advanced Parsing Features](#advanced-parsing-features)
    * [Smart Navigation](#smart-navigation)
    * [Content-based Selection & Finding Similar Elements](#content-based-selection--finding-similar-elements)
@@ -256,43 +257,48 @@ playwright install chromium
  python -m browserforge update
  ```

- ## Fetching Websites Features
- You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
+ ## Fetching Websites
+ Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+
+ ### Features
+ You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
  ```python
  from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
  ```
- And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
+ All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.

  If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
  ```python
- from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
+ from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
  ```
  then use it right away without initializing like:
  ```python
  page = StealthyFetcher.fetch('https://example.com')
  ```

- Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+ Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
  > [!NOTE]
  > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
  ### Fetcher
  This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.

  For all methods, you have `stealth_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default.
+
+ You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
  ```python
  >> page = Fetcher().get('https://httpbin.org/get', stealth_headers=True, follow_redirects=True)
- >> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'})
+ >> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
  >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
  >> page = Fetcher().delete('https://httpbin.org/delete')
  ```
  ### StealthyFetcher
- This class is built on top of [Camoufox](https://github.com/daijro/camoufox) which by default bypasses most of the anti-bot protections. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
+ This class is built on top of [Camoufox](https://github.com/daijro/camoufox), bypassing most anti-bot protections by default. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
  ```python
  >> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection') # Running headless by default
  >> page.status == 200
  True
  ```
- > Note: all requests done by this fetcher is waiting by default for all JS to be fully loaded and executed so you don't have to :)
+ > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

  <details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>

@@ -309,6 +315,7 @@ True
  | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
  | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
  | allow_webgl | Whether to allow WebGL. To prevent leaks, only use this for special cases. | ✔️ |
+ | disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
  | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
  | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
  | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
@@ -327,7 +334,7 @@ This class is built on top of [Playwright](https://playwright.dev/python/) which
  >> page.css_first("#search a::attr(href)")
  'https://github.com/D4Vinci/Scrapling'
  ```
- > Note: all requests done by this fetcher is waiting by default for all JS to be fully loaded and executed so you don't have to :)
+ > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

  Using this Fetcher class, you can make requests with:
  1) Vanilla Playwright without any modifications other than the ones you chose.
@@ -339,7 +346,7 @@ Using this Fetcher class, you can make requests with:
  3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
  4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.

- > Hence using the `real_chrome` argument requires that you have chrome browser installed on your device
+ > Hence using the `real_chrome` argument requires that you have Chrome browser installed on your device

  Add that to a lot of controlling/hiding options as you will see in the arguments list below.

@@ -362,7 +369,8 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
  | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
  | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
  | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
- | real_chrome | If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
+ | real_chrome | If you have Chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
+ | locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
  | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
  | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
  | nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |
@@ -814,8 +822,7 @@ Of course, you can find elements by text/regex, find similar elements in a more
  Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its state.

  ## More Sponsors!
- [![Capsolver Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/CapSolver.png)](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
- <a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+ <a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" alt="SerpApi Banner" ></a>


  ## Contributing
@@ -0,0 +1,42 @@
+ scrapling/__init__.py,sha256=0-gw4uqckCs7ikl6sHiB5c6y0AelpgefqJkBmSd7j1k,469
+ scrapling/defaults.py,sha256=qO6zAS7k5_QXvbjuoBv87fUMqASGMuM2dVry9J9auv0,287
+ scrapling/fetchers.py,sha256=iw1wEuFg14akJYpSg9webfBjAL341Pnofn4IkWahGlE,17486
+ scrapling/parser.py,sha256=suXggr39GimLnnLm9ivM1CQ40AoDwGke2sgnWszqFqk,54331
+ scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
+ scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scrapling/core/_types.py,sha256=__HJ2JTk5vx5eg_7HAJmDjaHrMDIaoxNG8fadLLyKV8,566
+ scrapling/core/custom_types.py,sha256=8GCgcZL-IT5lP6titxL-RPCiItQSuJZjSlFIGCDxoSs,8402
+ scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
+ scrapling/core/storage_adaptors.py,sha256=Q2-G7oDqoIqlIBEmnUsKwSzM2lNGNUPKtTbMjTV9178,6218
+ scrapling/core/translator.py,sha256=WN_xPyYrD1MjLPv8Ar8zHNTPC_iYsW29kkjET4hbFI0,5228
+ scrapling/core/utils.py,sha256=RajDRSPkVmszjpwNy8NIz8ZlUxPox8j2rSractr7Q9s,3779
+ scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
+ scrapling/engines/camo.py,sha256=fmpGMW5T7we5cQC8muyvVo_A27yAqc5csm7dO_2jHiE,8446
+ scrapling/engines/constants.py,sha256=WTn-X4kFIDWjXTiqOT0tm4XT5pijcdohFyZ0Af2C5Xc,3723
+ scrapling/engines/pw.py,sha256=kWbkHm2vnQYeGuJnicKlAL1HrBKuXoFtyRMNFXLs4VY,13962
+ scrapling/engines/static.py,sha256=h629IjT78YbhjFYBVSli53lKiYrG3929TAaZ7TA-j-Y,8022
+ scrapling/engines/toolbelt/__init__.py,sha256=0tSsxMH5ALOMPXrLkr8mTH7LWg9QfIse4Ij9vUFgYjY,391
+ scrapling/engines/toolbelt/custom.py,sha256=tab_wJmN6onvu2U8tDXeJ9jn6A47jTkmxSBoc-w8dIk,12789
+ scrapling/engines/toolbelt/fingerprints.py,sha256=Y3FW8uqxxeNK3v6vBVvki8VjeG5oRxSwim4Q2Hv_cRk,2917
+ scrapling/engines/toolbelt/navigation.py,sha256=Okpl4ynlLn2cUpSiaaoXDSOdDOXhvxNOOGphE_HXc5k,4016
+ scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
+ scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
+ scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
+ scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=3RP1AE_XZRvpupeV_i-WSNVqRxyUy0qd8rQV8j_4j3U,221
+ scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
+ scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
+ scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
+ tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
+ tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
+ tests/fetchers/test_camoufox.py,sha256=-1v_0mXeBcAVW932nkFws1HIDCodGbpNYniSnVMHeeU,3116
+ tests/fetchers/test_httpx.py,sha256=rrw9q4KdDAHpQVa4sTmw278Yv1OlwY_SKPbpBPLVN7c,3508
+ tests/fetchers/test_playwright.py,sha256=xwhRmlw7WBrtqyilZsoMHkHpyAx7iXQ-YexDMJURTao,3702
+ tests/fetchers/test_utils.py,sha256=FPPJkBrqgYxdGeWwapH8Vj8zyfYVLiTE1qSLu8eBWik,5728
+ tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
+ tests/parser/test_general.py,sha256=sPbwQRka9Mh8MDz2Sto8Rwg78t0SWWxELgzhTVPEplE,11785
+ scrapling-0.2.8.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+ scrapling-0.2.8.dist-info/METADATA,sha256=0As--zWykpljObaw8DZQJr6udpHm4NyRN-dfUOUrhBc,66605
+ scrapling-0.2.8.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ scrapling-0.2.8.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+ scrapling-0.2.8.dist-info/RECORD,,
@@ -1,4 +1,5 @@
  import unittest
+
  import pytest_httpbin

  from scrapling import StealthyFetcher
@@ -1,4 +1,5 @@
  import unittest
+
  import pytest_httpbin

  from scrapling import Fetcher
@@ -1,4 +1,5 @@
  import unittest
+
  import pytest_httpbin

  from scrapling import PlayWrightFetcher
@@ -1,9 +1,11 @@

  import pickle
  import unittest
- from scrapling import Adaptor
+
  from cssselect import SelectorError, SelectorSyntaxError

+ from scrapling import Adaptor
+

  class TestParser(unittest.TestCase):
      def setUp(self):
@@ -1,42 +0,0 @@
- scrapling/__init__.py,sha256=NnIpEZcBGs5Pu2TjqPCacC7N6LN37SbnniBU1AhgdXs,435
- scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
- scrapling/fetchers.py,sha256=-gc-Yo1MjF_4cdJ-5rxZqNC0owxFXTFoEBj08BFEYPs,16361
- scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
- scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
- scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scrapling/core/_types.py,sha256=nD2ZY_fitLohx3MfDmqoKJ9ZShrnRhQ8-d1SU1zEGAY,552
- scrapling/core/custom_types.py,sha256=ztE_tshJ8i5uKqqSbsN5S6MoIUSfX6SexlhRjAnkclk,8402
- scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
- scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHAu1l2Q,6218
- scrapling/core/translator.py,sha256=R97lKGq1SDbx8S8Hg_w_5d4ePgukTHj_hRIKFzWiRuc,5229
- scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
- scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
- scrapling/engines/camo.py,sha256=dXkdfFmf3M09RXAvaZ8CE5khsblC3Wd7_6jWfu8XO6I,7618
- scrapling/engines/constants.py,sha256=jSDA6lgbvEIB8z2m2SFzCKkvFEZnp28Mondy2__FpkM,3721
- scrapling/engines/pw.py,sha256=gMWJAZYpJbFK-GiyRrpVrMjyMqSSetE6hf8kmf0zR2o,12729
- scrapling/engines/static.py,sha256=wzBsoOHPpN5JV1izQSSSarPBNWB-wo0BDWNFuin6ID8,7109
- scrapling/engines/toolbelt/__init__.py,sha256=BbxfC0depVOV3i3BnBnyfjHtLcZrDbhz6c5rTRczZUc,383
- scrapling/engines/toolbelt/custom.py,sha256=ELr3_FwUqNI27E98kz-50OA5a6hQQtoIYrZoLKsvUpM,12551
- scrapling/engines/toolbelt/fingerprints.py,sha256=T9HQejHzAnHsD5EIXvrYVC5siiG5q2gOOXVIIANmzMc,2917
- scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
- scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
- scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=3RP1AE_XZRvpupeV_i-WSNVqRxyUy0qd8rQV8j_4j3U,221
- scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
- scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
- scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
- tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
- tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
- tests/fetchers/test_camoufox.py,sha256=53piGA5uuPvOx5BeUEA0bbizYihwHGxehnj5uqCr6Q0,3115
- tests/fetchers/test_httpx.py,sha256=UivOItR3-l-bXp9E6TP5Tvn2OrCdgiVkWsti-f9xdpU,3507
- tests/fetchers/test_playwright.py,sha256=7qwbIU2SwjiQEbaGPA_MBo6kAXM4IBmfvy5kUvKT11M,3701
- tests/fetchers/test_utils.py,sha256=FPPJkBrqgYxdGeWwapH8Vj8zyfYVLiTE1qSLu8eBWik,5728
- tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
- tests/parser/test_general.py,sha256=qaiVzpvqESfdXYFat6QrpnMkevPYgCzIcTZK5FwdC0s,11783
- scrapling-0.2.6.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
- scrapling-0.2.6.dist-info/METADATA,sha256=cFOu2nlkXDsjyjkIt9kDu1nKKvS14xYH2LT4_VNH5j0,65362
- scrapling-0.2.6.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- scrapling-0.2.6.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
- scrapling-0.2.6.dist-info/RECORD,,