scrapling 0.2.97__py3-none-any.whl → 0.2.99__py3-none-any.whl

scrapling/fetchers.py CHANGED
@@ -10,9 +10,10 @@ class Fetcher(BaseFetcher):
 
     Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
     """
+    @classmethod
     def get(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
 
         :param url: Target url.
@@ -22,16 +23,23 @@ class Fetcher(BaseFetcher):
         create a referer header as if this request had come from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).get(**kwargs)
         return response_object
 
+    @classmethod
     def post(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
 
         :param url: Target url.
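
With this release the HTTP shortcut methods become classmethods and accept a per-request `custom_config` mapping that is merged over the class-level parser arguments (`cls._generate_parser_arguments()`), with the per-request keys winning. A minimal usage sketch of the new call style (the URL is a placeholder, and `keep_comments` is assumed to be a valid parser argument based on the `Adaptor` signature later in this diff):

    from scrapling.fetchers import Fetcher

    # No instance needed anymore: get/post/put/delete are classmethods in 0.2.99.
    page = Fetcher.get(
        'https://example.com',                   # placeholder URL
        retries=3,
        custom_config={'keep_comments': True},   # per-request parser override, merged over the class config
    )
    print(page.status)
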
@@ -41,16 +49,23 @@ class Fetcher(BaseFetcher):
         create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).post(**kwargs)
         return response_object
 
+    @classmethod
     def put(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
 
         :param url: Target url
@@ -60,17 +75,24 @@ class Fetcher(BaseFetcher):
         create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
 
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).put(**kwargs)
         return response_object
 
+    @classmethod
     def delete(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
 
         :param url: Target url
@@ -80,18 +102,25 @@ class Fetcher(BaseFetcher):
         create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries, adaptor_arguments=adaptor_arguments).delete(**kwargs)
         return response_object
 
 
 class AsyncFetcher(Fetcher):
+    @classmethod
     async def get(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP GET request for you but with some added flavors.
 
         :param url: Target url.
@@ -101,16 +130,23 @@ class AsyncFetcher(Fetcher):
         create a referer header as if this request had come from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_get(**kwargs)
         return response_object
 
+    @classmethod
     async def post(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP POST request for you but with some added flavors.
 
         :param url: Target url.
@@ -120,16 +156,23 @@ class AsyncFetcher(Fetcher):
         create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_post(**kwargs)
         return response_object
 
+    @classmethod
     async def put(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP PUT request for you but with some added flavors.
 
         :param url: Target url
@@ -139,16 +182,23 @@ class AsyncFetcher(Fetcher):
         create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_put(**kwargs)
         return response_object
 
+    @classmethod
     async def delete(
-            self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
-            proxy: Optional[str] = None, retries: Optional[int] = 3, **kwargs: Dict) -> Response:
+            cls, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: bool = True,
+            proxy: Optional[str] = None, retries: Optional[int] = 3, custom_config: Dict = None, **kwargs: Dict) -> Response:
         """Make basic HTTP DELETE request for you but with some added flavors.
 
         :param url: Target url
@@ -158,10 +208,16 @@ class AsyncFetcher(Fetcher):
         create a referer header as if this request came from Google's search of this URL's domain.
         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
         :param retries: The number of retries to do through httpx if the request failed for any reason. The default is 3 retries.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        adaptor_arguments = tuple(self.adaptor_arguments.items())
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
+        adaptor_arguments = tuple({**cls._generate_parser_arguments(), **custom_config}.items())
         response_object = await StaticEngine(url, proxy, stealthy_headers, follow_redirects, timeout, retries=retries, adaptor_arguments=adaptor_arguments).async_delete(**kwargs)
         return response_object
 
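
The `AsyncFetcher` methods receive the same classmethod and `custom_config` treatment; the only difference is that they are awaited. A minimal sketch under the same assumptions as above:

    import asyncio

    from scrapling.fetchers import AsyncFetcher

    async def main():
        # Same call style as the sync Fetcher, but awaitable.
        page = await AsyncFetcher.get('https://example.com', custom_config={'keep_comments': True})
        print(page.status)

    asyncio.run(main())
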
@@ -172,12 +228,14 @@ class StealthyFetcher(BaseFetcher):
     It works as real browsers passing almost all online tests/protections based on Camoufox.
     Other added flavors include setting the faked OS fingerprints to match the user's OS and the referer of every request is set as if this request came from Google's search of this URL's domain.
     """
+    @classmethod
     def fetch(
-            self, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
-            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None,
+            cls, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
+            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None, wait: Optional[int] = 0,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
             wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False, geoip: bool = False,
+            custom_config: Dict = None, additional_arguments: Dict = None
     ) -> Response:
         """
         Opens up a browser and does your request based on your chosen options below.
@@ -198,16 +256,25 @@ class StealthyFetcher(BaseFetcher):
         It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
-        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000.
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
+        :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and they take higher priority than Scrapling's settings.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
         engine = CamoufoxEngine(
+            wait=wait,
             proxy=proxy,
             geoip=geoip,
             addons=addons,
@@ -226,16 +293,19 @@ class StealthyFetcher(BaseFetcher):
             extra_headers=extra_headers,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
-            adaptor_arguments=self.adaptor_arguments,
+            adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
+            additional_arguments=additional_arguments or {}
         )
         return engine.fetch(url)
 
+    @classmethod
     async def async_fetch(
-            self, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
-            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None,
+            cls, url: str, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
+            block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, addons: Optional[List[str]] = None, wait: Optional[int] = 0,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
             wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False, geoip: bool = False,
+            custom_config: Dict = None, additional_arguments: Dict = None
     ) -> Response:
         """
         Opens up a browser and does your request based on your chosen options below.
@@ -257,15 +327,24 @@ class StealthyFetcher(BaseFetcher):
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
         :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
+        :param additional_arguments: Additional arguments to be passed to Camoufox as additional settings, and they take higher priority than Scrapling's settings.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
         engine = CamoufoxEngine(
+            wait=wait,
             proxy=proxy,
             geoip=geoip,
             addons=addons,
@@ -284,7 +363,8 @@ class StealthyFetcher(BaseFetcher):
             extra_headers=extra_headers,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
-            adaptor_arguments=self.adaptor_arguments,
+            adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
+            additional_arguments=additional_arguments or {}
         )
         return await engine.async_fetch(url)
 
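
`StealthyFetcher` additionally gains `wait` (milliseconds to linger after everything finishes before the page is closed) and `additional_arguments` (a dict forwarded to Camoufox that takes priority over Scrapling's own Camoufox settings). A rough sketch of the new surface; the `additional_arguments` key below is a hypothetical placeholder, so consult Camoufox's documentation for real setting names:

    from scrapling.fetchers import StealthyFetcher

    page = StealthyFetcher.fetch(
        'https://example.com',                   # placeholder URL
        headless=True,
        wait=2000,                               # new: wait 2s before the page is closed
        custom_config={'keep_cdata': True},      # per-request parser override
        additional_arguments={'some_camoufox_setting': True},  # hypothetical key, passed to Camoufox verbatim
    )
    print(page.status)
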
@@ -305,15 +385,17 @@ class PlayWrightFetcher(BaseFetcher):
 
     > Note that these are the main options with PlayWright, but they can be mixed together.
     """
+    @classmethod
     def fetch(
-            self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
-            useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000,
+            cls, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
+            useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000, wait: Optional[int] = 0,
             page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
             hide_canvas: bool = False, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: bool = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: bool = False, real_chrome: bool = False,
             cdp_url: Optional[str] = None,
             nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
+            custom_config: Dict = None
     ) -> Response:
         """Opens up a browser and does your request based on your chosen options below.
 
@@ -324,7 +406,8 @@ class PlayWrightFetcher(BaseFetcher):
             This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
-        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000.
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
         :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
@@ -339,9 +422,16 @@ class PlayWrightFetcher(BaseFetcher):
         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
         :param nstbrowser_mode: Enables NSTBrowser mode; it has to be used with the `cdp_url` argument or it will get completely ignored.
         :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
         engine = PlaywrightEngine(
+            wait=wait,
             proxy=proxy,
             locale=locale,
             timeout=timeout,
@@ -361,19 +451,21 @@ class PlayWrightFetcher(BaseFetcher):
             nstbrowser_config=nstbrowser_config,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
-            adaptor_arguments=self.adaptor_arguments,
+            adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
         )
         return engine.fetch(url)
 
+    @classmethod
     async def async_fetch(
-            self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
-            useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000,
+            cls, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
+            useragent: Optional[str] = None, network_idle: bool = False, timeout: Optional[float] = 30000, wait: Optional[int] = 0,
             page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
             hide_canvas: bool = False, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: bool = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: bool = False, real_chrome: bool = False,
             cdp_url: Optional[str] = None,
             nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
+            custom_config: Dict = None
     ) -> Response:
         """Opens up a browser and does your request based on your chosen options below.
 
@@ -384,7 +476,8 @@ class PlayWrightFetcher(BaseFetcher):
             This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
         :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
-        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000.
+        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.
         :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
@@ -399,9 +492,16 @@ class PlayWrightFetcher(BaseFetcher):
         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
         :param nstbrowser_mode: Enables NSTBrowser mode; it has to be used with the `cdp_url` argument or it will get completely ignored.
         :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
+        :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameter's value.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        if not custom_config:
+            custom_config = {}
+        elif not isinstance(custom_config, dict):
+            raise ValueError(f"The custom parser config must be of type dictionary, got {custom_config.__class__}")
+
         engine = PlaywrightEngine(
+            wait=wait,
             proxy=proxy,
             locale=locale,
             timeout=timeout,
@@ -421,12 +521,13 @@ class PlayWrightFetcher(BaseFetcher):
             nstbrowser_config=nstbrowser_config,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
-            adaptor_arguments=self.adaptor_arguments,
+            adaptor_arguments={**cls._generate_parser_arguments(), **custom_config},
         )
         return await engine.async_fetch(url)
 
 
 class CustomFetcher(BaseFetcher):
-    def fetch(self, url: str, browser_engine, **kwargs) -> Response:
-        engine = check_if_engine_usable(browser_engine)(adaptor_arguments=self.adaptor_arguments, **kwargs)
+    @classmethod
+    def fetch(cls, url: str, browser_engine, **kwargs) -> Response:
+        engine = check_if_engine_usable(browser_engine)(adaptor_arguments=cls._generate_parser_arguments(), **kwargs)
         return engine.fetch(url)
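
`PlayWrightFetcher` follows the same pattern (`@classmethod`, `wait`, and `custom_config`), and `CustomFetcher.fetch` now pulls parser arguments from `cls._generate_parser_arguments()` instead of an instance attribute. A minimal sketch with a placeholder URL:

    from scrapling.fetchers import PlayWrightFetcher

    page = PlayWrightFetcher.fetch(
        'https://example.com',        # placeholder URL
        stealth=True,
        wait=1000,                    # new: linger 1s before returning the Response
        custom_config={'keep_comments': False},
    )
    print(page.status)
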
scrapling/parser.py CHANGED
@@ -17,7 +17,7 @@ from scrapling.core.custom_types import (AttributesHandler, TextHandler,
 from scrapling.core.mixins import SelectorsGeneration
 from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
                                              StorageSystemMixin, _StorageTools)
-from scrapling.core.translator import HTMLTranslator
+from scrapling.core.translator import translator_instance
 from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
                                   is_jsonable, log)
 
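
Replacing the `HTMLTranslator` class import with a `translator_instance` import suggests the CSS-to-XPath translator is now built once per process and shared, rather than instantiated on every `css()` call (see the selector hunks below). A sketch of that pattern, assuming scrapling's translator subclasses cssselect's; the module body below is hypothetical, not scrapling's actual `translator.py`:

    # translator.py (sketch)
    from cssselect import HTMLTranslator


    class _ScraplingTranslator(HTMLTranslator):
        """scrapling's CSS extensions (e.g. the `::text` pseudo-element) would live here."""


    # Built once at import time; every Adaptor reuses this instance instead of
    # constructing a fresh translator on each css() call.
    translator_instance = _ScraplingTranslator()
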
@@ -26,7 +26,7 @@ class Adaptor(SelectorsGeneration):
     __slots__ = (
         'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
         '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
-        '__keep_cdata', '__raw_body'
+        '__keep_cdata'
     )
 
     def __init__(
@@ -39,7 +39,7 @@ class Adaptor(SelectorsGeneration):
             root: Optional[html.HtmlElement] = None,
             keep_comments: Optional[bool] = False,
             keep_cdata: Optional[bool] = False,
-            auto_match: Optional[bool] = True,
+            auto_match: Optional[bool] = False,
             storage: Any = SQLiteStorageSystem,
             storage_args: Optional[Dict] = None,
             **kwargs
@@ -72,20 +72,17 @@ class Adaptor(SelectorsGeneration):
             raise ValueError("Adaptor class needs text, body, or root arguments to work")
 
         self.__text = ''
-        self.__raw_body = ''
         if root is None:
             if text is None:
                 if not body or not isinstance(body, bytes):
                     raise TypeError(f"body argument must be valid and of type bytes, got {body.__class__}")
 
                 body = body.replace(b"\x00", b"").strip()
-                self.__raw_body = body.replace(b"\x00", b"").strip().decode()
             else:
                 if not isinstance(text, str):
                     raise TypeError(f"text argument must be of type str, got {text.__class__}")
 
                 body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
-                self.__raw_body = text.strip()
 
         # https://lxml.de/api/lxml.etree.HTMLParser-class.html
         parser = html.HTMLParser(
@@ -250,10 +247,7 @@ class Adaptor(SelectorsGeneration):
         """Return the inner html code of the element"""
         return TextHandler(etree.tostring(self._root, encoding='unicode', method='html', with_tail=False))
 
-    @property
-    def body(self) -> TextHandler:
-        """Return raw HTML code of the element/page without any processing when possible or return `Adaptor.html_content`"""
-        return TextHandler(self.__raw_body) or self.html_content
+    body = html_content
 
     def prettify(self) -> TextHandler:
         """Return a prettified version of the element's inner html-code"""
@@ -476,7 +470,7 @@ class Adaptor(SelectorsGeneration):
         try:
             if not self.__auto_match_enabled or ',' not in selector:
                 # No need to split selectors in this case, let's save some CPU cycles :)
-                xpath_selector = HTMLTranslator().css_to_xpath(selector)
+                xpath_selector = translator_instance.css_to_xpath(selector)
                 return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)
 
             results = []
@@ -484,7 +478,7 @@ class Adaptor(SelectorsGeneration):
             for single_selector in split_selectors(selector):
                 # I'm doing this only so the `save` function saves data correctly for combined selectors
                 # Like using the ',' to combine two different selectors that point to different elements.
-                xpath_selector = HTMLTranslator().css_to_xpath(single_selector.canonical())
+                xpath_selector = translator_instance.css_to_xpath(single_selector.canonical())
                 results += self.xpath(
                     xpath_selector, identifier or single_selector.canonical(), auto_match, auto_save, percentage
                 )
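
Taken together, the parser changes mean three things for callers: `auto_match` is now opt-in (its default flipped from `True` to `False`), `Adaptor.body` is a plain alias of `Adaptor.html_content` (the re-serialized element) instead of a cached copy of the raw input, and CSS selectors are compiled through the shared `translator_instance`. A small sketch of the resulting behavior (the HTML string is a placeholder):

    from scrapling.parser import Adaptor

    page = Adaptor(text='<html><body><p>hello</p></body></html>')

    # auto_match defaults to False now; pass auto_match=True explicitly
    # if you relied on the old 0.2.97 default.

    # `body` is no longer the untouched input string; it is the same
    # serialized HTML that `html_content` returns:
    assert page.body == page.html_content

    print(page.css('p::text'))  # roughly ['hello'], compiled via translator_instance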