scrapling 0.2.93__tar.gz → 0.2.94__tar.gz
- {scrapling-0.2.93/scrapling.egg-info → scrapling-0.2.94}/PKG-INFO +3 -3
- {scrapling-0.2.93 → scrapling-0.2.94}/README.md +1 -1
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/__init__.py +1 -1
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/core/custom_types.py +11 -11
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/camo.py +42 -2
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/pw.py +42 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/static.py +1 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/custom.py +2 -1
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/parser.py +9 -9
- {scrapling-0.2.93 → scrapling-0.2.94/scrapling.egg-info}/PKG-INFO +3 -3
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling.egg-info/requires.txt +1 -1
- {scrapling-0.2.93 → scrapling-0.2.94}/setup.cfg +1 -1
- {scrapling-0.2.93 → scrapling-0.2.94}/setup.py +2 -2
- {scrapling-0.2.93 → scrapling-0.2.94}/LICENSE +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/MANIFEST.in +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/cli.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/core/__init__.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/core/_types.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/core/mixins.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/core/storage_adaptors.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/core/translator.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/core/utils.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/defaults.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/constants.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/__init__.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/fingerprints.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/navigation.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/fetchers.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling/py.typed +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling.egg-info/SOURCES.txt +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling.egg-info/entry_points.txt +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/scrapling.egg-info/top_level.txt +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/__init__.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/__init__.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/async/__init__.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/async/test_camoufox.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/async/test_httpx.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/async/test_playwright.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/sync/__init__.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/sync/test_camoufox.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/sync/test_httpx.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/sync/test_playwright.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/fetchers/test_utils.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/parser/__init__.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/parser/test_automatch.py +0 -0
- {scrapling-0.2.93 → scrapling-0.2.94}/tests/parser/test_general.py +0 -0
{scrapling-0.2.93/scrapling.egg-info → scrapling-0.2.94}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: scrapling
-Version: 0.2.93
+Version: 0.2.94
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -40,7 +40,7 @@ Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,socks,zstd]
 Requires-Dist: playwright>=1.49.1
 Requires-Dist: rebrowser-playwright>=1.49.1
-Requires-Dist: camoufox[geoip]>=0.4.
+Requires-Dist: camoufox[geoip]>=0.4.11
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier
@@ -267,7 +267,7 @@ then use it right away without initializing like:
 page = StealthyFetcher.fetch('https://example.com')
 ```

-Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, `history`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
 ### Fetcher
{scrapling-0.2.93 → scrapling-0.2.94}/README.md
@@ -212,7 +212,7 @@ then use it right away without initializing like:
 page = StealthyFetcher.fetch('https://example.com')
 ```

-Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, `history`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
 ### Fetcher
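The newly documented `history` attribute holds one `Response` per redirect hop, oldest first. A minimal usage sketch (the redirecting URL is illustrative only, and `follow_redirects` is assumed to be the pass-through of the underlying httpx option):

```python
from scrapling import Fetcher

# Fetch a URL that redirects; every hop in the redirect chain is kept
# as its own Response object on the final response's `history` list.
page = Fetcher().get('https://httpbin.org/redirect/2', follow_redirects=True)

print(page.status)              # 200 -- the final response
for hop in page.history:        # redirect hops, oldest first
    print(hop.status, hop.url)
```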
{scrapling-0.2.93 → scrapling-0.2.94}/scrapling/__init__.py
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
 from scrapling.parser import Adaptor, Adaptors

 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.93"
+__version__ = "0.2.94"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"

{scrapling-0.2.93 → scrapling-0.2.94}/scrapling/core/custom_types.py
@@ -134,7 +134,7 @@ class TextHandler(str):
         check_match: Literal[True],
         replace_entities: bool = True,
         clean_match: bool = False,
-        case_sensitive: bool =
+        case_sensitive: bool = True,
     ) -> bool:
         ...

@@ -144,26 +144,26 @@ class TextHandler(str):
         regex: Union[str, Pattern[str]],
         replace_entities: bool = True,
         clean_match: bool = False,
-        case_sensitive: bool =
+        case_sensitive: bool = True,
         check_match: Literal[False] = False,
     ) -> "TextHandlers[TextHandler]":
         ...

     def re(
         self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
-        case_sensitive: bool =
+        case_sensitive: bool = True, check_match: bool = False
     ) -> Union["TextHandlers[TextHandler]", bool]:
         """Apply the given regex to the current text and return a list of strings with the matches.

         :param regex: Can be either a compiled regular expression or a string.
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         :param check_match: used to quickly check if this regex matches or not without any operations on the results

         """
         if isinstance(regex, str):
-            if
+            if case_sensitive:
                 regex = re.compile(regex, re.UNICODE)
             else:
                 regex = re.compile(regex, flags=re.UNICODE | re.IGNORECASE)
@@ -182,14 +182,14 @@ class TextHandler(str):
         return TextHandlers(typing.cast(List[_TextHandlerType], [TextHandler(_replace_entities(s)) for s in results]))

     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                 clean_match: bool = False, case_sensitive: bool =
+                 clean_match: bool = False, case_sensitive: bool = True) -> "TextHandler":
         """Apply the given regex to text and return the first match if found, otherwise return the default value.

         :param regex: Can be either a compiled regular expression or a string.
         :param default: The default value to be returned if there is no match
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it

         """
         result = self.re(regex, replace_entities, clean_match=clean_match, case_sensitive=case_sensitive)
@@ -218,14 +218,14 @@ class TextHandlers(List[TextHandler]):
         return typing.cast(_TextHandlerType, TextHandler(lst))

     def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
-           case_sensitive: bool =
+           case_sensitive: bool = True) -> 'TextHandlers[TextHandler]':
         """Call the ``.re()`` method for each element in this list and return
         their results flattened as TextHandlers.

         :param regex: Can be either a compiled regular expression or a string.
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         """
         results = [
             n.re(regex, replace_entities, clean_match, case_sensitive) for n in self
@@ -233,7 +233,7 @@ class TextHandlers(List[TextHandler]):
         return TextHandlers(flatten(results))

     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                 clean_match: bool = False, case_sensitive: bool =
+                 clean_match: bool = False, case_sensitive: bool = True) -> TextHandler:
         """Call the ``.re_first()`` method for each element in this list and return
         the first result or the default value otherwise.

@@ -241,7 +241,7 @@ class TextHandlers(List[TextHandler]):
         :param default: The default value to be returned if there is no match
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         """
         for n in self:
             for result in n.re(regex, replace_entities, clean_match, case_sensitive):
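Taken together, these hunks flip the default: string patterns passed to `.re()`/`.re_first()` now compile case-sensitively, and `case_sensitive=False` opts into `re.IGNORECASE`. A standalone sketch of the compile branch as it reads after the change:

```python
import re
from typing import Pattern, Union

def compile_pattern(regex: Union[str, Pattern[str]], case_sensitive: bool = True) -> Pattern[str]:
    # Mirrors the branch in TextHandler.re() above: plain compile by default,
    # re.IGNORECASE only when the caller passes case_sensitive=False.
    if isinstance(regex, str):
        if case_sensitive:
            return re.compile(regex, re.UNICODE)
        return re.compile(regex, flags=re.UNICODE | re.IGNORECASE)
    return regex  # already compiled; flags are left untouched

assert compile_pattern('foo').search('FOO') is None
assert compile_pattern('foo', case_sensitive=False).search('FOO') is not None
```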
{scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/camo.py
@@ -95,7 +95,6 @@ class CamoufoxEngine:
 with Camoufox(
     geoip=self.geoip,
     proxy=self.proxy,
-    disable_coop=True,
     enable_cache=True,
     addons=self.addons,
     exclude_addons=addons,
@@ -142,6 +141,26 @@ class CamoufoxEngine:
 # PlayWright API sometimes give empty status text for some reason!
 status_text = final_response.status_text or StatusText.get(final_response.status)

+history = []
+current_request = first_response.request.redirected_from
+while current_request:
+    current_response = current_request.response()
+
+    history.insert(0, Response(
+        url=current_request.url,
+        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+        text='',
+        body=b'',
+        status=current_response.status if current_response else 301,
+        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+        encoding=current_response.headers.get('content-type', '') or 'utf-8',
+        cookies={},
+        headers=current_response.all_headers() if current_response else {},
+        request_headers=current_request.all_headers(),
+        **self.adaptor_arguments
+    ))
+    current_request = current_request.redirected_from
+
 response = Response(
     url=page.url,
     text=page.content(),
@@ -152,6 +171,7 @@ class CamoufoxEngine:
     cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
     headers=first_response.all_headers(),
     request_headers=first_response.request.all_headers(),
+    history=history,
     **self.adaptor_arguments
 )
 page.close()
@@ -176,7 +196,6 @@ class CamoufoxEngine:
 async with AsyncCamoufox(
     geoip=self.geoip,
     proxy=self.proxy,
-    disable_coop=True,
     enable_cache=True,
     addons=self.addons,
     exclude_addons=addons,
@@ -223,6 +242,26 @@ class CamoufoxEngine:
 # PlayWright API sometimes give empty status text for some reason!
 status_text = final_response.status_text or StatusText.get(final_response.status)

+history = []
+current_request = first_response.request.redirected_from
+while current_request:
+    current_response = await current_request.response()
+
+    history.insert(0, Response(
+        url=current_request.url,
+        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+        text='',
+        body=b'',
+        status=current_response.status if current_response else 301,
+        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+        encoding=current_response.headers.get('content-type', '') or 'utf-8',
+        cookies={},
+        headers=await current_response.all_headers() if current_response else {},
+        request_headers=await current_request.all_headers(),
+        **self.adaptor_arguments
+    ))
+    current_request = current_request.redirected_from
+
 response = Response(
     url=page.url,
     text=await page.content(),
@@ -233,6 +272,7 @@ class CamoufoxEngine:
     cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
     headers=await first_response.all_headers(),
     request_headers=await first_response.request.all_headers(),
+    history=history,
     **self.adaptor_arguments
 )
 await page.close()
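Both the sync and async variants recover the redirect chain the same way: Playwright links each request to its predecessor through `redirected_from`, so the loop walks backwards from the final request and prepends with `insert(0, ...)` to leave `history` oldest-first, falling back to a generic 301 when a hop's response is unavailable. A minimal standalone sketch of that traversal pattern (using a stand-in linked node type, not Playwright's classes):

```python
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Hop:                        # stand-in for Playwright's Request object
    url: str
    redirected_from: Optional["Hop"] = None

# Chain built the way Playwright links it: the final request points backwards.
first = Hop("http://a.example")
middle = Hop("http://b.example", redirected_from=first)
final = Hop("http://c.example", redirected_from=middle)

history: List[str] = []
node = final.redirected_from
while node:
    history.insert(0, node.url)   # prepend, so the oldest hop ends up first
    node = node.redirected_from

assert history == ["http://a.example", "http://b.example"]
```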
{scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/pw.py
@@ -259,6 +259,26 @@ class PlaywrightEngine:
 # PlayWright API sometimes give empty status text for some reason!
 status_text = final_response.status_text or StatusText.get(final_response.status)

+history = []
+current_request = first_response.request.redirected_from
+while current_request:
+    current_response = current_request.response()
+
+    history.insert(0, Response(
+        url=current_request.url,
+        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+        text='',
+        body=b'',
+        status=current_response.status if current_response else 301,
+        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+        encoding=current_response.headers.get('content-type', '') or 'utf-8',
+        cookies={},
+        headers=current_response.all_headers() if current_response else {},
+        request_headers=current_request.all_headers(),
+        **self.adaptor_arguments
+    ))
+    current_request = current_request.redirected_from
+
 response = Response(
     url=page.url,
     text=page.content(),
@@ -269,6 +289,7 @@ class PlaywrightEngine:
     cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
     headers=first_response.all_headers(),
     request_headers=first_response.request.all_headers(),
+    history=history,
     **self.adaptor_arguments
 )
 page.close()
@@ -345,6 +366,26 @@ class PlaywrightEngine:
 # PlayWright API sometimes give empty status text for some reason!
 status_text = final_response.status_text or StatusText.get(final_response.status)

+history = []
+current_request = first_response.request.redirected_from
+while current_request:
+    current_response = await current_request.response()
+
+    history.insert(0, Response(
+        url=current_request.url,
+        # using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
+        text='',
+        body=b'',
+        status=current_response.status if current_response else 301,
+        reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
+        encoding=current_response.headers.get('content-type', '') or 'utf-8',
+        cookies={},
+        headers=await current_response.all_headers() if current_response else {},
+        request_headers=await current_request.all_headers(),
+        **self.adaptor_arguments
+    ))
+    current_request = current_request.redirected_from
+
 response = Response(
     url=page.url,
     text=await page.content(),
@@ -355,6 +396,7 @@ class PlaywrightEngine:
     cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
     headers=await first_response.all_headers(),
     request_headers=await first_response.request.all_headers(),
+    history=history,
     **self.adaptor_arguments
 )
 await page.close()
{scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/static.py
@@ -72,6 +72,7 @@ class StaticEngine:
     headers=dict(response.headers),
     request_headers=dict(response.request.headers),
     method=response.request.method,
+    history=[self._prepare_response(redirection) for redirection in response.history],
     **self.adaptor_arguments
 )

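Unlike the browser engines, the static engine gets the chain for free: httpx already records intermediate redirect responses on `response.history` (oldest first), so each hop is simply mapped through the engine's `_prepare_response`. A quick illustration of the httpx behavior this builds on (the redirecting endpoint is illustrative only):

```python
import httpx

# httpx keeps each intermediate redirect response on `history`, oldest first.
response = httpx.get("https://httpbin.org/redirect/2", follow_redirects=True)

print(response.status_code)          # 200 -- the final response
for hop in response.history:
    print(hop.status_code, hop.url)  # e.g. 302 for each intermediate hop
```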
{scrapling-0.2.93 → scrapling-0.2.94}/scrapling/engines/toolbelt/custom.py
@@ -85,13 +85,14 @@ class Response(Adaptor):
     """This class is returned by all engines as a way to unify response type between different libraries."""

     def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict,
-                 encoding: str = 'utf-8', method: str = 'GET', **adaptor_arguments: Dict):
+                 encoding: str = 'utf-8', method: str = 'GET', history: List = None, **adaptor_arguments: Dict):
         automatch_domain = adaptor_arguments.pop('automatch_domain', None)
         self.status = status
         self.reason = reason
         self.cookies = cookies
         self.headers = headers
         self.request_headers = request_headers
+        self.history = history or []
         encoding = ResponseEncoding.get_value(encoding, text)
         super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
         # For back-ward compatibility
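Note the `history: List = None` default paired with `self.history = history or []` — the usual Python idiom for avoiding a mutable default argument shared across calls. A tiny illustration of the pitfall it sidesteps:

```python
def bad(item, bucket=[]):        # one list object shared by *every* call
    bucket.append(item)
    return bucket

def good(item, bucket=None):     # fresh list per call, as Response.__init__ does
    bucket = bucket or []
    bucket.append(item)
    return bucket

assert bad(1) == [1] and bad(2) == [1, 2]   # surprise: state leaks between calls
assert good(1) == [1] and good(2) == [2]
```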
{scrapling-0.2.93 → scrapling-0.2.94}/scrapling/parser.py
@@ -132,7 +132,7 @@ class Adaptor(SelectorsGeneration):
         self.__tag = None
         # No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
         self.__response_data = {
-            key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
+            key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'history', 'headers', 'request_headers',)
         } if hasattr(self, 'status') else {}

         # Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
@@ -763,25 +763,25 @@ class Adaptor(SelectorsGeneration):
         return self.get_all_text(strip=True).json()

     def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
-           clean_match: bool = False, case_sensitive: bool =
+           clean_match: bool = False, case_sensitive: bool = True) -> TextHandlers:
         """Apply the given regex to the current text and return a list of strings with the matches.

         :param regex: Can be either a compiled regular expression or a string.
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         """
         return self.text.re(regex, replace_entities, clean_match, case_sensitive)

     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                 clean_match: bool = False, case_sensitive: bool =
+                 clean_match: bool = False, case_sensitive: bool = True) -> TextHandler:
         """Apply the given regex to text and return the first match if found, otherwise return the default value.

         :param regex: Can be either a compiled regular expression or a string.
         :param default: The default value to be returned if there is no match
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         """
         return self.text.re_first(regex, default, replace_entities, clean_match, case_sensitive)

@@ -1009,14 +1009,14 @@ class Adaptors(List[Adaptor]):
         return self.__class__(flatten(results))

     def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
-           clean_match: bool = False, case_sensitive: bool =
+           clean_match: bool = False, case_sensitive: bool = True) -> TextHandlers[TextHandler]:
         """Call the ``.re()`` method for each element in this list and return
         their results flattened as List of TextHandler.

         :param regex: Can be either a compiled regular expression or a string.
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         """
         results = [
             n.text.re(regex, replace_entities, clean_match, case_sensitive) for n in self
@@ -1024,7 +1024,7 @@ class Adaptors(List[Adaptor]):
         return TextHandlers(flatten(results))

     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                 clean_match: bool = False, case_sensitive: bool =
+                 clean_match: bool = False, case_sensitive: bool = True) -> TextHandler:
         """Call the ``.re_first()`` method for each element in this list and return
         the first result or the default value otherwise.

@@ -1032,7 +1032,7 @@ class Adaptors(List[Adaptor]):
         :param default: The default value to be returned if there is no match
         :param replace_entities: if enabled character entity references are replaced by their corresponding character
         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
-        :param case_sensitive: if
+        :param case_sensitive: if disabled, function will set the regex to ignore letters case while compiling it
         """
         for n in self:
             for result in n.re(regex, replace_entities, clean_match, case_sensitive):
{scrapling-0.2.93 → scrapling-0.2.94/scrapling.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: scrapling
-Version: 0.2.93
+Version: 0.2.94
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -40,7 +40,7 @@ Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,socks,zstd]
 Requires-Dist: playwright>=1.49.1
 Requires-Dist: rebrowser-playwright>=1.49.1
-Requires-Dist: camoufox[geoip]>=0.4.
+Requires-Dist: camoufox[geoip]>=0.4.11
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier
@@ -267,7 +267,7 @@ then use it right away without initializing like:
 page = StealthyFetcher.fetch('https://example.com')
 ```

-Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, `history`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
 ### Fetcher
{scrapling-0.2.93 → scrapling-0.2.94}/setup.py
@@ -6,7 +6,7 @@ with open("README.md", "r", encoding="utf-8") as fh:

 setup(
     name="scrapling",
-    version="0.2.93",
+    version="0.2.94",
     description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
 impressive speed improvements over many popular scraping tools.""",
@@ -61,7 +61,7 @@ setup(
     'httpx[brotli,zstd, socks]',
     'playwright>=1.49.1',
     'rebrowser-playwright>=1.49.1',
-    'camoufox[geoip]>=0.4.
+    'camoufox[geoip]>=0.4.11'
 ],
 python_requires=">=3.9",
 url="https://github.com/D4Vinci/Scrapling",