PyPI - cloe-nessy - Versions diffs - 0.3.18__py3-none-any.whl → 0.3.19__py3-none-any.whl - Mend

cloe-nessy 0.3.18__py3-none-any.whl → 0.3.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

cloe_nessy/clients/api_client/__init__.py +10 -1
cloe_nessy/clients/api_client/api_client.py +19 -8
cloe_nessy/clients/api_client/api_response.py +7 -4
cloe_nessy/clients/api_client/pagination_config.py +84 -0
cloe_nessy/clients/api_client/pagination_strategy.py +500 -0
cloe_nessy/integration/reader/__init__.py +2 -2
cloe_nessy/integration/reader/api_reader.py +463 -72
cloe_nessy/integration/reader/catalog_reader.py +6 -4
cloe_nessy/integration/reader/excel_reader.py +3 -3
cloe_nessy/integration/reader/file_reader.py +3 -1
cloe_nessy/integration/reader/reader.py +1 -1
cloe_nessy/integration/writer/catalog_writer.py +1 -1
cloe_nessy/pipeline/actions/__init__.py +1 -1
cloe_nessy/pipeline/actions/read_api.py +272 -75
cloe_nessy/pipeline/actions/read_excel.py +1 -1
cloe_nessy/pipeline/actions/transform_decode.py +2 -1
cloe_nessy/pipeline/pipeline_config.py +2 -0
cloe_nessy/pipeline/pipeline_context.py +1 -1
cloe_nessy/pipeline/pipeline_parsing_service.py +104 -39
cloe_nessy/pipeline/pipeline_step.py +2 -0
cloe_nessy/session/__init__.py +2 -1
cloe_nessy/session/pyspark_compat.py +15 -0
cloe_nessy/session/session_manager.py +1 -1
{cloe_nessy-0.3.18.dist-info → cloe_nessy-0.3.19.dist-info}/METADATA +3 -3
{cloe_nessy-0.3.18.dist-info → cloe_nessy-0.3.19.dist-info}/RECORD +26 -23
{cloe_nessy-0.3.18.dist-info → cloe_nessy-0.3.19.dist-info}/WHEEL +1 -1

cloe_nessy/clients/api_client/__init__.py CHANGED Viewed

@@ -1,3 +1,12 @@
+from enum import Enum
 from .api_client import APIClient
+from .api_response import APIResponse
+from .pagination_config import PaginationConfig, PaginationConfigData
+from .pagination_strategy import PaginationStrategy
+pagination_strategies = {cls.name: cls for cls in PaginationStrategy.__subclasses__()}
+PaginationStrategyType = Enum("PaginationStrategyType", pagination_strategies)  # type: ignore[misc]
-__all__ = ["APIClient"]
+__all__ = ["APIClient", "APIResponse", "PaginationStrategyType", "PaginationConfig", "PaginationConfigData"]

cloe_nessy/clients/api_client/api_client.py CHANGED Viewed

@@ -4,6 +4,7 @@ from typing import Any
 from urllib.parse import urljoin
 import requests
+from requests.adapters import HTTPAdapter
 from requests.auth import AuthBase
 from .api_response import APIResponse
@@ -28,13 +29,14 @@ class APIClient:
         HTTPStatus.GATEWAY_TIMEOUT,
     ]
-    MAX_SLEEP_TIME: int = 180  # seconds
+    MAX_SLEEP_TIME: int = 1800  # seconds
     def __init__(
         self,
         base_url: str,
         auth: AuthBase | None = None,
-        default_headers: dict[str, str] | None = None,
+        default_headers: dict[str, Any] | None = None,
+        pool_maxsize: int = 10,
     ):
         """Initializes the APIClient object.
@@ -42,11 +44,15 @@ class APIClient:
             base_url: The base URL for the API.
             auth: The authentication method for the API.
             default_headers: Default headers to include in requests.
+            pool_maxsize: The maximum pool size for the HTTPAdapter (maximum number of connections to save in the pool).
         """
         if not base_url.endswith("/"):
             base_url += "/"
         self.base_url = base_url
         self.session = requests.Session()
+        self.pool_maxsize = pool_maxsize
+        adapter = HTTPAdapter(pool_maxsize=pool_maxsize)
+        self.session.mount("https://", adapter)
         if default_headers:
             self.session.headers.update(default_headers)
         self.session.auth = auth
@@ -56,11 +62,13 @@ class APIClient:
         method: str,
         endpoint: str,
         timeout: int = 30,
-        params: dict[str, str] | None = None,
-        data: dict[str, str] | None = None,
-        json: dict[str, str] | None = None,
-        headers: dict[str, str] | None = None,
+        params: dict[str, Any] | None = None,
+        data: dict[str, Any] | None = None,
+        json: dict[str, Any] | None = None,
+        headers: dict[str, Any] | None = None,
         max_retries: int = 0,
+        backoff_factor: int = 1,
+        raise_for_status: bool = True,
     ) -> APIResponse:
         """Makes a request to the API endpoint.
@@ -73,6 +81,8 @@ class APIClient:
             json: The JSON data to include in the request.
             headers: The headers to include in the request.
             max_retries: The maximum number of retries for the request.
+            backoff_factor: Factor for exponential backoff between retries.
+            raise_for_status: Raise HTTPError, if one occurred.
         Returns:
             APIResponse: The response from the API.
@@ -98,13 +108,14 @@ class APIClient:
                     headers=headers,
                 )
                 if response.status_code not in APIClient.RETRY_CODES:
-                    response.raise_for_status()
+                    if raise_for_status:
+                        response.raise_for_status()
                     return APIResponse(response)
             except requests.exceptions.HTTPError as err:
                 raise APIClientHTTPError(f"HTTP error occurred: {err}") from err
             except requests.exceptions.ConnectionError as err:
                 if attempt < max_retries:
-                    sleep_time = min(2**attempt, APIClient.MAX_SLEEP_TIME)
+                    sleep_time = min(backoff_factor * (2**attempt), APIClient.MAX_SLEEP_TIME)
                     sleep(sleep_time)
                     continue
                 raise APIClientConnectionError(f"Connection error occurred: {err}") from err

cloe_nessy/clients/api_client/api_response.py CHANGED Viewed

@@ -56,17 +56,20 @@ class APIResponse:
                 dict_response = {"value": self.response.text}
             if key:
-                dict_response = dict_response[key]
+                dict_response = {"value": dict_response[key]}
         except KeyError as err:
             raise KeyError(
-                f"The key '{err.args[0]}' was not found in the response. Status code: {self.status_code}, Headers: {self.headers}, Response: {dict_response}"
+                f"The key '{err.args[0]}' was not found in the response. Status code: {self.status_code}, "
+                f"Headers: {self.headers}, Response: {dict_response}"
             ) from err
         except ValueError as err:
             raise ValueError(
-                f"Error parsing JSON response: {err}. Status code: {self.status_code}, Headers: {self.headers}, Response content: {self.response.text}"
+                f"Error parsing JSON response: {err}. Status code: {self.status_code}, Headers: {self.headers}, "
+                f"Response content: {self.response.text}"
             ) from err
         except Exception as err:
             raise APIClientError(
-                f"An unexpected error occurred: {err}. Status code: {self.status_code}, Headers: {self.headers}, Response content: {self.response.text}"
+                f"An unexpected error occurred: {err}. Status code: {self.status_code}, Headers: {self.headers}, "
+                f"Response content: {self.response.text}"
             ) from err
         return dict_response

cloe_nessy/clients/api_client/pagination_config.py ADDED Viewed

@@ -0,0 +1,84 @@
+from typing import Self
+from pydantic import BaseModel, Field, field_validator, model_validator
+from typing_extensions import TypedDict
+class PaginationStrategyConfigData(TypedDict, total=False):
+    """Shared config across all strategies."""
+    check_field: str | None  # e.g. "results" or "data.items"
+    next_page_field: str | None  # e.g. "info.next_page"
+    max_page: int  # hard cap (reader also enforces)
+    pages_per_array_limit: int  # chunking behavior for output arrays
+    preliminary_probe: bool  # enable probe_max_page pre-scan
+class LimitOffsetPaginationConfigData(PaginationStrategyConfigData, total=False):
+    """Config for limit-offset pagination."""
+    limit_field: str  # e.g. "limit" or "page_size"
+    offset_field: str  # e.g. "offset" or "cursor"
+class PageBasedPaginationConfigData(PaginationStrategyConfigData, total=False):
+    """Config for page-based pagination."""
+    page_field: str  # e.g. "page"
+class PaginationConfigData(TypedDict, total=False):
+    """Top-level config (what your Pydantic model or dict can accept)."""
+    strategy: str  # "limit_offset" | "page_based" | ...
+    # strategy-specific fields:
+    limit_field: str
+    offset_field: str
+    page_field: str
+    # shared/advanced fields:
+    check_field: str | None
+    next_page_field: str | None
+    max_page: int
+    pages_per_array_limit: int
+    preliminary_probe: bool
+class PaginationConfig(BaseModel):
+    """Configuration model for pagination options."""
+    strategy: str = Field(..., description="Pagination strategy (limit_offset, page_based, cursor_based, etc.)")
+    check_field: str | None = Field(None, description="Field to check for emptiness of response.")
+    next_page_field: str | None = Field(None, description="Field that indicates there is a next page.")
+    limit_field: str | None = Field(
+        None, description="Name of the limit parameter field for items per page or request."
+    )
+    offset_field: str | None = Field(
+        None, description="Name of the offset parameter field for items per page or request."
+    )
+    page_field: str | None = Field(None, description="Name of the page parameter field.")
+    max_page: int = Field(-1, description="Amount of pages to fetch. If not set, will fetch all available data.")
+    pages_per_array_limit: int = Field(-1, description="Maximum number of pages per array.")
+    preliminary_probe: bool = Field(
+        False, description="Whether to perform a preliminary probe to determine the total number of pages."
+    )
+    @field_validator("strategy", mode="before")
+    @classmethod
+    def _validate_strategy(cls, v: str) -> str:
+        """Validates the pagination strategy."""
+        supported_strategies = ["limit_offset", "page_based", "cursor_based", "time_based"]
+        if v not in supported_strategies:
+            if v in ["cursor_based", "time_based"]:
+                raise NotImplementedError("cursor_based and time_based are not yet supported.")
+            supported_str = ", ".join(supported_strategies)
+            raise ValueError(f"Unsupported pagination strategy: {v}. Supported strategies: {supported_str}")
+        return v
+    @model_validator(mode="after")
+    def _validate_strategy_config(self) -> Self:
+        """Validates the configuration of the pagination strategy."""
+        if self.strategy == "limit_offset" and any(field is None for field in [self.limit_field, self.offset_field]):
+            raise ValueError(f"Both <limit_field> and <offset_field> must be set for strategy '{self.strategy}'")
+        if self.strategy == "page_based" and self.page_field is None:
+            raise ValueError(f"<page_field> must be set for strategy '{self.strategy}'")
+        return self

cloe-nessy 0.3.18__py3-none-any.whl → 0.3.19__py3-none-any.whl

cloe-nessy 0.3.18__py3-none-any.whl → 0.3.19__py3-none-any.whl