cloe-nessy 0.3.18__py3-none-any.whl → 0.3.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -33,7 +33,7 @@ pipeline_actions = {cls.name: cls for cls in PipelineAction.__subclasses__()}
  # Register all subclasses dynamically as enum using their "name" attribute as
  # key. We need to do this here, because otherwise we don't get all subclasses
  # from a relative import of PipelineAction
- PipelineActionType = Enum("PipelineActionType", pipeline_actions) # type: ignore
+ PipelineActionType = Enum("PipelineActionType", pipeline_actions) # type: ignore[misc]

  __all__ = [
  "ReadAPIAction",
@@ -1,10 +1,12 @@
  from collections.abc import Mapping
  from typing import Any, cast

+ from pydantic import ConfigDict, validate_call
  from requests.auth import AuthBase, HTTPBasicAuth

+ from ...clients.api_client import PaginationConfig, PaginationConfigData
  from ...clients.api_client.auth import AzureCredentialAuth, ChainedAuth, EnvVariableAuth, SecretScopeAuth
- from ...integration.reader import APIReader
+ from ...integration.reader import APIReader, RequestSet
  from ..pipeline_action import PipelineAction
  from ..pipeline_context import PipelineContext

@@ -12,11 +14,7 @@ from ..pipeline_context import PipelineContext
  def process_auth(
  auth: Mapping[str, str | Mapping[str, str] | list[Mapping[str, str]]] | AuthBase | None,
  ) -> AuthBase | None:
- """Processes the auth parameter to create an AuthBase object.
-
- Args:
- auth: The auth parameter to be processed.
- """
+ """Processes the auth parameter to create an AuthBase object."""
  result: AuthBase | None = None

  if isinstance(auth, list):
@@ -27,11 +25,9 @@ def process_auth(
  case "basic":
  result = HTTPBasicAuth(auth["username"], auth["password"])
  case "secret_scope":
- secret_scope_header_template: dict[str, str] = auth["header_template"]
- result = SecretScopeAuth(secret_scope_header_template, auth["secret_scope"])
+ result = SecretScopeAuth(auth["header_template"], auth["secret_scope"])
  case "env":
- env_header_template: dict[str, str] = auth["header_template"]
- result = EnvVariableAuth(env_header_template)
+ result = EnvVariableAuth(auth["header_template"])
  case "azure_oauth":
  result = AzureCredentialAuth(
  scope=auth["scope"],
@@ -40,9 +36,12 @@
  tenant_id=auth["tenant_id"],
  )
  case _:
- raise ValueError("Invalid auth type specified. Supported types are: basic, secret_scope, env")
+ raise ValueError(
+ "Invalid auth type specified. Supported types are: basic, secret_scope, env, azure_oauth"
+ )
  else:
- result = cast(AuthBase, auth)
+ if isinstance(auth, AuthBase):
+ result = auth # Assume it's already an AuthBase instance

  return result
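To make the reworked control flow concrete, here is a small usage sketch of `process_auth` after this change. It is illustrative only: the import path is assumed, and the handling of a list of mappings (folding the entries into a `ChainedAuth`) is inferred from the new imports and the "can be chained" documentation further down, since that branch lies outside these hunks.

```python
from requests.auth import AuthBase, HTTPBasicAuth

# Assumed import path for illustration; the diff does not name the module.
from cloe_nessy.pipeline.actions.read_api import process_auth

# A list of mappings mirrors the YAML `auth:` blocks documented below; each entry
# selects one flavour via its "type" key, and the results are presumably combined
# into a ChainedAuth that is applied to every request.
chained = process_auth(
    [
        {"type": "basic", "username": "svc_user", "password": "s3cret"},
        {"type": "env", "header_template": {"X-API-Key": "MY_ENV_VAR"}},
    ]
)

# An object that is already an AuthBase is now passed through unchanged instead
# of being blindly cast; other unexpected values fall back to None.
already_auth: AuthBase = HTTPBasicAuth("svc_user", "s3cret")
assert process_auth(already_auth) is already_auth
```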
@@ -50,9 +49,10 @@ def process_auth(
  class ReadAPIAction(PipelineAction):
  """Reads data from an API and loads it into a Spark DataFrame.

- This method uses the provided API parameters to make a request using the
- [`APIReader`][cloe_nessy.integration.reader.api_reader] and return a
- DataFrame containing the response data.
+ This action executes HTTP requests (optionally paginated) in parallel using the
+ [`APIReader`][cloe_nessy.integration.reader.api_reader] and returns a DataFrame
+ containing the response payloads plus request/response metadata. No intermediate
+ files are written.

  Example:
  === "Basic Usage"
@@ -63,6 +63,7 @@ class ReadAPIAction(PipelineAction):
  base_url: https://some_url.com/api/
  endpoint: my/endpoint/
  ```
+
  === "Usage with Parameters and Headers"
  ```yaml
  Read API:
@@ -73,56 +74,211 @@ class ReadAPIAction(PipelineAction):
  method: GET
  timeout: 90
  headers:
- key1: value1
- key2: value2
+ Accept: application/json
+ X-Request: foo
  params:
- key1: value1
- key2: value2
+ q: widget
+ include: details
  ```
- === "Usage with Authentication"
+
+ === "Usage with Authentication (can be chained)"
  ```yaml
  Read API:
  action: READ_API
  options:
  base_url: https://some_url.com/api/
  endpoint: my/endpoint/
- method: GET
- timeout: 90
  auth:
  - type: basic
  username: my_username
  password: my_password
- - type: secret_scope
- secret_scope: my_secret_scope
+ - type: env
  header_template:
- "header_key_1": "<ENVIRONMENT_VARIABLE_NAME>"
+ "X-API-Key": "<ENV_VAR_NAME>"
  - type: secret_scope
  secret_scope: my_secret_scope
  header_template:
- "header_key_2": "<SECRET_NAME>"
- - type: secret_scope
- secret_scope: my_other_secret_scope
- header_template:
- "header_key_3": "<SECRET_NAME>"
+ "X-ORG-Token": "<SECRET_NAME>"
  - type: azure_oauth
  client_id: my_client_id
  client_secret: my_client_secret
  tenant_id: my_tenant_id
  scope: <entra-id-client-id>
  ```
+ The above will combine credentials (via `ChainedAuth`) so that headers from `env`/`secret_scope`
+ are merged and auth flows like Basic / Azure OAuth are applied to each request.
+
+ === "Extracting a Nested Field (key)"
+ If the API returns a large JSON object but you only want a nested list (e.g. `data.items`):
+ ```yaml
+ Read API:
+ action: READ_API
+ options:
+ base_url: https://some_url.com/api/
+ endpoint: reports/
+ key: data.items
+ ```
+
+ === "Pagination (Supported: page_based, limit_offset)"
+ Only `page_based` and `limit_offset` strategies are currently supported. You may also
+ supply the shared/advanced options `check_field`, `next_page_field`, `max_page`,
+ `pages_per_array_limit`, and `preliminary_probe`.
+
+ **1) Page-Based Pagination**
+ ```yaml
+ Read API:
+ action: READ_API
+ options:
+ base_url: https://some_url.com/api/
+ endpoint: items/
+ params:
+ page: 1 # starting page (optional; defaults to 1)
+ per_page: 100
+ pagination:
+ strategy: page_based
+ page_field: page # required
+ # Shared/advanced (optional):
+ check_field: results # e.g. list to check for emptiness
+ next_page_field: info.has_next # boolean flag; if present it is trusted
+ max_page: -1 # -1 = all pages
+ pages_per_array_limit: 2 # chunk output rows every 2 pages
+ preliminary_probe: false # set true to pre-scan/build all page params
+ ```
+ This issues requests like:
+ ```
+ GET .../items/?page=1&per_page=100
+ GET .../items/?page=2&per_page=100
+ ...
+ ```
+
+ **2) Limit/Offset Pagination**
+ ```yaml
+ Read API:
+ action: READ_API
+ options:
+ base_url: https://some_url.com/api/
+ endpoint: products/
+ params:
+ limit: 50
+ offset: 0
+ pagination:
+ strategy: limit_offset
+ limit_field: limit # required
+ offset_field: offset # required
+ # Shared/advanced (optional):
+ check_field: data.items
+ next_page_field: page_info.has_next
+ max_page: -1
+ pages_per_array_limit: -1
+ preliminary_probe: false
+ ```
+ This issues requests like:
+ ```
+ GET .../products/?limit=50&offset=0
+ GET .../products/?limit=50&offset=50
+ GET .../products/?limit=50&offset=100
+ ...
+ ```
+
+ **Using `preliminary_probe` to pre-compute all pages**
+ If `preliminary_probe: true` is set, the reader will first probe the API to determine
+ the final page (using `check_field` and/or `next_page_field`) and then fan out one request
+ per page/offset—useful when driving fully parallel execution:
+ ```yaml
+ Read API:
+ action: READ_API
+ options:
+ base_url: https://api.example.com/
+ endpoint: orders/
+ params:
+ limit: 100
+ offset: 0
+ pagination:
+ strategy: limit_offset
+ limit_field: limit
+ offset_field: offset
+ check_field: data
+ preliminary_probe: true
+ max_concurrent_requests: 16
+ ```

- The above example will combine the headers from the different auth types. The resulting header will look like this:
+ === "Retries and Concurrency"
+ ```yaml
+ Read API:
+ action: READ_API
+ options:
+ base_url: https://some_url.com/api/
+ endpoint: heavy/endpoint/
+ max_retries: 3 # network/5xx retry count
+ backoff_factor: 2 # exponential backoff multiplier
+ max_concurrent_requests: 16
+ timeout: 60
+ ```

- ```json
- {
- "header_key_1": "value_from_environment_variable",
- "header_key_2": "value_from_secret",
- "header_key_3": "value_from_secret",
- "Authorization": "Bearer <access_token> (from azure_oauth)",
- "Authorization": "Basic am9obkBleGFtcGxlLmNvbTphYmMxMjM= (from basic)"
- }
+ === "Default Headers on All Requests"
+ ```yaml
+ Read API:
+ action: READ_API
+ options:
+ base_url: https://some_url.com/api/
+ endpoint: v1/resources
+ default_headers:
+ X-Client: my-pipeline
+ Accept: application/json
+ headers:
+ X-Request: custom
  ```

+ === "Deriving Requests from Context (multiple dynamic requests)"
+ When `requests_from_context: true`, distinct rows from the upstream `context.data`
+ are converted into individual requests (enabling heterogeneous endpoints/params).
+ The DataFrame must have columns: `endpoint`, `params`, `headers`, `data`, `json_body`.
+
+ ```yaml
+ # Upstream step produces rows like:
+ # | endpoint | params | headers | data | json_body |
+ # | "u/123/profile" | {"verbose": "true"} | null | null | null |
+ # | "u/456/profile" | {"verbose": "false"} | null | null | null |
+
+ Read API:
+ action: READ_API
+ options:
+ base_url: https://some_url.com/api/
+ requests_from_context: true
+ method: GET
+ timeout: 45
+ ```
+
+ Output:
+ The action returns a Spark DataFrame with one column `json_response` (ArrayType).
+ Each element contains:
+ ```json
+ {
+ "response": "<json string of the API payload (optionally reduced by 'key')>",
+ "__metadata": {
+ "timestamp": "YYYY-MM-DD HH:MM:SS.ssssss",
+ "base_url": "https://some_url.com/api/",
+ "url": "https://some_url.com/api/my/endpoint/?q=...",
+ "status_code": 200,
+ "reason": "OK",
+ "elapsed": 0.123,
+ "endpoint": "my/endpoint/",
+ "query_parameters": { "q": "..." }
+ }
+ }
+ ```
+ When pagination is enabled and `pages_per_array_limit` > 0, responses are chunked
+ into arrays of that many pages; otherwise all pages for a request are grouped together.
+
+ Validation & Errors:
+ - `base_url` must be provided.
+ - Either `endpoint` must be provided **or** `requests_from_context` must be `true`.
+ - If `requests_from_context` is `true`, `context.data` must be present and non-empty.
+ - Pagination config:
+ - `strategy` must be `page_based` or `limit_offset` (other strategies are not yet supported).
+ - For `page_based`, `page_field` is required.
+ - For `limit_offset`, both `limit_field` and `offset_field` are required.
+
  !!! warning "Secret information"
  Don't write sensitive information like passwords or tokens directly in the pipeline configuration.
  Use secret scopes or environment variables instead.
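The docstring above fully describes the shape of the action's output, so a short downstream sketch may help when consuming it. The snippet below is illustrative and not part of the packaged code: it assumes a DataFrame `df` produced by `READ_API` with the `json_response` schema documented above, and it uses standard PySpark functions (`explode`, `from_json`, `schema_of_json`, the latter two already imported elsewhere in this release).

```python
from pyspark.sql.functions import col, explode, from_json, schema_of_json

# `df` is assumed to be the DataFrame returned by the READ_API action: a single
# `json_response` array column whose elements hold the payload string plus metadata.
flat = df.select(explode(col("json_response")).alias("entry")).select(
    col("entry.response").alias("response"),
    col("entry.__metadata").alias("metadata"),
)

# Infer a schema from one sample payload, then parse every response string
# into typed columns for further processing.
sample = flat.select("response").first()["response"]
parsed = flat.withColumn("payload", from_json(col("response"), schema_of_json(sample)))
```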
@@ -130,73 +286,114 @@ class ReadAPIAction(PipelineAction):

  name: str = "READ_API"

- @staticmethod
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
  def run(
+ self,
  context: PipelineContext,
  *,
  base_url: str | None = None,
- auth: AuthBase | dict[str, str] | None = None,
- default_headers: dict[str, str] | None = None,
- endpoint: str = "", # www.neo4j.de/api/table/2020/01/01
+ auth: Mapping[str, str | Mapping[str, str] | list[Mapping[str, str]]] | None = None,
+ endpoint: str | None = None,
+ default_headers: dict[str, Any] | None = None,
  method: str = "GET",
  key: str | None = None,
  timeout: int = 30,
- params: dict[str, str] | None = None,
- headers: dict[str, str] | None = None,
- data: dict[str, str] | None = None,
- json: dict[str, str] | None = None,
+ params: dict[str, Any] | None = None,
+ headers: dict[str, Any] | None = None,
+ data: dict[str, Any] | None = None,
+ json_body: dict[str, Any] | None = None,
+ pagination: PaginationConfigData | None = None,
  max_retries: int = 0,
- options: dict[str, str] | None = None,
+ backoff_factor: int = 0,
+ max_concurrent_requests: int = 8,
+ requests_from_context: bool = False,
  **_: Any,
  ) -> PipelineContext:
- """Utility class for reading an API into a DataFrame.
+ """Executes API requests in parallel by using mapInPandas.

- This class uses an APIClient to fetch data from an API and load it into a Spark DataFrame.
+ We do NOT write intermediate files; instead we directly return the responses
+ as rows in a Spark DataFrame.


  Args:
- context: The pipeline context containing information about the pipeline.
- base_url: The base URL for the API to be called.
- auth: The authentication credentials for the API.
- default_headers: Default headers to include in the API request.
- endpoint: The specific API endpoint to call.
- method: The HTTP method to use for the request (default is "GET").
- key: Key for accessing specific data in the response.
- timeout: Timeout for the API request in seconds (default is 30).
- params: URL parameters to include in the API request.
- headers: Additional headers to include in the request.
- data: Data to send with the request for POST methods.
- json: JSON data to send with the request for POST methods.
- max_retries: Maximum number of retries for the API request (default is 0).
- options: Additional options for the API request.
+ context: The pipeline context used to carry data between actions.
+ base_url: The base URL for all API requests.
+ auth: Authentication configuration, which may be a simple header map,
+ a nested map for different auth scopes, or a list thereof.
+ endpoint: The specific path to append to the base URL for this call.
+ default_headers: Headers to include on every request.
+ method: HTTP method to use.
+ key: JSON field name to extract from each response.
+ timeout: Request timeout in seconds.
+ params: Query parameters to append to the URL.
+ headers: Additional request-specific headers.
+ data: Form-encoded body to send.
+ json_body: JSON-encoded body to send.
+ pagination: Configuration for paginated endpoints.
+ max_retries: Number of times to retry on failure.
+ backoff_factor: Multiplier for retry backoff delays.
+ max_concurrent_requests: Maximum number of parallel API calls.
+ requests_from_context: Whether to derive request parameters from context data.

  Returns:
- The updated pipeline context containing the DataFrame with the API response data.
+ The updated context, with the read data as a DataFrame.

  Raises:
- ValueError: If the base_url is not specified.
+ ValueError: If no base URL is provided.
+ ValueError: If neither an endpoint nor context-derived requests are specified.
+ ValueError: If context-derived requests are enabled but no data is present in context.
  """
- if not options:
- options = dict()
+ deserialized_auth = process_auth(auth)
+ pagination_config = PaginationConfig(**pagination) if pagination is not None else None

  if base_url is None:
- raise ValueError("base_url must be specified to fetch data from API.")
+ raise ValueError("A value for base_url must to be supplied")

- deserialized_auth = process_auth(auth)
+ if endpoint is None and not requests_from_context:
+ raise ValueError("A value for endpoint must to be supplied")

- api_reader = APIReader(base_url=base_url, auth=deserialized_auth, default_headers=default_headers)
+ api_reader = APIReader(
+ base_url=base_url,
+ auth=deserialized_auth,
+ default_headers=default_headers,
+ max_concurrent_requests=max_concurrent_requests,
+ )
+
+ dynamic_requests: list[RequestSet] | None = None
+
+ if requests_from_context:
+ if not context.data:
+ raise ValueError("Cannot generate requests from the context without a DataFrame in the context.")
+
+ dynamic_requests = [
+ cast(RequestSet, row.asDict())
+ for row in context.data.select(
+ "endpoint",
+ "params",
+ "headers",
+ "data",
+ "json_body",
+ )
+ .distinct()
+ .collect()
+ ]

  df = api_reader.read(
- method=method,
  endpoint=endpoint,
+ method=method,
+ key=key,
  timeout=timeout,
  params=params,
- key=key,
  headers=headers,
  data=data,
- json=json,
+ json_body=json_body,
+ pagination_config=pagination_config,
  max_retries=max_retries,
- options=options,
+ backoff_factor=backoff_factor,
+ dynamic_requests=dynamic_requests,
  )

+ row_count = df.count()
+ self._console_logger.info(f"API requests completed. Final row count = {row_count}.")
+
  return context.from_existing(data=df)
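For orientation, a minimal sketch of invoking the reworked signature directly. It is illustrative only: the import path and the argument-free constructor are assumptions not shown in this diff, `context` stands for the PipelineContext handed over by the previous step, and the pagination keys follow the docstring examples above.

```python
# Assumed import path; the diff only shows ReadAPIAction being re-exported via __all__.
from cloe_nessy.pipeline.actions import ReadAPIAction

action = ReadAPIAction()

# run() is now an instance method validated by pydantic's validate_call; the plain
# pagination mapping below is turned into a PaginationConfig inside run().
result_context = action.run(
    context,
    base_url="https://some_url.com/api/",
    endpoint="items/",
    params={"page": 1, "per_page": 100},
    pagination={
        "strategy": "page_based",
        "page_field": "page",
        "check_field": "results",
        "max_page": -1,
    },
    max_retries=3,
    backoff_factor=2,
    max_concurrent_requests=16,
    timeout=60,
)

# The responses land in result_context.data as the `json_response` DataFrame
# described in the docstring above.
```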
@@ -1,7 +1,7 @@
  from collections.abc import Callable
  from functools import reduce

- from pyspark.sql import DataFrame
+ from cloe_nessy.session import DataFrame

  from ...file_utilities import get_file_paths
  from ...integration.reader import ExcelDataFrameReader
@@ -1,8 +1,9 @@
  from typing import Any

- from pyspark.sql import DataFrame
  from pyspark.sql.functions import col, from_json, schema_of_json, unbase64

+ from cloe_nessy.session import DataFrame
+
  from ..pipeline_action import PipelineAction
  from ..pipeline_context import PipelineContext

@@ -83,6 +83,7 @@ class PipelineStepConfig(PipelineConfigBaseModel):
  context: str | None = None
  table_metadata: str | None = None
  options: dict = Field(default_factory=dict)
+ env: dict = Field(default_factory=dict)


  class PipelineConfig(PipelineConfigBaseModel):
@@ -90,3 +91,4 @@ class PipelineConfig(PipelineConfigBaseModel):

  name: str
  steps: OrderedDict[str, PipelineStepConfig]
+ env: dict[str, str] = Field(default_factory=dict)
@@ -1,6 +1,6 @@
  from typing import Any

- from pyspark.sql import DataFrame
+ from cloe_nessy.session import DataFrame

  from ..models import Table