cloe-nessy 0.3.18__py3-none-any.whl → 0.3.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/clients/api_client/__init__.py +10 -1
- cloe_nessy/clients/api_client/api_client.py +19 -8
- cloe_nessy/clients/api_client/api_response.py +7 -4
- cloe_nessy/clients/api_client/pagination_config.py +84 -0
- cloe_nessy/clients/api_client/pagination_strategy.py +500 -0
- cloe_nessy/integration/reader/__init__.py +2 -2
- cloe_nessy/integration/reader/api_reader.py +463 -72
- cloe_nessy/integration/reader/catalog_reader.py +6 -4
- cloe_nessy/integration/reader/excel_reader.py +3 -3
- cloe_nessy/integration/reader/file_reader.py +3 -1
- cloe_nessy/integration/reader/reader.py +1 -1
- cloe_nessy/integration/writer/catalog_writer.py +1 -1
- cloe_nessy/pipeline/actions/__init__.py +1 -1
- cloe_nessy/pipeline/actions/read_api.py +272 -75
- cloe_nessy/pipeline/actions/read_excel.py +1 -1
- cloe_nessy/pipeline/actions/transform_decode.py +2 -1
- cloe_nessy/pipeline/pipeline_config.py +2 -0
- cloe_nessy/pipeline/pipeline_context.py +1 -1
- cloe_nessy/pipeline/pipeline_parsing_service.py +104 -39
- cloe_nessy/pipeline/pipeline_step.py +2 -0
- cloe_nessy/session/__init__.py +2 -1
- cloe_nessy/session/pyspark_compat.py +15 -0
- cloe_nessy/session/session_manager.py +1 -1
- {cloe_nessy-0.3.18.dist-info → cloe_nessy-0.3.19.dist-info}/METADATA +3 -3
- {cloe_nessy-0.3.18.dist-info → cloe_nessy-0.3.19.dist-info}/RECORD +26 -23
- {cloe_nessy-0.3.18.dist-info → cloe_nessy-0.3.19.dist-info}/WHEEL +1 -1
|
@@ -1,3 +1,12 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
|
|
1
3
|
from .api_client import APIClient
|
|
4
|
+
from .api_response import APIResponse
|
|
5
|
+
from .pagination_config import PaginationConfig, PaginationConfigData
|
|
6
|
+
from .pagination_strategy import PaginationStrategy
|
|
7
|
+
|
|
8
|
+
pagination_strategies = {cls.name: cls for cls in PaginationStrategy.__subclasses__()}
|
|
9
|
+
PaginationStrategyType = Enum("PaginationStrategyType", pagination_strategies) # type: ignore[misc]
|
|
10
|
+
|
|
2
11
|
|
|
3
|
-
__all__ = ["APIClient"]
|
|
12
|
+
__all__ = ["APIClient", "APIResponse", "PaginationStrategyType", "PaginationConfig", "PaginationConfigData"]
|
|
@@ -4,6 +4,7 @@ from typing import Any
|
|
|
4
4
|
from urllib.parse import urljoin
|
|
5
5
|
|
|
6
6
|
import requests
|
|
7
|
+
from requests.adapters import HTTPAdapter
|
|
7
8
|
from requests.auth import AuthBase
|
|
8
9
|
|
|
9
10
|
from .api_response import APIResponse
|
|
@@ -28,13 +29,14 @@ class APIClient:
|
|
|
28
29
|
HTTPStatus.GATEWAY_TIMEOUT,
|
|
29
30
|
]
|
|
30
31
|
|
|
31
|
-
MAX_SLEEP_TIME: int =
|
|
32
|
+
MAX_SLEEP_TIME: int = 1800 # seconds
|
|
32
33
|
|
|
33
34
|
def __init__(
|
|
34
35
|
self,
|
|
35
36
|
base_url: str,
|
|
36
37
|
auth: AuthBase | None = None,
|
|
37
|
-
default_headers: dict[str,
|
|
38
|
+
default_headers: dict[str, Any] | None = None,
|
|
39
|
+
pool_maxsize: int = 10,
|
|
38
40
|
):
|
|
39
41
|
"""Initializes the APIClient object.
|
|
40
42
|
|
|
@@ -42,11 +44,15 @@ class APIClient:
|
|
|
42
44
|
base_url: The base URL for the API.
|
|
43
45
|
auth: The authentication method for the API.
|
|
44
46
|
default_headers: Default headers to include in requests.
|
|
47
|
+
pool_maxsize: The maximum pool size for the HTTPAdapter (maximum number of connections to save in the pool).
|
|
45
48
|
"""
|
|
46
49
|
if not base_url.endswith("/"):
|
|
47
50
|
base_url += "/"
|
|
48
51
|
self.base_url = base_url
|
|
49
52
|
self.session = requests.Session()
|
|
53
|
+
self.pool_maxsize = pool_maxsize
|
|
54
|
+
adapter = HTTPAdapter(pool_maxsize=pool_maxsize)
|
|
55
|
+
self.session.mount("https://", adapter)
|
|
50
56
|
if default_headers:
|
|
51
57
|
self.session.headers.update(default_headers)
|
|
52
58
|
self.session.auth = auth
|
|
@@ -56,11 +62,13 @@ class APIClient:
|
|
|
56
62
|
method: str,
|
|
57
63
|
endpoint: str,
|
|
58
64
|
timeout: int = 30,
|
|
59
|
-
params: dict[str,
|
|
60
|
-
data: dict[str,
|
|
61
|
-
json: dict[str,
|
|
62
|
-
headers: dict[str,
|
|
65
|
+
params: dict[str, Any] | None = None,
|
|
66
|
+
data: dict[str, Any] | None = None,
|
|
67
|
+
json: dict[str, Any] | None = None,
|
|
68
|
+
headers: dict[str, Any] | None = None,
|
|
63
69
|
max_retries: int = 0,
|
|
70
|
+
backoff_factor: int = 1,
|
|
71
|
+
raise_for_status: bool = True,
|
|
64
72
|
) -> APIResponse:
|
|
65
73
|
"""Makes a request to the API endpoint.
|
|
66
74
|
|
|
@@ -73,6 +81,8 @@ class APIClient:
|
|
|
73
81
|
json: The JSON data to include in the request.
|
|
74
82
|
headers: The headers to include in the request.
|
|
75
83
|
max_retries: The maximum number of retries for the request.
|
|
84
|
+
backoff_factor: Factor for exponential backoff between retries.
|
|
85
|
+
raise_for_status: Raise HTTPError, if one occurred.
|
|
76
86
|
|
|
77
87
|
Returns:
|
|
78
88
|
APIResponse: The response from the API.
|
|
@@ -98,13 +108,14 @@ class APIClient:
|
|
|
98
108
|
headers=headers,
|
|
99
109
|
)
|
|
100
110
|
if response.status_code not in APIClient.RETRY_CODES:
|
|
101
|
-
|
|
111
|
+
if raise_for_status:
|
|
112
|
+
response.raise_for_status()
|
|
102
113
|
return APIResponse(response)
|
|
103
114
|
except requests.exceptions.HTTPError as err:
|
|
104
115
|
raise APIClientHTTPError(f"HTTP error occurred: {err}") from err
|
|
105
116
|
except requests.exceptions.ConnectionError as err:
|
|
106
117
|
if attempt < max_retries:
|
|
107
|
-
sleep_time = min(2**attempt, APIClient.MAX_SLEEP_TIME)
|
|
118
|
+
sleep_time = min(backoff_factor * (2**attempt), APIClient.MAX_SLEEP_TIME)
|
|
108
119
|
sleep(sleep_time)
|
|
109
120
|
continue
|
|
110
121
|
raise APIClientConnectionError(f"Connection error occurred: {err}") from err
|
|
@@ -56,17 +56,20 @@ class APIResponse:
|
|
|
56
56
|
dict_response = {"value": self.response.text}
|
|
57
57
|
|
|
58
58
|
if key:
|
|
59
|
-
dict_response = dict_response[key]
|
|
59
|
+
dict_response = {"value": dict_response[key]}
|
|
60
60
|
except KeyError as err:
|
|
61
61
|
raise KeyError(
|
|
62
|
-
f"The key '{err.args[0]}' was not found in the response. Status code: {self.status_code},
|
|
62
|
+
f"The key '{err.args[0]}' was not found in the response. Status code: {self.status_code}, "
|
|
63
|
+
f"Headers: {self.headers}, Response: {dict_response}"
|
|
63
64
|
) from err
|
|
64
65
|
except ValueError as err:
|
|
65
66
|
raise ValueError(
|
|
66
|
-
f"Error parsing JSON response: {err}. Status code: {self.status_code}, Headers: {self.headers},
|
|
67
|
+
f"Error parsing JSON response: {err}. Status code: {self.status_code}, Headers: {self.headers}, "
|
|
68
|
+
f"Response content: {self.response.text}"
|
|
67
69
|
) from err
|
|
68
70
|
except Exception as err:
|
|
69
71
|
raise APIClientError(
|
|
70
|
-
f"An unexpected error occurred: {err}. Status code: {self.status_code}, Headers: {self.headers},
|
|
72
|
+
f"An unexpected error occurred: {err}. Status code: {self.status_code}, Headers: {self.headers}, "
|
|
73
|
+
f"Response content: {self.response.text}"
|
|
71
74
|
) from err
|
|
72
75
|
return dict_response
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from typing import Self
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field, field_validator, model_validator
|
|
4
|
+
from typing_extensions import TypedDict
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class PaginationStrategyConfigData(TypedDict, total=False):
|
|
8
|
+
"""Shared config across all strategies."""
|
|
9
|
+
|
|
10
|
+
check_field: str | None # e.g. "results" or "data.items"
|
|
11
|
+
next_page_field: str | None # e.g. "info.next_page"
|
|
12
|
+
max_page: int # hard cap (reader also enforces)
|
|
13
|
+
pages_per_array_limit: int # chunking behavior for output arrays
|
|
14
|
+
preliminary_probe: bool # enable probe_max_page pre-scan
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class LimitOffsetPaginationConfigData(PaginationStrategyConfigData, total=False):
    """Limit-offset pagination options, added on top of the shared strategy keys."""

    # Name of the limit parameter, e.g. "limit" or "page_size".
    limit_field: str
    # Name of the offset parameter, e.g. "offset" or "cursor".
    offset_field: str
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class PageBasedPaginationConfigData(PaginationStrategyConfigData, total=False):
    """Page-based pagination options, added on top of the shared strategy keys."""

    # Name of the page parameter, e.g. "page".
    page_field: str
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class PaginationConfigData(TypedDict, total=False):
|
|
31
|
+
"""Top-level config (what your Pydantic model or dict can accept)."""
|
|
32
|
+
|
|
33
|
+
strategy: str # "limit_offset" | "page_based" | ...
|
|
34
|
+
# strategy-specific fields:
|
|
35
|
+
limit_field: str
|
|
36
|
+
offset_field: str
|
|
37
|
+
page_field: str
|
|
38
|
+
# shared/advanced fields:
|
|
39
|
+
check_field: str | None
|
|
40
|
+
next_page_field: str | None
|
|
41
|
+
max_page: int
|
|
42
|
+
pages_per_array_limit: int
|
|
43
|
+
preliminary_probe: bool
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class PaginationConfig(BaseModel):
    """Configuration model for pagination options.

    Two validators run on construction:

    * ``_validate_strategy`` rejects unknown strategies and strategies that
      are named but not implemented yet.
    * ``_validate_strategy_config`` checks that the parameter-name fields
      required by the chosen strategy are set.
    """

    strategy: str = Field(..., description="Pagination strategy (limit_offset, page_based, cursor_based, etc.)")
    check_field: str | None = Field(None, description="Field to check for emptiness of response.")
    next_page_field: str | None = Field(None, description="Field that indicates there is a next page.")
    limit_field: str | None = Field(
        None, description="Name of the limit parameter field for items per page or request."
    )
    offset_field: str | None = Field(
        None, description="Name of the offset parameter field for items per page or request."
    )
    page_field: str | None = Field(None, description="Name of the page parameter field.")
    max_page: int = Field(-1, description="Amount of pages to fetch. If not set, will fetch all available data.")
    pages_per_array_limit: int = Field(-1, description="Maximum number of pages per array.")
    preliminary_probe: bool = Field(
        False, description="Whether to perform a preliminary probe to determine the total number of pages."
    )

    @field_validator("strategy", mode="before")
    @classmethod
    def _validate_strategy(cls, v: str) -> str:
        """Validates the pagination strategy.

        Args:
            v: The raw strategy value passed to the model.

        Returns:
            The unchanged strategy name when it is implemented.

        Raises:
            NotImplementedError: If the strategy is ``cursor_based`` or ``time_based``.
            ValueError: If the strategy is not a known strategy name.
        """
        # Tuples (not sets) so that an unhashable raw value still yields the
        # ValueError below instead of a TypeError from hashing.
        planned_strategies = ("cursor_based", "time_based")
        supported_strategies = ("limit_offset", "page_based")

        # Checked first: previously these names were part of the "supported"
        # list, which made this branch unreachable and let them pass validation.
        if v in planned_strategies:
            raise NotImplementedError("cursor_based and time_based are not yet supported.")
        if v not in supported_strategies:
            supported_str = ", ".join(supported_strategies)
            raise ValueError(f"Unsupported pagination strategy: {v}. Supported strategies: {supported_str}")
        return v

    @model_validator(mode="after")
    def _validate_strategy_config(self) -> Self:
        """Validates the configuration of the pagination strategy.

        Returns:
            The validated model instance.

        Raises:
            ValueError: If ``limit_offset`` is chosen without both ``limit_field``
                and ``offset_field``, or ``page_based`` without ``page_field``.
        """
        if self.strategy == "limit_offset" and (self.limit_field is None or self.offset_field is None):
            raise ValueError(f"Both <limit_field> and <offset_field> must be set for strategy '{self.strategy}'")
        if self.strategy == "page_based" and self.page_field is None:
            raise ValueError(f"<page_field> must be set for strategy '{self.strategy}'")
        return self
|