cloe-nessy 0.3.17.0__py3-none-any.whl → 0.3.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. cloe_nessy/clients/api_client/__init__.py +10 -1
  2. cloe_nessy/clients/api_client/api_client.py +19 -8
  3. cloe_nessy/clients/api_client/api_response.py +7 -4
  4. cloe_nessy/clients/api_client/pagination_config.py +84 -0
  5. cloe_nessy/clients/api_client/pagination_strategy.py +500 -0
  6. cloe_nessy/integration/delta_loader/delta_loader.py +1 -1
  7. cloe_nessy/integration/reader/__init__.py +2 -2
  8. cloe_nessy/integration/reader/api_reader.py +463 -72
  9. cloe_nessy/integration/reader/catalog_reader.py +49 -10
  10. cloe_nessy/integration/reader/excel_reader.py +3 -3
  11. cloe_nessy/integration/reader/file_reader.py +3 -1
  12. cloe_nessy/integration/reader/reader.py +1 -1
  13. cloe_nessy/integration/writer/catalog_writer.py +64 -2
  14. cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py +5 -1
  15. cloe_nessy/models/column.py +3 -2
  16. cloe_nessy/models/schema.py +1 -0
  17. cloe_nessy/models/templates/create_table.sql.j2 +22 -0
  18. cloe_nessy/object_manager/table_manager.py +29 -7
  19. cloe_nessy/pipeline/actions/__init__.py +1 -1
  20. cloe_nessy/pipeline/actions/read_api.py +272 -75
  21. cloe_nessy/pipeline/actions/read_catalog_table.py +73 -10
  22. cloe_nessy/pipeline/actions/read_excel.py +1 -1
  23. cloe_nessy/pipeline/actions/read_metadata_yaml.py +61 -33
  24. cloe_nessy/pipeline/actions/transform_decode.py +2 -1
  25. cloe_nessy/pipeline/actions/transform_join.py +98 -24
  26. cloe_nessy/pipeline/actions/transform_union.py +2 -2
  27. cloe_nessy/pipeline/actions/write_catalog_table.py +66 -21
  28. cloe_nessy/pipeline/actions/write_delta_merge.py +1 -0
  29. cloe_nessy/pipeline/pipeline_config.py +2 -0
  30. cloe_nessy/pipeline/pipeline_context.py +1 -1
  31. cloe_nessy/pipeline/pipeline_parsing_service.py +104 -39
  32. cloe_nessy/pipeline/pipeline_step.py +2 -0
  33. cloe_nessy/session/__init__.py +2 -1
  34. cloe_nessy/session/pyspark_compat.py +15 -0
  35. cloe_nessy/session/session_manager.py +1 -1
  36. {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/METADATA +19 -19
  37. {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/RECORD +38 -36
  38. {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/WHEEL +1 -2
  39. cloe_nessy-0.3.17.0.dist-info/top_level.txt +0 -1
@@ -1,3 +1,12 @@
1
+ from enum import Enum
2
+
1
3
  from .api_client import APIClient
4
+ from .api_response import APIResponse
5
+ from .pagination_config import PaginationConfig, PaginationConfigData
6
+ from .pagination_strategy import PaginationStrategy
7
+
8
+ pagination_strategies = {cls.name: cls for cls in PaginationStrategy.__subclasses__()}
9
+ PaginationStrategyType = Enum("PaginationStrategyType", pagination_strategies) # type: ignore[misc]
10
+
2
11
 
3
- __all__ = ["APIClient"]
12
+ __all__ = ["APIClient", "APIResponse", "PaginationStrategyType", "PaginationConfig", "PaginationConfigData"]
@@ -4,6 +4,7 @@ from typing import Any
4
4
  from urllib.parse import urljoin
5
5
 
6
6
  import requests
7
+ from requests.adapters import HTTPAdapter
7
8
  from requests.auth import AuthBase
8
9
 
9
10
  from .api_response import APIResponse
@@ -28,13 +29,14 @@ class APIClient:
28
29
  HTTPStatus.GATEWAY_TIMEOUT,
29
30
  ]
30
31
 
31
- MAX_SLEEP_TIME: int = 180 # seconds
32
+ MAX_SLEEP_TIME: int = 1800 # seconds
32
33
 
33
34
  def __init__(
34
35
  self,
35
36
  base_url: str,
36
37
  auth: AuthBase | None = None,
37
- default_headers: dict[str, str] | None = None,
38
+ default_headers: dict[str, Any] | None = None,
39
+ pool_maxsize: int = 10,
38
40
  ):
39
41
  """Initializes the APIClient object.
40
42
 
@@ -42,11 +44,15 @@ class APIClient:
42
44
  base_url: The base URL for the API.
43
45
  auth: The authentication method for the API.
44
46
  default_headers: Default headers to include in requests.
47
+ pool_maxsize: The maximum pool size for the HTTPAdapter (maximum number of connections to save in the pool).
45
48
  """
46
49
  if not base_url.endswith("/"):
47
50
  base_url += "/"
48
51
  self.base_url = base_url
49
52
  self.session = requests.Session()
53
+ self.pool_maxsize = pool_maxsize
54
+ adapter = HTTPAdapter(pool_maxsize=pool_maxsize)
55
+ self.session.mount("https://", adapter)
50
56
  if default_headers:
51
57
  self.session.headers.update(default_headers)
52
58
  self.session.auth = auth
@@ -56,11 +62,13 @@ class APIClient:
56
62
  method: str,
57
63
  endpoint: str,
58
64
  timeout: int = 30,
59
- params: dict[str, str] | None = None,
60
- data: dict[str, str] | None = None,
61
- json: dict[str, str] | None = None,
62
- headers: dict[str, str] | None = None,
65
+ params: dict[str, Any] | None = None,
66
+ data: dict[str, Any] | None = None,
67
+ json: dict[str, Any] | None = None,
68
+ headers: dict[str, Any] | None = None,
63
69
  max_retries: int = 0,
70
+ backoff_factor: int = 1,
71
+ raise_for_status: bool = True,
64
72
  ) -> APIResponse:
65
73
  """Makes a request to the API endpoint.
66
74
 
@@ -73,6 +81,8 @@ class APIClient:
73
81
  json: The JSON data to include in the request.
74
82
  headers: The headers to include in the request.
75
83
  max_retries: The maximum number of retries for the request.
84
+ backoff_factor: Factor for exponential backoff between retries.
85
+ raise_for_status: Raise HTTPError, if one occurred.
76
86
 
77
87
  Returns:
78
88
  APIResponse: The response from the API.
@@ -98,13 +108,14 @@ class APIClient:
98
108
  headers=headers,
99
109
  )
100
110
  if response.status_code not in APIClient.RETRY_CODES:
101
- response.raise_for_status()
111
+ if raise_for_status:
112
+ response.raise_for_status()
102
113
  return APIResponse(response)
103
114
  except requests.exceptions.HTTPError as err:
104
115
  raise APIClientHTTPError(f"HTTP error occurred: {err}") from err
105
116
  except requests.exceptions.ConnectionError as err:
106
117
  if attempt < max_retries:
107
- sleep_time = min(2**attempt, APIClient.MAX_SLEEP_TIME)
118
+ sleep_time = min(backoff_factor * (2**attempt), APIClient.MAX_SLEEP_TIME)
108
119
  sleep(sleep_time)
109
120
  continue
110
121
  raise APIClientConnectionError(f"Connection error occurred: {err}") from err
@@ -56,17 +56,20 @@ class APIResponse:
56
56
  dict_response = {"value": self.response.text}
57
57
 
58
58
  if key:
59
- dict_response = dict_response[key]
59
+ dict_response = {"value": dict_response[key]}
60
60
  except KeyError as err:
61
61
  raise KeyError(
62
- f"The key '{err.args[0]}' was not found in the response. Status code: {self.status_code}, Headers: {self.headers}, Response: {dict_response}"
62
+ f"The key '{err.args[0]}' was not found in the response. Status code: {self.status_code}, "
63
+ f"Headers: {self.headers}, Response: {dict_response}"
63
64
  ) from err
64
65
  except ValueError as err:
65
66
  raise ValueError(
66
- f"Error parsing JSON response: {err}. Status code: {self.status_code}, Headers: {self.headers}, Response content: {self.response.text}"
67
+ f"Error parsing JSON response: {err}. Status code: {self.status_code}, Headers: {self.headers}, "
68
+ f"Response content: {self.response.text}"
67
69
  ) from err
68
70
  except Exception as err:
69
71
  raise APIClientError(
70
- f"An unexpected error occurred: {err}. Status code: {self.status_code}, Headers: {self.headers}, Response content: {self.response.text}"
72
+ f"An unexpected error occurred: {err}. Status code: {self.status_code}, Headers: {self.headers}, "
73
+ f"Response content: {self.response.text}"
71
74
  ) from err
72
75
  return dict_response
@@ -0,0 +1,84 @@
1
+ from typing import Self
2
+
3
+ from pydantic import BaseModel, Field, field_validator, model_validator
4
+ from typing_extensions import TypedDict
5
+
6
+
7
+ class PaginationStrategyConfigData(TypedDict, total=False):
8
+ """Shared config across all strategies."""
9
+
10
+ check_field: str | None # e.g. "results" or "data.items"
11
+ next_page_field: str | None # e.g. "info.next_page"
12
+ max_page: int # hard cap (reader also enforces)
13
+ pages_per_array_limit: int # chunking behavior for output arrays
14
+ preliminary_probe: bool # enable probe_max_page pre-scan
15
+
16
+
17
+ class LimitOffsetPaginationConfigData(PaginationStrategyConfigData, total=False):
18
+ """Config for limit-offset pagination."""
19
+
20
+ limit_field: str # e.g. "limit" or "page_size"
21
+ offset_field: str # e.g. "offset" or "cursor"
22
+
23
+
24
+ class PageBasedPaginationConfigData(PaginationStrategyConfigData, total=False):
25
+ """Config for page-based pagination."""
26
+
27
+ page_field: str # e.g. "page"
28
+
29
+
30
+ class PaginationConfigData(TypedDict, total=False):
31
+ """Top-level config (what your Pydantic model or dict can accept)."""
32
+
33
+ strategy: str # "limit_offset" | "page_based" | ...
34
+ # strategy-specific fields:
35
+ limit_field: str
36
+ offset_field: str
37
+ page_field: str
38
+ # shared/advanced fields:
39
+ check_field: str | None
40
+ next_page_field: str | None
41
+ max_page: int
42
+ pages_per_array_limit: int
43
+ preliminary_probe: bool
44
+
45
+
46
+ class PaginationConfig(BaseModel):
47
+ """Configuration model for pagination options."""
48
+
49
+ strategy: str = Field(..., description="Pagination strategy (limit_offset, page_based, cursor_based, etc.)")
50
+ check_field: str | None = Field(None, description="Field to check for emptiness of response.")
51
+ next_page_field: str | None = Field(None, description="Field that indicates there is a next page.")
52
+ limit_field: str | None = Field(
53
+ None, description="Name of the limit parameter field for items per page or request."
54
+ )
55
+ offset_field: str | None = Field(
56
+ None, description="Name of the offset parameter field for items per page or request."
57
+ )
58
+ page_field: str | None = Field(None, description="Name of the page parameter field.")
59
+ max_page: int = Field(-1, description="Amount of pages to fetch. If not set, will fetch all available data.")
60
+ pages_per_array_limit: int = Field(-1, description="Maximum number of pages per array.")
61
+ preliminary_probe: bool = Field(
62
+ False, description="Whether to perform a preliminary probe to determine the total number of pages."
63
+ )
64
+
65
+ @field_validator("strategy", mode="before")
66
+ @classmethod
67
+ def _validate_strategy(cls, v: str) -> str:
68
+ """Validates the pagination strategy."""
69
+ supported_strategies = ["limit_offset", "page_based", "cursor_based", "time_based"]
70
+ if v not in supported_strategies:
71
+ if v in ["cursor_based", "time_based"]:
72
+ raise NotImplementedError("cursor_based and time_based are not yet supported.")
73
+ supported_str = ", ".join(supported_strategies)
74
+ raise ValueError(f"Unsupported pagination strategy: {v}. Supported strategies: {supported_str}")
75
+ return v
76
+
77
+ @model_validator(mode="after")
78
+ def _validate_strategy_config(self) -> Self:
79
+ """Validates the configuration of the pagination strategy."""
80
+ if self.strategy == "limit_offset" and any(field is None for field in [self.limit_field, self.offset_field]):
81
+ raise ValueError(f"Both <limit_field> and <offset_field> must be set for strategy '{self.strategy}'")
82
+ if self.strategy == "page_based" and self.page_field is None:
83
+ raise ValueError(f"<page_field> must be set for strategy '{self.strategy}'")
84
+ return self