cloe-nessy 0.3.17.0__py3-none-any.whl → 0.3.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. cloe_nessy/clients/api_client/__init__.py +10 -1
  2. cloe_nessy/clients/api_client/api_client.py +19 -8
  3. cloe_nessy/clients/api_client/api_response.py +7 -4
  4. cloe_nessy/clients/api_client/pagination_config.py +84 -0
  5. cloe_nessy/clients/api_client/pagination_strategy.py +500 -0
  6. cloe_nessy/integration/delta_loader/delta_loader.py +1 -1
  7. cloe_nessy/integration/reader/__init__.py +2 -2
  8. cloe_nessy/integration/reader/api_reader.py +463 -72
  9. cloe_nessy/integration/reader/catalog_reader.py +49 -10
  10. cloe_nessy/integration/reader/excel_reader.py +3 -3
  11. cloe_nessy/integration/reader/file_reader.py +3 -1
  12. cloe_nessy/integration/reader/reader.py +1 -1
  13. cloe_nessy/integration/writer/catalog_writer.py +64 -2
  14. cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py +5 -1
  15. cloe_nessy/models/column.py +3 -2
  16. cloe_nessy/models/schema.py +1 -0
  17. cloe_nessy/models/templates/create_table.sql.j2 +22 -0
  18. cloe_nessy/object_manager/table_manager.py +29 -7
  19. cloe_nessy/pipeline/actions/__init__.py +1 -1
  20. cloe_nessy/pipeline/actions/read_api.py +272 -75
  21. cloe_nessy/pipeline/actions/read_catalog_table.py +73 -10
  22. cloe_nessy/pipeline/actions/read_excel.py +1 -1
  23. cloe_nessy/pipeline/actions/read_metadata_yaml.py +61 -33
  24. cloe_nessy/pipeline/actions/transform_decode.py +2 -1
  25. cloe_nessy/pipeline/actions/transform_join.py +98 -24
  26. cloe_nessy/pipeline/actions/transform_union.py +2 -2
  27. cloe_nessy/pipeline/actions/write_catalog_table.py +66 -21
  28. cloe_nessy/pipeline/actions/write_delta_merge.py +1 -0
  29. cloe_nessy/pipeline/pipeline_config.py +2 -0
  30. cloe_nessy/pipeline/pipeline_context.py +1 -1
  31. cloe_nessy/pipeline/pipeline_parsing_service.py +104 -39
  32. cloe_nessy/pipeline/pipeline_step.py +2 -0
  33. cloe_nessy/session/__init__.py +2 -1
  34. cloe_nessy/session/pyspark_compat.py +15 -0
  35. cloe_nessy/session/session_manager.py +1 -1
  36. {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/METADATA +19 -19
  37. {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/RECORD +38 -36
  38. {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/WHEEL +1 -2
  39. cloe_nessy-0.3.17.0.dist-info/top_level.txt +0 -1
@@ -0,0 +1,500 @@
1
+ from abc import ABC, abstractmethod
2
+ from collections.abc import Callable
3
+ from typing import Any
4
+
5
+ from .api_response import APIResponse
6
+ from .pagination_config import (
7
+ LimitOffsetPaginationConfigData,
8
+ PageBasedPaginationConfigData,
9
+ PaginationStrategyConfigData,
10
+ )
11
+
12
+
13
class PaginationStrategy(ABC):
    """Base class for pagination strategies.

    Holds the pagination configuration and offers shared helpers that inspect an
    API response to decide whether the current page has data and whether more
    pages follow. Concrete strategies supply parameter advancement and page probing.
    """

    name = ""

    def __init__(self, config: PaginationStrategyConfigData):
        """Keep a reference to the pagination configuration used by the helpers."""
        self._config: PaginationStrategyConfigData = config

    @staticmethod
    def _resolve_path(data: Any, path: str | None) -> Any:
        """Follow a dotted path (e.g. 'info.next_page') through nested dicts.

        An empty or ``None`` path yields ``data`` unchanged. ``None`` is returned as
        soon as a segment cannot be followed, i.e. the current node is not a dict
        or does not contain the segment as a key.
        """
        if not path:
            return data
        node = data
        for segment in path.split("."):
            # Guard clause: bail out on the first segment that cannot be resolved.
            if not isinstance(node, dict) or segment not in node:
                return None
            node = node[segment]
        return node

    def has_any_data(self, response: APIResponse) -> bool:
        """Tell whether the current page carries data.

        With 'check_field' configured, the truthiness of the value found at that
        dotted path decides; without it, the truthiness of the whole payload does.
        """
        payload = response.to_dict()
        field_path = self._config.get("check_field")
        subject = self._resolve_path(payload, field_path) if field_path else payload
        return bool(subject)

    def has_more_pages(self, response: APIResponse) -> bool | None:
        """Evaluate the explicit next-page pointer.

        Returns ``None`` when 'next_page_field' is not configured; otherwise the
        truthiness of the value found at that dotted path in the response payload.
        """
        pointer_path = self._config.get("next_page_field")
        if pointer_path:
            return bool(self._resolve_path(response.to_dict(), pointer_path))
        return None

    def has_more_data(self, response: APIResponse) -> bool:
        """Tell whether another request should be made.

        The explicit next-page pointer wins when it is configured; otherwise the
        presence of data on the current page is used as the signal.
        """
        explicit = self.has_more_pages(response)
        return self.has_any_data(response) if explicit is None else explicit

    @abstractmethod
    def get_next_params(self, current_params: Any) -> Any:
        """Produce the request parameters for the page following ``current_params``."""

    @abstractmethod
    def probe_max_page(
        self,
        endpoint: str,
        params: dict[str, Any],
        headers: dict[str, Any] | None,
        data: dict[str, Any] | None,
        json_body: dict[str, Any] | None,
        make_request: Callable[
            [str, dict[str, Any], dict[str, Any] | None, dict[str, Any] | None, dict[str, Any] | None], APIResponse
        ],
    ) -> list[dict[str, Any]]:
        """Discover all available pages and return one parameter map per page."""
87
+
88
+
89
class LimitOffsetStrategy(PaginationStrategy):
    """Implementation of the limit-offset pagination strategy.

    Pages are addressed by an offset/limit pair whose request-parameter names are
    taken from the configuration keys 'offset_field' and 'limit_field'.

    Page discovery (``probe_max_page``) probes exponentially growing offsets until a
    page without data is seen (or an explicit "no more pages" signal is received),
    then binary-searches for the last offset that still returns data. All probed
    offsets lie on the grid ``initial_offset + k * limit`` - the same grid on which
    the resulting page list is built - so a starting offset that is not a multiple
    of the limit does not cause the final page to be dropped.
    """

    name = "limit_offset"

    def __init__(self, config: LimitOffsetPaginationConfigData):
        """Initialize the limit/offset strategy with its configuration."""
        super().__init__(config)
        self._config: LimitOffsetPaginationConfigData = config

    def get_next_params(self, current_params: dict[str, Any]) -> dict[str, Any]:
        """Return parameters for the next page by advancing 'offset' by 'limit'.

        The passed dict is updated in place and returned (callers may rely on
        either the mutation or the return value).

        Raises:
            KeyError: if the limit or offset parameter is missing from ``current_params``.
            ValueError: if either value cannot be converted to ``int``.
        """
        limit_field = self._config["limit_field"]
        offset_field = self._config["offset_field"]

        limit = int(current_params[limit_field])
        offset = int(current_params[offset_field])

        current_params[offset_field] = offset + limit
        return current_params

    def _aligned_double(self, current_offset: int, limit_val: int) -> int:
        """Return a next offset that roughly doubles progress while remaining a multiple of 'limit'.

        An offset of 0 advances to ``limit_val``; any other offset advances to
        ``(current_offset // limit_val + 1) * limit_val * 2``.
        """
        if current_offset == 0:
            return limit_val
        return ((current_offset // limit_val) + 1) * limit_val * 2

    def _request(
        self,
        endpoint: str,
        params: dict[str, Any],
        headers: dict[str, Any] | None,
        data: dict[str, Any] | None,
        json_body: dict[str, Any] | None,
        make_request: Callable[
            [str, dict[str, Any], dict[str, Any] | None, dict[str, Any] | None, dict[str, Any] | None], APIResponse
        ],
    ) -> APIResponse:
        """Invoke the provided request callable with the given arguments."""
        return make_request(endpoint, params, headers, data, json_body)

    def _expansion_phase(
        self,
        endpoint: str,
        base_params: dict[str, Any],
        headers: dict[str, Any] | None,
        data: dict[str, Any] | None,
        json_body: dict[str, Any] | None,
        make_request: Callable[
            [str, dict[str, Any], dict[str, Any] | None, dict[str, Any] | None, dict[str, Any] | None], APIResponse
        ],
        initial_offset: int,
        limit_val: int,
        offset_field: str,
        limit_field: str,
        max_steps: int = 64,
    ) -> tuple[bool, int | None, int, bool]:
        """Perform exponential probing to locate a valid range of offsets.

        Probing starts at ``initial_offset`` and continues on the grid
        ``initial_offset + k * limit_val`` with roughly doubling ``k`` until a page
        without data is found, an explicit 'no more pages' signal is received, or
        ``max_steps`` requests have been made.

        Returns:
            seen_valid: whether any page contained data
            low_offset: last offset known to be valid (None if none)
            current_offset: offset at which probing ended
            broke_on_no_more: True if an explicit 'no more pages' signal ended probing
        """
        seen_valid = False
        low_offset: int | None = None
        current_offset = initial_offset
        broke_on_no_more = False

        for _ in range(max_steps):
            new_params = base_params.copy()
            new_params[offset_field] = current_offset
            new_params[limit_field] = limit_val

            resp = self._request(endpoint, new_params, headers, data, json_body, make_request)
            valid_now = self.has_any_data(resp)
            has_next = self.has_more_pages(resp)

            if not valid_now:
                break

            seen_valid = True
            low_offset = current_offset

            # `has_next` is None when no next-page pointer is configured; only an
            # explicit False ends probing early.
            if has_next is False:
                broke_on_no_more = True
                break

            # Grow the distance from the starting offset (not the absolute offset) so
            # every probe stays on the page grid anchored at `initial_offset`.
            progressed = current_offset - initial_offset
            current_offset = initial_offset + self._aligned_double(progressed, limit_val)

        return seen_valid, low_offset, current_offset, broke_on_no_more

    def _binary_search_last_valid_offset(
        self,
        endpoint: str,
        base_params: dict[str, Any],
        headers: dict[str, Any] | None,
        data: dict[str, Any] | None,
        json_body: dict[str, Any] | None,
        make_request: Callable[
            [str, dict[str, Any], dict[str, Any] | None, dict[str, Any] | None, dict[str, Any] | None], APIResponse
        ],
        low_valid_offset: int,
        high_invalid_offset: int,
        limit_val: int,
        offset_field: str,
        limit_field: str,
        max_steps: int = 64,
    ) -> int:
        """Binary search for the last valid offset between a known-valid low bound and an invalid/safety high bound.

        Every probed offset is ``low_valid_offset`` plus a whole number of pages, so
        the search stays on the same page grid as the low bound. At most
        ``max_steps`` requests are made.

        Returns:
            The highest probed offset known to return data (``low_valid_offset`` if
            no higher offset was confirmed).
        """
        low = low_valid_offset
        high = max(high_invalid_offset, low + limit_val)

        for _ in range(max_steps):
            if low + limit_val >= high:
                break

            # Midpoint measured in whole pages from `low`; for bounds that are both
            # multiples of `limit_val` this equals the floor-aligned absolute midpoint.
            mid = low + ((high - low) // 2 // limit_val) * limit_val
            if mid <= low:
                mid = low + limit_val
            if mid >= high:
                break

            p = base_params.copy()
            p[offset_field] = mid
            p[limit_field] = limit_val
            resp = self._request(endpoint, p, headers, data, json_body, make_request)

            if self.has_any_data(resp):
                low = mid
            else:
                high = mid

        return low

    def _build_offset_pages(
        self,
        base_params: dict[str, Any],
        initial_offset: int,
        last_valid_offset: int,
        limit_val: int,
        offset_field: str,
        limit_field: str,
    ) -> list[dict[str, Any]]:
        """Build and return the list of parameter maps for all offsets from the initial offset to the last valid offset.

        Each entry is a copy of ``base_params`` with the offset and limit parameters
        set. An empty list is returned when ``last_valid_offset`` lies before
        ``initial_offset``.
        """
        if last_valid_offset < initial_offset:
            return []
        page_count = ((last_valid_offset - initial_offset) // limit_val) + 1
        if page_count <= 0:
            return []

        return [
            {**base_params, offset_field: initial_offset + i * limit_val, limit_field: limit_val}
            for i in range(page_count)
        ]

    def probe_max_page(
        self,
        endpoint: str,
        params: dict[str, Any],
        headers: dict[str, Any] | None,
        data: dict[str, Any] | None,
        json_body: dict[str, Any] | None,
        make_request: Callable[
            [str, dict[str, Any], dict[str, Any] | None, dict[str, Any] | None, dict[str, Any] | None], APIResponse
        ],
    ) -> list[dict[str, Any]]:
        """Find the maximum page available.

        Works with either:
        - a data list field (via config['check_field']), or
        - an explicit next-page pointer (via config['next_page_field']).

        If both are present, a page is considered valid if it has data, and probing
        stops after the last valid page when 'next_page_field' indicates no more.

        Returns:
            One parameter map per page, from the starting offset to the last offset
            that returned data; an empty list if the first page has no data.

        Raises:
            KeyError: if the limit parameter is missing from ``params``.
            ValueError: if the limit or offset value cannot be converted to ``int``.
        """
        offset_field = self._config["offset_field"]
        limit_field = self._config["limit_field"]

        # A missing offset defaults to 0 and negative offsets are clamped to 0.
        initial_offset = max(0, int(params.get(offset_field, 0)))
        limit_val = int(params[limit_field])

        # A non-positive limit cannot advance the offset (and would divide by zero
        # below), so only the single starting request is returned.
        if limit_val <= 0:
            p = params.copy()
            p[offset_field] = initial_offset
            p[limit_field] = limit_val
            return [p]

        seen_valid, low_offset, current_offset, broke_on_no_more = self._expansion_phase(
            endpoint,
            params,
            headers,
            data,
            json_body,
            make_request,
            initial_offset,
            limit_val,
            offset_field,
            limit_field,
        )

        if not seen_valid or low_offset is None:
            return []

        if broke_on_no_more:
            # The API said this was the last page: collapse the search window so the
            # binary search returns `low_offset` without further requests.
            high_offset = low_offset + limit_val
        else:
            high_offset = max(current_offset, low_offset + limit_val)

        last_valid_offset = self._binary_search_last_valid_offset(
            endpoint,
            params,
            headers,
            data,
            json_body,
            make_request,
            low_offset,
            high_offset,
            limit_val,
            offset_field,
            limit_field,
        )

        return self._build_offset_pages(params, initial_offset, last_valid_offset, limit_val, offset_field, limit_field)
317
+
318
+
319
class PageBasedStrategy(PaginationStrategy):
    """Pagination strategy for APIs that address pages by a page number.

    The request-parameter name of the page number comes from the configuration
    key 'page_field'.
    """

    name = "page_based"

    def __init__(self, config: PageBasedPaginationConfigData):
        """Store the page-based pagination configuration."""
        super().__init__(config)
        self._config: PageBasedPaginationConfigData = config

    def get_next_params(self, current_params: dict[str, Any]) -> dict[str, Any]:
        """Advance the page number in ``current_params`` by one and return the same dict."""
        field = self._config["page_field"]
        current_params[field] = int(current_params[field]) + 1
        return current_params

    def _page_request(
        self,
        endpoint: str,
        base_params: dict[str, Any],
        headers: dict[str, Any] | None,
        data: dict[str, Any] | None,
        json_body: dict[str, Any] | None,
        make_request: Callable[
            [str, dict[str, Any], dict[str, Any] | None, dict[str, Any] | None, dict[str, Any] | None], APIResponse
        ],
        page_field: str,
        page_value: int,
    ) -> APIResponse:
        """Request one specific page, using a copy of ``base_params`` with the page number set."""
        request_params = base_params.copy()
        request_params[page_field] = page_value
        return make_request(endpoint, request_params, headers, data, json_body)

    def _expansion_phase(
        self,
        endpoint: str,
        base_params: dict[str, Any],
        headers: dict[str, Any] | None,
        data: dict[str, Any] | None,
        json_body: dict[str, Any] | None,
        make_request: Callable[
            [str, dict[str, Any], dict[str, Any] | None, dict[str, Any] | None, dict[str, Any] | None], APIResponse
        ],
        start_page: int,
        page_field: str,
        max_steps: int = 64,
    ) -> tuple[int, bool, int]:
        """Probe doubling page numbers until a page without data or an explicit end signal is hit.

        At most ``max_steps`` requests are made.

        Returns:
            last_valid: last page number known to be valid (0 if none)
            broke_on_no_more: True if an explicit 'no more pages' signal ended probing
            current: page number at which probing ended
        """
        newest_good = 0
        stopped_by_pointer = False
        page = start_page

        for _step in range(max_steps):
            reply = self._page_request(
                endpoint, base_params, headers, data, json_body, make_request, page_field, page
            )
            page_has_data = self.has_any_data(reply)
            pointer = self.has_more_pages(reply)

            if not page_has_data:
                break

            newest_good = page

            # `pointer` is None when no next-page field is configured; only an
            # explicit False terminates probing here.
            if pointer is False:
                stopped_by_pointer = True
                break

            page = 1 if page <= 0 else page * 2

        return newest_good, stopped_by_pointer, page

    def _binary_search_last_valid_page(
        self,
        endpoint: str,
        base_params: dict[str, Any],
        headers: dict[str, Any] | None,
        data: dict[str, Any] | None,
        json_body: dict[str, Any] | None,
        make_request: Callable[
            [str, dict[str, Any], dict[str, Any] | None, dict[str, Any] | None, dict[str, Any] | None], APIResponse
        ],
        low_valid: int,
        high_invalid_or_safety: int,
        page_field: str,
        max_steps: int = 64,
    ) -> int:
        """Bisect between a page known to have data and an upper bound to find the last page with data.

        If the step budget runs out before the bounds meet, one extra request checks
        the page directly after the best candidate.
        """
        lower = low_valid
        upper = max(high_invalid_or_safety, lower + 1)

        for _step in range(max_steps):
            if upper - lower <= 1:
                break

            middle = (lower + upper) // 2
            reply = self._page_request(
                endpoint, base_params, headers, data, json_body, make_request, page_field, middle
            )

            if self.has_any_data(reply):
                lower = middle
            else:
                upper = middle

        # Only reachable with a remaining gap when the loop exhausted `max_steps`.
        if lower + 1 < upper:
            reply = self._page_request(
                endpoint, base_params, headers, data, json_body, make_request, page_field, lower + 1
            )
            if self.has_any_data(reply):
                return lower + 1

        return lower

    def _build_page_list(
        self,
        base_params: dict[str, Any],
        start_page: int,
        max_page: int,
        page_field: str,
    ) -> list[dict[str, Any]]:
        """Return one copy of ``base_params`` per page number from start_page to max_page inclusive.

        The result is empty when ``max_page`` is smaller than ``start_page``.
        """
        return [{**base_params, page_field: number} for number in range(start_page, max_page + 1)]

    def probe_max_page(
        self,
        endpoint: str,
        params: dict[str, Any],
        headers: dict[str, Any] | None,
        data: dict[str, Any] | None,
        json_body: dict[str, Any] | None,
        make_request: Callable[
            [str, dict[str, Any], dict[str, Any] | None, dict[str, Any] | None, dict[str, Any] | None], APIResponse
        ],
    ) -> list[dict[str, Any]]:
        """Discover the highest available page and return parameter maps for every page up to it.

        Both 'check_field' (data present) and 'next_page_field' (explicit next pointer)
        are honored. When probing ends because the next pointer is False, the last
        probed page is taken as the maximum and no further requests are made.
        """
        page_field = self._config["page_field"]

        # A missing page number defaults to 1 and values below 1 are raised to 1.
        first_page = max(1, int(params.get(page_field, 1)))

        highest, stopped_by_pointer, probe_end = self._expansion_phase(
            endpoint, params, headers, data, json_body, make_request, first_page, page_field
        )

        # 0 is the "no valid page seen" sentinel of the expansion phase.
        if highest == 0:
            return []

        if not stopped_by_pointer:
            highest = self._binary_search_last_valid_page(
                endpoint,
                params,
                headers,
                data,
                json_body,
                make_request,
                low_valid=highest,
                high_invalid_or_safety=max(probe_end, highest + 1),
                page_field=page_field,
            )

        return self._build_page_list(params, first_page, highest, page_field)
@@ -102,7 +102,7 @@ class DeltaLoader(ABC, LoggerMixin):
102
102
  ),
103
103
  )
104
104
  catalog_writer = CatalogWriter()
105
- catalog_writer.write_table(
105
+ catalog_writer.write(
106
106
  df=metadata_df,
107
107
  table_identifier=self.metadata_table_identifier,
108
108
  mode="append",
@@ -1,6 +1,6 @@
1
- from .api_reader import APIReader
1
+ from .api_reader import APIReader, RequestSet
2
2
  from .catalog_reader import CatalogReader
3
3
  from .excel_reader import ExcelDataFrameReader
4
4
  from .file_reader import FileReader
5
5
 
6
- __all__ = ["APIReader", "CatalogReader", "FileReader", "ExcelDataFrameReader"]
6
+ __all__ = ["APIReader", "CatalogReader", "FileReader", "ExcelDataFrameReader", "RequestSet"]