cloe-nessy 0.3.18__py3-none-any.whl → 0.3.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -33,7 +33,7 @@ pipeline_actions = {cls.name: cls for cls in PipelineAction.__subclasses__()}
  # Register all subclasses dynamically as enum using their "name" attribute as
  # key. We need to do this here, because otherwise we don't get all subclasses
  # from a relative import of PipelineAction
- PipelineActionType = Enum("PipelineActionType", pipeline_actions) # type: ignore
+ PipelineActionType = Enum("PipelineActionType", pipeline_actions) # type: ignore[misc]

  __all__ = [
  "ReadAPIAction",
@@ -1,10 +1,12 @@
  from collections.abc import Mapping
  from typing import Any, cast

+ from pydantic import ConfigDict, validate_call
  from requests.auth import AuthBase, HTTPBasicAuth

+ from ...clients.api_client import PaginationConfig, PaginationConfigData
  from ...clients.api_client.auth import AzureCredentialAuth, ChainedAuth, EnvVariableAuth, SecretScopeAuth
- from ...integration.reader import APIReader
+ from ...integration.reader import APIReader, RequestSet
  from ..pipeline_action import PipelineAction
  from ..pipeline_context import PipelineContext

@@ -12,11 +14,7 @@ from ..pipeline_context import PipelineContext
  def process_auth(
  auth: Mapping[str, str | Mapping[str, str] | list[Mapping[str, str]]] | AuthBase | None,
  ) -> AuthBase | None:
- """Processes the auth parameter to create an AuthBase object.
-
- Args:
- auth: The auth parameter to be processed.
- """
+ """Processes the auth parameter to create an AuthBase object."""
  result: AuthBase | None = None

  if isinstance(auth, list):
@@ -27,11 +25,9 @@ def process_auth(
  case "basic":
  result = HTTPBasicAuth(auth["username"], auth["password"])
  case "secret_scope":
- secret_scope_header_template: dict[str, str] = auth["header_template"]
- result = SecretScopeAuth(secret_scope_header_template, auth["secret_scope"])
+ result = SecretScopeAuth(auth["header_template"], auth["secret_scope"])
  case "env":
- env_header_template: dict[str, str] = auth["header_template"]
- result = EnvVariableAuth(env_header_template)
+ result = EnvVariableAuth(auth["header_template"])
  case "azure_oauth":
  result = AzureCredentialAuth(
  scope=auth["scope"],
@@ -40,9 +36,12 @@
  tenant_id=auth["tenant_id"],
  )
  case _:
- raise ValueError("Invalid auth type specified. Supported types are: basic, secret_scope, env")
+ raise ValueError(
+ "Invalid auth type specified. Supported types are: basic, secret_scope, env, azure_oauth"
+ )
  else:
- result = cast(AuthBase, auth)
+ if isinstance(auth, AuthBase):
+ result = auth # Assume it's already an AuthBase instance

  return result
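To make the reworked control flow concrete, here is a small usage sketch of `process_auth` after this change. It is illustrative only: the import path is assumed, and the handling of a list of mappings (folding the entries into a `ChainedAuth`) is inferred from the new imports and the "can be chained" documentation further down, since that branch lies outside these hunks.

```python
from requests.auth import AuthBase, HTTPBasicAuth

# Assumed import path for illustration; the diff does not name the module.
from cloe_nessy.pipeline.actions.read_api import process_auth

# A list of mappings mirrors the YAML `auth:` blocks documented below; each entry
# selects one flavour via its "type" key, and the results are presumably combined
# into a ChainedAuth that is applied to every request.
chained = process_auth(
    [
        {"type": "basic", "username": "svc_user", "password": "s3cret"},
        {"type": "env", "header_template": {"X-API-Key": "MY_ENV_VAR"}},
    ]
)

# An object that is already an AuthBase is now passed through unchanged instead
# of being blindly cast; other unexpected values fall back to None.
already_auth: AuthBase = HTTPBasicAuth("svc_user", "s3cret")
assert process_auth(already_auth) is already_auth
```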
@@ -50,9 +49,10 @@ def process_auth(
  class ReadAPIAction(PipelineAction):
  """Reads data from an API and loads it into a Spark DataFrame.

- This method uses the provided API parameters to make a request using the
- [`APIReader`][cloe_nessy.integration.reader.api_reader] and return a
- DataFrame containing the response data.
+ This action executes HTTP requests (optionally paginated) in parallel using the
+ [`APIReader`][cloe_nessy.integration.reader.api_reader] and returns a DataFrame
+ containing the response payloads plus request/response metadata. No intermediate
+ files are written.

  Example:
  === "Basic Usage"
@@ -63,6 +63,7 @@ class ReadAPIAction(PipelineAction):
  base_url: https://some_url.com/api/
  endpoint: my/endpoint/
  ```
+
  === "Usage with Parameters and Headers"
  ```yaml
  Read API:
@@ -73,56 +74,211 @@ class ReadAPIAction(PipelineAction):
  method: GET
  timeout: 90
  headers:
- key1: value1
- key2: value2
+ Accept: application/json
+ X-Request: foo
  params:
- key1: value1
- key2: value2
+ q: widget
+ include: details
  ```
- === "Usage with Authentication"
+
+ === "Usage with Authentication (can be chained)"
  ```yaml
  Read API:
  action: READ_API
  options:
  base_url: https://some_url.com/api/
  endpoint: my/endpoint/
- method: GET
- timeout: 90
  auth:
  - type: basic
  username: my_username
  password: my_password
- - type: secret_scope
- secret_scope: my_secret_scope
+ - type: env
  header_template:
- "header_key_1": "<ENVIRONMENT_VARIABLE_NAME>"
+ "X-API-Key": "<ENV_VAR_NAME>"
  - type: secret_scope
  secret_scope: my_secret_scope
  header_template:
- "header_key_2": "<SECRET_NAME>"
- - type: secret_scope
- secret_scope: my_other_secret_scope
- header_template:
- "header_key_3": "<SECRET_NAME>"
+ "X-ORG-Token": "<SECRET_NAME>"
  - type: azure_oauth
  client_id: my_client_id
  client_secret: my_client_secret
  tenant_id: my_tenant_id
  scope: <entra-id-client-id>
  ```
+ The above will combine credentials (via `ChainedAuth`) so that headers from `env`/`secret_scope`
+ are merged and auth flows like Basic / Azure OAuth are applied to each request.
+
+ === "Extracting a Nested Field (key)"
+ If the API returns a large JSON object but you only want a nested list (e.g. `data.items`):
+ ```yaml
+ Read API:
+ action: READ_API
+ options:
+ base_url: https://some_url.com/api/
+ endpoint: reports/
+ key: data.items
+ ```
+
+ === "Pagination (Supported: page_based, limit_offset)"
+ Only `page_based` and `limit_offset` strategies are currently supported. You may also
+ supply the shared/advanced options `check_field`, `next_page_field`, `max_page`,
+ `pages_per_array_limit`, and `preliminary_probe`.
+
+ **1) Page-Based Pagination**
+ ```yaml
+ Read API:
+ action: READ_API
+ options:
+ base_url: https://some_url.com/api/
+ endpoint: items/
+ params:
+ page: 1 # starting page (optional; defaults to 1)
+ per_page: 100
+ pagination:
+ strategy: page_based
+ page_field: page # required
+ # Shared/advanced (optional):
+ check_field: results # e.g. list to check for emptiness
+ next_page_field: info.has_next # boolean flag; if present it is trusted
+ max_page: -1 # -1 = all pages
+ pages_per_array_limit: 2 # chunk output rows every 2 pages
+ preliminary_probe: false # set true to pre-scan/build all page params
+ ```
+ This issues requests like:
+ ```
+ GET .../items/?page=1&per_page=100
+ GET .../items/?page=2&per_page=100
+ ...
+ ```
+
+ **2) Limit/Offset Pagination**
+ ```yaml
+ Read API:
+ action: READ_API
+ options:
+ base_url: https://some_url.com/api/
+ endpoint: products/
+ params:
+ limit: 50
+ offset: 0
+ pagination:
+ strategy: limit_offset
+ limit_field: limit # required
+ offset_field: offset # required
+ # Shared/advanced (optional):
+ check_field: data.items
+ next_page_field: page_info.has_next
+ max_page: -1
+ pages_per_array_limit: -1
+ preliminary_probe: false
+ ```
+ This issues requests like:
+ ```
+ GET .../products/?limit=50&offset=0
+ GET .../products/?limit=50&offset=50
+ GET .../products/?limit=50&offset=100
+ ...
+ ```
+
+ **Using `preliminary_probe` to pre-compute all pages**
+ If `preliminary_probe: true` is set, the reader will first probe the API to determine
+ the final page (using `check_field` and/or `next_page_field`) and then fan out one request
+ per page/offset—useful when driving fully parallel execution:
+ ```yaml
+ Read API:
+ action: READ_API
+ options:
+ base_url: https://api.example.com/
+ endpoint: orders/
+ params:
+ limit: 100
+ offset: 0
+ pagination:
+ strategy: limit_offset
+ limit_field: limit
+ offset_field: offset
+ check_field: data
+ preliminary_probe: true
+ max_concurrent_requests: 16
+ ```

- The above example will combine the headers from the different auth types. The resulting header will look like this:
+ === "Retries and Concurrency"
+ ```yaml
+ Read API:
+ action: READ_API
+ options:
+ base_url: https://some_url.com/api/
+ endpoint: heavy/endpoint/
+ max_retries: 3 # network/5xx retry count
+ backoff_factor: 2 # exponential backoff multiplier
+ max_concurrent_requests: 16
+ timeout: 60
+ ```

- ```json
- {
- "header_key_1": "value_from_environment_variable",
- "header_key_2": "value_from_secret",
- "header_key_3": "value_from_secret",
- "Authorization": "Bearer <access_token> (from azure_oauth)",
- "Authorization": "Basic am9obkBleGFtcGxlLmNvbTphYmMxMjM= (from basic)"
- }
+ === "Default Headers on All Requests"
+ ```yaml
+ Read API:
+ action: READ_API
+ options:
+ base_url: https://some_url.com/api/
+ endpoint: v1/resources
+ default_headers:
+ X-Client: my-pipeline
+ Accept: application/json
+ headers:
+ X-Request: custom
  ```

+ === "Deriving Requests from Context (multiple dynamic requests)"
+ When `requests_from_context: true`, distinct rows from the upstream `context.data`
+ are converted into individual requests (enabling heterogeneous endpoints/params).
+ The DataFrame must have columns: `endpoint`, `params`, `headers`, `data`, `json_body`.
+
+ ```yaml
+ # Upstream step produces rows like:
+ # | endpoint | params | headers | data | json_body |
+ # | "u/123/profile" | {"verbose": "true"} | null | null | null |
+ # | "u/456/profile" | {"verbose": "false"} | null | null | null |
+
+ Read API:
+ action: READ_API
+ options:
+ base_url: https://some_url.com/api/
+ requests_from_context: true
+ method: GET
+ timeout: 45
+ ```
+
+ Output:
+ The action returns a Spark DataFrame with one column `json_response` (ArrayType).
+ Each element contains:
+ ```json
+ {
+ "response": "<json string of the API payload (optionally reduced by 'key')>",
+ "__metadata": {
+ "timestamp": "YYYY-MM-DD HH:MM:SS.ssssss",
+ "base_url": "https://some_url.com/api/",
+ "url": "https://some_url.com/api/my/endpoint/?q=...",
+ "status_code": 200,
+ "reason": "OK",
+ "elapsed": 0.123,
+ "endpoint": "my/endpoint/",
+ "query_parameters": { "q": "..." }
+ }
+ }
+ ```
+ When pagination is enabled and `pages_per_array_limit` > 0, responses are chunked
+ into arrays of that many pages; otherwise all pages for a request are grouped together.
+
+ Validation & Errors:
+ - `base_url` must be provided.
+ - Either `endpoint` must be provided **or** `requests_from_context` must be `true`.
+ - If `requests_from_context` is `true`, `context.data` must be present and non-empty.
+ - Pagination config:
+ - `strategy` must be `page_based` or `limit_offset` (other strategies are not yet supported).
+ - For `page_based`, `page_field` is required.
+ - For `limit_offset`, both `limit_field` and `offset_field` are required.
+
  !!! warning "Secret information"
  Don't write sensitive information like passwords or tokens directly in the pipeline configuration.
  Use secret scopes or environment variables instead.
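The docstring above fully describes the shape of the action's output, so a short downstream sketch may help when consuming it. The snippet below is illustrative and not part of the packaged code: it assumes a DataFrame `df` produced by `READ_API` with the `json_response` schema documented above, and it uses standard PySpark functions (`explode`, `from_json`, `schema_of_json`, the latter two already imported elsewhere in this release).

```python
from pyspark.sql.functions import col, explode, from_json, schema_of_json

# `df` is assumed to be the DataFrame returned by the READ_API action: a single
# `json_response` array column whose elements hold the payload string plus metadata.
flat = df.select(explode(col("json_response")).alias("entry")).select(
    col("entry.response").alias("response"),
    col("entry.__metadata").alias("metadata"),
)

# Infer a schema from one sample payload, then parse every response string
# into typed columns for further processing.
sample = flat.select("response").first()["response"]
parsed = flat.withColumn("payload", from_json(col("response"), schema_of_json(sample)))
```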
@@ -130,73 +286,114 @@ class ReadAPIAction(PipelineAction):

  name: str = "READ_API"

- @staticmethod
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
  def run(
+ self,
  context: PipelineContext,
  *,
  base_url: str | None = None,
- auth: AuthBase | dict[str, str] | None = None,
- default_headers: dict[str, str] | None = None,
- endpoint: str = "", # www.neo4j.de/api/table/2020/01/01
+ auth: Mapping[str, str | Mapping[str, str] | list[Mapping[str, str]]] | None = None,
+ endpoint: str | None = None,
+ default_headers: dict[str, Any] | None = None,
  method: str = "GET",
  key: str | None = None,
  timeout: int = 30,
- params: dict[str, str] | None = None,
- headers: dict[str, str] | None = None,
- data: dict[str, str] | None = None,
- json: dict[str, str] | None = None,
+ params: dict[str, Any] | None = None,
+ headers: dict[str, Any] | None = None,
+ data: dict[str, Any] | None = None,
+ json_body: dict[str, Any] | None = None,
+ pagination: PaginationConfigData | None = None,
  max_retries: int = 0,
- options: dict[str, str] | None = None,
+ backoff_factor: int = 0,
+ max_concurrent_requests: int = 8,
+ requests_from_context: bool = False,
  **_: Any,
  ) -> PipelineContext:
- """Utility class for reading an API into a DataFrame.
+ """Executes API requests in parallel by using mapInPandas.

- This class uses an APIClient to fetch data from an API and load it into a Spark DataFrame.
+ We do NOT write intermediate files; instead we directly return the responses
+ as rows in a Spark DataFrame.


  Args:
- context: The pipeline context containing information about the pipeline.
- base_url: The base URL for the API to be called.
- auth: The authentication credentials for the API.
- default_headers: Default headers to include in the API request.
- endpoint: The specific API endpoint to call.
- method: The HTTP method to use for the request (default is "GET").
- key: Key for accessing specific data in the response.
- timeout: Timeout for the API request in seconds (default is 30).
- params: URL parameters to include in the API request.
- headers: Additional headers to include in the request.
- data: Data to send with the request for POST methods.
- json: JSON data to send with the request for POST methods.
- max_retries: Maximum number of retries for the API request (default is 0).
- options: Additional options for the API request.
+ context: The pipeline context used to carry data between actions.
+ base_url: The base URL for all API requests.
+ auth: Authentication configuration, which may be a simple header map,
+ a nested map for different auth scopes, or a list thereof.
+ endpoint: The specific path to append to the base URL for this call.
+ default_headers: Headers to include on every request.
+ method: HTTP method to use.
+ key: JSON field name to extract from each response.
+ timeout: Request timeout in seconds.
+ params: Query parameters to append to the URL.
+ headers: Additional request-specific headers.
+ data: Form-encoded body to send.
+ json_body: JSON-encoded body to send.
+ pagination: Configuration for paginated endpoints.
+ max_retries: Number of times to retry on failure.
+ backoff_factor: Multiplier for retry backoff delays.
+ max_concurrent_requests: Maximum number of parallel API calls.
+ requests_from_context: Whether to derive request parameters from context data.

  Returns:
- The updated pipeline context containing the DataFrame with the API response data.
+ The updated context, with the read data as a DataFrame.

  Raises:
- ValueError: If the base_url is not specified.
+ ValueError: If no base URL is provided.
+ ValueError: If neither an endpoint nor context-derived requests are specified.
+ ValueError: If context-derived requests are enabled but no data is present in context.
  """
- if not options:
- options = dict()
+ deserialized_auth = process_auth(auth)
+ pagination_config = PaginationConfig(**pagination) if pagination is not None else None

  if base_url is None:
- raise ValueError("base_url must be specified to fetch data from API.")
+ raise ValueError("A value for base_url must to be supplied")

- deserialized_auth = process_auth(auth)
+ if endpoint is None and not requests_from_context:
+ raise ValueError("A value for endpoint must to be supplied")

- api_reader = APIReader(base_url=base_url, auth=deserialized_auth, default_headers=default_headers)
+ api_reader = APIReader(
+ base_url=base_url,
+ auth=deserialized_auth,
+ default_headers=default_headers,
+ max_concurrent_requests=max_concurrent_requests,
+ )
+
+ dynamic_requests: list[RequestSet] | None = None
+
+ if requests_from_context:
+ if not context.data:
+ raise ValueError("Cannot generate requests from the context without a DataFrame in the context.")
+
+ dynamic_requests = [
+ cast(RequestSet, row.asDict())
+ for row in context.data.select(
+ "endpoint",
+ "params",
+ "headers",
+ "data",
+ "json_body",
+ )
+ .distinct()
+ .collect()
+ ]

  df = api_reader.read(
- method=method,
  endpoint=endpoint,
+ method=method,
+ key=key,
  timeout=timeout,
  params=params,
- key=key,
  headers=headers,
  data=data,
- json=json,
+ json_body=json_body,
+ pagination_config=pagination_config,
  max_retries=max_retries,
- options=options,
+ backoff_factor=backoff_factor,
+ dynamic_requests=dynamic_requests,
  )

+ row_count = df.count()
+ self._console_logger.info(f"API requests completed. Final row count = {row_count}.")
+
  return context.from_existing(data=df)
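For orientation, a minimal sketch of invoking the reworked signature directly. It is illustrative only: the import path and the argument-free constructor are assumptions not shown in this diff, `context` stands for the PipelineContext handed over by the previous step, and the pagination keys follow the docstring examples above.

```python
# Assumed import path; the diff only shows ReadAPIAction being re-exported via __all__.
from cloe_nessy.pipeline.actions import ReadAPIAction

action = ReadAPIAction()

# run() is now an instance method validated by pydantic's validate_call; the plain
# pagination mapping below is turned into a PaginationConfig inside run().
result_context = action.run(
    context,
    base_url="https://some_url.com/api/",
    endpoint="items/",
    params={"page": 1, "per_page": 100},
    pagination={
        "strategy": "page_based",
        "page_field": "page",
        "check_field": "results",
        "max_page": -1,
    },
    max_retries=3,
    backoff_factor=2,
    max_concurrent_requests=16,
    timeout=60,
)

# The responses land in result_context.data as the `json_response` DataFrame
# described in the docstring above.
```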
@@ -1,7 +1,7 @@
  from collections.abc import Callable
  from functools import reduce

- from pyspark.sql import DataFrame
+ from cloe_nessy.session import DataFrame

  from ...file_utilities import get_file_paths
  from ...integration.reader import ExcelDataFrameReader
@@ -1,8 +1,9 @@
  from typing import Any

- from pyspark.sql import DataFrame
  from pyspark.sql.functions import col, from_json, schema_of_json, unbase64

+ from cloe_nessy.session import DataFrame
+
  from ..pipeline_action import PipelineAction
  from ..pipeline_context import PipelineContext

@@ -83,6 +83,7 @@ class PipelineStepConfig(PipelineConfigBaseModel):
  context: str | None = None
  table_metadata: str | None = None
  options: dict = Field(default_factory=dict)
+ env: dict = Field(default_factory=dict)


  class PipelineConfig(PipelineConfigBaseModel):
@@ -90,3 +91,4 @@ class PipelineConfig(PipelineConfigBaseModel):

  name: str
  steps: OrderedDict[str, PipelineStepConfig]
+ env: dict[str, str] = Field(default_factory=dict)
@@ -1,6 +1,6 @@
  from typing import Any

- from pyspark.sql import DataFrame
+ from cloe_nessy.session import DataFrame

  from ..models import Table