cloe-nessy 0.3.18__py3-none-any.whl → 0.3.19__py3-none-any.whl
- cloe_nessy/clients/api_client/__init__.py +10 -1
- cloe_nessy/clients/api_client/api_client.py +19 -8
- cloe_nessy/clients/api_client/api_response.py +7 -4
- cloe_nessy/clients/api_client/pagination_config.py +84 -0
- cloe_nessy/clients/api_client/pagination_strategy.py +500 -0
- cloe_nessy/integration/reader/__init__.py +2 -2
- cloe_nessy/integration/reader/api_reader.py +463 -72
- cloe_nessy/integration/reader/catalog_reader.py +6 -4
- cloe_nessy/integration/reader/excel_reader.py +3 -3
- cloe_nessy/integration/reader/file_reader.py +3 -1
- cloe_nessy/integration/reader/reader.py +1 -1
- cloe_nessy/integration/writer/catalog_writer.py +1 -1
- cloe_nessy/pipeline/actions/__init__.py +1 -1
- cloe_nessy/pipeline/actions/read_api.py +272 -75
- cloe_nessy/pipeline/actions/read_excel.py +1 -1
- cloe_nessy/pipeline/actions/transform_decode.py +2 -1
- cloe_nessy/pipeline/pipeline_config.py +2 -0
- cloe_nessy/pipeline/pipeline_context.py +1 -1
- cloe_nessy/pipeline/pipeline_parsing_service.py +104 -39
- cloe_nessy/pipeline/pipeline_step.py +2 -0
- cloe_nessy/session/__init__.py +2 -1
- cloe_nessy/session/pyspark_compat.py +15 -0
- cloe_nessy/session/session_manager.py +1 -1
- {cloe_nessy-0.3.18.dist-info → cloe_nessy-0.3.19.dist-info}/METADATA +3 -3
- {cloe_nessy-0.3.18.dist-info → cloe_nessy-0.3.19.dist-info}/RECORD +26 -23
- {cloe_nessy-0.3.18.dist-info → cloe_nessy-0.3.19.dist-info}/WHEEL +1 -1
**`cloe_nessy/pipeline/actions/__init__.py`**

````diff
@@ -33,7 +33,7 @@ pipeline_actions = {cls.name: cls for cls in PipelineAction.__subclasses__()}
 # Register all subclasses dynamically as enum using their "name" attribute as
 # key. We need to do this here, because otherwise we don't get all subclasses
 # from a relative import of PipelineAction
-PipelineActionType = Enum("PipelineActionType", pipeline_actions)  # type: ignore
+PipelineActionType = Enum("PipelineActionType", pipeline_actions)  # type: ignore[misc]
 
 __all__ = [
     "ReadAPIAction",
````
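For context: this module builds its action registry with Python's functional `Enum` API, which accepts a mapping of member names to values. A minimal standalone sketch of the pattern (the subclass here is an illustrative stand-in, not the package's full registry):

```python
from enum import Enum

class PipelineAction:
    name = "BASE"

class ReadAPIAction(PipelineAction):
    name = "READ_API"

# Map each subclass's `name` attribute to the class itself, then build an Enum
# whose member names are the action names and whose values are the classes.
pipeline_actions = {cls.name: cls for cls in PipelineAction.__subclasses__()}
PipelineActionType = Enum("PipelineActionType", pipeline_actions)  # type: ignore[misc]

assert PipelineActionType["READ_API"].value is ReadAPIAction
```

The change itself is cosmetic: the blanket `# type: ignore` is narrowed to `# type: ignore[misc]`, the error code mypy reports for dynamically constructed enums.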
**`cloe_nessy/pipeline/actions/read_api.py`**

````diff
@@ -1,10 +1,12 @@
 from collections.abc import Mapping
 from typing import Any, cast
 
+from pydantic import ConfigDict, validate_call
 from requests.auth import AuthBase, HTTPBasicAuth
 
+from ...clients.api_client import PaginationConfig, PaginationConfigData
 from ...clients.api_client.auth import AzureCredentialAuth, ChainedAuth, EnvVariableAuth, SecretScopeAuth
-from ...integration.reader import APIReader
+from ...integration.reader import APIReader, RequestSet
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
 
@@ -12,11 +14,7 @@ from ..pipeline_context import PipelineContext
 def process_auth(
     auth: Mapping[str, str | Mapping[str, str] | list[Mapping[str, str]]] | AuthBase | None,
 ) -> AuthBase | None:
-    """Processes the auth parameter to create an AuthBase object.
-
-    Args:
-        auth: The auth parameter to be processed.
-    """
+    """Processes the auth parameter to create an AuthBase object."""
     result: AuthBase | None = None
 
     if isinstance(auth, list):
@@ -27,11 +25,9 @@ def process_auth(
             case "basic":
                 result = HTTPBasicAuth(auth["username"], auth["password"])
             case "secret_scope":
-
-                result = SecretScopeAuth(secret_scope_header_template, auth["secret_scope"])
+                result = SecretScopeAuth(auth["header_template"], auth["secret_scope"])
             case "env":
-
-                result = EnvVariableAuth(env_header_template)
+                result = EnvVariableAuth(auth["header_template"])
             case "azure_oauth":
                 result = AzureCredentialAuth(
                     scope=auth["scope"],
@@ -40,9 +36,12 @@
                     tenant_id=auth["tenant_id"],
                 )
             case _:
-                raise ValueError(
+                raise ValueError(
+                    "Invalid auth type specified. Supported types are: basic, secret_scope, env, azure_oauth"
+                )
     else:
-
+        if isinstance(auth, AuthBase):
+            result = auth  # Assume it's already an AuthBase instance
 
     return result
 
````
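`SecretScopeAuth`, `EnvVariableAuth`, `AzureCredentialAuth`, and `ChainedAuth` are the package's own `requests` auth classes; their bodies are not part of this diff. As a rough illustration of the underlying `requests` mechanism (not the package's actual implementation), custom auth objects implement `__call__` on the prepared request, which is what makes chaining several of them straightforward:

```python
import os

from requests.auth import AuthBase


class EnvHeaderAuth(AuthBase):
    """Illustrative only: resolve a header template such as
    {"X-API-Key": "MY_ENV_VAR"} from environment variables."""

    def __init__(self, header_template: dict[str, str]) -> None:
        self.header_template = header_template

    def __call__(self, request):
        for header, env_var in self.header_template.items():
            request.headers[header] = os.environ[env_var]
        return request


class ChainedAuthSketch(AuthBase):
    """Illustrative only: apply several AuthBase instances in order."""

    def __init__(self, *auths: AuthBase) -> None:
        self.auths = auths

    def __call__(self, request):
        for auth in self.auths:
            request = auth(request)  # each auth sets headers and returns the request
        return request
```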
**`cloe_nessy/pipeline/actions/read_api.py`** (continued)

````diff
@@ -50,9 +49,10 @@ def process_auth(
 class ReadAPIAction(PipelineAction):
     """Reads data from an API and loads it into a Spark DataFrame.
 
-    This
-    [`APIReader`][cloe_nessy.integration.reader.api_reader] and
-
+    This action executes HTTP requests (optionally paginated) in parallel using the
+    [`APIReader`][cloe_nessy.integration.reader.api_reader] and returns a DataFrame
+    containing the response payloads plus request/response metadata. No intermediate
+    files are written.
 
     Example:
         === "Basic Usage"
@@ -63,6 +63,7 @@ class ReadAPIAction(PipelineAction):
                     base_url: https://some_url.com/api/
                     endpoint: my/endpoint/
             ```
+
         === "Usage with Parameters and Headers"
             ```yaml
             Read API:
@@ -73,56 +74,211 @@
                     method: GET
                     timeout: 90
                     headers:
-
-
+                        Accept: application/json
+                        X-Request: foo
                     params:
-
-
+                        q: widget
+                        include: details
             ```
-
+
+        === "Usage with Authentication (can be chained)"
             ```yaml
             Read API:
                 action: READ_API
                 options:
                     base_url: https://some_url.com/api/
                     endpoint: my/endpoint/
-                    method: GET
-                    timeout: 90
                     auth:
                         - type: basic
                           username: my_username
                           password: my_password
-                        - type:
-                          secret_scope: my_secret_scope
+                        - type: env
                           header_template:
-                            "
+                            "X-API-Key": "<ENV_VAR_NAME>"
                         - type: secret_scope
                           secret_scope: my_secret_scope
                           header_template:
-                            "
-                        - type: secret_scope
-                          secret_scope: my_other_secret_scope
-                          header_template:
-                            "header_key_3": "<SECRET_NAME>"
+                            "X-ORG-Token": "<SECRET_NAME>"
                         - type: azure_oauth
                           client_id: my_client_id
                           client_secret: my_client_secret
                           tenant_id: my_tenant_id
                           scope: <entra-id-client-id>
             ```
+        The above will combine credentials (via `ChainedAuth`) so that headers from `env`/`secret_scope`
+        are merged and auth flows like Basic / Azure OAuth are applied to each request.
+
+        === "Extracting a Nested Field (key)"
+            If the API returns a large JSON object but you only want a nested list (e.g. `data.items`):
+            ```yaml
+            Read API:
+                action: READ_API
+                options:
+                    base_url: https://some_url.com/api/
+                    endpoint: reports/
+                    key: data.items
+            ```
+
+        === "Pagination (Supported: page_based, limit_offset)"
+            Only `page_based` and `limit_offset` strategies are currently supported. You may also
+            supply the shared/advanced options `check_field`, `next_page_field`, `max_page`,
+            `pages_per_array_limit`, and `preliminary_probe`.
+
+            **1) Page-Based Pagination**
+            ```yaml
+            Read API:
+                action: READ_API
+                options:
+                    base_url: https://some_url.com/api/
+                    endpoint: items/
+                    params:
+                        page: 1        # starting page (optional; defaults to 1)
+                        per_page: 100
+                    pagination:
+                        strategy: page_based
+                        page_field: page                # required
+                        # Shared/advanced (optional):
+                        check_field: results            # e.g. list to check for emptiness
+                        next_page_field: info.has_next  # boolean flag; if present it is trusted
+                        max_page: -1                    # -1 = all pages
+                        pages_per_array_limit: 2        # chunk output rows every 2 pages
+                        preliminary_probe: false        # set true to pre-scan/build all page params
+            ```
+            This issues requests like:
+            ```
+            GET .../items/?page=1&per_page=100
+            GET .../items/?page=2&per_page=100
+            ...
+            ```
+
+            **2) Limit/Offset Pagination**
+            ```yaml
+            Read API:
+                action: READ_API
+                options:
+                    base_url: https://some_url.com/api/
+                    endpoint: products/
+                    params:
+                        limit: 50
+                        offset: 0
+                    pagination:
+                        strategy: limit_offset
+                        limit_field: limit      # required
+                        offset_field: offset    # required
+                        # Shared/advanced (optional):
+                        check_field: data.items
+                        next_page_field: page_info.has_next
+                        max_page: -1
+                        pages_per_array_limit: -1
+                        preliminary_probe: false
+            ```
+            This issues requests like:
+            ```
+            GET .../products/?limit=50&offset=0
+            GET .../products/?limit=50&offset=50
+            GET .../products/?limit=50&offset=100
+            ...
+            ```
+
+            **Using `preliminary_probe` to pre-compute all pages**
+            If `preliminary_probe: true` is set, the reader will first probe the API to determine
+            the final page (using `check_field` and/or `next_page_field`) and then fan out one request
+            per page/offset—useful when driving fully parallel execution:
+            ```yaml
+            Read API:
+                action: READ_API
+                options:
+                    base_url: https://api.example.com/
+                    endpoint: orders/
+                    params:
+                        limit: 100
+                        offset: 0
+                    pagination:
+                        strategy: limit_offset
+                        limit_field: limit
+                        offset_field: offset
+                        check_field: data
+                        preliminary_probe: true
+                    max_concurrent_requests: 16
+            ```
 
-
+        === "Retries and Concurrency"
+            ```yaml
+            Read API:
+                action: READ_API
+                options:
+                    base_url: https://some_url.com/api/
+                    endpoint: heavy/endpoint/
+                    max_retries: 3              # network/5xx retry count
+                    backoff_factor: 2           # exponential backoff multiplier
+                    max_concurrent_requests: 16
+                    timeout: 60
+            ```
 
-
-
-
-
-
-
-
-
+        === "Default Headers on All Requests"
+            ```yaml
+            Read API:
+                action: READ_API
+                options:
+                    base_url: https://some_url.com/api/
+                    endpoint: v1/resources
+                    default_headers:
+                        X-Client: my-pipeline
+                        Accept: application/json
+                    headers:
+                        X-Request: custom
             ```
 
+        === "Deriving Requests from Context (multiple dynamic requests)"
+            When `requests_from_context: true`, distinct rows from the upstream `context.data`
+            are converted into individual requests (enabling heterogeneous endpoints/params).
+            The DataFrame must have columns: `endpoint`, `params`, `headers`, `data`, `json_body`.
+
+            ```yaml
+            # Upstream step produces rows like:
+            # | endpoint        | params               | headers | data | json_body |
+            # | "u/123/profile" | {"verbose": "true"}  | null    | null | null      |
+            # | "u/456/profile" | {"verbose": "false"} | null    | null | null      |
+
+            Read API:
+                action: READ_API
+                options:
+                    base_url: https://some_url.com/api/
+                    requests_from_context: true
+                    method: GET
+                    timeout: 45
+            ```
+
+        Output:
+            The action returns a Spark DataFrame with one column `json_response` (ArrayType).
+            Each element contains:
+            ```json
+            {
+                "response": "<json string of the API payload (optionally reduced by 'key')>",
+                "__metadata": {
+                    "timestamp": "YYYY-MM-DD HH:MM:SS.ssssss",
+                    "base_url": "https://some_url.com/api/",
+                    "url": "https://some_url.com/api/my/endpoint/?q=...",
+                    "status_code": 200,
+                    "reason": "OK",
+                    "elapsed": 0.123,
+                    "endpoint": "my/endpoint/",
+                    "query_parameters": { "q": "..." }
+                }
+            }
+            ```
+            When pagination is enabled and `pages_per_array_limit` > 0, responses are chunked
+            into arrays of that many pages; otherwise all pages for a request are grouped together.
+
+        Validation & Errors:
+            - `base_url` must be provided.
+            - Either `endpoint` must be provided **or** `requests_from_context` must be `true`.
+            - If `requests_from_context` is `true`, `context.data` must be present and non-empty.
+            - Pagination config:
+                - `strategy` must be `page_based` or `limit_offset` (other strategies are not yet supported).
+                - For `page_based`, `page_field` is required.
+                - For `limit_offset`, both `limit_field` and `offset_field` are required.
+
         !!! warning "Secret information"
             Don't write sensitive information like passwords or tokens directly in the pipeline configuration.
             Use secret scopes or environment variables instead.
````
**`cloe_nessy/pipeline/actions/read_api.py`** (continued)

````diff
@@ -130,73 +286,114 @@ class ReadAPIAction(PipelineAction):
 
     name: str = "READ_API"
 
-    @
+    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def run(
+        self,
         context: PipelineContext,
         *,
         base_url: str | None = None,
-        auth:
-
-
+        auth: Mapping[str, str | Mapping[str, str] | list[Mapping[str, str]]] | None = None,
+        endpoint: str | None = None,
+        default_headers: dict[str, Any] | None = None,
         method: str = "GET",
         key: str | None = None,
         timeout: int = 30,
-        params: dict[str,
-        headers: dict[str,
-        data: dict[str,
-
+        params: dict[str, Any] | None = None,
+        headers: dict[str, Any] | None = None,
+        data: dict[str, Any] | None = None,
+        json_body: dict[str, Any] | None = None,
+        pagination: PaginationConfigData | None = None,
         max_retries: int = 0,
-
+        backoff_factor: int = 0,
+        max_concurrent_requests: int = 8,
+        requests_from_context: bool = False,
         **_: Any,
     ) -> PipelineContext:
-        """
+        """Executes API requests in parallel by using mapInPandas.
 
-
+        We do NOT write intermediate files; instead we directly return the responses
+        as rows in a Spark DataFrame.
 
 
         Args:
-            context: The pipeline context
-            base_url: The base URL for
-            auth:
-
-            endpoint: The specific
-
-
-
-
-
-
-
-
-
+            context: The pipeline context used to carry data between actions.
+            base_url: The base URL for all API requests.
+            auth: Authentication configuration, which may be a simple header map,
+                a nested map for different auth scopes, or a list thereof.
+            endpoint: The specific path to append to the base URL for this call.
+            default_headers: Headers to include on every request.
+            method: HTTP method to use.
+            key: JSON field name to extract from each response.
+            timeout: Request timeout in seconds.
+            params: Query parameters to append to the URL.
+            headers: Additional request-specific headers.
+            data: Form-encoded body to send.
+            json_body: JSON-encoded body to send.
+            pagination: Configuration for paginated endpoints.
+            max_retries: Number of times to retry on failure.
+            backoff_factor: Multiplier for retry backoff delays.
+            max_concurrent_requests: Maximum number of parallel API calls.
+            requests_from_context: Whether to derive request parameters from context data.
 
         Returns:
-            The updated
+            The updated context, with the read data as a DataFrame.
 
         Raises:
-            ValueError: If
+            ValueError: If no base URL is provided.
+            ValueError: If neither an endpoint nor context-derived requests are specified.
+            ValueError: If context-derived requests are enabled but no data is present in context.
         """
-
-
+        deserialized_auth = process_auth(auth)
+        pagination_config = PaginationConfig(**pagination) if pagination is not None else None
 
         if base_url is None:
-            raise ValueError("base_url must
+            raise ValueError("A value for base_url must to be supplied")
 
-
+        if endpoint is None and not requests_from_context:
+            raise ValueError("A value for endpoint must to be supplied")
 
-        api_reader = APIReader(
+        api_reader = APIReader(
+            base_url=base_url,
+            auth=deserialized_auth,
+            default_headers=default_headers,
+            max_concurrent_requests=max_concurrent_requests,
+        )
+
+        dynamic_requests: list[RequestSet] | None = None
+
+        if requests_from_context:
+            if not context.data:
+                raise ValueError("Cannot generate requests from the context without a DataFrame in the context.")
+
+            dynamic_requests = [
+                cast(RequestSet, row.asDict())
+                for row in context.data.select(
+                    "endpoint",
+                    "params",
+                    "headers",
+                    "data",
+                    "json_body",
+                )
+                .distinct()
+                .collect()
+            ]
 
         df = api_reader.read(
-            method=method,
             endpoint=endpoint,
+            method=method,
+            key=key,
             timeout=timeout,
             params=params,
-            key=key,
             headers=headers,
             data=data,
-
+            json_body=json_body,
+            pagination_config=pagination_config,
             max_retries=max_retries,
-
+            backoff_factor=backoff_factor,
+            dynamic_requests=dynamic_requests,
         )
 
+        row_count = df.count()
+        self._console_logger.info(f"API requests completed. Final row count = {row_count}.")
+
         return context.from_existing(data=df)
````
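Putting the new `run()` signature to work from Python rather than YAML would look roughly like this (the URL and the incoming `context` are placeholders; the keyword arguments mirror the signature above):

```python
# Hypothetical invocation; `context` is assumed to be an existing PipelineContext.
action = ReadAPIAction()
context = action.run(
    context,
    base_url="https://api.example.com/",  # placeholder
    endpoint="products/",
    params={"limit": 50, "offset": 0},
    pagination={
        "strategy": "limit_offset",
        "limit_field": "limit",
        "offset_field": "offset",
        "check_field": "data.items",
    },
    max_retries=3,
    backoff_factor=2,
    max_concurrent_requests=16,
)
df = context.data  # single `json_response` array column, as documented above
```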
**`cloe_nessy/pipeline/actions/transform_decode.py`**

````diff
@@ -1,8 +1,9 @@
 from typing import Any
 
-from pyspark.sql import DataFrame
 from pyspark.sql.functions import col, from_json, schema_of_json, unbase64
 
+from cloe_nessy.session import DataFrame
+
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
 
````
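The import swap above routes `DataFrame` through `cloe_nessy.session`, whose new `pyspark_compat.py` (+15 lines) is listed but not shown in this diff. A plausible sketch of such a shim, assuming its job is to expose a `DataFrame` type that works for both classic and Spark Connect sessions:

```python
# Hypothetical sketch only; the real pyspark_compat.py is not shown in this diff.
try:
    # Spark Connect (PySpark >= 3.4) uses its own DataFrame class.
    from pyspark.sql.connect.dataframe import DataFrame
except ImportError:
    from pyspark.sql import DataFrame  # classic JVM-backed DataFrame

__all__ = ["DataFrame"]
```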
**`cloe_nessy/pipeline/pipeline_config.py`**

````diff
@@ -83,6 +83,7 @@ class PipelineStepConfig(PipelineConfigBaseModel):
     context: str | None = None
     table_metadata: str | None = None
     options: dict = Field(default_factory=dict)
+    env: dict = Field(default_factory=dict)
 
 
 class PipelineConfig(PipelineConfigBaseModel):
@@ -90,3 +91,4 @@ class PipelineConfig(PipelineConfigBaseModel):
 
     name: str
     steps: OrderedDict[str, PipelineStepConfig]
+    env: dict[str, str] = Field(default_factory=dict)
````
env: dict[str, str] = Field(default_factory=dict)
|