etlplus 0.16.0__py3-none-any.whl → 0.16.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/api/types.py +32 -11
- etlplus/ops/extract.py +209 -22
- etlplus/ops/load.py +140 -34
- etlplus/ops/run.py +85 -100
- etlplus/ops/transform.py +30 -11
- etlplus/types.py +3 -2
- etlplus/workflow/__init__.py +2 -0
- etlplus/workflow/dag.py +23 -1
- etlplus/workflow/jobs.py +15 -26
- etlplus/workflow/pipeline.py +37 -54
- etlplus/workflow/profile.py +4 -2
- {etlplus-0.16.0.dist-info → etlplus-0.16.2.dist-info}/METADATA +1 -1
- {etlplus-0.16.0.dist-info → etlplus-0.16.2.dist-info}/RECORD +17 -17
- {etlplus-0.16.0.dist-info → etlplus-0.16.2.dist-info}/WHEEL +0 -0
- {etlplus-0.16.0.dist-info → etlplus-0.16.2.dist-info}/entry_points.txt +0 -0
- {etlplus-0.16.0.dist-info → etlplus-0.16.2.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.16.0.dist-info → etlplus-0.16.2.dist-info}/top_level.txt +0 -0
etlplus/api/types.py
CHANGED
|
@@ -53,7 +53,31 @@ __all__ = [
|
|
|
53
53
|
# SECTION: CONSTANTS ======================================================== #
|
|
54
54
|
|
|
55
55
|
|
|
56
|
-
_UNSET = object()
|
|
56
|
+
_UNSET: object = object()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# SECTION: INTERNAL FUNCTIONS =============================================== #
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _to_dict(
|
|
63
|
+
value: Mapping[str, Any] | object | None,
|
|
64
|
+
) -> dict[str, Any] | None:
|
|
65
|
+
"""
|
|
66
|
+
Return a defensive ``dict`` copy for mapping inputs.
|
|
67
|
+
|
|
68
|
+
Parameters
|
|
69
|
+
----------
|
|
70
|
+
value : Mapping[str, Any] | object | None
|
|
71
|
+
Mapping to copy, or ``None``.
|
|
72
|
+
|
|
73
|
+
Returns
|
|
74
|
+
-------
|
|
75
|
+
dict[str, Any] | None
|
|
76
|
+
New ``dict`` instance or ``None`` when the input is ``None``.
|
|
77
|
+
"""
|
|
78
|
+
if value is None:
|
|
79
|
+
return None
|
|
80
|
+
return cast(dict[str, Any], value)
|
|
57
81
|
|
|
58
82
|
|
|
59
83
|
# SECTION: TYPED DICTS ====================================================== #
|
|
@@ -176,9 +200,9 @@ class RequestOptions:
|
|
|
176
200
|
|
|
177
201
|
def __post_init__(self) -> None:
|
|
178
202
|
if self.params is not None:
|
|
179
|
-
object.__setattr__(self, 'params',
|
|
203
|
+
object.__setattr__(self, 'params', _to_dict(self.params))
|
|
180
204
|
if self.headers is not None:
|
|
181
|
-
object.__setattr__(self, 'headers',
|
|
205
|
+
object.__setattr__(self, 'headers', _to_dict(self.headers))
|
|
182
206
|
|
|
183
207
|
# -- Instance Methods -- #
|
|
184
208
|
|
|
@@ -224,23 +248,20 @@ class RequestOptions:
|
|
|
224
248
|
|
|
225
249
|
Returns
|
|
226
250
|
-------
|
|
227
|
-
|
|
251
|
+
Self
|
|
228
252
|
New snapshot reflecting the provided overrides.
|
|
229
253
|
"""
|
|
230
254
|
if params is _UNSET:
|
|
231
255
|
next_params = self.params
|
|
232
|
-
elif params is None:
|
|
233
|
-
next_params = None
|
|
234
256
|
else:
|
|
235
|
-
next_params =
|
|
257
|
+
# next_params = _to_dict(params) if params is not None else None
|
|
258
|
+
next_params = _to_dict(params)
|
|
236
259
|
|
|
237
260
|
if headers is _UNSET:
|
|
238
261
|
next_headers = self.headers
|
|
239
|
-
elif headers is None:
|
|
240
|
-
next_headers = None
|
|
241
262
|
else:
|
|
242
|
-
next_headers =
|
|
243
|
-
|
|
263
|
+
# next_headers = _to_dict(headers) if headers is not None else None
|
|
264
|
+
next_headers = _to_dict(headers)
|
|
244
265
|
if timeout is _UNSET:
|
|
245
266
|
next_timeout = self.timeout
|
|
246
267
|
else:
|
etlplus/ops/extract.py
CHANGED
|
@@ -6,11 +6,19 @@ Helpers to extract data from files, databases, and REST APIs.
|
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
|
+
from collections.abc import Mapping
|
|
9
10
|
from pathlib import Path
|
|
10
11
|
from typing import Any
|
|
11
12
|
from typing import cast
|
|
13
|
+
from urllib.parse import urlsplit
|
|
14
|
+
from urllib.parse import urlunsplit
|
|
12
15
|
|
|
16
|
+
from ..api import EndpointClient
|
|
13
17
|
from ..api import HttpMethod
|
|
18
|
+
from ..api import PaginationConfigMap
|
|
19
|
+
from ..api import RequestOptions
|
|
20
|
+
from ..api import compose_api_request_env
|
|
21
|
+
from ..api import paginate_with_client
|
|
14
22
|
from ..api.utils import resolve_request
|
|
15
23
|
from ..connector import DataConnectorType
|
|
16
24
|
from ..file import File
|
|
@@ -19,6 +27,7 @@ from ..types import JSONData
|
|
|
19
27
|
from ..types import JSONDict
|
|
20
28
|
from ..types import JSONList
|
|
21
29
|
from ..types import StrPath
|
|
30
|
+
from ..types import Timeout
|
|
22
31
|
|
|
23
32
|
# SECTION: EXPORTS ========================================================== #
|
|
24
33
|
|
|
@@ -32,50 +41,164 @@ __all__ = [
|
|
|
32
41
|
]
|
|
33
42
|
|
|
34
43
|
|
|
35
|
-
# SECTION: FUNCTIONS
|
|
44
|
+
# SECTION: INTERNAL FUNCTIONS =============================================== #
|
|
36
45
|
|
|
37
46
|
|
|
38
|
-
def
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
47
|
+
def _build_client(
|
|
48
|
+
*,
|
|
49
|
+
base_url: str,
|
|
50
|
+
base_path: str | None,
|
|
51
|
+
endpoints: dict[str, str],
|
|
52
|
+
retry: Any,
|
|
53
|
+
retry_network_errors: bool,
|
|
54
|
+
session: Any,
|
|
55
|
+
) -> EndpointClient:
|
|
56
|
+
"""
|
|
57
|
+
Construct an API client with shared defaults.
|
|
58
|
+
|
|
59
|
+
Parameters
|
|
60
|
+
----------
|
|
61
|
+
base_url : str
|
|
62
|
+
API base URL.
|
|
63
|
+
base_path : str | None
|
|
64
|
+
Base path to prepend for endpoints.
|
|
65
|
+
endpoints : dict[str, str]
|
|
66
|
+
Endpoint name to path mappings.
|
|
67
|
+
retry : Any
|
|
68
|
+
Retry policy configuration.
|
|
69
|
+
retry_network_errors : bool
|
|
70
|
+
Whether to retry on network errors.
|
|
71
|
+
session : Any
|
|
72
|
+
Optional requests session.
|
|
73
|
+
|
|
74
|
+
Returns
|
|
75
|
+
-------
|
|
76
|
+
EndpointClient
|
|
77
|
+
Configured endpoint client instance.
|
|
78
|
+
"""
|
|
79
|
+
ClientClass = EndpointClient # noqa: N806
|
|
80
|
+
return ClientClass(
|
|
81
|
+
base_url=base_url,
|
|
82
|
+
base_path=base_path,
|
|
83
|
+
endpoints=endpoints,
|
|
84
|
+
retry=retry,
|
|
85
|
+
retry_network_errors=retry_network_errors,
|
|
86
|
+
session=session,
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _extract_from_api_env(
|
|
91
|
+
env: Mapping[str, Any],
|
|
92
|
+
*,
|
|
93
|
+
use_client: bool,
|
|
42
94
|
) -> JSONData:
|
|
43
95
|
"""
|
|
44
|
-
Extract data from a
|
|
96
|
+
Extract API data from a normalized request environment.
|
|
45
97
|
|
|
46
98
|
Parameters
|
|
47
99
|
----------
|
|
48
|
-
|
|
49
|
-
API
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
**kwargs : Any
|
|
53
|
-
Extra arguments forwarded to the underlying ``requests`` call
|
|
54
|
-
(for example, ``timeout``). To use a pre-configured
|
|
55
|
-
:class:`requests.Session`, provide it via ``session``.
|
|
56
|
-
When omitted, ``timeout`` defaults to 10 seconds.
|
|
100
|
+
env : Mapping[str, Any]
|
|
101
|
+
Normalized environment describing API request parameters.
|
|
102
|
+
use_client : bool
|
|
103
|
+
Whether to use the endpoint client/pagination machinery.
|
|
57
104
|
|
|
58
105
|
Returns
|
|
59
106
|
-------
|
|
60
107
|
JSONData
|
|
61
|
-
|
|
108
|
+
Extracted payload.
|
|
62
109
|
|
|
63
110
|
Raises
|
|
64
111
|
------
|
|
65
|
-
|
|
66
|
-
If
|
|
67
|
-
method (for example, ``get``).
|
|
112
|
+
ValueError
|
|
113
|
+
If required parameters are missing.
|
|
68
114
|
"""
|
|
69
|
-
|
|
70
|
-
|
|
115
|
+
if (
|
|
116
|
+
use_client
|
|
117
|
+
and env.get('use_endpoints')
|
|
118
|
+
and env.get('base_url')
|
|
119
|
+
and env.get('endpoints_map')
|
|
120
|
+
and env.get('endpoint_key')
|
|
121
|
+
):
|
|
122
|
+
client = _build_client(
|
|
123
|
+
base_url=cast(str, env.get('base_url')),
|
|
124
|
+
base_path=cast(str | None, env.get('base_path')),
|
|
125
|
+
endpoints=cast(dict[str, str], env.get('endpoints_map', {})),
|
|
126
|
+
retry=env.get('retry'),
|
|
127
|
+
retry_network_errors=bool(env.get('retry_network_errors', False)),
|
|
128
|
+
session=env.get('session'),
|
|
129
|
+
)
|
|
130
|
+
return paginate_with_client(
|
|
131
|
+
client,
|
|
132
|
+
cast(str, env.get('endpoint_key')),
|
|
133
|
+
env.get('params'),
|
|
134
|
+
env.get('headers'),
|
|
135
|
+
env.get('timeout'),
|
|
136
|
+
env.get('pagination'),
|
|
137
|
+
cast(float | None, env.get('sleep_seconds')),
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
url = env.get('url')
|
|
141
|
+
if not url:
|
|
142
|
+
raise ValueError('API source missing URL')
|
|
143
|
+
|
|
144
|
+
if use_client:
|
|
145
|
+
parts = urlsplit(cast(str, url))
|
|
146
|
+
base = urlunsplit((parts.scheme, parts.netloc, '', '', ''))
|
|
147
|
+
client = _build_client(
|
|
148
|
+
base_url=base,
|
|
149
|
+
base_path=None,
|
|
150
|
+
endpoints={},
|
|
151
|
+
retry=env.get('retry'),
|
|
152
|
+
retry_network_errors=bool(env.get('retry_network_errors', False)),
|
|
153
|
+
session=env.get('session'),
|
|
154
|
+
)
|
|
155
|
+
request_options = RequestOptions(
|
|
156
|
+
params=cast(Mapping[str, Any] | None, env.get('params')),
|
|
157
|
+
headers=cast(Mapping[str, str] | None, env.get('headers')),
|
|
158
|
+
timeout=cast(Timeout | None, env.get('timeout')),
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
return client.paginate_url(
|
|
162
|
+
cast(str, url),
|
|
163
|
+
cast(PaginationConfigMap | None, env.get('pagination')),
|
|
164
|
+
request=request_options,
|
|
165
|
+
sleep_seconds=cast(float, env.get('sleep_seconds', 0.0)),
|
|
166
|
+
)
|
|
167
|
+
|
|
168
|
+
method = env.get('method', HttpMethod.GET)
|
|
169
|
+
timeout = env.get('timeout', None)
|
|
170
|
+
session = env.get('session', None)
|
|
171
|
+
request_kwargs = dict(env.get('request_kwargs') or {})
|
|
71
172
|
request_callable, timeout, _ = resolve_request(
|
|
72
173
|
method,
|
|
73
174
|
session=session,
|
|
74
175
|
timeout=timeout,
|
|
75
176
|
)
|
|
76
|
-
response = request_callable(
|
|
177
|
+
response = request_callable(
|
|
178
|
+
cast(str, url),
|
|
179
|
+
timeout=timeout,
|
|
180
|
+
**request_kwargs,
|
|
181
|
+
)
|
|
77
182
|
response.raise_for_status()
|
|
183
|
+
return _parse_api_response(response)
|
|
184
|
+
|
|
78
185
|
|
|
186
|
+
def _parse_api_response(
|
|
187
|
+
response: Any,
|
|
188
|
+
) -> JSONData:
|
|
189
|
+
"""
|
|
190
|
+
Parse API responses into a consistent JSON payload.
|
|
191
|
+
|
|
192
|
+
Parameters
|
|
193
|
+
----------
|
|
194
|
+
response : Any
|
|
195
|
+
HTTP response object exposing ``headers``, ``json()``, and ``text``.
|
|
196
|
+
|
|
197
|
+
Returns
|
|
198
|
+
-------
|
|
199
|
+
JSONData
|
|
200
|
+
Parsed JSON payload, or a fallback object with raw text.
|
|
201
|
+
"""
|
|
79
202
|
content_type = response.headers.get('content-type', '').lower()
|
|
80
203
|
if 'application/json' in content_type:
|
|
81
204
|
try:
|
|
@@ -99,6 +222,70 @@ def extract_from_api(
|
|
|
99
222
|
return {'content': response.text, 'content_type': content_type}
|
|
100
223
|
|
|
101
224
|
|
|
225
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def extract_from_api(
|
|
229
|
+
url: str,
|
|
230
|
+
method: HttpMethod | str = HttpMethod.GET,
|
|
231
|
+
**kwargs: Any,
|
|
232
|
+
) -> JSONData:
|
|
233
|
+
"""
|
|
234
|
+
Extract data from a REST API.
|
|
235
|
+
|
|
236
|
+
Parameters
|
|
237
|
+
----------
|
|
238
|
+
url : str
|
|
239
|
+
API endpoint URL.
|
|
240
|
+
method : HttpMethod | str, optional
|
|
241
|
+
HTTP method to use. Defaults to ``GET``.
|
|
242
|
+
**kwargs : Any
|
|
243
|
+
Extra arguments forwarded to the underlying ``requests`` call
|
|
244
|
+
(for example, ``timeout``). To use a pre-configured
|
|
245
|
+
:class:`requests.Session`, provide it via ``session``.
|
|
246
|
+
When omitted, ``timeout`` defaults to 10 seconds.
|
|
247
|
+
|
|
248
|
+
Returns
|
|
249
|
+
-------
|
|
250
|
+
JSONData
|
|
251
|
+
Parsed JSON payload, or a fallback object with raw text.
|
|
252
|
+
"""
|
|
253
|
+
env = {
|
|
254
|
+
'url': url,
|
|
255
|
+
'method': method,
|
|
256
|
+
'timeout': kwargs.pop('timeout', None),
|
|
257
|
+
'session': kwargs.pop('session', None),
|
|
258
|
+
'request_kwargs': kwargs,
|
|
259
|
+
}
|
|
260
|
+
return _extract_from_api_env(env, use_client=False)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def extract_from_api_source(
|
|
264
|
+
cfg: Any,
|
|
265
|
+
source_obj: Any,
|
|
266
|
+
overrides: dict[str, Any],
|
|
267
|
+
) -> JSONData:
|
|
268
|
+
"""
|
|
269
|
+
Extract data from a REST API source connector.
|
|
270
|
+
|
|
271
|
+
Parameters
|
|
272
|
+
----------
|
|
273
|
+
cfg : Any
|
|
274
|
+
Pipeline configuration.
|
|
275
|
+
source_obj : Any
|
|
276
|
+
Connector configuration.
|
|
277
|
+
overrides : dict[str, Any]
|
|
278
|
+
Extract-time overrides.
|
|
279
|
+
|
|
280
|
+
Returns
|
|
281
|
+
-------
|
|
282
|
+
JSONData
|
|
283
|
+
Extracted payload.
|
|
284
|
+
"""
|
|
285
|
+
env = compose_api_request_env(cfg, source_obj, overrides)
|
|
286
|
+
return _extract_from_api_env(env, use_client=True)
|
|
287
|
+
|
|
288
|
+
|
|
102
289
|
def extract_from_database(
|
|
103
290
|
connection_string: str,
|
|
104
291
|
) -> JSONList:
|
etlplus/ops/load.py
CHANGED
|
@@ -8,11 +8,13 @@ from __future__ import annotations
|
|
|
8
8
|
|
|
9
9
|
import json
|
|
10
10
|
import sys
|
|
11
|
+
from collections.abc import Mapping
|
|
11
12
|
from pathlib import Path
|
|
12
13
|
from typing import Any
|
|
13
14
|
from typing import cast
|
|
14
15
|
|
|
15
16
|
from ..api import HttpMethod
|
|
17
|
+
from ..api import compose_api_target_env
|
|
16
18
|
from ..api.utils import resolve_request
|
|
17
19
|
from ..connector import DataConnectorType
|
|
18
20
|
from ..file import File
|
|
@@ -39,6 +41,108 @@ __all__ = [
|
|
|
39
41
|
# SECTION: INTERNAL FUNCTIONS ============================================== #
|
|
40
42
|
|
|
41
43
|
|
|
44
|
+
def _load_data_from_str(
|
|
45
|
+
source: str,
|
|
46
|
+
) -> JSONData:
|
|
47
|
+
"""
|
|
48
|
+
Load JSON data from a string or file path.
|
|
49
|
+
|
|
50
|
+
Parameters
|
|
51
|
+
----------
|
|
52
|
+
source : str
|
|
53
|
+
Input string representing a file path or JSON payload.
|
|
54
|
+
|
|
55
|
+
Returns
|
|
56
|
+
-------
|
|
57
|
+
JSONData
|
|
58
|
+
Parsed JSON payload.
|
|
59
|
+
"""
|
|
60
|
+
# Special case: '-' means read JSON from STDIN (Unix convention).
|
|
61
|
+
if source == '-':
|
|
62
|
+
raw = sys.stdin.read()
|
|
63
|
+
return _parse_json_string(raw)
|
|
64
|
+
|
|
65
|
+
candidate = Path(source)
|
|
66
|
+
if candidate.exists():
|
|
67
|
+
try:
|
|
68
|
+
return File(candidate, FileFormat.JSON).read()
|
|
69
|
+
except (OSError, json.JSONDecodeError, ValueError):
|
|
70
|
+
# Fall back to treating the string as raw JSON content.
|
|
71
|
+
pass
|
|
72
|
+
return _parse_json_string(source)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _load_to_api_env(
|
|
76
|
+
data: JSONData,
|
|
77
|
+
env: Mapping[str, Any],
|
|
78
|
+
) -> JSONDict:
|
|
79
|
+
"""
|
|
80
|
+
Load data to an API target using a normalized environment.
|
|
81
|
+
|
|
82
|
+
Parameters
|
|
83
|
+
----------
|
|
84
|
+
data : JSONData
|
|
85
|
+
Payload to load.
|
|
86
|
+
env : Mapping[str, Any]
|
|
87
|
+
Normalized request environment.
|
|
88
|
+
|
|
89
|
+
Returns
|
|
90
|
+
-------
|
|
91
|
+
JSONDict
|
|
92
|
+
Load result payload.
|
|
93
|
+
|
|
94
|
+
Raises
|
|
95
|
+
------
|
|
96
|
+
ValueError
|
|
97
|
+
If required parameters are missing.
|
|
98
|
+
"""
|
|
99
|
+
url = env.get('url')
|
|
100
|
+
if not url:
|
|
101
|
+
raise ValueError('API target missing "url"')
|
|
102
|
+
method = env.get('method') or 'post'
|
|
103
|
+
kwargs: dict[str, Any] = {}
|
|
104
|
+
headers = env.get('headers')
|
|
105
|
+
if headers:
|
|
106
|
+
kwargs['headers'] = cast(dict[str, str], headers)
|
|
107
|
+
if env.get('timeout') is not None:
|
|
108
|
+
kwargs['timeout'] = env.get('timeout')
|
|
109
|
+
session = env.get('session')
|
|
110
|
+
if session is not None:
|
|
111
|
+
kwargs['session'] = session
|
|
112
|
+
extra_kwargs = env.get('request_kwargs')
|
|
113
|
+
if isinstance(extra_kwargs, Mapping):
|
|
114
|
+
kwargs.update(extra_kwargs)
|
|
115
|
+
timeout = kwargs.pop('timeout', 10.0)
|
|
116
|
+
session = kwargs.pop('session', None)
|
|
117
|
+
request_callable, timeout, http_method = resolve_request(
|
|
118
|
+
method,
|
|
119
|
+
session=session,
|
|
120
|
+
timeout=timeout,
|
|
121
|
+
)
|
|
122
|
+
response = request_callable(
|
|
123
|
+
cast(str, url),
|
|
124
|
+
json=data,
|
|
125
|
+
timeout=timeout,
|
|
126
|
+
**kwargs,
|
|
127
|
+
)
|
|
128
|
+
response.raise_for_status()
|
|
129
|
+
|
|
130
|
+
# Try JSON first, fall back to text.
|
|
131
|
+
try:
|
|
132
|
+
payload: Any = response.json()
|
|
133
|
+
except ValueError:
|
|
134
|
+
payload = response.text
|
|
135
|
+
|
|
136
|
+
return {
|
|
137
|
+
'status': 'success',
|
|
138
|
+
'status_code': response.status_code,
|
|
139
|
+
'message': f'Data loaded to {url}',
|
|
140
|
+
'response': payload,
|
|
141
|
+
'records': count_records(data),
|
|
142
|
+
'method': http_method.value.upper(),
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
|
|
42
146
|
def _parse_json_string(
|
|
43
147
|
raw: str,
|
|
44
148
|
) -> JSONData:
|
|
@@ -113,18 +217,7 @@ def load_data(
|
|
|
113
217
|
return File(source, FileFormat.JSON).read()
|
|
114
218
|
|
|
115
219
|
if isinstance(source, str):
|
|
116
|
-
|
|
117
|
-
if source == '-':
|
|
118
|
-
raw = sys.stdin.read()
|
|
119
|
-
return _parse_json_string(raw)
|
|
120
|
-
candidate = Path(source)
|
|
121
|
-
if candidate.exists():
|
|
122
|
-
try:
|
|
123
|
-
return File(candidate, FileFormat.JSON).read()
|
|
124
|
-
except (OSError, json.JSONDecodeError, ValueError):
|
|
125
|
-
# Fall back to treating the string as raw JSON content.
|
|
126
|
-
pass
|
|
127
|
-
return _parse_json_string(source)
|
|
220
|
+
return _load_data_from_str(source)
|
|
128
221
|
|
|
129
222
|
raise TypeError(
|
|
130
223
|
'source must be a mapping, sequence of mappings, path, or JSON string',
|
|
@@ -158,30 +251,43 @@ def load_to_api(
|
|
|
158
251
|
Result dictionary including response payload or text.
|
|
159
252
|
"""
|
|
160
253
|
# Apply a conservative timeout to guard against hanging requests.
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
session
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
response.raise_for_status()
|
|
254
|
+
env = {
|
|
255
|
+
'url': url,
|
|
256
|
+
'method': method,
|
|
257
|
+
'timeout': kwargs.pop('timeout', 10.0),
|
|
258
|
+
'session': kwargs.pop('session', None),
|
|
259
|
+
'request_kwargs': kwargs,
|
|
260
|
+
}
|
|
261
|
+
return _load_to_api_env(data, env)
|
|
170
262
|
|
|
171
|
-
# Try JSON first, fall back to text.
|
|
172
|
-
try:
|
|
173
|
-
payload: Any = response.json()
|
|
174
|
-
except ValueError:
|
|
175
|
-
payload = response.text
|
|
176
263
|
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
264
|
+
def load_to_api_target(
|
|
265
|
+
cfg: Any,
|
|
266
|
+
target_obj: Any,
|
|
267
|
+
overrides: dict[str, Any],
|
|
268
|
+
data: JSONData,
|
|
269
|
+
) -> JSONDict:
|
|
270
|
+
"""
|
|
271
|
+
Load data to an API target connector.
|
|
272
|
+
|
|
273
|
+
Parameters
|
|
274
|
+
----------
|
|
275
|
+
cfg : Any
|
|
276
|
+
Pipeline configuration.
|
|
277
|
+
target_obj : Any
|
|
278
|
+
Connector configuration.
|
|
279
|
+
overrides : dict[str, Any]
|
|
280
|
+
Load-time overrides.
|
|
281
|
+
data : JSONData
|
|
282
|
+
Payload to load.
|
|
283
|
+
|
|
284
|
+
Returns
|
|
285
|
+
-------
|
|
286
|
+
JSONDict
|
|
287
|
+
Load result.
|
|
288
|
+
"""
|
|
289
|
+
env = compose_api_target_env(cfg, target_obj, overrides)
|
|
290
|
+
return _load_to_api_env(data, env)
|
|
185
291
|
|
|
186
292
|
|
|
187
293
|
def load_to_database(
|
etlplus/ops/run.py
CHANGED
|
@@ -6,31 +6,23 @@ A module for running ETL jobs defined in YAML configurations.
|
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
|
-
from collections.abc import Mapping
|
|
10
9
|
from typing import Any
|
|
11
10
|
from typing import Final
|
|
12
11
|
from typing import cast
|
|
13
|
-
from urllib.parse import urlsplit
|
|
14
|
-
from urllib.parse import urlunsplit
|
|
15
12
|
|
|
16
|
-
from ..api import EndpointClient # noqa: F401 (re-exported for tests)
|
|
17
13
|
from ..api import HttpMethod
|
|
18
|
-
from ..api import PaginationConfigMap
|
|
19
|
-
from ..api import RequestOptions
|
|
20
|
-
from ..api import compose_api_request_env
|
|
21
|
-
from ..api import compose_api_target_env
|
|
22
|
-
from ..api import paginate_with_client
|
|
23
14
|
from ..connector import DataConnectorType
|
|
24
15
|
from ..file import FileFormat
|
|
25
16
|
from ..types import JSONData
|
|
26
17
|
from ..types import JSONDict
|
|
27
18
|
from ..types import PipelineConfig
|
|
28
19
|
from ..types import StrPath
|
|
29
|
-
from ..types import Timeout
|
|
30
20
|
from ..utils import print_json
|
|
31
21
|
from ..workflow import load_pipeline_config
|
|
32
22
|
from .extract import extract
|
|
23
|
+
from .extract import extract_from_api_source
|
|
33
24
|
from .load import load
|
|
25
|
+
from .load import load_to_api_target
|
|
34
26
|
from .transform import transform
|
|
35
27
|
from .utils import maybe_validate
|
|
36
28
|
from .validate import validate
|
|
@@ -54,6 +46,75 @@ DEFAULT_CONFIG_PATH: Final[str] = 'in/pipeline.yml'
|
|
|
54
46
|
# SECTION: INTERNAL FUNCTIONS =============================================== #
|
|
55
47
|
|
|
56
48
|
|
|
49
|
+
def _index_connectors(
|
|
50
|
+
connectors: list[Any],
|
|
51
|
+
*,
|
|
52
|
+
label: str,
|
|
53
|
+
) -> dict[str, Any]:
|
|
54
|
+
"""
|
|
55
|
+
Index connectors by name with a helpful error on duplicates.
|
|
56
|
+
|
|
57
|
+
Parameters
|
|
58
|
+
----------
|
|
59
|
+
connectors : list[Any]
|
|
60
|
+
Connector objects to index.
|
|
61
|
+
label : str
|
|
62
|
+
Label used in error messages (e.g., ``"source"``).
|
|
63
|
+
|
|
64
|
+
Returns
|
|
65
|
+
-------
|
|
66
|
+
dict[str, Any]
|
|
67
|
+
Mapping of connector names to connector objects.
|
|
68
|
+
|
|
69
|
+
Raises
|
|
70
|
+
------
|
|
71
|
+
ValueError
|
|
72
|
+
If duplicate connector names are found.
|
|
73
|
+
"""
|
|
74
|
+
indexed: dict[str, Any] = {}
|
|
75
|
+
for connector in connectors:
|
|
76
|
+
name = getattr(connector, 'name', None)
|
|
77
|
+
if not isinstance(name, str) or not name:
|
|
78
|
+
continue
|
|
79
|
+
if name in indexed:
|
|
80
|
+
raise ValueError(f'Duplicate {label} connector name: {name}')
|
|
81
|
+
indexed[name] = connector
|
|
82
|
+
return indexed
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _require_named_connector(
|
|
86
|
+
connectors: dict[str, Any],
|
|
87
|
+
name: str,
|
|
88
|
+
*,
|
|
89
|
+
label: str,
|
|
90
|
+
) -> Any:
|
|
91
|
+
"""
|
|
92
|
+
Return a connector by name or raise a helpful error.
|
|
93
|
+
|
|
94
|
+
Parameters
|
|
95
|
+
----------
|
|
96
|
+
connectors : dict[str, Any]
|
|
97
|
+
Mapping of connector names to connector objects.
|
|
98
|
+
name : str
|
|
99
|
+
Connector name to retrieve.
|
|
100
|
+
label : str
|
|
101
|
+
Label used in error messages (e.g., ``"source"``).
|
|
102
|
+
|
|
103
|
+
Returns
|
|
104
|
+
-------
|
|
105
|
+
Any
|
|
106
|
+
Connector object.
|
|
107
|
+
|
|
108
|
+
Raises
|
|
109
|
+
------
|
|
110
|
+
ValueError
|
|
111
|
+
If the connector name is not found.
|
|
112
|
+
"""
|
|
113
|
+
if name not in connectors:
|
|
114
|
+
raise ValueError(f'Unknown {label}: {name}')
|
|
115
|
+
return connectors[name]
|
|
116
|
+
|
|
117
|
+
|
|
57
118
|
def _resolve_validation_config(
|
|
58
119
|
job_obj: Any,
|
|
59
120
|
cfg: Any,
|
|
@@ -122,16 +183,18 @@ def run(
|
|
|
122
183
|
raise ValueError(f'Job not found: {job}')
|
|
123
184
|
|
|
124
185
|
# Index sources/targets by name
|
|
125
|
-
sources_by_name =
|
|
126
|
-
targets_by_name =
|
|
186
|
+
sources_by_name = _index_connectors(cfg.sources, label='source')
|
|
187
|
+
targets_by_name = _index_connectors(cfg.targets, label='target')
|
|
127
188
|
|
|
128
189
|
# Extract.
|
|
129
190
|
if not job_obj.extract:
|
|
130
191
|
raise ValueError('Job missing "extract" section')
|
|
131
192
|
source_name = job_obj.extract.source
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
193
|
+
source_obj = _require_named_connector(
|
|
194
|
+
sources_by_name,
|
|
195
|
+
source_name,
|
|
196
|
+
label='source',
|
|
197
|
+
)
|
|
135
198
|
ex_opts: dict[str, Any] = job_obj.extract.options or {}
|
|
136
199
|
|
|
137
200
|
data: Any
|
|
@@ -151,68 +214,7 @@ def run(
|
|
|
151
214
|
conn = getattr(source_obj, 'connection_string', '')
|
|
152
215
|
data = extract('database', conn)
|
|
153
216
|
case DataConnectorType.API:
|
|
154
|
-
|
|
155
|
-
if (
|
|
156
|
-
env.get('use_endpoints')
|
|
157
|
-
and env.get('base_url')
|
|
158
|
-
and env.get('endpoints_map')
|
|
159
|
-
and env.get('endpoint_key')
|
|
160
|
-
):
|
|
161
|
-
# Construct client using module-level EndpointClient so tests
|
|
162
|
-
# can monkeypatch this class on etlplus.ops.run.
|
|
163
|
-
ClientClass = EndpointClient # noqa: N806
|
|
164
|
-
client = ClientClass(
|
|
165
|
-
base_url=cast(str, env.get('base_url')),
|
|
166
|
-
base_path=cast(str | None, env.get('base_path')),
|
|
167
|
-
endpoints=cast(
|
|
168
|
-
dict[str, str],
|
|
169
|
-
env.get('endpoints_map', {}),
|
|
170
|
-
),
|
|
171
|
-
retry=env.get('retry'),
|
|
172
|
-
retry_network_errors=bool(
|
|
173
|
-
env.get('retry_network_errors', False),
|
|
174
|
-
),
|
|
175
|
-
session=env.get('session'),
|
|
176
|
-
)
|
|
177
|
-
data = paginate_with_client(
|
|
178
|
-
client,
|
|
179
|
-
cast(str, env.get('endpoint_key')),
|
|
180
|
-
env.get('params'),
|
|
181
|
-
env.get('headers'),
|
|
182
|
-
env.get('timeout'),
|
|
183
|
-
env.get('pagination'),
|
|
184
|
-
cast(float | None, env.get('sleep_seconds')),
|
|
185
|
-
)
|
|
186
|
-
else:
|
|
187
|
-
url = env.get('url')
|
|
188
|
-
if not url:
|
|
189
|
-
raise ValueError('API source missing URL')
|
|
190
|
-
parts = urlsplit(cast(str, url))
|
|
191
|
-
base = urlunsplit((parts.scheme, parts.netloc, '', '', ''))
|
|
192
|
-
ClientClass = EndpointClient # noqa: N806
|
|
193
|
-
client = ClientClass(
|
|
194
|
-
base_url=base,
|
|
195
|
-
base_path=None,
|
|
196
|
-
endpoints={},
|
|
197
|
-
retry=env.get('retry'),
|
|
198
|
-
retry_network_errors=bool(
|
|
199
|
-
env.get('retry_network_errors', False),
|
|
200
|
-
),
|
|
201
|
-
session=env.get('session'),
|
|
202
|
-
)
|
|
203
|
-
|
|
204
|
-
request_options = RequestOptions(
|
|
205
|
-
params=cast(Mapping[str, Any] | None, env.get('params')),
|
|
206
|
-
headers=cast(Mapping[str, str] | None, env.get('headers')),
|
|
207
|
-
timeout=cast(Timeout | None, env.get('timeout')),
|
|
208
|
-
)
|
|
209
|
-
|
|
210
|
-
data = client.paginate_url(
|
|
211
|
-
cast(str, url),
|
|
212
|
-
cast(PaginationConfigMap | None, env.get('pagination')),
|
|
213
|
-
request=request_options,
|
|
214
|
-
sleep_seconds=cast(float, env.get('sleep_seconds', 0.0)),
|
|
215
|
-
)
|
|
217
|
+
data = extract_from_api_source(cfg, source_obj, ex_opts)
|
|
216
218
|
case _:
|
|
217
219
|
# :meth:`coerce` already raises for invalid connector types, but
|
|
218
220
|
# keep explicit guard for defensive programming.
|
|
@@ -256,9 +258,11 @@ def run(
|
|
|
256
258
|
if not job_obj.load:
|
|
257
259
|
raise ValueError('Job missing "load" section')
|
|
258
260
|
target_name = job_obj.load.target
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
261
|
+
target_obj = _require_named_connector(
|
|
262
|
+
targets_by_name,
|
|
263
|
+
target_name,
|
|
264
|
+
label='target',
|
|
265
|
+
)
|
|
262
266
|
overrides = job_obj.load.overrides or {}
|
|
263
267
|
|
|
264
268
|
ttype_raw = getattr(target_obj, 'type', None)
|
|
@@ -274,26 +278,7 @@ def run(
|
|
|
274
278
|
raise ValueError('File target missing "path"')
|
|
275
279
|
result = load(data, 'file', path, file_format=fmt)
|
|
276
280
|
case DataConnectorType.API:
|
|
277
|
-
|
|
278
|
-
url_t = env_t.get('url')
|
|
279
|
-
if not url_t:
|
|
280
|
-
raise ValueError('API target missing "url"')
|
|
281
|
-
kwargs_t: dict[str, Any] = {}
|
|
282
|
-
headers = env_t.get('headers')
|
|
283
|
-
if headers:
|
|
284
|
-
kwargs_t['headers'] = cast(dict[str, str], headers)
|
|
285
|
-
if env_t.get('timeout') is not None:
|
|
286
|
-
kwargs_t['timeout'] = env_t.get('timeout')
|
|
287
|
-
session = env_t.get('session')
|
|
288
|
-
if session is not None:
|
|
289
|
-
kwargs_t['session'] = session
|
|
290
|
-
result = load(
|
|
291
|
-
data,
|
|
292
|
-
'api',
|
|
293
|
-
cast(str, url_t),
|
|
294
|
-
method=cast(str | Any, env_t.get('method') or 'post'),
|
|
295
|
-
**kwargs_t,
|
|
296
|
-
)
|
|
281
|
+
result = load_to_api_target(cfg, target_obj, overrides, data)
|
|
297
282
|
case DataConnectorType.DATABASE:
|
|
298
283
|
conn = overrides.get('connection_string') or getattr(
|
|
299
284
|
target_obj,
|
etlplus/ops/transform.py
CHANGED
|
@@ -206,15 +206,12 @@ def _normalize_specs(
|
|
|
206
206
|
"""
|
|
207
207
|
if config is None:
|
|
208
208
|
return []
|
|
209
|
-
if
|
|
210
|
-
config,
|
|
211
|
-
(str, bytes, bytearray),
|
|
212
|
-
):
|
|
209
|
+
if _is_sequence_not_text(config):
|
|
213
210
|
# Already a sequence of step specs; normalize to a list.
|
|
214
|
-
return list(config)
|
|
211
|
+
return list(cast(Sequence[StepSpec], config))
|
|
215
212
|
|
|
216
213
|
# Single spec
|
|
217
|
-
return [config]
|
|
214
|
+
return [cast(StepSpec, config)]
|
|
218
215
|
|
|
219
216
|
|
|
220
217
|
def _normalize_operation_keys(ops: Mapping[Any, Any]) -> dict[str, Any]:
|
|
@@ -702,7 +699,31 @@ def _apply_sort_step(
|
|
|
702
699
|
# -- Helpers -- #
|
|
703
700
|
|
|
704
701
|
|
|
705
|
-
def
|
|
702
|
+
def _is_sequence_not_text(
|
|
703
|
+
obj: Any,
|
|
704
|
+
) -> bool:
|
|
705
|
+
"""
|
|
706
|
+
Return ``True`` for non-text sequences.
|
|
707
|
+
|
|
708
|
+
Parameters
|
|
709
|
+
----------
|
|
710
|
+
obj : Any
|
|
711
|
+
The object to check.
|
|
712
|
+
|
|
713
|
+
Returns
|
|
714
|
+
-------
|
|
715
|
+
bool
|
|
716
|
+
``True`` when *obj* is a non-text sequence.
|
|
717
|
+
"""
|
|
718
|
+
return isinstance(obj, Sequence) and not isinstance(
|
|
719
|
+
obj,
|
|
720
|
+
(str, bytes, bytearray),
|
|
721
|
+
)
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
def _is_plain_fields_list(
|
|
725
|
+
obj: Any,
|
|
726
|
+
) -> bool:
|
|
706
727
|
"""
|
|
707
728
|
Return True if obj is a non-text sequence of non-mapping items.
|
|
708
729
|
|
|
@@ -719,10 +740,8 @@ def _is_plain_fields_list(obj: Any) -> bool:
|
|
|
719
740
|
True if obj is a non-text sequence of non-mapping items, False
|
|
720
741
|
otherwise.
|
|
721
742
|
"""
|
|
722
|
-
return (
|
|
723
|
-
isinstance(
|
|
724
|
-
and not isinstance(obj, (str, bytes, bytearray))
|
|
725
|
-
and not any(isinstance(x, Mapping) for x in obj)
|
|
743
|
+
return _is_sequence_not_text(obj) and not any(
|
|
744
|
+
isinstance(x, Mapping) for x in obj
|
|
726
745
|
)
|
|
727
746
|
|
|
728
747
|
|
etlplus/types.py
CHANGED
|
@@ -12,8 +12,7 @@ Notes
|
|
|
12
12
|
See Also
|
|
13
13
|
--------
|
|
14
14
|
- :mod:`etlplus.api.types` for HTTP-specific aliases and data classes
|
|
15
|
-
- :mod:`etlplus.connector.types` for connector-specific aliases
|
|
16
|
-
surfaces
|
|
15
|
+
- :mod:`etlplus.connector.types` for connector-specific aliases
|
|
17
16
|
|
|
18
17
|
Examples
|
|
19
18
|
--------
|
|
@@ -81,6 +80,8 @@ __all__ = [
|
|
|
81
80
|
# Type Aliases (Networking / Runtime)
|
|
82
81
|
'Sleeper',
|
|
83
82
|
'Timeout',
|
|
83
|
+
# Type Aliases (Templates)
|
|
84
|
+
'TemplateKey',
|
|
84
85
|
]
|
|
85
86
|
|
|
86
87
|
|
etlplus/workflow/__init__.py
CHANGED
|
@@ -14,6 +14,7 @@ from .jobs import TransformRef
|
|
|
14
14
|
from .jobs import ValidationRef
|
|
15
15
|
from .pipeline import PipelineConfig
|
|
16
16
|
from .pipeline import load_pipeline_config
|
|
17
|
+
from .profile import ProfileConfig
|
|
17
18
|
|
|
18
19
|
# SECTION: EXPORTS ========================================================== #
|
|
19
20
|
|
|
@@ -24,6 +25,7 @@ __all__ = [
|
|
|
24
25
|
'JobConfig',
|
|
25
26
|
'LoadRef',
|
|
26
27
|
'PipelineConfig',
|
|
28
|
+
'ProfileConfig',
|
|
27
29
|
'TransformRef',
|
|
28
30
|
'ValidationRef',
|
|
29
31
|
# Functions
|
etlplus/workflow/dag.py
CHANGED
|
@@ -47,6 +47,28 @@ class DagError(ValueError):
|
|
|
47
47
|
return self.message
|
|
48
48
|
|
|
49
49
|
|
|
50
|
+
# SECTION: INTERNAL FUNCTIONS =============================================== #
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _ready(
|
|
54
|
+
indegree: dict[str, int],
|
|
55
|
+
) -> list[str]:
|
|
56
|
+
"""
|
|
57
|
+
Return a sorted list of nodes with zero indegree.
|
|
58
|
+
|
|
59
|
+
Parameters
|
|
60
|
+
----------
|
|
61
|
+
indegree : dict[str, int]
|
|
62
|
+
Mapping of node name to indegree.
|
|
63
|
+
|
|
64
|
+
Returns
|
|
65
|
+
-------
|
|
66
|
+
list[str]
|
|
67
|
+
Sorted list of node names ready to process.
|
|
68
|
+
"""
|
|
69
|
+
return sorted(name for name, deg in indegree.items() if deg == 0)
|
|
70
|
+
|
|
71
|
+
|
|
50
72
|
# SECTION: FUNCTIONS ======================================================== #
|
|
51
73
|
|
|
52
74
|
|
|
@@ -88,7 +110,7 @@ def topological_sort_jobs(
|
|
|
88
110
|
edges[dep].add(job.name)
|
|
89
111
|
indegree[job.name] += 1
|
|
90
112
|
|
|
91
|
-
queue = deque(
|
|
113
|
+
queue = deque(_ready(indegree))
|
|
92
114
|
ordered: list[str] = []
|
|
93
115
|
|
|
94
116
|
while queue:
|
etlplus/workflow/jobs.py
CHANGED
|
@@ -14,6 +14,7 @@ Notes
|
|
|
14
14
|
|
|
15
15
|
from __future__ import annotations
|
|
16
16
|
|
|
17
|
+
from collections.abc import Sequence
|
|
17
18
|
from dataclasses import dataclass
|
|
18
19
|
from dataclasses import field
|
|
19
20
|
from typing import Any
|
|
@@ -76,13 +77,15 @@ def _parse_depends_on(
|
|
|
76
77
|
"""
|
|
77
78
|
if isinstance(value, str):
|
|
78
79
|
return [value]
|
|
79
|
-
if isinstance(value,
|
|
80
|
+
if isinstance(value, Sequence) and not isinstance(
|
|
81
|
+
value,
|
|
82
|
+
(str, bytes, bytearray),
|
|
83
|
+
):
|
|
80
84
|
return [entry for entry in value if isinstance(entry, str)]
|
|
81
85
|
return []
|
|
82
86
|
|
|
83
87
|
|
|
84
88
|
def _require_str(
|
|
85
|
-
# data: dict[str, Any],
|
|
86
89
|
data: StrAnyMap,
|
|
87
90
|
key: str,
|
|
88
91
|
) -> str | None:
|
|
@@ -149,13 +152,9 @@ class ExtractRef:
|
|
|
149
152
|
data = maybe_mapping(obj)
|
|
150
153
|
if not data:
|
|
151
154
|
return None
|
|
152
|
-
source
|
|
153
|
-
if source is None:
|
|
155
|
+
if (source := _require_str(data, 'source')) is None:
|
|
154
156
|
return None
|
|
155
|
-
return cls(
|
|
156
|
-
source=source,
|
|
157
|
-
options=coerce_dict(data.get('options')),
|
|
158
|
-
)
|
|
157
|
+
return cls(source=source, options=coerce_dict(data.get('options')))
|
|
159
158
|
|
|
160
159
|
|
|
161
160
|
@dataclass(kw_only=True, slots=True)
|
|
@@ -214,18 +213,13 @@ class JobConfig:
|
|
|
214
213
|
data = maybe_mapping(obj)
|
|
215
214
|
if not data:
|
|
216
215
|
return None
|
|
217
|
-
name
|
|
218
|
-
if name is None:
|
|
216
|
+
if (name := _require_str(data, 'name')) is None:
|
|
219
217
|
return None
|
|
220
218
|
|
|
221
|
-
description = _coerce_optional_str(data.get('description'))
|
|
222
|
-
|
|
223
|
-
depends_on = _parse_depends_on(data.get('depends_on'))
|
|
224
|
-
|
|
225
219
|
return cls(
|
|
226
220
|
name=name,
|
|
227
|
-
description=description,
|
|
228
|
-
depends_on=depends_on,
|
|
221
|
+
description=_coerce_optional_str(data.get('description')),
|
|
222
|
+
depends_on=_parse_depends_on(data.get('depends_on')),
|
|
229
223
|
extract=ExtractRef.from_obj(data.get('extract')),
|
|
230
224
|
validate=ValidationRef.from_obj(data.get('validate')),
|
|
231
225
|
transform=TransformRef.from_obj(data.get('transform')),
|
|
@@ -274,8 +268,7 @@ class LoadRef:
|
|
|
274
268
|
data = maybe_mapping(obj)
|
|
275
269
|
if not data:
|
|
276
270
|
return None
|
|
277
|
-
target
|
|
278
|
-
if target is None:
|
|
271
|
+
if (target := _require_str(data, 'target')) is None:
|
|
279
272
|
return None
|
|
280
273
|
return cls(
|
|
281
274
|
target=target,
|
|
@@ -321,8 +314,7 @@ class TransformRef:
|
|
|
321
314
|
data = maybe_mapping(obj)
|
|
322
315
|
if not data:
|
|
323
316
|
return None
|
|
324
|
-
pipeline
|
|
325
|
-
if pipeline is None:
|
|
317
|
+
if (pipeline := _require_str(data, 'pipeline')) is None:
|
|
326
318
|
return None
|
|
327
319
|
return cls(pipeline=pipeline)
|
|
328
320
|
|
|
@@ -372,13 +364,10 @@ class ValidationRef:
|
|
|
372
364
|
data = maybe_mapping(obj)
|
|
373
365
|
if not data:
|
|
374
366
|
return None
|
|
375
|
-
ruleset
|
|
376
|
-
if ruleset is None:
|
|
367
|
+
if (ruleset := _require_str(data, 'ruleset')) is None:
|
|
377
368
|
return None
|
|
378
|
-
severity = _coerce_optional_str(data.get('severity'))
|
|
379
|
-
phase = _coerce_optional_str(data.get('phase'))
|
|
380
369
|
return cls(
|
|
381
370
|
ruleset=ruleset,
|
|
382
|
-
severity=severity,
|
|
383
|
-
phase=phase,
|
|
371
|
+
severity=_coerce_optional_str(data.get('severity')),
|
|
372
|
+
phase=_coerce_optional_str(data.get('phase')),
|
|
384
373
|
)
|
etlplus/workflow/pipeline.py
CHANGED
|
@@ -50,20 +50,42 @@ __all__ = [
|
|
|
50
50
|
# SECTION: INTERNAL FUNCTIONS =============================================== #
|
|
51
51
|
|
|
52
52
|
|
|
53
|
-
def
|
|
53
|
+
def _build_connectors(
|
|
54
54
|
raw: StrAnyMap,
|
|
55
|
+
*,
|
|
55
56
|
key: str,
|
|
56
|
-
|
|
57
|
-
) -> list[T]:
|
|
57
|
+
) -> list[Connector]:
|
|
58
58
|
"""
|
|
59
|
-
|
|
59
|
+
Parse connector entries from a list under ``raw[key]``.
|
|
60
60
|
|
|
61
61
|
Parameters
|
|
62
62
|
----------
|
|
63
63
|
raw : StrAnyMap
|
|
64
64
|
Raw pipeline mapping.
|
|
65
65
|
key : str
|
|
66
|
-
Key pointing to
|
|
66
|
+
Key pointing to connector entries (e.g., ``"sources"``).
|
|
67
|
+
|
|
68
|
+
Returns
|
|
69
|
+
-------
|
|
70
|
+
list[Connector]
|
|
71
|
+
Parsed connector instances.
|
|
72
|
+
"""
|
|
73
|
+
return list(
|
|
74
|
+
_collect_parsed(raw.get(key, []) or [], _parse_connector_entry),
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _collect_parsed[T](
|
|
79
|
+
items: Any,
|
|
80
|
+
parser: Callable[[Any], T | None],
|
|
81
|
+
) -> list[T]:
|
|
82
|
+
"""
|
|
83
|
+
Collect parsed items from ``raw[key]`` using a tolerant parser.
|
|
84
|
+
|
|
85
|
+
Parameters
|
|
86
|
+
----------
|
|
87
|
+
items : Any
|
|
88
|
+
List-like payload to parse.
|
|
67
89
|
parser : Callable[[Any], T | None]
|
|
68
90
|
Parser that returns an instance or ``None`` for invalid entries.
|
|
69
91
|
|
|
@@ -72,12 +94,12 @@ def _collect_parsed[T](
|
|
|
72
94
|
list[T]
|
|
73
95
|
Parsed items, excluding invalid entries.
|
|
74
96
|
"""
|
|
75
|
-
|
|
76
|
-
for entry in
|
|
97
|
+
parsed_items: list[T] = []
|
|
98
|
+
for entry in items or []:
|
|
77
99
|
parsed = parser(entry)
|
|
78
100
|
if parsed is not None:
|
|
79
|
-
|
|
80
|
-
return
|
|
101
|
+
parsed_items.append(parsed)
|
|
102
|
+
return parsed_items
|
|
81
103
|
|
|
82
104
|
|
|
83
105
|
def _parse_connector_entry(
|
|
@@ -104,48 +126,6 @@ def _parse_connector_entry(
|
|
|
104
126
|
return None
|
|
105
127
|
|
|
106
128
|
|
|
107
|
-
def _build_sources(
|
|
108
|
-
raw: StrAnyMap,
|
|
109
|
-
) -> list[Connector]:
|
|
110
|
-
"""
|
|
111
|
-
Return a list of source connectors parsed from the mapping.
|
|
112
|
-
|
|
113
|
-
Parameters
|
|
114
|
-
----------
|
|
115
|
-
raw : StrAnyMap
|
|
116
|
-
Raw pipeline mapping.
|
|
117
|
-
|
|
118
|
-
Returns
|
|
119
|
-
-------
|
|
120
|
-
list[Connector]
|
|
121
|
-
Parsed source connectors.
|
|
122
|
-
"""
|
|
123
|
-
return list(
|
|
124
|
-
_collect_parsed(raw, 'sources', _parse_connector_entry),
|
|
125
|
-
)
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
def _build_targets(
|
|
129
|
-
raw: StrAnyMap,
|
|
130
|
-
) -> list[Connector]:
|
|
131
|
-
"""
|
|
132
|
-
Return a list of target connectors parsed from the mapping.
|
|
133
|
-
|
|
134
|
-
Parameters
|
|
135
|
-
----------
|
|
136
|
-
raw : StrAnyMap
|
|
137
|
-
Raw pipeline mapping.
|
|
138
|
-
|
|
139
|
-
Returns
|
|
140
|
-
-------
|
|
141
|
-
list[Connector]
|
|
142
|
-
Parsed target connectors.
|
|
143
|
-
"""
|
|
144
|
-
return list(
|
|
145
|
-
_collect_parsed(raw, 'targets', _parse_connector_entry),
|
|
146
|
-
)
|
|
147
|
-
|
|
148
|
-
|
|
149
129
|
# SECTION: FUNCTIONS ======================================================== #
|
|
150
130
|
|
|
151
131
|
|
|
@@ -311,17 +291,20 @@ class PipelineConfig:
|
|
|
311
291
|
file_systems = coerce_dict(raw.get('file_systems'))
|
|
312
292
|
|
|
313
293
|
# Sources
|
|
314
|
-
sources =
|
|
294
|
+
sources = _build_connectors(raw, key='sources')
|
|
315
295
|
|
|
316
296
|
# Validations/Transforms
|
|
317
297
|
validations = coerce_dict(raw.get('validations'))
|
|
318
298
|
transforms = coerce_dict(raw.get('transforms'))
|
|
319
299
|
|
|
320
300
|
# Targets
|
|
321
|
-
targets =
|
|
301
|
+
targets = _build_connectors(raw, key='targets')
|
|
322
302
|
|
|
323
303
|
# Jobs
|
|
324
|
-
jobs = _collect_parsed(
|
|
304
|
+
jobs: list[JobConfig] = _collect_parsed(
|
|
305
|
+
raw.get('jobs', []) or [],
|
|
306
|
+
JobConfig.from_obj,
|
|
307
|
+
)
|
|
325
308
|
|
|
326
309
|
# Table schemas (optional, tolerant pass-through structures).
|
|
327
310
|
table_schemas: list[dict[str, Any]] = []
|
etlplus/workflow/profile.py
CHANGED
|
@@ -18,6 +18,7 @@ from typing import Self
|
|
|
18
18
|
|
|
19
19
|
from ..types import StrAnyMap
|
|
20
20
|
from ..utils import cast_str_dict
|
|
21
|
+
from ..utils import maybe_mapping
|
|
21
22
|
|
|
22
23
|
# SECTION: EXPORTS ========================================================== #
|
|
23
24
|
|
|
@@ -56,7 +57,8 @@ class ProfileConfig:
|
|
|
56
57
|
cls,
|
|
57
58
|
obj: StrAnyMap | None,
|
|
58
59
|
) -> Self:
|
|
59
|
-
"""
|
|
60
|
+
"""
|
|
61
|
+
Parse a mapping into a :class:`ProfileConfig` instance.
|
|
60
62
|
|
|
61
63
|
Parameters
|
|
62
64
|
----------
|
|
@@ -73,7 +75,7 @@ class ProfileConfig:
|
|
|
73
75
|
return cls()
|
|
74
76
|
|
|
75
77
|
# Coerce all env values to strings using shared helper.
|
|
76
|
-
env = cast_str_dict(obj.get('env'))
|
|
78
|
+
env = cast_str_dict(maybe_mapping(obj.get('env')))
|
|
77
79
|
|
|
78
80
|
return cls(
|
|
79
81
|
default_target=obj.get('default_target'),
|
|
@@ -5,7 +5,7 @@ etlplus/__version__.py,sha256=1E0GMK_yUWCMQFKxXjTvyMwofi0qT2k4CDNiHWiymWE,327
|
|
|
5
5
|
etlplus/enums.py,sha256=8-uUOKe68cPzlmUg-e7gavkC95kbTJXRpRzvXehIsRk,6841
|
|
6
6
|
etlplus/mixins.py,sha256=ifGpHwWv7U00yqGf-kN93vJax2IiK4jaGtTsPsO3Oak,1350
|
|
7
7
|
etlplus/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
-
etlplus/types.py,sha256=
|
|
8
|
+
etlplus/types.py,sha256=J1ftMh0dmxe9ObFd3eXCbHiFvZI_5pV_hEHUpgeJQtY,6239
|
|
9
9
|
etlplus/utils.py,sha256=X-k_Y8i6oDjlE5aQu9sw3gPw7O2ikiSn4uoheVv_ERc,17091
|
|
10
10
|
etlplus/api/README.md,sha256=amxS_eIcsnNuVvD0x_w8nkyfedOTYbhlY0gGhaFg0DE,8705
|
|
11
11
|
etlplus/api/__init__.py,sha256=PK2lQv1FbsE7ZZS_ejevFZQSuOUHGApBc22YfHAzMqA,4615
|
|
@@ -17,7 +17,7 @@ etlplus/api/errors.py,sha256=XjI2xW-sypMUNUbqfc2S57-IGyWnH3oCDFhCmKYYI_Q,4648
|
|
|
17
17
|
etlplus/api/request_manager.py,sha256=fhzPV5x7DqpKqoLvfDR8GKhBX_QBMtvZsRXxVnQQElY,18674
|
|
18
18
|
etlplus/api/retry_manager.py,sha256=aq9iNCxt-Puy4rAgKNtNucxw2eP1yqAKZ2lfgMkzbCk,11302
|
|
19
19
|
etlplus/api/transport.py,sha256=abm-_WieBDSSbFanBwhmudBuVVm7LjYUb8vrlMXo7SA,9408
|
|
20
|
-
etlplus/api/types.py,sha256=
|
|
20
|
+
etlplus/api/types.py,sha256=UcTrB347So12l8NplY-_HDf2T5IwZL_2r8CJDUSAm5Q,7975
|
|
21
21
|
etlplus/api/utils.py,sha256=lNBfJKz3fJ4RhvnnX3uxVZC__6-WKksYMSGGYi0RRqM,26247
|
|
22
22
|
etlplus/api/pagination/__init__.py,sha256=a4UX2J0AG8RMvmHt_CCofUm5vSmFo6GAfkb8XnSXypM,1395
|
|
23
23
|
etlplus/api/pagination/client.py,sha256=yMEpWqRxTCD4zRc9OYtEyUtShpGH5atiHFEAt95v2FE,5394
|
|
@@ -117,10 +117,10 @@ etlplus/file/zip.py,sha256=8wnmnGW_pGTx65736CzAG67XIi5y98KxucRT8sNDeuQ,4195
|
|
|
117
117
|
etlplus/file/zsav.py,sha256=5hMuBjYeHw--UL2ZCCDn6TzJkr_YNhdQhvKI6nr3WW0,1674
|
|
118
118
|
etlplus/ops/README.md,sha256=8omi7DYZhelc26JKk8Cm8QR8I3OGwziysPj1ivx41iQ,1380
|
|
119
119
|
etlplus/ops/__init__.py,sha256=NIIr2f-AZj5B0piBt6gjv46Yn0SzGYxEe6BPoopRh38,1702
|
|
120
|
-
etlplus/ops/extract.py,sha256=
|
|
121
|
-
etlplus/ops/load.py,sha256=
|
|
122
|
-
etlplus/ops/run.py,sha256=
|
|
123
|
-
etlplus/ops/transform.py,sha256=
|
|
120
|
+
etlplus/ops/extract.py,sha256=LOYiPrALRMF7JDBabnRF24_HKnnIcfTdfXesWdS3QZM,11020
|
|
121
|
+
etlplus/ops/load.py,sha256=yicciVwomUKkdbhuRqbavKBNpT2Hg813BnQzG6IgF4o,10811
|
|
122
|
+
etlplus/ops/run.py,sha256=FYb2W5pi2PXx5E-l5etFMUcr7UmQWfMNHew9-otWIYE,11294
|
|
123
|
+
etlplus/ops/transform.py,sha256=3qIJsy2lUSMPoTRWn8Yw4JocKV_ZTQx_fKRW0w73Cnc,25682
|
|
124
124
|
etlplus/ops/utils.py,sha256=lJmrO1KDob-xZU8Gc2SvZvMgdYLsVoaz-fTV42KkLVo,10835
|
|
125
125
|
etlplus/ops/validate.py,sha256=-OLAwQNNCmmDbmj0SB7zzYXDkJfcyBP_z9nTpqImLP0,13271
|
|
126
126
|
etlplus/templates/README.md,sha256=IfPXlj1TGVA-uFWosHJhE2rabFW-znxOlOMazO9Z5cE,1361
|
|
@@ -128,14 +128,14 @@ etlplus/templates/__init__.py,sha256=tsniN7XJYs3NwYxJ6c2HD5upHP3CDkLx-bQCMt97UOM
|
|
|
128
128
|
etlplus/templates/ddl.sql.j2,sha256=s8fMWvcb4eaJVXkifuib1aQPljtZ8buuyB_uA-ZdU3Q,4734
|
|
129
129
|
etlplus/templates/view.sql.j2,sha256=Iy8DHfhq5yyvrUKDxqp_aHIEXY4Tm6j4wT7YDEFWAhk,2180
|
|
130
130
|
etlplus/workflow/README.md,sha256=D1oloiJCOHiqpqgv3m3qpRSIUOMIQcWtIsOPv7KkNI0,1652
|
|
131
|
-
etlplus/workflow/__init__.py,sha256=
|
|
132
|
-
etlplus/workflow/dag.py,sha256
|
|
133
|
-
etlplus/workflow/jobs.py,sha256=
|
|
134
|
-
etlplus/workflow/pipeline.py,sha256=
|
|
135
|
-
etlplus/workflow/profile.py,sha256=
|
|
136
|
-
etlplus-0.16.
|
|
137
|
-
etlplus-0.16.
|
|
138
|
-
etlplus-0.16.
|
|
139
|
-
etlplus-0.16.
|
|
140
|
-
etlplus-0.16.
|
|
141
|
-
etlplus-0.16.
|
|
131
|
+
etlplus/workflow/__init__.py,sha256=ueothwpLruyLgr3-2hW8VT1unNyFJxdmT-l_3eB2ejc,724
|
|
132
|
+
etlplus/workflow/dag.py,sha256=-f1x8N1eb-PUuiOwEvFLmJwfR7JaMDJihlCHlhrFhgE,2937
|
|
133
|
+
etlplus/workflow/jobs.py,sha256=5DmAzmEZV6XXQ-xzowkLxFzplIh8Eno3wuCmjy79xHw,8818
|
|
134
|
+
etlplus/workflow/pipeline.py,sha256=PA5zhcfrk--pAg3b3x4oBf29WMj5HqR8zOozz4oEmg8,9387
|
|
135
|
+
etlplus/workflow/profile.py,sha256=FQU3bzBZ9_yjKC9kCXKN1FQDS9zjNUjtWB1r3UL95_Q,1993
|
|
136
|
+
etlplus-0.16.2.dist-info/licenses/LICENSE,sha256=MuNO63i6kWmgnV2pbP2SLqP54mk1BGmu7CmbtxMmT-U,1069
|
|
137
|
+
etlplus-0.16.2.dist-info/METADATA,sha256=QdFDSAYSrjZKyu5G8TWQSlC1Lobu8hA9qgXpnIOQ2eM,28114
|
|
138
|
+
etlplus-0.16.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
139
|
+
etlplus-0.16.2.dist-info/entry_points.txt,sha256=6w-2-jzuPa55spzK34h-UKh2JTEShh38adFRONNP9QE,45
|
|
140
|
+
etlplus-0.16.2.dist-info/top_level.txt,sha256=aWWF-udn_sLGuHTM6W6MLh99ArS9ROkUWO8Mi8y1_2U,8
|
|
141
|
+
etlplus-0.16.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|