etlplus 0.16.0__py3-none-any.whl → 0.16.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/api/types.py +32 -11
- etlplus/enums.py +35 -167
- etlplus/ops/__init__.py +1 -0
- etlplus/ops/enums.py +173 -0
- etlplus/ops/extract.py +209 -22
- etlplus/ops/load.py +140 -34
- etlplus/ops/run.py +86 -101
- etlplus/ops/transform.py +46 -27
- etlplus/ops/types.py +147 -0
- etlplus/types.py +3 -101
- etlplus/workflow/__init__.py +2 -0
- etlplus/workflow/dag.py +23 -1
- etlplus/workflow/jobs.py +15 -26
- etlplus/workflow/pipeline.py +37 -54
- etlplus/workflow/profile.py +4 -2
- {etlplus-0.16.0.dist-info → etlplus-0.16.3.dist-info}/METADATA +1 -1
- {etlplus-0.16.0.dist-info → etlplus-0.16.3.dist-info}/RECORD +21 -19
- {etlplus-0.16.0.dist-info → etlplus-0.16.3.dist-info}/WHEEL +0 -0
- {etlplus-0.16.0.dist-info → etlplus-0.16.3.dist-info}/entry_points.txt +0 -0
- {etlplus-0.16.0.dist-info → etlplus-0.16.3.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.16.0.dist-info → etlplus-0.16.3.dist-info}/top_level.txt +0 -0
etlplus/ops/extract.py
CHANGED
|
@@ -6,11 +6,19 @@ Helpers to extract data from files, databases, and REST APIs.
|
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
|
+
from collections.abc import Mapping
|
|
9
10
|
from pathlib import Path
|
|
10
11
|
from typing import Any
|
|
11
12
|
from typing import cast
|
|
13
|
+
from urllib.parse import urlsplit
|
|
14
|
+
from urllib.parse import urlunsplit
|
|
12
15
|
|
|
16
|
+
from ..api import EndpointClient
|
|
13
17
|
from ..api import HttpMethod
|
|
18
|
+
from ..api import PaginationConfigMap
|
|
19
|
+
from ..api import RequestOptions
|
|
20
|
+
from ..api import compose_api_request_env
|
|
21
|
+
from ..api import paginate_with_client
|
|
14
22
|
from ..api.utils import resolve_request
|
|
15
23
|
from ..connector import DataConnectorType
|
|
16
24
|
from ..file import File
|
|
@@ -19,6 +27,7 @@ from ..types import JSONData
|
|
|
19
27
|
from ..types import JSONDict
|
|
20
28
|
from ..types import JSONList
|
|
21
29
|
from ..types import StrPath
|
|
30
|
+
from ..types import Timeout
|
|
22
31
|
|
|
23
32
|
# SECTION: EXPORTS ========================================================== #
|
|
24
33
|
|
|
@@ -32,50 +41,164 @@ __all__ = [
|
|
|
32
41
|
]
|
|
33
42
|
|
|
34
43
|
|
|
35
|
-
# SECTION: FUNCTIONS
|
|
44
|
+
# SECTION: INTERNAL FUNCTIONS =============================================== #
|
|
36
45
|
|
|
37
46
|
|
|
38
|
-
def
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
47
|
+
def _build_client(
    *,
    base_url: str,
    base_path: str | None,
    endpoints: dict[str, str],
    retry: Any,
    retry_network_errors: bool,
    session: Any,
) -> EndpointClient:
    """
    Create an :class:`EndpointClient` from keyword-only settings.

    Every argument is forwarded unchanged to the ``EndpointClient``
    constructor under the same keyword name.

    Parameters
    ----------
    base_url : str
        Base URL handed to the client.
    base_path : str | None
        Optional base path handed to the client.
    endpoints : dict[str, str]
        Mapping of endpoint names to paths.
    retry : Any
        Retry policy configuration, passed through as-is.
    retry_network_errors : bool
        Flag forwarded to the client controlling retries on network errors.
    session : Any
        Optional session object, passed through as-is.

    Returns
    -------
    EndpointClient
        The newly constructed client.
    """
    # ``EndpointClient`` is resolved from module globals at call time.
    return EndpointClient(
        base_url=base_url,
        base_path=base_path,
        endpoints=endpoints,
        retry=retry,
        retry_network_errors=retry_network_errors,
        session=session,
    )
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _extract_from_api_env(
    env: Mapping[str, Any],
    *,
    use_client: bool,
) -> JSONData:
    """
    Fetch an API payload described by a normalized request environment.

    One of three strategies is used:

    1. When ``use_client`` is true and ``env`` carries truthy
       ``use_endpoints``, ``base_url``, ``endpoints_map`` and
       ``endpoint_key`` values, a client is built from them and the named
       endpoint is fetched via ``paginate_with_client``.
    2. Otherwise, when ``use_client`` is true, a client is built for the
       scheme and host of ``env['url']`` and ``paginate_url`` is called.
    3. Otherwise a single request is issued through the callable returned
       by ``resolve_request`` and the response is parsed.

    Parameters
    ----------
    env : Mapping[str, Any]
        Normalized environment describing the request.
    use_client : bool
        Whether the endpoint client/pagination path may be used.

    Returns
    -------
    JSONData
        Extracted payload.

    Raises
    ------
    ValueError
        If strategy 1 does not apply and ``env`` has no non-empty ``url``.
    """
    # Strategy 1: named endpoint resolved through the endpoint client.
    endpoint_mode = (
        use_client
        and env.get('use_endpoints')
        and env.get('base_url')
        and env.get('endpoints_map')
        and env.get('endpoint_key')
    )
    if endpoint_mode:
        endpoint_client = _build_client(
            base_url=cast(str, env.get('base_url')),
            base_path=cast(str | None, env.get('base_path')),
            endpoints=cast(dict[str, str], env.get('endpoints_map', {})),
            retry=env.get('retry'),
            retry_network_errors=bool(env.get('retry_network_errors', False)),
            session=env.get('session'),
        )
        return paginate_with_client(
            endpoint_client,
            cast(str, env.get('endpoint_key')),
            env.get('params'),
            env.get('headers'),
            env.get('timeout'),
            env.get('pagination'),
            cast(float | None, env.get('sleep_seconds')),
        )

    # Both remaining strategies need an explicit URL.
    url = env.get('url')
    if not url:
        raise ValueError('API source missing URL')
    target_url = cast(str, url)

    # Strategy 3: plain single request without the client machinery.
    if not use_client:
        http_method = env.get('method', HttpMethod.GET)
        requested_timeout = env.get('timeout', None)
        http_session = env.get('session', None)
        extra_kwargs = dict(env.get('request_kwargs') or {})
        send, effective_timeout, _ = resolve_request(
            http_method,
            session=http_session,
            timeout=requested_timeout,
        )
        response = send(
            target_url,
            timeout=effective_timeout,
            **extra_kwargs,
        )
        response.raise_for_status()
        return _parse_api_response(response)

    # Strategy 2: client rooted at the URL's scheme and host only.
    split = urlsplit(target_url)
    url_client = _build_client(
        base_url=urlunsplit((split.scheme, split.netloc, '', '', '')),
        base_path=None,
        endpoints={},
        retry=env.get('retry'),
        retry_network_errors=bool(env.get('retry_network_errors', False)),
        session=env.get('session'),
    )
    options = RequestOptions(
        params=cast(Mapping[str, Any] | None, env.get('params')),
        headers=cast(Mapping[str, str] | None, env.get('headers')),
        timeout=cast(Timeout | None, env.get('timeout')),
    )
    return url_client.paginate_url(
        target_url,
        cast(PaginationConfigMap | None, env.get('pagination')),
        request=options,
        sleep_seconds=cast(float, env.get('sleep_seconds', 0.0)),
    )
|
|
184
|
+
|
|
78
185
|
|
|
186
|
+
def _parse_api_response(
|
|
187
|
+
response: Any,
|
|
188
|
+
) -> JSONData:
|
|
189
|
+
"""
|
|
190
|
+
Parse API responses into a consistent JSON payload.
|
|
191
|
+
|
|
192
|
+
Parameters
|
|
193
|
+
----------
|
|
194
|
+
response : Any
|
|
195
|
+
HTTP response object exposing ``headers``, ``json()``, and ``text``.
|
|
196
|
+
|
|
197
|
+
Returns
|
|
198
|
+
-------
|
|
199
|
+
JSONData
|
|
200
|
+
Parsed JSON payload, or a fallback object with raw text.
|
|
201
|
+
"""
|
|
79
202
|
content_type = response.headers.get('content-type', '').lower()
|
|
80
203
|
if 'application/json' in content_type:
|
|
81
204
|
try:
|
|
@@ -99,6 +222,70 @@ def extract_from_api(
|
|
|
99
222
|
return {'content': response.text, 'content_type': content_type}
|
|
100
223
|
|
|
101
224
|
|
|
225
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def extract_from_api(
    url: str,
    method: HttpMethod | str = HttpMethod.GET,
    **kwargs: Any,
) -> JSONData:
    """
    Fetch data from a REST API with a single request.

    Parameters
    ----------
    url : str
        API endpoint URL.
    method : HttpMethod | str, optional
        HTTP method to use. Defaults to ``GET``.
    **kwargs : Any
        Extra request arguments. ``timeout`` and ``session`` are pulled out
        and handled separately; everything else is forwarded to the request
        call. When ``timeout`` is omitted, ``None`` is forwarded and the
        effective value is decided by ``resolve_request``.

    Returns
    -------
    JSONData
        Payload produced by ``_extract_from_api_env``.
    """
    # Separate the two specially handled options from pass-through kwargs.
    timeout = kwargs.pop('timeout', None)
    session = kwargs.pop('session', None)
    request_env: dict[str, Any] = {
        'url': url,
        'method': method,
        'timeout': timeout,
        'session': session,
        'request_kwargs': kwargs,
    }
    # The direct helper never uses the endpoint client/pagination path.
    return _extract_from_api_env(request_env, use_client=False)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def extract_from_api_source(
    cfg: Any,
    source_obj: Any,
    overrides: dict[str, Any],
) -> JSONData:
    """
    Fetch data for a REST API source connector.

    The request environment is composed by ``compose_api_request_env`` and
    then executed with the endpoint client path enabled.

    Parameters
    ----------
    cfg : Any
        Pipeline configuration.
    source_obj : Any
        Connector configuration.
    overrides : dict[str, Any]
        Extract-time overrides.

    Returns
    -------
    JSONData
        Extracted payload.
    """
    return _extract_from_api_env(
        compose_api_request_env(cfg, source_obj, overrides),
        use_client=True,
    )
|
|
287
|
+
|
|
288
|
+
|
|
102
289
|
def extract_from_database(
|
|
103
290
|
connection_string: str,
|
|
104
291
|
) -> JSONList:
|
etlplus/ops/load.py
CHANGED
|
@@ -8,11 +8,13 @@ from __future__ import annotations
|
|
|
8
8
|
|
|
9
9
|
import json
|
|
10
10
|
import sys
|
|
11
|
+
from collections.abc import Mapping
|
|
11
12
|
from pathlib import Path
|
|
12
13
|
from typing import Any
|
|
13
14
|
from typing import cast
|
|
14
15
|
|
|
15
16
|
from ..api import HttpMethod
|
|
17
|
+
from ..api import compose_api_target_env
|
|
16
18
|
from ..api.utils import resolve_request
|
|
17
19
|
from ..connector import DataConnectorType
|
|
18
20
|
from ..file import File
|
|
@@ -39,6 +41,108 @@ __all__ = [
|
|
|
39
41
|
# SECTION: INTERNAL FUNCTIONS ============================================== #
|
|
40
42
|
|
|
41
43
|
|
|
44
|
+
def _load_data_from_str(
    source: str,
) -> JSONData:
    """
    Load JSON data from a string or file path.

    Resolution order:

    1. ``'-'`` reads the payload from STDIN.
    2. If ``source`` names an existing path, it is read as a JSON file.
    3. Otherwise (or if reading the file fails), ``source`` itself is
       parsed as raw JSON text.

    Parameters
    ----------
    source : str
        Input string representing a file path or JSON payload.

    Returns
    -------
    JSONData
        Parsed JSON payload.
    """
    # Special case: '-' means read JSON from STDIN (Unix convention).
    if source == '-':
        return _parse_json_string(sys.stdin.read())

    candidate = Path(source)
    try:
        is_existing_path = candidate.exists()
    except (OSError, ValueError):
        # A raw JSON payload is usually not a valid path; probing it can
        # raise (for example "File name too long") on some platforms and
        # Python versions. Treat that as "not a file" rather than failing.
        is_existing_path = False

    if is_existing_path:
        try:
            return File(candidate, FileFormat.JSON).read()
        except (OSError, json.JSONDecodeError, ValueError):
            # Fall back to treating the string as raw JSON content.
            pass
    return _parse_json_string(source)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _load_to_api_env(
    data: JSONData,
    env: Mapping[str, Any],
) -> JSONDict:
    """
    Send ``data`` as a JSON body to the API target described by ``env``.

    Request options are assembled from ``env`` (``headers``, ``timeout``,
    ``session``) and then updated with ``env['request_kwargs']`` when that
    value is a mapping, so entries there take precedence.

    Parameters
    ----------
    data : JSONData
        Payload to load.
    env : Mapping[str, Any]
        Normalized request environment.

    Returns
    -------
    JSONDict
        Result with ``status``, ``status_code``, ``message``, ``response``,
        ``records`` and ``method`` keys.

    Raises
    ------
    ValueError
        If ``env`` has no non-empty ``url``.
    """
    url = env.get('url')
    if not url:
        raise ValueError('API target missing "url"')
    verb = env.get('method') or 'post'

    # Collect options from the environment; falsy headers and ``None``
    # timeout/session values are left out.
    options: dict[str, Any] = {}
    if header_map := env.get('headers'):
        options['headers'] = cast(dict[str, str], header_map)
    for option_name in ('timeout', 'session'):
        option_value = env.get(option_name)
        if option_value is not None:
            options[option_name] = option_value

    # Explicit request kwargs override anything gathered above.
    overrides = env.get('request_kwargs')
    if isinstance(overrides, Mapping):
        options.update(overrides)

    # ``timeout`` and ``session`` go to ``resolve_request``, not the call.
    requested_timeout = options.pop('timeout', 10.0)
    http_session = options.pop('session', None)
    send, effective_timeout, http_method = resolve_request(
        verb,
        session=http_session,
        timeout=requested_timeout,
    )
    response = send(
        cast(str, url),
        json=data,
        timeout=effective_timeout,
        **options,
    )
    response.raise_for_status()

    # Prefer a decoded JSON body; fall back to the raw text otherwise.
    body: Any
    try:
        body = response.json()
    except ValueError:
        body = response.text

    return {
        'status': 'success',
        'status_code': response.status_code,
        'message': f'Data loaded to {url}',
        'response': body,
        'records': count_records(data),
        'method': http_method.value.upper(),
    }
|
|
144
|
+
|
|
145
|
+
|
|
42
146
|
def _parse_json_string(
|
|
43
147
|
raw: str,
|
|
44
148
|
) -> JSONData:
|
|
@@ -113,18 +217,7 @@ def load_data(
|
|
|
113
217
|
return File(source, FileFormat.JSON).read()
|
|
114
218
|
|
|
115
219
|
if isinstance(source, str):
|
|
116
|
-
|
|
117
|
-
if source == '-':
|
|
118
|
-
raw = sys.stdin.read()
|
|
119
|
-
return _parse_json_string(raw)
|
|
120
|
-
candidate = Path(source)
|
|
121
|
-
if candidate.exists():
|
|
122
|
-
try:
|
|
123
|
-
return File(candidate, FileFormat.JSON).read()
|
|
124
|
-
except (OSError, json.JSONDecodeError, ValueError):
|
|
125
|
-
# Fall back to treating the string as raw JSON content.
|
|
126
|
-
pass
|
|
127
|
-
return _parse_json_string(source)
|
|
220
|
+
return _load_data_from_str(source)
|
|
128
221
|
|
|
129
222
|
raise TypeError(
|
|
130
223
|
'source must be a mapping, sequence of mappings, path, or JSON string',
|
|
@@ -158,30 +251,43 @@ def load_to_api(
|
|
|
158
251
|
Result dictionary including response payload or text.
|
|
159
252
|
"""
|
|
160
253
|
# Apply a conservative timeout to guard against hanging requests.
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
session
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
response.raise_for_status()
|
|
254
|
+
env = {
|
|
255
|
+
'url': url,
|
|
256
|
+
'method': method,
|
|
257
|
+
'timeout': kwargs.pop('timeout', 10.0),
|
|
258
|
+
'session': kwargs.pop('session', None),
|
|
259
|
+
'request_kwargs': kwargs,
|
|
260
|
+
}
|
|
261
|
+
return _load_to_api_env(data, env)
|
|
170
262
|
|
|
171
|
-
# Try JSON first, fall back to text.
|
|
172
|
-
try:
|
|
173
|
-
payload: Any = response.json()
|
|
174
|
-
except ValueError:
|
|
175
|
-
payload = response.text
|
|
176
263
|
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
264
|
+
def load_to_api_target(
    cfg: Any,
    target_obj: Any,
    overrides: dict[str, Any],
    data: JSONData,
) -> JSONDict:
    """
    Send ``data`` to an API target connector.

    The request environment is composed by ``compose_api_target_env`` and
    the payload is then delivered through ``_load_to_api_env``.

    Parameters
    ----------
    cfg : Any
        Pipeline configuration.
    target_obj : Any
        Connector configuration.
    overrides : dict[str, Any]
        Load-time overrides.
    data : JSONData
        Payload to load.

    Returns
    -------
    JSONDict
        Load result.
    """
    return _load_to_api_env(
        data,
        compose_api_target_env(cfg, target_obj, overrides),
    )
|
|
185
291
|
|
|
186
292
|
|
|
187
293
|
def load_to_database(
|