etlplus 0.9.2__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/__init__.py +26 -1
- etlplus/api/README.md +3 -51
- etlplus/api/__init__.py +0 -10
- etlplus/api/config.py +28 -39
- etlplus/api/endpoint_client.py +3 -3
- etlplus/api/pagination/client.py +1 -1
- etlplus/api/rate_limiting/config.py +1 -13
- etlplus/api/rate_limiting/rate_limiter.py +11 -8
- etlplus/api/request_manager.py +6 -11
- etlplus/api/transport.py +2 -14
- etlplus/api/types.py +6 -96
- etlplus/cli/commands.py +43 -76
- etlplus/cli/constants.py +1 -1
- etlplus/cli/handlers.py +12 -40
- etlplus/cli/io.py +2 -2
- etlplus/cli/main.py +1 -1
- etlplus/cli/state.py +7 -4
- etlplus/{workflow → config}/__init__.py +23 -10
- etlplus/{workflow → config}/connector.py +44 -58
- etlplus/{workflow → config}/jobs.py +32 -105
- etlplus/{workflow → config}/pipeline.py +51 -59
- etlplus/{workflow → config}/profile.py +5 -8
- etlplus/config/types.py +204 -0
- etlplus/config/utils.py +120 -0
- etlplus/database/ddl.py +1 -1
- etlplus/database/engine.py +3 -19
- etlplus/database/orm.py +0 -2
- etlplus/database/schema.py +1 -1
- etlplus/enums.py +288 -0
- etlplus/{ops/extract.py → extract.py} +99 -81
- etlplus/file.py +652 -0
- etlplus/{ops/load.py → load.py} +101 -78
- etlplus/{ops/run.py → run.py} +127 -159
- etlplus/{api/utils.py → run_helpers.py} +153 -209
- etlplus/{ops/transform.py → transform.py} +68 -75
- etlplus/types.py +4 -5
- etlplus/utils.py +2 -136
- etlplus/{ops/validate.py → validate.py} +12 -22
- etlplus/validation/__init__.py +44 -0
- etlplus/{ops → validation}/utils.py +17 -53
- {etlplus-0.9.2.dist-info → etlplus-0.10.2.dist-info}/METADATA +17 -210
- etlplus-0.10.2.dist-info/RECORD +65 -0
- {etlplus-0.9.2.dist-info → etlplus-0.10.2.dist-info}/WHEEL +1 -1
- etlplus/README.md +0 -37
- etlplus/api/enums.py +0 -51
- etlplus/cli/README.md +0 -40
- etlplus/database/README.md +0 -48
- etlplus/file/README.md +0 -105
- etlplus/file/__init__.py +0 -25
- etlplus/file/_imports.py +0 -141
- etlplus/file/_io.py +0 -160
- etlplus/file/accdb.py +0 -78
- etlplus/file/arrow.py +0 -78
- etlplus/file/avro.py +0 -176
- etlplus/file/bson.py +0 -77
- etlplus/file/cbor.py +0 -78
- etlplus/file/cfg.py +0 -79
- etlplus/file/conf.py +0 -80
- etlplus/file/core.py +0 -322
- etlplus/file/csv.py +0 -79
- etlplus/file/dat.py +0 -78
- etlplus/file/dta.py +0 -77
- etlplus/file/duckdb.py +0 -78
- etlplus/file/enums.py +0 -343
- etlplus/file/feather.py +0 -111
- etlplus/file/fwf.py +0 -77
- etlplus/file/gz.py +0 -123
- etlplus/file/hbs.py +0 -78
- etlplus/file/hdf5.py +0 -78
- etlplus/file/ini.py +0 -79
- etlplus/file/ion.py +0 -78
- etlplus/file/jinja2.py +0 -78
- etlplus/file/json.py +0 -98
- etlplus/file/log.py +0 -78
- etlplus/file/mat.py +0 -78
- etlplus/file/mdb.py +0 -78
- etlplus/file/msgpack.py +0 -78
- etlplus/file/mustache.py +0 -78
- etlplus/file/nc.py +0 -78
- etlplus/file/ndjson.py +0 -108
- etlplus/file/numbers.py +0 -75
- etlplus/file/ods.py +0 -79
- etlplus/file/orc.py +0 -111
- etlplus/file/parquet.py +0 -113
- etlplus/file/pb.py +0 -78
- etlplus/file/pbf.py +0 -77
- etlplus/file/properties.py +0 -78
- etlplus/file/proto.py +0 -77
- etlplus/file/psv.py +0 -79
- etlplus/file/rda.py +0 -78
- etlplus/file/rds.py +0 -78
- etlplus/file/sas7bdat.py +0 -78
- etlplus/file/sav.py +0 -77
- etlplus/file/sqlite.py +0 -78
- etlplus/file/stub.py +0 -84
- etlplus/file/sylk.py +0 -77
- etlplus/file/tab.py +0 -81
- etlplus/file/toml.py +0 -78
- etlplus/file/tsv.py +0 -80
- etlplus/file/txt.py +0 -102
- etlplus/file/vm.py +0 -78
- etlplus/file/wks.py +0 -77
- etlplus/file/xls.py +0 -88
- etlplus/file/xlsm.py +0 -79
- etlplus/file/xlsx.py +0 -99
- etlplus/file/xml.py +0 -185
- etlplus/file/xpt.py +0 -78
- etlplus/file/yaml.py +0 -95
- etlplus/file/zip.py +0 -175
- etlplus/file/zsav.py +0 -77
- etlplus/ops/README.md +0 -50
- etlplus/ops/__init__.py +0 -61
- etlplus/templates/README.md +0 -46
- etlplus/workflow/README.md +0 -52
- etlplus/workflow/dag.py +0 -105
- etlplus/workflow/types.py +0 -115
- etlplus-0.9.2.dist-info/RECORD +0 -134
- {etlplus-0.9.2.dist-info → etlplus-0.10.2.dist-info}/entry_points.txt +0 -0
- {etlplus-0.9.2.dist-info → etlplus-0.10.2.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.9.2.dist-info → etlplus-0.10.2.dist-info}/top_level.txt +0 -0
etlplus/{ops/load.py → load.py}
RENAMED

(Removed-line text that the source renderer truncated mid-line is reproduced below as captured.)

```diff
@@ -1,5 +1,5 @@
 """
-:mod:`etlplus.
+:mod:`etlplus.load` module.
 
 Helpers to load data into files, databases, and REST APIs.
 """
@@ -12,16 +12,20 @@ from pathlib import Path
 from typing import Any
 from typing import cast
 
-
-
-from
-from
-from
-from
-from
-from
-from
-from
+import requests  # type: ignore[import]
+
+from .enums import DataConnectorType
+from .enums import FileFormat
+from .enums import HttpMethod
+from .enums import coerce_data_connector_type
+from .enums import coerce_file_format
+from .enums import coerce_http_method
+from .file import File
+from .types import JSONData
+from .types import JSONDict
+from .types import JSONList
+from .types import StrPath
+from .utils import count_records
 
 # SECTION: INTERNAL FUNCTIONS ============================================== #
 
@@ -68,7 +72,7 @@ def _parse_json_string(
 # SECTION: FUNCTIONS ======================================================== #
 
 
-# --
+# -- Data Loading -- #
 
 
 def load_data(
@@ -97,7 +101,7 @@ def load_data(
         return cast(JSONData, source)
 
     if isinstance(source, Path):
-        return File(source, FileFormat.JSON).
+        return File(source, FileFormat.JSON).read_json()
 
     if isinstance(source, str):
         # Special case: '-' means read JSON from STDIN (Unix convention).
@@ -107,7 +111,7 @@ def load_data(
         candidate = Path(source)
         if candidate.exists():
             try:
-                return File(candidate, FileFormat.JSON).
+                return File(candidate, FileFormat.JSON).read_json()
             except (OSError, json.JSONDecodeError, ValueError):
                 # Fall back to treating the string as raw JSON content.
                 pass
```
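The `load_data` hunks above show the three input paths the function supports: a `Path` is read and parsed as JSON, the literal string `'-'` reads from STDIN, and any other string is tried as a file path before falling back to raw JSON parsing. A minimal usage sketch against the 0.10.2 module layout (file names and payloads are illustrative):

```python
from pathlib import Path

from etlplus.load import load_data

# An existing file path is opened and parsed as JSON.
users = load_data(Path('in/users.json'))

# A string that is not an existing path falls back to raw JSON parsing.
inline = load_data('[{"id": 1}, {"id": 2}]')

# '-' follows the Unix convention and reads JSON from STDIN, e.g.:
#   echo '{"id": 1}' | python my_script.py
piped = load_data('-')
```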
```diff
@@ -118,59 +122,58 @@
     )
 
 
-
+# -- File Loading -- #
+
+
+def load_to_file(
     data: JSONData,
-
-
-    **kwargs: Any,
+    file_path: StrPath,
+    file_format: FileFormat | str | None = None,
 ) -> JSONDict:
     """
-
+    Persist data to a local file.
 
     Parameters
     ----------
     data : JSONData
-        Data to
-
-
-
-
-
-        Extra arguments forwarded to ``requests`` (e.g., ``timeout``).
-        When omitted, ``timeout`` defaults to 10 seconds.
+        Data to write.
+    file_path : StrPath
+        Target file path.
+    file_format : FileFormat | str | None, optional
+        Output format. If omitted (None), the format is inferred from the
+        filename extension.
 
     Returns
     -------
     JSONDict
-        Result dictionary
+        Result dictionary with status and record count.
     """
-
-
-    session = kwargs.pop('session', None)
-    request_callable, timeout, http_method = resolve_request(
-        method,
-        session=session,
-        timeout=timeout,
-    )
-    response = request_callable(url, json=data, timeout=timeout, **kwargs)
-    response.raise_for_status()
+    path = Path(file_path)
+    path.parent.mkdir(parents=True, exist_ok=True)
 
-    #
-
-
-
-
+    # If no explicit format is provided, let File infer from extension.
+    if file_format is None:
+        records = File(path).write(data)
+        ext = path.suffix.lstrip('.').lower()
+        fmt = coerce_file_format(ext) if ext else FileFormat.JSON
+    else:
+        fmt = coerce_file_format(file_format)
+        records = File(path, fmt).write(data)
+    if fmt is FileFormat.CSV and records == 0:
+        message = 'No data to write'
+    else:
+        message = f'Data loaded to {path}'
 
     return {
         'status': 'success',
-        '
-        '
-        'response': payload,
-        'records': count_records(data),
-        'method': http_method.value.upper(),
+        'message': message,
+        'records': records,
     }
 
 
+# -- Database Loading (Placeholder) -- #
+
+
 def load_to_database(
     data: JSONData,
     connection_string: str,
@@ -206,49 +209,69 @@ def load_to_database(
     }
 
 
-
+# -- REST API Loading -- #
+
+
+def load_to_api(
     data: JSONData,
-
-
+    url: str,
+    method: HttpMethod | str,
+    **kwargs: Any,
 ) -> JSONDict:
     """
-
+    Load data to a REST API.
 
     Parameters
     ----------
     data : JSONData
-        Data to
-
-
-
-
-
+        Data to send as JSON.
+    url : str
+        API endpoint URL.
+    method : HttpMethod | str
+        HTTP method to use.
+    **kwargs : Any
+        Extra arguments forwarded to ``requests`` (e.g., ``timeout``).
 
     Returns
     -------
     JSONDict
-        Result dictionary
+        Result dictionary including response payload or text.
+
+    Raises
+    ------
+    TypeError
+        If the session object is not valid.
     """
-
-    path.parent.mkdir(parents=True, exist_ok=True)
+    http_method = coerce_http_method(method)
 
-    #
-
-
-
-
-
-
-
-
-
-
-
+    # Apply a conservative timeout to guard against hanging requests.
+    timeout = kwargs.pop('timeout', 10.0)
+    session = kwargs.pop('session', None)
+    requester = session or requests
+
+    request_callable = getattr(requester, http_method.value, None)
+    if not callable(request_callable):
+        raise TypeError(
+            'Session object must supply a '
+            f'callable "{http_method.value}" method',
+        )
+
+    response = request_callable(url, json=data, timeout=timeout, **kwargs)
+    response.raise_for_status()
+
+    # Try JSON first, fall back to text.
+    try:
+        payload: Any = response.json()
+    except ValueError:
+        payload = response.text
 
     return {
         'status': 'success',
-        '
-        '
+        'status_code': response.status_code,
+        'message': f'Data loaded to {url}',
+        'response': payload,
+        'records': count_records(data),
+        'method': http_method.value.upper(),
     }
 
 
@@ -293,7 +316,7 @@ def load(
     """
     data = load_data(source)
 
-    match
+    match coerce_data_connector_type(target_type):
         case DataConnectorType.FILE:
             # Prefer explicit format if provided, else infer from filename.
            return load_to_file(data, target, file_format)
@@ -308,6 +331,6 @@ def load(
             **kwargs,
         )
         case _:
-            #
-            #
+            # `coerce_data_connector_type` covers invalid entries, but keep
+            # explicit guard.
            raise ValueError(f'Invalid target type: {target_type}')
```
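The rewrite splits file and API loading into single-purpose signatures: `load_to_file` infers the output format from the filename when `file_format` is omitted and creates missing parent directories, while `load_to_api` applies a 10-second default timeout and accepts an optional `session`. A sketch of both calls as defined in the diff (records, paths, and the endpoint URL are illustrative):

```python
from etlplus.load import load_to_api, load_to_file

records = [{'id': 1, 'name': 'a'}, {'id': 2, 'name': 'b'}]

# Format inferred from the '.json' suffix; 'out/' is created if missing.
file_result = load_to_file(records, 'out/records.json')
# Expected shape: {'status': 'success',
#                  'message': 'Data loaded to out/records.json',
#                  'records': <n>}

# Sends the records as the JSON body; 'timeout' defaults to 10.0 when omitted.
api_result = load_to_api(records, 'https://api.example.com/items', 'post')
print(api_result['status_code'], api_result['records'])
```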
etlplus/{ops/run.py → run.py}
RENAMED

```diff
@@ -1,5 +1,5 @@
 """
-:mod:`etlplus.
+:mod:`etlplus.run` module.
 
 A module for running ETL jobs defined in YAML configurations.
 """
@@ -9,78 +9,127 @@ from __future__ import annotations
 from collections.abc import Mapping
 from typing import Any
 from typing import Final
+from typing import TypedDict
 from typing import cast
 from urllib.parse import urlsplit
 from urllib.parse import urlunsplit
 
-
-
-from
-from
-from
-from
-from
-from
-from
-from
-from ..types import JSONDict
-from ..types import PipelineConfig
-from ..types import StrPath
-from ..types import Timeout
-from ..utils import print_json
-from ..workflow import load_pipeline_config
+import requests  # type: ignore[import]
+
+from .api import EndpointClient  # noqa: F401 (re-exported for tests)
+from .api import PaginationConfigMap
+from .api import RequestOptions
+from .api import RetryPolicy
+from .api import Url
+from .config import load_pipeline_config
+from .enums import DataConnectorType
+from .enums import coerce_data_connector_type
 from .extract import extract
 from .load import load
+from .run_helpers import compose_api_request_env
+from .run_helpers import compose_api_target_env
+from .run_helpers import paginate_with_client
 from .transform import transform
-from .
+from .types import JSONDict
+from .types import Timeout
+from .utils import print_json
 from .validate import validate
+from .validation.utils import maybe_validate
 
 # SECTION: EXPORTS ========================================================== #
 
 
-__all__ = [
-    # Functions
-    'run',
-    'run_pipeline',
-]
+__all__ = ['run']
 
 
-# SECTION:
+# SECTION: TYPED DICTS ====================================================== #
 
 
-
+class BaseApiHttpEnv(TypedDict, total=False):
+    """
+    Common HTTP request environment for API interactions.
 
+    Fields shared by both source-side and target-side API operations.
+    """
+
+    # Request details
+    url: Url | None
+    headers: dict[str, str]
+    timeout: Timeout
 
-    #
+    # Session
+    session: requests.Session | None
 
 
-
-    job_obj: Any,
-    cfg: Any,
-) -> tuple[bool, dict[str, Any], str, str]:
+class ApiRequestEnv(BaseApiHttpEnv, total=False):
     """
-
+    Composed request environment for API sources.
 
-
-
-
-
-    cfg : Any
-        Pipeline configuration object with validations.
+    Returned by ``compose_api_request_env`` (run_helpers) and consumed by the
+    API extract branch. Values are fully merged with endpoint/API defaults and
+    job-level overrides, preserving the original precedence and behavior.
+    """
 
-
-
-
-
+    # Client
+    use_endpoints: bool
+    base_url: str | None
+    base_path: str | None
+    endpoints_map: dict[str, str] | None
+    endpoint_key: str | None
+
+    # Request
+    params: dict[str, Any]
+    pagination: PaginationConfigMap | None
+    sleep_seconds: float
+
+    # Reliability
+    retry: RetryPolicy | None
+    retry_network_errors: bool
+
+
+class ApiTargetEnv(BaseApiHttpEnv, total=False):
     """
-
-
-
+    Composed request environment for API targets.
+
+    Returned by ``compose_api_target_env`` (run_helpers) and consumed by the
+    API load branch. Values are merged from the target object, optional
+    API/endpoint reference, and job-level overrides, preserving original
+    precedence and behavior.
+
+    Notes
+    -----
+    - Precedence for inherited values matches original logic:
+      overrides -> target -> API profile defaults.
+    - Target composition does not include pagination/rate-limit/retry since
+      loads are single-request operations; only headers/timeout/session
+      apply.
+    """
+
+    # Request
+    method: str | None
+
+
+class SessionConfig(TypedDict, total=False):
+    """
+    Minimal session configuration schema accepted by this runner.
+
+    Keys mirror common requests.Session options; all are optional.
+    """
+
+    headers: Mapping[str, Any]
+    params: Mapping[str, Any]
+    auth: Any  # (user, pass) tuple or requests-compatible auth object
+    verify: bool | str
+    cert: Any  # str or (cert, key)
+    proxies: Mapping[str, Any]
+    cookies: Mapping[str, Any]
+    trust_env: bool
+
+
+# SECTION: CONSTANTS ======================================================== #
+
 
-
-    severity = (val_ref.severity or 'error').lower()
-    phase = (val_ref.phase or 'before_transform').lower()
-    return True, rules, severity, phase
+DEFAULT_CONFIG_PATH: Final[str] = 'in/pipeline.yml'
 
 
 # SECTION: FUNCTIONS ======================================================== #
```
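`SessionConfig` documents which `requests.Session` options the runner accepts, but the diff does not show how a session is materialized from it. A hypothetical helper sketching how those keys map onto real `requests.Session` attributes (`build_session` is not part of etlplus):

```python
import requests


def build_session(cfg: dict) -> requests.Session:
    """Hypothetical: apply SessionConfig-style keys to a requests.Session."""
    session = requests.Session()
    session.headers.update(cfg.get('headers', {}))
    session.params.update(cfg.get('params', {}))  # default query params
    if 'auth' in cfg:
        session.auth = cfg['auth']      # (user, pass) tuple or auth object
    if 'verify' in cfg:
        session.verify = cfg['verify']  # bool or path to a CA bundle
    if 'cert' in cfg:
        session.cert = cfg['cert']      # client cert: str or (cert, key)
    session.proxies.update(cfg.get('proxies', {}))
    session.cookies.update(cfg.get('cookies', {}))
    session.trust_env = cfg.get('trust_env', True)
    return session


session = build_session({'headers': {'Authorization': 'Bearer <token>'}})
```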
```diff
@@ -136,7 +185,8 @@ def run(
 
     data: Any
     stype_raw = getattr(source_obj, 'type', None)
-
+    stype = coerce_data_connector_type(stype_raw or '')
+    match stype:
         case DataConnectorType.FILE:
             path = getattr(source_obj, 'path', None)
             fmt = ex_opts.get('format') or getattr(
@@ -159,15 +209,12 @@
                 and env.get('endpoint_key')
             ):
                 # Construct client using module-level EndpointClient so tests
-                # can monkeypatch this class on etlplus.
+                # can monkeypatch this class on etlplus.run.
                 ClientClass = EndpointClient  # noqa: N806
                 client = ClientClass(
-                    base_url=cast(str, env
+                    base_url=cast(str, env['base_url']),
                     base_path=cast(str | None, env.get('base_path')),
-                    endpoints=cast(
-                        dict[str, str],
-                        env.get('endpoints_map', {}),
-                    ),
+                    endpoints=cast(dict[str, str], env['endpoints_map']),
                     retry=env.get('retry'),
                     retry_network_errors=bool(
                         env.get('retry_network_errors', False),
@@ -176,7 +223,7 @@
                 )
                 data = paginate_with_client(
                     client,
-                    cast(str, env
+                    cast(str, env['endpoint_key']),
                     env.get('params'),
                     env.get('headers'),
                     env.get('timeout'),
@@ -214,14 +261,23 @@
                     sleep_seconds=cast(float, env.get('sleep_seconds', 0.0)),
                 )
         case _:
-            #
-            #
+            # ``coerce_data_connector_type`` already raises for invalid
+            # connector types; this branch is defensive only.
             raise ValueError(f'Unsupported source type: {stype_raw}')
 
-
-
-
-
+    # DRY: unified validation helper (pre/post transform)
+    val_ref = job_obj.validate
+    enabled_validation = val_ref is not None
+    if enabled_validation:
+        # Type narrowing for static checkers
+        assert val_ref is not None
+        rules = cfg.validations.get(val_ref.ruleset, {})
+        severity = (val_ref.severity or 'error').lower()
+        phase = (val_ref.phase or 'before_transform').lower()
+    else:
+        rules = {}
+        severity = 'error'
+        phase = 'before_transform'
 
     # Pre-transform validation (if configured).
     data = maybe_validate(
@@ -262,7 +318,8 @@
     overrides = job_obj.load.overrides or {}
 
     ttype_raw = getattr(target_obj, 'type', None)
-
+    ttype = coerce_data_connector_type(ttype_raw or '')
+    match ttype:
         case DataConnectorType.FILE:
             path = overrides.get('path') or getattr(target_obj, 'path', None)
             fmt = overrides.get('format') or getattr(
@@ -279,14 +336,12 @@
             if not url_t:
                 raise ValueError('API target missing "url"')
             kwargs_t: dict[str, Any] = {}
-
-
-            kwargs_t['headers'] = cast(dict[str, str], headers)
+            if env_t.get('headers'):
+                kwargs_t['headers'] = cast(dict[str, str], env_t['headers'])
             if env_t.get('timeout') is not None:
-                kwargs_t['timeout'] = env_t
-
-
-            kwargs_t['session'] = session
+                kwargs_t['timeout'] = env_t['timeout']
+            if env_t.get('session') is not None:
+                kwargs_t['session'] = env_t['session']
             result = load(
                 data,
                 'api',
@@ -302,97 +357,10 @@
             )
             result = load(data, 'database', str(conn))
         case _:
-            #
-            #
+            # ``coerce_data_connector_type`` already raises for invalid
+            # connector types; this branch is defensive only.
             raise ValueError(f'Unsupported target type: {ttype_raw}')
 
     # Return the terminal load result directly; callers (e.g., CLI) can wrap
     # it in their own envelope when needed.
     return cast(JSONDict, result)
-
-
-def run_pipeline(
-    *,
-    source_type: DataConnectorType | str | None = None,
-    source: StrPath | JSONData | None = None,
-    operations: PipelineConfig | None = None,
-    target_type: DataConnectorType | str | None = None,
-    target: StrPath | None = None,
-    file_format: FileFormat | str | None = None,
-    method: HttpMethod | str | None = None,
-    **kwargs: Any,
-) -> JSONData:
-    """
-    Run a single extract-transform-load flow without a YAML config.
-
-    Parameters
-    ----------
-    source_type : DataConnectorType | str | None, optional
-        Connector type for extraction. When ``None``, ``source`` is assumed
-        to be pre-loaded data and extraction is skipped.
-    source : StrPath | JSONData | None, optional
-        Data source for extraction or the pre-loaded payload when
-        ``source_type`` is ``None``.
-    operations : PipelineConfig | None, optional
-        Transform configuration passed to :func:`etlplus.ops.transform`.
-    target_type : DataConnectorType | str | None, optional
-        Connector type for loading. When ``None``, load is skipped and the
-        transformed data is returned.
-    target : StrPath | None, optional
-        Target for loading (file path, connection string, or API URL).
-    file_format : FileFormat | str | None, optional
-        File format for file sources/targets (forwarded to extract/load).
-    method : HttpMethod | str | None, optional
-        HTTP method for API loads (forwarded to :func:`etlplus.ops.load`).
-    **kwargs : Any
-        Extra keyword arguments forwarded to extract/load for API options
-        (headers, timeout, session, etc.).
-
-    Returns
-    -------
-    JSONData
-        Transformed data or the load result payload.
-
-    Raises
-    ------
-    TypeError
-        Raised when extracted data is not a dict or list of dicts and no
-        target is specified.
-    ValueError
-        Raised when required source/target inputs are missing.
-    """
-    if source_type is None:
-        if source is None:
-            raise ValueError('source or source_type is required')
-        data = source
-    else:
-        if source is None:
-            raise ValueError('source is required when source_type is set')
-        data = extract(
-            source_type,
-            cast(StrPath, source),
-            file_format=file_format,
-            **kwargs,
-        )
-
-    if operations:
-        data = transform(data, operations)
-
-    if target_type is None:
-        if not isinstance(data, (dict, list)):
-            raise TypeError(
-                f'Expected data to be dict or list of dicts, '
-                f'got {type(data).__name__}',
-            )
-        return data
-    if target is None:
-        raise ValueError('target is required when target_type is set')
-
-    return load(
-        data,
-        target_type,
-        target,
-        file_format=file_format,
-        method=method,
-        **kwargs,
-    )
```
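Because `run_pipeline` is deleted in 0.10.2 and `__all__` narrows to `['run']`, ad-hoc flows that used the wrapper can compose the surviving top-level `extract`, `transform`, and `load` functions directly, mirroring the removed function's own body. A hedged migration sketch (file names are illustrative, and the shape of the operations mapping depends on `etlplus.transform`):

```python
from etlplus.extract import extract
from etlplus.load import load
from etlplus.transform import transform

# Equivalent of the removed:
#   run_pipeline(source_type='file', source='in/raw.json',
#                operations=operations, target_type='file',
#                target='out/clean.json')
data = extract('file', 'in/raw.json')  # read the source payload

operations = {}  # transform config; populate per etlplus.transform
if operations:
    data = transform(data, operations)  # the wrapper also skipped empty configs

result = load(data, 'file', 'out/clean.json')  # format inferred from '.json'
print(result['records'])
```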