etlplus 0.9.1__py3-none-any.whl → 0.9.2__py3-none-any.whl
- etlplus/README.md +37 -0
- etlplus/__init__.py +1 -26
- etlplus/api/README.md +51 -3
- etlplus/api/__init__.py +10 -0
- etlplus/api/config.py +39 -28
- etlplus/api/endpoint_client.py +3 -3
- etlplus/api/enums.py +51 -0
- etlplus/api/pagination/client.py +1 -1
- etlplus/api/rate_limiting/config.py +13 -1
- etlplus/api/rate_limiting/rate_limiter.py +8 -11
- etlplus/api/request_manager.py +11 -6
- etlplus/api/transport.py +14 -2
- etlplus/api/types.py +96 -6
- etlplus/{run_helpers.py → api/utils.py} +209 -153
- etlplus/cli/README.md +40 -0
- etlplus/cli/commands.py +76 -43
- etlplus/cli/constants.py +1 -1
- etlplus/cli/handlers.py +40 -12
- etlplus/cli/io.py +2 -2
- etlplus/cli/main.py +1 -1
- etlplus/cli/state.py +4 -7
- etlplus/database/README.md +48 -0
- etlplus/database/ddl.py +1 -1
- etlplus/database/engine.py +19 -3
- etlplus/database/orm.py +2 -0
- etlplus/database/schema.py +1 -1
- etlplus/enums.py +1 -157
- etlplus/file/README.md +105 -0
- etlplus/file/__init__.py +25 -0
- etlplus/file/_imports.py +141 -0
- etlplus/file/_io.py +160 -0
- etlplus/file/accdb.py +78 -0
- etlplus/file/arrow.py +78 -0
- etlplus/file/avro.py +176 -0
- etlplus/file/bson.py +77 -0
- etlplus/file/cbor.py +78 -0
- etlplus/file/cfg.py +79 -0
- etlplus/file/conf.py +80 -0
- etlplus/file/core.py +322 -0
- etlplus/file/csv.py +79 -0
- etlplus/file/dat.py +78 -0
- etlplus/file/dta.py +77 -0
- etlplus/file/duckdb.py +78 -0
- etlplus/file/enums.py +343 -0
- etlplus/file/feather.py +111 -0
- etlplus/file/fwf.py +77 -0
- etlplus/file/gz.py +123 -0
- etlplus/file/hbs.py +78 -0
- etlplus/file/hdf5.py +78 -0
- etlplus/file/ini.py +79 -0
- etlplus/file/ion.py +78 -0
- etlplus/file/jinja2.py +78 -0
- etlplus/file/json.py +98 -0
- etlplus/file/log.py +78 -0
- etlplus/file/mat.py +78 -0
- etlplus/file/mdb.py +78 -0
- etlplus/file/msgpack.py +78 -0
- etlplus/file/mustache.py +78 -0
- etlplus/file/nc.py +78 -0
- etlplus/file/ndjson.py +108 -0
- etlplus/file/numbers.py +75 -0
- etlplus/file/ods.py +79 -0
- etlplus/file/orc.py +111 -0
- etlplus/file/parquet.py +113 -0
- etlplus/file/pb.py +78 -0
- etlplus/file/pbf.py +77 -0
- etlplus/file/properties.py +78 -0
- etlplus/file/proto.py +77 -0
- etlplus/file/psv.py +79 -0
- etlplus/file/rda.py +78 -0
- etlplus/file/rds.py +78 -0
- etlplus/file/sas7bdat.py +78 -0
- etlplus/file/sav.py +77 -0
- etlplus/file/sqlite.py +78 -0
- etlplus/file/stub.py +84 -0
- etlplus/file/sylk.py +77 -0
- etlplus/file/tab.py +81 -0
- etlplus/file/toml.py +78 -0
- etlplus/file/tsv.py +80 -0
- etlplus/file/txt.py +102 -0
- etlplus/file/vm.py +78 -0
- etlplus/file/wks.py +77 -0
- etlplus/file/xls.py +88 -0
- etlplus/file/xlsm.py +79 -0
- etlplus/file/xlsx.py +99 -0
- etlplus/file/xml.py +185 -0
- etlplus/file/xpt.py +78 -0
- etlplus/file/yaml.py +95 -0
- etlplus/file/zip.py +175 -0
- etlplus/file/zsav.py +77 -0
- etlplus/ops/README.md +50 -0
- etlplus/ops/__init__.py +61 -0
- etlplus/{extract.py → ops/extract.py} +81 -99
- etlplus/{load.py → ops/load.py} +78 -101
- etlplus/{run.py → ops/run.py} +159 -127
- etlplus/{transform.py → ops/transform.py} +75 -68
- etlplus/{validation → ops}/utils.py +53 -17
- etlplus/{validate.py → ops/validate.py} +22 -12
- etlplus/templates/README.md +46 -0
- etlplus/types.py +5 -4
- etlplus/utils.py +136 -2
- etlplus/workflow/README.md +52 -0
- etlplus/{config → workflow}/__init__.py +10 -23
- etlplus/{config → workflow}/connector.py +58 -44
- etlplus/workflow/dag.py +105 -0
- etlplus/{config → workflow}/jobs.py +105 -32
- etlplus/{config → workflow}/pipeline.py +59 -51
- etlplus/{config → workflow}/profile.py +8 -5
- etlplus/workflow/types.py +115 -0
- {etlplus-0.9.1.dist-info → etlplus-0.9.2.dist-info}/METADATA +210 -17
- etlplus-0.9.2.dist-info/RECORD +134 -0
- {etlplus-0.9.1.dist-info → etlplus-0.9.2.dist-info}/WHEEL +1 -1
- etlplus/config/types.py +0 -204
- etlplus/config/utils.py +0 -120
- etlplus/file.py +0 -657
- etlplus/validation/__init__.py +0 -44
- etlplus-0.9.1.dist-info/RECORD +0 -65
- {etlplus-0.9.1.dist-info → etlplus-0.9.2.dist-info}/entry_points.txt +0 -0
- {etlplus-0.9.1.dist-info → etlplus-0.9.2.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.9.1.dist-info → etlplus-0.9.2.dist-info}/top_level.txt +0 -0
etlplus/{run.py → ops/run.py} RENAMED

@@ -1,5 +1,5 @@
 """
-:mod:`etlplus.run` module.
+:mod:`etlplus.ops.run` module.

 A module for running ETL jobs defined in YAML configurations.
 """
@@ -9,127 +9,78 @@ from __future__ import annotations
 from collections.abc import Mapping
 from typing import Any
 from typing import Final
-from typing import TypedDict
 from typing import cast
 from urllib.parse import urlsplit
 from urllib.parse import urlunsplit

-import
-
-from
-from
-from
-from
-from
-from
-from
-from
+from ..api import EndpointClient  # noqa: F401 (re-exported for tests)
+from ..api import HttpMethod
+from ..api import PaginationConfigMap
+from ..api import RequestOptions
+from ..api import compose_api_request_env
+from ..api import compose_api_target_env
+from ..api import paginate_with_client
+from ..enums import DataConnectorType
+from ..file import FileFormat
+from ..types import JSONData
+from ..types import JSONDict
+from ..types import PipelineConfig
+from ..types import StrPath
+from ..types import Timeout
+from ..utils import print_json
+from ..workflow import load_pipeline_config
 from .extract import extract
 from .load import load
-from .run_helpers import compose_api_request_env
-from .run_helpers import compose_api_target_env
-from .run_helpers import paginate_with_client
 from .transform import transform
-from .
-from .types import Timeout
-from .utils import print_json
+from .utils import maybe_validate
 from .validate import validate
-from .validation.utils import maybe_validate

 # SECTION: EXPORTS ========================================================== #


-__all__ = [
-
-
-
-
+__all__ = [
+    # Functions
+    'run',
+    'run_pipeline',
+]

-class BaseApiHttpEnv(TypedDict, total=False):
-    """
-    Common HTTP request environment for API interactions.
-
-    Fields shared by both source-side and target-side API operations.
-    """

-
-    url: Url | None
-    headers: dict[str, str]
-    timeout: Timeout
-
-    # Session
-    session: requests.Session | None
-
-
-class ApiRequestEnv(BaseApiHttpEnv, total=False):
-    """
-    Composed request environment for API sources.
+# SECTION: CONSTANTS ======================================================== #

-    Returned by ``compose_api_request_env`` (run_helpers) and consumed by the
-    API extract branch. Values are fully merged with endpoint/API defaults and
-    job-level overrides, preserving the original precedence and behavior.
-    """

-
-    use_endpoints: bool
-    base_url: str | None
-    base_path: str | None
-    endpoints_map: dict[str, str] | None
-    endpoint_key: str | None
+DEFAULT_CONFIG_PATH: Final[str] = 'in/pipeline.yml'

-    # Request
-    params: dict[str, Any]
-    pagination: PaginationConfigMap | None
-    sleep_seconds: float

-
-    retry: RetryPolicy | None
-    retry_network_errors: bool
+# SECTION: INTERNAL FUNCTIONS =============================================== #


-
-
-
-
-    Returned by ``compose_api_target_env`` (run_helpers) and consumed by the
-    API load branch. Values are merged from the target object, optional
-    API/endpoint reference, and job-level overrides, preserving original
-    precedence and behavior.
-
-    Notes
-    -----
-    - Precedence for inherited values matches original logic:
-      overrides -> target -> API profile defaults.
-    - Target composition does not include pagination/rate-limit/retry since
-      loads are single-request operations; only headers/timeout/session
-      apply.
+def _resolve_validation_config(
+    job_obj: Any,
+    cfg: Any,
+) -> tuple[bool, dict[str, Any], str, str]:
     """
+    Resolve validation settings for a job with safe defaults.

-
-
-
-
-
-
-    Minimal session configuration schema accepted by this runner.
+    Parameters
+    ----------
+    job_obj : Any
+        Job configuration object.
+    cfg : Any
+        Pipeline configuration object with validations.

-
+    Returns
+    -------
+    tuple[bool, dict[str, Any], str, str]
+        Tuple of (enabled, rules, severity, phase).
     """
+    val_ref = job_obj.validate
+    if val_ref is None:
+        return False, {}, 'error', 'before_transform'

-
-
-
-
-    cert: Any  # str or (cert, key)
-    proxies: Mapping[str, Any]
-    cookies: Mapping[str, Any]
-    trust_env: bool
-
-
-# SECTION: CONSTANTS ======================================================== #
-
-
-DEFAULT_CONFIG_PATH: Final[str] = 'in/pipeline.yml'
+    rules = cfg.validations.get(val_ref.ruleset, {})
+    severity = (val_ref.severity or 'error').lower()
+    phase = (val_ref.phase or 'before_transform').lower()
+    return True, rules, severity, phase


 # SECTION: FUNCTIONS ======================================================== #
@@ -185,8 +136,7 @@ def run(

     data: Any
     stype_raw = getattr(source_obj, 'type', None)
-
-    match stype:
+    match DataConnectorType.coerce(stype_raw or ''):
         case DataConnectorType.FILE:
             path = getattr(source_obj, 'path', None)
             fmt = ex_opts.get('format') or getattr(
@@ -209,12 +159,15 @@ def run(
                 and env.get('endpoint_key')
             ):
                 # Construct client using module-level EndpointClient so tests
-                # can monkeypatch this class on etlplus.run.
+                # can monkeypatch this class on etlplus.ops.run.
                 ClientClass = EndpointClient  # noqa: N806
                 client = ClientClass(
-                    base_url=cast(str, env
+                    base_url=cast(str, env.get('base_url')),
                     base_path=cast(str | None, env.get('base_path')),
-                    endpoints=cast(
+                    endpoints=cast(
+                        dict[str, str],
+                        env.get('endpoints_map', {}),
+                    ),
                     retry=env.get('retry'),
                     retry_network_errors=bool(
                         env.get('retry_network_errors', False),
@@ -223,7 +176,7 @@ def run(
                 )
                 data = paginate_with_client(
                     client,
-                    cast(str, env
+                    cast(str, env.get('endpoint_key')),
                     env.get('params'),
                     env.get('headers'),
                     env.get('timeout'),
@@ -261,23 +214,14 @@ def run(
                 sleep_seconds=cast(float, env.get('sleep_seconds', 0.0)),
             )
         case _:
-            #
-            #
+            # :meth:`coerce` already raises for invalid connector types, but
+            # keep explicit guard for defensive programming.
             raise ValueError(f'Unsupported source type: {stype_raw}')

-
-
-
-
-        # Type narrowing for static checkers
-        assert val_ref is not None
-        rules = cfg.validations.get(val_ref.ruleset, {})
-        severity = (val_ref.severity or 'error').lower()
-        phase = (val_ref.phase or 'before_transform').lower()
-    else:
-        rules = {}
-        severity = 'error'
-        phase = 'before_transform'
+    enabled_validation, rules, severity, phase = _resolve_validation_config(
+        job_obj,
+        cfg,
+    )

     # Pre-transform validation (if configured).
     data = maybe_validate(
@@ -318,8 +262,7 @@ def run(
     overrides = job_obj.load.overrides or {}

     ttype_raw = getattr(target_obj, 'type', None)
-
-    match ttype:
+    match DataConnectorType.coerce(ttype_raw or ''):
         case DataConnectorType.FILE:
             path = overrides.get('path') or getattr(target_obj, 'path', None)
             fmt = overrides.get('format') or getattr(
@@ -336,12 +279,14 @@ def run(
             if not url_t:
                 raise ValueError('API target missing "url"')
             kwargs_t: dict[str, Any] = {}
-
-
+            headers = env_t.get('headers')
+            if headers:
+                kwargs_t['headers'] = cast(dict[str, str], headers)
             if env_t.get('timeout') is not None:
-                kwargs_t['timeout'] = env_t
-
-
+                kwargs_t['timeout'] = env_t.get('timeout')
+            session = env_t.get('session')
+            if session is not None:
+                kwargs_t['session'] = session
             result = load(
                 data,
                 'api',
@@ -357,10 +302,97 @@ def run(
             )
             result = load(data, 'database', str(conn))
         case _:
-            #
-            #
+            # :meth:`coerce` already raises for invalid connector types, but
+            # keep explicit guard for defensive programming.
             raise ValueError(f'Unsupported target type: {ttype_raw}')

     # Return the terminal load result directly; callers (e.g., CLI) can wrap
     # it in their own envelope when needed.
     return cast(JSONDict, result)
+
+
+def run_pipeline(
+    *,
+    source_type: DataConnectorType | str | None = None,
+    source: StrPath | JSONData | None = None,
+    operations: PipelineConfig | None = None,
+    target_type: DataConnectorType | str | None = None,
+    target: StrPath | None = None,
+    file_format: FileFormat | str | None = None,
+    method: HttpMethod | str | None = None,
+    **kwargs: Any,
+) -> JSONData:
+    """
+    Run a single extract-transform-load flow without a YAML config.
+
+    Parameters
+    ----------
+    source_type : DataConnectorType | str | None, optional
+        Connector type for extraction. When ``None``, ``source`` is assumed
+        to be pre-loaded data and extraction is skipped.
+    source : StrPath | JSONData | None, optional
+        Data source for extraction or the pre-loaded payload when
+        ``source_type`` is ``None``.
+    operations : PipelineConfig | None, optional
+        Transform configuration passed to :func:`etlplus.ops.transform`.
+    target_type : DataConnectorType | str | None, optional
+        Connector type for loading. When ``None``, load is skipped and the
+        transformed data is returned.
+    target : StrPath | None, optional
+        Target for loading (file path, connection string, or API URL).
+    file_format : FileFormat | str | None, optional
+        File format for file sources/targets (forwarded to extract/load).
+    method : HttpMethod | str | None, optional
+        HTTP method for API loads (forwarded to :func:`etlplus.ops.load`).
+    **kwargs : Any
+        Extra keyword arguments forwarded to extract/load for API options
+        (headers, timeout, session, etc.).
+
+    Returns
+    -------
+    JSONData
+        Transformed data or the load result payload.
+
+    Raises
+    ------
+    TypeError
+        Raised when extracted data is not a dict or list of dicts and no
+        target is specified.
+    ValueError
+        Raised when required source/target inputs are missing.
+    """
+    if source_type is None:
+        if source is None:
+            raise ValueError('source or source_type is required')
+        data = source
+    else:
+        if source is None:
+            raise ValueError('source is required when source_type is set')
+        data = extract(
+            source_type,
+            cast(StrPath, source),
+            file_format=file_format,
+            **kwargs,
+        )
+
+    if operations:
+        data = transform(data, operations)
+
+    if target_type is None:
+        if not isinstance(data, (dict, list)):
+            raise TypeError(
+                f'Expected data to be dict or list of dicts, '
+                f'got {type(data).__name__}',
+            )
+        return data
+    if target is None:
+        raise ValueError('target is required when target_type is set')
+
+    return load(
+        data,
+        target_type,
+        target,
+        file_format=file_format,
+        method=method,
+        **kwargs,
+    )
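The most visible addition here is `run_pipeline`, a keyword-only entry point for one-off extract-transform-load flows that bypasses YAML configuration entirely. Below is a minimal usage sketch based on the signature above; the file paths and the string-keyed `operations` dict are illustrative assumptions, not examples taken from the package documentation:

```python
# Hypothetical one-off ETL flow via the new run_pipeline() API.
# 'file' is coerced through DataConnectorType.coerce; paths and the
# operations dict shape are assumptions for illustration.
from etlplus.ops.run import run_pipeline

result = run_pipeline(
    source_type='file',
    source='in/users.json',      # hypothetical input file
    operations={
        'filter': {'field': 'age', 'op': 'gte', 'value': 18},
    },
    target_type='file',
    target='out/adults.json',    # hypothetical output file
    file_format='json',
)
```

Omitting `target_type` returns the transformed records directly instead of loading them, per the docstring above.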
etlplus/{transform.py → ops/transform.py} RENAMED

@@ -1,5 +1,5 @@
 """
-:mod:`etlplus.transform` module.
+:mod:`etlplus.ops.transform` module.

 Helpers to filter, map/rename, select, sort, aggregate, and otherwise
 transform JSON-like records (dicts and lists of dicts).
@@ -24,7 +24,7 @@ Basic pipeline with strings::

 Using enums for keys and functions::

-    from .enums import PipelineStep, OperatorName, AggregateName
+    from etlplus.enums import PipelineStep, OperatorName, AggregateName
     ops = {
         PipelineStep.FILTER: {
             'field': 'age', 'op': OperatorName.GTE, 'value': 18
@@ -44,28 +44,28 @@ from collections.abc import Sequence
 from typing import Any
 from typing import cast

-from
-from
-from
+from ..enums import AggregateName
+from ..enums import OperatorName
+from ..enums import PipelineStep
+from ..types import AggregateFunc
+from ..types import AggregateSpec
+from ..types import FieldName
+from ..types import Fields
+from ..types import FilterSpec
+from ..types import JSONData
+from ..types import JSONDict
+from ..types import JSONList
+from ..types import MapSpec
+from ..types import OperatorFunc
+from ..types import PipelineConfig
+from ..types import PipelineStepName
+from ..types import SortKey
+from ..types import StepApplier
+from ..types import StepOrSteps
+from ..types import StepSpec
+from ..types import StrPath
+from ..utils import to_number
 from .load import load_data
-from .types import AggregateFunc
-from .types import AggregateSpec
-from .types import FieldName
-from .types import Fields
-from .types import FilterSpec
-from .types import JSONData
-from .types import JSONDict
-from .types import JSONList
-from .types import MapSpec
-from .types import OperatorFunc
-from .types import PipelineConfig
-from .types import PipelineStepName
-from .types import SortKey
-from .types import StepApplier
-from .types import StepOrSteps
-from .types import StepSpec
-from .types import StrPath
-from .utils import to_number

 # SECTION: EXPORTS ========================================================== #

@@ -730,15 +730,16 @@ def _is_plain_fields_list(obj: Any) -> bool:


 _PIPELINE_STEPS: tuple[PipelineStepName, ...] = (
+    'aggregate',
     'filter',
     'map',
     'select',
     'sort',
-    'aggregate',
 )


 _STEP_APPLIERS: dict[PipelineStepName, StepApplier] = {
+    'aggregate': _apply_aggregate_step,
     'filter': _apply_filter_step,
     'map': _apply_map_step,
     'select': _apply_select_step,
@@ -746,7 +747,54 @@ _STEP_APPLIERS: dict[PipelineStepName, StepApplier] = {
 }


-# SECTION:
+# SECTION: FUNCTIONS ======================================================== #
+
+
+# -- Helpers -- #
+
+
+def apply_aggregate(
+    records: JSONList,
+    operation: AggregateSpec,
+) -> JSONDict:
+    """
+    Aggregate a numeric field or count presence.
+
+    Parameters
+    ----------
+    records : JSONList
+        Records to aggregate.
+    operation : AggregateSpec
+        Dict with keys ``field`` and ``func``. ``func`` is one of
+        ``'sum'``, ``'avg'``, ``'min'``, ``'max'``, or ``'count'``.
+        A callable may also be supplied for ``func``. Optionally, set
+        ``alias`` to control the output key name.
+
+    Returns
+    -------
+    JSONDict
+        A single-row result like ``{"sum_age": 42}``.
+
+    Notes
+    -----
+    Numeric operations ignore non-numeric values but count their presence
+    for ``'count'``.
+    """
+    field = operation.get('field')
+    func = operation.get('func')
+    alias = operation.get('alias')
+
+    if not field or func is None:
+        return {'error': 'Invalid aggregation operation'}
+
+    try:
+        aggregator = _resolve_aggregator(func)
+    except TypeError:
+        return {'error': f'Unknown aggregation function: {func}'}
+
+    nums, present = _collect_numeric_and_presence(records, field)
+    key_name = _derive_agg_key(func, field, alias)
+    return {key_name: aggregator(nums, present)}


 def apply_filter(
@@ -894,48 +942,7 @@ def apply_sort(
     )


-def apply_aggregate(
-    records: JSONList,
-    operation: AggregateSpec,
-) -> JSONDict:
-    """
-    Aggregate a numeric field or count presence.
-
-    Parameters
-    ----------
-    records : JSONList
-        Records to aggregate.
-    operation : AggregateSpec
-        Dict with keys ``field`` and ``func``. ``func`` is one of
-        ``'sum'``, ``'avg'``, ``'min'``, ``'max'``, or ``'count'``.
-        A callable may also be supplied for ``func``. Optionally, set
-        ``alias`` to control the output key name.
-
-    Returns
-    -------
-    JSONDict
-        A single-row result like ``{"sum_age": 42}``.
-
-    Notes
-    -----
-    Numeric operations ignore non-numeric values but count their presence
-    for ``'count'``.
-    """
-    field = operation.get('field')
-    func = operation.get('func')
-    alias = operation.get('alias')
-
-    if not field or func is None:
-        return {'error': 'Invalid aggregation operation'}
-
-    try:
-        aggregator = _resolve_aggregator(func)
-    except TypeError:
-        return {'error': f'Unknown aggregation function: {func}'}
-
-    nums, present = _collect_numeric_and_presence(records, field)
-    key_name = _derive_agg_key(func, field, alias)
-    return {key_name: aggregator(nums, present)}
+# -- Orchestration -- #


 def transform(
@@ -982,7 +989,7 @@ def transform(

     Using enums for keys and functions::

-        from .enums import PipelineStep, OperatorName, AggregateName
+        from etlplus.enums import PipelineStep, OperatorName, AggregateName
         ops = {
             PipelineStep.FILTER: {
                 'field': 'age', 'op': OperatorName.GTE, 'value': 18
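`apply_aggregate` moves under the new `# -- Helpers -- #` banner with its body unchanged, so its full contract is visible in the hunks above. A short usage sketch with made-up records; the output keys follow the docstring's `{"sum_age": 42}` convention:

```python
# Usage sketch for apply_aggregate(); the records are illustrative.
from etlplus.ops.transform import apply_aggregate

records = [{'age': 18}, {'age': 24}, {'age': 'n/a'}]

# Numeric aggregations ignore non-numeric values.
print(apply_aggregate(records, {'field': 'age', 'func': 'sum'}))
# {'sum_age': 42}

# 'count' tallies presence (including non-numeric values), and
# 'alias' overrides the output key name.
print(apply_aggregate(records, {'field': 'age', 'func': 'count', 'alias': 'n'}))
# {'n': 3}
```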
etlplus/{validation → ops}/utils.py RENAMED

@@ -1,26 +1,27 @@
 """
-:mod:`etlplus.validation.utils` module.
+:mod:`etlplus.ops.utils` module.

-Utility helpers for conditional
+Utility helpers for conditional data ops orchestration.

 The helpers defined here embrace a "high cohesion, low coupling" design by
 isolating normalization, configuration, and logging responsibilities. The
 resulting surface keeps ``maybe_validate`` focused on orchestration while
 offloading ancillary concerns to composable helpers.
-
 """

 from __future__ import annotations

 from collections.abc import Callable
 from dataclasses import dataclass
+from types import MappingProxyType
 from typing import Any
 from typing import Literal
 from typing import Self
 from typing import TypedDict
+from typing import cast

 from ..types import StrAnyMap
-from ..utils import
+from ..utils import normalize_choice

 # SECTION: TYPED DICTIONARIES =============================================== #

@@ -47,6 +48,30 @@ type ValidateFn = Callable[[Any, Ruleset], ValidationResult]
 type PrintFn = Callable[[Any], None]


+# SECTION: INTERNAL CONSTANTS ============================================== #
+
+
+_PHASE_CHOICES = MappingProxyType(
+    {
+        'before_transform': 'before_transform',
+        'after_transform': 'after_transform',
+    },
+)
+_SEVERITY_CHOICES = MappingProxyType(
+    {
+        'warn': 'warn',
+        'error': 'error',
+    },
+)
+_WINDOW_CHOICES = MappingProxyType(
+    {
+        'before_transform': 'before_transform',
+        'after_transform': 'after_transform',
+        'both': 'both',
+    },
+)
+
+
 # SECTION: DATA CLASSES ===================================================== #


@@ -291,11 +316,14 @@ def _normalize_phase(
         Normalized validation phase. Defaults to ``"before_transform"`` when
         unspecified.
     """
-
-
-
-
-
+    return cast(
+        ValidationPhase,
+        normalize_choice(
+            value,
+            mapping=_PHASE_CHOICES,
+            default='before_transform',
+        ),
+    )


 def _normalize_severity(
@@ -314,7 +342,14 @@ def _normalize_severity(
     ValidationSeverity
         Normalized severity. Defaults to ``"error"`` when unspecified.
     """
-    return
+    return cast(
+        ValidationSeverity,
+        normalize_choice(
+            value,
+            mapping=_SEVERITY_CHOICES,
+            default='error',
+        ),
+    )


 def _normalize_window(
@@ -333,13 +368,14 @@ def _normalize_window(
     ValidationWindow
         Normalized validation window. Defaults to ``"both"`` when unspecified.
     """
-
-
-
-
-
-
-
+    return cast(
+        ValidationWindow,
+        normalize_choice(
+            value,
+            mapping=_WINDOW_CHOICES,
+            default='both',
+        ),
+    )


 def _rule_name(
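After this refactor, the three `_normalize_*` helpers share a single code path: a `cast` around `normalize_choice` with an immutable choice table and a default. The sketch below illustrates that pattern with a hypothetical stand-in for `etlplus.utils.normalize_choice`, whose real body is not shown in this diff; only the keyword signature used at the call sites above is assumed:

```python
from types import MappingProxyType

# Immutable choice table, mirroring _SEVERITY_CHOICES above.
_SEVERITY_CHOICES = MappingProxyType({'warn': 'warn', 'error': 'error'})


def normalize_choice(value, *, mapping, default):
    # Hypothetical stand-in mirroring the call sites above: return the
    # mapped choice, falling back to the default for None or unknown values.
    if value is None:
        return default
    return mapping.get(str(value).strip().lower(), default)


assert normalize_choice('WARN', mapping=_SEVERITY_CHOICES, default='error') == 'warn'
assert normalize_choice(None, mapping=_SEVERITY_CHOICES, default='error') == 'error'
assert normalize_choice('fatal', mapping=_SEVERITY_CHOICES, default='error') == 'error'
```

Centralizing the tables as module-level `MappingProxyType` constants keeps them read-only and lets each `_normalize_*` helper shrink to a single hedged lookup with an explicit default.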
|