etlplus 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl
This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.
- etlplus/README.md +1 -1
- etlplus/__init__.py +1 -26
- etlplus/api/__init__.py +8 -0
- etlplus/api/endpoint_client.py +3 -3
- etlplus/{run_helpers.py → api/utils.py} +121 -79
- etlplus/cli/handlers.py +17 -7
- etlplus/{validation → ops}/README.md +2 -2
- etlplus/ops/__init__.py +61 -0
- etlplus/{extract.py → ops/extract.py} +78 -94
- etlplus/{load.py → ops/load.py} +73 -93
- etlplus/{run.py → ops/run.py} +140 -110
- etlplus/{transform.py → ops/transform.py} +75 -68
- etlplus/{validation → ops}/utils.py +62 -15
- etlplus/{validate.py → ops/validate.py} +19 -9
- etlplus/types.py +2 -2
- {etlplus-0.13.0.dist-info → etlplus-0.14.1.dist-info}/METADATA +4 -4
- {etlplus-0.13.0.dist-info → etlplus-0.14.1.dist-info}/RECORD +21 -21
- etlplus/validation/__init__.py +0 -44
- {etlplus-0.13.0.dist-info → etlplus-0.14.1.dist-info}/WHEEL +0 -0
- {etlplus-0.13.0.dist-info → etlplus-0.14.1.dist-info}/entry_points.txt +0 -0
- {etlplus-0.13.0.dist-info → etlplus-0.14.1.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.13.0.dist-info → etlplus-0.14.1.dist-info}/top_level.txt +0 -0
etlplus/{run.py → ops/run.py}
RENAMED
```diff
@@ -1,5 +1,5 @@
 """
-:mod:`etlplus.run` module.
+:mod:`etlplus.ops.run` module.

 A module for running ETL jobs defined in YAML configurations.
 """
@@ -9,126 +9,78 @@ from __future__ import annotations
 from collections.abc import Mapping
 from typing import Any
 from typing import Final
-from typing import TypedDict
 from typing import cast
 from urllib.parse import urlsplit
 from urllib.parse import urlunsplit

-import
-
-from
-from
-from
-from
-from
-from
-from
+from ..api import EndpointClient  # noqa: F401 (re-exported for tests)
+from ..api import PaginationConfigMap
+from ..api import RequestOptions
+from ..api import compose_api_request_env
+from ..api import compose_api_target_env
+from ..api import paginate_with_client
+from ..config import load_pipeline_config
+from ..enums import DataConnectorType
+from ..enums import HttpMethod
+from ..file import FileFormat
+from ..types import JSONData
+from ..types import JSONDict
+from ..types import PipelineConfig
+from ..types import StrPath
+from ..types import Timeout
+from ..utils import print_json
 from .extract import extract
 from .load import load
-from .run_helpers import compose_api_request_env
-from .run_helpers import compose_api_target_env
-from .run_helpers import paginate_with_client
 from .transform import transform
-from .
-from .types import Timeout
-from .utils import print_json
+from .utils import maybe_validate
 from .validate import validate
-from .validation.utils import maybe_validate

 # SECTION: EXPORTS ========================================================== #


-__all__ = [
-
-
-
-
+__all__ = [
+    # Functions
+    'run',
+    'run_pipeline',
+]

-class BaseApiHttpEnv(TypedDict, total=False):
-    """
-    Common HTTP request environment for API interactions.
-
-    Fields shared by both source-side and target-side API operations.
-    """

-
-    url: Url | None
-    headers: dict[str, str]
-    timeout: Timeout
-
-    # Session
-    session: requests.Session | None
-
-
-class ApiRequestEnv(BaseApiHttpEnv, total=False):
-    """
-    Composed request environment for API sources.
+# SECTION: CONSTANTS ======================================================== #

-    Returned by ``compose_api_request_env`` (run_helpers) and consumed by the
-    API extract branch. Values are fully merged with endpoint/API defaults and
-    job-level overrides, preserving the original precedence and behavior.
-    """

-
-    use_endpoints: bool
-    base_url: str | None
-    base_path: str | None
-    endpoints_map: dict[str, str] | None
-    endpoint_key: str | None
+DEFAULT_CONFIG_PATH: Final[str] = 'in/pipeline.yml'

-    # Request
-    params: dict[str, Any]
-    pagination: PaginationConfigMap | None
-    sleep_seconds: float

-
-    retry: RetryPolicy | None
-    retry_network_errors: bool
+# SECTION: INTERNAL FUNCTIONS =============================================== #


-
-
-
-
-    Returned by ``compose_api_target_env`` (run_helpers) and consumed by the
-    API load branch. Values are merged from the target object, optional
-    API/endpoint reference, and job-level overrides, preserving original
-    precedence and behavior.
-
-    Notes
-    -----
-    - Precedence for inherited values matches original logic:
-      overrides -> target -> API profile defaults.
-    - Target composition does not include pagination/rate-limit/retry since
-      loads are single-request operations; only headers/timeout/session
-      apply.
+def _resolve_validation_config(
+    job_obj: Any,
+    cfg: Any,
+) -> tuple[bool, dict[str, Any], str, str]:
     """
+    Resolve validation settings for a job with safe defaults.

-
-
-
-
-
-
-    Minimal session configuration schema accepted by this runner.
+    Parameters
+    ----------
+    job_obj : Any
+        Job configuration object.
+    cfg : Any
+        Pipeline configuration object with validations.

-
+    Returns
+    -------
+    tuple[bool, dict[str, Any], str, str]
+        Tuple of (enabled, rules, severity, phase).
     """
+    val_ref = job_obj.validate
+    if val_ref is None:
+        return False, {}, 'error', 'before_transform'

-
-
-
-
-    cert: Any  # str or (cert, key)
-    proxies: Mapping[str, Any]
-    cookies: Mapping[str, Any]
-    trust_env: bool
-
-
-# SECTION: CONSTANTS ======================================================== #
-
-
-DEFAULT_CONFIG_PATH: Final[str] = 'in/pipeline.yml'
+    rules = cfg.validations.get(val_ref.ruleset, {})
+    severity = (val_ref.severity or 'error').lower()
+    phase = (val_ref.phase or 'before_transform').lower()
+    return True, rules, severity, phase


 # SECTION: FUNCTIONS ======================================================== #
@@ -207,7 +159,7 @@ def run(
         and env.get('endpoint_key')
     ):
         # Construct client using module-level EndpointClient so tests
-        # can monkeypatch this class on etlplus.run.
+        # can monkeypatch this class on etlplus.ops.run.
         ClientClass = EndpointClient  # noqa: N806
         client = ClientClass(
             base_url=cast(str, env['base_url']),
@@ -263,19 +215,10 @@ def run(
         # keep explicit guard for defensive programming.
         raise ValueError(f'Unsupported source type: {stype_raw}')

-
-
-
-
-        # Type narrowing for static checkers
-        assert val_ref is not None
-        rules = cfg.validations.get(val_ref.ruleset, {})
-        severity = (val_ref.severity or 'error').lower()
-        phase = (val_ref.phase or 'before_transform').lower()
-    else:
-        rules = {}
-        severity = 'error'
-        phase = 'before_transform'
+    enabled_validation, rules, severity, phase = _resolve_validation_config(
+        job_obj,
+        cfg,
+    )

     # Pre-transform validation (if configured).
     data = maybe_validate(
@@ -361,3 +304,90 @@ def run(
     # Return the terminal load result directly; callers (e.g., CLI) can wrap
     # it in their own envelope when needed.
     return cast(JSONDict, result)
+
+
+def run_pipeline(
+    *,
+    source_type: DataConnectorType | str | None = None,
+    source: StrPath | JSONData | None = None,
+    operations: PipelineConfig | None = None,
+    target_type: DataConnectorType | str | None = None,
+    target: StrPath | None = None,
+    file_format: FileFormat | str | None = None,
+    method: HttpMethod | str | None = None,
+    **kwargs: Any,
+) -> JSONData:
+    """
+    Run a single extract-transform-load flow without a YAML config.
+
+    Parameters
+    ----------
+    source_type : DataConnectorType | str | None, optional
+        Connector type for extraction. When ``None``, ``source`` is assumed
+        to be pre-loaded data and extraction is skipped.
+    source : StrPath | JSONData | None, optional
+        Data source for extraction or the pre-loaded payload when
+        ``source_type`` is ``None``.
+    operations : PipelineConfig | None, optional
+        Transform configuration passed to :func:`etlplus.ops.transform`.
+    target_type : DataConnectorType | str | None, optional
+        Connector type for loading. When ``None``, load is skipped and the
+        transformed data is returned.
+    target : StrPath | None, optional
+        Target for loading (file path, connection string, or API URL).
+    file_format : FileFormat | str | None, optional
+        File format for file sources/targets (forwarded to extract/load).
+    method : HttpMethod | str | None, optional
+        HTTP method for API loads (forwarded to :func:`etlplus.ops.load`).
+    **kwargs : Any
+        Extra keyword arguments forwarded to extract/load for API options
+        (headers, timeout, session, etc.).
+
+    Returns
+    -------
+    JSONData
+        Transformed data or the load result payload.
+
+    Raises
+    ------
+    TypeError
+        Raised when extracted data is not a dict or list of dicts and no
+        target is specified.
+    ValueError
+        Raised when required source/target inputs are missing.
+    """
+    if source_type is None:
+        if source is None:
+            raise ValueError('source or source_type is required')
+        data = source
+    else:
+        if source is None:
+            raise ValueError('source is required when source_type is set')
+        data = extract(
+            source_type,
+            cast(StrPath, source),
+            file_format=file_format,
+            **kwargs,
+        )
+
+    if operations:
+        data = transform(data, operations)
+
+    if target_type is None:
+        if not isinstance(data, (dict, list)):
+            raise TypeError(
+                f'Expected data to be dict or list of dicts, '
+                f'got {type(data).__name__}',
+            )
+        return data
+    if target is None:
+        raise ValueError('target is required when target_type is set')
+
+    return load(
+        data,
+        target_type,
+        target,
+        file_format=file_format,
+        method=method,
+        **kwargs,
+    )
```
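Beyond the relocation, the notable addition above is `run_pipeline`, a keyword-only entry point that chains extract, transform, and load without a YAML config. A minimal usage sketch based only on the signature and docstring in this diff; the paths and the string values for the connector type, operator, and format are illustrative assumptions, not taken from the package:

```python
from etlplus.ops.run import run_pipeline

# File-to-file flow. 'file', 'gte', and 'json' are assumed spellings of the
# DataConnectorType / operator / FileFormat values; the paths are made up.
result = run_pipeline(
    source_type='file',
    source='in/users.json',
    operations={'filter': {'field': 'age', 'op': 'gte', 'value': 18}},
    target_type='file',
    target='out/adults.json',
    file_format='json',
)

# With source_type=None, `source` is treated as pre-loaded data; with
# target_type=None, the transformed records are returned directly.
records = run_pipeline(
    source=[{'age': 21}, {'age': 15}],
    operations={'filter': {'field': 'age', 'op': 'gte', 'value': 18}},
)
```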
etlplus/{transform.py → ops/transform.py}
RENAMED
```diff
@@ -1,5 +1,5 @@
 """
-:mod:`etlplus.transform` module.
+:mod:`etlplus.ops.transform` module.

 Helpers to filter, map/rename, select, sort, aggregate, and otherwise
 transform JSON-like records (dicts and lists of dicts).
@@ -24,7 +24,7 @@ Basic pipeline with strings::

 Using enums for keys and functions::

-    from .enums import PipelineStep, OperatorName, AggregateName
+    from etlplus.enums import PipelineStep, OperatorName, AggregateName
     ops = {
         PipelineStep.FILTER: {
             'field': 'age', 'op': OperatorName.GTE, 'value': 18
@@ -44,28 +44,28 @@ from collections.abc import Sequence
 from typing import Any
 from typing import cast

-from
-from
-from
+from ..enums import AggregateName
+from ..enums import OperatorName
+from ..enums import PipelineStep
+from ..types import AggregateFunc
+from ..types import AggregateSpec
+from ..types import FieldName
+from ..types import Fields
+from ..types import FilterSpec
+from ..types import JSONData
+from ..types import JSONDict
+from ..types import JSONList
+from ..types import MapSpec
+from ..types import OperatorFunc
+from ..types import PipelineConfig
+from ..types import PipelineStepName
+from ..types import SortKey
+from ..types import StepApplier
+from ..types import StepOrSteps
+from ..types import StepSpec
+from ..types import StrPath
+from ..utils import to_number
 from .load import load_data
-from .types import AggregateFunc
-from .types import AggregateSpec
-from .types import FieldName
-from .types import Fields
-from .types import FilterSpec
-from .types import JSONData
-from .types import JSONDict
-from .types import JSONList
-from .types import MapSpec
-from .types import OperatorFunc
-from .types import PipelineConfig
-from .types import PipelineStepName
-from .types import SortKey
-from .types import StepApplier
-from .types import StepOrSteps
-from .types import StepSpec
-from .types import StrPath
-from .utils import to_number

 # SECTION: EXPORTS ========================================================== #

@@ -730,15 +730,16 @@ def _is_plain_fields_list(obj: Any) -> bool:


 _PIPELINE_STEPS: tuple[PipelineStepName, ...] = (
+    'aggregate',
     'filter',
     'map',
     'select',
     'sort',
-    'aggregate',
 )


 _STEP_APPLIERS: dict[PipelineStepName, StepApplier] = {
+    'aggregate': _apply_aggregate_step,
     'filter': _apply_filter_step,
     'map': _apply_map_step,
     'select': _apply_select_step,
@@ -746,7 +747,54 @@ _STEP_APPLIERS: dict[PipelineStepName, StepApplier] = {
 }


-# SECTION:
+# SECTION: FUNCTIONS ======================================================== #
+
+
+# -- Helpers -- #
+
+
+def apply_aggregate(
+    records: JSONList,
+    operation: AggregateSpec,
+) -> JSONDict:
+    """
+    Aggregate a numeric field or count presence.
+
+    Parameters
+    ----------
+    records : JSONList
+        Records to aggregate.
+    operation : AggregateSpec
+        Dict with keys ``field`` and ``func``. ``func`` is one of
+        ``'sum'``, ``'avg'``, ``'min'``, ``'max'``, or ``'count'``.
+        A callable may also be supplied for ``func``. Optionally, set
+        ``alias`` to control the output key name.
+
+    Returns
+    -------
+    JSONDict
+        A single-row result like ``{"sum_age": 42}``.
+
+    Notes
+    -----
+    Numeric operations ignore non-numeric values but count their presence
+    for ``'count'``.
+    """
+    field = operation.get('field')
+    func = operation.get('func')
+    alias = operation.get('alias')
+
+    if not field or func is None:
+        return {'error': 'Invalid aggregation operation'}
+
+    try:
+        aggregator = _resolve_aggregator(func)
+    except TypeError:
+        return {'error': f'Unknown aggregation function: {func}'}
+
+    nums, present = _collect_numeric_and_presence(records, field)
+    key_name = _derive_agg_key(func, field, alias)
+    return {key_name: aggregator(nums, present)}


 def apply_filter(
@@ -894,48 +942,7 @@ def apply_sort(
     )


-
-    records: JSONList,
-    operation: AggregateSpec,
-) -> JSONDict:
-    """
-    Aggregate a numeric field or count presence.
-
-    Parameters
-    ----------
-    records : JSONList
-        Records to aggregate.
-    operation : AggregateSpec
-        Dict with keys ``field`` and ``func``. ``func`` is one of
-        ``'sum'``, ``'avg'``, ``'min'``, ``'max'``, or ``'count'``.
-        A callable may also be supplied for ``func``. Optionally, set
-        ``alias`` to control the output key name.
-
-    Returns
-    -------
-    JSONDict
-        A single-row result like ``{"sum_age": 42}``.
-
-    Notes
-    -----
-    Numeric operations ignore non-numeric values but count their presence
-    for ``'count'``.
-    """
-    field = operation.get('field')
-    func = operation.get('func')
-    alias = operation.get('alias')
-
-    if not field or func is None:
-        return {'error': 'Invalid aggregation operation'}
-
-    try:
-        aggregator = _resolve_aggregator(func)
-    except TypeError:
-        return {'error': f'Unknown aggregation function: {func}'}
-
-    nums, present = _collect_numeric_and_presence(records, field)
-    key_name = _derive_agg_key(func, field, alias)
-    return {key_name: aggregator(nums, present)}
+# -- Orchestration -- #


 def transform(
@@ -982,7 +989,7 @@ def transform(

     Using enums for keys and functions::

-        from .enums import PipelineStep, OperatorName, AggregateName
+        from etlplus.enums import PipelineStep, OperatorName, AggregateName
         ops = {
             PipelineStep.FILTER: {
                 'field': 'age', 'op': OperatorName.GTE, 'value': 18
```
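Because `apply_aggregate` moved wholesale (deleted after `apply_sort`, re-added under the new `# -- Helpers -- #` marker), its full contract is visible in this diff. A usage sketch derived from that docstring; the records are made up, and the import assumes the function remains publicly importable from the renamed module:

```python
from etlplus.ops.transform import apply_aggregate

records = [
    {'name': 'a', 'age': 20},
    {'name': 'b', 'age': 22},
    {'name': 'c', 'age': 'n/a'},  # non-numeric: skipped by 'sum', seen by 'count'
]

# Single-row result keyed '<func>_<field>' per the docstring example.
print(apply_aggregate(records, {'field': 'age', 'func': 'sum'}))
# -> {'sum_age': 42}

# 'alias' overrides the derived key name.
print(apply_aggregate(records, {'field': 'age', 'func': 'avg', 'alias': 'mean_age'}))

# A missing 'func' returns an error payload instead of raising.
print(apply_aggregate(records, {'field': 'age'}))
# -> {'error': 'Invalid aggregation operation'}
```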
etlplus/{validation → ops}/utils.py
RENAMED
```diff
@@ -1,7 +1,7 @@
 """
-:mod:`etlplus.
+:mod:`etlplus.ops.utils` module.

-Utility helpers for conditional
+Utility helpers for conditional data ops orchestration.

 The helpers defined here embrace a "high cohesion, low coupling" design by
 isolating normalization, configuration, and logging responsibilities. The
@@ -13,11 +13,13 @@ offloading ancillary concerns to composable helpers.
 from __future__ import annotations

 from collections.abc import Callable
+from collections.abc import Mapping
 from dataclasses import dataclass
 from typing import Any
 from typing import Literal
 from typing import Self
 from typing import TypedDict
+from typing import cast

 from ..types import StrAnyMap
 from ..utils import normalized_str
@@ -291,11 +293,17 @@ def _normalize_phase(
         Normalized validation phase. Defaults to ``"before_transform"`` when
         unspecified.
     """
-
-
-
-
-
+    return cast(
+        ValidationPhase,
+        _normalize_choice(
+            value,
+            mapping={
+                'before_transform': 'before_transform',
+                'after_transform': 'after_transform',
+            },
+            default='before_transform',
+        ),
+    )


 def _normalize_severity(
@@ -314,7 +322,14 @@ def _normalize_severity(
     ValidationSeverity
         Normalized severity. Defaults to ``"error"`` when unspecified.
     """
-    return
+    return cast(
+        ValidationSeverity,
+        _normalize_choice(
+            value,
+            mapping={'warn': 'warn'},
+            default='error',
+        ),
+    )


 def _normalize_window(
@@ -333,13 +348,45 @@ def _normalize_window(
     ValidationWindow
         Normalized validation window. Defaults to ``"both"`` when unspecified.
     """
-
-
-
-
-
-
-
+    return cast(
+        ValidationWindow,
+        _normalize_choice(
+            value,
+            mapping={
+                'before_transform': 'before_transform',
+                'after_transform': 'after_transform',
+                'both': 'both',
+            },
+            default='both',
+        ),
+    )
+
+
+def _normalize_choice(
+    value: str | None,
+    *,
+    mapping: Mapping[str, str],
+    default: str,
+) -> str:
+    """
+    Normalize a text value against a mapping with a default fallback.
+
+    Parameters
+    ----------
+    value : str | None
+        Input text to normalize.
+    mapping : Mapping[str, str]
+        Mapping of accepted values to normalized outputs.
+    default : str
+        Default to return when input is missing or unrecognized.
+
+    Returns
+    -------
+    str
+        Normalized value.
+    """
+    normalized = normalized_str(value)
+    return mapping.get(normalized, default)


 def _rule_name(
```
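The three `_normalize_*` helpers above now funnel through the new `_normalize_choice`. A self-contained sketch of the pattern; the stand-in `normalized_str` below assumes trim-and-lowercase behavior, which this diff does not show:

```python
from collections.abc import Mapping


def normalized_str(value: str | None) -> str:
    # Stand-in for etlplus.utils.normalized_str (assumed: trim + lowercase).
    return (value or '').strip().lower()


def _normalize_choice(
    value: str | None,
    *,
    mapping: Mapping[str, str],
    default: str,
) -> str:
    # Map the normalized input onto an accepted value, else fall back.
    return mapping.get(normalized_str(value), default)


# Mirrors _normalize_severity: only 'warn' is accepted; anything else
# (including None) collapses to 'error'.
assert _normalize_choice('WARN ', mapping={'warn': 'warn'}, default='error') == 'warn'
assert _normalize_choice(None, mapping={'warn': 'warn'}, default='error') == 'error'
```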
etlplus/{validate.py → ops/validate.py}
RENAMED
```diff
@@ -1,5 +1,5 @@
 """
-:mod:`etlplus.
+:mod:`etlplus.ops.validate` module.

 Validate dicts and lists of dicts using simple, schema-like rules.

@@ -34,11 +34,11 @@ from typing import Final
 from typing import Literal
 from typing import TypedDict

+from ..types import JSONData
+from ..types import Record
+from ..types import StrAnyMap
+from ..types import StrPath
 from .load import load_data
-from .types import JSONData
-from .types import Record
-from .types import StrAnyMap
-from .types import StrPath

 # SECTION: EXPORTS ========================================================== #

@@ -279,11 +279,15 @@ def _type_matches(
     bool
         ``True`` if the value matches the expected type; ``False`` if not.
     """
-
-
-
+    if expected == 'number':
+        return _is_number(value)
+    if expected == 'integer':
+        return isinstance(value, int) and not isinstance(value, bool)
+    if expected == 'boolean':
+        return isinstance(value, bool)

-
+    py_type = TYPE_MAP.get(expected)
+    return isinstance(value, py_type) if py_type else False


 def _validate_record(
@@ -330,6 +334,9 @@ def _validate_record(
 # SECTION: FUNCTIONS ======================================================== #


+# -- Helpers -- #
+
+
 def validate_field(
     value: Any,
     rules: StrAnyMap | FieldRules,
@@ -425,6 +432,9 @@ def validate_field(
     return {'valid': len(errors) == 0, 'errors': errors}


+# -- Orchestration -- #
+
+
 def validate(
     source: StrPath | JSONData,
     rules: RulesMap | None = None,
```
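The filled-in `_type_matches` branches on `'number'`, `'integer'`, and `'boolean'` before falling back to a `TYPE_MAP` lookup. The explicit bool check matters because `bool` subclasses `int` in Python; a standalone illustration of that rule:

```python
# bool is a subclass of int, so a bare isinstance check over-matches:
assert isinstance(True, int)

# The 'integer' rule from the diff excludes bools explicitly:
def matches_integer(value: object) -> bool:
    return isinstance(value, int) and not isinstance(value, bool)

assert matches_integer(3)
assert not matches_integer(True)  # booleans do not count as integers
assert not matches_integer(3.0)   # floats do not count either
```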
etlplus/types.py
CHANGED
```diff
@@ -193,8 +193,8 @@ type AggregateSpec = StrAnyMap

 # -- Pipelines-- #

-# Unified pipeline step spec consumed by :mod:`etlplus.transform`.
-type StepSpec = FilterSpec | MapSpec | SelectSpec | SortSpec
+# Unified pipeline step spec consumed by :mod:`etlplus.ops.transform`.
+type StepSpec = AggregateSpec | FilterSpec | MapSpec | SelectSpec | SortSpec

 # Collections of steps
```
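Folding `AggregateSpec` into `StepSpec` lets an `'aggregate'` entry type-check as a regular pipeline step alongside filter/map/select/sort. A hypothetical ops mapping combining the docstring examples from the transform diff; the relative execution order of steps is not shown in this diff, so no output is asserted:

```python
from etlplus.ops.transform import transform

# Hypothetical step specs; 'gte', 'avg', and the alias are illustrative.
ops = {
    'filter': {'field': 'age', 'op': 'gte', 'value': 18},
    'aggregate': {'field': 'age', 'func': 'avg', 'alias': 'avg_adult_age'},
}
result = transform([{'age': 21}, {'age': 30}, {'age': 12}], ops)
```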