etlplus 0.16.0__py3-none-any.whl → 0.16.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/README.md +24 -2
- etlplus/__init__.py +2 -0
- etlplus/api/__init__.py +14 -14
- etlplus/api/auth.py +9 -6
- etlplus/api/config.py +6 -6
- etlplus/api/endpoint_client.py +16 -16
- etlplus/api/enums.py +2 -2
- etlplus/api/errors.py +4 -4
- etlplus/api/pagination/__init__.py +6 -6
- etlplus/api/pagination/config.py +11 -9
- etlplus/api/rate_limiting/__init__.py +2 -2
- etlplus/api/rate_limiting/config.py +10 -10
- etlplus/api/rate_limiting/rate_limiter.py +2 -2
- etlplus/api/request_manager.py +4 -4
- etlplus/api/retry_manager.py +6 -6
- etlplus/api/transport.py +10 -10
- etlplus/api/types.py +47 -26
- etlplus/api/utils.py +49 -49
- etlplus/cli/README.md +9 -7
- etlplus/cli/commands.py +22 -22
- etlplus/cli/handlers.py +12 -13
- etlplus/cli/main.py +1 -1
- etlplus/{workflow/pipeline.py → config.py} +54 -91
- etlplus/connector/__init__.py +6 -6
- etlplus/connector/api.py +7 -7
- etlplus/connector/database.py +3 -3
- etlplus/connector/file.py +3 -3
- etlplus/connector/types.py +2 -2
- etlplus/database/README.md +7 -7
- etlplus/enums.py +35 -167
- etlplus/file/README.md +7 -5
- etlplus/file/accdb.py +2 -1
- etlplus/file/arrow.py +2 -1
- etlplus/file/bson.py +2 -1
- etlplus/file/cbor.py +2 -1
- etlplus/file/cfg.py +1 -1
- etlplus/file/conf.py +1 -1
- etlplus/file/dat.py +1 -1
- etlplus/file/dta.py +1 -1
- etlplus/file/duckdb.py +2 -1
- etlplus/file/enums.py +1 -1
- etlplus/file/fwf.py +2 -1
- etlplus/file/hbs.py +2 -1
- etlplus/file/hdf5.py +2 -1
- etlplus/file/ini.py +2 -1
- etlplus/file/ion.py +1 -1
- etlplus/file/jinja2.py +2 -1
- etlplus/file/log.py +1 -1
- etlplus/file/mat.py +1 -1
- etlplus/file/mdb.py +2 -1
- etlplus/file/msgpack.py +2 -1
- etlplus/file/mustache.py +2 -1
- etlplus/file/nc.py +1 -1
- etlplus/file/numbers.py +2 -1
- etlplus/file/ods.py +2 -1
- etlplus/file/pb.py +2 -1
- etlplus/file/pbf.py +2 -1
- etlplus/file/properties.py +2 -1
- etlplus/file/proto.py +2 -1
- etlplus/file/psv.py +2 -1
- etlplus/file/rda.py +2 -1
- etlplus/file/rds.py +1 -1
- etlplus/file/sas7bdat.py +2 -1
- etlplus/file/sav.py +1 -1
- etlplus/file/sqlite.py +2 -1
- etlplus/file/sylk.py +2 -1
- etlplus/file/tab.py +2 -1
- etlplus/file/toml.py +2 -1
- etlplus/file/vm.py +2 -1
- etlplus/file/wks.py +2 -1
- etlplus/file/xls.py +1 -1
- etlplus/file/xlsm.py +2 -2
- etlplus/file/xpt.py +2 -1
- etlplus/file/zsav.py +2 -1
- etlplus/ops/README.md +10 -9
- etlplus/ops/__init__.py +1 -0
- etlplus/ops/enums.py +173 -0
- etlplus/ops/extract.py +209 -22
- etlplus/ops/load.py +140 -34
- etlplus/ops/run.py +88 -103
- etlplus/ops/transform.py +46 -27
- etlplus/ops/types.py +147 -0
- etlplus/ops/utils.py +5 -5
- etlplus/ops/validate.py +13 -13
- etlplus/templates/README.md +11 -9
- etlplus/types.py +5 -102
- etlplus/workflow/README.md +0 -24
- etlplus/workflow/__init__.py +2 -4
- etlplus/workflow/dag.py +23 -1
- etlplus/workflow/jobs.py +15 -28
- etlplus/workflow/profile.py +4 -2
- {etlplus-0.16.0.dist-info → etlplus-0.16.7.dist-info}/METADATA +32 -28
- etlplus-0.16.7.dist-info/RECORD +143 -0
- etlplus-0.16.0.dist-info/RECORD +0 -141
- {etlplus-0.16.0.dist-info → etlplus-0.16.7.dist-info}/WHEEL +0 -0
- {etlplus-0.16.0.dist-info → etlplus-0.16.7.dist-info}/entry_points.txt +0 -0
- {etlplus-0.16.0.dist-info → etlplus-0.16.7.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.16.0.dist-info → etlplus-0.16.7.dist-info}/top_level.txt +0 -0
etlplus/ops/types.py
ADDED
@@ -0,0 +1,147 @@
+"""
+:mod:`etlplus.ops.types` module.
+
+Shared type aliases leveraged across :mod:`etlplus.ops` modules.
+
+Notes
+-----
+- Centralizes ops-focused aliases (functions, specs, and pipeline helpers).
+- Relies on Python 3.13 ``type`` statements for readability and IDE support.
+
+Examples
+--------
+>>> from etlplus.ops.types import AggregateFunc, OperatorFunc
+>>> def total(xs: list[float], _: int) -> float:
+...     return sum(xs)
+>>> agg: AggregateFunc = total
+>>> op: OperatorFunc = lambda a, b: a == b
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from collections.abc import Mapping
+from collections.abc import Sequence
+from typing import Any
+from typing import Literal
+
+from ..types import JSONList
+from ..types import StrAnyMap
+from ..types import StrSeqMap
+from ..types import StrStrMap
+
+# SECTION: EXPORTS ========================================================== #
+
+
+__all__ = [
+    # Type Aliases (Functions)
+    'AggregateFunc',
+    'OperatorFunc',
+    # Type Aliases (Records & Fields)
+    'FieldName',
+    'Fields',
+    # Type Aliases (Transform Specs)
+    'AggregateSpec',
+    'FilterSpec',
+    'MapSpec',
+    'SelectSpec',
+    'SortSpec',
+    # Type Aliases (Pipelines)
+    'StepOrSteps',
+    'StepSeq',
+    'StepSpec',
+    'PipelineConfig',
+    'PipelineStepName',
+    # Type Aliases (Helpers)
+    'StepApplier',
+    'SortKey',
+]
+
+
+# SECTION: TYPE ALIASES ===================================================== #
+
+
+# -- Functions -- #
+
+
+# TODO: Consider redefining to use `functools.reduce` signature.
+# TODO: Consider adding `**kwargs` to support richer aggregation functions.
+# TODO: Consider constraining first argument to `Sequence[float]`.
+# TODO: Consider constraining return type to `float | int | None`.
+# Callable reducing numeric collections into a summary value.
+type AggregateFunc = Callable[[list[float], int], Any]
+
+# Binary predicate consumed by filter operations.
+type OperatorFunc = Callable[[Any, Any], bool]
+
+# -- Records & Fields -- #
+
+# Individual field identifier referenced inside specs.
+type FieldName = str
+
+# Ordered list of :data:`FieldName` entries preserving projection order.
+type Fields = list[FieldName]
+
+# -- Transform Specs -- #
+
+# Filtering spec expecting ``field``, ``op``, and ``value`` keys.
+type FilterSpec = StrAnyMap
+
+# Field renaming instructions mapping old keys to new ones.
+type MapSpec = StrStrMap
+
+# Projection spec as a field list or mapping with metadata.
+#
+# Examples
+# --------
+# >>> from etlplus.ops.types import SelectSpec
+# >>> spec1: SelectSpec = ['a','b']
+# >>> spec2: SelectSpec = {'fields': [...]}
+type SelectSpec = Fields | StrSeqMap
+
+# Sort directive expressed as a field string or mapping with flags.
+#
+# Examples
+# --------
+# >>> from etlplus.ops.types import SortSpec
+# >>> spec1: SortSpec = 'field'
+# >>> spec2: SortSpec = {'field': 'x', 'reverse': True}
+type SortSpec = str | StrAnyMap
+
+# Aggregate instruction covering ``field``, ``func``, and optional alias.
+#
+# Supported functions: ``avg``, ``count``, ``max``, ``min``, and ``sum``.
+# Examples
+# --------
+# >>> from etlplus.ops.types import AggregateSpec
+# >>> spec: AggregateSpec = \
+# ...     {'field': 'x', 'func': 'sum' | 'avg' | ..., 'alias'?: '...'}
+type AggregateSpec = StrAnyMap
+
+# -- Pipelines-- #
+
+# Unified pipeline step spec consumed by :mod:`etlplus.ops.transform`.
+type StepSpec = AggregateSpec | FilterSpec | MapSpec | SelectSpec | SortSpec
+
+# Collections of steps
+
+# Ordered collection of :data:`StepSpec` entries.
+type StepSeq = Sequence[StepSpec]
+
+# Accepts either a single :data:`StepSpec` or a sequence of them.
+type StepOrSteps = StepSpec | StepSeq
+
+# Canonical literal names for supported transform stages.
+type PipelineStepName = Literal['aggregate', 'filter', 'map', 'select', 'sort']
+
+# Mapping from step name to its associated specification payload.
+# TODO: Consider replacing with etlplus.workflow.types.PipelineConfig.
+type PipelineConfig = Mapping[PipelineStepName, StepOrSteps]
+
+# -- Helpers -- #
+
+# Callable that applies step configuration to a batch of records.
+type StepApplier = Callable[[JSONList, Any], JSONList]
+
+# Tuple combining stable sort index and computed sort value.
+type SortKey = tuple[int, Any]
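
For orientation, a minimal sketch of how the new aliases compose. It assumes etlplus 0.16.7 on Python 3.13; the field names and the `'gt'` op value are illustrative, not taken from the package's documentation.

```python
# Sketch only: assumes etlplus >= 0.16.7 on Python 3.13 (PEP 695 ``type`` aliases).
from etlplus.ops.types import AggregateFunc, OperatorFunc, PipelineConfig


def mean(values: list[float], count: int) -> float:
    # Matches AggregateFunc: (numeric values, record count) -> summary value.
    return sum(values) / count if count else 0.0


agg: AggregateFunc = mean
gt: OperatorFunc = lambda a, b: a > b  # binary predicate for a filter step

# A PipelineConfig maps step names to their specs; the keys follow the
# FilterSpec/SortSpec/SelectSpec comments above, the values are illustrative.
config: PipelineConfig = {
    'filter': {'field': 'age', 'op': 'gt', 'value': 18},
    'sort': {'field': 'age', 'reverse': True},
    'select': ['name', 'age'],
}
```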
etlplus/ops/utils.py
CHANGED
@@ -26,7 +26,7 @@ from ..utils import normalize_choice
 # SECTION: TYPED DICTIONARIES =============================================== #


-class
+class ValidationResultDict(TypedDict, total=False):
     """Shape returned by ``validate_fn`` callables."""

     valid: bool
@@ -44,7 +44,7 @@ type ValidationPhase = Literal['before_transform', 'after_transform']
 type ValidationWindow = Literal['before_transform', 'after_transform', 'both']
 type ValidationSeverity = Literal['warn', 'error']

-type ValidateFn = Callable[[Any, Ruleset],
+type ValidateFn = Callable[[Any, Ruleset], ValidationResultDict]
 type PrintFn = Callable[[Any], None]


@@ -198,7 +198,7 @@ def maybe_validate(
         Failure severity (``"warn"`` or ``"error"``).
     validate_fn : ValidateFn
         Engine that performs validation and returns a
-        :class:`
+        :class:`ValidationResultDict` instance.
     print_json_fn : PrintFn
         Structured logger invoked when validation fails.

@@ -270,7 +270,7 @@ def _log_failure(
     phase: ValidationPhase,
     window: ValidationWindow,
     ruleset_name: str | None,
-    result:
+    result: ValidationResultDict,
 ) -> None:
     """
     Emit a structured message describing the failed validation.
@@ -285,7 +285,7 @@ def _log_failure(
         Configured validation window.
     ruleset_name : str | None
         Name of the validation ruleset.
-    result :
+    result : ValidationResultDict
         Result of the failed validation.
     """
     printer(
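
The rename to `ValidationResultDict` tightens the `ValidateFn` signature. A self-contained sketch of a conforming callable follows; only the ``valid`` key is visible in this hunk, so the ``errors`` key below is an assumption.

```python
from typing import Any, TypedDict


class ValidationResultDict(TypedDict, total=False):
    # Local stand-in mirroring the shape in etlplus/ops/utils.py; only
    # ``valid`` is confirmed by this diff, ``errors`` is an assumed key.
    valid: bool
    errors: list[str]


def accept_everything(data: Any, ruleset: Any) -> ValidationResultDict:
    # A trivial ValidateFn-style engine; a real one would evaluate the ruleset.
    return {'valid': True}
```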
etlplus/ops/validate.py
CHANGED
@@ -44,9 +44,9 @@ from .load import load_data


 __all__ = [
-    '
-    '
-    '
+    'FieldRulesDict',
+    'FieldValidationDict',
+    'ValidationDict',
     'validate_field',
     'validate',
 ]
@@ -69,7 +69,7 @@ TYPE_MAP: Final[dict[str, type | tuple[type, ...]]] = {
 # SECTION: TYPED DICTS ====================================================== #


-class FieldRules(TypedDict, total=False):
+class FieldRulesDict(TypedDict, total=False):
     """
     Validation rules for a single field.

@@ -93,7 +93,7 @@ class FieldRules(TypedDict, total=False):
     enum: list[Any]


-class FieldValidation(TypedDict):
+class FieldValidationDict(TypedDict):
     """
     Validation result for a single field.

@@ -109,7 +109,7 @@ class FieldValidation(TypedDict):
     errors: list[str]


-class Validation(TypedDict):
+class ValidationDict(TypedDict):
     """
     Validation result for a complete data structure.

@@ -134,7 +134,7 @@ class Validation(TypedDict):
 # SECTION: TYPE ALIASES ===================================================== #


-type RulesMap = Mapping[str,
+type RulesMap = Mapping[str, FieldRulesDict]


 # SECTION: INTERNAL FUNCTIONS ============================================== #
@@ -339,8 +339,8 @@ def _validate_record(

 def validate_field(
     value: Any,
-    rules: StrAnyMap |
-) ->
+    rules: StrAnyMap | FieldRulesDict,
+) -> FieldValidationDict:
     """
     Validate a single value against field rules.

@@ -348,14 +348,14 @@ def validate_field(
     ----------
     value : Any
         The value to validate. ``None`` is treated as missing.
-    rules : StrAnyMap |
+    rules : StrAnyMap | FieldRulesDict
         Rule dictionary. Supported keys include ``required``, ``type``,
         ``min``, ``max``, ``minLength``, ``maxLength``, ``pattern``, and
         ``enum``.

     Returns
     -------
-
+    FieldValidationDict
         Result with ``valid`` and a list of ``errors``.

     Notes
@@ -438,7 +438,7 @@ def validate_field(
 def validate(
     source: StrPath | JSONData,
     rules: RulesMap | None = None,
-) ->
+) -> ValidationDict:
     """
     Validate data against rules.

@@ -452,7 +452,7 @@ def validate(

     Returns
     -------
-
+    ValidationDict
         Structured result with keys ``valid``, ``errors``, ``field_errors``,
         and ``data``. If loading fails, ``data`` is ``None`` and an error is
         reported in ``errors``.
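
The renamed TypedDicts keep `validate_field`'s contract the same: a rules mapping in, a result with ``valid`` and ``errors`` out. An illustrative call, assuming etlplus 0.16.7 and using only rule keys documented in the hunks above:

```python
from etlplus.ops.validate import validate_field

# Rule keys taken from the docstring above; the sample value is arbitrary.
rules = {'required': True, 'minLength': 3, 'pattern': r'^[a-z]+$'}

result = validate_field('ada', rules)      # returns a FieldValidationDict
print(result['valid'], result['errors'])   # e.g. True []
```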
etlplus/templates/README.md
CHANGED
@@ -1,10 +1,11 @@
 # `etlplus.templates` Subpackage

-Documentation for the `etlplus.templates` subpackage: SQL
+Documentation for the `etlplus.templates` subpackage: bundled SQL/DDL templates used by the database
+helpers.

 - Provides Jinja2 templates for DDL and view generation
--
--
+- Used by `etlplus.database.render_table_sql` and related helpers
+- Exposed as plain template files you can reuse with your own Jinja2 setup

 Back to project overview: see the top-level [README](../../README.md).

@@ -21,21 +22,22 @@ Back to project overview: see the top-level [README](../../README.md).

 ## Rendering Templates

-
+ETLPlus does not currently expose a `render_template` helper in this package. Use the database
+helpers instead:

 ```python
-from etlplus.
+from etlplus.database import render_table_sql, load_table_spec

-
+spec = load_table_spec("schemas/users.yml")
+sql = render_table_sql(spec, template="ddl")
 ```

 ## Example: Rendering a DDL Template

 ```python
-from etlplus.
+from etlplus.database import render_tables_to_string

-
-sql = render_template("ddl.sql.j2", schema=schema)
+sql = render_tables_to_string(["schemas/users.yml"], template="ddl")
 print(sql)
 ```

etlplus/types.py
CHANGED
@@ -12,12 +12,12 @@ Notes
 See Also
 --------
 - :mod:`etlplus.api.types` for HTTP-specific aliases and data classes
-- :mod:`etlplus.connector.types` for connector-specific aliases
-  surfaces
+- :mod:`etlplus.connector.types` for connector-specific aliases

 Examples
 --------
->>> from etlplus.types import JSONDict
+>>> from etlplus.types import JSONDict
+>>> from etlplus.ops.types import PipelineConfig
 >>> payload: JSONDict = {'id': 1, 'name': 'Ada'}
 >>> isinstance(payload, dict)
 True
@@ -54,33 +54,15 @@ __all__ = [
     'JSONRecords',
     # Type Aliases (File System)
     'StrPath',
-    # Type Aliases (Functions)
-    'AggregateFunc',
-    'OperatorFunc',
-    # Type Aliases (Records & Fields)
-    'FieldName',
-    'Fields',
     # Type Aliases (Transform Specs)
     'StrAnyMap',
     'StrSeqMap',
     'StrStrMap',
-    'AggregateSpec',
-    'FilterSpec',
-    'MapSpec',
-    'SelectSpec',
-    'SortSpec',
-    # Type Aliases (Pipelines)
-    'StepOrSteps',
-    'StepSeq',
-    'StepSpec',
-    'PipelineStepName',
-    'PipelineConfig',
-    # Type Aliases (Helpers)
-    'StepApplier',
-    'SortKey',
     # Type Aliases (Networking / Runtime)
     'Sleeper',
     'Timeout',
+    # Type Aliases (Templates)
+    'TemplateKey',
 ]


@@ -125,22 +107,6 @@ type JSONRecords = list[JSONRecord]
 # Path-like inputs accepted by file helpers.
 type StrPath = str | Path | PathLike[str]

-# -- Functions -- #
-
-# Callable reducing numeric collections into a summary value.
-type AggregateFunc = Callable[[list[float], int], Any]
-
-# Binary predicate consumed by filter operations.
-type OperatorFunc = Callable[[Any, Any], bool]
-
-# -- Records & Fields -- #
-
-# Individual field identifier referenced inside specs.
-type FieldName = str
-
-# Ordered list of :data:`FieldName` entries preserving projection order.
-type Fields = list[FieldName]
-
 # -- Transform Specs -- #

 # Kept intentionally broad for runtime-friendly validation in transform.py.
@@ -156,69 +122,6 @@ type StrStrMap = Mapping[str, str]
 # Mapping whose values are homogeneous sequences.
 type StrSeqMap = Mapping[str, Sequence[Any]]

-# Transform step specifications
-
-# Filtering spec expecting ``field``, ``op``, and ``value`` keys.
-type FilterSpec = StrAnyMap
-
-# Field renaming instructions mapping old keys to new ones.
-type MapSpec = StrStrMap
-
-# Projection spec as a field list or mapping with metadata.
-#
-# Examples
-# --------
-# >>> from etlplus.types import SelectSpec
-# >>> spec1: SelectSpec = ['a','b']
-# >>> spec2: SelectSpec = {'fields': [...]}
-type SelectSpec = Fields | StrSeqMap
-
-# Sort directive expressed as a field string or mapping with flags.
-#
-# Examples
-# --------
-# >>> from etlplus.types import SortSpec
-# >>> spec1: SortSpec = 'field'
-# >>> spec2: SortSpec = {'field': 'x', 'reverse': True}
-type SortSpec = str | StrAnyMap
-
-# Aggregate instruction covering ``field``, ``func``, and optional alias.
-#
-# Supported functions: ``avg``, ``count``, ``max``, ``min``, and ``sum``.
-# Examples
-# --------
-# >>> from etlplus.types import AggregateSpec
-# >>> spec: AggregateSpec = \
-# ...     {'field': 'x', 'func': 'sum' | 'avg' | ..., 'alias'?: '...'}
-type AggregateSpec = StrAnyMap
-
-# -- Pipelines-- #
-
-# Unified pipeline step spec consumed by :mod:`etlplus.ops.transform`.
-type StepSpec = AggregateSpec | FilterSpec | MapSpec | SelectSpec | SortSpec
-
-# Collections of steps
-
-# Ordered collection of :data:`StepSpec` entries.
-type StepSeq = Sequence[StepSpec]
-
-# Accepts either a single :data:`StepSpec` or a sequence of them.
-type StepOrSteps = StepSpec | StepSeq
-
-# Canonical literal names for supported transform stages.
-type PipelineStepName = Literal['filter', 'map', 'select', 'sort', 'aggregate']
-
-# Mapping from step name to its associated specification payload.
-type PipelineConfig = Mapping[PipelineStepName, StepOrSteps]
-
-# -- Helpers -- #
-
-# Callable that applies step configuration to a batch of records.
-type StepApplier = Callable[[JSONList, Any], JSONList]
-
-# Tuple combining stable sort index and computed sort value.
-type SortKey = tuple[int, Any]
-
 # -- Networking / Runtime -- #

 # Sleep function used by retry helpers.
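
In practice the move means ops-oriented imports change paths while the core aliases stay put; a small migration sketch under the new layout:

```python
# Aliases that remain in etlplus.types (unchanged import path).
from etlplus.types import JSONDict, StrPath

# Aliases relocated to etlplus.ops.types in 0.16.7.
from etlplus.ops.types import PipelineConfig, SortSpec

payload: JSONDict = {'id': 1, 'name': 'Ada'}
sort_spec: SortSpec = {'field': 'name', 'reverse': False}
```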
etlplus/workflow/README.md
CHANGED
@@ -12,8 +12,6 @@ Back to project overview: see the top-level [README](../../README.md).

 - [`etlplus.workflow` Subpackage](#etlplusworkflow-subpackage)
 - [Supported Configuration Types](#supported-configuration-types)
-- [Loading and Validating Configs](#loading-and-validating-configs)
-- [Example: Loading a Pipeline Config](#example-loading-a-pipeline-config)
 - [See Also](#see-also)

 ## Supported Configuration Types
@@ -23,28 +21,6 @@
 - **Pipeline**: End-to-end pipeline configuration
 - **Profile**: User or environment-specific settings

-## Loading and Validating Configs
-
-Use the provided classes to load and validate configuration files:
-
-```python
-from etlplus.workflow import PipelineConfig
-
-cfg = PipelineConfig.from_yaml("pipeline.yml")
-```
-
-- Supports YAML and JSON formats
-- Validates against expected schema
-
-## Example: Loading a Pipeline Config
-
-```python
-from etlplus.workflow import PipelineConfig
-
-pipeline = PipelineConfig.from_yaml("configs/pipeline.yml")
-print(pipeline)
-```
-
 ## See Also

 - Top-level CLI and library usage in the main [README](../../README.md)
etlplus/workflow/__init__.py
CHANGED
@@ -12,8 +12,7 @@ from .jobs import JobConfig
 from .jobs import LoadRef
 from .jobs import TransformRef
 from .jobs import ValidationRef
-from .
-from .pipeline import load_pipeline_config
+from .profile import ProfileConfig

 # SECTION: EXPORTS ========================================================== #

@@ -23,10 +22,9 @@ __all__ = [
     'ExtractRef',
     'JobConfig',
     'LoadRef',
-    '
+    'ProfileConfig',
     'TransformRef',
     'ValidationRef',
     # Functions
-    'load_pipeline_config',
     'topological_sort_jobs',
 ]
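
The public surface now exposes `ProfileConfig` and drops the `load_pipeline_config` re-export (pipeline loading moved; see etlplus/config.py in this diff). Imports against 0.16.7 would look roughly like this; the `ExtractRef.from_obj` payload is illustrative, with keys taken from the jobs.py hunks below.

```python
from etlplus.workflow import ExtractRef, JobConfig, ProfileConfig, topological_sort_jobs

# Illustrative payload; 'source' and 'options' are the keys ExtractRef.from_obj reads.
extract = ExtractRef.from_obj({'source': 'users_api', 'options': {'page_size': 100}})

# load_pipeline_config is no longer re-exported from etlplus.workflow;
# callers that relied on it need to import from its new home instead.
```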
etlplus/workflow/dag.py
CHANGED
@@ -47,6 +47,28 @@ class DagError(ValueError):
         return self.message


+# SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+def _ready(
+    indegree: dict[str, int],
+) -> list[str]:
+    """
+    Return a sorted list of nodes with zero indegree.
+
+    Parameters
+    ----------
+    indegree : dict[str, int]
+        Mapping of node name to indegree.
+
+    Returns
+    -------
+    list[str]
+        Sorted list of node names ready to process.
+    """
+    return sorted(name for name, deg in indegree.items() if deg == 0)
+
+
 # SECTION: FUNCTIONS ======================================================== #


@@ -88,7 +110,7 @@ def topological_sort_jobs(
             edges[dep].add(job.name)
             indegree[job.name] += 1

-    queue = deque(
+    queue = deque(_ready(indegree))
     ordered: list[str] = []

     while queue:
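
The new `_ready` helper factors the "pick all zero-indegree nodes, sorted" step out of Kahn's algorithm. A self-contained sketch of the pattern; the graph below is illustrative, while `topological_sort_jobs` builds its own `edges`/`indegree` from job dependencies.

```python
from collections import deque


def _ready(indegree: dict[str, int]) -> list[str]:
    # Nodes with no unmet dependencies, sorted for deterministic ordering.
    return sorted(name for name, deg in indegree.items() if deg == 0)


# Illustrative dependency graph: extract -> transform -> load.
edges = {'extract': {'transform'}, 'transform': {'load'}, 'load': set()}
indegree = {'extract': 0, 'transform': 1, 'load': 1}

queue = deque(_ready(indegree))
ordered: list[str] = []
while queue:
    node = queue.popleft()
    ordered.append(node)
    for successor in edges[node]:
        indegree[successor] -= 1
        if indegree[successor] == 0:
            queue.append(successor)

print(ordered)  # ['extract', 'transform', 'load']
```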
etlplus/workflow/jobs.py
CHANGED
@@ -6,14 +6,13 @@ transform, load).

 Notes
 -----
-- Lightweight references used inside :class:`PipelineConfig` to avoid storing
-  large nested structures.
 - All attributes are simple and optional where appropriate, keeping parsing
   tolerant.
 """

 from __future__ import annotations

+from collections.abc import Sequence
 from dataclasses import dataclass
 from dataclasses import field
 from typing import Any
@@ -76,13 +75,15 @@ def _parse_depends_on(
     """
     if isinstance(value, str):
         return [value]
-    if isinstance(value,
+    if isinstance(value, Sequence) and not isinstance(
+        value,
+        (str, bytes, bytearray),
+    ):
         return [entry for entry in value if isinstance(entry, str)]
     return []


 def _require_str(
-    # data: dict[str, Any],
     data: StrAnyMap,
     key: str,
 ) -> str | None:
@@ -149,13 +150,9 @@ class ExtractRef:
         data = maybe_mapping(obj)
         if not data:
             return None
-        source
-        if source is None:
+        if (source := _require_str(data, 'source')) is None:
             return None
-        return cls(
-            source=source,
-            options=coerce_dict(data.get('options')),
-        )
+        return cls(source=source, options=coerce_dict(data.get('options')))


 @dataclass(kw_only=True, slots=True)
@@ -214,18 +211,13 @@ class JobConfig:
         data = maybe_mapping(obj)
         if not data:
             return None
-        name
-        if name is None:
+        if (name := _require_str(data, 'name')) is None:
             return None

-        description = _coerce_optional_str(data.get('description'))
-
-        depends_on = _parse_depends_on(data.get('depends_on'))
-
         return cls(
             name=name,
-            description=description,
-            depends_on=depends_on,
+            description=_coerce_optional_str(data.get('description')),
+            depends_on=_parse_depends_on(data.get('depends_on')),
             extract=ExtractRef.from_obj(data.get('extract')),
             validate=ValidationRef.from_obj(data.get('validate')),
             transform=TransformRef.from_obj(data.get('transform')),
@@ -274,8 +266,7 @@ class LoadRef:
         data = maybe_mapping(obj)
         if not data:
             return None
-        target
-        if target is None:
+        if (target := _require_str(data, 'target')) is None:
             return None
         return cls(
             target=target,
@@ -321,8 +312,7 @@ class TransformRef:
         data = maybe_mapping(obj)
         if not data:
             return None
-        pipeline
-        if pipeline is None:
+        if (pipeline := _require_str(data, 'pipeline')) is None:
             return None
         return cls(pipeline=pipeline)

@@ -372,13 +362,10 @@ class ValidationRef:
         data = maybe_mapping(obj)
         if not data:
             return None
-        ruleset
-        if ruleset is None:
+        if (ruleset := _require_str(data, 'ruleset')) is None:
             return None
-        severity = _coerce_optional_str(data.get('severity'))
-        phase = _coerce_optional_str(data.get('phase'))
         return cls(
             ruleset=ruleset,
-            severity=severity,
-            phase=phase,
+            severity=_coerce_optional_str(data.get('severity')),
+            phase=_coerce_optional_str(data.get('phase')),
         )
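
The repeated refactor across the `from_obj` constructors swaps a two-step "fetch then check" for an assignment expression. A standalone sketch of the pattern, using a simplified stand-in for the module-private `_require_str` helper:

```python
from typing import Any


def _require_str(data: dict[str, Any], key: str) -> str | None:
    # Simplified stand-in: the real helper lives in etlplus/workflow/jobs.py.
    value = data.get(key)
    return value if isinstance(value, str) and value else None


def parse_target(data: dict[str, Any]) -> str | None:
    # Before: target = _require_str(data, 'target'); if target is None: ...
    # After (as in this diff): bind and test in one expression.
    if (target := _require_str(data, 'target')) is None:
        return None
    return target


print(parse_target({'target': 'warehouse'}))  # warehouse
print(parse_target({}))                       # None
```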