etlplus 0.15.0__py3-none-any.whl → 0.16.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/README.md +25 -3
- etlplus/__init__.py +2 -0
- etlplus/api/README.md +31 -0
- etlplus/api/__init__.py +14 -14
- etlplus/api/auth.py +10 -7
- etlplus/api/config.py +8 -13
- etlplus/api/endpoint_client.py +20 -20
- etlplus/api/errors.py +4 -4
- etlplus/api/pagination/__init__.py +6 -6
- etlplus/api/pagination/config.py +12 -10
- etlplus/api/pagination/paginator.py +6 -7
- etlplus/api/rate_limiting/__init__.py +2 -2
- etlplus/api/rate_limiting/config.py +14 -14
- etlplus/api/rate_limiting/rate_limiter.py +3 -3
- etlplus/api/request_manager.py +4 -4
- etlplus/api/retry_manager.py +8 -8
- etlplus/api/transport.py +11 -11
- etlplus/api/types.py +131 -11
- etlplus/api/utils.py +50 -50
- etlplus/cli/commands.py +93 -60
- etlplus/cli/constants.py +1 -1
- etlplus/cli/handlers.py +43 -26
- etlplus/cli/io.py +2 -2
- etlplus/cli/main.py +2 -2
- etlplus/cli/state.py +4 -7
- etlplus/{workflow/pipeline.py → config.py} +62 -99
- etlplus/connector/__init__.py +43 -0
- etlplus/connector/api.py +161 -0
- etlplus/connector/connector.py +26 -0
- etlplus/connector/core.py +132 -0
- etlplus/connector/database.py +122 -0
- etlplus/connector/enums.py +52 -0
- etlplus/connector/file.py +120 -0
- etlplus/connector/types.py +40 -0
- etlplus/connector/utils.py +122 -0
- etlplus/database/ddl.py +2 -2
- etlplus/database/engine.py +19 -3
- etlplus/database/orm.py +2 -0
- etlplus/enums.py +36 -200
- etlplus/file/_imports.py +1 -0
- etlplus/file/_io.py +52 -4
- etlplus/file/accdb.py +3 -2
- etlplus/file/arrow.py +3 -2
- etlplus/file/avro.py +3 -2
- etlplus/file/bson.py +3 -2
- etlplus/file/cbor.py +3 -2
- etlplus/file/cfg.py +3 -2
- etlplus/file/conf.py +3 -2
- etlplus/file/core.py +11 -8
- etlplus/file/csv.py +3 -2
- etlplus/file/dat.py +3 -2
- etlplus/file/dta.py +3 -2
- etlplus/file/duckdb.py +3 -2
- etlplus/file/enums.py +1 -1
- etlplus/file/feather.py +3 -2
- etlplus/file/fwf.py +3 -2
- etlplus/file/gz.py +3 -2
- etlplus/file/hbs.py +3 -2
- etlplus/file/hdf5.py +3 -2
- etlplus/file/ini.py +3 -2
- etlplus/file/ion.py +3 -2
- etlplus/file/jinja2.py +3 -2
- etlplus/file/json.py +5 -16
- etlplus/file/log.py +3 -2
- etlplus/file/mat.py +3 -2
- etlplus/file/mdb.py +3 -2
- etlplus/file/msgpack.py +3 -2
- etlplus/file/mustache.py +3 -2
- etlplus/file/nc.py +3 -2
- etlplus/file/ndjson.py +3 -2
- etlplus/file/numbers.py +3 -2
- etlplus/file/ods.py +3 -2
- etlplus/file/orc.py +3 -2
- etlplus/file/parquet.py +3 -2
- etlplus/file/pb.py +3 -2
- etlplus/file/pbf.py +3 -2
- etlplus/file/properties.py +3 -2
- etlplus/file/proto.py +3 -2
- etlplus/file/psv.py +3 -2
- etlplus/file/rda.py +3 -2
- etlplus/file/rds.py +3 -2
- etlplus/file/sas7bdat.py +3 -2
- etlplus/file/sav.py +3 -2
- etlplus/file/sqlite.py +3 -2
- etlplus/file/stub.py +1 -0
- etlplus/file/sylk.py +3 -2
- etlplus/file/tab.py +3 -2
- etlplus/file/toml.py +3 -2
- etlplus/file/tsv.py +3 -2
- etlplus/file/txt.py +4 -3
- etlplus/file/vm.py +3 -2
- etlplus/file/wks.py +3 -2
- etlplus/file/xls.py +3 -2
- etlplus/file/xlsm.py +3 -2
- etlplus/file/xlsx.py +3 -2
- etlplus/file/xml.py +9 -3
- etlplus/file/xpt.py +3 -2
- etlplus/file/yaml.py +5 -16
- etlplus/file/zip.py +3 -2
- etlplus/file/zsav.py +3 -2
- etlplus/ops/__init__.py +1 -0
- etlplus/ops/enums.py +173 -0
- etlplus/ops/extract.py +222 -23
- etlplus/ops/load.py +155 -36
- etlplus/ops/run.py +92 -107
- etlplus/ops/transform.py +48 -29
- etlplus/ops/types.py +147 -0
- etlplus/ops/utils.py +11 -40
- etlplus/ops/validate.py +16 -16
- etlplus/types.py +6 -102
- etlplus/utils.py +163 -29
- etlplus/workflow/README.md +0 -24
- etlplus/workflow/__init__.py +2 -15
- etlplus/workflow/dag.py +23 -1
- etlplus/workflow/jobs.py +83 -39
- etlplus/workflow/profile.py +4 -2
- {etlplus-0.15.0.dist-info → etlplus-0.16.6.dist-info}/METADATA +4 -4
- etlplus-0.16.6.dist-info/RECORD +143 -0
- {etlplus-0.15.0.dist-info → etlplus-0.16.6.dist-info}/WHEEL +1 -1
- etlplus/config/README.md +0 -50
- etlplus/config/__init__.py +0 -33
- etlplus/config/types.py +0 -140
- etlplus/dag.py +0 -103
- etlplus/workflow/connector.py +0 -373
- etlplus/workflow/types.py +0 -115
- etlplus/workflow/utils.py +0 -120
- etlplus-0.15.0.dist-info/RECORD +0 -139
- {etlplus-0.15.0.dist-info → etlplus-0.16.6.dist-info}/entry_points.txt +0 -0
- {etlplus-0.15.0.dist-info → etlplus-0.16.6.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.15.0.dist-info → etlplus-0.16.6.dist-info}/top_level.txt +0 -0
etlplus/ops/run.py
CHANGED

```diff
@@ -6,31 +6,23 @@ A module for running ETL jobs defined in YAML configurations.
 
 from __future__ import annotations
 
-from collections.abc import Mapping
 from typing import Any
 from typing import Final
 from typing import cast
-from urllib.parse import urlsplit
-from urllib.parse import urlunsplit
 
-from ..api import EndpointClient  # noqa: F401 (re-exported for tests)
 from ..api import HttpMethod
-from ..api import PaginationConfigMap
-from ..api import RequestOptions
-from ..api import compose_api_request_env
-from ..api import compose_api_target_env
-from ..api import paginate_with_client
-from ..enums import DataConnectorType
+from ..config import Config
+from ..connector import DataConnectorType
 from ..file import FileFormat
+from ..ops.types import PipelineConfig
 from ..types import JSONData
 from ..types import JSONDict
-from ..types import PipelineConfig
 from ..types import StrPath
-from ..types import Timeout
 from ..utils import print_json
-from ..workflow import load_pipeline_config
 from .extract import extract
+from .extract import extract_from_api_source
 from .load import load
+from .load import load_to_api_target
 from .transform import transform
 from .utils import maybe_validate
 from .validate import validate
@@ -54,6 +46,75 @@ DEFAULT_CONFIG_PATH: Final[str] = 'in/pipeline.yml'
 # SECTION: INTERNAL FUNCTIONS =============================================== #
 
 
+def _index_connectors(
+    connectors: list[Any],
+    *,
+    label: str,
+) -> dict[str, Any]:
+    """
+    Index connectors by name with a helpful error on duplicates.
+
+    Parameters
+    ----------
+    connectors : list[Any]
+        Connector objects to index.
+    label : str
+        Label used in error messages (e.g., ``"source"``).
+
+    Returns
+    -------
+    dict[str, Any]
+        Mapping of connector names to connector objects.
+
+    Raises
+    ------
+    ValueError
+        If duplicate connector names are found.
+    """
+    indexed: dict[str, Any] = {}
+    for connector in connectors:
+        name = getattr(connector, 'name', None)
+        if not isinstance(name, str) or not name:
+            continue
+        if name in indexed:
+            raise ValueError(f'Duplicate {label} connector name: {name}')
+        indexed[name] = connector
+    return indexed
+
+
+def _require_named_connector(
+    connectors: dict[str, Any],
+    name: str,
+    *,
+    label: str,
+) -> Any:
+    """
+    Return a connector by name or raise a helpful error.
+
+    Parameters
+    ----------
+    connectors : dict[str, Any]
+        Mapping of connector names to connector objects.
+    name : str
+        Connector name to retrieve.
+    label : str
+        Label used in error messages (e.g., ``"source"``).
+
+    Returns
+    -------
+    Any
+        Connector object.
+
+    Raises
+    ------
+    ValueError
+        If the connector name is not found.
+    """
+    if name not in connectors:
+        raise ValueError(f'Unknown {label}: {name}')
+    return connectors[name]
+
+
 def _resolve_validation_config(
     job_obj: Any,
     cfg: Any,
@@ -94,7 +155,7 @@ def run(
     Run a pipeline job defined in a YAML configuration.
 
     By default it reads the configuration from ``in/pipeline.yml``, but callers
-    can provide an explicit
+    can provide an explicit *config_path* to override this.
 
     Parameters
     ----------
@@ -115,23 +176,25 @@ def run(
         If the job is not found or if there are configuration issues.
     """
     cfg_path = config_path or DEFAULT_CONFIG_PATH
-    cfg = load_pipeline_config(cfg_path)
+    cfg = Config.from_yaml(cfg_path, substitute=True)
 
     # Lookup job by name
     if not (job_obj := next((j for j in cfg.jobs if j.name == job), None)):
         raise ValueError(f'Job not found: {job}')
 
     # Index sources/targets by name
-    sources_by_name =
-    targets_by_name =
+    sources_by_name = _index_connectors(cfg.sources, label='source')
+    targets_by_name = _index_connectors(cfg.targets, label='target')
 
     # Extract.
     if not job_obj.extract:
         raise ValueError('Job missing "extract" section')
     source_name = job_obj.extract.source
-
-
-
+    source_obj = _require_named_connector(
+        sources_by_name,
+        source_name,
+        label='source',
+    )
     ex_opts: dict[str, Any] = job_obj.extract.options or {}
 
     data: Any
@@ -151,68 +214,7 @@ def run(
             conn = getattr(source_obj, 'connection_string', '')
             data = extract('database', conn)
         case DataConnectorType.API:
-
-            if (
-                env.get('use_endpoints')
-                and env.get('base_url')
-                and env.get('endpoints_map')
-                and env.get('endpoint_key')
-            ):
-                # Construct client using module-level EndpointClient so tests
-                # can monkeypatch this class on etlplus.ops.run.
-                ClientClass = EndpointClient  # noqa: N806
-                client = ClientClass(
-                    base_url=cast(str, env.get('base_url')),
-                    base_path=cast(str | None, env.get('base_path')),
-                    endpoints=cast(
-                        dict[str, str],
-                        env.get('endpoints_map', {}),
-                    ),
-                    retry=env.get('retry'),
-                    retry_network_errors=bool(
-                        env.get('retry_network_errors', False),
-                    ),
-                    session=env.get('session'),
-                )
-                data = paginate_with_client(
-                    client,
-                    cast(str, env.get('endpoint_key')),
-                    env.get('params'),
-                    env.get('headers'),
-                    env.get('timeout'),
-                    env.get('pagination'),
-                    cast(float | None, env.get('sleep_seconds')),
-                )
-            else:
-                url = env.get('url')
-                if not url:
-                    raise ValueError('API source missing URL')
-                parts = urlsplit(cast(str, url))
-                base = urlunsplit((parts.scheme, parts.netloc, '', '', ''))
-                ClientClass = EndpointClient  # noqa: N806
-                client = ClientClass(
-                    base_url=base,
-                    base_path=None,
-                    endpoints={},
-                    retry=env.get('retry'),
-                    retry_network_errors=bool(
-                        env.get('retry_network_errors', False),
-                    ),
-                    session=env.get('session'),
-                )
-
-                request_options = RequestOptions(
-                    params=cast(Mapping[str, Any] | None, env.get('params')),
-                    headers=cast(Mapping[str, str] | None, env.get('headers')),
-                    timeout=cast(Timeout | None, env.get('timeout')),
-                )
-
-                data = client.paginate_url(
-                    cast(str, url),
-                    cast(PaginationConfigMap | None, env.get('pagination')),
-                    request=request_options,
-                    sleep_seconds=cast(float, env.get('sleep_seconds', 0.0)),
-                )
+            data = extract_from_api_source(cfg, source_obj, ex_opts)
         case _:
             # :meth:`coerce` already raises for invalid connector types, but
             # keep explicit guard for defensive programming.
@@ -256,9 +258,11 @@ def run(
     if not job_obj.load:
         raise ValueError('Job missing "load" section')
     target_name = job_obj.load.target
-
-
-
+    target_obj = _require_named_connector(
+        targets_by_name,
+        target_name,
+        label='target',
+    )
     overrides = job_obj.load.overrides or {}
 
     ttype_raw = getattr(target_obj, 'type', None)
@@ -274,26 +278,7 @@ def run(
                 raise ValueError('File target missing "path"')
             result = load(data, 'file', path, file_format=fmt)
         case DataConnectorType.API:
-
-            url_t = env_t.get('url')
-            if not url_t:
-                raise ValueError('API target missing "url"')
-            kwargs_t: dict[str, Any] = {}
-            headers = env_t.get('headers')
-            if headers:
-                kwargs_t['headers'] = cast(dict[str, str], headers)
-            if env_t.get('timeout') is not None:
-                kwargs_t['timeout'] = env_t.get('timeout')
-            session = env_t.get('session')
-            if session is not None:
-                kwargs_t['session'] = session
-            result = load(
-                data,
-                'api',
-                cast(str, url_t),
-                method=cast(str | Any, env_t.get('method') or 'post'),
-                **kwargs_t,
-            )
+            result = load_to_api_target(cfg, target_obj, overrides, data)
         case DataConnectorType.DATABASE:
             conn = overrides.get('connection_string') or getattr(
                 target_obj,
@@ -328,11 +313,11 @@ def run_pipeline(
     Parameters
     ----------
     source_type : DataConnectorType | str | None, optional
-        Connector type for extraction. When ``None``,
+        Connector type for extraction. When ``None``, *source* is assumed
        to be pre-loaded data and extraction is skipped.
     source : StrPath | JSONData | None, optional
         Data source for extraction or the pre-loaded payload when
-
+        *source_type* is ``None``.
     operations : PipelineConfig | None, optional
         Transform configuration passed to :func:`etlplus.ops.transform`.
     target_type : DataConnectorType | str | None, optional
```
etlplus/ops/transform.py
CHANGED

```diff
@@ -44,28 +44,28 @@ from collections.abc import Sequence
 from typing import Any
 from typing import cast
 
-from ..enums import AggregateName
-from ..enums import OperatorName
-from ..enums import PipelineStep
-from ..types import AggregateFunc
-from ..types import AggregateSpec
-from ..types import FieldName
-from ..types import Fields
-from ..types import FilterSpec
+from ..ops.types import PipelineConfig
 from ..types import JSONData
 from ..types import JSONDict
 from ..types import JSONList
-from ..types import MapSpec
-from ..types import OperatorFunc
-from ..types import PipelineConfig
-from ..types import PipelineStepName
-from ..types import SortKey
-from ..types import StepApplier
-from ..types import StepOrSteps
-from ..types import StepSpec
 from ..types import StrPath
 from ..utils import to_number
+from .enums import AggregateName
+from .enums import OperatorName
+from .enums import PipelineStep
 from .load import load_data
+from .types import AggregateFunc
+from .types import AggregateSpec
+from .types import FieldName
+from .types import Fields
+from .types import FilterSpec
+from .types import MapSpec
+from .types import OperatorFunc
+from .types import PipelineStepName
+from .types import SortKey
+from .types import StepApplier
+from .types import StepOrSteps
+from .types import StepSpec
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -110,7 +110,7 @@ def _agg_count(
     present: int,
 ) -> int:
     """
-    Return the provided presence count
+    Return the provided presence count *present*.
 
     Parameters
     ----------
@@ -120,7 +120,7 @@ def _agg_count(
     Returns
     -------
     int
-        The provided presence count
+        The provided presence count *present*.
     """
     return present
 
@@ -206,15 +206,12 @@ def _normalize_specs(
     """
     if config is None:
         return []
-    if isinstance(config, Sequence) and not isinstance(
-        config,
-        (str, bytes, bytearray),
-    ):
+    if _is_sequence_not_text(config):
         # Already a sequence of step specs; normalize to a list.
-        return list(config)
+        return list(cast(Sequence[StepSpec], config))
 
     # Single spec
-    return [config]
+    return [cast(StepSpec, config)]
 
 
 def _normalize_operation_keys(ops: Mapping[Any, Any]) -> dict[str, Any]:
@@ -702,7 +699,31 @@ def _apply_sort_step(
 # -- Helpers -- #
 
 
-def _is_plain_fields_list(obj: Any) -> bool:
+def _is_sequence_not_text(
+    obj: Any,
+) -> bool:
+    """
+    Return ``True`` for non-text sequences.
+
+    Parameters
+    ----------
+    obj : Any
+        The object to check.
+
+    Returns
+    -------
+    bool
+        ``True`` when *obj* is a non-text sequence.
+    """
+    return isinstance(obj, Sequence) and not isinstance(
+        obj,
+        (str, bytes, bytearray),
+    )
+
+
+def _is_plain_fields_list(
+    obj: Any,
+) -> bool:
     """
     Return True if obj is a non-text sequence of non-mapping items.
 
@@ -719,10 +740,8 @@ def _is_plain_fields_list(obj: Any) -> bool:
         True if obj is a non-text sequence of non-mapping items, False
         otherwise.
     """
-    return (
-        isinstance(obj, Sequence)
-        and not isinstance(obj, (str, bytes, bytearray))
-        and not any(isinstance(x, Mapping) for x in obj)
+    return _is_sequence_not_text(obj) and not any(
+        isinstance(x, Mapping) for x in obj
     )
 
 
```
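
The new `_is_sequence_not_text` helper deduplicates the text-vs-sequence check that `_normalize_specs` and `_is_plain_fields_list` previously spelled out inline. A quick illustration of its semantics; the helper is private to `etlplus.ops.transform`, so this snippet inlines its body for demonstration:

```python
from collections.abc import Sequence

def _is_sequence_not_text(obj) -> bool:
    # Body mirrors the helper added in this release.
    return isinstance(obj, Sequence) and not isinstance(
        obj,
        (str, bytes, bytearray),
    )

# Lists and tuples of step specs count as sequences of specs...
assert _is_sequence_not_text([{'field': 'x', 'op': 'eq', 'value': 1}])
assert _is_sequence_not_text(('a', 'b'))
# ...while text types (which are also Sequence) and mappings do not.
assert not _is_sequence_not_text('field')
assert not _is_sequence_not_text(b'bytes')
assert not _is_sequence_not_text({'field': 'x'})  # a single spec, not a list
```
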
etlplus/ops/types.py
ADDED

```diff
@@ -0,0 +1,147 @@
+"""
+:mod:`etlplus.ops.types` module.
+
+Shared type aliases leveraged across :mod:`etlplus.ops` modules.
+
+Notes
+-----
+- Centralizes ops-focused aliases (functions, specs, and pipeline helpers).
+- Relies on Python 3.13 ``type`` statements for readability and IDE support.
+
+Examples
+--------
+>>> from etlplus.ops.types import AggregateFunc, OperatorFunc
+>>> def total(xs: list[float], _: int) -> float:
+...     return sum(xs)
+>>> agg: AggregateFunc = total
+>>> op: OperatorFunc = lambda a, b: a == b
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from collections.abc import Mapping
+from collections.abc import Sequence
+from typing import Any
+from typing import Literal
+
+from ..types import JSONList
+from ..types import StrAnyMap
+from ..types import StrSeqMap
+from ..types import StrStrMap
+
+# SECTION: EXPORTS ========================================================== #
+
+
+__all__ = [
+    # Type Aliases (Functions)
+    'AggregateFunc',
+    'OperatorFunc',
+    # Type Aliases (Records & Fields)
+    'FieldName',
+    'Fields',
+    # Type Aliases (Transform Specs)
+    'AggregateSpec',
+    'FilterSpec',
+    'MapSpec',
+    'SelectSpec',
+    'SortSpec',
+    # Type Aliases (Pipelines)
+    'StepOrSteps',
+    'StepSeq',
+    'StepSpec',
+    'PipelineConfig',
+    'PipelineStepName',
+    # Type Aliases (Helpers)
+    'StepApplier',
+    'SortKey',
+]
+
+
+# SECTION: TYPE ALIASES ===================================================== #
+
+
+# -- Functions -- #
+
+
+# TODO: Consider redefining to use `functools.reduce` signature.
+# TODO: Consider adding `**kwargs` to support richer aggregation functions.
+# TODO: Consider constraining first argument to `Sequence[float]`.
+# TODO: Consider constraining return type to `float | int | None`.
+# Callable reducing numeric collections into a summary value.
+type AggregateFunc = Callable[[list[float], int], Any]
+
+# Binary predicate consumed by filter operations.
+type OperatorFunc = Callable[[Any, Any], bool]
+
+# -- Records & Fields -- #
+
+# Individual field identifier referenced inside specs.
+type FieldName = str
+
+# Ordered list of :data:`FieldName` entries preserving projection order.
+type Fields = list[FieldName]
+
+# -- Transform Specs -- #
+
+# Filtering spec expecting ``field``, ``op``, and ``value`` keys.
+type FilterSpec = StrAnyMap
+
+# Field renaming instructions mapping old keys to new ones.
+type MapSpec = StrStrMap
+
+# Projection spec as a field list or mapping with metadata.
+#
+# Examples
+# --------
+# >>> from etlplus.ops.types import SelectSpec
+# >>> spec1: SelectSpec = ['a','b']
+# >>> spec2: SelectSpec = {'fields': [...]}
+type SelectSpec = Fields | StrSeqMap
+
+# Sort directive expressed as a field string or mapping with flags.
+#
+# Examples
+# --------
+# >>> from etlplus.ops.types import SortSpec
+# >>> spec1: SortSpec = 'field'
+# >>> spec2: SortSpec = {'field': 'x', 'reverse': True}
type SortSpec = str | StrAnyMap
+
+# Aggregate instruction covering ``field``, ``func``, and optional alias.
+#
+# Supported functions: ``avg``, ``count``, ``max``, ``min``, and ``sum``.
+# Examples
+# --------
+# >>> from etlplus.ops.types import AggregateSpec
+# >>> spec: AggregateSpec = \
+# ...     {'field': 'x', 'func': 'sum' | 'avg' | ..., 'alias'?: '...'}
+type AggregateSpec = StrAnyMap
+
+# -- Pipelines-- #
+
+# Unified pipeline step spec consumed by :mod:`etlplus.ops.transform`.
+type StepSpec = AggregateSpec | FilterSpec | MapSpec | SelectSpec | SortSpec
+
+# Collections of steps
+
+# Ordered collection of :data:`StepSpec` entries.
+type StepSeq = Sequence[StepSpec]
+
+# Accepts either a single :data:`StepSpec` or a sequence of them.
+type StepOrSteps = StepSpec | StepSeq
+
+# Canonical literal names for supported transform stages.
+type PipelineStepName = Literal['aggregate', 'filter', 'map', 'select', 'sort']
+
+# Mapping from step name to its associated specification payload.
+# TODO: Consider replacing with etlplus.workflow.types.PipelineConfig.
+type PipelineConfig = Mapping[PipelineStepName, StepOrSteps]
+
+# -- Helpers -- #
+
+# Callable that applies step configuration to a batch of records.
+type StepApplier = Callable[[JSONList, Any], JSONList]
+
+# Tuple combining stable sort index and computed sort value.
+type SortKey = tuple[int, Any]
```
etlplus/ops/utils.py
CHANGED

```diff
@@ -7,13 +7,11 @@ The helpers defined here embrace a "high cohesion, low coupling" design by
 isolating normalization, configuration, and logging responsibilities. The
 resulting surface keeps ``maybe_validate`` focused on orchestration while
 offloading ancillary concerns to composable helpers.
-
 """
 
 from __future__ import annotations
 
 from collections.abc import Callable
-from collections.abc import Mapping
 from dataclasses import dataclass
 from types import MappingProxyType
 from typing import Any
@@ -23,12 +21,12 @@ from typing import TypedDict
 from typing import cast
 
 from ..types import StrAnyMap
-from ..utils import normalized_str
+from ..utils import normalize_choice
 
 # SECTION: TYPED DICTIONARIES =============================================== #
 
 
-class
+class ValidationResultDict(TypedDict, total=False):
     """Shape returned by ``validate_fn`` callables."""
 
     valid: bool
@@ -46,7 +44,7 @@ type ValidationPhase = Literal['before_transform', 'after_transform']
 type ValidationWindow = Literal['before_transform', 'after_transform', 'both']
 type ValidationSeverity = Literal['warn', 'error']
 
-type ValidateFn = Callable[[Any, Ruleset],
+type ValidateFn = Callable[[Any, Ruleset], ValidationResultDict]
 type PrintFn = Callable[[Any], None]
 
 
@@ -200,21 +198,21 @@ def maybe_validate(
         Failure severity (``"warn"`` or ``"error"``).
     validate_fn : ValidateFn
         Engine that performs validation and returns a
-        :class:`
+        :class:`ValidationResultDict` instance.
     print_json_fn : PrintFn
         Structured logger invoked when validation fails.
 
     Returns
     -------
     Any
-
+        *payload* when validation is skipped or when severity is ``"warn"``
         and the validation fails. Returns the validator ``data`` payload when
         validation succeeds.
 
     Raises
     ------
     ValueError
-        Raised when validation fails and
+        Raised when validation fails and *severity* is ``"error"``.
 
     Examples
     --------
@@ -272,7 +270,7 @@ def _log_failure(
     phase: ValidationPhase,
     window: ValidationWindow,
     ruleset_name: str | None,
-    result:
+    result: ValidationResultDict,
 ) -> None:
     """
     Emit a structured message describing the failed validation.
@@ -287,7 +285,7 @@ def _log_failure(
         Configured validation window.
     ruleset_name : str | None
         Name of the validation ruleset.
-    result :
+    result : ValidationResultDict
         Result of the failed validation.
     """
     printer(
@@ -320,7 +318,7 @@ def _normalize_phase(
     """
     return cast(
         ValidationPhase,
-        _normalize_choice(
+        normalize_choice(
            value,
            mapping=_PHASE_CHOICES,
            default='before_transform',
@@ -346,7 +344,7 @@ def _normalize_severity(
     """
     return cast(
         ValidationSeverity,
-        _normalize_choice(
+        normalize_choice(
            value,
            mapping=_SEVERITY_CHOICES,
            default='error',
@@ -372,7 +370,7 @@ def _normalize_window(
     """
     return cast(
         ValidationWindow,
-        _normalize_choice(
+        normalize_choice(
            value,
            mapping=_WINDOW_CHOICES,
            default='both',
@@ -380,33 +378,6 @@ def _normalize_window(
     )
 
 
-def _normalize_choice(
-    value: str | None,
-    *,
-    mapping: Mapping[str, str],
-    default: str,
-) -> str:
-    """
-    Normalize a text value against a mapping with a default fallback.
-
-    Parameters
-    ----------
-    value : str | None
-        Input text to normalize.
-    mapping : Mapping[str, str]
-        Mapping of accepted values to normalized outputs.
-    default : str
-        Default to return when input is missing or unrecognized.
-
-    Returns
-    -------
-    str
-        Normalized value.
-    """
-    normalized = normalized_str(value)
-    return mapping.get(normalized, default)
-
-
 def _rule_name(
     rules: Ruleset,
 ) -> str | None:
```