etlplus 0.15.0__py3-none-any.whl → 0.16.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. etlplus/README.md +25 -3
  2. etlplus/__init__.py +2 -0
  3. etlplus/api/README.md +31 -0
  4. etlplus/api/__init__.py +14 -14
  5. etlplus/api/auth.py +10 -7
  6. etlplus/api/config.py +8 -13
  7. etlplus/api/endpoint_client.py +20 -20
  8. etlplus/api/errors.py +4 -4
  9. etlplus/api/pagination/__init__.py +6 -6
  10. etlplus/api/pagination/config.py +12 -10
  11. etlplus/api/pagination/paginator.py +6 -7
  12. etlplus/api/rate_limiting/__init__.py +2 -2
  13. etlplus/api/rate_limiting/config.py +14 -14
  14. etlplus/api/rate_limiting/rate_limiter.py +3 -3
  15. etlplus/api/request_manager.py +4 -4
  16. etlplus/api/retry_manager.py +8 -8
  17. etlplus/api/transport.py +11 -11
  18. etlplus/api/types.py +131 -11
  19. etlplus/api/utils.py +50 -50
  20. etlplus/cli/commands.py +93 -60
  21. etlplus/cli/constants.py +1 -1
  22. etlplus/cli/handlers.py +43 -26
  23. etlplus/cli/io.py +2 -2
  24. etlplus/cli/main.py +2 -2
  25. etlplus/cli/state.py +4 -7
  26. etlplus/{workflow/pipeline.py → config.py} +62 -99
  27. etlplus/connector/__init__.py +43 -0
  28. etlplus/connector/api.py +161 -0
  29. etlplus/connector/connector.py +26 -0
  30. etlplus/connector/core.py +132 -0
  31. etlplus/connector/database.py +122 -0
  32. etlplus/connector/enums.py +52 -0
  33. etlplus/connector/file.py +120 -0
  34. etlplus/connector/types.py +40 -0
  35. etlplus/connector/utils.py +122 -0
  36. etlplus/database/ddl.py +2 -2
  37. etlplus/database/engine.py +19 -3
  38. etlplus/database/orm.py +2 -0
  39. etlplus/enums.py +36 -200
  40. etlplus/file/_imports.py +1 -0
  41. etlplus/file/_io.py +52 -4
  42. etlplus/file/accdb.py +3 -2
  43. etlplus/file/arrow.py +3 -2
  44. etlplus/file/avro.py +3 -2
  45. etlplus/file/bson.py +3 -2
  46. etlplus/file/cbor.py +3 -2
  47. etlplus/file/cfg.py +3 -2
  48. etlplus/file/conf.py +3 -2
  49. etlplus/file/core.py +11 -8
  50. etlplus/file/csv.py +3 -2
  51. etlplus/file/dat.py +3 -2
  52. etlplus/file/dta.py +3 -2
  53. etlplus/file/duckdb.py +3 -2
  54. etlplus/file/enums.py +1 -1
  55. etlplus/file/feather.py +3 -2
  56. etlplus/file/fwf.py +3 -2
  57. etlplus/file/gz.py +3 -2
  58. etlplus/file/hbs.py +3 -2
  59. etlplus/file/hdf5.py +3 -2
  60. etlplus/file/ini.py +3 -2
  61. etlplus/file/ion.py +3 -2
  62. etlplus/file/jinja2.py +3 -2
  63. etlplus/file/json.py +5 -16
  64. etlplus/file/log.py +3 -2
  65. etlplus/file/mat.py +3 -2
  66. etlplus/file/mdb.py +3 -2
  67. etlplus/file/msgpack.py +3 -2
  68. etlplus/file/mustache.py +3 -2
  69. etlplus/file/nc.py +3 -2
  70. etlplus/file/ndjson.py +3 -2
  71. etlplus/file/numbers.py +3 -2
  72. etlplus/file/ods.py +3 -2
  73. etlplus/file/orc.py +3 -2
  74. etlplus/file/parquet.py +3 -2
  75. etlplus/file/pb.py +3 -2
  76. etlplus/file/pbf.py +3 -2
  77. etlplus/file/properties.py +3 -2
  78. etlplus/file/proto.py +3 -2
  79. etlplus/file/psv.py +3 -2
  80. etlplus/file/rda.py +3 -2
  81. etlplus/file/rds.py +3 -2
  82. etlplus/file/sas7bdat.py +3 -2
  83. etlplus/file/sav.py +3 -2
  84. etlplus/file/sqlite.py +3 -2
  85. etlplus/file/stub.py +1 -0
  86. etlplus/file/sylk.py +3 -2
  87. etlplus/file/tab.py +3 -2
  88. etlplus/file/toml.py +3 -2
  89. etlplus/file/tsv.py +3 -2
  90. etlplus/file/txt.py +4 -3
  91. etlplus/file/vm.py +3 -2
  92. etlplus/file/wks.py +3 -2
  93. etlplus/file/xls.py +3 -2
  94. etlplus/file/xlsm.py +3 -2
  95. etlplus/file/xlsx.py +3 -2
  96. etlplus/file/xml.py +9 -3
  97. etlplus/file/xpt.py +3 -2
  98. etlplus/file/yaml.py +5 -16
  99. etlplus/file/zip.py +3 -2
  100. etlplus/file/zsav.py +3 -2
  101. etlplus/ops/__init__.py +1 -0
  102. etlplus/ops/enums.py +173 -0
  103. etlplus/ops/extract.py +222 -23
  104. etlplus/ops/load.py +155 -36
  105. etlplus/ops/run.py +92 -107
  106. etlplus/ops/transform.py +48 -29
  107. etlplus/ops/types.py +147 -0
  108. etlplus/ops/utils.py +11 -40
  109. etlplus/ops/validate.py +16 -16
  110. etlplus/types.py +6 -102
  111. etlplus/utils.py +163 -29
  112. etlplus/workflow/README.md +0 -24
  113. etlplus/workflow/__init__.py +2 -15
  114. etlplus/workflow/dag.py +23 -1
  115. etlplus/workflow/jobs.py +83 -39
  116. etlplus/workflow/profile.py +4 -2
  117. {etlplus-0.15.0.dist-info → etlplus-0.16.6.dist-info}/METADATA +4 -4
  118. etlplus-0.16.6.dist-info/RECORD +143 -0
  119. {etlplus-0.15.0.dist-info → etlplus-0.16.6.dist-info}/WHEEL +1 -1
  120. etlplus/config/README.md +0 -50
  121. etlplus/config/__init__.py +0 -33
  122. etlplus/config/types.py +0 -140
  123. etlplus/dag.py +0 -103
  124. etlplus/workflow/connector.py +0 -373
  125. etlplus/workflow/types.py +0 -115
  126. etlplus/workflow/utils.py +0 -120
  127. etlplus-0.15.0.dist-info/RECORD +0 -139
  128. {etlplus-0.15.0.dist-info → etlplus-0.16.6.dist-info}/entry_points.txt +0 -0
  129. {etlplus-0.15.0.dist-info → etlplus-0.16.6.dist-info}/licenses/LICENSE +0 -0
  130. {etlplus-0.15.0.dist-info → etlplus-0.16.6.dist-info}/top_level.txt +0 -0
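The headline change in this release is structural: connector handling moves into a new etlplus.connector package, pipeline configuration moves from etlplus/workflow/pipeline.py to a top-level etlplus/config.py, and ops-specific enums and type aliases move under etlplus/ops. For consumers this mostly means new import paths, as the diffs below show. A minimal migration sketch based on the import changes in etlplus/ops/run.py (whether the old names remain re-exported anywhere is not shown in this diff):

    # 0.15.0
    # from etlplus.enums import DataConnectorType
    # from etlplus.workflow import load_pipeline_config
    # cfg = load_pipeline_config('in/pipeline.yml', substitute=True)

    # 0.16.6
    from etlplus.config import Config
    from etlplus.connector import DataConnectorType

    cfg = Config.from_yaml('in/pipeline.yml', substitute=True)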
etlplus/ops/run.py CHANGED
@@ -6,31 +6,23 @@ A module for running ETL jobs defined in YAML configurations.
 
 from __future__ import annotations
 
-from collections.abc import Mapping
 from typing import Any
 from typing import Final
 from typing import cast
-from urllib.parse import urlsplit
-from urllib.parse import urlunsplit
 
-from ..api import EndpointClient  # noqa: F401 (re-exported for tests)
 from ..api import HttpMethod
-from ..api import PaginationConfigMap
-from ..api import RequestOptions
-from ..api import compose_api_request_env
-from ..api import compose_api_target_env
-from ..api import paginate_with_client
-from ..enums import DataConnectorType
+from ..config import Config
+from ..connector import DataConnectorType
 from ..file import FileFormat
+from ..ops.types import PipelineConfig
 from ..types import JSONData
 from ..types import JSONDict
-from ..types import PipelineConfig
 from ..types import StrPath
-from ..types import Timeout
 from ..utils import print_json
-from ..workflow import load_pipeline_config
 from .extract import extract
+from .extract import extract_from_api_source
 from .load import load
+from .load import load_to_api_target
 from .transform import transform
 from .utils import maybe_validate
 from .validate import validate
@@ -54,6 +46,75 @@ DEFAULT_CONFIG_PATH: Final[str] = 'in/pipeline.yml'
 # SECTION: INTERNAL FUNCTIONS =============================================== #
 
 
+def _index_connectors(
+    connectors: list[Any],
+    *,
+    label: str,
+) -> dict[str, Any]:
+    """
+    Index connectors by name with a helpful error on duplicates.
+
+    Parameters
+    ----------
+    connectors : list[Any]
+        Connector objects to index.
+    label : str
+        Label used in error messages (e.g., ``"source"``).
+
+    Returns
+    -------
+    dict[str, Any]
+        Mapping of connector names to connector objects.
+
+    Raises
+    ------
+    ValueError
+        If duplicate connector names are found.
+    """
+    indexed: dict[str, Any] = {}
+    for connector in connectors:
+        name = getattr(connector, 'name', None)
+        if not isinstance(name, str) or not name:
+            continue
+        if name in indexed:
+            raise ValueError(f'Duplicate {label} connector name: {name}')
+        indexed[name] = connector
+    return indexed
+
+
+def _require_named_connector(
+    connectors: dict[str, Any],
+    name: str,
+    *,
+    label: str,
+) -> Any:
+    """
+    Return a connector by name or raise a helpful error.
+
+    Parameters
+    ----------
+    connectors : dict[str, Any]
+        Mapping of connector names to connector objects.
+    name : str
+        Connector name to retrieve.
+    label : str
+        Label used in error messages (e.g., ``"source"``).
+
+    Returns
+    -------
+    Any
+        Connector object.
+
+    Raises
+    ------
+    ValueError
+        If the connector name is not found.
+    """
+    if name not in connectors:
+        raise ValueError(f'Unknown {label}: {name}')
+    return connectors[name]
+
+
 def _resolve_validation_config(
     job_obj: Any,
     cfg: Any,
@@ -94,7 +155,7 @@ def run(
     Run a pipeline job defined in a YAML configuration.
 
     By default it reads the configuration from ``in/pipeline.yml``, but callers
-    can provide an explicit ``config_path`` to override this.
+    can provide an explicit *config_path* to override this.
 
     Parameters
     ----------
@@ -115,23 +176,25 @@ def run(
         If the job is not found or if there are configuration issues.
     """
     cfg_path = config_path or DEFAULT_CONFIG_PATH
-    cfg = load_pipeline_config(cfg_path, substitute=True)
+    cfg = Config.from_yaml(cfg_path, substitute=True)
 
     # Lookup job by name
     if not (job_obj := next((j for j in cfg.jobs if j.name == job), None)):
         raise ValueError(f'Job not found: {job}')
 
     # Index sources/targets by name
-    sources_by_name = {getattr(s, 'name', None): s for s in cfg.sources}
-    targets_by_name = {getattr(t, 'name', None): t for t in cfg.targets}
+    sources_by_name = _index_connectors(cfg.sources, label='source')
+    targets_by_name = _index_connectors(cfg.targets, label='target')
 
     # Extract.
     if not job_obj.extract:
         raise ValueError('Job missing "extract" section')
     source_name = job_obj.extract.source
-    if source_name not in sources_by_name:
-        raise ValueError(f'Unknown source: {source_name}')
-    source_obj = sources_by_name[source_name]
+    source_obj = _require_named_connector(
+        sources_by_name,
+        source_name,
+        label='source',
+    )
     ex_opts: dict[str, Any] = job_obj.extract.options or {}
 
     data: Any
@@ -151,68 +214,7 @@
             conn = getattr(source_obj, 'connection_string', '')
             data = extract('database', conn)
         case DataConnectorType.API:
-            env = compose_api_request_env(cfg, source_obj, ex_opts)
-            if (
-                env.get('use_endpoints')
-                and env.get('base_url')
-                and env.get('endpoints_map')
-                and env.get('endpoint_key')
-            ):
-                # Construct client using module-level EndpointClient so tests
-                # can monkeypatch this class on etlplus.ops.run.
-                ClientClass = EndpointClient  # noqa: N806
-                client = ClientClass(
-                    base_url=cast(str, env.get('base_url')),
-                    base_path=cast(str | None, env.get('base_path')),
-                    endpoints=cast(
-                        dict[str, str],
-                        env.get('endpoints_map', {}),
-                    ),
-                    retry=env.get('retry'),
-                    retry_network_errors=bool(
-                        env.get('retry_network_errors', False),
-                    ),
-                    session=env.get('session'),
-                )
-                data = paginate_with_client(
-                    client,
-                    cast(str, env.get('endpoint_key')),
-                    env.get('params'),
-                    env.get('headers'),
-                    env.get('timeout'),
-                    env.get('pagination'),
-                    cast(float | None, env.get('sleep_seconds')),
-                )
-            else:
-                url = env.get('url')
-                if not url:
-                    raise ValueError('API source missing URL')
-                parts = urlsplit(cast(str, url))
-                base = urlunsplit((parts.scheme, parts.netloc, '', '', ''))
-                ClientClass = EndpointClient  # noqa: N806
-                client = ClientClass(
-                    base_url=base,
-                    base_path=None,
-                    endpoints={},
-                    retry=env.get('retry'),
-                    retry_network_errors=bool(
-                        env.get('retry_network_errors', False),
-                    ),
-                    session=env.get('session'),
-                )
-
-                request_options = RequestOptions(
-                    params=cast(Mapping[str, Any] | None, env.get('params')),
-                    headers=cast(Mapping[str, str] | None, env.get('headers')),
-                    timeout=cast(Timeout | None, env.get('timeout')),
-                )
-
-                data = client.paginate_url(
-                    cast(str, url),
-                    cast(PaginationConfigMap | None, env.get('pagination')),
-                    request=request_options,
-                    sleep_seconds=cast(float, env.get('sleep_seconds', 0.0)),
-                )
+            data = extract_from_api_source(cfg, source_obj, ex_opts)
         case _:
             # :meth:`coerce` already raises for invalid connector types, but
             # keep explicit guard for defensive programming.
@@ -256,9 +258,11 @@
     if not job_obj.load:
        raise ValueError('Job missing "load" section')
     target_name = job_obj.load.target
-    if target_name not in targets_by_name:
-        raise ValueError(f'Unknown target: {target_name}')
-    target_obj = targets_by_name[target_name]
+    target_obj = _require_named_connector(
+        targets_by_name,
+        target_name,
+        label='target',
+    )
     overrides = job_obj.load.overrides or {}
 
     ttype_raw = getattr(target_obj, 'type', None)
@@ -274,26 +278,7 @@
                 raise ValueError('File target missing "path"')
             result = load(data, 'file', path, file_format=fmt)
         case DataConnectorType.API:
-            env_t = compose_api_target_env(cfg, target_obj, overrides)
-            url_t = env_t.get('url')
-            if not url_t:
-                raise ValueError('API target missing "url"')
-            kwargs_t: dict[str, Any] = {}
-            headers = env_t.get('headers')
-            if headers:
-                kwargs_t['headers'] = cast(dict[str, str], headers)
-            if env_t.get('timeout') is not None:
-                kwargs_t['timeout'] = env_t.get('timeout')
-            session = env_t.get('session')
-            if session is not None:
-                kwargs_t['session'] = session
-            result = load(
-                data,
-                'api',
-                cast(str, url_t),
-                method=cast(str | Any, env_t.get('method') or 'post'),
-                **kwargs_t,
-            )
+            result = load_to_api_target(cfg, target_obj, overrides, data)
         case DataConnectorType.DATABASE:
             conn = overrides.get('connection_string') or getattr(
                 target_obj,
@@ -328,11 +313,11 @@ def run_pipeline(
     Parameters
     ----------
     source_type : DataConnectorType | str | None, optional
-        Connector type for extraction. When ``None``, ``source`` is assumed
+        Connector type for extraction. When ``None``, *source* is assumed
         to be pre-loaded data and extraction is skipped.
     source : StrPath | JSONData | None, optional
         Data source for extraction or the pre-loaded payload when
-        ``source_type`` is ``None``.
+        *source_type* is ``None``.
     operations : PipelineConfig | None, optional
        Transform configuration passed to :func:`etlplus.ops.transform`.
     target_type : DataConnectorType | str | None, optional
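Note: the new _index_connectors helper replaces a plain dict comprehension, so duplicate connector names now raise instead of silently overwriting earlier entries, and unnamed connectors are skipped. A self-contained sketch of that behavior, mirroring the helper above (SimpleNamespace stands in for the package's connector objects):

    from types import SimpleNamespace
    from typing import Any

    def _index_connectors(connectors: list[Any], *, label: str) -> dict[str, Any]:
        # Mirrors the helper added in etlplus/ops/run.py above.
        indexed: dict[str, Any] = {}
        for connector in connectors:
            name = getattr(connector, 'name', None)
            if not isinstance(name, str) or not name:
                continue  # unnamed connectors are skipped, not errors
            if name in indexed:
                raise ValueError(f'Duplicate {label} connector name: {name}')
            indexed[name] = connector
        return indexed

    sources = [SimpleNamespace(name='api'), SimpleNamespace(name='api')]
    try:
        _index_connectors(sources, label='source')
    except ValueError as exc:
        print(exc)  # Duplicate source connector name: api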
etlplus/ops/transform.py CHANGED
@@ -44,28 +44,28 @@ from collections.abc import Sequence
 from typing import Any
 from typing import cast
 
-from ..enums import AggregateName
-from ..enums import OperatorName
-from ..enums import PipelineStep
-from ..types import AggregateFunc
-from ..types import AggregateSpec
-from ..types import FieldName
-from ..types import Fields
-from ..types import FilterSpec
+from ..ops.types import PipelineConfig
 from ..types import JSONData
 from ..types import JSONDict
 from ..types import JSONList
-from ..types import MapSpec
-from ..types import OperatorFunc
-from ..types import PipelineConfig
-from ..types import PipelineStepName
-from ..types import SortKey
-from ..types import StepApplier
-from ..types import StepOrSteps
-from ..types import StepSpec
 from ..types import StrPath
 from ..utils import to_number
+from .enums import AggregateName
+from .enums import OperatorName
+from .enums import PipelineStep
 from .load import load_data
+from .types import AggregateFunc
+from .types import AggregateSpec
+from .types import FieldName
+from .types import Fields
+from .types import FilterSpec
+from .types import MapSpec
+from .types import OperatorFunc
+from .types import PipelineStepName
+from .types import SortKey
+from .types import StepApplier
+from .types import StepOrSteps
+from .types import StepSpec
 
 # SECTION: EXPORTS ========================================================== #
@@ -110,7 +110,7 @@ def _agg_count(
     present: int,
 ) -> int:
     """
-    Return the provided presence count ``present``.
+    Return the provided presence count *present*.
 
     Parameters
     ----------
@@ -120,7 +120,7 @@ def _agg_count(
     Returns
     -------
     int
-        The provided presence count ``present``.
+        The provided presence count *present*.
     """
     return present
 
@@ -206,15 +206,12 @@
     """
     if config is None:
         return []
-    if isinstance(config, Sequence) and not isinstance(
-        config,
-        (str, bytes, bytearray),
-    ):
+    if _is_sequence_not_text(config):
         # Already a sequence of step specs; normalize to a list.
-        return list(config)  # type: ignore[list-item]
+        return list(cast(Sequence[StepSpec], config))
 
     # Single spec
-    return [config]
+    return [cast(StepSpec, config)]
 
 
 def _normalize_operation_keys(ops: Mapping[Any, Any]) -> dict[str, Any]:
@@ -702,7 +699,31 @@
 # -- Helpers -- #
 
 
-def _is_plain_fields_list(obj: Any) -> bool:
+def _is_sequence_not_text(
+    obj: Any,
+) -> bool:
+    """
+    Return ``True`` for non-text sequences.
+
+    Parameters
+    ----------
+    obj : Any
+        The object to check.
+
+    Returns
+    -------
+    bool
+        ``True`` when *obj* is a non-text sequence.
+    """
+    return isinstance(obj, Sequence) and not isinstance(
+        obj,
+        (str, bytes, bytearray),
+    )
+
+
+def _is_plain_fields_list(
+    obj: Any,
+) -> bool:
     """
     Return True if obj is a non-text sequence of non-mapping items.
 
@@ -719,10 +740,8 @@ def _is_plain_fields_list(obj: Any) -> bool:
         True if obj is a non-text sequence of non-mapping items, False
         otherwise.
     """
-    return (
-        isinstance(obj, Sequence)
-        and not isinstance(obj, (str, bytes, bytearray))
-        and not any(isinstance(x, Mapping) for x in obj)
+    return _is_sequence_not_text(obj) and not any(
+        isinstance(x, Mapping) for x in obj
     )
 
 
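Note: _is_sequence_not_text factors out a check that _normalize_specs and _is_plain_fields_list previously duplicated. The subtlety it encodes is that str, bytes, and bytearray are themselves Sequence instances, so a bare isinstance(obj, Sequence) test would treat a single field name as a sequence of specs. A quick demonstration of the extracted predicate:

    from collections.abc import Sequence

    def _is_sequence_not_text(obj: object) -> bool:
        # Same logic as the helper added in etlplus/ops/transform.py above.
        return isinstance(obj, Sequence) and not isinstance(
            obj,
            (str, bytes, bytearray),
        )

    print(isinstance('field', Sequence))            # True: str is a Sequence
    print(_is_sequence_not_text('field'))           # False: a single spec
    print(_is_sequence_not_text([{'field': 'x'}]))  # True: a list of specs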
etlplus/ops/types.py ADDED
@@ -0,0 +1,147 @@
+"""
+:mod:`etlplus.ops.types` module.
+
+Shared type aliases leveraged across :mod:`etlplus.ops` modules.
+
+Notes
+-----
+- Centralizes ops-focused aliases (functions, specs, and pipeline helpers).
+- Relies on Python 3.13 ``type`` statements for readability and IDE support.
+
+Examples
+--------
+>>> from etlplus.ops.types import AggregateFunc, OperatorFunc
+>>> def total(xs: list[float], _: int) -> float:
+...     return sum(xs)
+>>> agg: AggregateFunc = total
+>>> op: OperatorFunc = lambda a, b: a == b
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from collections.abc import Mapping
+from collections.abc import Sequence
+from typing import Any
+from typing import Literal
+
+from ..types import JSONList
+from ..types import StrAnyMap
+from ..types import StrSeqMap
+from ..types import StrStrMap
+
+# SECTION: EXPORTS ========================================================== #
+
+
+__all__ = [
+    # Type Aliases (Functions)
+    'AggregateFunc',
+    'OperatorFunc',
+    # Type Aliases (Records & Fields)
+    'FieldName',
+    'Fields',
+    # Type Aliases (Transform Specs)
+    'AggregateSpec',
+    'FilterSpec',
+    'MapSpec',
+    'SelectSpec',
+    'SortSpec',
+    # Type Aliases (Pipelines)
+    'StepOrSteps',
+    'StepSeq',
+    'StepSpec',
+    'PipelineConfig',
+    'PipelineStepName',
+    # Type Aliases (Helpers)
+    'StepApplier',
+    'SortKey',
+]
+
+
+# SECTION: TYPE ALIASES ===================================================== #
+
+
+# -- Functions -- #
+
+
+# TODO: Consider redefining to use `functools.reduce` signature.
+# TODO: Consider adding `**kwargs` to support richer aggregation functions.
+# TODO: Consider constraining first argument to `Sequence[float]`.
+# TODO: Consider constraining return type to `float | int | None`.
+# Callable reducing numeric collections into a summary value.
+type AggregateFunc = Callable[[list[float], int], Any]
+
+# Binary predicate consumed by filter operations.
+type OperatorFunc = Callable[[Any, Any], bool]
+
+# -- Records & Fields -- #
+
+# Individual field identifier referenced inside specs.
+type FieldName = str
+
+# Ordered list of :data:`FieldName` entries preserving projection order.
+type Fields = list[FieldName]
+
+# -- Transform Specs -- #
+
+# Filtering spec expecting ``field``, ``op``, and ``value`` keys.
+type FilterSpec = StrAnyMap
+
+# Field renaming instructions mapping old keys to new ones.
+type MapSpec = StrStrMap
+
+# Projection spec as a field list or mapping with metadata.
+#
+# Examples
+# --------
+# >>> from etlplus.ops.types import SelectSpec
+# >>> spec1: SelectSpec = ['a','b']
+# >>> spec2: SelectSpec = {'fields': [...]}
+type SelectSpec = Fields | StrSeqMap
+
+# Sort directive expressed as a field string or mapping with flags.
+#
+# Examples
+# --------
+# >>> from etlplus.ops.types import SortSpec
+# >>> spec1: SortSpec = 'field'
+# >>> spec2: SortSpec = {'field': 'x', 'reverse': True}
+type SortSpec = str | StrAnyMap
+
+# Aggregate instruction covering ``field``, ``func``, and optional alias.
+#
+# Supported functions: ``avg``, ``count``, ``max``, ``min``, and ``sum``.
+# Examples
+# --------
+# >>> from etlplus.ops.types import AggregateSpec
+# >>> spec: AggregateSpec = \
+# ...     {'field': 'x', 'func': 'sum' | 'avg' | ..., 'alias'?: '...'}
+type AggregateSpec = StrAnyMap
+
+# -- Pipelines-- #
+
+# Unified pipeline step spec consumed by :mod:`etlplus.ops.transform`.
+type StepSpec = AggregateSpec | FilterSpec | MapSpec | SelectSpec | SortSpec
+
+# Collections of steps
+
+# Ordered collection of :data:`StepSpec` entries.
+type StepSeq = Sequence[StepSpec]
+
+# Accepts either a single :data:`StepSpec` or a sequence of them.
+type StepOrSteps = StepSpec | StepSeq
+
+# Canonical literal names for supported transform stages.
+type PipelineStepName = Literal['aggregate', 'filter', 'map', 'select', 'sort']
+
+# Mapping from step name to its associated specification payload.
+# TODO: Consider replacing with etlplus.workflow.types.PipelineConfig.
+type PipelineConfig = Mapping[PipelineStepName, StepOrSteps]
+
+# -- Helpers -- #
+
+# Callable that applies step configuration to a batch of records.
+type StepApplier = Callable[[JSONList, Any], JSONList]
+
+# Tuple combining stable sort index and computed sort value.
+type SortKey = tuple[int, Any]
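Taken together, these aliases describe the shape of a transform pipeline: a PipelineConfig maps step names to a single spec or a sequence of specs. A hedged usage sketch (the spec keys follow the alias comments above; the exact operator token accepted under 'op' is an assumption here):

    from etlplus.ops.types import PipelineConfig

    # 'filter' carries a sequence of specs (StepOrSteps); the other steps
    # each carry a single spec.
    pipeline: PipelineConfig = {
        'filter': [{'field': 'age', 'op': 'gte', 'value': 18}],  # op token assumed
        'map': {'fname': 'first_name'},
        'select': ['first_name', 'age'],
        'sort': {'field': 'age', 'reverse': True},
        'aggregate': {'field': 'age', 'func': 'avg', 'alias': 'avg_age'},
    }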
etlplus/ops/utils.py CHANGED
@@ -7,13 +7,11 @@ The helpers defined here embrace a "high cohesion, low coupling" design by
 isolating normalization, configuration, and logging responsibilities. The
 resulting surface keeps ``maybe_validate`` focused on orchestration while
 offloading ancillary concerns to composable helpers.
-
 """
 
 from __future__ import annotations
 
 from collections.abc import Callable
-from collections.abc import Mapping
 from dataclasses import dataclass
 from types import MappingProxyType
 from typing import Any
@@ -23,12 +21,12 @@ from typing import TypedDict
 from typing import cast
 
 from ..types import StrAnyMap
-from ..utils import normalized_str
+from ..utils import normalize_choice
 
 # SECTION: TYPED DICTIONARIES =============================================== #
 
 
-class ValidationResult(TypedDict, total=False):
+class ValidationResultDict(TypedDict, total=False):
     """Shape returned by ``validate_fn`` callables."""
 
     valid: bool
@@ -46,7 +44,7 @@ type ValidationPhase = Literal['before_transform', 'after_transform']
 type ValidationWindow = Literal['before_transform', 'after_transform', 'both']
 type ValidationSeverity = Literal['warn', 'error']
 
-type ValidateFn = Callable[[Any, Ruleset], ValidationResult]
+type ValidateFn = Callable[[Any, Ruleset], ValidationResultDict]
 type PrintFn = Callable[[Any], None]
 
 
@@ -200,21 +198,21 @@ def maybe_validate(
         Failure severity (``"warn"`` or ``"error"``).
     validate_fn : ValidateFn
         Engine that performs validation and returns a
-        :class:`ValidationResult` instance.
+        :class:`ValidationResultDict` instance.
     print_json_fn : PrintFn
         Structured logger invoked when validation fails.
 
     Returns
     -------
     Any
-        ``payload`` when validation is skipped or when severity is ``"warn"``
+        *payload* when validation is skipped or when severity is ``"warn"``
        and the validation fails. Returns the validator ``data`` payload when
        validation succeeds.
 
     Raises
     ------
     ValueError
-        Raised when validation fails and ``severity`` is ``"error"``.
+        Raised when validation fails and *severity* is ``"error"``.
 
     Examples
     --------
@@ -272,7 +270,7 @@
     phase: ValidationPhase,
     window: ValidationWindow,
     ruleset_name: str | None,
-    result: ValidationResult,
+    result: ValidationResultDict,
 ) -> None:
     """
     Emit a structured message describing the failed validation.
@@ -287,7 +285,7 @@
         Configured validation window.
     ruleset_name : str | None
         Name of the validation ruleset.
-    result : ValidationResult
+    result : ValidationResultDict
         Result of the failed validation.
     """
     printer(
@@ -320,7 +318,7 @@
     """
     return cast(
         ValidationPhase,
-        _normalize_choice(
+        normalize_choice(
             value,
             mapping=_PHASE_CHOICES,
             default='before_transform',
@@ -346,7 +344,7 @@
     """
     return cast(
         ValidationSeverity,
-        _normalize_choice(
+        normalize_choice(
             value,
             mapping=_SEVERITY_CHOICES,
             default='error',
@@ -372,7 +370,7 @@
     """
     return cast(
         ValidationWindow,
-        _normalize_choice(
+        normalize_choice(
             value,
             mapping=_WINDOW_CHOICES,
             default='both',
@@ -380,33 +378,6 @@
     )
 
 
-def _normalize_choice(
-    value: str | None,
-    *,
-    mapping: Mapping[str, str],
-    default: str,
-) -> str:
-    """
-    Normalize a text value against a mapping with a default fallback.
-
-    Parameters
-    ----------
-    value : str | None
-        Input text to normalize.
-    mapping : Mapping[str, str]
-        Mapping of accepted values to normalized outputs.
-    default : str
-        Default to return when input is missing or unrecognized.
-
-    Returns
-    -------
-    str
-        Normalized value.
-    """
-    normalized = normalized_str(value)
-    return mapping.get(normalized, default)
-
-
 def _rule_name(
     rules: Ruleset,
 ) -> str | None:
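Note: the private _normalize_choice helper is removed in favor of a shared normalize_choice imported from etlplus.utils (part of the +163 lines in etlplus/utils.py listed above). Its body is not shown in this diff, but based on the deleted implementation it should be equivalent to the following sketch; normalized_str here is a simplified stand-in, and the real choice mappings stay private to etlplus.ops.utils:

    from collections.abc import Mapping

    def normalized_str(value: str | None) -> str:
        # Stand-in; the real etlplus.utils.normalized_str may differ.
        return (value or '').strip().lower()

    def normalize_choice(
        value: str | None,
        *,
        mapping: Mapping[str, str],
        default: str,
    ) -> str:
        # Same contract as the deleted _normalize_choice above.
        return mapping.get(normalized_str(value), default)

    severity = {'warn': 'warn', 'error': 'error'}  # illustrative mapping
    print(normalize_choice(' WARN ', mapping=severity, default='error'))  # warn
    print(normalize_choice(None, mapping=severity, default='error'))     # error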