etlplus 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/__init__.py +43 -0
- etlplus/__main__.py +22 -0
- etlplus/__version__.py +14 -0
- etlplus/api/README.md +237 -0
- etlplus/api/__init__.py +136 -0
- etlplus/api/auth.py +432 -0
- etlplus/api/config.py +633 -0
- etlplus/api/endpoint_client.py +885 -0
- etlplus/api/errors.py +170 -0
- etlplus/api/pagination/__init__.py +47 -0
- etlplus/api/pagination/client.py +188 -0
- etlplus/api/pagination/config.py +440 -0
- etlplus/api/pagination/paginator.py +775 -0
- etlplus/api/rate_limiting/__init__.py +38 -0
- etlplus/api/rate_limiting/config.py +343 -0
- etlplus/api/rate_limiting/rate_limiter.py +266 -0
- etlplus/api/request_manager.py +589 -0
- etlplus/api/retry_manager.py +430 -0
- etlplus/api/transport.py +325 -0
- etlplus/api/types.py +172 -0
- etlplus/cli/__init__.py +15 -0
- etlplus/cli/app.py +1367 -0
- etlplus/cli/handlers.py +775 -0
- etlplus/cli/main.py +616 -0
- etlplus/config/__init__.py +56 -0
- etlplus/config/connector.py +372 -0
- etlplus/config/jobs.py +311 -0
- etlplus/config/pipeline.py +339 -0
- etlplus/config/profile.py +78 -0
- etlplus/config/types.py +204 -0
- etlplus/config/utils.py +120 -0
- etlplus/ddl.py +197 -0
- etlplus/enums.py +414 -0
- etlplus/extract.py +218 -0
- etlplus/file.py +657 -0
- etlplus/load.py +336 -0
- etlplus/mixins.py +62 -0
- etlplus/py.typed +0 -0
- etlplus/run.py +368 -0
- etlplus/run_helpers.py +843 -0
- etlplus/templates/__init__.py +5 -0
- etlplus/templates/ddl.sql.j2 +128 -0
- etlplus/templates/view.sql.j2 +69 -0
- etlplus/transform.py +1049 -0
- etlplus/types.py +227 -0
- etlplus/utils.py +638 -0
- etlplus/validate.py +493 -0
- etlplus/validation/__init__.py +44 -0
- etlplus/validation/utils.py +389 -0
- etlplus-0.5.4.dist-info/METADATA +616 -0
- etlplus-0.5.4.dist-info/RECORD +55 -0
- etlplus-0.5.4.dist-info/WHEEL +5 -0
- etlplus-0.5.4.dist-info/entry_points.txt +2 -0
- etlplus-0.5.4.dist-info/licenses/LICENSE +21 -0
- etlplus-0.5.4.dist-info/top_level.txt +1 -0
etlplus/transform.py
ADDED
@@ -0,0 +1,1049 @@
"""
:mod:`etlplus.transform` module.

Helpers to filter, map/rename, select, sort, aggregate, and otherwise
transform JSON-like records (dicts and lists of dicts).

The pipeline accepts both **string** names (e.g., ``"filter"``) and the
enum ``PipelineStep`` for operation keys. For operators and aggregates,
specs may provide **strings** (with aliases), the corresponding **enums**
``OperatorName`` / ``AggregateName``, or **callables**.

Examples
--------
Basic pipeline with strings::

    ops = {
        'filter': {'field': 'age', 'op': 'gte', 'value': 18},
        'map': {'first_name': 'name'},
        'select': ['name', 'age'],
        'sort': {'field': 'name'},
        'aggregate': {'field': 'age', 'func': 'avg', 'alias': 'avg_age'},
    }
    result = transform(data, ops)

Using enums for keys and functions::

    from .enums import PipelineStep, OperatorName, AggregateName
    ops = {
        PipelineStep.FILTER: {
            'field': 'age', 'op': OperatorName.GTE, 'value': 18
        },
        PipelineStep.AGGREGATE: {
            'field': 'age', 'func': AggregateName.AVG
        },
    }
    result = transform(data, ops)
"""

from __future__ import annotations

from collections.abc import Callable
from collections.abc import Mapping
from collections.abc import Sequence
from typing import Any
from typing import cast

from .enums import AggregateName
from .enums import OperatorName
from .enums import PipelineStep
from .load import load_data
from .types import AggregateFunc
from .types import AggregateSpec
from .types import FieldName
from .types import Fields
from .types import FilterSpec
from .types import JSONData
from .types import JSONDict
from .types import JSONList
from .types import MapSpec
from .types import OperatorFunc
from .types import PipelineConfig
from .types import PipelineStepName
from .types import SortKey
from .types import StepApplier
from .types import StepOrSteps
from .types import StepSpec
from .types import StrPath
from .utils import to_number

# SECTION: EXPORTS ========================================================== #


__all__ = [
    'apply_aggregate',
    'apply_filter',
    'apply_map',
    'apply_select',
    'apply_sort',
    'transform',
]

# SECTION: INTERNAL FUNCTIONS ============================================== #


# -- Aggregators -- #


def _agg_avg(
    nums: list[float],
    _: int,
) -> float:
    """
    Average of *nums* or ``0.0`` if empty.

    Parameters
    ----------
    nums : list[float]
        Numeric values to average.

    Returns
    -------
    float
        The average of the input numbers or ``0.0`` if empty.
    """
    return (sum(nums) / len(nums)) if nums else 0.0


def _agg_count(
    _: list[float],
    present: int,
) -> int:
    """
    Return the provided presence count ``present``.

    Parameters
    ----------
    present : int
        Count of present values.

    Returns
    -------
    int
        The provided presence count ``present``.
    """
    return present


def _agg_max(
    nums: list[float],
    _: int,
) -> float | None:
    """
    Maximum of *nums* or ``None`` if empty.

    Parameters
    ----------
    nums : list[float]
        Numeric values to consider.

    Returns
    -------
    float | None
        The maximum of the input numbers or ``None`` if empty.
    """
    return max(nums) if nums else None


def _agg_min(
    nums: list[float],
    _: int,
) -> float | None:
    """
    Minimum of *nums* or ``None`` if empty.

    Parameters
    ----------
    nums : list[float]
        Numeric values to consider.

    Returns
    -------
    float | None
        The minimum of the input numbers or ``None`` if empty.
    """
    return min(nums) if nums else None


def _agg_sum(
    nums: list[float],
    _: int,
) -> float:
    """
    Sum of *nums* (``0.0`` for empty).

    Parameters
    ----------
    nums : list[float]
        Numeric values to sum.

    Returns
    -------
    float
        The sum of the input numbers or ``0.0`` if empty.
    """
    return sum(nums)


# -- Normalization -- #


def _normalize_specs(
    config: StepOrSteps | None,
) -> list[StepSpec]:
    """
    Normalize a step config into a list of step specs.

    Parameters
    ----------
    config : StepOrSteps | None
        ``None``, a single mapping, or a sequence of mappings.

    Returns
    -------
    list[StepSpec]
        An empty list for ``None``, otherwise a list form of *config*.
    """
    if config is None:
        return []
    if isinstance(config, Sequence) and not isinstance(
        config,
        (str, bytes, bytearray),
    ):
        # Already a sequence of step specs; normalize to a list.
        return list(config)  # type: ignore[list-item]

    # Single spec
    return [config]


def _normalize_operation_keys(ops: Mapping[Any, Any]) -> dict[str, Any]:
    """
    Normalize pipeline operation keys to plain strings.

    Accepts both string keys (e.g., 'filter') and enum keys
    (PipelineStep.FILTER), returning a str->spec mapping.

    Parameters
    ----------
    ops : Mapping[Any, Any]
        Pipeline operations to normalize.

    Returns
    -------
    dict[str, Any]
        Dictionary whose keys are normalized step names.
    """
    normalized: dict[str, Any] = {}
    for k, v in ops.items():
        if isinstance(k, str):
            normalized[k] = v
        elif isinstance(k, PipelineStep):
            normalized[k.value] = v
        else:
            # Fallback: try `.value`, else use string form
            name = getattr(k, 'value', str(k))
            if isinstance(name, str):
                normalized[name] = v
    return normalized


# -- Predicates -- #


def _contains(
    container: Any,
    member: Any,
) -> bool:
    """
    Return ``True`` if *member* is contained in *container*.

    Parameters
    ----------
    container : Any
        Potential container object.
    member : Any
        Candidate member to check for containment.

    Returns
    -------
    bool
        ``True`` if ``member in container`` succeeds; ``False`` on
        ``TypeError`` or when containment fails.
    """
    try:
        return member in container  # type: ignore[operator]
    except TypeError:
        return False


def _has(
    member: Any,
    container: Any,
) -> bool:
    """
    Return ``True`` if *container* contains *member*.

    This is the dual form of :func:`_contains` for readability in certain
    operator contexts (``in`` vs. ``contains``).
    """
    return _contains(container, member)


# -- Resolvers -- #


def _resolve_aggregator(
    func: AggregateName | AggregateFunc | str,
) -> Callable:
    """
    Resolve an aggregate specifier to a callable.

    Parameters
    ----------
    func : AggregateName | AggregateFunc | str
        An :class:`AggregateName`, a string (with aliases), or a callable.

    Returns
    -------
    Callable
        Function of signature ``(xs: list[float], n: int) -> Any``.

    Raises
    ------
    TypeError
        If *func* cannot be interpreted as an aggregator.
    """
    if isinstance(func, AggregateName):
        return func.func
    if isinstance(func, str):
        return AggregateName.coerce(func).func
    if callable(func):
        return func

    raise TypeError(f'Invalid aggregate func: {func!r}')


def _resolve_operator(
    op: OperatorName | OperatorFunc | str,
) -> Callable:
    """
    Resolve an operator specifier to a binary predicate.

    Parameters
    ----------
    op : OperatorName | OperatorFunc | str
        An :class:`OperatorName`, a string (with aliases), or a callable.

    Returns
    -------
    Callable
        Function of signature ``(a: Any, b: Any) -> bool``.

    Raises
    ------
    TypeError
        If *op* cannot be interpreted as an operator.
    """

    def _wrap_numeric(op_name: OperatorName) -> Callable[[Any, Any], bool]:
        base = op_name.func
        if op_name in {
            OperatorName.GT,
            OperatorName.GTE,
            OperatorName.LT,
            OperatorName.LTE,
            OperatorName.EQ,
            OperatorName.NE,
        }:

            def compare(a: Any, b: Any) -> bool:  # noqa: ANN401 - generic
                a_num = to_number(a)
                b_num = to_number(b)
                if a_num is not None and b_num is not None:
                    return bool(base(a_num, b_num))
                return bool(base(a, b))

            return compare
        # Non-numeric operators: use base behavior
        return base

    if isinstance(op, OperatorName):
        return _wrap_numeric(op)
    if isinstance(op, str):
        return _wrap_numeric(OperatorName.coerce(op))
    if callable(op):
        return op

    raise TypeError(f'Invalid operator: {op!r}')


# -- Sorting -- #


def _sort_key(
    value: Any,
) -> SortKey:
    """
    Coerce mixed-type values into a sortable tuple key.

    Ordering policy
    ---------------
    1) Numbers
    2) Non-numeric values (stringified)
    3) ``None`` (last)

    Parameters
    ----------
    value : Any
        Value to normalize for sorting.

    Returns
    -------
    SortKey
        A key with a type tag to avoid cross-type comparisons.
    """
    if value is None:
        return (2, '')
    if isinstance(value, (int, float)):
        return (0, float(value))

    return (1, str(value))


# -- Aggregation and filtering -- #


def _collect_numeric_and_presence(
    rows: JSONList,
    field: FieldName | None,
) -> tuple[list[float], int]:
    """
    Collect numeric values and count presence of field in rows.

    If field is None, returns ([], len(rows)).

    Parameters
    ----------
    rows : JSONList
        Input records.
    field : FieldName | None
        Field name to check for presence.

    Returns
    -------
    tuple[list[float], int]
        A tuple containing a list of numeric values and the count of present
        fields.
    """
    if not field:
        return [], len(rows)

    nums: list[float] = []
    present = 0
    for r in rows:
        if field in r:
            present += 1
            v = r.get(field)
            if isinstance(v, (int, float)):
                nums.append(float(v))
    return nums, present


def _derive_agg_key(
    func_raw: AggregateName | AggregateFunc | str,
    field: FieldName | None,
    alias: Any,
) -> str:
    """
    Derive the output key name for an aggregate.

    Uses alias when provided; otherwise builds like "sum_amount" or "count".

    Parameters
    ----------
    func_raw : AggregateName | AggregateFunc | str
        The raw function specifier.
    field : FieldName | None
        The field being aggregated.
    alias : Any
        Optional alias for the output key.

    Returns
    -------
    str
        The derived output key name.
    """
    if alias is not None:
        return str(alias)

    if isinstance(func_raw, AggregateName):
        label = func_raw.value
    elif isinstance(func_raw, str):
        label = AggregateName.coerce(func_raw).value
    elif callable(func_raw):
        label = getattr(func_raw, '__name__', 'custom')
    else:
        label = str(func_raw)

    return label if not field else f'{label}_{field}'


def _eval_condition(
    record: JSONDict,
    field: FieldName,
    op_func: OperatorFunc,
    value: Any,
    catch_all: bool,
) -> bool:
    """
    Evaluate a filter condition on a record.

    Returns False if the field is missing or if the operator raises.

    Parameters
    ----------
    record : JSONDict
        The input record.
    field : FieldName
        The field name to check.
    op_func : OperatorFunc
        The binary operator function.
    value : Any
        The value to compare against.
    catch_all : bool
        If True, catch all exceptions and return; if False, propagate
        exceptions.

    Returns
    -------
    bool
        True if the condition is met; False if not.

    Raises
    ------
    Exception
        If *catch_all* is False and the operator raises.
    """
    try:
        lhs = record[field]
    except KeyError:
        return False

    try:
        return bool(op_func(lhs, value))
    except Exception:  # noqa: BLE001 - controlled by flag
        if catch_all:
            return False
        raise


# -- Step Appliers -- #


def _apply_aggregate_step(
    rows: JSONList,
    spec: AggregateSpec,
) -> JSONList:
    """
    Apply a single aggregate spec and return a one-row result list.

    Parameters
    ----------
    rows : JSONList
        Input records.
    spec : AggregateSpec
        Mapping with keys like ``{'field': 'amount', 'func': 'sum', 'alias':
        'total'}``.

    Returns
    -------
    JSONList
        A list containing one mapping ``[{alias: value}]``.
    """
    field: FieldName | None = spec.get('field')  # type: ignore[assignment]
    func_raw = spec.get('func', 'count')
    alias = spec.get('alias')

    agg_func = _resolve_aggregator(func_raw)
    xs, present = _collect_numeric_and_presence(rows, field)
    key = _derive_agg_key(func_raw, field, alias)
    result = agg_func(xs, present)
    return [{key: result}]


def _apply_filter_step(
    records: JSONList,
    spec: Any,
) -> JSONList:
    """
    Functional filter applier used by the pipeline engine.

    Parameters
    ----------
    records : JSONList
        Input records to filter.
    spec : Any
        Mapping with keys ``field``, ``op``, and ``value``. ``op`` may be a
        string, :class:`OperatorName`, or a callable.

    Returns
    -------
    JSONList
        Filtered records.
    """
    field: FieldName = spec.get('field')  # type: ignore[assignment]
    op = spec.get('op')
    value = spec.get('value')

    if not field:
        return records  # Or raise, depending on your policy.

    op_func = _resolve_operator(op)

    return [
        r
        for r in records
        if _eval_condition(r, field, op_func, value, catch_all=True)
    ]


def _apply_map_step(
    records: JSONList,
    spec: Any,
) -> JSONList:
    """
    Functional map/rename applier used by the pipeline engine.

    Parameters
    ----------
    records : JSONList
        Input records to transform.
    spec : Any
        Mapping of **old field names** to **new field names**.

    Returns
    -------
    JSONList
        Transformed records.
    """
    if isinstance(spec, Mapping):
        return apply_map(records, spec)

    return records


def _apply_select_step(
    records: JSONList,
    spec: Any,
) -> JSONList:
    """
    Functional select/project applier used by the pipeline engine.

    Parameters
    ----------
    records : JSONList
        Input records to transform.
    spec : Any
        Either a mapping with key ``'fields'`` whose value is a sequence of
        field names, or a plain sequence of field names.

    Returns
    -------
    JSONList
        Transformed data.
    """
    fields: Sequence[Any]
    if isinstance(spec, Mapping):
        maybe_fields = spec.get('fields')
        if not _is_plain_fields_list(maybe_fields):
            return records
        fields = cast(Sequence[Any], maybe_fields)
    elif _is_plain_fields_list(spec):
        fields = cast(Sequence[Any], spec)
    else:
        return records

    return apply_select(records, [str(field) for field in fields])


def _apply_sort_step(
    records: JSONList,
    spec: Any,
) -> JSONList:
    """
    Functional sort applier used by the pipeline engine.

    Parameters
    ----------
    records : JSONList
        Input records to sort.
    spec : Any
        Either a mapping with keys ``'field'`` and optional ``'reverse'``, or
        a plain field name.

    Returns
    -------
    JSONList
        Sorted records.
    """
    if isinstance(spec, Mapping):
        field_value = spec.get('field')
        field = str(field_value) if field_value is not None else None
        reverse = bool(spec.get('reverse', False))
        return apply_sort(records, field, reverse)

    if spec is None:
        return records

    return apply_sort(records, str(spec), False)


# -- Helpers -- #


def _is_plain_fields_list(obj: Any) -> bool:
    """
    Return True if obj is a non-text sequence of non-mapping items.

    Used to detect a list/tuple of field names like ['name', 'age'].

    Parameters
    ----------
    obj : Any
        The object to check.

    Returns
    -------
    bool
        True if obj is a non-text sequence of non-mapping items, False
        otherwise.
    """
    return (
        isinstance(obj, Sequence)
        and not isinstance(obj, (str, bytes, bytearray))
        and not any(isinstance(x, Mapping) for x in obj)
    )


# SECTION: INTERNAL CONSTANTS ============================================== #


_PIPELINE_STEPS: tuple[PipelineStepName, ...] = (
    'filter',
    'map',
    'select',
    'sort',
    'aggregate',
)


_STEP_APPLIERS: dict[PipelineStepName, StepApplier] = {
    'filter': _apply_filter_step,
    'map': _apply_map_step,
    'select': _apply_select_step,
    'sort': _apply_sort_step,
}


# SECTION: EXPORTS ========================================================== #


def apply_filter(
    records: JSONList,
    condition: FilterSpec,
) -> JSONList:
    """
    Filter a list of records by a simple condition.

    Parameters
    ----------
    records : JSONList
        Records to filter.
    condition : FilterSpec
        Condition object with keys ``field``, ``op``, and ``value``. The
        ``op`` can be one of ``'eq'``, ``'ne'``, ``'gt'``, ``'gte'``,
        ``'lt'``, ``'lte'``, ``'in'``, or ``'contains'``. Custom comparison
        logic can be provided by supplying a callable for ``op``.

    Returns
    -------
    JSONList
        Filtered records.
    """
    field = condition.get('field')
    op_raw = condition.get('op')
    value = condition.get('value')

    if not field or op_raw is None or value is None:
        return records

    try:
        op_func = cast(OperatorFunc, _resolve_operator(op_raw))
    except TypeError:
        return records

    result: JSONList = []
    for record in records:
        if field not in record:
            continue
        try:
            if _eval_condition(record, field, op_func, value, catch_all=False):
                result.append(record)
        except TypeError:
            # Skip records where the comparison is not supported.
            continue

    return result


def apply_map(
    records: JSONList,
    mapping: MapSpec,
) -> JSONList:
    """
    Map/rename fields in each record.

    Parameters
    ----------
    records : JSONList
        Records to transform.
    mapping : MapSpec
        Mapping of old field names to new field names.

    Returns
    -------
    JSONList
        New records with keys renamed. Unmapped fields are preserved.
    """
    rename_map = dict(mapping)
    result: JSONList = []

    for record in records:
        renamed = {
            new_key: record[old_key]
            for old_key, new_key in rename_map.items()
            if old_key in record
        }
        renamed.update(
            {
                key: value
                for key, value in record.items()
                if key not in rename_map
            },
        )
        result.append(renamed)

    return result


def apply_select(
    records: JSONList,
    fields: Fields,
) -> JSONList:
    """
    Keep only the requested fields in each record.

    Parameters
    ----------
    records : JSONList
        Records to project.
    fields : Fields
        Field names to retain.

    Returns
    -------
    JSONList
        Records containing the requested fields; missing fields are ``None``.
    """
    return [
        {field: record.get(field) for field in fields} for record in records
    ]


def apply_sort(
    records: JSONList,
    field: FieldName | None,
    reverse: bool = False,
) -> JSONList:
    """
    Sort records by a field.

    Parameters
    ----------
    records : JSONList
        Records to sort.
    field : FieldName | None
        Field name to sort by. If ``None``, input is returned unchanged.
    reverse : bool, optional
        Sort descending if ``True``. Default is ``False``.

    Returns
    -------
    JSONList
        Sorted records.
    """
    if not field:
        return records

    key_field: FieldName = field
    return sorted(
        records,
        key=lambda x: _sort_key(x.get(key_field)),
        reverse=reverse,
    )


def apply_aggregate(
    records: JSONList,
    operation: AggregateSpec,
) -> JSONDict:
    """
    Aggregate a numeric field or count presence.

    Parameters
    ----------
    records : JSONList
        Records to aggregate.
    operation : AggregateSpec
        Dict with keys ``field`` and ``func``. ``func`` is one of
        ``'sum'``, ``'avg'``, ``'min'``, ``'max'``, or ``'count'``.
        A callable may also be supplied for ``func``. Optionally, set
        ``alias`` to control the output key name.

    Returns
    -------
    JSONDict
        A single-row result like ``{"sum_age": 42}``.

    Notes
    -----
    Numeric operations ignore non-numeric values but count their presence
    for ``'count'``.
    """
    field = operation.get('field')
    func = operation.get('func')
    alias = operation.get('alias')

    if not field or func is None:
        return {'error': 'Invalid aggregation operation'}

    try:
        aggregator = _resolve_aggregator(func)
    except TypeError:
        return {'error': f'Unknown aggregation function: {func}'}

    nums, present = _collect_numeric_and_presence(records, field)
    key_name = _derive_agg_key(func, field, alias)
    return {key_name: aggregator(nums, present)}


def transform(
    source: StrPath | JSONData,
    operations: PipelineConfig | None = None,
) -> JSONData:
    """
    Transform data using optional filter/map/select/sort/aggregate steps.

    Parameters
    ----------
    source : StrPath | JSONData
        Data source to transform.
    operations : PipelineConfig | None, optional
        Operation dictionary that may contain the keys ``filter``, ``map``,
        ``select``, ``sort``, and ``aggregate`` with their respective
        configs. Each value may be a single config or a sequence of configs
        to apply in order. Aggregations accept multiple configs and merge
        the results.

    Returns
    -------
    JSONData
        Transformed data.

    Notes
    -----
    Operation keys may be provided as strings (e.g., ``"filter"``) or as
    :class:`PipelineStep` enum members. The aggregate step returns a **single
    mapping** with merged aggregate results when present.

    Examples
    --------
    Minimal example with multiple steps::

        ops = {
            'filter': {'field': 'age', 'op': 'gt', 'value': 18},
            'map': {'old_name': 'new_name'},
            'select': ['name', 'age'],
            'sort': {'field': 'name', 'reverse': False},
            'aggregate': {'field': 'age', 'func': 'avg'},
        }
        result = transform(data, ops)

    Using enums for keys and functions::

        from .enums import PipelineStep, OperatorName, AggregateName
        ops = {
            PipelineStep.FILTER: {
                'field': 'age', 'op': OperatorName.GTE, 'value': 18
            },
            PipelineStep.AGGREGATE: {
                'field': 'age', 'func': AggregateName.AVG
            },
        }
        result = transform(data, ops)
    """
    data = load_data(source)

    if not operations:
        return data

    ops = _normalize_operation_keys(operations)

    # Convert single dict to list for uniform processing.
    is_single_dict = isinstance(data, dict)
    if is_single_dict:
        data = [data]  # type: ignore[list-item]

    # All record-wise ops require a list of dicts.
    if isinstance(data, list):
        for step in _PIPELINE_STEPS:
            raw_spec = ops.get(step)
            if raw_spec is None:
                continue

            specs = _normalize_specs(raw_spec)
            if not specs:
                continue

            if step == 'aggregate':
                combined: JSONDict = {}
                for spec in specs:
                    if not isinstance(spec, Mapping):
                        continue
                    # Use enum-based applier that returns a single-row list
                    # like: [{alias: value}]
                    out_rows = _apply_aggregate_step(data, spec)
                    if out_rows and isinstance(out_rows[0], Mapping):
                        combined.update(cast(JSONDict, out_rows[0]))
                if combined:
                    return combined
                continue

            # Special-case: plain list/tuple of field names for 'select'.
            if step == 'select' and _is_plain_fields_list(raw_spec):
                # Keep the whole fields list as a single spec.
                specs = [cast(StepSpec, raw_spec)]

            applier: StepApplier | None = _STEP_APPLIERS.get(step)
            if applier is None:
                continue

            for spec in specs:
                data = applier(data, spec)

        # Convert back to single dict if input was single dict.
        if is_single_dict and isinstance(data, list) and len(data) == 1:
            return data[0]

    return data
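
For orientation, here is a minimal usage sketch of the transform() pipeline added in this file. It assumes the wheel above is installed; the sample records, field names, and expected outputs are invented for illustration and follow the module docstring rather than any test shipped with the package.

    # Usage sketch (illustrative data; assumes etlplus 0.5.4 is installed).
    from etlplus.transform import transform

    records = [
        {'name': 'Ada', 'age': 36},
        {'name': 'Grace', 'age': 17},
        {'name': 'Alan', 'age': 41},
    ]

    # String-keyed pipeline, as in the module docstring above.
    rows = transform(
        records,
        {
            'filter': {'field': 'age', 'op': 'gte', 'value': 18},
            'select': ['name', 'age'],
            'sort': {'field': 'age', 'reverse': True},
        },
    )
    # Expected: [{'name': 'Alan', 'age': 41}, {'name': 'Ada', 'age': 36}]

    # An 'aggregate' step short-circuits and returns one merged mapping.
    summary = transform(
        records,
        {'aggregate': {'field': 'age', 'func': 'avg', 'alias': 'avg_age'}},
    )
    # Expected: {'avg_age': 31.33...} (average of 36, 17, 41)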