etlplus 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. etlplus/__init__.py +43 -0
  2. etlplus/__main__.py +22 -0
  3. etlplus/__version__.py +14 -0
  4. etlplus/api/README.md +237 -0
  5. etlplus/api/__init__.py +136 -0
  6. etlplus/api/auth.py +432 -0
  7. etlplus/api/config.py +633 -0
  8. etlplus/api/endpoint_client.py +885 -0
  9. etlplus/api/errors.py +170 -0
  10. etlplus/api/pagination/__init__.py +47 -0
  11. etlplus/api/pagination/client.py +188 -0
  12. etlplus/api/pagination/config.py +440 -0
  13. etlplus/api/pagination/paginator.py +775 -0
  14. etlplus/api/rate_limiting/__init__.py +38 -0
  15. etlplus/api/rate_limiting/config.py +343 -0
  16. etlplus/api/rate_limiting/rate_limiter.py +266 -0
  17. etlplus/api/request_manager.py +589 -0
  18. etlplus/api/retry_manager.py +430 -0
  19. etlplus/api/transport.py +325 -0
  20. etlplus/api/types.py +172 -0
  21. etlplus/cli/__init__.py +15 -0
  22. etlplus/cli/app.py +1367 -0
  23. etlplus/cli/handlers.py +775 -0
  24. etlplus/cli/main.py +616 -0
  25. etlplus/config/__init__.py +56 -0
  26. etlplus/config/connector.py +372 -0
  27. etlplus/config/jobs.py +311 -0
  28. etlplus/config/pipeline.py +339 -0
  29. etlplus/config/profile.py +78 -0
  30. etlplus/config/types.py +204 -0
  31. etlplus/config/utils.py +120 -0
  32. etlplus/ddl.py +197 -0
  33. etlplus/enums.py +414 -0
  34. etlplus/extract.py +218 -0
  35. etlplus/file.py +657 -0
  36. etlplus/load.py +336 -0
  37. etlplus/mixins.py +62 -0
  38. etlplus/py.typed +0 -0
  39. etlplus/run.py +368 -0
  40. etlplus/run_helpers.py +843 -0
  41. etlplus/templates/__init__.py +5 -0
  42. etlplus/templates/ddl.sql.j2 +128 -0
  43. etlplus/templates/view.sql.j2 +69 -0
  44. etlplus/transform.py +1049 -0
  45. etlplus/types.py +227 -0
  46. etlplus/utils.py +638 -0
  47. etlplus/validate.py +493 -0
  48. etlplus/validation/__init__.py +44 -0
  49. etlplus/validation/utils.py +389 -0
  50. etlplus-0.5.4.dist-info/METADATA +616 -0
  51. etlplus-0.5.4.dist-info/RECORD +55 -0
  52. etlplus-0.5.4.dist-info/WHEEL +5 -0
  53. etlplus-0.5.4.dist-info/entry_points.txt +2 -0
  54. etlplus-0.5.4.dist-info/licenses/LICENSE +21 -0
  55. etlplus-0.5.4.dist-info/top_level.txt +1 -0
etlplus/transform.py ADDED
@@ -0,0 +1,1049 @@
1
+ """
2
+ :mod:`etlplus.transform` module.
3
+
4
+ Helpers to filter, map/rename, select, sort, aggregate, and otherwise
5
+ transform JSON-like records (dicts and lists of dicts).
6
+
7
+ The pipeline accepts both **string** names (e.g., ``"filter"``) and the
8
+ enum ``PipelineStep`` for operation keys. For operators and aggregates,
9
+ specs may provide **strings** (with aliases), the corresponding **enums**
10
+ ``OperatorName`` / ``AggregateName``, or **callables**.
11
+
12
+ Examples
13
+ --------
14
+ Basic pipeline with strings::
15
+
16
+ ops = {
17
+ 'filter': {'field': 'age', 'op': 'gte', 'value': 18},
18
+ 'map': {'first_name': 'name'},
19
+ 'select': ['name', 'age'],
20
+ 'sort': {'field': 'name'},
21
+ 'aggregate': {'field': 'age', 'func': 'avg', 'alias': 'avg_age'},
22
+ }
23
+ result = transform(data, ops)
24
+
25
+ Using enums for keys and functions::
26
+
27
+ from .enums import PipelineStep, OperatorName, AggregateName
28
+ ops = {
29
+ PipelineStep.FILTER: {
30
+ 'field': 'age', 'op': OperatorName.GTE, 'value': 18
31
+ },
32
+ PipelineStep.AGGREGATE: {
33
+ 'field': 'age', 'func': AggregateName.AVG
34
+ },
35
+ }
36
+ result = transform(data, ops)
37
+ """
38
+
39
+ from __future__ import annotations
40
+
41
+ from collections.abc import Callable
42
+ from collections.abc import Mapping
43
+ from collections.abc import Sequence
44
+ from typing import Any
45
+ from typing import cast
46
+
47
+ from .enums import AggregateName
48
+ from .enums import OperatorName
49
+ from .enums import PipelineStep
50
+ from .load import load_data
51
+ from .types import AggregateFunc
52
+ from .types import AggregateSpec
53
+ from .types import FieldName
54
+ from .types import Fields
55
+ from .types import FilterSpec
56
+ from .types import JSONData
57
+ from .types import JSONDict
58
+ from .types import JSONList
59
+ from .types import MapSpec
60
+ from .types import OperatorFunc
61
+ from .types import PipelineConfig
62
+ from .types import PipelineStepName
63
+ from .types import SortKey
64
+ from .types import StepApplier
65
+ from .types import StepOrSteps
66
+ from .types import StepSpec
67
+ from .types import StrPath
68
+ from .utils import to_number
69
+
70
+ # SECTION: EXPORTS ========================================================== #
71
+
72
+
73
# Public API of this module, kept in alphabetical order.
__all__ = [
    'apply_aggregate',
    'apply_filter',
    'apply_map',
    'apply_select',
    'apply_sort',
    'transform',
]
81
+
82
+ # SECTION: INTERNAL FUNCTIONS ============================================== #
83
+
84
+
85
+ # -- Aggregators -- #
86
+
87
+
88
def _agg_avg(
    nums: list[float],
    _: int,
) -> float:
    """
    Arithmetic mean of *nums*, or ``0.0`` when there is nothing to average.

    Parameters
    ----------
    nums : list[float]
        Numeric values to average.

    Returns
    -------
    float
        Mean of *nums*, or ``0.0`` for an empty list.
    """
    if not nums:
        return 0.0
    return sum(nums) / len(nums)
106
+
107
+
108
def _agg_count(
    _: list[float],
    present: int,
) -> int:
    """
    Count aggregate: echo back the presence count.

    Parameters
    ----------
    present : int
        Number of records in which the field was present.

    Returns
    -------
    int
        *present*, unchanged.
    """
    return present
126
+
127
+
128
def _agg_max(
    nums: list[float],
    _: int,
) -> float | None:
    """
    Largest value in *nums*, or ``None`` when empty.

    Parameters
    ----------
    nums : list[float]
        Numeric values to consider.

    Returns
    -------
    float | None
        Maximum of *nums*, or ``None`` for an empty list.
    """
    # ``default=`` covers the empty case without a separate branch.
    return max(nums, default=None)
146
+
147
+
148
def _agg_min(
    nums: list[float],
    _: int,
) -> float | None:
    """
    Smallest value in *nums*, or ``None`` when empty.

    Parameters
    ----------
    nums : list[float]
        Numeric values to consider.

    Returns
    -------
    float | None
        Minimum of *nums*, or ``None`` for an empty list.
    """
    # ``default=`` covers the empty case without a separate branch.
    return min(nums, default=None)
166
+
167
+
168
def _agg_sum(
    nums: list[float],
    _: int,
) -> float:
    """
    Sum of *nums* (``0.0`` for empty).

    Parameters
    ----------
    nums : list[float]
        Numeric values to sum.

    Returns
    -------
    float
        The sum of the input numbers or ``0.0`` if empty.
    """
    # Seed with 0.0 so an empty input yields a float, matching the
    # annotated return type (bare ``sum([])`` returns the int ``0``).
    return sum(nums, 0.0)
186
+
187
+
188
+ # -- Normalization -- #
189
+
190
+
191
def _normalize_specs(
    config: StepOrSteps | None,
) -> list[StepSpec]:
    """
    Coerce a step configuration into a uniform list of step specs.

    Parameters
    ----------
    config : StepOrSteps | None
        ``None``, a single mapping, or a sequence of mappings.

    Returns
    -------
    list[StepSpec]
        An empty list for ``None``, otherwise a list form of *config*.
    """
    if config is None:
        return []

    # Strings and byte strings are sequences too, but each represents a
    # single spec rather than a collection of specs.
    is_multi = isinstance(config, Sequence) and not isinstance(
        config,
        (str, bytes, bytearray),
    )
    if is_multi:
        return list(config)  # type: ignore[list-item]

    return [config]
218
+
219
+
220
def _normalize_operation_keys(ops: Mapping[Any, Any]) -> dict[str, Any]:
    """
    Normalize pipeline operation keys to plain strings.

    Accepts both string keys (e.g., ``'filter'``) and enum keys
    (``PipelineStep.FILTER``), returning a str->spec mapping.

    Parameters
    ----------
    ops : Mapping[Any, Any]
        Pipeline operations to normalize.

    Returns
    -------
    dict[str, Any]
        Dictionary whose keys are normalized step names.
    """
    result: dict[str, Any] = {}
    for key, spec in ops.items():
        if isinstance(key, str):
            result[key] = spec
            continue
        if isinstance(key, PipelineStep):
            result[key.value] = spec
            continue
        # Unknown key type: accept anything exposing a string `.value`,
        # otherwise fall back to the key's string form.
        candidate = getattr(key, 'value', str(key))
        if isinstance(candidate, str):
            result[candidate] = spec
    return result
249
+
250
+
251
+ # -- Predicates -- #
252
+
253
+
254
def _contains(
    container: Any,
    member: Any,
) -> bool:
    """
    Safely test whether *member* is an element of *container*.

    Parameters
    ----------
    container : Any
        Potential container object.
    member : Any
        Candidate member to check for containment.

    Returns
    -------
    bool
        ``True`` if ``member in container`` succeeds; ``False`` when the
        value does not support the ``in`` operator.
    """
    try:
        found = member in container  # type: ignore[operator]
    except TypeError:
        # Non-container values (e.g. ints) do not support ``in``.
        return False
    return found
278
+
279
+
280
def _has(
    member: Any,
    container: Any,
) -> bool:
    """
    Return ``True`` if *container* contains *member*.

    Argument-swapped twin of :func:`_contains`, matching the operand
    order of the ``in`` operator for readability in operator contexts.
    """
    try:
        return member in container  # type: ignore[operator]
    except TypeError:
        return False
291
+
292
+
293
+ # -- Resolvers -- #
294
+
295
+
296
def _resolve_aggregator(
    func: AggregateName | AggregateFunc | str,
) -> Callable:
    """
    Resolve an aggregate specifier to a callable aggregator.

    Parameters
    ----------
    func : AggregateName | AggregateFunc | str
        An :class:`AggregateName`, a string (with aliases), or a callable.

    Returns
    -------
    Callable
        Function of signature ``(xs: list[float], n: int) -> Any``.

    Raises
    ------
    TypeError
        If *func* cannot be interpreted as an aggregator.
    """
    if isinstance(func, AggregateName):
        return func.func
    if isinstance(func, str):
        # String aliases are resolved through the enum's coercion logic.
        return AggregateName.coerce(func).func
    if callable(func):
        # Custom aggregators are used as-is.
        return func

    raise TypeError(f'Invalid aggregate func: {func!r}')
325
+
326
+
327
def _resolve_operator(
    op: OperatorName | OperatorFunc | str,
) -> Callable:
    """
    Resolve an operator specifier to a binary predicate.

    Parameters
    ----------
    op : OperatorName | OperatorFunc | str
        An :class:`OperatorName`, a string (with aliases), or a callable.

    Returns
    -------
    Callable
        Function of signature ``(a: Any, b: Any) -> bool``.

    Raises
    ------
    TypeError
        If *op* cannot be interpreted as an operator.
    """
    # Operators that should compare numerically when both operands can
    # be coerced to numbers.
    comparison_ops = {
        OperatorName.GT,
        OperatorName.GTE,
        OperatorName.LT,
        OperatorName.LTE,
        OperatorName.EQ,
        OperatorName.NE,
    }

    def _wrap_numeric(op_name: OperatorName) -> Callable[[Any, Any], bool]:
        base = op_name.func
        if op_name not in comparison_ops:
            # Membership/containment operators keep their raw behavior.
            return base

        def compare(a: Any, b: Any) -> bool:  # noqa: ANN401 - generic
            a_num = to_number(a)
            b_num = to_number(b)
            if a_num is None or b_num is None:
                # Fall back to the raw comparison for non-numeric sides.
                return bool(base(a, b))
            return bool(base(a_num, b_num))

        return compare

    if isinstance(op, OperatorName):
        return _wrap_numeric(op)
    if isinstance(op, str):
        return _wrap_numeric(OperatorName.coerce(op))
    if callable(op):
        return op

    raise TypeError(f'Invalid operator: {op!r}')
379
+
380
+
381
+ # -- Sorting -- #
382
+
383
+
384
def _sort_key(
    value: Any,
) -> SortKey:
    """
    Build a totally-ordered key for heterogeneous values.

    Ordering policy
    ---------------
    1) Numbers
    2) Non-numeric values (stringified)
    3) ``None`` (last)

    The leading type tag prevents Python from comparing values of
    different types directly.

    Parameters
    ----------
    value : Any
        Value to normalize for sorting.

    Returns
    -------
    SortKey
        A ``(tag, comparable)`` tuple safe for cross-type sorting.
    """
    if isinstance(value, (int, float)):
        return (0, float(value))
    if value is None:
        return (2, '')
    return (1, str(value))
412
+
413
+
414
+ # -- Aggregation and filtering -- #
415
+
416
+
417
def _collect_numeric_and_presence(
    rows: JSONList,
    field: FieldName | None,
) -> tuple[list[float], int]:
    """
    Gather numeric values for *field* and count rows that define it.

    If *field* is falsy, returns ``([], len(rows))``.

    Parameters
    ----------
    rows : JSONList
        Input records.
    field : FieldName | None
        Field name to check for presence.

    Returns
    -------
    tuple[list[float], int]
        Numeric values found for *field*, and the number of rows in
        which *field* is present (numeric or not).
    """
    if not field:
        # No field means "count every row" with no numeric sample.
        return [], len(rows)

    numeric_values: list[float] = []
    present_count = 0
    for row in rows:
        if field not in row:
            continue
        present_count += 1
        value = row[field]
        if isinstance(value, (int, float)):
            numeric_values.append(float(value))
    return numeric_values, present_count
451
+
452
+
453
def _derive_agg_key(
    func_raw: AggregateName | AggregateFunc | str,
    field: FieldName | None,
    alias: Any,
) -> str:
    """
    Derive the output key name for an aggregate result.

    An explicit *alias* wins; otherwise the key is built from the
    function label and field, e.g. ``"sum_amount"`` or just ``"count"``.

    Parameters
    ----------
    func_raw : AggregateName | AggregateFunc | str
        The raw function specifier.
    field : FieldName | None
        The field being aggregated.
    alias : Any
        Optional alias for the output key.

    Returns
    -------
    str
        The derived output key name.
    """
    if alias is not None:
        return str(alias)

    # Derive a label from the function specifier itself.
    if isinstance(func_raw, AggregateName):
        label = func_raw.value
    elif isinstance(func_raw, str):
        label = AggregateName.coerce(func_raw).value
    elif callable(func_raw):
        label = getattr(func_raw, '__name__', 'custom')
    else:
        label = str(func_raw)

    return f'{label}_{field}' if field else label
490
+
491
+
492
def _eval_condition(
    record: JSONDict,
    field: FieldName,
    op_func: OperatorFunc,
    value: Any,
    catch_all: bool,
) -> bool:
    """
    Evaluate ``op_func(record[field], value)`` defensively.

    A missing field always yields ``False``. Operator errors yield
    ``False`` when *catch_all* is true, and propagate otherwise.

    Parameters
    ----------
    record : JSONDict
        The input record.
    field : FieldName
        The field name to check.
    op_func : OperatorFunc
        The binary operator function.
    value : Any
        The value to compare against.
    catch_all : bool
        If ``True``, swallow operator exceptions and return ``False``;
        if ``False``, let them propagate.

    Returns
    -------
    bool
        Whether the condition holds for *record*.

    Raises
    ------
    Exception
        If *catch_all* is ``False`` and the operator raises.
    """
    if field not in record:
        return False

    lhs = record[field]
    try:
        return bool(op_func(lhs, value))
    except Exception:  # noqa: BLE001 - controlled by flag
        if not catch_all:
            raise
        return False
539
+
540
+
541
+ # -- Step Appliers -- #
542
+
543
+
544
def _apply_aggregate_step(
    rows: JSONList,
    spec: AggregateSpec,
) -> JSONList:
    """
    Run one aggregate spec over *rows*, yielding a single-row list.

    Parameters
    ----------
    rows : JSONList
        Input records.
    spec : AggregateSpec
        Mapping like ``{'field': 'amount', 'func': 'sum', 'alias':
        'total'}``. ``func`` defaults to ``'count'``.

    Returns
    -------
    JSONList
        A list containing one mapping ``[{alias: value}]``.
    """
    field: FieldName | None = spec.get('field')  # type: ignore[assignment]
    func_raw = spec.get('func', 'count')
    alias = spec.get('alias')

    aggregator = _resolve_aggregator(func_raw)
    numeric_values, present = _collect_numeric_and_presence(rows, field)
    out_key = _derive_agg_key(func_raw, field, alias)
    return [{out_key: aggregator(numeric_values, present)}]
573
+
574
+
575
def _apply_filter_step(
    records: JSONList,
    spec: Any,
) -> JSONList:
    """
    Pipeline-facing filter applier.

    Parameters
    ----------
    records : JSONList
        Input records to filter.
    spec : Any
        Mapping with keys ``field``, ``op``, and ``value``. ``op`` may
        be a string, :class:`OperatorName`, or a callable.

    Returns
    -------
    JSONList
        Filtered records; unchanged when no field is given.
    """
    field: FieldName = spec.get('field')  # type: ignore[assignment]
    if not field:
        # No target field: leave the records untouched.
        return records

    predicate = _resolve_operator(spec.get('op'))
    value = spec.get('value')

    kept: JSONList = []
    for record in records:
        if _eval_condition(record, field, predicate, value, catch_all=True):
            kept.append(record)
    return kept
609
+
610
+
611
def _apply_map_step(
    records: JSONList,
    spec: Any,
) -> JSONList:
    """
    Pipeline-facing map/rename applier.

    Parameters
    ----------
    records : JSONList
        Input records to transform.
    spec : Any
        Mapping of **old field names** to **new field names**.

    Returns
    -------
    JSONList
        Renamed records; unchanged when *spec* is not a mapping.
    """
    if not isinstance(spec, Mapping):
        # Anything but a mapping is a no-op for this step.
        return records
    return apply_map(records, spec)
634
+
635
+
636
def _apply_select_step(
    records: JSONList,
    spec: Any,
) -> JSONList:
    """
    Pipeline-facing select/projection applier.

    Parameters
    ----------
    records : JSONList
        Input records to transform.
    spec : Any
        Either a mapping with key ``'fields'`` holding a sequence of
        field names, or a plain sequence of field names.

    Returns
    -------
    JSONList
        Projected records; unchanged when the spec shape is unusable.
    """
    # Unwrap {'fields': [...]} or take the spec itself as the list.
    candidate = spec.get('fields') if isinstance(spec, Mapping) else spec
    if not _is_plain_fields_list(candidate):
        return records

    field_names = [str(name) for name in cast(Sequence[Any], candidate)]
    return apply_select(records, field_names)
668
+
669
+
670
def _apply_sort_step(
    records: JSONList,
    spec: Any,
) -> JSONList:
    """
    Pipeline-facing sort applier.

    Parameters
    ----------
    records : JSONList
        Input records to sort.
    spec : Any
        Either a mapping with keys ``'field'`` and optional
        ``'reverse'``, or a plain field name.

    Returns
    -------
    JSONList
        Sorted records; unchanged when *spec* is ``None``.
    """
    if isinstance(spec, Mapping):
        raw_field = spec.get('field')
        sort_field = None if raw_field is None else str(raw_field)
        descending = bool(spec.get('reverse', False))
        return apply_sort(records, sort_field, descending)

    if spec is None:
        return records

    # A bare value is treated as the field name, ascending order.
    return apply_sort(records, str(spec), False)
700
+
701
+
702
+ # -- Helpers -- #
703
+
704
+
705
def _is_plain_fields_list(obj: Any) -> bool:
    """
    Check whether *obj* looks like a bare list of field names.

    Detects a list/tuple such as ``['name', 'age']`` — a non-text
    sequence containing no mappings.

    Parameters
    ----------
    obj : Any
        The object to check.

    Returns
    -------
    bool
        ``True`` if *obj* is a non-text sequence of non-mapping items.
    """
    if not isinstance(obj, Sequence):
        return False
    if isinstance(obj, (str, bytes, bytearray)):
        # Text is a Sequence but never a list of field names.
        return False
    return all(not isinstance(item, Mapping) for item in obj)
727
+
728
+
729
+ # SECTION: INTERNAL CONSTANTS ============================================== #
730
+
731
+
732
# Canonical execution order for pipeline steps. 'aggregate' runs last
# because it collapses the record list into a single merged mapping.
_PIPELINE_STEPS: tuple[PipelineStepName, ...] = (
    'filter',
    'map',
    'select',
    'sort',
    'aggregate',
)
739
+
740
+
741
# Dispatch table for record-wise steps. 'aggregate' has no entry here:
# it is special-cased in :func:`transform` because it changes the
# result shape from a list to a single mapping.
_STEP_APPLIERS: dict[PipelineStepName, StepApplier] = {
    'filter': _apply_filter_step,
    'map': _apply_map_step,
    'select': _apply_select_step,
    'sort': _apply_sort_step,
}
747
+
748
+
749
+ # SECTION: EXPORTS ========================================================== #
750
+
751
+
752
def apply_filter(
    records: JSONList,
    condition: FilterSpec,
) -> JSONList:
    """
    Filter a list of records by a simple condition.

    Parameters
    ----------
    records : JSONList
        Records to filter.
    condition : FilterSpec
        Condition with keys ``field``, ``op``, and ``value``. ``op`` may
        be one of ``'eq'``, ``'ne'``, ``'gt'``, ``'gte'``, ``'lt'``,
        ``'lte'``, ``'in'``, or ``'contains'`` — or a callable
        implementing custom comparison logic.

    Returns
    -------
    JSONList
        Filtered records. The input is returned unchanged when the
        condition is incomplete or its operator is unknown.
    """
    field = condition.get('field')
    op_raw = condition.get('op')
    value = condition.get('value')

    # An incomplete condition filters nothing.
    if not field or op_raw is None or value is None:
        return records

    try:
        op_func = cast(OperatorFunc, _resolve_operator(op_raw))
    except TypeError:
        # Unresolvable operator: treat the condition as a no-op.
        return records

    kept: JSONList = []
    for record in records:
        if field not in record:
            continue
        try:
            matches = _eval_condition(
                record,
                field,
                op_func,
                value,
                catch_all=False,
            )
        except TypeError:
            # Skip records where the comparison is not supported.
            continue
        if matches:
            kept.append(record)

    return kept
798
+
799
+
800
def apply_map(
    records: JSONList,
    mapping: MapSpec,
) -> JSONList:
    """
    Rename fields in each record according to *mapping*.

    Parameters
    ----------
    records : JSONList
        Records to transform.
    mapping : MapSpec
        Mapping of old field names to new field names.

    Returns
    -------
    JSONList
        New records with keys renamed; fields not named in *mapping*
        are carried over unchanged.
    """
    rename_map = dict(mapping)
    out: JSONList = []

    for record in records:
        # Renamed keys first (in mapping order), then every untouched
        # field (in record order).
        new_record = {
            new_key: record[old_key]
            for old_key, new_key in rename_map.items()
            if old_key in record
        }
        for key, value in record.items():
            if key not in rename_map:
                new_record[key] = value
        out.append(new_record)

    return out
838
+
839
+
840
def apply_select(
    records: JSONList,
    fields: Fields,
) -> JSONList:
    """
    Project each record onto the requested fields.

    Parameters
    ----------
    records : JSONList
        Records to project.
    fields : Fields
        Field names to retain.

    Returns
    -------
    JSONList
        Records containing exactly the requested fields; fields missing
        from a record come back as ``None``.
    """
    projected: JSONList = []
    for record in records:
        projected.append({name: record.get(name) for name in fields})
    return projected
862
+
863
+
864
def apply_sort(
    records: JSONList,
    field: FieldName | None,
    reverse: bool = False,
) -> JSONList:
    """
    Sort records by a single field.

    Parameters
    ----------
    records : JSONList
        Records to sort.
    field : FieldName | None
        Field name to sort by. If ``None`` (or empty), the input list
        is returned unchanged.
    reverse : bool, optional
        Sort descending if ``True``. Default is ``False``.

    Returns
    -------
    JSONList
        Sorted records.
    """
    if not field:
        # Nothing to sort by.
        return records

    sort_field: FieldName = field

    def key_for(record: JSONDict) -> SortKey:
        # Missing fields become None and sort last via _sort_key.
        return _sort_key(record.get(sort_field))

    return sorted(records, key=key_for, reverse=reverse)
895
+
896
+
897
def apply_aggregate(
    records: JSONList,
    operation: AggregateSpec,
) -> JSONDict:
    """
    Aggregate a numeric field or count field presence.

    Parameters
    ----------
    records : JSONList
        Records to aggregate.
    operation : AggregateSpec
        Dict with keys ``field`` and ``func``; ``func`` is one of
        ``'sum'``, ``'avg'``, ``'min'``, ``'max'``, ``'count'``, or a
        callable. An optional ``alias`` overrides the output key name.

    Returns
    -------
    JSONDict
        A single-row result like ``{"sum_age": 42}``, or an ``error``
        mapping when the operation is invalid.

    Notes
    -----
    Numeric operations ignore non-numeric values but count their
    presence for ``'count'``.
    """
    field = operation.get('field')
    func = operation.get('func')

    if not field or func is None:
        return {'error': 'Invalid aggregation operation'}

    try:
        aggregator = _resolve_aggregator(func)
    except TypeError:
        return {'error': f'Unknown aggregation function: {func}'}

    nums, present = _collect_numeric_and_presence(records, field)
    out_key = _derive_agg_key(func, field, operation.get('alias'))
    return {out_key: aggregator(nums, present)}
939
+
940
+
941
def transform(
    source: StrPath | JSONData,
    operations: PipelineConfig | None = None,
) -> JSONData:
    """
    Run filter/map/select/sort/aggregate steps over *source*.

    Parameters
    ----------
    source : StrPath | JSONData
        Data source to transform; loaded via :func:`load_data`.
    operations : PipelineConfig | None, optional
        Mapping that may contain ``filter``, ``map``, ``select``,
        ``sort``, and ``aggregate`` entries. Keys may be strings or
        :class:`PipelineStep` members. Each value is a single config or
        a sequence of configs applied in order; multiple aggregate
        configs have their results merged into one mapping.

    Returns
    -------
    JSONData
        Transformed data. A single input dict comes back as a dict when
        the pipeline leaves exactly one record; a non-empty aggregate
        result is returned as a single merged mapping.

    Examples
    --------
    Minimal example with multiple steps::

        ops = {
            'filter': {'field': 'age', 'op': 'gt', 'value': 18},
            'map': {'old_name': 'new_name'},
            'select': ['name', 'age'],
            'sort': {'field': 'name', 'reverse': False},
            'aggregate': {'field': 'age', 'func': 'avg'},
        }
        result = transform(data, ops)

    Enum keys and functions (``PipelineStep``, ``OperatorName``,
    ``AggregateName``) are accepted anywhere their string forms are.
    """
    data = load_data(source)

    if not operations:
        return data

    ops = _normalize_operation_keys(operations)

    # Treat a single dict as a one-record list so every step applies
    # uniformly; unwrap again at the end.
    was_single = isinstance(data, dict)
    if was_single:
        data = [data]  # type: ignore[list-item]

    # Record-wise steps only make sense for a list of dicts.
    if isinstance(data, list):
        for step_name in _PIPELINE_STEPS:
            raw_spec = ops.get(step_name)
            if raw_spec is None:
                continue

            spec_list = _normalize_specs(raw_spec)
            if not spec_list:
                continue

            if step_name == 'aggregate':
                # Aggregates collapse the records into one mapping;
                # merge every spec's single-row output.
                merged: JSONDict = {}
                for spec in spec_list:
                    if not isinstance(spec, Mapping):
                        continue
                    rows_out = _apply_aggregate_step(data, spec)
                    if rows_out and isinstance(rows_out[0], Mapping):
                        merged.update(cast(JSONDict, rows_out[0]))
                if merged:
                    return merged
                continue

            # A bare list of field names is one 'select' spec, not a
            # sequence of separate specs.
            if step_name == 'select' and _is_plain_fields_list(raw_spec):
                spec_list = [cast(StepSpec, raw_spec)]

            applier: StepApplier | None = _STEP_APPLIERS.get(step_name)
            if applier is None:
                continue

            for spec in spec_list:
                data = applier(data, spec)

    # Restore the single-dict shape when exactly one record survived.
    if was_single and isinstance(data, list) and len(data) == 1:
        return data[0]

    return data