etlplus 0.12.10__py3-none-any.whl → 0.14.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/README.md +1 -1
- etlplus/__init__.py +1 -26
- etlplus/api/__init__.py +10 -0
- etlplus/api/config.py +36 -20
- etlplus/api/endpoint_client.py +3 -3
- etlplus/api/enums.py +51 -0
- etlplus/api/pagination/client.py +1 -1
- etlplus/api/rate_limiting/config.py +13 -1
- etlplus/api/rate_limiting/rate_limiter.py +8 -11
- etlplus/api/request_manager.py +11 -6
- etlplus/api/transport.py +14 -2
- etlplus/api/types.py +7 -6
- etlplus/{run_helpers.py → api/utils.py} +205 -153
- etlplus/cli/handlers.py +17 -7
- etlplus/config/jobs.py +14 -4
- etlplus/dag.py +103 -0
- etlplus/enums.py +0 -32
- etlplus/file/cfg.py +2 -2
- etlplus/file/conf.py +2 -2
- etlplus/file/dta.py +77 -0
- etlplus/file/enums.py +10 -4
- etlplus/file/hbs.py +78 -0
- etlplus/file/hdf5.py +78 -0
- etlplus/file/jinja2.py +78 -0
- etlplus/file/mat.py +78 -0
- etlplus/file/mustache.py +78 -0
- etlplus/file/nc.py +78 -0
- etlplus/file/numbers.py +75 -0
- etlplus/file/ods.py +79 -0
- etlplus/file/properties.py +13 -13
- etlplus/file/rda.py +78 -0
- etlplus/file/rds.py +78 -0
- etlplus/file/sas7bdat.py +78 -0
- etlplus/file/sav.py +77 -0
- etlplus/file/sylk.py +77 -0
- etlplus/file/toml.py +1 -1
- etlplus/file/vm.py +78 -0
- etlplus/file/wks.py +77 -0
- etlplus/file/xlsm.py +79 -0
- etlplus/file/xpt.py +78 -0
- etlplus/file/zsav.py +77 -0
- etlplus/{validation → ops}/README.md +2 -2
- etlplus/ops/__init__.py +61 -0
- etlplus/{extract.py → ops/extract.py} +78 -94
- etlplus/{load.py → ops/load.py} +73 -93
- etlplus/{run.py → ops/run.py} +140 -110
- etlplus/{transform.py → ops/transform.py} +75 -68
- etlplus/{validation → ops}/utils.py +80 -15
- etlplus/{validate.py → ops/validate.py} +19 -9
- etlplus/types.py +2 -2
- {etlplus-0.12.10.dist-info → etlplus-0.14.3.dist-info}/METADATA +91 -60
- {etlplus-0.12.10.dist-info → etlplus-0.14.3.dist-info}/RECORD +56 -35
- etlplus/validation/__init__.py +0 -44
- {etlplus-0.12.10.dist-info → etlplus-0.14.3.dist-info}/WHEEL +0 -0
- {etlplus-0.12.10.dist-info → etlplus-0.14.3.dist-info}/entry_points.txt +0 -0
- {etlplus-0.12.10.dist-info → etlplus-0.14.3.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.12.10.dist-info → etlplus-0.14.3.dist-info}/top_level.txt +0 -0
etlplus/{transform.py → ops/transform.py}
RENAMED

@@ -1,5 +1,5 @@
 """
-:mod:`etlplus.transform` module.
+:mod:`etlplus.ops.transform` module.

 Helpers to filter, map/rename, select, sort, aggregate, and otherwise
 transform JSON-like records (dicts and lists of dicts).
@@ -24,7 +24,7 @@ Basic pipeline with strings::

 Using enums for keys and functions::

-    from .enums import PipelineStep, OperatorName, AggregateName
+    from etlplus.enums import PipelineStep, OperatorName, AggregateName
     ops = {
         PipelineStep.FILTER: {
             'field': 'age', 'op': OperatorName.GTE, 'value': 18
@@ -44,28 +44,28 @@ from collections.abc import Sequence
 from typing import Any
 from typing import cast

-from .enums import AggregateName
-from .enums import OperatorName
-from .enums import PipelineStep
+from ..enums import AggregateName
+from ..enums import OperatorName
+from ..enums import PipelineStep
+from ..types import AggregateFunc
+from ..types import AggregateSpec
+from ..types import FieldName
+from ..types import Fields
+from ..types import FilterSpec
+from ..types import JSONData
+from ..types import JSONDict
+from ..types import JSONList
+from ..types import MapSpec
+from ..types import OperatorFunc
+from ..types import PipelineConfig
+from ..types import PipelineStepName
+from ..types import SortKey
+from ..types import StepApplier
+from ..types import StepOrSteps
+from ..types import StepSpec
+from ..types import StrPath
+from ..utils import to_number
 from .load import load_data
-from .types import AggregateFunc
-from .types import AggregateSpec
-from .types import FieldName
-from .types import Fields
-from .types import FilterSpec
-from .types import JSONData
-from .types import JSONDict
-from .types import JSONList
-from .types import MapSpec
-from .types import OperatorFunc
-from .types import PipelineConfig
-from .types import PipelineStepName
-from .types import SortKey
-from .types import StepApplier
-from .types import StepOrSteps
-from .types import StepSpec
-from .types import StrPath
-from .utils import to_number

 # SECTION: EXPORTS ========================================================== #

@@ -730,15 +730,16 @@ def _is_plain_fields_list(obj: Any) -> bool:


 _PIPELINE_STEPS: tuple[PipelineStepName, ...] = (
+    'aggregate',
     'filter',
     'map',
     'select',
     'sort',
-    'aggregate',
 )


 _STEP_APPLIERS: dict[PipelineStepName, StepApplier] = {
+    'aggregate': _apply_aggregate_step,
     'filter': _apply_filter_step,
     'map': _apply_map_step,
     'select': _apply_select_step,
@@ -746,7 +747,54 @@ _STEP_APPLIERS: dict[PipelineStepName, StepApplier] = {
 }


-# SECTION: …
+# SECTION: FUNCTIONS ======================================================== #
+
+
+# -- Helpers -- #
+
+
+def apply_aggregate(
+    records: JSONList,
+    operation: AggregateSpec,
+) -> JSONDict:
+    """
+    Aggregate a numeric field or count presence.
+
+    Parameters
+    ----------
+    records : JSONList
+        Records to aggregate.
+    operation : AggregateSpec
+        Dict with keys ``field`` and ``func``. ``func`` is one of
+        ``'sum'``, ``'avg'``, ``'min'``, ``'max'``, or ``'count'``.
+        A callable may also be supplied for ``func``. Optionally, set
+        ``alias`` to control the output key name.
+
+    Returns
+    -------
+    JSONDict
+        A single-row result like ``{"sum_age": 42}``.
+
+    Notes
+    -----
+    Numeric operations ignore non-numeric values but count their presence
+    for ``'count'``.
+    """
+    field = operation.get('field')
+    func = operation.get('func')
+    alias = operation.get('alias')
+
+    if not field or func is None:
+        return {'error': 'Invalid aggregation operation'}
+
+    try:
+        aggregator = _resolve_aggregator(func)
+    except TypeError:
+        return {'error': f'Unknown aggregation function: {func}'}
+
+    nums, present = _collect_numeric_and_presence(records, field)
+    key_name = _derive_agg_key(func, field, alias)
+    return {key_name: aggregator(nums, present)}


 def apply_filter(
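The relocated `apply_aggregate` is now a public helper near the top of the module rather than a late-file definition. A short usage sketch, with outputs taken from the docstring's own examples and semantics (not independently verified):

```python
# Usage sketch for apply_aggregate, following its docstring above.
from etlplus.ops.transform import apply_aggregate

records = [{'age': 20}, {'age': 22}, {'age': 'n/a'}]

# Non-numeric values are ignored by numeric aggregations.
print(apply_aggregate(records, {'field': 'age', 'func': 'sum'}))
# expected, per the docstring: {'sum_age': 42}

# 'count' counts presence, including non-numeric values; 'alias' renames the key.
print(apply_aggregate(records, {'field': 'age', 'func': 'count', 'alias': 'n'}))
# expected: {'n': 3}
```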
@@ -894,48 +942,7 @@ def apply_sort(
 )


-def apply_aggregate(
-    records: JSONList,
-    operation: AggregateSpec,
-) -> JSONDict:
-    """
-    Aggregate a numeric field or count presence.
-
-    Parameters
-    ----------
-    records : JSONList
-        Records to aggregate.
-    operation : AggregateSpec
-        Dict with keys ``field`` and ``func``. ``func`` is one of
-        ``'sum'``, ``'avg'``, ``'min'``, ``'max'``, or ``'count'``.
-        A callable may also be supplied for ``func``. Optionally, set
-        ``alias`` to control the output key name.
-
-    Returns
-    -------
-    JSONDict
-        A single-row result like ``{"sum_age": 42}``.
-
-    Notes
-    -----
-    Numeric operations ignore non-numeric values but count their presence
-    for ``'count'``.
-    """
-    field = operation.get('field')
-    func = operation.get('func')
-    alias = operation.get('alias')
-
-    if not field or func is None:
-        return {'error': 'Invalid aggregation operation'}
-
-    try:
-        aggregator = _resolve_aggregator(func)
-    except TypeError:
-        return {'error': f'Unknown aggregation function: {func}'}
-
-    nums, present = _collect_numeric_and_presence(records, field)
-    key_name = _derive_agg_key(func, field, alias)
-    return {key_name: aggregator(nums, present)}
+# -- Orchestration -- #


 def transform(
@@ -982,7 +989,7 @@ def transform(

 Using enums for keys and functions::

-    from .enums import PipelineStep, OperatorName, AggregateName
+    from etlplus.enums import PipelineStep, OperatorName, AggregateName
     ops = {
         PipelineStep.FILTER: {
             'field': 'age', 'op': OperatorName.GTE, 'value': 18
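Taken together, the transform.py hunks move the module to `etlplus.ops.transform`, re-point its imports at the package-level `enums`/`types`/`utils` modules, and list `aggregate` alphabetically alongside the other pipeline steps. A minimal sketch of the enum-keyed pipeline from the docstring example, extended with an aggregate step; the `transform(records, ops)` call shape and the `PipelineStep.AGGREGATE`/`AggregateName.AVG` member names are assumptions, not confirmed by this diff:

```python
# Hedged sketch built from the docstring examples in the hunks above.
# Assumed: transform(records, ops) call shape; AGGREGATE/AVG member names.
from etlplus.enums import AggregateName, OperatorName, PipelineStep
from etlplus.ops.transform import transform

records = [
    {'name': 'Ada', 'age': 36},
    {'name': 'Bob', 'age': 17},
    {'name': 'Cyd', 'age': 52},
]
ops = {
    # Keep only adults, then aggregate the surviving ages.
    PipelineStep.FILTER: {
        'field': 'age', 'op': OperatorName.GTE, 'value': 18,
    },
    PipelineStep.AGGREGATE: {
        'field': 'age', 'func': AggregateName.AVG, 'alias': 'avg_adult_age',
    },
}
result = transform(records, ops)  # e.g. {'avg_adult_age': 44.0}
```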
etlplus/{validation → ops}/utils.py
RENAMED

@@ -1,7 +1,7 @@
 """
-:mod:`etlplus.validation.utils` module.
+:mod:`etlplus.ops.utils` module.

-Utility helpers for conditional …
+Utility helpers for conditional data ops orchestration.

 The helpers defined here embrace a "high cohesion, low coupling" design by
 isolating normalization, configuration, and logging responsibilities. The
@@ -13,11 +13,14 @@ offloading ancillary concerns to composable helpers.
 from __future__ import annotations

 from collections.abc import Callable
+from collections.abc import Mapping
 from dataclasses import dataclass
+from types import MappingProxyType
 from typing import Any
 from typing import Literal
 from typing import Self
 from typing import TypedDict
+from typing import cast

 from ..types import StrAnyMap
 from ..utils import normalized_str
@@ -47,6 +50,30 @@ type ValidateFn = Callable[[Any, Ruleset], ValidationResult]
 type PrintFn = Callable[[Any], None]


+# SECTION: INTERNAL CONSTANTS ============================================== #
+
+
+_PHASE_CHOICES = MappingProxyType(
+    {
+        'before_transform': 'before_transform',
+        'after_transform': 'after_transform',
+    },
+)
+_SEVERITY_CHOICES = MappingProxyType(
+    {
+        'warn': 'warn',
+        'error': 'error',
+    },
+)
+_WINDOW_CHOICES = MappingProxyType(
+    {
+        'before_transform': 'before_transform',
+        'after_transform': 'after_transform',
+        'both': 'both',
+    },
+)
+
+
 # SECTION: DATA CLASSES ===================================================== #

@@ -291,11 +318,14 @@ def _normalize_phase(
     Normalized validation phase. Defaults to ``"before_transform"`` when
     unspecified.
     """
-    …
+    return cast(
+        ValidationPhase,
+        _normalize_choice(
+            value,
+            mapping=_PHASE_CHOICES,
+            default='before_transform',
+        ),
+    )


 def _normalize_severity(
@@ -314,7 +344,14 @@ def _normalize_severity(
     ValidationSeverity
         Normalized severity. Defaults to ``"error"`` when unspecified.
     """
-    return …
+    return cast(
+        ValidationSeverity,
+        _normalize_choice(
+            value,
+            mapping=_SEVERITY_CHOICES,
+            default='error',
+        ),
+    )


 def _normalize_window(
@@ -333,13 +370,41 @@ def _normalize_window(
     ValidationWindow
         Normalized validation window. Defaults to ``"both"`` when unspecified.
     """
-    …
+    return cast(
+        ValidationWindow,
+        _normalize_choice(
+            value,
+            mapping=_WINDOW_CHOICES,
+            default='both',
+        ),
+    )
+
+
+def _normalize_choice(
+    value: str | None,
+    *,
+    mapping: Mapping[str, str],
+    default: str,
+) -> str:
+    """
+    Normalize a text value against a mapping with a default fallback.
+
+    Parameters
+    ----------
+    value : str | None
+        Input text to normalize.
+    mapping : Mapping[str, str]
+        Mapping of accepted values to normalized outputs.
+    default : str
+        Default to return when input is missing or unrecognized.
+
+    Returns
+    -------
+    str
+        Normalized value.
+    """
+    normalized = normalized_str(value)
+    return mapping.get(normalized, default)


 def _rule_name(
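The three normalizers above now delegate to a shared `_normalize_choice` backed by read-only `MappingProxyType` tables, instead of each carrying its own branching. A self-contained sketch of the same pattern; `normalized_str` below is a stand-in for the etlplus helper, whose exact behavior this diff does not show (trim-and-lowercase is assumed):

```python
# Self-contained sketch of the choice-normalization pattern used above.
from collections.abc import Mapping
from types import MappingProxyType

# Read-only lookup table: accepted spellings -> canonical values.
_SEVERITY_CHOICES: Mapping[str, str] = MappingProxyType(
    {'warn': 'warn', 'error': 'error'},
)


def normalized_str(value: str | None) -> str:
    """Assumed normalization: trim whitespace and lowercase."""
    return (value or '').strip().lower()


def normalize_choice(
    value: str | None,
    *,
    mapping: Mapping[str, str],
    default: str,
) -> str:
    """Map free-form input onto a closed vocabulary, with a fallback."""
    return mapping.get(normalized_str(value), default)


print(normalize_choice(' WARN ', mapping=_SEVERITY_CHOICES, default='error'))  # 'warn'
print(normalize_choice(None, mapping=_SEVERITY_CHOICES, default='error'))     # 'error'
```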
etlplus/{validate.py → ops/validate.py}
RENAMED

@@ -1,5 +1,5 @@
 """
-:mod:`etlplus.validate` module.
+:mod:`etlplus.ops.validate` module.

 Validate dicts and lists of dicts using simple, schema-like rules.

@@ -34,11 +34,11 @@ from typing import Final
 from typing import Literal
 from typing import TypedDict

+from ..types import JSONData
+from ..types import Record
+from ..types import StrAnyMap
+from ..types import StrPath
 from .load import load_data
-from .types import JSONData
-from .types import Record
-from .types import StrAnyMap
-from .types import StrPath

 # SECTION: EXPORTS ========================================================== #

@@ -279,11 +279,15 @@ def _type_matches(
     bool
         ``True`` if the value matches the expected type; ``False`` if not.
     """
-    …
+    if expected == 'number':
+        return _is_number(value)
+    if expected == 'integer':
+        return isinstance(value, int) and not isinstance(value, bool)
+    if expected == 'boolean':
+        return isinstance(value, bool)

-    …
+    py_type = TYPE_MAP.get(expected)
+    return isinstance(value, py_type) if py_type else False


 def _validate_record(
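The new `_type_matches` body special-cases `number`, `integer`, and `boolean` so that `bool` (a subclass of `int` in Python) cannot satisfy integer or number checks, then falls back to a `TYPE_MAP` lookup. A self-contained sketch; the `TYPE_MAP` contents below are assumed, since the diff does not show them:

```python
# Sketch of the _type_matches logic from the hunk above. TYPE_MAP's real
# contents are not part of this diff; the mapping below is an assumption.
from typing import Any

TYPE_MAP: dict[str, type | tuple[type, ...]] = {
    'string': str,
    'array': list,
    'object': dict,
    'null': type(None),
}


def _is_number(value: Any) -> bool:
    # bool is a subclass of int, so exclude it explicitly.
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def type_matches(value: Any, expected: str) -> bool:
    if expected == 'number':
        return _is_number(value)
    if expected == 'integer':
        return isinstance(value, int) and not isinstance(value, bool)
    if expected == 'boolean':
        return isinstance(value, bool)
    py_type = TYPE_MAP.get(expected)
    return isinstance(value, py_type) if py_type else False


assert type_matches(3, 'integer') and not type_matches(True, 'integer')
assert type_matches(3.5, 'number') and not type_matches('3.5', 'number')
```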
@@ -330,6 +334,9 @@ def _validate_record(
 # SECTION: FUNCTIONS ======================================================== #


+# -- Helpers -- #
+
+
 def validate_field(
     value: Any,
     rules: StrAnyMap | FieldRules,
@@ -425,6 +432,9 @@ def validate_field(
     return {'valid': len(errors) == 0, 'errors': errors}


+# -- Orchestration -- #
+
+
 def validate(
     source: StrPath | JSONData,
     rules: RulesMap | None = None,
etlplus/types.py
CHANGED

@@ -193,8 +193,8 @@ type AggregateSpec = StrAnyMap

 # -- Pipelines-- #

-# Unified pipeline step spec consumed by :mod:`etlplus.transform`.
-type StepSpec = FilterSpec | MapSpec | SelectSpec | SortSpec
+# Unified pipeline step spec consumed by :mod:`etlplus.ops.transform`.
+type StepSpec = AggregateSpec | FilterSpec | MapSpec | SelectSpec | SortSpec

 # Collections of steps

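Widening `StepSpec` with `AggregateSpec` means a plain-dict pipeline config can now carry an aggregate step alongside filter/map/select/sort. An illustrative string-keyed config in the style of the README snippet quoted later in this diff; the spec keys follow the docstrings shown above:

```python
# Illustrative StepSpec usage; the step names mirror _PIPELINE_STEPS in the
# transform hunk ('aggregate', 'filter', 'map', 'select', 'sort').
ops = {
    'filter': {'field': 'age', 'op': 'gt', 'value': 25},
    'select': ['name', 'age'],
    'aggregate': {'field': 'age', 'func': 'avg', 'alias': 'avg_age'},
}
```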
{etlplus-0.12.10.dist-info → etlplus-0.14.3.dist-info}/METADATA
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: etlplus
-Version: 0.12.10
+Version: 0.14.3
 Summary: A Swiss Army knife for simple ETL operations
 Home-page: https://github.com/Dagitali/ETLPlus
 Author: ETLPlus Team
@@ -79,8 +79,10 @@ package and command-line interface for data extraction, validation, transformati
 - [Binary Serialization and Interchange](#binary-serialization-and-interchange)
 - [Databases and Embedded Storage](#databases-and-embedded-storage)
 - [Spreadsheets](#spreadsheets)
-- […
+- [Statistical / Scientific / Numeric Computing](#statistical--scientific--numeric-computing)
 - [Logs and Event Streams](#logs-and-event-streams)
+- [Data Archives](#data-archives)
+- [Templates](#templates)
 - [Usage](#usage)
 - [Command Line Interface](#command-line-interface)
 - [Argument Order and Required Options](#argument-order-and-required-options)
@@ -194,7 +196,7 @@ etlplus extract file examples/data/sample.csv \
 [Python API](#python-api):

 ```python
-from etlplus import extract, transform, validate, load
+from etlplus.ops import extract, transform, validate, load

 data = extract("file", "input.csv")
 ops = {"filter": {"field": "age", "op": "gt", "value": 25}, "select": ["name", "email"]}
@@ -221,93 +223,122 @@ DDL can be rendered from table specs for migrations or schema checks.

 ### Files (`file`)

-…
+Recognized file formats are listed in the tables below. Support for reading to or writing from a recognized file format is marked as:

 - **Y**: implemented (may require optional dependencies)
 - **N**: stubbed or not yet implemented

 #### Stubbed / Placeholder

-| Format | …
-| --- | --- | --- |
+| Format | Read | Write | Description |
+| --- | --- | --- | --- |
 | `stub` | N | Placeholder format for tests and future connectors. |

 #### Tabular & Delimited Text

-| Format | …
-| --- | --- | --- |
-| `csv` | Y | Comma-Separated Values |
-| `dat` | …
-| `fwf` | …
-| `psv` | N | Pipe-Separated Values |
-| `tab` | N | Often synonymous with TSV |
-| `tsv` | Y | Tab-Separated Values |
-| `txt` | Y | Plain text, often delimited or fixed-width |
+| Format | Read | Write | Description |
+| --- | --- | --- | --- |
+| `csv` | Y | Y | Comma-Separated Values |
+| `dat` | N | N | Generic data file, often delimited or fixed-width |
+| `fwf` | N | N | Fixed-Width Fields |
+| `psv` | N | N | Pipe-Separated Values |
+| `tab` | N | N | Often synonymous with TSV |
+| `tsv` | Y | Y | Tab-Separated Values |
+| `txt` | Y | Y | Plain text, often delimited or fixed-width |

 #### Semi-Structured Text

-| Format | …
-| --- | --- | --- |
-| `cfg` | N | Config-style key-value pairs |
-| `conf` | N | Config-style key-value pairs |
-| `ini` | N | Config-style key-value pairs |
-| `json` | Y | JavaScript Object Notation |
-| `ndjson` | Y | Newline-Delimited JSON |
-| `properties` | N | Java-style key-value pairs |
-| `toml` | N | Tom's Obvious Minimal Language |
-| `xml` | Y | Extensible Markup Language |
-| `yaml` | Y | YAML Ain't Markup Language |
+| Format | Read | Write | Description |
+| --- | --- | --- | --- |
+| `cfg` | N | N | Config-style key-value pairs |
+| `conf` | N | N | Config-style key-value pairs |
+| `ini` | N | N | Config-style key-value pairs |
+| `json` | Y | Y | JavaScript Object Notation |
+| `ndjson` | Y | Y | Newline-Delimited JSON |
+| `properties` | N | N | Java-style key-value pairs |
+| `toml` | N | N | Tom's Obvious Minimal Language |
+| `xml` | Y | Y | Extensible Markup Language |
+| `yaml` | Y | Y | YAML Ain't Markup Language |

 #### Columnar / Analytics-Friendly

-| Format | …
-| --- | --- | --- |
-| `arrow` | N | Apache Arrow IPC |
-| `feather` | Y | Apache Arrow Feather |
-| `orc` | Y | Optimized Row Columnar; common in Hadoop |
-| `parquet` | Y | Apache Parquet; common in Big Data |
+| Format | Read | Write | Description |
+| --- | --- | --- | --- |
+| `arrow` | N | N | Apache Arrow IPC |
+| `feather` | Y | Y | Apache Arrow Feather |
+| `orc` | Y | Y | Optimized Row Columnar; common in Hadoop |
+| `parquet` | Y | Y | Apache Parquet; common in Big Data |

 #### Binary Serialization and Interchange

-| Format | …
-| --- | --- | --- |
-| `avro` | Y | Apache Avro |
-| `bson` | N | Binary JSON; common with MongoDB exports/dumps |
-| `cbor` | N | Concise Binary Object Representation |
-| `ion` | N | Amazon Ion |
-| `msgpack` | N | MessagePack |
-| `pb` | N | Protocol Buffers (Google Protobuf) |
-| `pbf` | N | Protocolbuffer Binary Format; often for GIS data |
-| `proto` | N | Protocol Buffers schema; often in .pb / .bin |
+| Format | Read | Write | Description |
+| --- | --- | --- | --- |
+| `avro` | Y | Y | Apache Avro |
+| `bson` | N | N | Binary JSON; common with MongoDB exports/dumps |
+| `cbor` | N | N | Concise Binary Object Representation |
+| `ion` | N | N | Amazon Ion |
+| `msgpack` | N | N | MessagePack |
+| `pb` | N | N | Protocol Buffers (Google Protobuf) |
+| `pbf` | N | N | Protocolbuffer Binary Format; often for GIS data |
+| `proto` | N | N | Protocol Buffers schema; often in .pb / .bin |

 #### Databases and Embedded Storage

-| Format | …
-| --- | --- | --- |
-| `accdb` | N | Microsoft Access …
-| `duckdb` | N | …
-| `mdb` | N | Microsoft Access …
-| `sqlite` | N | …
+| Format | Read | Write | Description |
+| --- | --- | --- | --- |
+| `accdb` | N | N | Microsoft Access (newer format) |
+| `duckdb` | N | N | DuckDB |
+| `mdb` | N | N | Microsoft Access (older format) |
+| `sqlite` | N | N | SQLite |

 #### Spreadsheets

+| Format | Read | Write | Description |
+| --- | --- | --- | --- |
+| `numbers` | N | N | Apple Numbers |
+| `ods` | N | N | OpenDocument |
+| `wks` | N | N | Lotus 1-2-3 |
+| `xls` | Y | Y | Microsoft Excel (BIFF) |
+| `xlsm` | N | N | Microsoft Excel Macro-Enabled (Open XML) |
+| `xlsx` | Y | Y | Microsoft Excel (Open XML) |
+
+#### Statistical / Scientific / Numeric Computing
+
+| Format | Read | Write | Description |
+| --- | --- | --- | --- |
+| `dta` | N | N | Stata |
+| `hdf5` | N | N | Hierarchical Data Format |
+| `mat` | N | N | MATLAB |
+| `nc` | N | N | NetCDF |
+| `rda` | N | N | RData workspace/object |
+| `rds` | N | N | R data |
+| `sas7bdat` | N | N | SAS data |
+| `sav` | N | N | SPSS data |
+| `sylk` | N | N | Symbolic Link |
+| `xpt` | N | N | SAS Transport |
+| `zsav` | N | N | Compressed SPSS data |
+
+#### Logs and Event Streams
+
 | Format | Supported | Description |
 | --- | --- | --- |
-| `xls` | …
-| `xlsx` | Y | Microsoft Excel (Open XML) |
+| `log` | N | N | Generic log file |

 #### Data Archives

-| Format | …
-| --- | --- | --- |
-| `gz` | Y | Gzip-compressed file |
-| `zip` | Y | ZIP archive |
+| Format | Read | Write | Description |
+| --- | --- | --- | --- |
+| `gz` | Y | Y | Gzip-compressed file |
+| `zip` | Y | Y | ZIP archive |

-#### …
+#### Templates

-| Format | …
-| --- | --- | --- |
-| `…
+| Format | Read | Write | Description |
+| --- | --- | --- | --- |
+| `hbs` | N | N | Handlebars |
+| `jinja2` | N | N | Jinja2 |
+| `mustache` | N | N | Mustache |
+| `vm` | N | N | Apache Velocity |

 ## Usage

@@ -500,7 +531,7 @@ cat examples/data/sample.json \
 Use ETLPlus as a Python library:

 ```python
-from etlplus import extract, validate, transform, load
+from etlplus.ops import extract, validate, transform, load

 # Extract data
 data = extract("file", "data.json")
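For orientation, a hedged continuation of the snippet above chaining all four operations; the `validate` rules shape and the `load` signature are hypothetical illustrations, as this diff does not show them:

```python
# Hedged continuation of the README snippet; the validate()/load() argument
# shapes below are hypothetical, not confirmed by the diff.
from etlplus.ops import extract, load, transform, validate

data = extract("file", "data.json")
ops = {"filter": {"field": "age", "op": "gt", "value": 25}, "select": ["name", "email"]}
adults = transform(data, ops)
report = validate(adults, {"email": {"type": "string", "required": True}})  # hypothetical rules shape
load("file", adults, "adults.json")  # hypothetical signature
```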
@@ -695,7 +726,7 @@ We split tests into two layers:
 pagination + rate limit defaults, file/API connector interactions) may touch temp files and use
 fake clients.

-If a test calls `etlplus.cli.main()` or `etlplus.run.run()` it’s integration by default.
+If a test calls `etlplus.cli.main()` or `etlplus.ops.run.run()` it’s integration by default. Full
 criteria: [`CONTRIBUTING.md#testing`](CONTRIBUTING.md#testing).

 ### Code Coverage