etlplus 0.9.2__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/__init__.py +26 -1
- etlplus/api/README.md +3 -51
- etlplus/api/__init__.py +0 -10
- etlplus/api/config.py +28 -39
- etlplus/api/endpoint_client.py +3 -3
- etlplus/api/pagination/client.py +1 -1
- etlplus/api/rate_limiting/config.py +1 -13
- etlplus/api/rate_limiting/rate_limiter.py +11 -8
- etlplus/api/request_manager.py +6 -11
- etlplus/api/transport.py +2 -14
- etlplus/api/types.py +6 -96
- etlplus/cli/commands.py +43 -76
- etlplus/cli/constants.py +1 -1
- etlplus/cli/handlers.py +12 -40
- etlplus/cli/io.py +2 -2
- etlplus/cli/main.py +1 -1
- etlplus/cli/state.py +7 -4
- etlplus/{workflow → config}/__init__.py +23 -10
- etlplus/{workflow → config}/connector.py +44 -58
- etlplus/{workflow → config}/jobs.py +32 -105
- etlplus/{workflow → config}/pipeline.py +51 -59
- etlplus/{workflow → config}/profile.py +5 -8
- etlplus/config/types.py +204 -0
- etlplus/config/utils.py +120 -0
- etlplus/database/ddl.py +1 -1
- etlplus/database/engine.py +3 -19
- etlplus/database/orm.py +0 -2
- etlplus/database/schema.py +1 -1
- etlplus/enums.py +266 -0
- etlplus/{ops/extract.py → extract.py} +99 -81
- etlplus/file.py +652 -0
- etlplus/{ops/load.py → load.py} +101 -78
- etlplus/{ops/run.py → run.py} +127 -159
- etlplus/{api/utils.py → run_helpers.py} +153 -209
- etlplus/{ops/transform.py → transform.py} +68 -75
- etlplus/types.py +4 -5
- etlplus/utils.py +2 -136
- etlplus/{ops/validate.py → validate.py} +12 -22
- etlplus/validation/__init__.py +44 -0
- etlplus/{ops → validation}/utils.py +17 -53
- {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/METADATA +17 -210
- etlplus-0.10.1.dist-info/RECORD +65 -0
- {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/WHEEL +1 -1
- etlplus/README.md +0 -37
- etlplus/api/enums.py +0 -51
- etlplus/cli/README.md +0 -40
- etlplus/database/README.md +0 -48
- etlplus/file/README.md +0 -105
- etlplus/file/__init__.py +0 -25
- etlplus/file/_imports.py +0 -141
- etlplus/file/_io.py +0 -160
- etlplus/file/accdb.py +0 -78
- etlplus/file/arrow.py +0 -78
- etlplus/file/avro.py +0 -176
- etlplus/file/bson.py +0 -77
- etlplus/file/cbor.py +0 -78
- etlplus/file/cfg.py +0 -79
- etlplus/file/conf.py +0 -80
- etlplus/file/core.py +0 -322
- etlplus/file/csv.py +0 -79
- etlplus/file/dat.py +0 -78
- etlplus/file/dta.py +0 -77
- etlplus/file/duckdb.py +0 -78
- etlplus/file/enums.py +0 -343
- etlplus/file/feather.py +0 -111
- etlplus/file/fwf.py +0 -77
- etlplus/file/gz.py +0 -123
- etlplus/file/hbs.py +0 -78
- etlplus/file/hdf5.py +0 -78
- etlplus/file/ini.py +0 -79
- etlplus/file/ion.py +0 -78
- etlplus/file/jinja2.py +0 -78
- etlplus/file/json.py +0 -98
- etlplus/file/log.py +0 -78
- etlplus/file/mat.py +0 -78
- etlplus/file/mdb.py +0 -78
- etlplus/file/msgpack.py +0 -78
- etlplus/file/mustache.py +0 -78
- etlplus/file/nc.py +0 -78
- etlplus/file/ndjson.py +0 -108
- etlplus/file/numbers.py +0 -75
- etlplus/file/ods.py +0 -79
- etlplus/file/orc.py +0 -111
- etlplus/file/parquet.py +0 -113
- etlplus/file/pb.py +0 -78
- etlplus/file/pbf.py +0 -77
- etlplus/file/properties.py +0 -78
- etlplus/file/proto.py +0 -77
- etlplus/file/psv.py +0 -79
- etlplus/file/rda.py +0 -78
- etlplus/file/rds.py +0 -78
- etlplus/file/sas7bdat.py +0 -78
- etlplus/file/sav.py +0 -77
- etlplus/file/sqlite.py +0 -78
- etlplus/file/stub.py +0 -84
- etlplus/file/sylk.py +0 -77
- etlplus/file/tab.py +0 -81
- etlplus/file/toml.py +0 -78
- etlplus/file/tsv.py +0 -80
- etlplus/file/txt.py +0 -102
- etlplus/file/vm.py +0 -78
- etlplus/file/wks.py +0 -77
- etlplus/file/xls.py +0 -88
- etlplus/file/xlsm.py +0 -79
- etlplus/file/xlsx.py +0 -99
- etlplus/file/xml.py +0 -185
- etlplus/file/xpt.py +0 -78
- etlplus/file/yaml.py +0 -95
- etlplus/file/zip.py +0 -175
- etlplus/file/zsav.py +0 -77
- etlplus/ops/README.md +0 -50
- etlplus/ops/__init__.py +0 -61
- etlplus/templates/README.md +0 -46
- etlplus/workflow/README.md +0 -52
- etlplus/workflow/dag.py +0 -105
- etlplus/workflow/types.py +0 -115
- etlplus-0.9.2.dist-info/RECORD +0 -134
- {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/entry_points.txt +0 -0
- {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/top_level.txt +0 -0
etlplus/config/types.py
ADDED
@@ -0,0 +1,204 @@
+"""
+:mod:`etlplus.config.types` module.
+
+Type aliases and editor-only TypedDicts for :mod:`etlplus.config`.
+
+These types improve IDE autocomplete and static analysis while the runtime
+parsers remain permissive.
+
+Notes
+-----
+- TypedDicts in this module are intentionally ``total=False`` and are not
+  enforced at runtime.
+- ``*.from_obj`` constructors accept ``Mapping[str, Any]`` and perform
+  tolerant parsing and light casting. This keeps the runtime permissive while
+  improving autocomplete and static analysis for contributors.
+
+Examples
+--------
+>>> from etlplus.config import Connector
+>>> src: Connector = {
+>>>     "type": "file",
+>>>     "path": "/data/input.csv",
+>>> }
+>>> tgt: Connector = {
+>>>     "type": "database",
+>>>     "connection_string": "postgresql://user:pass@localhost/db",
+>>> }
+>>> from etlplus.api import RetryPolicy
+>>> rp: RetryPolicy = {"max_attempts": 3, "backoff": 0.5}
+"""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from typing import Any
+from typing import Literal
+from typing import TypedDict
+
+from ..api import PaginationConfigMap
+from ..api import RateLimitConfigMap
+from ..types import StrAnyMap
+
+# SECTION: EXPORTS ========================================================= #
+
+
+__all__ = [
+    # Type aliases
+    'ConnectorType',
+    # 'PaginationType',
+    # TypedDicts
+    'ApiProfileDefaultsMap',
+    'ApiProfileConfigMap',
+    'ApiConfigMap',
+    'EndpointMap',
+    'ConnectorApiConfigMap',
+    'ConnectorDbConfigMap',
+    'ConnectorFileConfigMap',
+]
+
+
+# SECTION: TYPE ALIASES ===================================================== #
+
+
+# Literal type for supported connector kinds
+type ConnectorType = Literal['api', 'database', 'file']
+
+# Literal type for supported pagination kinds
+# type PaginationType = Literal['page', 'offset', 'cursor']
+
+
+# SECTION: TYPED DICTS ====================================================== #
+
+
+class ApiConfigMap(TypedDict, total=False):
+    """
+    Top-level API config shape parsed by ApiConfig.from_obj.
+
+    Either provide a 'base_url' with optional 'headers' and 'endpoints', or
+    provide 'profiles' with at least one profile having a 'base_url'.
+
+    See Also
+    --------
+    - etlplus.config.api.ApiConfig.from_obj: parses this mapping
+    """
+
+    base_url: str
+    headers: StrAnyMap
+    endpoints: Mapping[str, EndpointMap | str]
+    profiles: Mapping[str, ApiProfileConfigMap]
+
+
+class ApiProfileConfigMap(TypedDict, total=False):
+    """
+    Shape accepted for a profile entry under ApiConfigMap.profiles.
+
+    Notes
+    -----
+    `base_url` is required at runtime when profiles are provided.
+
+    See Also
+    --------
+    - etlplus.config.api.ApiProfileConfig.from_obj: parses this mapping
+    """
+
+    base_url: str
+    headers: StrAnyMap
+    base_path: str
+    auth: StrAnyMap
+    defaults: ApiProfileDefaultsMap
+
+
+class ApiProfileDefaultsMap(TypedDict, total=False):
+    """
+    Defaults block available under a profile (all keys optional).
+
+    Notes
+    -----
+    Runtime expects header values to be str; typing remains permissive.
+
+    See Also
+    --------
+    - etlplus.config.api.ApiProfileConfig.from_obj: consumes this block
+    - etlplus.config.pagination.PaginationConfig.from_obj: parses pagination
+    - etlplus.api.rate_limiting.RateLimitConfig.from_obj: parses rate_limit
+    """
+
+    headers: StrAnyMap
+    pagination: PaginationConfigMap | StrAnyMap
+    rate_limit: RateLimitConfigMap | StrAnyMap
+
+
+class ConnectorApiConfigMap(TypedDict, total=False):
+    """
+    Shape accepted by ConnectorApi.from_obj (all keys optional).
+
+    See Also
+    --------
+    - etlplus.config.connector.ConnectorApi.from_obj
+    """
+
+    name: str
+    type: ConnectorType
+    url: str
+    method: str
+    headers: StrAnyMap
+    query_params: StrAnyMap
+    pagination: PaginationConfigMap
+    rate_limit: RateLimitConfigMap
+    api: str
+    endpoint: str
+
+
+class ConnectorDbConfigMap(TypedDict, total=False):
+    """
+    Shape accepted by ConnectorDb.from_obj (all keys optional).
+
+    See Also
+    --------
+    - etlplus.config.connector.ConnectorDb.from_obj
+    """
+
+    name: str
+    type: ConnectorType
+    connection_string: str
+    query: str
+    table: str
+    mode: str
+
+
+class ConnectorFileConfigMap(TypedDict, total=False):
+    """
+    Shape accepted by ConnectorFile.from_obj (all keys optional).
+
+    See Also
+    --------
+    - etlplus.config.connector.ConnectorFile.from_obj
+    """
+
+    name: str
+    type: ConnectorType
+    format: str
+    path: str
+    options: StrAnyMap
+
+
+class EndpointMap(TypedDict, total=False):
+    """
+    Shape accepted by EndpointConfig.from_obj.
+
+    One of 'path' or 'url' should be provided.
+
+    See Also
+    --------
+    - etlplus.config.api.EndpointConfig.from_obj: parses this mapping
+    """
+
+    path: str
+    url: str
+    method: str
+    path_params: StrAnyMap
+    query_params: StrAnyMap
+    body: Any
+    pagination: PaginationConfigMap
+    rate_limit: RateLimitConfigMap
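
A minimal sketch of how these editor-only TypedDicts annotate plain config mappings. The import path follows the module added above, the keys come from the class definitions, the values are illustrative, and nothing is validated at runtime since every class is ``total=False``:

    from etlplus.config.types import ConnectorDbConfigMap, ConnectorFileConfigMap

    # Plain dicts; the annotations only aid IDEs and type checkers.
    src: ConnectorFileConfigMap = {
        'name': 'input',
        'type': 'file',
        'format': 'csv',
        'path': '/data/input.csv',
    }
    tgt: ConnectorDbConfigMap = {
        'name': 'warehouse',
        'type': 'database',
        'connection_string': 'postgresql://user:pass@localhost/db',
        'table': 'events',
        'mode': 'append',
    }
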
etlplus/config/utils.py
ADDED
@@ -0,0 +1,120 @@
+"""
+:mod:`etlplus.config.utils` module.
+
+A module defining utility helpers for ETL pipeline configuration.
+
+Notes
+-----
+- Inputs to parsers favor ``Mapping[str, Any]`` to remain permissive and
+  avoid unnecessary copies; normalization returns concrete types.
+- Substitution is shallow for strings and recursive for containers.
+- Numeric coercion helpers are intentionally forgiving: invalid values
+  become ``None`` rather than raising.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+from collections.abc import Mapping
+from typing import Any
+
+from ..types import StrAnyMap
+
+# SECTION: EXPORTS ========================================================== #
+
+
+__all__ = [
+    # Functions
+    'deep_substitute',
+]
+
+
+# SECTION: FUNCTIONS ======================================================== #
+
+
+def deep_substitute(
+    value: Any,
+    vars_map: StrAnyMap | None,
+    env_map: Mapping[str, str] | None,
+) -> Any:
+    """
+    Recursively substitute ``${VAR}`` tokens in nested structures.
+
+    Only strings are substituted; other types are returned as-is.
+
+    Parameters
+    ----------
+    value : Any
+        The value to perform substitutions on.
+    vars_map : StrAnyMap | None
+        Mapping of variable names to replacement values (lower precedence).
+    env_map : Mapping[str, str] | None
+        Mapping of environment variables overriding ``vars_map`` values (higher
+        precedence).
+
+    Returns
+    -------
+    Any
+        New structure with substitutions applied where tokens were found.
+    """
+    substitutions = _prepare_substitutions(vars_map, env_map)
+
+    def _apply(node: Any) -> Any:
+        match node:
+            case str():
+                return _replace_tokens(node, substitutions)
+            case Mapping():
+                return {k: _apply(v) for k, v in node.items()}
+            case list() | tuple() as seq:
+                apply = [_apply(item) for item in seq]
+                return apply if isinstance(seq, list) else tuple(apply)
+            case set():
+                return {_apply(item) for item in node}
+            case frozenset():
+                return frozenset(_apply(item) for item in node)
+            case _:
+                return node
+
+    return _apply(value)
+
+
+# SECTION: INTERNAL FUNCTIONS ============================================== #
+
+
+def _prepare_substitutions(
+    vars_map: StrAnyMap | None,
+    env_map: Mapping[str, Any] | None,
+) -> tuple[tuple[str, Any], ...]:
+    """Merge variable and environment maps into an ordered substitutions list.
+
+    Parameters
+    ----------
+    vars_map : StrAnyMap | None
+        Mapping of variable names to replacement values (lower precedence).
+    env_map : Mapping[str, Any] | None
+        Environment-backed values that override entries from ``vars_map``.
+
+    Returns
+    -------
+    tuple[tuple[str, Any], ...]
+        Immutable sequence of ``(name, value)`` pairs suitable for token
+        replacement.
+    """
+    if not vars_map and not env_map:
+        return ()
+    merged: dict[str, Any] = {**(vars_map or {}), **(env_map or {})}
+    return tuple(merged.items())
+
+
+def _replace_tokens(
+    text: str,
+    substitutions: Iterable[tuple[str, Any]],
+) -> str:
+    if not substitutions:
+        return text
+    out = text
+    for name, replacement in substitutions:
+        token = f'${{{name}}}'
+        if token in out:
+            out = out.replace(token, str(replacement))
+    return out
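
A minimal usage sketch of the new deep_substitute helper (names as added above; the mapping values are illustrative). Per its docstring, env_map entries take precedence over vars_map entries, and substitution recurses through containers while leaving non-strings untouched:

    from etlplus.config.utils import deep_substitute

    cfg = {'path': '${DATA_DIR}/input.csv', 'tags': ['${ENV}', 'static']}
    resolved = deep_substitute(
        cfg,
        vars_map={'DATA_DIR': '/tmp', 'ENV': 'dev'},
        env_map={'DATA_DIR': '/data'},  # overrides the vars_map entry
    )
    # resolved == {'path': '/data/input.csv', 'tags': ['dev', 'static']}
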
etlplus/database/ddl.py
CHANGED
@@ -203,7 +203,7 @@ def load_table_spec(
         raise ValueError('Spec must be .json, .yml, or .yaml')

     try:
-        spec = File(spec_path)
+        spec = File.read_file(spec_path)
     except ImportError as e:
         if suffix in {'.yml', '.yaml'}:
             raise RuntimeError(
etlplus/database/engine.py
CHANGED
@@ -113,7 +113,7 @@ def load_database_url_from_config(
     ValueError
         If no connection string/URL/DSN is found for the specified entry.
     """
-    cfg = File(Path(path))
+    cfg = File.read_file(Path(path))
    if not isinstance(cfg, Mapping):
        raise TypeError('Database config must be a mapping')

@@ -136,25 +136,9 @@ def load_database_url_from_config(
     return url


-def make_engine(
-    url: str | None = None,
-    **engine_kwargs: Any,
-) -> Engine:
-    """
-    Create a SQLAlchemy Engine, defaulting to env config if no URL given.
-
-    Parameters
-    ----------
-    url : str | None, optional
-        Database URL/DSN string. When omitted, ``DATABASE_URL`` is used.
-    **engine_kwargs : Any
-        Extra keyword arguments forwarded to ``create_engine``.
+def make_engine(url: str | None = None, **engine_kwargs: Any) -> Engine:
+    """Create a SQLAlchemy Engine, defaulting to env config if no URL given."""

-    Returns
-    -------
-    Engine
-        Configured SQLAlchemy engine instance.
-    """
     resolved_url = url or DATABASE_URL
     return create_engine(resolved_url, pool_pre_ping=True, **engine_kwargs)

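
A minimal sketch of the slimmed-down make_engine (import path follows the module above; the URL and the echo flag are illustrative). With no argument it falls back to the DATABASE_URL-based config, and extra keyword arguments pass straight through to SQLAlchemy's create_engine:

    from etlplus.database.engine import make_engine

    engine = make_engine('postgresql://user:pass@localhost/db', echo=True)
    default_engine = make_engine()  # resolves to the DATABASE_URL config
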
etlplus/database/orm.py
CHANGED
@@ -201,14 +201,12 @@ def build_models(
 ) -> ModelRegistry:
     """
     Build SQLAlchemy ORM models from table specifications.
-
     Parameters
     ----------
     specs : list[TableSpec]
         List of table specifications.
     base : type[DeclarativeBase], optional
         Base class for the ORM models (default: :class:`Base`).
-
     Returns
     -------
     ModelRegistry
etlplus/database/schema.py
CHANGED
etlplus/enums.py
CHANGED
@@ -8,6 +8,7 @@ from __future__ import annotations

 import enum
 import operator as _op
+from pathlib import PurePath
 from statistics import fmean
 from typing import Self

@@ -22,9 +23,18 @@ __all__ = [
     # Enums
     'AggregateName',
     'CoercibleStrEnum',
+    'CompressionFormat',
     'DataConnectorType',
+    'FileFormat',
+    'HttpMethod',
     'OperatorName',
     'PipelineStep',
+    # Functions
+    'coerce_compression_format',
+    'coerce_data_connector_type',
+    'coerce_file_format',
+    'coerce_http_method',
+    'infer_file_format_and_compression',
 ]


@@ -168,6 +178,39 @@ class AggregateName(CoercibleStrEnum):
         return lambda xs, n: (fmean(xs) if xs else 0.0)


+class CompressionFormat(CoercibleStrEnum):
+    """Supported compression formats for data files."""
+
+    # -- Constants -- #
+
+    GZ = 'gz'
+    ZIP = 'zip'
+
+    # -- Class Methods -- #
+
+    @classmethod
+    def aliases(cls) -> StrStrMap:
+        """
+        Return a mapping of common aliases for each enum member.
+
+        Returns
+        -------
+        StrStrMap
+            A mapping of alias names to their corresponding enum member names.
+        """
+        return {
+            # File extensions
+            '.gz': 'gz',
+            '.gzip': 'gz',
+            '.zip': 'zip',
+            # MIME types
+            'application/gzip': 'gz',
+            'application/x-gzip': 'gz',
+            'application/zip': 'zip',
+            'application/x-zip-compressed': 'zip',
+        }
+
+
 class DataConnectorType(CoercibleStrEnum):
     """Supported data connector types."""

@@ -199,6 +242,119 @@ class DataConnectorType(CoercibleStrEnum):
         }


+class FileFormat(CoercibleStrEnum):
+    """Supported file formats for extraction."""
+
+    # -- Constants -- #
+
+    AVRO = 'avro'
+    CSV = 'csv'
+    FEATHER = 'feather'
+    GZ = 'gz'
+    JSON = 'json'
+    NDJSON = 'ndjson'
+    ORC = 'orc'
+    PARQUET = 'parquet'
+    TSV = 'tsv'
+    TXT = 'txt'
+    XLS = 'xls'
+    XLSX = 'xlsx'
+    ZIP = 'zip'
+    XML = 'xml'
+    YAML = 'yaml'
+
+    # -- Class Methods -- #
+
+    @classmethod
+    def aliases(cls) -> StrStrMap:
+        """
+        Return a mapping of common aliases for each enum member.
+
+        Returns
+        -------
+        StrStrMap
+            A mapping of alias names to their corresponding enum member names.
+        """
+        return {
+            # Common shorthand
+            'parq': 'parquet',
+            'yml': 'yaml',
+            # File extensions
+            '.avro': 'avro',
+            '.csv': 'csv',
+            '.feather': 'feather',
+            '.gz': 'gz',
+            '.json': 'json',
+            '.jsonl': 'ndjson',
+            '.ndjson': 'ndjson',
+            '.orc': 'orc',
+            '.parquet': 'parquet',
+            '.pq': 'parquet',
+            '.tsv': 'tsv',
+            '.txt': 'txt',
+            '.xls': 'xls',
+            '.xlsx': 'xlsx',
+            '.zip': 'zip',
+            '.xml': 'xml',
+            '.yaml': 'yaml',
+            '.yml': 'yaml',
+            # MIME types
+            'application/avro': 'avro',
+            'application/feather': 'feather',
+            'application/gzip': 'gz',
+            'application/json': 'json',
+            'application/jsonlines': 'ndjson',
+            'application/ndjson': 'ndjson',
+            'application/orc': 'orc',
+            'application/vnd.apache.arrow.file': 'feather',
+            'application/vnd.apache.orc': 'orc',
+            'application/vnd.ms-excel': 'xls',
+            (
+                'application/vnd.openxmlformats-'
+                'officedocument.spreadsheetml.sheet'
+            ): 'xlsx',
+            'application/x-avro': 'avro',
+            'application/x-ndjson': 'ndjson',
+            'application/x-parquet': 'parquet',
+            'application/xml': 'xml',
+            'application/zip': 'zip',
+            'text/csv': 'csv',
+            'text/plain': 'txt',
+            'text/tab-separated-values': 'tsv',
+        }
+
+
+class HttpMethod(CoercibleStrEnum):
+    """Supported HTTP verbs that accept JSON payloads."""
+
+    # -- Constants -- #
+
+    CONNECT = 'connect'
+    DELETE = 'delete'
+    GET = 'get'
+    HEAD = 'head'
+    OPTIONS = 'options'
+    PATCH = 'patch'
+    POST = 'post'
+    PUT = 'put'
+    TRACE = 'trace'
+
+    # -- Getters -- #
+
+    @property
+    def allows_body(self) -> bool:
+        """
+        Whether the method typically allows a request body.
+
+        Notes
+        -----
+        - RFCs do not strictly forbid bodies on some other methods (e.g.,
+          ``DELETE``), but many servers/clients do not expect them. We mark
+          ``POST``, ``PUT``, and ``PATCH`` as True.
+        """
+        return self in {HttpMethod.POST, HttpMethod.PUT, HttpMethod.PATCH}
+
+
 class OperatorName(CoercibleStrEnum):
     """Supported comparison operators with helpers."""

@@ -298,6 +454,13 @@ class PipelineStep(CoercibleStrEnum):
 # SECTION: INTERNAL CONSTANTS ============================================== #


+# Compression formats that are also file formats.
+_COMPRESSION_FILE_FORMATS: set[FileFormat] = {
+    FileFormat.GZ,
+    FileFormat.ZIP,
+}
+
+
 # Precomputed order index for PipelineStep; avoids recomputing on each access.
 _PIPELINE_ORDER_INDEX: dict[PipelineStep, int] = {
     PipelineStep.FILTER: 0,
@@ -306,3 +469,106 @@ _PIPELINE_ORDER_INDEX: dict[PipelineStep, int] = {
     PipelineStep.SORT: 3,
     PipelineStep.AGGREGATE: 4,
 }
+
+
+# SECTION: FUNCTIONS ======================================================== #
+
+
+def coerce_data_connector_type(
+    connector: DataConnectorType | str,
+) -> DataConnectorType:
+    """
+    Normalize textual data connector values to :class:`DataConnectorType`.
+
+    This thin wrapper is kept for backward compatibility; prefer
+    :meth:`DataConnectorType.coerce` going forward.
+    """
+    return DataConnectorType.coerce(connector)
+
+
+def coerce_file_format(
+    file_format: FileFormat | str,
+) -> FileFormat:
+    """
+    Normalize textual file format values to :class:`FileFormat`.
+
+    This thin wrapper is kept for backward compatibility; prefer
+    :meth:`FileFormat.coerce` going forward.
+    """
+    return FileFormat.coerce(file_format)
+
+
+def coerce_compression_format(
+    compression_format: CompressionFormat | str,
+) -> CompressionFormat:
+    """
+    Normalize textual compression format values to :class:`CompressionFormat`.
+
+    This thin wrapper is kept for backward compatibility; prefer
+    :meth:`CompressionFormat.coerce` going forward.
+    """
+    return CompressionFormat.coerce(compression_format)
+
+
+def coerce_http_method(
+    http_method: HttpMethod | str,
+) -> HttpMethod:
+    """
+    Normalize textual HTTP method values to :class:`HttpMethod`.
+
+    This thin wrapper is kept for backward compatibility; prefer
+    :meth:`HttpMethod.coerce` going forward.
+    """
+    return HttpMethod.coerce(http_method)
+
+
+def infer_file_format_and_compression(
+    value: object,
+) -> tuple[FileFormat | None, CompressionFormat | None]:
+    """
+    Infer data format and compression from a filename, extension, or MIME type.
+
+    Parameters
+    ----------
+    value : object
+        A filename, extension, MIME type, or existing enum member.
+
+    Returns
+    -------
+    tuple[FileFormat | None, CompressionFormat | None]
+        The inferred data format and compression, if any.
+    """
+    if isinstance(value, FileFormat):
+        if value in _COMPRESSION_FILE_FORMATS:
+            return None, CompressionFormat.coerce(value.value)
+        return value, None
+    if isinstance(value, CompressionFormat):
+        return None, value
+
+    text = str(value).strip()
+    if not text:
+        return None, None
+
+    normalized = text.casefold()
+    mime = normalized.split(';', 1)[0].strip()
+
+    compression = CompressionFormat.try_coerce(mime)
+    fmt = FileFormat.try_coerce(mime)
+
+    suffixes = PurePath(text).suffixes
+    if suffixes:
+        normalized_suffixes = [suffix.casefold() for suffix in suffixes]
+        compression = (
+            CompressionFormat.try_coerce(normalized_suffixes[-1])
+            or compression
+        )
+        if compression is not None:
+            normalized_suffixes = normalized_suffixes[:-1]
+        if normalized_suffixes:
+            fmt = FileFormat.try_coerce(normalized_suffixes[-1]) or fmt
+
+    if fmt in _COMPRESSION_FILE_FORMATS:
+        compression = compression or CompressionFormat.coerce(fmt.value)
+        fmt = None
+
+    return fmt, compression
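
A minimal sketch of the new enum helpers, assuming CoercibleStrEnum.coerce and try_coerce resolve the alias tables shown above (the filenames are illustrative; the expected results follow the infer_file_format_and_compression logic in this diff):

    from etlplus.enums import FileFormat, infer_file_format_and_compression

    FileFormat.coerce('.parquet')                           # FileFormat.PARQUET
    infer_file_format_and_compression('report.csv')         # (FileFormat.CSV, None)
    infer_file_format_and_compression('report.csv.gz')      # (FileFormat.CSV, CompressionFormat.GZ)
    infer_file_format_and_compression('application/json')   # (FileFormat.JSON, None)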