etlplus 0.9.2__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/__init__.py +26 -1
- etlplus/api/README.md +3 -51
- etlplus/api/__init__.py +0 -10
- etlplus/api/config.py +28 -39
- etlplus/api/endpoint_client.py +3 -3
- etlplus/api/pagination/client.py +1 -1
- etlplus/api/rate_limiting/config.py +1 -13
- etlplus/api/rate_limiting/rate_limiter.py +11 -8
- etlplus/api/request_manager.py +6 -11
- etlplus/api/transport.py +2 -14
- etlplus/api/types.py +6 -96
- etlplus/cli/commands.py +43 -76
- etlplus/cli/constants.py +1 -1
- etlplus/cli/handlers.py +12 -40
- etlplus/cli/io.py +2 -2
- etlplus/cli/main.py +1 -1
- etlplus/cli/state.py +7 -4
- etlplus/{workflow → config}/__init__.py +23 -10
- etlplus/{workflow → config}/connector.py +44 -58
- etlplus/{workflow → config}/jobs.py +32 -105
- etlplus/{workflow → config}/pipeline.py +51 -59
- etlplus/{workflow → config}/profile.py +5 -8
- etlplus/config/types.py +204 -0
- etlplus/config/utils.py +120 -0
- etlplus/database/ddl.py +1 -1
- etlplus/database/engine.py +3 -19
- etlplus/database/orm.py +0 -2
- etlplus/database/schema.py +1 -1
- etlplus/enums.py +266 -0
- etlplus/{ops/extract.py → extract.py} +99 -81
- etlplus/file.py +652 -0
- etlplus/{ops/load.py → load.py} +101 -78
- etlplus/{ops/run.py → run.py} +127 -159
- etlplus/{api/utils.py → run_helpers.py} +153 -209
- etlplus/{ops/transform.py → transform.py} +68 -75
- etlplus/types.py +4 -5
- etlplus/utils.py +2 -136
- etlplus/{ops/validate.py → validate.py} +12 -22
- etlplus/validation/__init__.py +44 -0
- etlplus/{ops → validation}/utils.py +17 -53
- {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/METADATA +17 -210
- etlplus-0.10.1.dist-info/RECORD +65 -0
- {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/WHEEL +1 -1
- etlplus/README.md +0 -37
- etlplus/api/enums.py +0 -51
- etlplus/cli/README.md +0 -40
- etlplus/database/README.md +0 -48
- etlplus/file/README.md +0 -105
- etlplus/file/__init__.py +0 -25
- etlplus/file/_imports.py +0 -141
- etlplus/file/_io.py +0 -160
- etlplus/file/accdb.py +0 -78
- etlplus/file/arrow.py +0 -78
- etlplus/file/avro.py +0 -176
- etlplus/file/bson.py +0 -77
- etlplus/file/cbor.py +0 -78
- etlplus/file/cfg.py +0 -79
- etlplus/file/conf.py +0 -80
- etlplus/file/core.py +0 -322
- etlplus/file/csv.py +0 -79
- etlplus/file/dat.py +0 -78
- etlplus/file/dta.py +0 -77
- etlplus/file/duckdb.py +0 -78
- etlplus/file/enums.py +0 -343
- etlplus/file/feather.py +0 -111
- etlplus/file/fwf.py +0 -77
- etlplus/file/gz.py +0 -123
- etlplus/file/hbs.py +0 -78
- etlplus/file/hdf5.py +0 -78
- etlplus/file/ini.py +0 -79
- etlplus/file/ion.py +0 -78
- etlplus/file/jinja2.py +0 -78
- etlplus/file/json.py +0 -98
- etlplus/file/log.py +0 -78
- etlplus/file/mat.py +0 -78
- etlplus/file/mdb.py +0 -78
- etlplus/file/msgpack.py +0 -78
- etlplus/file/mustache.py +0 -78
- etlplus/file/nc.py +0 -78
- etlplus/file/ndjson.py +0 -108
- etlplus/file/numbers.py +0 -75
- etlplus/file/ods.py +0 -79
- etlplus/file/orc.py +0 -111
- etlplus/file/parquet.py +0 -113
- etlplus/file/pb.py +0 -78
- etlplus/file/pbf.py +0 -77
- etlplus/file/properties.py +0 -78
- etlplus/file/proto.py +0 -77
- etlplus/file/psv.py +0 -79
- etlplus/file/rda.py +0 -78
- etlplus/file/rds.py +0 -78
- etlplus/file/sas7bdat.py +0 -78
- etlplus/file/sav.py +0 -77
- etlplus/file/sqlite.py +0 -78
- etlplus/file/stub.py +0 -84
- etlplus/file/sylk.py +0 -77
- etlplus/file/tab.py +0 -81
- etlplus/file/toml.py +0 -78
- etlplus/file/tsv.py +0 -80
- etlplus/file/txt.py +0 -102
- etlplus/file/vm.py +0 -78
- etlplus/file/wks.py +0 -77
- etlplus/file/xls.py +0 -88
- etlplus/file/xlsm.py +0 -79
- etlplus/file/xlsx.py +0 -99
- etlplus/file/xml.py +0 -185
- etlplus/file/xpt.py +0 -78
- etlplus/file/yaml.py +0 -95
- etlplus/file/zip.py +0 -175
- etlplus/file/zsav.py +0 -77
- etlplus/ops/README.md +0 -50
- etlplus/ops/__init__.py +0 -61
- etlplus/templates/README.md +0 -46
- etlplus/workflow/README.md +0 -52
- etlplus/workflow/dag.py +0 -105
- etlplus/workflow/types.py +0 -115
- etlplus-0.9.2.dist-info/RECORD +0 -134
- {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/entry_points.txt +0 -0
- {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
:mod:`etlplus.
|
|
2
|
+
:mod:`etlplus.extract` module.
|
|
3
3
|
|
|
4
4
|
Helpers to extract data from files, databases, and REST APIs.
|
|
5
5
|
"""
|
|
@@ -10,81 +10,58 @@ from pathlib import Path
|
|
|
10
10
|
from typing import Any
|
|
11
11
|
from typing import cast
|
|
12
12
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
from
|
|
16
|
-
from
|
|
17
|
-
from
|
|
18
|
-
from
|
|
19
|
-
from
|
|
20
|
-
from
|
|
21
|
-
from
|
|
13
|
+
import requests # type: ignore[import]
|
|
14
|
+
|
|
15
|
+
from .enums import DataConnectorType
|
|
16
|
+
from .enums import FileFormat
|
|
17
|
+
from .enums import HttpMethod
|
|
18
|
+
from .enums import coerce_data_connector_type
|
|
19
|
+
from .enums import coerce_file_format
|
|
20
|
+
from .file import File
|
|
21
|
+
from .types import JSONData
|
|
22
|
+
from .types import JSONDict
|
|
23
|
+
from .types import JSONList
|
|
24
|
+
from .types import StrPath
|
|
22
25
|
|
|
23
26
|
# SECTION: FUNCTIONS ======================================================== #
|
|
24
27
|
|
|
25
28
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
29
|
+
# -- File Extraction -- #
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def extract_from_file(
|
|
33
|
+
file_path: StrPath,
|
|
34
|
+
file_format: FileFormat | str | None = FileFormat.JSON,
|
|
30
35
|
) -> JSONData:
|
|
31
36
|
"""
|
|
32
|
-
Extract data from a
|
|
37
|
+
Extract (semi-)structured data from a local file.
|
|
33
38
|
|
|
34
39
|
Parameters
|
|
35
40
|
----------
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
(for example, ``timeout``). To use a pre-configured
|
|
43
|
-
:class:`requests.Session`, provide it via ``session``.
|
|
44
|
-
When omitted, ``timeout`` defaults to 10 seconds.
|
|
41
|
+
file_path : StrPath
|
|
42
|
+
Source file path.
|
|
43
|
+
file_format : FileFormat | str | None, optional
|
|
44
|
+
File format to parse. If ``None``, infer from the filename
|
|
45
|
+
extension. Defaults to `'json'` for backward compatibility when
|
|
46
|
+
explicitly provided.
|
|
45
47
|
|
|
46
48
|
Returns
|
|
47
49
|
-------
|
|
48
50
|
JSONData
|
|
49
|
-
Parsed
|
|
50
|
-
|
|
51
|
-
Raises
|
|
52
|
-
------
|
|
53
|
-
TypeError
|
|
54
|
-
If a provided ``session`` does not expose the required HTTP
|
|
55
|
-
method (for example, ``get``).
|
|
51
|
+
Parsed data as a mapping or a list of mappings.
|
|
56
52
|
"""
|
|
57
|
-
|
|
58
|
-
session = kwargs.pop('session', None)
|
|
59
|
-
request_callable, timeout, _ = resolve_request(
|
|
60
|
-
method,
|
|
61
|
-
session=session,
|
|
62
|
-
timeout=timeout,
|
|
63
|
-
)
|
|
64
|
-
response = request_callable(url, timeout=timeout, **kwargs)
|
|
65
|
-
response.raise_for_status()
|
|
53
|
+
path = Path(file_path)
|
|
66
54
|
|
|
67
|
-
|
|
68
|
-
if
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
except ValueError:
|
|
72
|
-
# Malformed JSON despite content-type; fall back to text
|
|
73
|
-
return {
|
|
74
|
-
'content': response.text,
|
|
75
|
-
'content_type': content_type,
|
|
76
|
-
}
|
|
77
|
-
if isinstance(payload, dict):
|
|
78
|
-
return cast(JSONDict, payload)
|
|
79
|
-
if isinstance(payload, list):
|
|
80
|
-
if all(isinstance(x, dict) for x in payload):
|
|
81
|
-
return cast(JSONList, payload)
|
|
82
|
-
# Coerce non-dict array items into objects for consistency
|
|
83
|
-
return [{'value': x} for x in payload]
|
|
84
|
-
# Fallback: wrap scalar JSON
|
|
85
|
-
return {'value': payload}
|
|
55
|
+
# If no explicit format is provided, let File infer from extension.
|
|
56
|
+
if file_format is None:
|
|
57
|
+
return File(path, None).read()
|
|
58
|
+
fmt = coerce_file_format(file_format)
|
|
86
59
|
|
|
87
|
-
|
|
60
|
+
# Let file module perform existence and format validation.
|
|
61
|
+
return File(path, fmt).read()
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# -- Database Extraction (Placeholder) -- #
|
|
88
65
|
|
|
89
66
|
|
|
90
67
|
def extract_from_database(
|
|
@@ -119,36 +96,77 @@ def extract_from_database(
|
|
|
119
96
|
]
|
|
120
97
|
|
|
121
98
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
99
|
+
# -- REST API Extraction -- #
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def extract_from_api(
|
|
103
|
+
url: str,
|
|
104
|
+
method: HttpMethod | str = HttpMethod.GET,
|
|
105
|
+
**kwargs: Any,
|
|
125
106
|
) -> JSONData:
|
|
126
107
|
"""
|
|
127
|
-
Extract
|
|
108
|
+
Extract data from a REST API.
|
|
128
109
|
|
|
129
110
|
Parameters
|
|
130
111
|
----------
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
112
|
+
url : str
|
|
113
|
+
API endpoint URL.
|
|
114
|
+
method : HttpMethod | str, optional
|
|
115
|
+
HTTP method to use. Defaults to ``GET``.
|
|
116
|
+
**kwargs : Any
|
|
117
|
+
Extra arguments forwarded to the underlying ``requests`` call
|
|
118
|
+
(for example, ``timeout``). To use a pre-configured
|
|
119
|
+
:class:`requests.Session`, provide it via ``session``.
|
|
137
120
|
|
|
138
121
|
Returns
|
|
139
122
|
-------
|
|
140
123
|
JSONData
|
|
141
|
-
Parsed
|
|
124
|
+
Parsed JSON payload, or a fallback object with raw text.
|
|
125
|
+
|
|
126
|
+
Raises
|
|
127
|
+
------
|
|
128
|
+
TypeError
|
|
129
|
+
If a provided ``session`` does not expose the required HTTP
|
|
130
|
+
method (for example, ``get``).
|
|
142
131
|
"""
|
|
143
|
-
|
|
132
|
+
http_method = HttpMethod.coerce(method)
|
|
144
133
|
|
|
145
|
-
#
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
134
|
+
# Apply a conservative timeout to guard against hanging requests.
|
|
135
|
+
timeout = kwargs.pop('timeout', 10.0)
|
|
136
|
+
session = kwargs.pop('session', None)
|
|
137
|
+
requester = session or requests
|
|
149
138
|
|
|
150
|
-
|
|
151
|
-
|
|
139
|
+
request_callable = getattr(requester, http_method.value, None)
|
|
140
|
+
if not callable(request_callable):
|
|
141
|
+
raise TypeError(
|
|
142
|
+
'Session object must supply a callable'
|
|
143
|
+
f'"{http_method.value}" method',
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
response = request_callable(url, timeout=timeout, **kwargs)
|
|
147
|
+
response.raise_for_status()
|
|
148
|
+
|
|
149
|
+
content_type = response.headers.get('content-type', '').lower()
|
|
150
|
+
if 'application/json' in content_type:
|
|
151
|
+
try:
|
|
152
|
+
payload: Any = response.json()
|
|
153
|
+
except ValueError:
|
|
154
|
+
# Malformed JSON despite content-type; fall back to text
|
|
155
|
+
return {
|
|
156
|
+
'content': response.text,
|
|
157
|
+
'content_type': content_type,
|
|
158
|
+
}
|
|
159
|
+
if isinstance(payload, dict):
|
|
160
|
+
return cast(JSONDict, payload)
|
|
161
|
+
if isinstance(payload, list):
|
|
162
|
+
if all(isinstance(x, dict) for x in payload):
|
|
163
|
+
return cast(JSONList, payload)
|
|
164
|
+
# Coerce non-dict array items into objects for consistency
|
|
165
|
+
return [{'value': x} for x in payload]
|
|
166
|
+
# Fallback: wrap scalar JSON
|
|
167
|
+
return {'value': payload}
|
|
168
|
+
|
|
169
|
+
return {'content': response.text, 'content_type': content_type}
|
|
152
170
|
|
|
153
171
|
|
|
154
172
|
# -- Orchestration -- #
|
|
@@ -184,7 +202,7 @@ def extract(
|
|
|
184
202
|
ValueError
|
|
185
203
|
If `source_type` is not one of the supported values.
|
|
186
204
|
"""
|
|
187
|
-
match
|
|
205
|
+
match coerce_data_connector_type(source_type):
|
|
188
206
|
case DataConnectorType.FILE:
|
|
189
207
|
# Prefer explicit format if provided, else infer from filename.
|
|
190
208
|
return extract_from_file(source, file_format)
|
|
@@ -195,6 +213,6 @@ def extract(
|
|
|
195
213
|
# ``file_format`` is ignored for APIs.
|
|
196
214
|
return extract_from_api(str(source), **kwargs)
|
|
197
215
|
case _:
|
|
198
|
-
#
|
|
199
|
-
#
|
|
216
|
+
# ``coerce_data_connector_type`` covers invalid entries, but keep
|
|
217
|
+
# explicit guard for defensive programming.
|
|
200
218
|
raise ValueError(f'Invalid source type: {source_type}')
|