etlplus 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/__init__.py +43 -0
- etlplus/__main__.py +22 -0
- etlplus/__version__.py +14 -0
- etlplus/api/README.md +237 -0
- etlplus/api/__init__.py +136 -0
- etlplus/api/auth.py +432 -0
- etlplus/api/config.py +633 -0
- etlplus/api/endpoint_client.py +885 -0
- etlplus/api/errors.py +170 -0
- etlplus/api/pagination/__init__.py +47 -0
- etlplus/api/pagination/client.py +188 -0
- etlplus/api/pagination/config.py +440 -0
- etlplus/api/pagination/paginator.py +775 -0
- etlplus/api/rate_limiting/__init__.py +38 -0
- etlplus/api/rate_limiting/config.py +343 -0
- etlplus/api/rate_limiting/rate_limiter.py +266 -0
- etlplus/api/request_manager.py +589 -0
- etlplus/api/retry_manager.py +430 -0
- etlplus/api/transport.py +325 -0
- etlplus/api/types.py +172 -0
- etlplus/cli/__init__.py +15 -0
- etlplus/cli/app.py +1367 -0
- etlplus/cli/handlers.py +775 -0
- etlplus/cli/main.py +616 -0
- etlplus/config/__init__.py +56 -0
- etlplus/config/connector.py +372 -0
- etlplus/config/jobs.py +311 -0
- etlplus/config/pipeline.py +339 -0
- etlplus/config/profile.py +78 -0
- etlplus/config/types.py +204 -0
- etlplus/config/utils.py +120 -0
- etlplus/ddl.py +197 -0
- etlplus/enums.py +414 -0
- etlplus/extract.py +218 -0
- etlplus/file.py +657 -0
- etlplus/load.py +336 -0
- etlplus/mixins.py +62 -0
- etlplus/py.typed +0 -0
- etlplus/run.py +368 -0
- etlplus/run_helpers.py +843 -0
- etlplus/templates/__init__.py +5 -0
- etlplus/templates/ddl.sql.j2 +128 -0
- etlplus/templates/view.sql.j2 +69 -0
- etlplus/transform.py +1049 -0
- etlplus/types.py +227 -0
- etlplus/utils.py +638 -0
- etlplus/validate.py +493 -0
- etlplus/validation/__init__.py +44 -0
- etlplus/validation/utils.py +389 -0
- etlplus-0.5.4.dist-info/METADATA +616 -0
- etlplus-0.5.4.dist-info/RECORD +55 -0
- etlplus-0.5.4.dist-info/WHEEL +5 -0
- etlplus-0.5.4.dist-info/entry_points.txt +2 -0
- etlplus-0.5.4.dist-info/licenses/LICENSE +21 -0
- etlplus-0.5.4.dist-info/top_level.txt +1 -0
etlplus/enums.py
ADDED
|
@@ -0,0 +1,414 @@
|
|
|
1
|
+
"""
|
|
2
|
+
:mod:`etlplus.enums` module.
|
|
3
|
+
|
|
4
|
+
Shared enumeration types used across ETLPlus modules.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import enum
|
|
10
|
+
import operator as _op
|
|
11
|
+
from statistics import fmean
|
|
12
|
+
from typing import Self
|
|
13
|
+
|
|
14
|
+
from .types import AggregateFunc
|
|
15
|
+
from .types import OperatorFunc
|
|
16
|
+
from .types import StrStrMap
|
|
17
|
+
|
|
18
|
+
# SECTION: EXPORTS ========================================================== #
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
'AggregateName',
|
|
23
|
+
'CoercibleStrEnum',
|
|
24
|
+
'DataConnectorType',
|
|
25
|
+
'FileFormat',
|
|
26
|
+
'HttpMethod',
|
|
27
|
+
'OperatorName',
|
|
28
|
+
'PipelineStep',
|
|
29
|
+
'coerce_data_connector_type',
|
|
30
|
+
'coerce_file_format',
|
|
31
|
+
'coerce_http_method',
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# SECTION: CLASSES ========================================================== #
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class CoercibleStrEnum(enum.StrEnum):
|
|
39
|
+
"""
|
|
40
|
+
StrEnum with ergonomic helpers.
|
|
41
|
+
|
|
42
|
+
Provides a DRY, class-level :meth:`coerce` that normalizes inputs and
|
|
43
|
+
produces consistent, informative error messages. Also exposes
|
|
44
|
+
:meth:`choices` for UI/validation and :meth:`try_coerce` for soft parsing.
|
|
45
|
+
|
|
46
|
+
Notes
|
|
47
|
+
-----
|
|
48
|
+
- Values are normalized via ``str(value).strip().casefold()``.
|
|
49
|
+
- Error messages enumerate allowed values for easier debugging.
|
|
50
|
+
"""
|
|
51
|
+
|
|
52
|
+
# -- Class Methods -- #
|
|
53
|
+
|
|
54
|
+
@classmethod
|
|
55
|
+
def aliases(cls) -> StrStrMap:
|
|
56
|
+
"""
|
|
57
|
+
Return a mapping of common aliases for each enum member.
|
|
58
|
+
|
|
59
|
+
Subclasses may override this method to provide custom aliases.
|
|
60
|
+
|
|
61
|
+
Returns
|
|
62
|
+
-------
|
|
63
|
+
StrStrMap
|
|
64
|
+
A mapping of alias names to their corresponding enum member names.
|
|
65
|
+
"""
|
|
66
|
+
return {}
|
|
67
|
+
|
|
68
|
+
@classmethod
|
|
69
|
+
def choices(cls) -> tuple[str, ...]:
|
|
70
|
+
"""
|
|
71
|
+
Return the allowed string values for this enum.
|
|
72
|
+
|
|
73
|
+
Returns
|
|
74
|
+
-------
|
|
75
|
+
tuple[str, ...]
|
|
76
|
+
A tuple of allowed string values for this enum.
|
|
77
|
+
"""
|
|
78
|
+
return tuple(member.value for member in cls)
|
|
79
|
+
|
|
80
|
+
@classmethod
|
|
81
|
+
def coerce(cls, value: Self | str | object) -> Self:
|
|
82
|
+
"""
|
|
83
|
+
Convert an enum member or string-like input to a member of ``cls``.
|
|
84
|
+
|
|
85
|
+
Parameters
|
|
86
|
+
----------
|
|
87
|
+
value : Self | str | object
|
|
88
|
+
An existing enum member or a text value to normalize.
|
|
89
|
+
|
|
90
|
+
Returns
|
|
91
|
+
-------
|
|
92
|
+
Self
|
|
93
|
+
The corresponding enum member.
|
|
94
|
+
|
|
95
|
+
Raises
|
|
96
|
+
------
|
|
97
|
+
ValueError
|
|
98
|
+
If the value cannot be coerced into a valid member.
|
|
99
|
+
"""
|
|
100
|
+
if isinstance(value, cls):
|
|
101
|
+
return value
|
|
102
|
+
try:
|
|
103
|
+
normalized = str(value).strip().casefold()
|
|
104
|
+
resolved = cls.aliases().get(normalized, normalized)
|
|
105
|
+
return cls(resolved) # type: ignore[arg-type]
|
|
106
|
+
except (ValueError, TypeError) as e:
|
|
107
|
+
allowed = ', '.join(cls.choices())
|
|
108
|
+
raise ValueError(
|
|
109
|
+
f'Invalid {cls.__name__} value: {value!r}. Allowed: {allowed}',
|
|
110
|
+
) from e
|
|
111
|
+
|
|
112
|
+
@classmethod
|
|
113
|
+
def try_coerce(
|
|
114
|
+
cls,
|
|
115
|
+
value: object,
|
|
116
|
+
) -> Self | None:
|
|
117
|
+
"""
|
|
118
|
+
Best-effort parse; return ``None`` on failure instead of raising.
|
|
119
|
+
|
|
120
|
+
Parameters
|
|
121
|
+
----------
|
|
122
|
+
value : object
|
|
123
|
+
An existing enum member or a text value to normalize.
|
|
124
|
+
|
|
125
|
+
Returns
|
|
126
|
+
-------
|
|
127
|
+
Self | None
|
|
128
|
+
The corresponding enum member, or ``None`` if coercion fails.
|
|
129
|
+
"""
|
|
130
|
+
try:
|
|
131
|
+
return cls.coerce(value)
|
|
132
|
+
except ValueError:
|
|
133
|
+
return None
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
# SECTION: ENUMS ============================================================ #
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
class AggregateName(CoercibleStrEnum):
|
|
140
|
+
"""Supported aggregations with helpers."""
|
|
141
|
+
|
|
142
|
+
# -- Constants -- #
|
|
143
|
+
|
|
144
|
+
AVG = 'avg'
|
|
145
|
+
COUNT = 'count'
|
|
146
|
+
MAX = 'max'
|
|
147
|
+
MIN = 'min'
|
|
148
|
+
SUM = 'sum'
|
|
149
|
+
|
|
150
|
+
# -- Class Methods -- #
|
|
151
|
+
|
|
152
|
+
@property
|
|
153
|
+
def func(self) -> AggregateFunc:
|
|
154
|
+
"""
|
|
155
|
+
Get the aggregation function for this aggregation type.
|
|
156
|
+
|
|
157
|
+
Returns
|
|
158
|
+
-------
|
|
159
|
+
AggregateFunc
|
|
160
|
+
The aggregation function corresponding to this aggregation type.
|
|
161
|
+
"""
|
|
162
|
+
if self is AggregateName.COUNT:
|
|
163
|
+
return lambda xs, n: n
|
|
164
|
+
if self is AggregateName.MAX:
|
|
165
|
+
return lambda xs, n: (max(xs) if xs else None)
|
|
166
|
+
if self is AggregateName.MIN:
|
|
167
|
+
return lambda xs, n: (min(xs) if xs else None)
|
|
168
|
+
if self is AggregateName.SUM:
|
|
169
|
+
return lambda xs, n: sum(xs)
|
|
170
|
+
|
|
171
|
+
# AVG
|
|
172
|
+
return lambda xs, n: (fmean(xs) if xs else 0.0)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
class DataConnectorType(CoercibleStrEnum):
    """Kinds of data connectors a source or target can use."""

    # -- Constants -- #

    API = 'api'
    DATABASE = 'database'
    FILE = 'file'

    # -- Class Methods -- #

    @classmethod
    def aliases(cls) -> StrStrMap:
        """
        Return the alternative spellings accepted for connector types.

        Returns
        -------
        StrStrMap
            Mapping of alias text to the member value it stands for.
        """
        # Aliases grouped by the member value they resolve to.
        grouped = {
            'api': ('http', 'https', 'rest'),
            'database': ('db',),
            'file': ('filesystem', 'fs'),
        }
        return {
            alias: target
            for target, names in grouped.items()
            for alias in names
        }
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
class FileFormat(CoercibleStrEnum):
    """File formats recognized by the file helpers."""

    # -- Constants -- #

    CSV = 'csv'
    JSON = 'json'
    XML = 'xml'
    YAML = 'yaml'

    # -- Class Methods -- #

    @classmethod
    def aliases(cls) -> StrStrMap:
        """
        Return the alternative spellings accepted for file formats.

        Returns
        -------
        StrStrMap
            Mapping of alias text (a shorthand extension or a MIME type) to
            the member value it stands for.
        """
        shorthand = {'yml': 'yaml'}
        mime_types = {
            'text/csv': 'csv',
            'application/json': 'json',
            'application/xml': 'xml',
        }
        return shorthand | mime_types
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
class HttpMethod(CoercibleStrEnum):
|
|
239
|
+
"""Supported HTTP verbs that accept JSON payloads."""
|
|
240
|
+
|
|
241
|
+
# -- Constants -- #
|
|
242
|
+
|
|
243
|
+
CONNECT = 'connect'
|
|
244
|
+
DELETE = 'delete'
|
|
245
|
+
GET = 'get'
|
|
246
|
+
HEAD = 'head'
|
|
247
|
+
OPTIONS = 'options'
|
|
248
|
+
PATCH = 'patch'
|
|
249
|
+
POST = 'post'
|
|
250
|
+
PUT = 'put'
|
|
251
|
+
TRACE = 'trace'
|
|
252
|
+
|
|
253
|
+
# -- Getters -- #
|
|
254
|
+
|
|
255
|
+
@property
|
|
256
|
+
def allows_body(self) -> bool:
|
|
257
|
+
"""
|
|
258
|
+
Whether the method typically allows a request body.
|
|
259
|
+
|
|
260
|
+
Notes
|
|
261
|
+
-----
|
|
262
|
+
- RFCs do not strictly forbid bodies on some other methods (e.g.,
|
|
263
|
+
``DELETE``), but many servers/clients do not expect them. We mark
|
|
264
|
+
``POST``, ``PUT``, and ``PATCH`` as True.
|
|
265
|
+
"""
|
|
266
|
+
return self in {HttpMethod.POST, HttpMethod.PUT, HttpMethod.PATCH}
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
class OperatorName(CoercibleStrEnum):
    """Names of the supported comparison operators, each with a callable."""

    # -- Constants -- #

    EQ = 'eq'
    NE = 'ne'
    GT = 'gt'
    GTE = 'gte'
    LT = 'lt'
    LTE = 'lte'
    IN = 'in'
    CONTAINS = 'contains'

    # -- Getters -- #

    @property
    def func(self) -> OperatorFunc:
        """
        Look up the two-argument comparison callable for this operator.

        Returns
        -------
        OperatorFunc
            A callable taking ``(a, b)``. ``IN`` evaluates ``a in b`` and
            ``CONTAINS`` evaluates ``b in a``; the remaining members map to
            the matching :mod:`operator` function.
        """
        dispatch: dict[OperatorName, OperatorFunc] = {
            OperatorName.EQ: _op.eq,
            OperatorName.NE: _op.ne,
            OperatorName.GT: _op.gt,
            OperatorName.GTE: _op.ge,
            OperatorName.LT: _op.lt,
            OperatorName.LTE: _op.le,
            OperatorName.IN: lambda a, b: a in b,
            OperatorName.CONTAINS: lambda a, b: b in a,
        }
        return dispatch[self]

    # -- Class Methods -- #

    @classmethod
    def aliases(cls) -> StrStrMap:
        """
        Return the symbolic spellings accepted for operators.

        Returns
        -------
        StrStrMap
            Mapping of operator symbols to the member value they stand for.
        """
        # Symbols grouped by the member value they resolve to.
        grouped = {
            'eq': ('==', '='),
            'ne': ('!=', '<>'),
            'gte': ('>=', '≥'),
            'lte': ('<=', '≤'),
            'gt': ('>',),
            'lt': ('<',),
        }
        return {
            symbol: target
            for target, symbols in grouped.items()
            for symbol in symbols
        }
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
class PipelineStep(CoercibleStrEnum):
    """Names of the transform pipeline steps used for orchestration."""

    # -- Constants -- #

    FILTER = 'filter'
    MAP = 'map'
    SELECT = 'select'
    SORT = 'sort'
    AGGREGATE = 'aggregate'

    # -- Getters -- #

    @property
    def order(self) -> int:
        """
        Return the position of this step in the execution order.

        Returns
        -------
        int
            The index stored for this step in the module-level
            ``_PIPELINE_ORDER_INDEX`` table.
        """
        position = _PIPELINE_ORDER_INDEX[self]
        return position
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
# SECTION: INTERNAL CONSTANTS ============================================== #
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
# Execution order of every PipelineStep, built once at import time so that
# ``PipelineStep.order`` is a plain dictionary lookup.
_PIPELINE_ORDER_INDEX: dict[PipelineStep, int] = {
    step: position
    for position, step in enumerate(
        (
            PipelineStep.FILTER,
            PipelineStep.MAP,
            PipelineStep.SELECT,
            PipelineStep.SORT,
            PipelineStep.AGGREGATE,
        ),
    )
}
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def coerce_data_connector_type(
    connector: DataConnectorType | str,
) -> DataConnectorType:
    """
    Convert a textual connector type into a :class:`DataConnectorType`.

    Backward-compatible shim that delegates to
    :meth:`DataConnectorType.coerce`; new code should call that method
    directly.

    Parameters
    ----------
    connector : DataConnectorType | str
        An existing member or a text value to normalize.

    Returns
    -------
    DataConnectorType
        The matching enum member.
    """
    member = DataConnectorType.coerce(connector)
    return member
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def coerce_file_format(
    file_format: FileFormat | str,
) -> FileFormat:
    """
    Convert a textual file format into a :class:`FileFormat`.

    Backward-compatible shim that delegates to :meth:`FileFormat.coerce`;
    new code should call that method directly.

    Parameters
    ----------
    file_format : FileFormat | str
        An existing member or a text value to normalize.

    Returns
    -------
    FileFormat
        The matching enum member.
    """
    member = FileFormat.coerce(file_format)
    return member
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
def coerce_http_method(
    http_method: HttpMethod | str,
) -> HttpMethod:
    """
    Convert a textual HTTP method into an :class:`HttpMethod`.

    Backward-compatible shim that delegates to :meth:`HttpMethod.coerce`;
    new code should call that method directly.

    Parameters
    ----------
    http_method : HttpMethod | str
        An existing member or a text value to normalize.

    Returns
    -------
    HttpMethod
        The matching enum member.
    """
    member = HttpMethod.coerce(http_method)
    return member
|
etlplus/extract.py
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
"""
|
|
2
|
+
:mod:`etlplus.extract` module.
|
|
3
|
+
|
|
4
|
+
Helpers to extract data from files, databases, and REST APIs.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
from typing import cast
|
|
12
|
+
|
|
13
|
+
import requests # type: ignore[import]
|
|
14
|
+
|
|
15
|
+
from .enums import DataConnectorType
|
|
16
|
+
from .enums import FileFormat
|
|
17
|
+
from .enums import HttpMethod
|
|
18
|
+
from .enums import coerce_data_connector_type
|
|
19
|
+
from .enums import coerce_file_format
|
|
20
|
+
from .file import File
|
|
21
|
+
from .types import JSONData
|
|
22
|
+
from .types import JSONDict
|
|
23
|
+
from .types import JSONList
|
|
24
|
+
from .types import StrPath
|
|
25
|
+
|
|
26
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# -- File Extraction -- #
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def extract_from_file(
    file_path: StrPath,
    file_format: FileFormat | str | None = FileFormat.JSON,
) -> JSONData:
    """
    Read (semi-)structured data from a local file.

    Parameters
    ----------
    file_path : StrPath
        Location of the file to read.
    file_format : FileFormat | str | None, optional
        Format to parse the file as. Defaults to ``FileFormat.JSON``. Pass
        ``None`` to hand format selection over to :class:`File`.

    Returns
    -------
    JSONData
        Whatever ``File(...).read()`` returns for the file.
    """
    path = Path(file_path)

    # ``None`` is forwarded untouched so that File picks the format itself;
    # anything else is normalized to a FileFormat member first.
    resolved = None if file_format is None else coerce_file_format(file_format)

    # Existence and format validation are left to the file module.
    return File(path, resolved).read()
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# -- Database Extraction (Placeholder) -- #
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def extract_from_database(
    connection_string: str,
) -> JSONList:
    """
    Return a placeholder payload for database extraction.

    Notes
    -----
    No database is contacted. The function only reports that database
    extraction is unavailable until database-specific drivers and query
    logic are installed and configured.

    Parameters
    ----------
    connection_string : str
        Database connection string; echoed back in the payload.

    Returns
    -------
    JSONList
        A single-item list holding the informational message.
    """
    # NOTE(review): the connection string is echoed verbatim; if callers pass
    # credentials in it they end up in the output -- confirm this is intended.
    placeholder = dict(
        message='Database extraction not yet implemented',
        connection_string=connection_string,
        note='Install database-specific drivers to enable this feature',
    )
    return [placeholder]
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# -- REST API Extraction -- #
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def extract_from_api(
    url: str,
    method: HttpMethod | str = HttpMethod.GET,
    **kwargs: Any,
) -> JSONData:
    """
    Extract data from a REST API.

    Parameters
    ----------
    url : str
        API endpoint URL.
    method : HttpMethod | str, optional
        HTTP method to use. Defaults to ``GET``.
    **kwargs : Any
        Extra arguments forwarded to the underlying request call. Two keys
        are consumed here instead of being forwarded blindly: ``timeout``
        (defaults to ``10.0``) and ``session`` (an object used in place of
        the ``requests`` module, e.g. a pre-configured
        :class:`requests.Session`).

    Returns
    -------
    JSONData
        For ``application/json`` responses: the decoded object, a list of
        objects (non-object items are wrapped as ``{'value': item}``), or
        ``{'value': payload}`` for scalars. For any other content type, or
        when the JSON body cannot be decoded:
        ``{'content': <text>, 'content_type': <type>}``.

    Raises
    ------
    ValueError
        If ``method`` cannot be coerced into an :class:`HttpMethod`.
    TypeError
        If the requester (the provided ``session``, or ``requests`` when
        none is given) does not expose a callable named after the HTTP
        method (for example, ``get``).

    Notes
    -----
    Exceptions raised by ``response.raise_for_status()`` are not caught and
    propagate to the caller.
    """
    http_method = HttpMethod.coerce(method)

    # Apply a conservative timeout to guard against hanging requests.
    timeout = kwargs.pop('timeout', 10.0)
    session = kwargs.pop('session', None)
    # Test against ``None`` rather than truthiness so that a caller-supplied
    # session is honored even if the object happens to evaluate as falsy.
    requester = requests if session is None else session

    # NOTE(review): the ``requests`` module presumably has no ``connect`` or
    # ``trace`` helper, so those verbs likely need a session that provides
    # them -- confirm.
    request_callable = getattr(requester, http_method.value, None)
    if not callable(request_callable):
        raise TypeError(
            'Session object must supply a callable '
            f'"{http_method.value}" method',
        )

    response = request_callable(url, timeout=timeout, **kwargs)
    response.raise_for_status()

    content_type = response.headers.get('content-type', '').lower()
    if 'application/json' in content_type:
        try:
            payload: Any = response.json()
        except ValueError:
            # Malformed JSON despite content-type; fall back to text.
            return {
                'content': response.text,
                'content_type': content_type,
            }
        if isinstance(payload, dict):
            return cast(JSONDict, payload)
        if isinstance(payload, list):
            if all(isinstance(x, dict) for x in payload):
                return cast(JSONList, payload)
            # Coerce non-dict array items into objects for consistency.
            return [{'value': x} for x in payload]
        # Fallback: wrap scalar JSON.
        return {'value': payload}

    return {'content': response.text, 'content_type': content_type}
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
# -- Orchestration -- #
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def extract(
    source_type: DataConnectorType | str,
    source: StrPath,
    file_format: FileFormat | str | None = None,
    **kwargs: Any,
) -> JSONData:
    """
    Dispatch extraction to the file, database, or API extractor.

    Parameters
    ----------
    source_type : DataConnectorType | str
        Kind of data source.
    source : StrPath
        Source location (file path, connection string, or API URL).
    file_format : FileFormat | str | None, optional
        File format passed to :func:`extract_from_file`; only used for file
        sources. Defaults to ``None``.
    **kwargs : Any
        Additional arguments; forwarded to :func:`extract_from_api` only.

    Returns
    -------
    JSONData
        Extracted data.

    Raises
    ------
    ValueError
        If `source_type` is not one of the supported values.
    """
    connector = coerce_data_connector_type(source_type)

    if connector is DataConnectorType.FILE:
        # An explicit format wins; ``None`` is passed through unchanged.
        return extract_from_file(source, file_format)

    if connector is DataConnectorType.DATABASE:
        return extract_from_database(str(source))

    if connector is DataConnectorType.API:
        # The HTTP method comes from ``kwargs`` (GET when absent);
        # ``file_format`` plays no part for APIs.
        return extract_from_api(str(source), **kwargs)

    # Coercion above already rejects unknown values; this guard is kept as a
    # defensive backstop.
    raise ValueError(f'Invalid source type: {source_type}')
|