etlplus 0.9.2__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/__init__.py +26 -1
- etlplus/api/README.md +3 -51
- etlplus/api/__init__.py +0 -10
- etlplus/api/config.py +28 -39
- etlplus/api/endpoint_client.py +3 -3
- etlplus/api/pagination/client.py +1 -1
- etlplus/api/rate_limiting/config.py +1 -13
- etlplus/api/rate_limiting/rate_limiter.py +11 -8
- etlplus/api/request_manager.py +6 -11
- etlplus/api/transport.py +2 -14
- etlplus/api/types.py +6 -96
- etlplus/cli/commands.py +43 -76
- etlplus/cli/constants.py +1 -1
- etlplus/cli/handlers.py +12 -40
- etlplus/cli/io.py +2 -2
- etlplus/cli/main.py +1 -1
- etlplus/cli/state.py +7 -4
- etlplus/{workflow → config}/__init__.py +23 -10
- etlplus/{workflow → config}/connector.py +44 -58
- etlplus/{workflow → config}/jobs.py +32 -105
- etlplus/{workflow → config}/pipeline.py +51 -59
- etlplus/{workflow → config}/profile.py +5 -8
- etlplus/config/types.py +204 -0
- etlplus/config/utils.py +120 -0
- etlplus/database/ddl.py +1 -1
- etlplus/database/engine.py +3 -19
- etlplus/database/orm.py +0 -2
- etlplus/database/schema.py +1 -1
- etlplus/enums.py +288 -0
- etlplus/{ops/extract.py → extract.py} +99 -81
- etlplus/file.py +652 -0
- etlplus/{ops/load.py → load.py} +101 -78
- etlplus/{ops/run.py → run.py} +127 -159
- etlplus/{api/utils.py → run_helpers.py} +153 -209
- etlplus/{ops/transform.py → transform.py} +68 -75
- etlplus/types.py +4 -5
- etlplus/utils.py +2 -136
- etlplus/{ops/validate.py → validate.py} +12 -22
- etlplus/validation/__init__.py +44 -0
- etlplus/{ops → validation}/utils.py +17 -53
- {etlplus-0.9.2.dist-info → etlplus-0.10.2.dist-info}/METADATA +17 -210
- etlplus-0.10.2.dist-info/RECORD +65 -0
- {etlplus-0.9.2.dist-info → etlplus-0.10.2.dist-info}/WHEEL +1 -1
- etlplus/README.md +0 -37
- etlplus/api/enums.py +0 -51
- etlplus/cli/README.md +0 -40
- etlplus/database/README.md +0 -48
- etlplus/file/README.md +0 -105
- etlplus/file/__init__.py +0 -25
- etlplus/file/_imports.py +0 -141
- etlplus/file/_io.py +0 -160
- etlplus/file/accdb.py +0 -78
- etlplus/file/arrow.py +0 -78
- etlplus/file/avro.py +0 -176
- etlplus/file/bson.py +0 -77
- etlplus/file/cbor.py +0 -78
- etlplus/file/cfg.py +0 -79
- etlplus/file/conf.py +0 -80
- etlplus/file/core.py +0 -322
- etlplus/file/csv.py +0 -79
- etlplus/file/dat.py +0 -78
- etlplus/file/dta.py +0 -77
- etlplus/file/duckdb.py +0 -78
- etlplus/file/enums.py +0 -343
- etlplus/file/feather.py +0 -111
- etlplus/file/fwf.py +0 -77
- etlplus/file/gz.py +0 -123
- etlplus/file/hbs.py +0 -78
- etlplus/file/hdf5.py +0 -78
- etlplus/file/ini.py +0 -79
- etlplus/file/ion.py +0 -78
- etlplus/file/jinja2.py +0 -78
- etlplus/file/json.py +0 -98
- etlplus/file/log.py +0 -78
- etlplus/file/mat.py +0 -78
- etlplus/file/mdb.py +0 -78
- etlplus/file/msgpack.py +0 -78
- etlplus/file/mustache.py +0 -78
- etlplus/file/nc.py +0 -78
- etlplus/file/ndjson.py +0 -108
- etlplus/file/numbers.py +0 -75
- etlplus/file/ods.py +0 -79
- etlplus/file/orc.py +0 -111
- etlplus/file/parquet.py +0 -113
- etlplus/file/pb.py +0 -78
- etlplus/file/pbf.py +0 -77
- etlplus/file/properties.py +0 -78
- etlplus/file/proto.py +0 -77
- etlplus/file/psv.py +0 -79
- etlplus/file/rda.py +0 -78
- etlplus/file/rds.py +0 -78
- etlplus/file/sas7bdat.py +0 -78
- etlplus/file/sav.py +0 -77
- etlplus/file/sqlite.py +0 -78
- etlplus/file/stub.py +0 -84
- etlplus/file/sylk.py +0 -77
- etlplus/file/tab.py +0 -81
- etlplus/file/toml.py +0 -78
- etlplus/file/tsv.py +0 -80
- etlplus/file/txt.py +0 -102
- etlplus/file/vm.py +0 -78
- etlplus/file/wks.py +0 -77
- etlplus/file/xls.py +0 -88
- etlplus/file/xlsm.py +0 -79
- etlplus/file/xlsx.py +0 -99
- etlplus/file/xml.py +0 -185
- etlplus/file/xpt.py +0 -78
- etlplus/file/yaml.py +0 -95
- etlplus/file/zip.py +0 -175
- etlplus/file/zsav.py +0 -77
- etlplus/ops/README.md +0 -50
- etlplus/ops/__init__.py +0 -61
- etlplus/templates/README.md +0 -46
- etlplus/workflow/README.md +0 -52
- etlplus/workflow/dag.py +0 -105
- etlplus/workflow/types.py +0 -115
- etlplus-0.9.2.dist-info/RECORD +0 -134
- {etlplus-0.9.2.dist-info → etlplus-0.10.2.dist-info}/entry_points.txt +0 -0
- {etlplus-0.9.2.dist-info → etlplus-0.10.2.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.9.2.dist-info → etlplus-0.10.2.dist-info}/top_level.txt +0 -0
etlplus/enums.py
CHANGED
|
@@ -8,6 +8,7 @@ from __future__ import annotations
|
|
|
8
8
|
|
|
9
9
|
import enum
|
|
10
10
|
import operator as _op
|
|
11
|
+
from pathlib import PurePath
|
|
11
12
|
from statistics import fmean
|
|
12
13
|
from typing import Self
|
|
13
14
|
|
|
@@ -22,9 +23,18 @@ __all__ = [
|
|
|
22
23
|
# Enums
|
|
23
24
|
'AggregateName',
|
|
24
25
|
'CoercibleStrEnum',
|
|
26
|
+
'CompressionFormat',
|
|
25
27
|
'DataConnectorType',
|
|
28
|
+
'FileFormat',
|
|
29
|
+
'HttpMethod',
|
|
26
30
|
'OperatorName',
|
|
27
31
|
'PipelineStep',
|
|
32
|
+
# Functions
|
|
33
|
+
'coerce_compression_format',
|
|
34
|
+
'coerce_data_connector_type',
|
|
35
|
+
'coerce_file_format',
|
|
36
|
+
'coerce_http_method',
|
|
37
|
+
'infer_file_format_and_compression',
|
|
28
38
|
]
|
|
29
39
|
|
|
30
40
|
|
|
@@ -168,6 +178,39 @@ class AggregateName(CoercibleStrEnum):
|
|
|
168
178
|
return lambda xs, n: (fmean(xs) if xs else 0.0)
|
|
169
179
|
|
|
170
180
|
|
|
181
|
+
class CompressionFormat(CoercibleStrEnum):
    """Compression schemes recognized for data files."""

    # -- Constants -- #

    GZ = 'gz'
    ZIP = 'zip'

    # -- Class Methods -- #

    @classmethod
    def aliases(cls) -> StrStrMap:
        """
        Map alternative spellings onto canonical member values.

        Returns
        -------
        StrStrMap
            Alias text (a dotted file extension or a MIME type) mapped to
            the value of the member it stands for.
        """
        by_extension = {
            '.gz': 'gz',
            '.gzip': 'gz',
            '.zip': 'zip',
        }
        by_mime_type = {
            'application/gzip': 'gz',
            'application/x-gzip': 'gz',
            'application/zip': 'zip',
            'application/x-zip-compressed': 'zip',
        }
        # Extensions are listed ahead of MIME types in the merged mapping.
        return {**by_extension, **by_mime_type}
|
|
212
|
+
|
|
213
|
+
|
|
171
214
|
class DataConnectorType(CoercibleStrEnum):
|
|
172
215
|
"""Supported data connector types."""
|
|
173
216
|
|
|
@@ -199,6 +242,119 @@ class DataConnectorType(CoercibleStrEnum):
|
|
|
199
242
|
}
|
|
200
243
|
|
|
201
244
|
|
|
245
|
+
class FileFormat(CoercibleStrEnum):
    """File formats that extraction knows how to name."""

    # -- Constants -- #

    AVRO = 'avro'
    CSV = 'csv'
    FEATHER = 'feather'
    GZ = 'gz'
    JSON = 'json'
    NDJSON = 'ndjson'
    ORC = 'orc'
    PARQUET = 'parquet'
    TSV = 'tsv'
    TXT = 'txt'
    XLS = 'xls'
    XLSX = 'xlsx'
    ZIP = 'zip'
    XML = 'xml'
    YAML = 'yaml'

    # -- Class Methods -- #

    @classmethod
    def aliases(cls) -> StrStrMap:
        """
        Map alternative spellings onto canonical member values.

        Returns
        -------
        StrStrMap
            Alias text (shorthand, dotted file extension, or MIME type)
            mapped to the value of the member it stands for.
        """
        shorthand = {
            'parq': 'parquet',
            'yml': 'yaml',
        }
        by_extension = {
            '.avro': 'avro',
            '.csv': 'csv',
            '.feather': 'feather',
            '.gz': 'gz',
            '.json': 'json',
            '.jsonl': 'ndjson',
            '.ndjson': 'ndjson',
            '.orc': 'orc',
            '.parquet': 'parquet',
            '.pq': 'parquet',
            '.tsv': 'tsv',
            '.txt': 'txt',
            '.xls': 'xls',
            '.xlsx': 'xlsx',
            '.zip': 'zip',
            '.xml': 'xml',
            '.yaml': 'yaml',
            '.yml': 'yaml',
        }
        by_mime_type = {
            'application/avro': 'avro',
            'application/feather': 'feather',
            'application/gzip': 'gz',
            'application/json': 'json',
            'application/jsonlines': 'ndjson',
            'application/ndjson': 'ndjson',
            'application/orc': 'orc',
            'application/vnd.apache.arrow.file': 'feather',
            'application/vnd.apache.orc': 'orc',
            'application/vnd.ms-excel': 'xls',
            (
                'application/vnd.openxmlformats-'
                'officedocument.spreadsheetml.sheet'
            ): 'xlsx',
            'application/x-avro': 'avro',
            'application/x-ndjson': 'ndjson',
            'application/x-parquet': 'parquet',
            'application/xml': 'xml',
            'application/zip': 'zip',
            'text/csv': 'csv',
            'text/plain': 'txt',
            'text/tab-separated-values': 'tsv',
        }
        # Merged in the order shorthand -> extensions -> MIME types.
        return {**shorthand, **by_extension, **by_mime_type}
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
class HttpMethod(CoercibleStrEnum):
    """HTTP request methods, stored as lowercase strings."""

    # -- Constants -- #

    CONNECT = 'connect'
    DELETE = 'delete'
    GET = 'get'
    HEAD = 'head'
    OPTIONS = 'options'
    PATCH = 'patch'
    POST = 'post'
    PUT = 'put'
    TRACE = 'trace'

    # -- Getters -- #

    @property
    def allows_body(self) -> bool:
        """
        Tell whether this method conventionally carries a request body.

        Notes
        -----
        - Only ``POST``, ``PUT``, and ``PATCH`` report ``True``. Other
            methods (e.g., ``DELETE``) are not strictly barred from having
            a body by the RFCs, but many servers/clients do not expect one.
        """
        body_methods = (HttpMethod.POST, HttpMethod.PUT, HttpMethod.PATCH)
        return self in body_methods
|
|
356
|
+
|
|
357
|
+
|
|
202
358
|
class OperatorName(CoercibleStrEnum):
|
|
203
359
|
"""Supported comparison operators with helpers."""
|
|
204
360
|
|
|
@@ -298,6 +454,13 @@ class PipelineStep(CoercibleStrEnum):
|
|
|
298
454
|
# SECTION: INTERNAL CONSTANTS ============================================== #
|
|
299
455
|
|
|
300
456
|
|
|
457
|
+
# FileFormat members that name a compression container rather than a data
# layout; each shares its value with a CompressionFormat member.
_COMPRESSION_FILE_FORMATS: set[FileFormat] = {FileFormat.GZ, FileFormat.ZIP}
|
|
462
|
+
|
|
463
|
+
|
|
301
464
|
# Precomputed order index for PipelineStep; avoids recomputing on each access.
|
|
302
465
|
_PIPELINE_ORDER_INDEX: dict[PipelineStep, int] = {
|
|
303
466
|
PipelineStep.FILTER: 0,
|
|
@@ -306,3 +469,128 @@ _PIPELINE_ORDER_INDEX: dict[PipelineStep, int] = {
|
|
|
306
469
|
PipelineStep.SORT: 3,
|
|
307
470
|
PipelineStep.AGGREGATE: 4,
|
|
308
471
|
}
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
def coerce_data_connector_type(
    connector: DataConnectorType | str,
) -> DataConnectorType:
    """
    Convert a connector value into a :class:`DataConnectorType` member.

    Retained for backward compatibility; new code should call
    :meth:`DataConnectorType.coerce` directly, which is all this does.

    Parameters
    ----------
    connector : DataConnectorType | str
        Connector type as an enum member or its textual form.

    Returns
    -------
    DataConnectorType
        Whatever :meth:`DataConnectorType.coerce` returns for ``connector``.
    """
    normalize = DataConnectorType.coerce
    return normalize(connector)
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
def coerce_file_format(
    file_format: FileFormat | str,
) -> FileFormat:
    """
    Convert a file format value into a :class:`FileFormat` member.

    Retained for backward compatibility; new code should call
    :meth:`FileFormat.coerce` directly, which is all this does.

    Parameters
    ----------
    file_format : FileFormat | str
        File format as an enum member or its textual form.

    Returns
    -------
    FileFormat
        Whatever :meth:`FileFormat.coerce` returns for ``file_format``.
    """
    normalize = FileFormat.coerce
    return normalize(file_format)
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
def coerce_compression_format(
    compression_format: CompressionFormat | str,
) -> CompressionFormat:
    """
    Convert a compression value into a :class:`CompressionFormat` member.

    Retained for backward compatibility; new code should call
    :meth:`CompressionFormat.coerce` directly, which is all this does.

    Parameters
    ----------
    compression_format : CompressionFormat | str
        Compression format as an enum member or its textual form.

    Returns
    -------
    CompressionFormat
        Whatever :meth:`CompressionFormat.coerce` returns for
        ``compression_format``.
    """
    normalize = CompressionFormat.coerce
    return normalize(compression_format)
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
def coerce_http_method(
    http_method: HttpMethod | str,
) -> HttpMethod:
    """
    Convert an HTTP method value into an :class:`HttpMethod` member.

    Retained for backward compatibility; new code should call
    :meth:`HttpMethod.coerce` directly, which is all this does.

    Parameters
    ----------
    http_method : HttpMethod | str
        HTTP method as an enum member or its textual form.

    Returns
    -------
    HttpMethod
        Whatever :meth:`HttpMethod.coerce` returns for ``http_method``.
    """
    normalize = HttpMethod.coerce
    return normalize(http_method)
|
|
523
|
+
|
|
524
|
+
|
|
525
|
+
def infer_file_format_and_compression(
    value: object,
    filename: object | None = None,
) -> tuple[FileFormat | None, CompressionFormat | None]:
    """
    Infer data format and compression from a filename, extension, or MIME type.

    Parameters
    ----------
    value : object
        A filename, extension, MIME type, or existing enum member. Anything
        that is not already an enum member is inspected via ``str(value)``.
    filename : object | None, optional
        A filename to consult for extension-based inference (e.g. when
        ``value`` is ``application/octet-stream``). When given, its suffixes
        are used instead of those of ``value``.

    Returns
    -------
    tuple[FileFormat | None, CompressionFormat | None]
        The inferred data format and compression, if any. A format that is
        itself a compression container (``gz``/``zip``) is reported as
        compression only, never as the data format.
    """
    # Enum members need no parsing; only split container formats from data
    # formats.
    if isinstance(value, FileFormat):
        if value in _COMPRESSION_FILE_FORMATS:
            return None, CompressionFormat.coerce(value.value)
        return value, None
    if isinstance(value, CompressionFormat):
        return None, value

    text = str(value).strip()
    if not text:
        return None, None

    # Drop MIME parameters such as "; charset=utf-8" before lookup.
    normalized = text.casefold()
    mime = normalized.split(';', 1)[0].strip()

    compression = CompressionFormat.try_coerce(mime)
    fmt = FileFormat.try_coerce(mime)

    is_mime = mime.startswith(
        (
            'application/',
            'text/',
            'audio/',
            'image/',
            'video/',
            'multipart/',
        ),
    )
    # A MIME type is not a path, so it only contributes suffixes when an
    # explicit filename accompanies it.
    suffix_source: object | None
    if filename is not None:
        suffix_source = filename
    elif is_mime:
        suffix_source = None
    else:
        suffix_source = text

    suffixes = (
        PurePath(str(suffix_source)).suffixes
        if suffix_source is not None
        else []
    )
    if suffixes:
        normalized_suffixes = [suffix.casefold() for suffix in suffixes]
        suffix_compression = CompressionFormat.try_coerce(
            normalized_suffixes[-1],
        )
        # Peel off the last suffix only when that suffix itself names a
        # compression format. Compression learned from the MIME type must
        # not consume an unrelated data suffix such as ".csv".
        if suffix_compression is not None:
            compression = suffix_compression
            normalized_suffixes = normalized_suffixes[:-1]
        if normalized_suffixes:
            fmt = FileFormat.try_coerce(normalized_suffixes[-1]) or fmt

    # A container "format" is really compression; report it as such.
    if fmt in _COMPRESSION_FILE_FORMATS:
        compression = compression or CompressionFormat.coerce(fmt.value)
        fmt = None

    return fmt, compression
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
:mod:`etlplus.
|
|
2
|
+
:mod:`etlplus.extract` module.
|
|
3
3
|
|
|
4
4
|
Helpers to extract data from files, databases, and REST APIs.
|
|
5
5
|
"""
|
|
@@ -10,81 +10,58 @@ from pathlib import Path
|
|
|
10
10
|
from typing import Any
|
|
11
11
|
from typing import cast
|
|
12
12
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
from
|
|
16
|
-
from
|
|
17
|
-
from
|
|
18
|
-
from
|
|
19
|
-
from
|
|
20
|
-
from
|
|
21
|
-
from
|
|
13
|
+
import requests # type: ignore[import]
|
|
14
|
+
|
|
15
|
+
from .enums import DataConnectorType
|
|
16
|
+
from .enums import FileFormat
|
|
17
|
+
from .enums import HttpMethod
|
|
18
|
+
from .enums import coerce_data_connector_type
|
|
19
|
+
from .enums import coerce_file_format
|
|
20
|
+
from .file import File
|
|
21
|
+
from .types import JSONData
|
|
22
|
+
from .types import JSONDict
|
|
23
|
+
from .types import JSONList
|
|
24
|
+
from .types import StrPath
|
|
22
25
|
|
|
23
26
|
# SECTION: FUNCTIONS ======================================================== #
|
|
24
27
|
|
|
25
28
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
29
|
+
# -- File Extraction -- #
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def extract_from_file(
    file_path: StrPath,
    file_format: FileFormat | str | None = FileFormat.JSON,
) -> JSONData:
    """
    Read (semi-)structured data from a local file.

    Parameters
    ----------
    file_path : StrPath
        Location of the file to read.
    file_format : FileFormat | str | None, optional
        Format to parse the file as. The default is ``FileFormat.JSON``.
        Pass ``None`` explicitly to hand format selection over to
        :class:`File`, which receives ``None`` as its format.

    Returns
    -------
    JSONData
        Whatever ``File(...).read()`` yields for the file.
    """
    source = Path(file_path)

    # ``None`` is forwarded untouched; anything else is normalized first.
    resolved = (
        None if file_format is None else coerce_file_format(file_format)
    )

    # Existence and format checks are left to the file module.
    return File(source, resolved).read()
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# -- Database Extraction (Placeholder) -- #
|
|
88
65
|
|
|
89
66
|
|
|
90
67
|
def extract_from_database(
|
|
@@ -119,36 +96,77 @@ def extract_from_database(
|
|
|
119
96
|
]
|
|
120
97
|
|
|
121
98
|
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
99
|
+
# -- REST API Extraction -- #
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def extract_from_api(
    url: str,
    method: HttpMethod | str = HttpMethod.GET,
    **kwargs: Any,
) -> JSONData:
    """
    Extract data from a REST API.

    Parameters
    ----------
    url : str
        API endpoint URL.
    method : HttpMethod | str, optional
        HTTP method to use. Defaults to ``GET``.
    **kwargs : Any
        Extra arguments forwarded to the underlying request call. Two keys
        are consumed here rather than forwarded as-is: ``timeout``
        (defaults to 10 seconds when omitted) and ``session`` (an object
        used in place of the ``requests`` module to issue the call, e.g. a
        pre-configured :class:`requests.Session`).

    Returns
    -------
    JSONData
        For a JSON content type: a dict payload or a list of dicts is
        returned unchanged, other list items are wrapped as
        ``{'value': item}``, and any other payload is wrapped as
        ``{'value': payload}``. For non-JSON content types or an
        unparseable JSON body: ``{'content': <text>, 'content_type': ...}``.

    Raises
    ------
    TypeError
        If the requester (the provided ``session``, or ``requests`` when
        none is given) does not expose a callable named after the HTTP
        method (for example, ``get``).
    """
    http_method = HttpMethod.coerce(method)

    # Apply a conservative timeout to guard against hanging requests.
    timeout = kwargs.pop('timeout', 10.0)
    session = kwargs.pop('session', None)
    requester = session or requests

    # NOTE(review): the ``requests`` module has no top-level ``connect`` or
    # ``trace`` helpers, so those methods appear to land in this error
    # branch even without a custom session -- confirm intended behavior.
    request_callable = getattr(requester, http_method.value, None)
    if not callable(request_callable):
        raise TypeError(
            'Session object must supply a callable '
            f'"{http_method.value}" method',
        )

    response = request_callable(url, timeout=timeout, **kwargs)
    # Error statuses are surfaced by the response object itself.
    response.raise_for_status()

    content_type = response.headers.get('content-type', '').lower()
    if 'application/json' not in content_type:
        return {'content': response.text, 'content_type': content_type}

    try:
        payload: Any = response.json()
    except ValueError:
        # Malformed JSON despite content-type; fall back to text
        return {
            'content': response.text,
            'content_type': content_type,
        }
    if isinstance(payload, dict):
        return cast(JSONDict, payload)
    if isinstance(payload, list):
        if all(isinstance(x, dict) for x in payload):
            return cast(JSONList, payload)
        # Coerce non-dict array items into objects for consistency
        return [{'value': x} for x in payload]
    # Fallback: wrap scalar JSON
    return {'value': payload}
|
|
152
170
|
|
|
153
171
|
|
|
154
172
|
# -- Orchestration -- #
|
|
@@ -184,7 +202,7 @@ def extract(
|
|
|
184
202
|
ValueError
|
|
185
203
|
If `source_type` is not one of the supported values.
|
|
186
204
|
"""
|
|
187
|
-
match
|
|
205
|
+
match coerce_data_connector_type(source_type):
|
|
188
206
|
case DataConnectorType.FILE:
|
|
189
207
|
# Prefer explicit format if provided, else infer from filename.
|
|
190
208
|
return extract_from_file(source, file_format)
|
|
@@ -195,6 +213,6 @@ def extract(
|
|
|
195
213
|
# ``file_format`` is ignored for APIs.
|
|
196
214
|
return extract_from_api(str(source), **kwargs)
|
|
197
215
|
case _:
|
|
198
|
-
#
|
|
199
|
-
#
|
|
216
|
+
# ``coerce_data_connector_type`` covers invalid entries, but keep
|
|
217
|
+
# explicit guard for defensive programming.
|
|
200
218
|
raise ValueError(f'Invalid source type: {source_type}')
|