etlplus 0.10.5__py3-none-any.whl → 0.11.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/cli/commands.py +1 -1
- etlplus/cli/constants.py +1 -1
- etlplus/cli/io.py +2 -2
- etlplus/config/pipeline.py +2 -2
- etlplus/enums.py +2 -270
- etlplus/extract.py +5 -7
- etlplus/file/__init__.py +27 -0
- etlplus/file/core.py +287 -0
- etlplus/file/csv.py +82 -0
- etlplus/file/enums.py +266 -0
- etlplus/file/json.py +87 -0
- etlplus/file/xml.py +165 -0
- etlplus/file/yaml.py +125 -0
- etlplus/load.py +9 -12
- etlplus/run.py +6 -9
- {etlplus-0.10.5.dist-info → etlplus-0.11.2.dist-info}/METADATA +1 -1
- {etlplus-0.10.5.dist-info → etlplus-0.11.2.dist-info}/RECORD +21 -15
- etlplus/file.py +0 -652
- {etlplus-0.10.5.dist-info → etlplus-0.11.2.dist-info}/WHEEL +0 -0
- {etlplus-0.10.5.dist-info → etlplus-0.11.2.dist-info}/entry_points.txt +0 -0
- {etlplus-0.10.5.dist-info → etlplus-0.11.2.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.10.5.dist-info → etlplus-0.11.2.dist-info}/top_level.txt +0 -0
etlplus/cli/commands.py
CHANGED
etlplus/cli/constants.py
CHANGED
etlplus/cli/io.py
CHANGED
|
@@ -15,8 +15,8 @@ from pathlib import Path
|
|
|
15
15
|
from typing import Any
|
|
16
16
|
from typing import cast
|
|
17
17
|
|
|
18
|
-
from ..enums import FileFormat
|
|
19
18
|
from ..file import File
|
|
19
|
+
from ..file import FileFormat
|
|
20
20
|
from ..types import JSONData
|
|
21
21
|
from ..utils import print_json
|
|
22
22
|
|
|
@@ -331,6 +331,6 @@ def write_json_output(
|
|
|
331
331
|
"""
|
|
332
332
|
if not output_path or output_path == '-':
|
|
333
333
|
return False
|
|
334
|
-
File(Path(output_path), FileFormat.JSON).
|
|
334
|
+
File(Path(output_path), FileFormat.JSON).write(data)
|
|
335
335
|
print(f'{success_message} {output_path}')
|
|
336
336
|
return True
|
etlplus/config/pipeline.py
CHANGED
|
@@ -24,8 +24,8 @@ from typing import Any
|
|
|
24
24
|
from typing import Self
|
|
25
25
|
|
|
26
26
|
from ..api import ApiConfig
|
|
27
|
-
from ..enums import FileFormat
|
|
28
27
|
from ..file import File
|
|
28
|
+
from ..file import FileFormat
|
|
29
29
|
from ..types import StrAnyMap
|
|
30
30
|
from ..utils import coerce_dict
|
|
31
31
|
from ..utils import maybe_mapping
|
|
@@ -246,7 +246,7 @@ class PipelineConfig:
|
|
|
246
246
|
TypeError
|
|
247
247
|
If the YAML root is not a mapping/object.
|
|
248
248
|
"""
|
|
249
|
-
raw = File(Path(path), FileFormat.YAML).
|
|
249
|
+
raw = File(Path(path), FileFormat.YAML).read()
|
|
250
250
|
if not isinstance(raw, dict):
|
|
251
251
|
raise TypeError('Pipeline YAML must have a mapping/object root')
|
|
252
252
|
|
etlplus/enums.py
CHANGED
|
@@ -8,7 +8,6 @@ from __future__ import annotations
|
|
|
8
8
|
|
|
9
9
|
import enum
|
|
10
10
|
import operator as _op
|
|
11
|
-
from pathlib import PurePath
|
|
12
11
|
from statistics import fmean
|
|
13
12
|
from typing import Self
|
|
14
13
|
|
|
@@ -23,18 +22,10 @@ __all__ = [
|
|
|
23
22
|
# Enums
|
|
24
23
|
'AggregateName',
|
|
25
24
|
'CoercibleStrEnum',
|
|
26
|
-
'CompressionFormat',
|
|
27
25
|
'DataConnectorType',
|
|
28
|
-
'FileFormat',
|
|
29
26
|
'HttpMethod',
|
|
30
27
|
'OperatorName',
|
|
31
28
|
'PipelineStep',
|
|
32
|
-
# Functions
|
|
33
|
-
'coerce_compression_format',
|
|
34
|
-
'coerce_data_connector_type',
|
|
35
|
-
'coerce_file_format',
|
|
36
|
-
'coerce_http_method',
|
|
37
|
-
'infer_file_format_and_compression',
|
|
38
29
|
]
|
|
39
30
|
|
|
40
31
|
|
|
@@ -178,39 +169,6 @@ class AggregateName(CoercibleStrEnum):
|
|
|
178
169
|
return lambda xs, n: (fmean(xs) if xs else 0.0)
|
|
179
170
|
|
|
180
171
|
|
|
181
|
-
class CompressionFormat(CoercibleStrEnum):
|
|
182
|
-
"""Supported compression formats for data files."""
|
|
183
|
-
|
|
184
|
-
# -- Constants -- #
|
|
185
|
-
|
|
186
|
-
GZ = 'gz'
|
|
187
|
-
ZIP = 'zip'
|
|
188
|
-
|
|
189
|
-
# -- Class Methods -- #
|
|
190
|
-
|
|
191
|
-
@classmethod
|
|
192
|
-
def aliases(cls) -> StrStrMap:
|
|
193
|
-
"""
|
|
194
|
-
Return a mapping of common aliases for each enum member.
|
|
195
|
-
|
|
196
|
-
Returns
|
|
197
|
-
-------
|
|
198
|
-
StrStrMap
|
|
199
|
-
A mapping of alias names to their corresponding enum member names.
|
|
200
|
-
"""
|
|
201
|
-
return {
|
|
202
|
-
# File extensions
|
|
203
|
-
'.gz': 'gz',
|
|
204
|
-
'.gzip': 'gz',
|
|
205
|
-
'.zip': 'zip',
|
|
206
|
-
# MIME types
|
|
207
|
-
'application/gzip': 'gz',
|
|
208
|
-
'application/x-gzip': 'gz',
|
|
209
|
-
'application/zip': 'zip',
|
|
210
|
-
'application/x-zip-compressed': 'zip',
|
|
211
|
-
}
|
|
212
|
-
|
|
213
|
-
|
|
214
172
|
class DataConnectorType(CoercibleStrEnum):
|
|
215
173
|
"""Supported data connector types."""
|
|
216
174
|
|
|
@@ -242,99 +200,6 @@ class DataConnectorType(CoercibleStrEnum):
|
|
|
242
200
|
}
|
|
243
201
|
|
|
244
202
|
|
|
245
|
-
class FileFormat(CoercibleStrEnum):
|
|
246
|
-
"""Supported file formats for extraction."""
|
|
247
|
-
|
|
248
|
-
# -- Constants -- #
|
|
249
|
-
|
|
250
|
-
AVRO = 'avro'
|
|
251
|
-
CSV = 'csv'
|
|
252
|
-
FEATHER = 'feather'
|
|
253
|
-
GZ = 'gz'
|
|
254
|
-
JSON = 'json'
|
|
255
|
-
NDJSON = 'ndjson'
|
|
256
|
-
ORC = 'orc'
|
|
257
|
-
PARQUET = 'parquet'
|
|
258
|
-
TSV = 'tsv'
|
|
259
|
-
TXT = 'txt'
|
|
260
|
-
XLS = 'xls'
|
|
261
|
-
XLSX = 'xlsx'
|
|
262
|
-
ZIP = 'zip'
|
|
263
|
-
XML = 'xml'
|
|
264
|
-
YAML = 'yaml'
|
|
265
|
-
|
|
266
|
-
# -- Class Methods -- #
|
|
267
|
-
|
|
268
|
-
@classmethod
|
|
269
|
-
def aliases(cls) -> StrStrMap:
|
|
270
|
-
"""
|
|
271
|
-
Return a mapping of common aliases for each enum member.
|
|
272
|
-
|
|
273
|
-
Returns
|
|
274
|
-
-------
|
|
275
|
-
StrStrMap
|
|
276
|
-
A mapping of alias names to their corresponding enum member names.
|
|
277
|
-
"""
|
|
278
|
-
return {
|
|
279
|
-
# Common shorthand
|
|
280
|
-
'parq': 'parquet',
|
|
281
|
-
'yml': 'yaml',
|
|
282
|
-
# File extensions
|
|
283
|
-
'.avro': 'avro',
|
|
284
|
-
'.csv': 'csv',
|
|
285
|
-
'.feather': 'feather',
|
|
286
|
-
'.gz': 'gz',
|
|
287
|
-
'.json': 'json',
|
|
288
|
-
'.jsonl': 'ndjson',
|
|
289
|
-
'.ndjson': 'ndjson',
|
|
290
|
-
'.orc': 'orc',
|
|
291
|
-
'.parquet': 'parquet',
|
|
292
|
-
'.pq': 'parquet',
|
|
293
|
-
'.tsv': 'tsv',
|
|
294
|
-
'.txt': 'txt',
|
|
295
|
-
'.xls': 'xls',
|
|
296
|
-
'.xlsx': 'xlsx',
|
|
297
|
-
'.zip': 'zip',
|
|
298
|
-
'.xml': 'xml',
|
|
299
|
-
'.yaml': 'yaml',
|
|
300
|
-
'.yml': 'yaml',
|
|
301
|
-
# MIME types
|
|
302
|
-
'application/avro': 'avro',
|
|
303
|
-
'application/csv': 'csv',
|
|
304
|
-
'application/feather': 'feather',
|
|
305
|
-
'application/gzip': 'gz',
|
|
306
|
-
'application/json': 'json',
|
|
307
|
-
'application/jsonlines': 'ndjson',
|
|
308
|
-
'application/ndjson': 'ndjson',
|
|
309
|
-
'application/orc': 'orc',
|
|
310
|
-
'application/parquet': 'parquet',
|
|
311
|
-
'application/vnd.apache.avro': 'avro',
|
|
312
|
-
'application/vnd.apache.parquet': 'parquet',
|
|
313
|
-
'application/vnd.apache.arrow.file': 'feather',
|
|
314
|
-
'application/vnd.apache.orc': 'orc',
|
|
315
|
-
'application/vnd.ms-excel': 'xls',
|
|
316
|
-
(
|
|
317
|
-
'application/vnd.openxmlformats-'
|
|
318
|
-
'officedocument.spreadsheetml.sheet'
|
|
319
|
-
): 'xlsx',
|
|
320
|
-
'application/x-avro': 'avro',
|
|
321
|
-
'application/x-csv': 'csv',
|
|
322
|
-
'application/x-feather': 'feather',
|
|
323
|
-
'application/x-orc': 'orc',
|
|
324
|
-
'application/x-ndjson': 'ndjson',
|
|
325
|
-
'application/x-parquet': 'parquet',
|
|
326
|
-
'application/x-yaml': 'yaml',
|
|
327
|
-
'application/xml': 'xml',
|
|
328
|
-
'application/zip': 'zip',
|
|
329
|
-
'text/csv': 'csv',
|
|
330
|
-
'text/plain': 'txt',
|
|
331
|
-
'text/tab-separated-values': 'tsv',
|
|
332
|
-
'text/tsv': 'tsv',
|
|
333
|
-
'text/xml': 'xml',
|
|
334
|
-
'text/yaml': 'yaml',
|
|
335
|
-
}
|
|
336
|
-
|
|
337
|
-
|
|
338
203
|
class HttpMethod(CoercibleStrEnum):
|
|
339
204
|
"""Supported HTTP verbs that accept JSON payloads."""
|
|
340
205
|
|
|
@@ -360,8 +225,8 @@ class HttpMethod(CoercibleStrEnum):
|
|
|
360
225
|
Notes
|
|
361
226
|
-----
|
|
362
227
|
- RFCs do not strictly forbid bodies on some other methods (e.g.,
|
|
363
|
-
|
|
364
|
-
|
|
228
|
+
``DELETE``), but many servers/clients do not expect them. We mark
|
|
229
|
+
``POST``, ``PUT``, and ``PATCH`` as True.
|
|
365
230
|
"""
|
|
366
231
|
return self in {HttpMethod.POST, HttpMethod.PUT, HttpMethod.PATCH}
|
|
367
232
|
|
|
@@ -465,13 +330,6 @@ class PipelineStep(CoercibleStrEnum):
|
|
|
465
330
|
# SECTION: INTERNAL CONSTANTS ============================================== #
|
|
466
331
|
|
|
467
332
|
|
|
468
|
-
# Compression formats that are also file formats.
|
|
469
|
-
_COMPRESSION_FILE_FORMATS: set[FileFormat] = {
|
|
470
|
-
FileFormat.GZ,
|
|
471
|
-
FileFormat.ZIP,
|
|
472
|
-
}
|
|
473
|
-
|
|
474
|
-
|
|
475
333
|
# Precomputed order index for PipelineStep; avoids recomputing on each access.
|
|
476
334
|
_PIPELINE_ORDER_INDEX: dict[PipelineStep, int] = {
|
|
477
335
|
PipelineStep.FILTER: 0,
|
|
@@ -480,129 +338,3 @@ _PIPELINE_ORDER_INDEX: dict[PipelineStep, int] = {
|
|
|
480
338
|
PipelineStep.SORT: 3,
|
|
481
339
|
PipelineStep.AGGREGATE: 4,
|
|
482
340
|
}
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
# SECTION: FUNCTIONS ======================================================== #
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
def coerce_data_connector_type(
|
|
489
|
-
connector: DataConnectorType | str,
|
|
490
|
-
) -> DataConnectorType:
|
|
491
|
-
"""
|
|
492
|
-
Normalize textual data connector values to :class:`DataConnectorType`.
|
|
493
|
-
|
|
494
|
-
This thin wrapper is kept for backward compatibility; prefer
|
|
495
|
-
:meth:`DataConnectorType.coerce` going forward.
|
|
496
|
-
"""
|
|
497
|
-
return DataConnectorType.coerce(connector)
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
def coerce_file_format(
|
|
501
|
-
file_format: FileFormat | str,
|
|
502
|
-
) -> FileFormat:
|
|
503
|
-
"""
|
|
504
|
-
Normalize textual file format values to :class:`FileFormat`.
|
|
505
|
-
|
|
506
|
-
This thin wrapper is kept for backward compatibility; prefer
|
|
507
|
-
:meth:`FileFormat.coerce` going forward.
|
|
508
|
-
"""
|
|
509
|
-
return FileFormat.coerce(file_format)
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
def coerce_compression_format(
|
|
513
|
-
compression_format: CompressionFormat | str,
|
|
514
|
-
) -> CompressionFormat:
|
|
515
|
-
"""
|
|
516
|
-
Normalize textual compression format values to :class:`CompressionFormat`.
|
|
517
|
-
|
|
518
|
-
This thin wrapper is kept for backward compatibility; prefer
|
|
519
|
-
:meth:`CompressionFormat.coerce` going forward.
|
|
520
|
-
"""
|
|
521
|
-
return CompressionFormat.coerce(compression_format)
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
def coerce_http_method(
|
|
525
|
-
http_method: HttpMethod | str,
|
|
526
|
-
) -> HttpMethod:
|
|
527
|
-
"""
|
|
528
|
-
Normalize textual HTTP method values to :class:`HttpMethod`.
|
|
529
|
-
|
|
530
|
-
This thin wrapper is kept for backward compatibility; prefer
|
|
531
|
-
:meth:`HttpMethod.coerce` going forward.
|
|
532
|
-
"""
|
|
533
|
-
return HttpMethod.coerce(http_method)
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
def infer_file_format_and_compression(
|
|
537
|
-
value: object,
|
|
538
|
-
filename: object | None = None,
|
|
539
|
-
) -> tuple[FileFormat | None, CompressionFormat | None]:
|
|
540
|
-
"""
|
|
541
|
-
Infer data format and compression from a filename, extension, or MIME type.
|
|
542
|
-
|
|
543
|
-
Parameters
|
|
544
|
-
----------
|
|
545
|
-
value : object
|
|
546
|
-
A filename, extension, MIME type, or existing enum member.
|
|
547
|
-
filename : object | None, optional
|
|
548
|
-
A filename to consult for extension-based inference (e.g. when
|
|
549
|
-
``value`` is ``application/octet-stream``).
|
|
550
|
-
|
|
551
|
-
Returns
|
|
552
|
-
-------
|
|
553
|
-
tuple[FileFormat | None, CompressionFormat | None]
|
|
554
|
-
The inferred data format and compression, if any.
|
|
555
|
-
"""
|
|
556
|
-
if isinstance(value, FileFormat):
|
|
557
|
-
if value in _COMPRESSION_FILE_FORMATS:
|
|
558
|
-
return None, CompressionFormat.coerce(value.value)
|
|
559
|
-
return value, None
|
|
560
|
-
if isinstance(value, CompressionFormat):
|
|
561
|
-
return None, value
|
|
562
|
-
|
|
563
|
-
text = str(value).strip()
|
|
564
|
-
if not text:
|
|
565
|
-
return None, None
|
|
566
|
-
|
|
567
|
-
normalized = text.casefold()
|
|
568
|
-
mime = normalized.split(';', 1)[0].strip()
|
|
569
|
-
|
|
570
|
-
is_octet_stream = mime == 'application/octet-stream'
|
|
571
|
-
compression = CompressionFormat.try_coerce(mime)
|
|
572
|
-
fmt = None if is_octet_stream else FileFormat.try_coerce(mime)
|
|
573
|
-
|
|
574
|
-
is_mime = mime.startswith(
|
|
575
|
-
(
|
|
576
|
-
'application/',
|
|
577
|
-
'text/',
|
|
578
|
-
'audio/',
|
|
579
|
-
'image/',
|
|
580
|
-
'video/',
|
|
581
|
-
'multipart/',
|
|
582
|
-
),
|
|
583
|
-
)
|
|
584
|
-
suffix_source: object | None = filename if filename is not None else text
|
|
585
|
-
if is_mime and filename is None:
|
|
586
|
-
suffix_source = None
|
|
587
|
-
|
|
588
|
-
suffixes = (
|
|
589
|
-
PurePath(str(suffix_source)).suffixes
|
|
590
|
-
if suffix_source is not None
|
|
591
|
-
else []
|
|
592
|
-
)
|
|
593
|
-
if suffixes:
|
|
594
|
-
normalized_suffixes = [suffix.casefold() for suffix in suffixes]
|
|
595
|
-
compression = (
|
|
596
|
-
CompressionFormat.try_coerce(normalized_suffixes[-1])
|
|
597
|
-
or compression
|
|
598
|
-
)
|
|
599
|
-
if compression is not None:
|
|
600
|
-
normalized_suffixes = normalized_suffixes[:-1]
|
|
601
|
-
if normalized_suffixes:
|
|
602
|
-
fmt = FileFormat.try_coerce(normalized_suffixes[-1]) or fmt
|
|
603
|
-
|
|
604
|
-
if fmt in _COMPRESSION_FILE_FORMATS:
|
|
605
|
-
compression = compression or CompressionFormat.coerce(fmt.value)
|
|
606
|
-
fmt = None
|
|
607
|
-
|
|
608
|
-
return fmt, compression
|
etlplus/extract.py
CHANGED
|
@@ -13,11 +13,9 @@ from typing import cast
|
|
|
13
13
|
import requests # type: ignore[import]
|
|
14
14
|
|
|
15
15
|
from .enums import DataConnectorType
|
|
16
|
-
from .enums import FileFormat
|
|
17
16
|
from .enums import HttpMethod
|
|
18
|
-
from .enums import coerce_data_connector_type
|
|
19
|
-
from .enums import coerce_file_format
|
|
20
17
|
from .file import File
|
|
18
|
+
from .file import FileFormat
|
|
21
19
|
from .types import JSONData
|
|
22
20
|
from .types import JSONDict
|
|
23
21
|
from .types import JSONList
|
|
@@ -55,7 +53,7 @@ def extract_from_file(
|
|
|
55
53
|
# If no explicit format is provided, let File infer from extension.
|
|
56
54
|
if file_format is None:
|
|
57
55
|
return File(path, None).read()
|
|
58
|
-
fmt = coerce_file_format(file_format)
|
|
56
|
+
fmt = FileFormat.coerce(file_format)
|
|
59
57
|
|
|
60
58
|
# Let file module perform existence and format validation.
|
|
61
59
|
return File(path, fmt).read()
|
|
@@ -202,7 +200,7 @@ def extract(
|
|
|
202
200
|
ValueError
|
|
203
201
|
If `source_type` is not one of the supported values.
|
|
204
202
|
"""
|
|
205
|
-
match coerce_data_connector_type(source_type):
|
|
203
|
+
match DataConnectorType.coerce(source_type):
|
|
206
204
|
case DataConnectorType.FILE:
|
|
207
205
|
# Prefer explicit format if provided, else infer from filename.
|
|
208
206
|
return extract_from_file(source, file_format)
|
|
@@ -213,6 +211,6 @@ def extract(
|
|
|
213
211
|
# ``file_format`` is ignored for APIs.
|
|
214
212
|
return extract_from_api(str(source), **kwargs)
|
|
215
213
|
case _:
|
|
216
|
-
#
|
|
217
|
-
# explicit guard for defensive programming.
|
|
214
|
+
# :meth:`coerce` already raises for invalid connector types, but
|
|
215
|
+
# keep explicit guard for defensive programming.
|
|
218
216
|
raise ValueError(f'Invalid source type: {source_type}')
|
etlplus/file/__init__.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
:mod:`etlplus.file` package.
|
|
3
|
+
|
|
4
|
+
Public file IO helpers.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from .core import File
|
|
10
|
+
from .enums import CompressionFormat
|
|
11
|
+
from .enums import FileFormat
|
|
12
|
+
from .enums import coerce_file_format
|
|
13
|
+
from .enums import infer_file_format_and_compression
|
|
14
|
+
|
|
15
|
+
# SECTION: EXPORTS ========================================================== #
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
# Class
|
|
20
|
+
'File',
|
|
21
|
+
# Enums
|
|
22
|
+
'CompressionFormat',
|
|
23
|
+
'FileFormat',
|
|
24
|
+
# Functions
|
|
25
|
+
'coerce_file_format',
|
|
26
|
+
'infer_file_format_and_compression',
|
|
27
|
+
]
|