etlplus 0.9.2__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/__init__.py +26 -1
- etlplus/api/README.md +3 -51
- etlplus/api/__init__.py +0 -10
- etlplus/api/config.py +28 -39
- etlplus/api/endpoint_client.py +3 -3
- etlplus/api/pagination/client.py +1 -1
- etlplus/api/rate_limiting/config.py +1 -13
- etlplus/api/rate_limiting/rate_limiter.py +11 -8
- etlplus/api/request_manager.py +6 -11
- etlplus/api/transport.py +2 -14
- etlplus/api/types.py +6 -96
- etlplus/cli/commands.py +43 -76
- etlplus/cli/constants.py +1 -1
- etlplus/cli/handlers.py +12 -40
- etlplus/cli/io.py +2 -2
- etlplus/cli/main.py +1 -1
- etlplus/cli/state.py +7 -4
- etlplus/{workflow → config}/__init__.py +23 -10
- etlplus/{workflow → config}/connector.py +44 -58
- etlplus/{workflow → config}/jobs.py +32 -105
- etlplus/{workflow → config}/pipeline.py +51 -59
- etlplus/{workflow → config}/profile.py +5 -8
- etlplus/config/types.py +204 -0
- etlplus/config/utils.py +120 -0
- etlplus/database/ddl.py +1 -1
- etlplus/database/engine.py +3 -19
- etlplus/database/orm.py +0 -2
- etlplus/database/schema.py +1 -1
- etlplus/enums.py +266 -0
- etlplus/{ops/extract.py → extract.py} +99 -81
- etlplus/file.py +652 -0
- etlplus/{ops/load.py → load.py} +101 -78
- etlplus/{ops/run.py → run.py} +127 -159
- etlplus/{api/utils.py → run_helpers.py} +153 -209
- etlplus/{ops/transform.py → transform.py} +68 -75
- etlplus/types.py +4 -5
- etlplus/utils.py +2 -136
- etlplus/{ops/validate.py → validate.py} +12 -22
- etlplus/validation/__init__.py +44 -0
- etlplus/{ops → validation}/utils.py +17 -53
- {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/METADATA +17 -210
- etlplus-0.10.1.dist-info/RECORD +65 -0
- {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/WHEEL +1 -1
- etlplus/README.md +0 -37
- etlplus/api/enums.py +0 -51
- etlplus/cli/README.md +0 -40
- etlplus/database/README.md +0 -48
- etlplus/file/README.md +0 -105
- etlplus/file/__init__.py +0 -25
- etlplus/file/_imports.py +0 -141
- etlplus/file/_io.py +0 -160
- etlplus/file/accdb.py +0 -78
- etlplus/file/arrow.py +0 -78
- etlplus/file/avro.py +0 -176
- etlplus/file/bson.py +0 -77
- etlplus/file/cbor.py +0 -78
- etlplus/file/cfg.py +0 -79
- etlplus/file/conf.py +0 -80
- etlplus/file/core.py +0 -322
- etlplus/file/csv.py +0 -79
- etlplus/file/dat.py +0 -78
- etlplus/file/dta.py +0 -77
- etlplus/file/duckdb.py +0 -78
- etlplus/file/enums.py +0 -343
- etlplus/file/feather.py +0 -111
- etlplus/file/fwf.py +0 -77
- etlplus/file/gz.py +0 -123
- etlplus/file/hbs.py +0 -78
- etlplus/file/hdf5.py +0 -78
- etlplus/file/ini.py +0 -79
- etlplus/file/ion.py +0 -78
- etlplus/file/jinja2.py +0 -78
- etlplus/file/json.py +0 -98
- etlplus/file/log.py +0 -78
- etlplus/file/mat.py +0 -78
- etlplus/file/mdb.py +0 -78
- etlplus/file/msgpack.py +0 -78
- etlplus/file/mustache.py +0 -78
- etlplus/file/nc.py +0 -78
- etlplus/file/ndjson.py +0 -108
- etlplus/file/numbers.py +0 -75
- etlplus/file/ods.py +0 -79
- etlplus/file/orc.py +0 -111
- etlplus/file/parquet.py +0 -113
- etlplus/file/pb.py +0 -78
- etlplus/file/pbf.py +0 -77
- etlplus/file/properties.py +0 -78
- etlplus/file/proto.py +0 -77
- etlplus/file/psv.py +0 -79
- etlplus/file/rda.py +0 -78
- etlplus/file/rds.py +0 -78
- etlplus/file/sas7bdat.py +0 -78
- etlplus/file/sav.py +0 -77
- etlplus/file/sqlite.py +0 -78
- etlplus/file/stub.py +0 -84
- etlplus/file/sylk.py +0 -77
- etlplus/file/tab.py +0 -81
- etlplus/file/toml.py +0 -78
- etlplus/file/tsv.py +0 -80
- etlplus/file/txt.py +0 -102
- etlplus/file/vm.py +0 -78
- etlplus/file/wks.py +0 -77
- etlplus/file/xls.py +0 -88
- etlplus/file/xlsm.py +0 -79
- etlplus/file/xlsx.py +0 -99
- etlplus/file/xml.py +0 -185
- etlplus/file/xpt.py +0 -78
- etlplus/file/yaml.py +0 -95
- etlplus/file/zip.py +0 -175
- etlplus/file/zsav.py +0 -77
- etlplus/ops/README.md +0 -50
- etlplus/ops/__init__.py +0 -61
- etlplus/templates/README.md +0 -46
- etlplus/workflow/README.md +0 -52
- etlplus/workflow/dag.py +0 -105
- etlplus/workflow/types.py +0 -115
- etlplus-0.9.2.dist-info/RECORD +0 -134
- {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/entry_points.txt +0 -0
- {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/top_level.txt +0 -0
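The rename entries above trace a package-level restructuring: the `etlplus/workflow` package becomes `etlplus/config`, the `etlplus/ops` modules are flattened to the top level, and the `etlplus/file` package collapses into a single `file.py` module. A minimal migration sketch, inferred from the `{old → new}` rename list alone (the import targets are read off the file paths, not confirmed against the released API):

```python
# Hypothetical import migration inferred from the {old -> new} renames above.

# 0.9.2 layout:
# from etlplus.workflow import pipeline
# from etlplus.ops import extract, load, transform, validate

# 0.10.1 layout (assumed one-to-one mapping of module contents):
from etlplus.config import pipeline                      # etlplus/{workflow -> config}/pipeline.py
from etlplus import extract, load, transform, validate   # etlplus/ops/* flattened to top level
```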
etlplus/file/dat.py
DELETED
@@ -1,78 +0,0 @@
-"""
-:mod:`etlplus.file.dat` module.
-
-Helpers for reading/writing data (DAT) files.
-
-Notes
------
-- A “DAT-formatted” file is a generic data file that may use various
-  delimiters or fixed-width formats.
-- Common cases:
-    - Delimited text files (e.g., CSV, TSV).
-    - Fixed-width formatted files.
-    - Custom formats specific to certain applications.
-- Rule of thumb:
-    - If the file does not follow a specific standard format, use this module
-      for reading and writing.
-"""
-
-from __future__ import annotations
-
-from pathlib import Path
-
-from ..types import JSONData
-from ..types import JSONList
-from . import stub
-
-# SECTION: EXPORTS ========================================================== #
-
-
-__all__ = [
-    'read',
-    'write',
-]
-
-
-# SECTION: FUNCTIONS ======================================================== #
-
-
-def read(
-    path: Path,
-) -> JSONList:
-    """
-    Read DAT content from ``path``.
-
-    Parameters
-    ----------
-    path : Path
-        Path to the DAT file on disk.
-
-    Returns
-    -------
-    JSONList
-        The list of dictionaries read from the DAT file.
-    """
-    return stub.read(path, format_name='DAT')
-
-
-def write(
-    path: Path,
-    data: JSONData,
-) -> int:
-    """
-    Write ``data`` to DAT file at ``path`` and return record count.
-
-    Parameters
-    ----------
-    path : Path
-        Path to the DAT file on disk.
-    data : JSONData
-        Data to write as DAT file. Should be a list of dictionaries or a
-        single dictionary.
-
-    Returns
-    -------
-    int
-        The number of rows written to the DAT file.
-    """
-    return stub.write(path, data, format_name='DAT')
etlplus/file/dta.py
DELETED
@@ -1,77 +0,0 @@
-"""
-:mod:`etlplus.file.dta` module.
-
-Helpers for reading/writing Stata (DTA) data files.
-
-Notes
------
-- Stata DTA files are binary files used by Stata statistical software that
-  store datasets with variables, labels, and data types.
-- Common cases:
-    - Reading data for analysis in Python.
-    - Writing processed data back to Stata format.
-- Rule of thumb:
-    - If you need to work with Stata data files, use this module for reading
-      and writing.
-"""
-
-from __future__ import annotations
-
-from pathlib import Path
-
-from ..types import JSONData
-from ..types import JSONList
-from . import stub
-
-# SECTION: EXPORTS ========================================================== #
-
-
-__all__ = [
-    'read',
-    'write',
-]
-
-
-# SECTION: FUNCTIONS ======================================================== #
-
-
-def read(
-    path: Path,
-) -> JSONList:
-    """
-    Read DTA content from ``path``.
-
-    Parameters
-    ----------
-    path : Path
-        Path to the DTA file on disk.
-
-    Returns
-    -------
-    JSONList
-        The list of dictionaries read from the DTA file.
-    """
-    return stub.read(path, format_name='DTA')
-
-
-def write(
-    path: Path,
-    data: JSONData,
-) -> int:
-    """
-    Write ``data`` to DTA file at ``path`` and return record count.
-
-    Parameters
-    ----------
-    path : Path
-        Path to the DTA file on disk.
-    data : JSONData
-        Data to write as DTA file. Should be a list of dictionaries or a
-        single dictionary.
-
-    Returns
-    -------
-    int
-        The number of rows written to the DTA file.
-    """
-    return stub.write(path, data, format_name='DTA')
etlplus/file/duckdb.py
DELETED
@@ -1,78 +0,0 @@
-"""
-:mod:`etlplus.file.duckdb` module.
-
-Helpers for reading/writing DuckDB database (DUCKDB) files.
-
-Notes
------
-- A DUCKDB file is a self-contained, serverless database file format used by
-  DuckDB.
-- Common cases:
-    - Analytical data storage and processing.
-    - Embedded database applications.
-    - Fast querying of large datasets.
-- Rule of thumb:
-    - If the file follows the DUCKDB specification, use this module for reading
-      and writing.
-"""
-
-from __future__ import annotations
-
-from pathlib import Path
-
-from ..types import JSONData
-from ..types import JSONList
-from . import stub
-
-# SECTION: EXPORTS ========================================================== #
-
-
-__all__ = [
-    'read',
-    'write',
-]
-
-
-# SECTION: FUNCTIONS ======================================================== #
-
-
-def read(
-    path: Path,
-) -> JSONList:
-    """
-    Read DUCKDB content from ``path``.
-
-    Parameters
-    ----------
-    path : Path
-        Path to the DUCKDB file on disk.
-
-    Returns
-    -------
-    JSONList
-        The list of dictionaries read from the DUCKDB file.
-    """
-    return stub.read(path, format_name='DUCKDB')
-
-
-def write(
-    path: Path,
-    data: JSONData,
-) -> int:
-    """
-    Write ``data`` to DUCKDB at ``path`` and return record count.
-
-    Parameters
-    ----------
-    path : Path
-        Path to the DUCKDB file on disk.
-    data : JSONData
-        Data to write as DUCKDB. Should be a list of dictionaries or a
-        single dictionary.
-
-    Returns
-    -------
-    int
-        The number of rows written to the DUCKDB file.
-    """
-    return stub.write(path, data, format_name='DUCKDB')
etlplus/file/enums.py
DELETED
@@ -1,343 +0,0 @@
-"""
-:mod:`etlplus.file.enums` module.
-
-File-specific enums and helpers.
-"""
-
-from __future__ import annotations
-
-from pathlib import PurePath
-
-from ..enums import CoercibleStrEnum
-from ..types import StrStrMap
-
-# SECTION: EXPORTS ========================================================= #
-
-__all__ = [
-    'CompressionFormat',
-    'FileFormat',
-    'infer_file_format_and_compression',
-]
-
-
-# SECTION: ENUMS ============================================================ #
-
-
-class CompressionFormat(CoercibleStrEnum):
-    """Supported compression formats."""
-
-    # -- Constants -- #
-
-    GZ = 'gz'
-    ZIP = 'zip'
-
-    # -- Class Methods -- #
-
-    @classmethod
-    def aliases(cls) -> StrStrMap:
-        """
-        Return a mapping of common aliases for each enum member.
-
-        Returns
-        -------
-        StrStrMap
-            A mapping of alias names to their corresponding enum member names.
-        """
-        return {
-            # File extensions
-            '.gz': 'gz',
-            '.gzip': 'gz',
-            '.zip': 'zip',
-            # MIME types
-            'application/gzip': 'gz',
-            'application/x-gzip': 'gz',
-            'application/zip': 'zip',
-            'application/x-zip-compressed': 'zip',
-        }
-
-
-class FileFormat(CoercibleStrEnum):
-    """Supported file formats for extraction."""
-
-    # -- Constants -- #
-
-    # Stubbed / placeholder
-    STUB = 'stub'  # Placeholder format for tests & future connectors
-
-    # Tabular & delimited text
-    CSV = 'csv'  # Comma-Separated Values
-    DAT = 'dat'  # Generic data file, often delimited or fixed-width
-    FWF = 'fwf'  # Fixed-Width Formatted
-    PSV = 'psv'  # Pipe-Separated Values
-    TAB = 'tab'  # Often synonymous with TSV
-    TSV = 'tsv'  # Tab-Separated Values
-    TXT = 'txt'  # Plain text, often delimited or fixed-width
-
-    # Semi-structured text
-    CFG = 'cfg'  # Config-style key-value pairs
-    CONF = 'conf'  # Config-style key-value pairs
-    INI = 'ini'  # INI-style key-value pairs
-    JSON = 'json'  # JavaScript Object Notation
-    NDJSON = 'ndjson'  # Newline-Delimited JSON
-    PROPERTIES = 'properties'  # Java-style key-value pairs
-    TOML = 'toml'  # Tom's Obvious Minimal Language
-    XML = 'xml'  # Extensible Markup Language
-    YAML = 'yaml'  # YAML Ain't Markup Language
-
-    # Columnar / analytics-friendly
-    ARROW = 'arrow'  # Apache Arrow IPC
-    FEATHER = 'feather'  # Apache Arrow Feather
-    ORC = 'orc'  # Optimized Row Columnar; common in Hadoop
-    PARQUET = 'parquet'  # Apache Parquet; common in Big Data
-
-    # Binary serialization & interchange
-    AVRO = 'avro'  # Apache Avro
-    BSON = 'bson'  # Binary JSON; common with MongoDB exports/dumps
-    CBOR = 'cbor'  # Concise Binary Object Representation
-    ION = 'ion'  # Amazon Ion
-    MSGPACK = 'msgpack'  # MessagePack
-    PB = 'pb'  # Protocol Buffers (Google Protobuf)
-    PBF = 'pbf'  # Protocolbuffer Binary Format; often for GIS data
-    PROTO = 'proto'  # Protocol Buffers schema; often in .pb / .bin
-
-    # Databases & embedded storage
-    ACCDB = 'accdb'  # Microsoft Access database file (newer format)
-    DUCKDB = 'duckdb'  # DuckDB database file
-    MDB = 'mdb'  # Microsoft Access database file (older format)
-    SQLITE = 'sqlite'  # SQLite database file
-
-    # Spreadsheets
-    NUMBERS = 'numbers'  # Apple Numbers spreadsheet
-    ODS = 'ods'  # OpenDocument spreadsheet
-    WKS = 'wks'  # Lotus 1-2-3 spreadsheet
-    XLS = 'xls'  # Microsoft Excel (BIFF); read-only
-    XLSM = 'xlsm'  # Microsoft Excel Macro-Enabled (Open XML)
-    XLSX = 'xlsx'  # Microsoft Excel (Open XML)
-
-    # Statistical / scientific / numeric computing
-    DTA = 'dta'  # Stata data file
-    HDF5 = 'hdf5'  # Hierarchical Data Format
-    MAT = 'mat'  # MATLAB data file
-    NC = 'nc'  # NetCDF data file
-    RDA = 'rda'  # RData workspace/object bundle
-    RDS = 'rds'  # R data file
-    SAS7BDAT = 'sas7bdat'  # SAS data file
-    SAV = 'sav'  # SPSS data file
-    SYLK = 'sylk'  # Symbolic Link
-    XPT = 'xpt'  # SAS Transport file
-    ZSAV = 'zsav'  # Compressed SPSS data file
-
-    # Time series and financial data
-    CAMT = 'camt'  # ISO 20022 Cash Management messages
-    FXT = 'fxt'  # Forex time series data
-    MT940 = 'mt940'  # SWIFT MT940 bank statement format
-    MT942 = 'mt942'  # SWIFT MT942 interim transaction report format
-    OFX = 'ofx'  # Open Financial Exchange
-    QFX = 'qfx'  # Quicken Financial Exchange
-    QIF = 'qif'  # Quicken Interchange Format
-    QQQ = 'qqq'  # QuantQuote historical data
-    TRR = 'trr'  # Trade and transaction reports
-    TSDB = 'tsdb'  # Time series database export
-
-    # Geospatial data
-    GEOJSON = 'geojson'  # GeoJSON
-    GEOTIFF = 'geotiff'  # GeoTIFF
-    GML = 'gml'  # Geography Markup Language
-    GPKG = 'gpkg'  # GeoPackage
-    GPX = 'gpx'  # GPS Exchange Format
-    KML = 'kml'  # Keyhole Markup Language
-    LAS = 'las'  # LiDAR Aerial Survey
-    LAZ = 'laz'  # LASzip (compressed LAS)
-    OSM = 'osm'  # OpenStreetMap XML Data
-    SHP = 'shp'  # ESRI Shapefile
-    WKB = 'wkb'  # Well-Known Binary
-    WKT = 'wkt'  # Well-Known Text
-
-    # Logs & event streams
-    EVT = 'evt'  # Windows Event Trace Log (pre-Vista)
-    EVTX = 'evtx'  # Windows Event Trace Log (Vista and later)
-    LOG = 'log'  # Generic log file
-    PCAP = 'pcap'  # Packet Capture file
-    PCAPPNG = 'pcapng'  # Packet Capture Next Generation file
-    SLOG = 'slog'  # Structured log file
-    W3CLOG = 'w3clog'  # W3C Extended Log File Format
-
-    # “Data archives” & packaging
-    _7Z = '7z'  # 7-Zip archive
-    GZ = 'gz'  # Gzip-compressed file
-    JAR = 'jar'  # Java archive
-    RAR = 'rar'  # RAR archive
-    SIT = 'sit'  # StuffIt archive
-    SITX = 'sitx'  # StuffIt X archive
-    TAR = 'tar'  # TAR archive
-    TGZ = 'tgz'  # Gzip-compressed TAR archive
-    ZIP = 'zip'  # ZIP archive
-
-    # Domain-specific & less common
-
-    # Templates
-    HBS = 'hbs'  # Handlebars
-    JINJA2 = 'jinja2'  # Jinja2
-    MUSTACHE = 'mustache'  # Mustache
-    VM = 'vm'  # Apache Velocity
-
-    # -- Class Methods -- #
-
-    @classmethod
-    def aliases(cls) -> StrStrMap:
-        """
-        Return a mapping of common aliases for each enum member.
-
-        Returns
-        -------
-        StrStrMap
-            A mapping of alias names to their corresponding enum member names.
-        """
-        return {
-            # Common shorthand
-            'parq': 'parquet',
-            'yml': 'yaml',
-            # File extensions
-            '.avro': 'avro',
-            '.csv': 'csv',
-            '.feather': 'feather',
-            '.gz': 'gz',
-            '.json': 'json',
-            '.jsonl': 'ndjson',
-            '.ndjson': 'ndjson',
-            '.orc': 'orc',
-            '.parquet': 'parquet',
-            '.pq': 'parquet',
-            '.stub': 'stub',
-            '.tsv': 'tsv',
-            '.txt': 'txt',
-            '.xls': 'xls',
-            '.xlsx': 'xlsx',
-            '.zip': 'zip',
-            '.xml': 'xml',
-            '.yaml': 'yaml',
-            '.yml': 'yaml',
-            # MIME types
-            'application/avro': 'avro',
-            'application/csv': 'csv',
-            'application/feather': 'feather',
-            'application/gzip': 'gz',
-            'application/json': 'json',
-            'application/jsonlines': 'ndjson',
-            'application/ndjson': 'ndjson',
-            'application/orc': 'orc',
-            'application/parquet': 'parquet',
-            'application/vnd.apache.avro': 'avro',
-            'application/vnd.apache.parquet': 'parquet',
-            'application/vnd.apache.arrow.file': 'feather',
-            'application/vnd.apache.orc': 'orc',
-            'application/vnd.ms-excel': 'xls',
-            (
-                'application/vnd.openxmlformats-'
-                'officedocument.spreadsheetml.sheet'
-            ): 'xlsx',
-            'application/x-avro': 'avro',
-            'application/x-csv': 'csv',
-            'application/x-feather': 'feather',
-            'application/x-orc': 'orc',
-            'application/x-ndjson': 'ndjson',
-            'application/x-parquet': 'parquet',
-            'application/x-yaml': 'yaml',
-            'application/xml': 'xml',
-            'application/zip': 'zip',
-            'text/csv': 'csv',
-            'text/plain': 'txt',
-            'text/tab-separated-values': 'tsv',
-            'text/tsv': 'tsv',
-            'text/xml': 'xml',
-            'text/yaml': 'yaml',
-        }
-
-
-# SECTION: INTERNAL CONSTANTS =============================================== #
-
-
-# Compression formats that are also file formats.
-_COMPRESSION_FILE_FORMATS: set[FileFormat] = {
-    FileFormat.GZ,
-    FileFormat.ZIP,
-}
-
-
-# SECTION: FUNCTIONS ======================================================== #
-
-
-# TODO: Convert to a method on FileFormat or CompressionFormat?
-def infer_file_format_and_compression(
-    value: object,
-    filename: object | None = None,
-) -> tuple[FileFormat | None, CompressionFormat | None]:
-    """
-    Infer data format and compression from a filename, extension, or MIME type.
-
-    Parameters
-    ----------
-    value : object
-        A filename, extension, MIME type, or existing enum member.
-    filename : object | None, optional
-        A filename to consult for extension-based inference (e.g. when
-        ``value`` is ``application/octet-stream``).
-
-    Returns
-    -------
-    tuple[FileFormat | None, CompressionFormat | None]
-        The inferred data format and compression, if any.
-    """
-    if isinstance(value, FileFormat):
-        if value in _COMPRESSION_FILE_FORMATS:
-            return None, CompressionFormat.coerce(value.value)
-        return value, None
-    if isinstance(value, CompressionFormat):
-        return None, value
-
-    text = str(value).strip()
-    if not text:
-        return None, None
-
-    normalized = text.casefold()
-    mime = normalized.split(';', 1)[0].strip()
-
-    is_octet_stream = mime == 'application/octet-stream'
-    compression = CompressionFormat.try_coerce(mime)
-    fmt = None if is_octet_stream else FileFormat.try_coerce(mime)
-
-    is_mime = mime.startswith(
-        (
-            'application/',
-            'text/',
-            'audio/',
-            'image/',
-            'video/',
-            'multipart/',
-        ),
-    )
-    suffix_source: object | None = filename if filename is not None else text
-    if is_mime and filename is None:
-        suffix_source = None
-
-    suffixes = (
-        PurePath(str(suffix_source)).suffixes
-        if suffix_source is not None
-        else []
-    )
-    if suffixes:
-        normalized_suffixes = [suffix.casefold() for suffix in suffixes]
-        compression = (
-            CompressionFormat.try_coerce(normalized_suffixes[-1])
-            or compression
-        )
-        if compression is not None:
-            normalized_suffixes = normalized_suffixes[:-1]
-        if normalized_suffixes:
-            fmt = FileFormat.try_coerce(normalized_suffixes[-1]) or fmt
-
-    if fmt in _COMPRESSION_FILE_FORMATS:
-        compression = compression or CompressionFormat.coerce(fmt.value)
-        fmt = None
-
-    return fmt, compression
etlplus/file/feather.py
DELETED
@@ -1,111 +0,0 @@
-"""
-:mod:`etlplus.file.feather` module.
-
-Helpers for reading/writing Apache Arrow Feather (FEATHER) files.
-
-Notes
------
-- A FEATHER file is a binary file format designed for efficient
-  on-disk storage of data frames, built on top of Apache Arrow.
-- Common cases:
-    - Fast read/write operations for data frames.
-    - Interoperability between different data analysis tools.
-    - Storage of large datasets with efficient compression.
-- Rule of thumb:
-    - If the file follows the Apache Arrow Feather specification, use this
-      module for reading and writing.
-"""
-
-from __future__ import annotations
-
-from pathlib import Path
-from typing import cast
-
-from ..types import JSONData
-from ..types import JSONList
-from ._imports import get_pandas
-from ._io import normalize_records
-
-# SECTION: EXPORTS ========================================================== #
-
-
-__all__ = [
-    'read',
-    'write',
-]
-
-
-# SECTION: FUNCTIONS ======================================================== #
-
-
-def read(
-    path: Path,
-) -> JSONList:
-    """
-    Read Feather content from ``path``.
-
-    Parameters
-    ----------
-    path : Path
-        Path to the Feather file on disk.
-
-    Returns
-    -------
-    JSONList
-        The list of dictionaries read from the Feather file.
-
-    Raises
-    ------
-    ImportError
-        When optional dependency "pyarrow" is missing.
-    """
-    pandas = get_pandas('Feather')
-    try:
-        frame = pandas.read_feather(path)
-    except ImportError as e:  # pragma: no cover
-        raise ImportError(
-            'Feather support requires optional dependency "pyarrow".\n'
-            'Install with: pip install pyarrow',
-        ) from e
-    return cast(JSONList, frame.to_dict(orient='records'))
-
-
-def write(
-    path: Path,
-    data: JSONData,
-) -> int:
-    """
-    Write ``data`` to Feather at ``path`` and return record count.
-
-    Parameters
-    ----------
-    path : Path
-        Path to the Feather file on disk.
-    data : JSONData
-        Data to write.
-
-    Returns
-    -------
-    int
-        Number of records written.
-
-    Raises
-    ------
-    ImportError
-        When optional dependency "pyarrow" is missing.
-    """
-    records = normalize_records(data, 'Feather')
-    if not records:
-        return 0
-
-    pandas = get_pandas('Feather')
-    path.parent.mkdir(parents=True, exist_ok=True)
-    frame = pandas.DataFrame.from_records(records)
-    try:
-        frame.to_feather(path)
-    except ImportError as e:  # pragma: no cover
-        raise ImportError(
-            'Feather support requires optional dependency "pyarrow".\n'
-            'Install with: pip install pyarrow',
-        ) from e
-    return len(records)