etlplus 0.16.10__py3-none-any.whl → 0.17.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/file/README.md +33 -0
- etlplus/file/arrow.py +35 -5
- etlplus/file/bson.py +50 -5
- etlplus/file/cbor.py +35 -11
- etlplus/file/dat.py +44 -6
- etlplus/file/dta.py +46 -11
- etlplus/file/duckdb.py +159 -5
- etlplus/file/enums.py +29 -0
- etlplus/file/fwf.py +37 -5
- etlplus/file/hdf5.py +41 -3
- etlplus/file/ini.py +62 -12
- etlplus/file/msgpack.py +33 -9
- etlplus/file/nc.py +55 -6
- etlplus/file/ods.py +39 -6
- etlplus/file/pb.py +32 -19
- etlplus/file/properties.py +52 -12
- etlplus/file/proto.py +24 -12
- etlplus/file/psv.py +5 -5
- etlplus/file/rda.py +83 -9
- etlplus/file/rds.py +76 -8
- etlplus/file/sas7bdat.py +41 -11
- etlplus/file/sav.py +40 -11
- etlplus/file/sqlite.py +123 -5
- etlplus/file/tab.py +6 -7
- etlplus/file/toml.py +54 -12
- etlplus/file/xlsm.py +39 -5
- etlplus/file/xpt.py +61 -5
- {etlplus-0.16.10.dist-info → etlplus-0.17.2.dist-info}/METADATA +44 -26
- {etlplus-0.16.10.dist-info → etlplus-0.17.2.dist-info}/RECORD +33 -33
- {etlplus-0.16.10.dist-info → etlplus-0.17.2.dist-info}/WHEEL +0 -0
- {etlplus-0.16.10.dist-info → etlplus-0.17.2.dist-info}/entry_points.txt +0 -0
- {etlplus-0.16.10.dist-info → etlplus-0.17.2.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.16.10.dist-info → etlplus-0.17.2.dist-info}/top_level.txt +0 -0
etlplus/file/README.md
CHANGED

@@ -9,6 +9,12 @@ and writing data files.
   types
 - Exposes a `File` class with instance methods for reading and writing data
 
+Some formats require optional dependencies. Install with:
+
+```bash
+pip install -e ".[file]"
+```
+
 Back to project overview: see the top-level [README](../../README.md).
 
 - [`etlplus.file` Subpackage](#etlplusfile-subpackage)
@@ -29,21 +35,48 @@ matrix across all `FileFormat` values, see the top-level [README](../../README.m
 | Format | Description |
 |-----------|---------------------------------------------|
 | avro | Apache Avro binary serialization |
+| arrow | Apache Arrow IPC |
+| bson | Binary JSON (BSON) |
+| cbor | Concise Binary Object Representation |
 | csv | Comma-separated values text files |
+| dat | Generic data files (delimited) |
+| dta | Stata datasets |
+| duckdb | DuckDB database file |
 | feather | Apache Arrow Feather columnar format |
+| fwf | Fixed-width formatted text files |
 | gz | Gzip-compressed files (see Compression) |
+| hdf5 | Hierarchical Data Format |
+| ini | INI config files |
 | json | Standard JSON files |
+| msgpack | MessagePack binary serialization |
+| nc | NetCDF datasets |
 | ndjson | Newline-delimited JSON (JSON Lines) |
+| ods | OpenDocument spreadsheets |
 | orc | Apache ORC columnar format |
 | parquet | Apache Parquet columnar format |
+| pb | Protocol Buffers binary |
+| properties | Java-style properties |
+| proto | Protocol Buffers schema |
+| psv | Pipe-separated values text files |
+| rda | RData workspace bundles |
+| rds | RDS datasets |
+| sas7bdat | SAS datasets |
+| sav | SPSS datasets |
+| sqlite | SQLite database file |
+| tab | Tab-delimited text files |
+| toml | TOML config files |
 | tsv | Tab-separated values text files |
 | txt | Plain text files |
 | xls | Microsoft Excel (legacy .xls; read-only) |
+| xlsm | Microsoft Excel Macro-Enabled (XLSM) |
 | xlsx | Microsoft Excel (modern .xlsx) |
+| xpt | SAS transport files |
 | zip | ZIP-compressed files (see Compression) |
 | xml | XML files |
 | yaml | YAML files |
 
+Note: HDF5 support is read-only; writing is currently disabled.
+
 Compression formats (gz, zip) are also supported as wrappers for other formats. Formats not listed
 here are currently stubbed and will raise `NotImplementedError` on read/write.
 
etlplus/file/arrow.py
CHANGED

@@ -1,8 +1,7 @@
 """
 :mod:`etlplus.file.arrow` module.
 
-
-yet).
+Helpers for reading/writing Apache Arrow (ARROW) files.
 
 Notes
 -----
@@ -20,10 +19,13 @@ Notes
 from __future__ import annotations
 
 from pathlib import Path
+from typing import Any
+from typing import cast
 
 from ..types import JSONData
 from ..types import JSONList
-from . import
+from ._imports import get_optional_module
+from ._io import normalize_records
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -35,6 +37,20 @@ __all__ = [
 ]
 
 
+# SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+def _get_pyarrow() -> Any:
+    """Return the pyarrow module, importing it on first use."""
+    return get_optional_module(
+        'pyarrow',
+        error_message=(
+            'ARROW support requires optional dependency "pyarrow".\n'
+            'Install with: pip install pyarrow'
+        ),
+    )
+
+
 # SECTION: FUNCTIONS ======================================================== #
 
 
@@ -54,7 +70,11 @@ def read(
     JSONList
         The list of dictionaries read from the Apache Arrow file.
     """
-
+    pyarrow = _get_pyarrow()
+    with pyarrow.memory_map(str(path), 'r') as source:
+        reader = pyarrow.ipc.open_file(source)
+        table = reader.read_all()
+    return cast(JSONList, table.to_pylist())
 
 
 def write(
@@ -77,4 +97,14 @@ def write(
     int
         The number of rows written to the ARROW file.
     """
-
+    records = normalize_records(data, 'ARROW')
+    if not records:
+        return 0
+
+    pyarrow = _get_pyarrow()
+    table = pyarrow.Table.from_pylist(records)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with pyarrow.OSFile(str(path), 'wb') as sink:
+        with pyarrow.ipc.new_file(sink, table.schema) as writer:
+            writer.write_table(table)
+    return len(records)
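For orientation, the new module-level helpers above can be exercised roughly as follows; this is a minimal sketch (not part of the diff), assuming `pyarrow` is installed and that `etlplus.file.arrow` exposes exactly the `read`/`write` signatures shown:

```python
# Hypothetical round-trip with the new arrow helpers.
from pathlib import Path

from etlplus.file import arrow

rows = [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]
written = arrow.write(Path("out.arrow"), rows)  # returns the number of rows written
assert written == 2
print(arrow.read(Path("out.arrow")))            # -> list of dicts read back from the IPC file
```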
etlplus/file/bson.py
CHANGED

@@ -1,8 +1,7 @@
 """
 :mod:`etlplus.file.bson` module.
 
-
-yet).
+Helpers for reading/writing Binary JSON (BSON) files.
 
 Notes
 -----
@@ -19,10 +18,13 @@ Notes
 from __future__ import annotations
 
 from pathlib import Path
+from typing import Any
+from typing import cast
 
 from ..types import JSONData
 from ..types import JSONList
-from . import
+from ._imports import get_optional_module
+from ._io import normalize_records
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -34,6 +36,36 @@ __all__ = [
 ]
 
 
+# SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+def _decode_all(bson_module: Any, payload: bytes) -> list[dict[str, Any]]:
+    if hasattr(bson_module, 'decode_all'):
+        return bson_module.decode_all(payload)
+    if hasattr(bson_module, 'BSON'):
+        return bson_module.BSON.decode_all(payload)
+    raise AttributeError('bson module lacks decode_all()')
+
+
+def _encode_doc(bson_module: Any, doc: dict[str, Any]) -> bytes:
+    if hasattr(bson_module, 'encode'):
+        return bson_module.encode(doc)
+    if hasattr(bson_module, 'BSON'):
+        return bson_module.BSON.encode(doc)
+    raise AttributeError('bson module lacks encode()')
+
+
+def _get_bson() -> Any:
+    """Return the bson module, importing it on first use."""
+    return get_optional_module(
+        'bson',
+        error_message=(
+            'BSON support requires optional dependency "pymongo".\n'
+            'Install with: pip install pymongo'
+        ),
+    )
+
+
 # SECTION: FUNCTIONS ======================================================== #
 
 
@@ -53,7 +85,11 @@ def read(
     JSONList
         The list of dictionaries read from the BSON file.
     """
-
+    bson = _get_bson()
+    with path.open('rb') as handle:
+        payload = handle.read()
+    docs = _decode_all(bson, payload)
+    return cast(JSONList, docs)
 
 
 def write(
@@ -76,4 +112,13 @@ def write(
     int
         The number of rows written to the BSON file.
    """
-
+    bson = _get_bson()
+    records = normalize_records(data, 'BSON')
+    if not records:
+        return 0
+
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open('wb') as handle:
+        for record in records:
+            handle.write(_encode_doc(bson, record))
+    return len(records)
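The `_decode_all`/`_encode_doc` shims above accept either the functional API or the `BSON` class, which covers the `bson` module bundled with pymongo as well as layouts that only expose the class. A minimal sketch of the same dispatch, assuming pymongo's `bson` module is installed:

```python
# Sketch of the API dispatch used by the new shims (assumes pymongo's bson module).
import bson

doc = {"id": 1, "name": "a"}

payload = bson.encode(doc) if hasattr(bson, "encode") else bson.BSON.encode(doc)
docs = (
    bson.decode_all(payload)
    if hasattr(bson, "decode_all")
    else bson.BSON.decode_all(payload)
)
assert docs == [doc]
```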
etlplus/file/cbor.py
CHANGED

@@ -1,8 +1,7 @@
 """
 :mod:`etlplus.file.cbor` module.
 
-
-files (not implemented yet).
+Helpers for reading/writing Concise Binary Object Representation (CBOR) files.
 
 Notes
 -----
@@ -20,10 +19,12 @@ Notes
 from __future__ import annotations
 
 from pathlib import Path
+from typing import Any
 
 from ..types import JSONData
-from
-from . import
+from ._imports import get_optional_module
+from ._io import coerce_record_payload
+from ._io import normalize_records
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -35,12 +36,26 @@ __all__ = [
 ]
 
 
+# SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+def _get_cbor() -> Any:
+    """Return the cbor2 module, importing it on first use."""
+    return get_optional_module(
+        'cbor2',
+        error_message=(
+            'CBOR support requires optional dependency "cbor2".\n'
+            'Install with: pip install cbor2'
+        ),
+    )
+
+
 # SECTION: FUNCTIONS ======================================================== #
 
 
 def read(
     path: Path,
-) ->
+) -> JSONData:
     """
     Read CBOR content from *path*.
 
@@ -51,10 +66,13 @@ def read(
 
     Returns
     -------
-
-The
+    JSONData
+        The structured data read from the CBOR file.
     """
-
+    cbor2 = _get_cbor()
+    with path.open('rb') as handle:
+        payload = cbor2.loads(handle.read())
+    return coerce_record_payload(payload, format_name='CBOR')
 
 
 def write(
@@ -62,14 +80,14 @@ def write(
     data: JSONData,
 ) -> int:
     """
-    Write *data* to CBOR at *path* and return record count.
+    Write *data* to CBOR file at *path* and return record count.
 
     Parameters
     ----------
     path : Path
         Path to the CBOR file on disk.
     data : JSONData
-        Data to write as CBOR. Should be a list of dictionaries or a
+        Data to write as CBOR file. Should be a list of dictionaries or a
         single dictionary.
 
     Returns
@@ -77,4 +95,10 @@
     int
         The number of rows written to the CBOR file.
     """
-
+    cbor2 = _get_cbor()
+    records = normalize_records(data, 'CBOR')
+    payload: JSONData = records if isinstance(data, list) else records[0]
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open('wb') as handle:
+        handle.write(cbor2.dumps(payload))
+    return len(records)
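Note that the new `write` preserves the shape of the input: a single dictionary is written as one CBOR map, while a list of dictionaries is written as an array. A hypothetical usage sketch (requires `cbor2`; the call sites below are assumptions based on the diff, not documented API):

```python
# Hypothetical usage of the new cbor helpers.
from pathlib import Path

from etlplus.file import cbor

# A single dict is written as one CBOR map ...
cbor.write(Path("config.cbor"), {"debug": True, "retries": 3})
# ... while a list of dicts is written as a CBOR array of maps.
cbor.write(Path("rows.cbor"), [{"id": 1}, {"id": 2}])

print(cbor.read(Path("config.cbor")))  # coerced back into JSON-style data
```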
etlplus/file/dat.py
CHANGED

@@ -1,12 +1,12 @@
 """
 :mod:`etlplus.file.dat` module.
 
-
+Helpers for reading/writing data (DAT) files.
 
 Notes
 -----
-- A
-
+- A DAT file is a generic data file that may use various delimiters or fixed-
+  width formats.
 - Common cases:
   - Delimited text files (e.g., CSV, TSV).
   - Fixed-width formatted files.
@@ -18,11 +18,14 @@ Notes
 
 from __future__ import annotations
 
+import csv
 from pathlib import Path
+from typing import cast
 
 from ..types import JSONData
+from ..types import JSONDict
 from ..types import JSONList
-from . import
+from ._io import write_delimited
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -53,7 +56,42 @@ def read(
     JSONList
         The list of dictionaries read from the DAT file.
     """
-
+    with path.open('r', encoding='utf-8', newline='') as handle:
+        sample = handle.read(4096)
+        handle.seek(0)
+        sniffer = csv.Sniffer()
+        dialect: csv.Dialect
+        try:
+            dialect = cast(
+                csv.Dialect,
+                sniffer.sniff(sample, delimiters=',\t|;'),
+            )
+        except csv.Error:
+            dialect = cast(csv.Dialect, csv.get_dialect('excel'))
+        try:
+            has_header = sniffer.has_header(sample)
+        except csv.Error:
+            has_header = True
+
+        reader = csv.reader(handle, dialect)
+        rows = [row for row in reader if any(field.strip() for field in row)]
+    if not rows:
+        return []
+
+    if has_header:
+        header = rows[0]
+        data_rows = rows[1:]
+    else:
+        header = [f'col_{i + 1}' for i in range(len(rows[0]))]
+        data_rows = rows
+
+    records: JSONList = []
+    for row in data_rows:
+        record: JSONDict = {}
+        for index, name in enumerate(header):
+            record[name] = row[index] if index < len(row) else None
+        records.append(record)
+    return records
 
 
 def write(
@@ -76,4 +114,4 @@ def write(
     int
         The number of rows written to the DAT file.
     """
-    return
+    return write_delimited(path, data, delimiter=',')
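As the implementation above shows, `read` sniffs the delimiter from among comma, tab, pipe, and semicolon, and falls back to synthetic `col_1`, `col_2`, ... keys when no header row is detected. An illustrative sketch (assumed usage, not part of the diff); field values come back as strings because `csv.reader` does not convert types:

```python
# Illustration of the sniffing behaviour of the new dat.read helper.
from pathlib import Path

from etlplus.file import dat

sample = Path("people.dat")
sample.write_text("id|name\n1|a\n2|b\n", encoding="utf-8")

print(dat.read(sample))
# With the header row detected, roughly:
#   [{'id': '1', 'name': 'a'}, {'id': '2', 'name': 'b'}]
# Without a detectable header, keys fall back to 'col_1', 'col_2', ...
```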
etlplus/file/dta.py
CHANGED

@@ -1,27 +1,30 @@
 """
 :mod:`etlplus.file.dta` module.
 
-
+Helpers for reading/writing Stata (DTA) files.
 
 Notes
 -----
--
-
+- A DTA file is a proprietary binary format created by Stata to store datasets
+  with variables, labels, and data types.
 - Common cases:
--
--
+  - Statistical analysis workflows.
+  - Data sharing in research environments.
+  - Interchange between Stata and other analytics tools.
 - Rule of thumb:
-- If
+  - If the file follows the DTA specification, use this module for reading
     and writing.
 """
 
 from __future__ import annotations
 
 from pathlib import Path
+from typing import cast
 
 from ..types import JSONData
 from ..types import JSONList
-from . import
+from ._imports import get_pandas
+from ._io import normalize_records
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -51,8 +54,21 @@
     -------
     JSONList
         The list of dictionaries read from the DTA file.
+
+    Raises
+    ------
+    ImportError
+        If optional dependencies for DTA support are missing.
     """
-
+    pandas = get_pandas('DTA')
+    try:
+        frame = pandas.read_stata(path)
+    except ImportError as err:  # pragma: no cover
+        raise ImportError(
+            'DTA support may require optional dependency "pyreadstat".\n'
+            'Install with: pip install pyreadstat',
+        ) from err
+    return cast(JSONList, frame.to_dict(orient='records'))
 
 
 def write(
@@ -67,12 +83,31 @@ def write(
     path : Path
         Path to the DTA file on disk.
     data : JSONData
-        Data to write as DTA file. Should be a list of dictionaries or a
-
+        Data to write as DTA file. Should be a list of dictionaries or a single
+        dictionary.
 
     Returns
     -------
     int
         The number of rows written to the DTA file.
+
+    Raises
+    ------
+    ImportError
+        If optional dependencies for DTA support are missing.
     """
-
+    records = normalize_records(data, 'DTA')
+    if not records:
+        return 0
+
+    pandas = get_pandas('DTA')
+    path.parent.mkdir(parents=True, exist_ok=True)
+    frame = pandas.DataFrame.from_records(records)
+    try:
+        frame.to_stata(path, write_index=False)
+    except ImportError as err:  # pragma: no cover
+        raise ImportError(
+            'DTA support may require optional dependency "pyreadstat".\n'
+            'Install with: pip install pyreadstat',
+        ) from err
+    return len(records)
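The DTA helpers delegate to pandas; the rough equivalent of what `write` followed by `read` does for a record set is sketched below (pandas required; `pyreadstat` may additionally be needed depending on the pandas build, as the error hints above suggest):

```python
# Rough pandas equivalent of the new dta.write / dta.read round trip (a sketch).
import pandas as pd

frame = pd.DataFrame.from_records([{"id": 1, "score": 2.5}])
frame.to_stata("scores.dta", write_index=False)               # what write() does with the records
print(pd.read_stata("scores.dta").to_dict(orient="records"))  # what read() returns
```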
etlplus/file/duckdb.py
CHANGED

@@ -1,8 +1,7 @@
 """
 :mod:`etlplus.file.duckdb` module.
 
-
-implemented yet).
+Helpers for reading/writing DuckDB database (DUCKDB) files.
 
 Notes
 -----
@@ -19,11 +18,14 @@ Notes
 
 from __future__ import annotations
 
+import json
 from pathlib import Path
+from typing import Any
 
 from ..types import JSONData
 from ..types import JSONList
-from . import
+from ._imports import get_optional_module
+from ._io import normalize_records
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -35,6 +37,103 @@ __all__ = [
 ]
 
 
+# SECTION: INTERNAL CONSTANTS ============================================== #
+
+
+DEFAULT_TABLE = 'data'
+
+
+# SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+def _coerce_sql_value(
+    value: Any,
+) -> Any:
+    """
+    Normalize values into DuckDB-compatible types.
+
+    Parameters
+    ----------
+    value : Any
+        The value to normalize.
+
+    Returns
+    -------
+    Any
+        The normalized value.
+    """
+    if value is None or isinstance(value, (str, int, float, bool)):
+        return value
+    return json.dumps(value, ensure_ascii=True)
+
+
+def _get_duckdb() -> Any:
+    """
+    Return the duckdb module, importing it on first use.
+
+    Returns
+    -------
+    Any
+        The duckdb module.
+    """
+    return get_optional_module(
+        'duckdb',
+        error_message=(
+            'DUCKDB support requires optional dependency "duckdb".\n'
+            'Install with: pip install duckdb'
+        ),
+    )
+
+
+def _infer_column_type(values: list[Any]) -> str:
+    """Infer a basic DuckDB column type from sample values."""
+    seen_bool = False
+    seen_int = False
+    seen_float = False
+    seen_other = False
+    for value in values:
+        if value is None:
+            continue
+        if isinstance(value, bool):
+            seen_bool = True
+        elif isinstance(value, int):
+            seen_int = True
+        elif isinstance(value, float):
+            seen_float = True
+        else:
+            seen_other = True
+            break
+    if seen_other:
+        return 'VARCHAR'
+    if seen_float:
+        return 'DOUBLE'
+    if seen_int:
+        return 'BIGINT'
+    if seen_bool:
+        return 'BOOLEAN'
+    return 'VARCHAR'
+
+
+def _quote_identifier(value: str) -> str:
+    """Return a safely quoted SQL identifier."""
+    escaped = value.replace('"', '""')
+    return f'"{escaped}"'
+
+
+def _resolve_table(tables: list[str]) -> str | None:
+    """Pick a table name for read operations."""
+    if not tables:
+        return None
+    if DEFAULT_TABLE in tables:
+        return DEFAULT_TABLE
+    if len(tables) == 1:
+        return tables[0]
+    raise ValueError(
+        'Multiple tables found in DuckDB file; expected "data" or a '
+        'single table',
+    )
+
+
 # SECTION: FUNCTIONS ======================================================== #
 
 
@@ -54,7 +153,25 @@ def read(
     JSONList
        The list of dictionaries read from the DUCKDB file.
     """
-
+    duckdb = _get_duckdb()
+    conn = duckdb.connect(str(path))
+    try:
+        tables = [row[0] for row in conn.execute('SHOW TABLES').fetchall()]
+        table = _resolve_table(tables)
+        if table is None:
+            return []
+        query = f'SELECT * FROM {_quote_identifier(table)}'
+        cursor = conn.execute(query)
+        rows = cursor.fetchall()
+        columns = [desc[0] for desc in cursor.description or []]
+        if not columns:
+            info = conn.execute(
+                f'PRAGMA table_info({_quote_identifier(table)})',
+            ).fetchall()
+            columns = [row[1] for row in info]
+        return [dict(zip(columns, row, strict=True)) for row in rows]
+    finally:
+        conn.close()
 
 
 def write(
@@ -77,4 +194,41 @@ def write(
     int
         The number of rows written to the DUCKDB file.
     """
-
+    records = normalize_records(data, 'DUCKDB')
+    if not records:
+        return 0
+
+    columns = sorted({key for row in records for key in row})
+    if not columns:
+        return 0
+
+    column_values: dict[str, list[Any]] = {col: [] for col in columns}
+    for row in records:
+        for column in columns:
+            column_values[column].append(row.get(column))
+
+    column_defs = ', '.join(
+        f'{_quote_identifier(column)} {_infer_column_type(values)}'
+        for column, values in column_values.items()
+    )
+    table_ident = _quote_identifier(DEFAULT_TABLE)
+    insert_columns = ', '.join(_quote_identifier(column) for column in columns)
+    placeholders = ', '.join('?' for _ in columns)
+    insert_sql = (
+        f'INSERT INTO {table_ident} ({insert_columns}) VALUES ({placeholders})'
+    )
+
+    duckdb = _get_duckdb()
+    path.parent.mkdir(parents=True, exist_ok=True)
+    conn = duckdb.connect(str(path))
+    try:
+        conn.execute(f'DROP TABLE IF EXISTS {table_ident}')
+        conn.execute(f'CREATE TABLE {table_ident} ({column_defs})')
+        rows = [
+            tuple(_coerce_sql_value(row.get(column)) for column in columns)
+            for row in records
+        ]
+        conn.executemany(insert_sql, rows)
+    finally:
+        conn.close()
+    return len(records)
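Per the implementation above, `write` replaces a table literally named `data` with inferred column types and JSON-encodes any non-scalar values, while `read` resolves `data` (or the only table present) and raises on ambiguity. A hypothetical round-trip sketch (requires the `duckdb` package; usage assumed from the diff, not documented API):

```python
# Hypothetical round-trip with the new duckdb helpers.
from pathlib import Path

# Import under an alias so the etlplus module does not shadow the duckdb package itself.
from etlplus.file import duckdb as duckdb_file

rows = [
    {"id": 1, "tags": ["a", "b"]},  # non-scalar values are stored as JSON text
    {"id": 2, "tags": []},
]
duckdb_file.write(Path("events.duckdb"), rows)  # creates/overwrites the "data" table
print(duckdb_file.read(Path("events.duckdb")))  # reads "data" (or the only table) back
```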