etlplus 0.16.10__py3-none-any.whl → 0.17.3__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in one of the supported public registries. It is provided for informational purposes only.
- etlplus/file/README.md +33 -0
- etlplus/file/_imports.py +35 -20
- etlplus/file/_io.py +138 -15
- etlplus/file/_r.py +48 -0
- etlplus/file/_sql.py +224 -0
- etlplus/file/accdb.py +7 -6
- etlplus/file/arrow.py +29 -10
- etlplus/file/avro.py +13 -10
- etlplus/file/bson.py +94 -10
- etlplus/file/cbor.py +29 -17
- etlplus/file/cfg.py +7 -6
- etlplus/file/conf.py +7 -6
- etlplus/file/core.py +1 -1
- etlplus/file/csv.py +8 -7
- etlplus/file/dat.py +52 -11
- etlplus/file/dta.py +36 -16
- etlplus/file/duckdb.py +72 -11
- etlplus/file/enums.py +29 -0
- etlplus/file/feather.py +15 -30
- etlplus/file/fwf.py +44 -10
- etlplus/file/gz.py +12 -7
- etlplus/file/hbs.py +7 -6
- etlplus/file/hdf5.py +71 -8
- etlplus/file/ini.py +60 -17
- etlplus/file/ion.py +7 -6
- etlplus/file/jinja2.py +7 -6
- etlplus/file/json.py +10 -11
- etlplus/file/log.py +7 -6
- etlplus/file/mat.py +7 -6
- etlplus/file/mdb.py +7 -6
- etlplus/file/msgpack.py +27 -15
- etlplus/file/mustache.py +7 -6
- etlplus/file/nc.py +69 -11
- etlplus/file/ndjson.py +10 -6
- etlplus/file/numbers.py +7 -6
- etlplus/file/ods.py +48 -11
- etlplus/file/orc.py +15 -30
- etlplus/file/parquet.py +10 -6
- etlplus/file/pb.py +36 -24
- etlplus/file/pbf.py +7 -6
- etlplus/file/properties.py +44 -18
- etlplus/file/proto.py +24 -18
- etlplus/file/psv.py +12 -11
- etlplus/file/rda.py +57 -15
- etlplus/file/rds.py +50 -14
- etlplus/file/sas7bdat.py +26 -16
- etlplus/file/sav.py +34 -16
- etlplus/file/sqlite.py +70 -10
- etlplus/file/stub.py +8 -6
- etlplus/file/sylk.py +7 -6
- etlplus/file/tab.py +13 -13
- etlplus/file/toml.py +56 -17
- etlplus/file/tsv.py +8 -7
- etlplus/file/txt.py +10 -7
- etlplus/file/vm.py +7 -6
- etlplus/file/wks.py +7 -6
- etlplus/file/xls.py +8 -5
- etlplus/file/xlsm.py +48 -10
- etlplus/file/xlsx.py +10 -6
- etlplus/file/xml.py +11 -9
- etlplus/file/xpt.py +46 -10
- etlplus/file/yaml.py +10 -11
- etlplus/file/zip.py +10 -5
- etlplus/file/zsav.py +7 -6
- {etlplus-0.16.10.dist-info → etlplus-0.17.3.dist-info}/METADATA +44 -26
- {etlplus-0.16.10.dist-info → etlplus-0.17.3.dist-info}/RECORD +70 -68
- {etlplus-0.16.10.dist-info → etlplus-0.17.3.dist-info}/WHEEL +0 -0
- {etlplus-0.16.10.dist-info → etlplus-0.17.3.dist-info}/entry_points.txt +0 -0
- {etlplus-0.16.10.dist-info → etlplus-0.17.3.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.16.10.dist-info → etlplus-0.17.3.dist-info}/top_level.txt +0 -0
etlplus/file/dat.py
CHANGED

@@ -1,12 +1,12 @@
 """
 :mod:`etlplus.file.dat` module.
 
-
+Helpers for reading/writing data (DAT) files.
 
 Notes
 -----
-- A
-
+- A DAT file is a generic data file that may use various delimiters or fixed-
+  width formats.
 - Common cases:
     - Delimited text files (e.g., CSV, TSV).
     - Fixed-width formatted files.
@@ -18,11 +18,15 @@ Notes
 
 from __future__ import annotations
 
-
+import csv
+from typing import cast
 
 from ..types import JSONData
+from ..types import JSONDict
 from ..types import JSONList
-from
+from ..types import StrPath
+from ._io import coerce_path
+from ._io import write_delimited
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -38,14 +42,14 @@ __all__ = [
 
 
 def read(
-    path:
+    path: StrPath,
 ) -> JSONList:
     """
     Read DAT content from *path*.
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the DAT file on disk.
 
     Returns
@@ -53,11 +57,47 @@ def read(
     JSONList
         The list of dictionaries read from the DAT file.
     """
-
+    path = coerce_path(path)
+    with path.open('r', encoding='utf-8', newline='') as handle:
+        sample = handle.read(4096)
+        handle.seek(0)
+        sniffer = csv.Sniffer()
+        dialect: csv.Dialect
+        try:
+            dialect = cast(
+                csv.Dialect,
+                sniffer.sniff(sample, delimiters=',\t|;'),
+            )
+        except csv.Error:
+            dialect = cast(csv.Dialect, csv.get_dialect('excel'))
+        try:
+            has_header = sniffer.has_header(sample)
+        except csv.Error:
+            has_header = True
+
+        reader = csv.reader(handle, dialect)
+        rows = [row for row in reader if any(field.strip() for field in row)]
+    if not rows:
+        return []
+
+    if has_header:
+        header = rows[0]
+        data_rows = rows[1:]
+    else:
+        header = [f'col_{i + 1}' for i in range(len(rows[0]))]
+        data_rows = rows
+
+    records: JSONList = []
+    for row in data_rows:
+        record: JSONDict = {}
+        for index, name in enumerate(header):
+            record[name] = row[index] if index < len(row) else None
+        records.append(record)
+    return records
 
 
 def write(
-    path:
+    path: StrPath,
     data: JSONData,
 ) -> int:
     """
@@ -65,7 +105,7 @@ def write(
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the DAT file on disk.
     data : JSONData
         Data to write as DAT file. Should be a list of dictionaries or a
@@ -76,4 +116,5 @@ def write(
     int
         The number of rows written to the DAT file.
     """
-
+    path = coerce_path(path)
+    return write_delimited(path, data, delimiter=',', format_name='DAT')
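The new read() sniffs the delimiter (comma, tab, pipe, or semicolon) and the presence of a header row from the first 4 KiB via csv.Sniffer, falling back to the excel dialect and to an assumed header on sniffing errors, while write() now delegates to the shared write_delimited helper with a comma delimiter. A minimal usage sketch, assuming the module is importable as etlplus.file.dat (the file name below is hypothetical):

    from etlplus.file import dat

    # Writing returns the number of rows written (here: 2).
    dat.write('readings.dat', [
        {'sensor': 'a', 'value': '1.5'},
        {'sensor': 'b', 'value': '2.0'},
    ])

    # Reading sniffs the delimiter and header, returning a list of dicts
    # keyed by the header names; missing trailing fields come back as None.
    records = dat.read('readings.dat')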
etlplus/file/dta.py
CHANGED

@@ -1,27 +1,33 @@
 """
 :mod:`etlplus.file.dta` module.
 
-
+Helpers for reading/writing Stata (DTA) files.
 
 Notes
 -----
--
-
+- A DTA file is a proprietary binary format created by Stata to store datasets
+  with variables, labels, and data types.
 - Common cases:
-    -
-    -
+    - Statistical analysis workflows.
+    - Data sharing in research environments.
+    - Interchange between Stata and other analytics tools.
 - Rule of thumb:
-    - If
+    - If the file follows the DTA specification, use this module for reading
       and writing.
 """
 
 from __future__ import annotations
 
-from
+from typing import cast
 
 from ..types import JSONData
 from ..types import JSONList
-from
+from ..types import StrPath
+from ._imports import get_dependency
+from ._imports import get_pandas
+from ._io import coerce_path
+from ._io import ensure_parent_dir
+from ._io import normalize_records
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -37,14 +43,14 @@ __all__ = [
 
 
 def read(
-    path:
+    path: StrPath,
 ) -> JSONList:
     """
     Read DTA content from *path*.
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the DTA file on disk.
 
     Returns
@@ -52,11 +58,15 @@ def read(
     JSONList
         The list of dictionaries read from the DTA file.
     """
-
+    path = coerce_path(path)
+    get_dependency('pyreadstat', format_name='DTA')
+    pandas = get_pandas('DTA')
+    frame = pandas.read_stata(path)
+    return cast(JSONList, frame.to_dict(orient='records'))
 
 
 def write(
-    path:
+    path: StrPath,
     data: JSONData,
 ) -> int:
     """
@@ -64,15 +74,25 @@ def write(
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the DTA file on disk.
     data : JSONData
-        Data to write as DTA file. Should be a list of dictionaries or a
-
+        Data to write as DTA file. Should be a list of dictionaries or a single
+        dictionary.
 
     Returns
     -------
     int
         The number of rows written to the DTA file.
     """
-
+    path = coerce_path(path)
+    records = normalize_records(data, 'DTA')
+    if not records:
+        return 0
+
+    get_dependency('pyreadstat', format_name='DTA')
+    pandas = get_pandas('DTA')
+    ensure_parent_dir(path)
+    frame = pandas.DataFrame.from_records(records)
+    frame.to_stata(path, write_index=False)
+    return len(records)
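Both functions now route through pandas, with get_dependency('pyreadstat', ...) guarding the optional dependency up front instead of failing mid-call. A minimal sketch, assuming pandas and pyreadstat are installed and the module is importable as etlplus.file.dta (file name hypothetical):

    from etlplus.file import dta

    # write() builds a DataFrame from the records and calls to_stata()
    # without the index column; it returns the record count.
    dta.write('survey.dta', [{'id': 1, 'score': 3.5}, {'id': 2, 'score': 4.0}])

    # read() loads the file with pandas.read_stata() and returns
    # DataFrame.to_dict(orient='records'), i.e. a list of dicts.
    rows = dta.read('survey.dta')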
etlplus/file/duckdb.py
CHANGED

@@ -1,8 +1,7 @@
 """
 :mod:`etlplus.file.duckdb` module.
 
-
-implemented yet).
+Helpers for reading/writing DuckDB database (DUCKDB) files.
 
 Notes
 -----
@@ -19,11 +18,20 @@ Notes
 
 from __future__ import annotations
 
-from pathlib import Path
-
 from ..types import JSONData
 from ..types import JSONList
-from
+from ..types import StrPath
+from ._imports import get_dependency
+from ._io import coerce_path
+from ._io import ensure_parent_dir
+from ._io import normalize_records
+from ._sql import DEFAULT_TABLE
+from ._sql import DUCKDB_DIALECT
+from ._sql import coerce_sql_value
+from ._sql import collect_column_values
+from ._sql import infer_column_type
+from ._sql import quote_identifier
+from ._sql import resolve_table
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -39,14 +47,14 @@ __all__ = [
 
 
 def read(
-    path:
+    path: StrPath,
 ) -> JSONList:
     """
     Read DUCKDB content from *path*.
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the DUCKDB file on disk.
 
     Returns
@@ -54,11 +62,30 @@ def read(
     JSONList
         The list of dictionaries read from the DUCKDB file.
     """
-
+    path = coerce_path(path)
+    duckdb = get_dependency('duckdb', format_name='DUCKDB')
+    conn = duckdb.connect(str(path))
+    try:
+        tables = [row[0] for row in conn.execute('SHOW TABLES').fetchall()]
+        table = resolve_table(tables, engine_name='DuckDB')
+        if table is None:
+            return []
+        query = f'SELECT * FROM {quote_identifier(table)}'
+        cursor = conn.execute(query)
+        rows = cursor.fetchall()
+        columns = [desc[0] for desc in cursor.description or []]
+        if not columns:
+            info = conn.execute(
+                f'PRAGMA table_info({quote_identifier(table)})',
+            ).fetchall()
+            columns = [row[1] for row in info]
+        return [dict(zip(columns, row, strict=True)) for row in rows]
+    finally:
+        conn.close()
 
 
 def write(
-    path:
+    path: StrPath,
     data: JSONData,
 ) -> int:
     """
@@ -66,7 +93,7 @@ def write(
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the DUCKDB file on disk.
     data : JSONData
         Data to write as DUCKDB. Should be a list of dictionaries or a
@@ -77,4 +104,38 @@ def write(
     int
         The number of rows written to the DUCKDB file.
     """
-
+    path = coerce_path(path)
+    records = normalize_records(data, 'DUCKDB')
+    if not records:
+        return 0
+
+    columns, column_values = collect_column_values(records)
+    if not columns:
+        return 0
+
+    column_defs = ', '.join(
+        f'{quote_identifier(column)} '
+        f'{infer_column_type(values, DUCKDB_DIALECT)}'
+        for column, values in column_values.items()
+    )
+    table_ident = quote_identifier(DEFAULT_TABLE)
+    insert_columns = ', '.join(quote_identifier(column) for column in columns)
+    placeholders = ', '.join('?' for _ in columns)
+    insert_sql = (
+        f'INSERT INTO {table_ident} ({insert_columns}) VALUES ({placeholders})'
+    )
+
+    duckdb = get_dependency('duckdb', format_name='DUCKDB')
+    ensure_parent_dir(path)
+    conn = duckdb.connect(str(path))
+    try:
+        conn.execute(f'DROP TABLE IF EXISTS {table_ident}')
+        conn.execute(f'CREATE TABLE {table_ident} ({column_defs})')
+        rows = [
+            tuple(coerce_sql_value(row.get(column)) for column in columns)
+            for row in records
+        ]
+        conn.executemany(insert_sql, rows)
+    finally:
+        conn.close()
+    return len(records)
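write() now materializes a single table (named by DEFAULT_TABLE from _sql), inferring each column's SQL type from its values and inserting rows with a parameterized executemany; read() picks a table via resolve_table and zips column names onto each fetched row. A minimal round-trip sketch, assuming the duckdb package is installed; the alias avoids shadowing the duckdb library name, and the file name is hypothetical:

    from etlplus.file import duckdb as duckdb_file

    # Any existing table with the default name is dropped and recreated with
    # column types inferred from the data, then the rows are bulk-inserted.
    duckdb_file.write('metrics.duckdb', [
        {'name': 'latency_ms', 'value': 12},
        {'name': 'errors', 'value': 0},
    ])

    # read() selects every row from the resolved table and returns dicts,
    # so the result mirrors what was written.
    rows = duckdb_file.read('metrics.duckdb')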
etlplus/file/enums.py
CHANGED

@@ -199,19 +199,48 @@ class FileFormat(CoercibleStrEnum):
         'yml': 'yaml',
         # File extensions
         '.avro': 'avro',
+        '.arrow': 'arrow',
         '.csv': 'csv',
+        '.duckdb': 'duckdb',
+        '.dat': 'dat',
         '.feather': 'feather',
+        '.fwf': 'fwf',
         '.gz': 'gz',
+        '.hdf': 'hdf5',
+        '.hdf5': 'hdf5',
+        '.h5': 'hdf5',
+        '.ini': 'ini',
         '.json': 'json',
         '.jsonl': 'ndjson',
+        '.bson': 'bson',
+        '.cbor': 'cbor',
+        '.msgpack': 'msgpack',
         '.ndjson': 'ndjson',
+        '.ods': 'ods',
         '.orc': 'orc',
         '.parquet': 'parquet',
         '.pq': 'parquet',
+        '.pb': 'pb',
+        '.proto': 'proto',
+        '.psv': 'psv',
+        '.sqlite': 'sqlite',
+        '.sqlite3': 'sqlite',
         '.stub': 'stub',
+        '.tab': 'tab',
+        '.dta': 'dta',
+        '.sas7bdat': 'sas7bdat',
+        '.xpt': 'xpt',
+        '.rds': 'rds',
+        '.rda': 'rda',
+        '.nc': 'nc',
+        '.sav': 'sav',
+        '.properties': 'properties',
+        '.prop': 'properties',
+        '.toml': 'toml',
         '.tsv': 'tsv',
         '.txt': 'txt',
         '.xls': 'xls',
+        '.xlsm': 'xlsm',
         '.xlsx': 'xlsx',
         '.zip': 'zip',
         '.xml': 'xml',
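The FileFormat alias table now covers many more extensions, so several spellings resolve to one canonical format name (for example .hdf, .hdf5, and .h5 all map to hdf5, and .sqlite/.sqlite3 both map to sqlite). A small stand-alone sketch of the same idea; the helper below is hypothetical and only mirrors the mapping added here, it is not the library's API:

    from pathlib import Path

    # Hypothetical mirror of a few of the new extension aliases.
    EXTENSION_ALIASES = {
        '.hdf': 'hdf5', '.hdf5': 'hdf5', '.h5': 'hdf5',
        '.sqlite': 'sqlite', '.sqlite3': 'sqlite',
        '.properties': 'properties', '.prop': 'properties',
    }

    def format_for(filename: str) -> str | None:
        # Look up the canonical format name for a file's suffix.
        return EXTENSION_ALIASES.get(Path(filename).suffix.lower())

    format_for('app.sqlite3')  # -> 'sqlite'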
etlplus/file/feather.py
CHANGED

@@ -18,12 +18,15 @@ Notes
 
 from __future__ import annotations
 
-from pathlib import Path
 from typing import cast
 
 from ..types import JSONData
 from ..types import JSONList
+from ..types import StrPath
+from ._imports import get_dependency
 from ._imports import get_pandas
+from ._io import coerce_path
+from ._io import ensure_parent_dir
 from ._io import normalize_records
 
 # SECTION: EXPORTS ========================================================== #
@@ -40,39 +43,30 @@ __all__ = [
 
 
 def read(
-    path:
+    path: StrPath,
 ) -> JSONList:
     """
     Read Feather content from *path*.
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the Feather file on disk.
 
     Returns
     -------
     JSONList
         The list of dictionaries read from the Feather file.
-
-    Raises
-    ------
-    ImportError
-        When optional dependency "pyarrow" is missing.
     """
+    path = coerce_path(path)
+    get_dependency('pyarrow', format_name='Feather')
     pandas = get_pandas('Feather')
-
-        frame = pandas.read_feather(path)
-    except ImportError as e:  # pragma: no cover
-        raise ImportError(
-            'Feather support requires optional dependency "pyarrow".\n'
-            'Install with: pip install pyarrow',
-        ) from e
+    frame = pandas.read_feather(path)
     return cast(JSONList, frame.to_dict(orient='records'))
 
 
 def write(
-    path:
+    path: StrPath,
     data: JSONData,
 ) -> int:
     """
@@ -80,7 +74,7 @@ def write(
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the Feather file on disk.
     data : JSONData
         Data to write.
@@ -89,24 +83,15 @@ def write(
     -------
     int
         Number of records written.
-
-    Raises
-    ------
-    ImportError
-        When optional dependency "pyarrow" is missing.
     """
+    path = coerce_path(path)
     records = normalize_records(data, 'Feather')
     if not records:
         return 0
 
+    get_dependency('pyarrow', format_name='Feather')
     pandas = get_pandas('Feather')
-    path
+    ensure_parent_dir(path)
     frame = pandas.DataFrame.from_records(records)
-
-        frame.to_feather(path)
-    except ImportError as e:  # pragma: no cover
-        raise ImportError(
-            'Feather support requires optional dependency "pyarrow".\n'
-            'Install with: pip install pyarrow',
-        ) from e
+    frame.to_feather(path)
    return len(records)
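The inline try/except ImportError blocks (and the Raises sections documenting them) are gone; the pyarrow requirement is now checked once via get_dependency before any pandas call, so the missing-dependency message comes from one central place. The calling convention is unchanged; a minimal sketch, assuming pandas and pyarrow are installed and a hypothetical file name:

    from etlplus.file import feather

    # Returns 0 early for empty input, otherwise writes the records and
    # returns how many were written.
    feather.write('cache.feather', [{'k': 'a', 'v': 1}])
    records = feather.read('cache.feather')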
etlplus/file/fwf.py
CHANGED

@@ -1,8 +1,7 @@
 """
 :mod:`etlplus.file.fwf` module.
 
-
-implemented yet).
+Helpers for reading/writing Fixed-Width Fields (FWF) files.
 
 Notes
 -----
@@ -18,11 +17,16 @@ Notes
 
 from __future__ import annotations
 
-from
+from typing import cast
 
 from ..types import JSONData
 from ..types import JSONList
-from
+from ..types import StrPath
+from ._imports import get_pandas
+from ._io import coerce_path
+from ._io import ensure_parent_dir
+from ._io import normalize_records
+from ._io import stringify_value
 
 # SECTION: EXPORTS ========================================================== #
 
@@ -38,14 +42,14 @@ __all__ = [
 
 
 def read(
-    path:
+    path: StrPath,
 ) -> JSONList:
     """
     Read FWF content from *path*.
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the FWF file on disk.
 
     Returns
@@ -53,11 +57,14 @@ def read(
     JSONList
         The list of dictionaries read from the FWF file.
     """
-
+    path = coerce_path(path)
+    pandas = get_pandas('FWF')
+    frame = pandas.read_fwf(path)
+    return cast(JSONList, frame.to_dict(orient='records'))
 
 
 def write(
-    path:
+    path: StrPath,
     data: JSONData,
 ) -> int:
     """
@@ -65,7 +72,7 @@ def write(
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the FWF file on disk.
     data : JSONData
         Data to write as FWF file. Should be a list of dictionaries or a
@@ -76,4 +83,31 @@ def write(
     int
         The number of rows written to the FWF file.
     """
-
+    path = coerce_path(path)
+    records = normalize_records(data, 'FWF')
+    if not records:
+        return 0
+
+    fieldnames = sorted({key for row in records for key in row})
+    if not fieldnames:
+        return 0
+
+    widths: dict[str, int] = {name: len(name) for name in fieldnames}
+    for row in records:
+        for name in fieldnames:
+            widths[name] = max(
+                widths[name],
+                len(stringify_value(row.get(name))),
+            )
+
+    ensure_parent_dir(path)
+    with path.open('w', encoding='utf-8', newline='') as handle:
+        header = ' '.join(name.ljust(widths[name]) for name in fieldnames)
+        handle.write(header + '\n')
+        for row in records:
+            line = ' '.join(
+                stringify_value(row.get(name)).ljust(widths[name])
+                for name in fieldnames
+            )
+            handle.write(line + '\n')
+    return len(records)
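write() sizes each column as the maximum of the header length and the longest stringified value, left-justifies every field to that width, and joins fields with a single space; read() simply defers to pandas.read_fwf. A minimal sketch of the resulting layout (column order is the sorted union of keys; the file name is hypothetical):

    from etlplus.file import fwf

    fwf.write('inventory.fwf', [
        {'item': 'widget', 'qty': '7'},
        {'item': 'bolt', 'qty': '1200'},
    ])
    # The file contains a header row followed by space-separated,
    # left-justified columns, roughly:
    #   item   qty
    #   widget 7
    #   bolt   1200
    rows = fwf.read('inventory.fwf')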
etlplus/file/gz.py
CHANGED

@@ -11,6 +11,9 @@ import tempfile
 from pathlib import Path
 
 from ..types import JSONData
+from ..types import StrPath
+from ._io import coerce_path
+from ._io import ensure_parent_dir
 from .enums import CompressionFormat
 from .enums import FileFormat
 from .enums import infer_file_format_and_compression
@@ -29,14 +32,14 @@ __all__ = [
 
 
 def _resolve_format(
-    path:
+    path: StrPath,
 ) -> FileFormat:
     """
     Resolve the inner file format from a .gz filename.
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the GZ file on disk.
 
     Returns
@@ -63,14 +66,14 @@ def _resolve_format(
 
 
 def read(
-    path:
+    path: StrPath,
 ) -> JSONData:
     """
     Read GZ content from *path* and parse the inner payload.
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the GZ file on disk.
 
     Returns
@@ -78,6 +81,7 @@ def read(
     JSONData
         Parsed payload.
     """
+    path = coerce_path(path)
     fmt = _resolve_format(path)
     with gzip.open(path, 'rb') as handle:
         payload = handle.read()
@@ -91,7 +95,7 @@ def read(
 
 
 def write(
-    path:
+    path: StrPath,
     data: JSONData,
 ) -> int:
     """
@@ -99,7 +103,7 @@ def write(
 
     Parameters
     ----------
-    path :
+    path : StrPath
         Path to the GZ file on disk.
     data : JSONData
         Data to write.
@@ -109,6 +113,7 @@ def write(
     int
         Number of records written.
     """
+    path = coerce_path(path)
     fmt = _resolve_format(path)
     with tempfile.TemporaryDirectory() as tmpdir:
         tmp_path = Path(tmpdir) / f'payload.{fmt.value}'
@@ -117,7 +122,7 @@ def write(
         count = File(tmp_path, fmt).write(data)
         payload = tmp_path.read_bytes()
 
-        path
+        ensure_parent_dir(path)
         with gzip.open(path, 'wb') as handle:
             handle.write(payload)
 
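Both entry points now accept str or Path (via coerce_path), and the parent directory is created with ensure_parent_dir before the compressed payload is written. The inner payload format is still resolved from the filename, so the extension before .gz selects which codec handles the decompressed bytes. A minimal sketch, assuming a JSON inner payload and a hypothetical file name:

    from etlplus.file import gz

    # The ".json" before ".gz" selects the inner format; the payload is
    # serialized to a temporary file, then gzip-compressed into place.
    gz.write('events.json.gz', [{'event': 'start'}, {'event': 'stop'}])

    # read() gunzips the payload and parses it with the matching handler.
    data = gz.read('events.json.gz')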