etlplus 0.11.5__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/README.md +37 -0
- etlplus/api/README.md +20 -3
- etlplus/cli/README.md +40 -0
- etlplus/cli/handlers.py +1 -1
- etlplus/config/README.md +52 -0
- etlplus/database/README.md +48 -0
- etlplus/database/ddl.py +1 -1
- etlplus/database/engine.py +1 -1
- etlplus/database/schema.py +1 -1
- etlplus/file/README.md +105 -0
- etlplus/file/avro.py +198 -0
- etlplus/file/core.py +105 -105
- etlplus/file/csv.py +12 -3
- etlplus/file/feather.py +144 -0
- etlplus/file/gz.py +123 -0
- etlplus/file/json.py +13 -2
- etlplus/file/ndjson.py +109 -0
- etlplus/file/orc.py +142 -0
- etlplus/file/parquet.py +146 -0
- etlplus/file/tsv.py +91 -0
- etlplus/file/txt.py +99 -0
- etlplus/file/xls.py +132 -0
- etlplus/file/xlsx.py +142 -0
- etlplus/file/xml.py +12 -3
- etlplus/file/yaml.py +13 -2
- etlplus/file/zip.py +175 -0
- etlplus/templates/README.md +46 -0
- etlplus/validation/README.md +50 -0
- {etlplus-0.11.5.dist-info → etlplus-0.12.1.dist-info}/METADATA +58 -14
- {etlplus-0.11.5.dist-info → etlplus-0.12.1.dist-info}/RECORD +34 -16
- {etlplus-0.11.5.dist-info → etlplus-0.12.1.dist-info}/WHEEL +0 -0
- {etlplus-0.11.5.dist-info → etlplus-0.12.1.dist-info}/entry_points.txt +0 -0
- {etlplus-0.11.5.dist-info → etlplus-0.12.1.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.11.5.dist-info → etlplus-0.12.1.dist-info}/top_level.txt +0 -0
etlplus/file/core.py
CHANGED
|
@@ -11,11 +11,21 @@ from dataclasses import dataclass
|
|
|
11
11
|
from pathlib import Path
|
|
12
12
|
|
|
13
13
|
from ..types import JSONData
|
|
14
|
-
from
|
|
14
|
+
from . import avro
|
|
15
15
|
from . import csv
|
|
16
|
+
from . import feather
|
|
17
|
+
from . import gz
|
|
16
18
|
from . import json
|
|
19
|
+
from . import ndjson
|
|
20
|
+
from . import orc
|
|
21
|
+
from . import parquet
|
|
22
|
+
from . import tsv
|
|
23
|
+
from . import txt
|
|
24
|
+
from . import xls
|
|
25
|
+
from . import xlsx
|
|
17
26
|
from . import xml
|
|
18
27
|
from . import yaml
|
|
28
|
+
from . import zip
|
|
19
29
|
from .enums import FileFormat
|
|
20
30
|
from .enums import infer_file_format_and_compression
|
|
21
31
|
|
|
@@ -43,7 +53,15 @@ class File:
|
|
|
43
53
|
Path to the file on disk.
|
|
44
54
|
file_format : FileFormat | None, optional
|
|
45
55
|
Explicit format. If omitted, the format is inferred from the file
|
|
46
|
-
extension (``.csv``, ``.json``,
|
|
56
|
+
extension (``.csv``, ``.json``, etc.).
|
|
57
|
+
|
|
58
|
+
Parameters
|
|
59
|
+
----------
|
|
60
|
+
path : StrPath
|
|
61
|
+
Path to the file on disk.
|
|
62
|
+
file_format : FileFormat | str | None, optional
|
|
63
|
+
Explicit format. If omitted, the format is inferred from the file
|
|
64
|
+
extension (``.csv``, ``.json``, etc.).
|
|
47
65
|
"""
|
|
48
66
|
|
|
49
67
|
# -- Attributes -- #
|
|
@@ -62,16 +80,10 @@ class File:
|
|
|
62
80
|
extension is unknown, the attribute is left as ``None`` and will be
|
|
63
81
|
validated later by :meth:`_ensure_format`.
|
|
64
82
|
"""
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
self.path = Path(self.path)
|
|
68
|
-
|
|
83
|
+
self.path = Path(self.path)
|
|
84
|
+
self.file_format = self._coerce_format(self.file_format)
|
|
69
85
|
if self.file_format is None:
|
|
70
|
-
|
|
71
|
-
self.file_format = self._guess_format()
|
|
72
|
-
except ValueError:
|
|
73
|
-
# Leave as None; _ensure_format() will raise on use if needed.
|
|
74
|
-
pass
|
|
86
|
+
self.file_format = self._maybe_guess_format()
|
|
75
87
|
|
|
76
88
|
# -- Internal Instance Methods -- #
|
|
77
89
|
|
|
@@ -84,6 +96,28 @@ class File:
|
|
|
84
96
|
if not self.path.exists():
|
|
85
97
|
raise FileNotFoundError(f'File not found: {self.path}')
|
|
86
98
|
|
|
99
|
+
def _coerce_format(
|
|
100
|
+
self,
|
|
101
|
+
file_format: FileFormat | str | None,
|
|
102
|
+
) -> FileFormat | None:
|
|
103
|
+
"""
|
|
104
|
+
Normalize the file format input.
|
|
105
|
+
|
|
106
|
+
Parameters
|
|
107
|
+
----------
|
|
108
|
+
file_format : FileFormat | str | None
|
|
109
|
+
File format specifier. Strings are coerced into
|
|
110
|
+
:class:`FileFormat`.
|
|
111
|
+
|
|
112
|
+
Returns
|
|
113
|
+
-------
|
|
114
|
+
FileFormat | None
|
|
115
|
+
A normalized file format, or ``None`` when unspecified.
|
|
116
|
+
"""
|
|
117
|
+
if file_format is None or isinstance(file_format, FileFormat):
|
|
118
|
+
return file_format
|
|
119
|
+
return FileFormat.coerce(file_format)
|
|
120
|
+
|
|
87
121
|
def _ensure_format(self) -> FileFormat:
|
|
88
122
|
"""
|
|
89
123
|
Resolve the active format, guessing from extension if needed.
|
|
@@ -125,7 +159,22 @@ class File:
|
|
|
125
159
|
f'Cannot infer file format from extension {self.path.suffix!r}',
|
|
126
160
|
)
|
|
127
161
|
|
|
128
|
-
|
|
162
|
+
def _maybe_guess_format(self) -> FileFormat | None:
|
|
163
|
+
"""
|
|
164
|
+
Try to infer the format, returning ``None`` if it cannot be inferred.
|
|
165
|
+
|
|
166
|
+
Returns
|
|
167
|
+
-------
|
|
168
|
+
FileFormat | None
|
|
169
|
+
The inferred format, or ``None`` if inference fails.
|
|
170
|
+
"""
|
|
171
|
+
try:
|
|
172
|
+
return self._guess_format()
|
|
173
|
+
except ValueError:
|
|
174
|
+
# Leave as None; _ensure_format() will raise on use if needed.
|
|
175
|
+
return None
|
|
176
|
+
|
|
177
|
+
# -- Instance Methods -- #
|
|
129
178
|
|
|
130
179
|
def read(self) -> JSONData:
|
|
131
180
|
"""
|
|
@@ -144,14 +193,36 @@ class File:
|
|
|
144
193
|
self._assert_exists()
|
|
145
194
|
fmt = self._ensure_format()
|
|
146
195
|
match fmt:
|
|
196
|
+
case FileFormat.AVRO:
|
|
197
|
+
return avro.read(self.path)
|
|
147
198
|
case FileFormat.CSV:
|
|
148
199
|
return csv.read(self.path)
|
|
200
|
+
case FileFormat.FEATHER:
|
|
201
|
+
return feather.read(self.path)
|
|
202
|
+
case FileFormat.GZ:
|
|
203
|
+
return gz.read(self.path)
|
|
149
204
|
case FileFormat.JSON:
|
|
150
205
|
return json.read(self.path)
|
|
206
|
+
case FileFormat.NDJSON:
|
|
207
|
+
return ndjson.read(self.path)
|
|
208
|
+
case FileFormat.ORC:
|
|
209
|
+
return orc.read(self.path)
|
|
210
|
+
case FileFormat.PARQUET:
|
|
211
|
+
return parquet.read(self.path)
|
|
212
|
+
case FileFormat.TSV:
|
|
213
|
+
return tsv.read(self.path)
|
|
214
|
+
case FileFormat.TXT:
|
|
215
|
+
return txt.read(self.path)
|
|
216
|
+
case FileFormat.XLS:
|
|
217
|
+
return xls.read(self.path)
|
|
218
|
+
case FileFormat.XLSX:
|
|
219
|
+
return xlsx.read(self.path)
|
|
151
220
|
case FileFormat.XML:
|
|
152
221
|
return xml.read(self.path)
|
|
153
222
|
case FileFormat.YAML:
|
|
154
223
|
return yaml.read(self.path)
|
|
224
|
+
case FileFormat.ZIP:
|
|
225
|
+
return zip.read(self.path)
|
|
155
226
|
raise ValueError(f'Unsupported format: {fmt}')
|
|
156
227
|
|
|
157
228
|
def write(
|
|
@@ -183,105 +254,34 @@ class File:
|
|
|
183
254
|
"""
|
|
184
255
|
fmt = self._ensure_format()
|
|
185
256
|
match fmt:
|
|
257
|
+
case FileFormat.AVRO:
|
|
258
|
+
return avro.write(self.path, data)
|
|
186
259
|
case FileFormat.CSV:
|
|
187
260
|
return csv.write(self.path, data)
|
|
261
|
+
case FileFormat.FEATHER:
|
|
262
|
+
return feather.write(self.path, data)
|
|
263
|
+
case FileFormat.GZ:
|
|
264
|
+
return gz.write(self.path, data)
|
|
188
265
|
case FileFormat.JSON:
|
|
189
266
|
return json.write(self.path, data)
|
|
267
|
+
case FileFormat.NDJSON:
|
|
268
|
+
return ndjson.write(self.path, data)
|
|
269
|
+
case FileFormat.ORC:
|
|
270
|
+
return orc.write(self.path, data)
|
|
271
|
+
case FileFormat.PARQUET:
|
|
272
|
+
return parquet.write(self.path, data)
|
|
273
|
+
case FileFormat.TSV:
|
|
274
|
+
return tsv.write(self.path, data)
|
|
275
|
+
case FileFormat.TXT:
|
|
276
|
+
return txt.write(self.path, data)
|
|
277
|
+
case FileFormat.XLS:
|
|
278
|
+
return xls.write(self.path, data)
|
|
279
|
+
case FileFormat.XLSX:
|
|
280
|
+
return xlsx.write(self.path, data)
|
|
190
281
|
case FileFormat.XML:
|
|
191
282
|
return xml.write(self.path, data, root_tag=root_tag)
|
|
192
283
|
case FileFormat.YAML:
|
|
193
284
|
return yaml.write(self.path, data)
|
|
285
|
+
case FileFormat.ZIP:
|
|
286
|
+
return zip.write(self.path, data)
|
|
194
287
|
raise ValueError(f'Unsupported format: {fmt}')
|
|
195
|
-
|
|
196
|
-
# -- Class Methods -- #
|
|
197
|
-
|
|
198
|
-
@classmethod
|
|
199
|
-
def from_path(
|
|
200
|
-
cls,
|
|
201
|
-
path: StrPath,
|
|
202
|
-
*,
|
|
203
|
-
file_format: FileFormat | str | None = None,
|
|
204
|
-
) -> File:
|
|
205
|
-
"""
|
|
206
|
-
Create a :class:`File` from any path-like and optional format.
|
|
207
|
-
|
|
208
|
-
Parameters
|
|
209
|
-
----------
|
|
210
|
-
path : StrPath
|
|
211
|
-
Path to the file on disk.
|
|
212
|
-
file_format : FileFormat | str | None, optional
|
|
213
|
-
Explicit format. If omitted, the format is inferred from the file
|
|
214
|
-
extension (``.csv``, ``.json``, or ``.xml``).
|
|
215
|
-
|
|
216
|
-
Returns
|
|
217
|
-
-------
|
|
218
|
-
File
|
|
219
|
-
The constructed :class:`File` instance.
|
|
220
|
-
"""
|
|
221
|
-
resolved = Path(path)
|
|
222
|
-
ff: FileFormat | None
|
|
223
|
-
if isinstance(file_format, str):
|
|
224
|
-
ff = FileFormat.coerce(file_format)
|
|
225
|
-
else:
|
|
226
|
-
ff = file_format
|
|
227
|
-
|
|
228
|
-
return cls(resolved, ff)
|
|
229
|
-
|
|
230
|
-
@classmethod
|
|
231
|
-
def read_file(
|
|
232
|
-
cls,
|
|
233
|
-
path: StrPath,
|
|
234
|
-
file_format: FileFormat | str | None = None,
|
|
235
|
-
) -> JSONData:
|
|
236
|
-
"""
|
|
237
|
-
Read structured data.
|
|
238
|
-
|
|
239
|
-
Parameters
|
|
240
|
-
----------
|
|
241
|
-
path : StrPath
|
|
242
|
-
Path to the file on disk.
|
|
243
|
-
file_format : FileFormat | str | None, optional
|
|
244
|
-
Explicit format. If omitted, the format is inferred from the file
|
|
245
|
-
extension (``.csv``, ``.json``, or ``.xml``).
|
|
246
|
-
|
|
247
|
-
Returns
|
|
248
|
-
-------
|
|
249
|
-
JSONData
|
|
250
|
-
The structured data read from the file.
|
|
251
|
-
"""
|
|
252
|
-
return cls.from_path(path, file_format=file_format).read()
|
|
253
|
-
|
|
254
|
-
@classmethod
|
|
255
|
-
def write_file(
|
|
256
|
-
cls,
|
|
257
|
-
path: StrPath,
|
|
258
|
-
data: JSONData,
|
|
259
|
-
file_format: FileFormat | str | None = None,
|
|
260
|
-
*,
|
|
261
|
-
root_tag: str = xml.DEFAULT_XML_ROOT,
|
|
262
|
-
) -> int:
|
|
263
|
-
"""
|
|
264
|
-
Write structured data and count written records.
|
|
265
|
-
|
|
266
|
-
Parameters
|
|
267
|
-
----------
|
|
268
|
-
path : StrPath
|
|
269
|
-
Path to the file on disk.
|
|
270
|
-
data : JSONData
|
|
271
|
-
Data to write to the file.
|
|
272
|
-
file_format : FileFormat | str | None, optional
|
|
273
|
-
Explicit format. If omitted, the format is inferred from the file
|
|
274
|
-
extension (``.csv``, ``.json``, or ``.xml``).
|
|
275
|
-
root_tag : str, optional
|
|
276
|
-
Root tag name to use when writing XML files. Defaults to
|
|
277
|
-
``'root'``.
|
|
278
|
-
|
|
279
|
-
Returns
|
|
280
|
-
-------
|
|
281
|
-
int
|
|
282
|
-
The number of records written to the file.
|
|
283
|
-
"""
|
|
284
|
-
return cls.from_path(path, file_format=file_format).write(
|
|
285
|
-
data,
|
|
286
|
-
root_tag=root_tag,
|
|
287
|
-
)
|
etlplus/file/csv.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""
|
|
2
2
|
:mod:`etlplus.file.csv` module.
|
|
3
3
|
|
|
4
|
-
|
|
4
|
+
Helpers for reading/writing CSV files.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
@@ -14,6 +14,15 @@ from ..types import JSONData
|
|
|
14
14
|
from ..types import JSONDict
|
|
15
15
|
from ..types import JSONList
|
|
16
16
|
|
|
17
|
+
# SECTION: EXPORTS ========================================================== #
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
'read',
|
|
22
|
+
'write',
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
17
26
|
# SECTION: FUNCTIONS ======================================================== #
|
|
18
27
|
|
|
19
28
|
|
|
@@ -21,7 +30,7 @@ def read(
|
|
|
21
30
|
path: Path,
|
|
22
31
|
) -> JSONList:
|
|
23
32
|
"""
|
|
24
|
-
|
|
33
|
+
Read CSV content from ``path``.
|
|
25
34
|
|
|
26
35
|
Parameters
|
|
27
36
|
----------
|
|
@@ -48,7 +57,7 @@ def write(
|
|
|
48
57
|
data: JSONData,
|
|
49
58
|
) -> int:
|
|
50
59
|
"""
|
|
51
|
-
Write CSV
|
|
60
|
+
Write ``data`` to CSV at ``path`` and return record count.
|
|
52
61
|
|
|
53
62
|
Parameters
|
|
54
63
|
----------
|
etlplus/file/feather.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""
|
|
2
|
+
:mod:`etlplus.file.feather` module.
|
|
3
|
+
|
|
4
|
+
Helpers for reading/writing Feather files.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
from typing import cast
|
|
12
|
+
|
|
13
|
+
from ..types import JSONData
|
|
14
|
+
from ..types import JSONDict
|
|
15
|
+
from ..types import JSONList
|
|
16
|
+
|
|
17
|
+
# SECTION: EXPORTS ========================================================== #
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
'read',
|
|
22
|
+
'write',
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# SECTION: INTERNAL CONSTANTS =============================================== #
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
_PANDAS_CACHE: dict[str, Any] = {}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# SECTION: INTERNAL FUNCTIONS =============================================== #
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _get_pandas() -> Any:
|
|
36
|
+
"""
|
|
37
|
+
Return the pandas module, importing it on first use.
|
|
38
|
+
|
|
39
|
+
Raises an informative ImportError if the optional dependency is missing.
|
|
40
|
+
"""
|
|
41
|
+
mod = _PANDAS_CACHE.get('mod')
|
|
42
|
+
if mod is not None: # pragma: no cover - tiny branch
|
|
43
|
+
return mod
|
|
44
|
+
try:
|
|
45
|
+
_pd = __import__('pandas') # type: ignore[assignment]
|
|
46
|
+
except ImportError as e: # pragma: no cover
|
|
47
|
+
raise ImportError(
|
|
48
|
+
'Feather support requires optional dependency "pandas".\n'
|
|
49
|
+
'Install with: pip install pandas',
|
|
50
|
+
) from e
|
|
51
|
+
_PANDAS_CACHE['mod'] = _pd
|
|
52
|
+
|
|
53
|
+
return _pd
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _normalize_records(data: JSONData) -> JSONList:
|
|
57
|
+
"""
|
|
58
|
+
Normalize JSON payloads into a list of dictionaries.
|
|
59
|
+
|
|
60
|
+
Raises TypeError when payloads contain non-dict items.
|
|
61
|
+
"""
|
|
62
|
+
if isinstance(data, list):
|
|
63
|
+
if not all(isinstance(item, dict) for item in data):
|
|
64
|
+
raise TypeError(
|
|
65
|
+
'Feather payloads must contain only objects (dicts)',
|
|
66
|
+
)
|
|
67
|
+
return cast(JSONList, data)
|
|
68
|
+
return [cast(JSONDict, data)]
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def read(
|
|
75
|
+
path: Path,
|
|
76
|
+
) -> JSONList:
|
|
77
|
+
"""
|
|
78
|
+
Read Feather content from ``path``.
|
|
79
|
+
|
|
80
|
+
Parameters
|
|
81
|
+
----------
|
|
82
|
+
path : Path
|
|
83
|
+
Path to the Feather file on disk.
|
|
84
|
+
|
|
85
|
+
Returns
|
|
86
|
+
-------
|
|
87
|
+
JSONList
|
|
88
|
+
The list of dictionaries read from the Feather file.
|
|
89
|
+
|
|
90
|
+
Raises
|
|
91
|
+
------
|
|
92
|
+
ImportError
|
|
93
|
+
When optional dependency "pyarrow" is missing.
|
|
94
|
+
"""
|
|
95
|
+
pandas = _get_pandas()
|
|
96
|
+
try:
|
|
97
|
+
frame = pandas.read_feather(path)
|
|
98
|
+
except ImportError as e: # pragma: no cover
|
|
99
|
+
raise ImportError(
|
|
100
|
+
'Feather support requires optional dependency "pyarrow".\n'
|
|
101
|
+
'Install with: pip install pyarrow',
|
|
102
|
+
) from e
|
|
103
|
+
return cast(JSONList, frame.to_dict(orient='records'))
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def write(
|
|
107
|
+
path: Path,
|
|
108
|
+
data: JSONData,
|
|
109
|
+
) -> int:
|
|
110
|
+
"""
|
|
111
|
+
Write ``data`` to Feather at ``path`` and return record count.
|
|
112
|
+
|
|
113
|
+
Parameters
|
|
114
|
+
----------
|
|
115
|
+
path : Path
|
|
116
|
+
Path to the Feather file on disk.
|
|
117
|
+
data : JSONData
|
|
118
|
+
Data to write.
|
|
119
|
+
|
|
120
|
+
Returns
|
|
121
|
+
-------
|
|
122
|
+
int
|
|
123
|
+
Number of records written.
|
|
124
|
+
|
|
125
|
+
Raises
|
|
126
|
+
------
|
|
127
|
+
ImportError
|
|
128
|
+
When optional dependency "pyarrow" is missing.
|
|
129
|
+
"""
|
|
130
|
+
records = _normalize_records(data)
|
|
131
|
+
if not records:
|
|
132
|
+
return 0
|
|
133
|
+
|
|
134
|
+
pandas = _get_pandas()
|
|
135
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
136
|
+
frame = pandas.DataFrame.from_records(records)
|
|
137
|
+
try:
|
|
138
|
+
frame.to_feather(path)
|
|
139
|
+
except ImportError as e: # pragma: no cover
|
|
140
|
+
raise ImportError(
|
|
141
|
+
'Feather support requires optional dependency "pyarrow".\n'
|
|
142
|
+
'Install with: pip install pyarrow',
|
|
143
|
+
) from e
|
|
144
|
+
return len(records)
|
etlplus/file/gz.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""
|
|
2
|
+
:mod:`etlplus.file.gz` module.
|
|
3
|
+
|
|
4
|
+
Helpers for reading/writing GZ files.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import gzip
|
|
10
|
+
import tempfile
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from ..types import JSONData
|
|
14
|
+
from .enums import CompressionFormat
|
|
15
|
+
from .enums import FileFormat
|
|
16
|
+
from .enums import infer_file_format_and_compression
|
|
17
|
+
|
|
18
|
+
# SECTION: EXPORTS ========================================================== #
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
'read',
|
|
23
|
+
'write',
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# SECTION: INTERNAL FUNCTIONS =============================================== #
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _resolve_format(
|
|
31
|
+
path: Path,
|
|
32
|
+
) -> FileFormat:
|
|
33
|
+
"""
|
|
34
|
+
Resolve the inner file format from a .gz filename.
|
|
35
|
+
|
|
36
|
+
Parameters
|
|
37
|
+
----------
|
|
38
|
+
path : Path
|
|
39
|
+
Path to the GZ file on disk.
|
|
40
|
+
|
|
41
|
+
Returns
|
|
42
|
+
-------
|
|
43
|
+
FileFormat
|
|
44
|
+
The inferred inner file format.
|
|
45
|
+
|
|
46
|
+
Raises
|
|
47
|
+
------
|
|
48
|
+
ValueError
|
|
49
|
+
If the file format cannot be inferred from the filename.
|
|
50
|
+
"""
|
|
51
|
+
fmt, compression = infer_file_format_and_compression(path)
|
|
52
|
+
if compression is not CompressionFormat.GZ:
|
|
53
|
+
raise ValueError(f'Not a gzip file: {path}')
|
|
54
|
+
if fmt is None:
|
|
55
|
+
raise ValueError(
|
|
56
|
+
f'Cannot infer file format from compressed file {path!r}',
|
|
57
|
+
)
|
|
58
|
+
return fmt
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def read(
|
|
65
|
+
path: Path,
|
|
66
|
+
) -> JSONData:
|
|
67
|
+
"""
|
|
68
|
+
Read GZ content from ``path`` and parse the inner payload.
|
|
69
|
+
|
|
70
|
+
Parameters
|
|
71
|
+
----------
|
|
72
|
+
path : Path
|
|
73
|
+
Path to the GZ file on disk.
|
|
74
|
+
|
|
75
|
+
Returns
|
|
76
|
+
-------
|
|
77
|
+
JSONData
|
|
78
|
+
Parsed payload.
|
|
79
|
+
"""
|
|
80
|
+
fmt = _resolve_format(path)
|
|
81
|
+
with gzip.open(path, 'rb') as handle:
|
|
82
|
+
payload = handle.read()
|
|
83
|
+
|
|
84
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
85
|
+
tmp_path = Path(tmpdir) / f'payload.{fmt.value}'
|
|
86
|
+
tmp_path.write_bytes(payload)
|
|
87
|
+
from .core import File
|
|
88
|
+
|
|
89
|
+
return File(tmp_path, fmt).read()
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def write(
|
|
93
|
+
path: Path,
|
|
94
|
+
data: JSONData,
|
|
95
|
+
) -> int:
|
|
96
|
+
"""
|
|
97
|
+
Write ``data`` to GZ at ``path`` and return record count.
|
|
98
|
+
|
|
99
|
+
Parameters
|
|
100
|
+
----------
|
|
101
|
+
path : Path
|
|
102
|
+
Path to the GZ file on disk.
|
|
103
|
+
data : JSONData
|
|
104
|
+
Data to write.
|
|
105
|
+
|
|
106
|
+
Returns
|
|
107
|
+
-------
|
|
108
|
+
int
|
|
109
|
+
Number of records written.
|
|
110
|
+
"""
|
|
111
|
+
fmt = _resolve_format(path)
|
|
112
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
113
|
+
tmp_path = Path(tmpdir) / f'payload.{fmt.value}'
|
|
114
|
+
from .core import File
|
|
115
|
+
|
|
116
|
+
count = File(tmp_path, fmt).write(data)
|
|
117
|
+
payload = tmp_path.read_bytes()
|
|
118
|
+
|
|
119
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
120
|
+
with gzip.open(path, 'wb') as handle:
|
|
121
|
+
handle.write(payload)
|
|
122
|
+
|
|
123
|
+
return count
|
etlplus/file/json.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""
|
|
2
2
|
:mod:`etlplus.file.json` module.
|
|
3
3
|
|
|
4
|
-
|
|
4
|
+
Helpers for reading/writing JSON files.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
@@ -15,6 +15,15 @@ from ..types import JSONDict
|
|
|
15
15
|
from ..types import JSONList
|
|
16
16
|
from ..utils import count_records
|
|
17
17
|
|
|
18
|
+
# SECTION: EXPORTS ========================================================== #
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
'read',
|
|
23
|
+
'write',
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
|
|
18
27
|
# SECTION: FUNCTIONS ======================================================== #
|
|
19
28
|
|
|
20
29
|
|
|
@@ -22,7 +31,9 @@ def read(
|
|
|
22
31
|
path: Path,
|
|
23
32
|
) -> JSONData:
|
|
24
33
|
"""
|
|
25
|
-
|
|
34
|
+
Read JSON content from ``path``.
|
|
35
|
+
|
|
36
|
+
Validates that the JSON root is a dict or a list of dicts.
|
|
26
37
|
|
|
27
38
|
Parameters
|
|
28
39
|
----------
|