etlplus 0.10.4__py3-none-any.whl → 0.11.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
etlplus/file/core.py ADDED
@@ -0,0 +1,287 @@
+"""
+:mod:`etlplus.file.core` module.
+
+Shared helpers for reading and writing structured and semi-structured data
+files.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+from ..types import JSONData
+from . import avro
+from . import csv
+from . import feather
+from . import gz
+from . import json
+from . import ndjson
+from . import orc
+from . import parquet
+from . import tsv
+from . import txt
+from . import xls
+from . import xlsx
+from . import xml
+from . import yaml
+from . import zip
+from .enums import FileFormat
+from .enums import infer_file_format_and_compression
+
+# SECTION: EXPORTS ========================================================== #
+
+
+__all__ = ['File']
+
+
+# SECTION: CLASSES ========================================================== #
+
+
+@dataclass(slots=True)
+class File:
+    """
+    Convenience wrapper around structured file IO.
+
+    This class encapsulates the one-off helpers in this module as convenient
+    instance methods while retaining the original function API for
+    backward compatibility (those functions delegate to this class).
+
+    Attributes
+    ----------
+    path : Path
+        Path to the file on disk.
+    file_format : FileFormat | None, optional
+        Explicit format. If omitted, the format is inferred from the file
+        extension (``.csv``, ``.json``, etc.).
+
+    Parameters
+    ----------
+    path : StrPath
+        Path to the file on disk.
+    file_format : FileFormat | str | None, optional
+        Explicit format. If omitted, the format is inferred from the file
+        extension (``.csv``, ``.json``, etc.).
+    """
+
+    # -- Attributes -- #
+
+    path: Path
+    file_format: FileFormat | None = None
+
+    # -- Magic Methods (Object Lifecycle) -- #
+
+    def __post_init__(self) -> None:
+        """
+        Auto-detect and set the file format on initialization.
+
+        If no explicit ``file_format`` is provided, attempt to infer it from
+        the file path's extension and update :attr:`file_format`. If the
+        extension is unknown, the attribute is left as ``None`` and will be
+        validated later by :meth:`_ensure_format`.
+        """
+        self.path = Path(self.path)
+        self.file_format = self._coerce_format(self.file_format)
+        if self.file_format is None:
+            self.file_format = self._maybe_guess_format()
+
+    # -- Internal Instance Methods -- #
+
+    def _assert_exists(self) -> None:
+        """
+        Raise FileNotFoundError if :attr:`path` does not exist.
+
+        This centralizes existence checks across multiple read methods.
+        """
+        if not self.path.exists():
+            raise FileNotFoundError(f'File not found: {self.path}')
+
+    def _coerce_format(
+        self,
+        file_format: FileFormat | str | None,
+    ) -> FileFormat | None:
+        """
+        Normalize the file format input.
+
+        Parameters
+        ----------
+        file_format : FileFormat | str | None
+            File format specifier. Strings are coerced into
+            :class:`FileFormat`.
+
+        Returns
+        -------
+        FileFormat | None
+            A normalized file format, or ``None`` when unspecified.
+        """
+        if file_format is None or isinstance(file_format, FileFormat):
+            return file_format
+        return FileFormat.coerce(file_format)
+
+    def _ensure_format(self) -> FileFormat:
+        """
+        Resolve the active format, guessing from extension if needed.
+
+        Returns
+        -------
+        FileFormat
+            The resolved file format.
+        """
+        return (
+            self.file_format
+            if self.file_format is not None
+            else self._guess_format()
+        )
+
+    def _guess_format(self) -> FileFormat:
+        """
+        Infer the file format from the filename extension.
+
+        Returns
+        -------
+        FileFormat
+            The inferred file format based on the file extension.
+
+        Raises
+        ------
+        ValueError
+            If the extension is unknown or unsupported.
+        """
+        fmt, compression = infer_file_format_and_compression(self.path)
+        if fmt is not None:
+            return fmt
+        if compression is not None:
+            raise ValueError(
+                'Cannot infer file format from compressed file '
+                f'{self.path!r} with compression {compression.value!r}',
+            )
+        raise ValueError(
+            f'Cannot infer file format from extension {self.path.suffix!r}',
+        )
+
+    def _maybe_guess_format(self) -> FileFormat | None:
+        """
+        Try to infer the format, returning ``None`` if it cannot be inferred.
+
+        Returns
+        -------
+        FileFormat | None
+            The inferred format, or ``None`` if inference fails.
+        """
+        try:
+            return self._guess_format()
+        except ValueError:
+            # Leave as None; _ensure_format() will raise on use if needed.
+            return None
+
+    # -- Instance Methods -- #
+
+    def read(self) -> JSONData:
+        """
+        Read structured data from :attr:`path` using :attr:`file_format`.
+
+        Returns
+        -------
+        JSONData
+            The structured data read from the file.
+
+        Raises
+        ------
+        ValueError
+            If the resolved file format is unsupported.
+        """
+        self._assert_exists()
+        fmt = self._ensure_format()
+        match fmt:
+            case FileFormat.AVRO:
+                return avro.read(self.path)
+            case FileFormat.CSV:
+                return csv.read(self.path)
+            case FileFormat.FEATHER:
+                return feather.read(self.path)
+            case FileFormat.GZ:
+                return gz.read(self.path)
+            case FileFormat.JSON:
+                return json.read(self.path)
+            case FileFormat.NDJSON:
+                return ndjson.read(self.path)
+            case FileFormat.ORC:
+                return orc.read(self.path)
+            case FileFormat.PARQUET:
+                return parquet.read(self.path)
+            case FileFormat.TSV:
+                return tsv.read(self.path)
+            case FileFormat.TXT:
+                return txt.read(self.path)
+            case FileFormat.XLS:
+                return xls.read(self.path)
+            case FileFormat.XLSX:
+                return xlsx.read(self.path)
+            case FileFormat.XML:
+                return xml.read(self.path)
+            case FileFormat.YAML:
+                return yaml.read(self.path)
+            case FileFormat.ZIP:
+                return zip.read(self.path)
+        raise ValueError(f'Unsupported format: {fmt}')
+
+    def write(
+        self,
+        data: JSONData,
+        *,
+        root_tag: str = xml.DEFAULT_XML_ROOT,
+    ) -> int:
+        """
+        Write ``data`` to :attr:`path` using :attr:`file_format`.
+
+        Parameters
+        ----------
+        data : JSONData
+            Data to write to the file.
+        root_tag : str, optional
+            Root tag name to use when writing XML files. Defaults to
+            ``'root'``.
+
+        Returns
+        -------
+        int
+            The number of records written.
+
+        Raises
+        ------
+        ValueError
+            If the resolved file format is unsupported.
+        """
+        fmt = self._ensure_format()
+        match fmt:
+            case FileFormat.AVRO:
+                return avro.write(self.path, data)
+            case FileFormat.CSV:
+                return csv.write(self.path, data)
+            case FileFormat.FEATHER:
+                return feather.write(self.path, data)
+            case FileFormat.GZ:
+                return gz.write(self.path, data)
+            case FileFormat.JSON:
+                return json.write(self.path, data)
+            case FileFormat.NDJSON:
+                return ndjson.write(self.path, data)
+            case FileFormat.ORC:
+                return orc.write(self.path, data)
+            case FileFormat.PARQUET:
+                return parquet.write(self.path, data)
+            case FileFormat.TSV:
+                return tsv.write(self.path, data)
+            case FileFormat.TXT:
+                return txt.write(self.path, data)
+            case FileFormat.XLS:
+                return xls.write(self.path, data)
+            case FileFormat.XLSX:
+                return xlsx.write(self.path, data)
+            case FileFormat.XML:
+                return xml.write(self.path, data, root_tag=root_tag)
+            case FileFormat.YAML:
+                return yaml.write(self.path, data)
+            case FileFormat.ZIP:
+                return zip.write(self.path, data)
+        raise ValueError(f'Unsupported format: {fmt}')
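
For orientation, here is a minimal usage sketch of the new File wrapper. The path and records are illustrative assumptions, and the exact return values depend on the sibling json/csv helper modules that read() and write() delegate to:

    from etlplus.file.core import File

    # Format inferred from the '.json' extension in __post_init__.
    f = File('events.json')
    f.write([{'id': 1}, {'id': 2}])   # returns the record count
    data = f.read()                   # parsed back via the json helper

    # An explicit format (str is coerced to FileFormat) overrides inference.
    File('dump.dat', file_format='csv').write(data)
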
etlplus/file/csv.py ADDED
@@ -0,0 +1,82 @@
+"""
+:mod:`etlplus.file.csv` module.
+
+CSV read/write helpers.
+"""
+
+from __future__ import annotations
+
+import csv
+from pathlib import Path
+from typing import cast
+
+from ..types import JSONData
+from ..types import JSONDict
+from ..types import JSONList
+
+# SECTION: FUNCTIONS ======================================================== #
+
+
+def read(
+    path: Path,
+) -> JSONList:
+    """
+    Load CSV content as a list of dictionaries.
+
+    Parameters
+    ----------
+    path : Path
+        Path to the CSV file on disk.
+
+    Returns
+    -------
+    JSONList
+        The list of dictionaries read from the CSV file.
+    """
+    with path.open('r', encoding='utf-8', newline='') as handle:
+        reader: csv.DictReader[str] = csv.DictReader(handle)
+        rows: JSONList = []
+        for row in reader:
+            if not any(row.values()):
+                continue
+            rows.append(cast(JSONDict, dict(row)))
+    return rows
+
+
+def write(
+    path: Path,
+    data: JSONData,
+) -> int:
+    """
+    Write CSV rows to ``path`` and return the number of rows.
+
+    Parameters
+    ----------
+    path : Path
+        Path to the CSV file on disk.
+    data : JSONData
+        Data to write as CSV. Should be a list of dictionaries or a
+        single dictionary.
+
+    Returns
+    -------
+    int
+        The number of rows written to the CSV file.
+    """
+    rows: list[JSONDict]
+    if isinstance(data, list):
+        rows = [row for row in data if isinstance(row, dict)]
+    else:
+        rows = [data]
+
+    if not rows:
+        return 0
+
+    fieldnames = sorted({key for row in rows for key in row})
+    with path.open('w', encoding='utf-8', newline='') as handle:
+        writer = csv.DictWriter(handle, fieldnames=fieldnames)
+        writer.writeheader()
+        for row in rows:
+            writer.writerow({field: row.get(field) for field in fieldnames})
+
+    return len(rows)
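
A quick round trip with these helpers (file name hypothetical) shows two design points: write() unions the keys of all rows into one sorted header and fills gaps with empty cells, and read() returns every value as a string, since csv.DictReader does not restore types:

    from pathlib import Path
    from etlplus.file import csv as csv_file

    rows = [{'name': 'Ada', 'age': 36}, {'name': 'Grace'}]
    csv_file.write(Path('people.csv'), rows)   # header: age,name -> 2
    csv_file.read(Path('people.csv'))
    # -> [{'age': '36', 'name': 'Ada'}, {'age': '', 'name': 'Grace'}]
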
etlplus/file/enums.py ADDED
@@ -0,0 +1,238 @@
+"""
+:mod:`etlplus.file.enums` module.
+
+File-specific enums and helpers.
+"""
+
+from __future__ import annotations
+
+from pathlib import PurePath
+
+from ..enums import CoercibleStrEnum
+from ..types import StrStrMap
+
+# SECTION: EXPORTS ========================================================== #
+
+__all__ = [
+    'CompressionFormat',
+    'FileFormat',
+    'infer_file_format_and_compression',
+]
+
+
+# SECTION: ENUMS ============================================================ #
+
+
+class CompressionFormat(CoercibleStrEnum):
+    """Supported compression formats."""
+
+    # -- Constants -- #
+
+    GZ = 'gz'
+    ZIP = 'zip'
+
+    # -- Class Methods -- #
+
+    @classmethod
+    def aliases(cls) -> StrStrMap:
+        """
+        Return a mapping of common aliases for each enum member.
+
+        Returns
+        -------
+        StrStrMap
+            A mapping of alias names to their corresponding enum member names.
+        """
+        return {
+            # File extensions
+            '.gz': 'gz',
+            '.gzip': 'gz',
+            '.zip': 'zip',
+            # MIME types
+            'application/gzip': 'gz',
+            'application/x-gzip': 'gz',
+            'application/zip': 'zip',
+            'application/x-zip-compressed': 'zip',
+        }
+
+
+class FileFormat(CoercibleStrEnum):
+    """Supported file formats for extraction."""
+
+    # -- Constants -- #
+
+    AVRO = 'avro'
+    CSV = 'csv'
+    FEATHER = 'feather'
+    GZ = 'gz'
+    JSON = 'json'
+    NDJSON = 'ndjson'
+    ORC = 'orc'
+    PARQUET = 'parquet'
+    TSV = 'tsv'
+    TXT = 'txt'
+    XLS = 'xls'
+    XLSX = 'xlsx'
+    XML = 'xml'
+    YAML = 'yaml'
+    ZIP = 'zip'
+
+    # -- Class Methods -- #
+
+    @classmethod
+    def aliases(cls) -> StrStrMap:
+        """
+        Return a mapping of common aliases for each enum member.
+
+        Returns
+        -------
+        StrStrMap
+            A mapping of alias names to their corresponding enum member names.
+        """
+        return {
+            # Common shorthand
+            'parq': 'parquet',
+            'yml': 'yaml',
+            # File extensions
+            '.avro': 'avro',
+            '.csv': 'csv',
+            '.feather': 'feather',
+            '.gz': 'gz',
+            '.json': 'json',
+            '.jsonl': 'ndjson',
+            '.ndjson': 'ndjson',
+            '.orc': 'orc',
+            '.parquet': 'parquet',
+            '.pq': 'parquet',
+            '.tsv': 'tsv',
+            '.txt': 'txt',
+            '.xls': 'xls',
+            '.xlsx': 'xlsx',
+            '.xml': 'xml',
+            '.yaml': 'yaml',
+            '.yml': 'yaml',
+            '.zip': 'zip',
+            # MIME types
+            'application/avro': 'avro',
+            'application/csv': 'csv',
+            'application/feather': 'feather',
+            'application/gzip': 'gz',
+            'application/json': 'json',
+            'application/jsonlines': 'ndjson',
+            'application/ndjson': 'ndjson',
+            'application/orc': 'orc',
+            'application/parquet': 'parquet',
+            'application/vnd.apache.arrow.file': 'feather',
+            'application/vnd.apache.avro': 'avro',
+            'application/vnd.apache.orc': 'orc',
+            'application/vnd.apache.parquet': 'parquet',
+            'application/vnd.ms-excel': 'xls',
+            (
+                'application/vnd.openxmlformats-'
+                'officedocument.spreadsheetml.sheet'
+            ): 'xlsx',
+            'application/x-avro': 'avro',
+            'application/x-csv': 'csv',
+            'application/x-feather': 'feather',
+            'application/x-ndjson': 'ndjson',
+            'application/x-orc': 'orc',
+            'application/x-parquet': 'parquet',
+            'application/x-yaml': 'yaml',
+            'application/xml': 'xml',
+            'application/zip': 'zip',
+            'text/csv': 'csv',
+            'text/plain': 'txt',
+            'text/tab-separated-values': 'tsv',
+            'text/tsv': 'tsv',
+            'text/xml': 'xml',
+            'text/yaml': 'yaml',
+        }
+
+
+# SECTION: INTERNAL CONSTANTS =============================================== #
+
+
+# Compression formats that are also file formats.
+_COMPRESSION_FILE_FORMATS: set[FileFormat] = {
+    FileFormat.GZ,
+    FileFormat.ZIP,
+}
+
+
+# SECTION: FUNCTIONS ======================================================== #
+
+
+# TODO: Convert to a method on FileFormat or CompressionFormat?
+def infer_file_format_and_compression(
+    value: object,
+    filename: object | None = None,
+) -> tuple[FileFormat | None, CompressionFormat | None]:
+    """
+    Infer data format and compression from a filename, extension, or MIME type.
+
+    Parameters
+    ----------
+    value : object
+        A filename, extension, MIME type, or existing enum member.
+    filename : object | None, optional
+        A filename to consult for extension-based inference (e.g. when
+        ``value`` is ``application/octet-stream``).
+
+    Returns
+    -------
+    tuple[FileFormat | None, CompressionFormat | None]
+        The inferred data format and compression, if any.
+    """
+    if isinstance(value, FileFormat):
+        if value in _COMPRESSION_FILE_FORMATS:
+            return None, CompressionFormat.coerce(value.value)
+        return value, None
+    if isinstance(value, CompressionFormat):
+        return None, value
+
+    text = str(value).strip()
+    if not text:
+        return None, None
+
+    normalized = text.casefold()
+    mime = normalized.split(';', 1)[0].strip()
+
+    is_octet_stream = mime == 'application/octet-stream'
+    compression = CompressionFormat.try_coerce(mime)
+    fmt = None if is_octet_stream else FileFormat.try_coerce(mime)
+
+    is_mime = mime.startswith(
+        (
+            'application/',
+            'text/',
+            'audio/',
+            'image/',
+            'video/',
+            'multipart/',
+        ),
+    )
+    suffix_source: object | None = filename if filename is not None else text
+    if is_mime and filename is None:
+        suffix_source = None
+
+    suffixes = (
+        PurePath(str(suffix_source)).suffixes
+        if suffix_source is not None
+        else []
+    )
+    if suffixes:
+        normalized_suffixes = [suffix.casefold() for suffix in suffixes]
+        compression = (
+            CompressionFormat.try_coerce(normalized_suffixes[-1])
+            or compression
+        )
+        if compression is not None:
+            normalized_suffixes = normalized_suffixes[:-1]
+        if normalized_suffixes:
+            fmt = FileFormat.try_coerce(normalized_suffixes[-1]) or fmt
+
+    if fmt in _COMPRESSION_FILE_FORMATS:
+        compression = compression or CompressionFormat.coerce(fmt.value)
+        fmt = None
+
+    return fmt, compression
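
To make the inference rules concrete, a few illustrative calls follow. These assume, as the code implies, that CoercibleStrEnum.try_coerce returns None for unknown inputs and that coerce resolves the aliases listed above; CoercibleStrEnum itself lives in etlplus.enums, outside this diff:

    from etlplus.file.enums import infer_file_format_and_compression

    infer_file_format_and_compression('data.csv')
    # -> (FileFormat.CSV, None)          extension match
    infer_file_format_and_compression('data.json.gz')
    # -> (FileFormat.JSON, CompressionFormat.GZ)
    infer_file_format_and_compression('text/csv')
    # -> (FileFormat.CSV, None)          MIME match; no suffix scan
    infer_file_format_and_compression(
        'application/octet-stream', filename='dump.parquet',
    )
    # -> (FileFormat.PARQUET, None)      falls back to the filename
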
etlplus/file/feather.py ADDED
@@ -0,0 +1,59 @@
+"""
+:mod:`etlplus.file.feather` module.
+
+Stub helpers for FEATHER read/write.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from ..types import JSONData
+
+# SECTION: FUNCTIONS ======================================================== #
+
+
+def read(path: Path) -> JSONData:
+    """
+    Read FEATHER content from ``path``.
+
+    Parameters
+    ----------
+    path : Path
+        Path to the FEATHER file on disk.
+
+    Returns
+    -------
+    JSONData
+        Parsed payload.
+
+    Raises
+    ------
+    NotImplementedError
+        FEATHER :func:`read` is not implemented yet.
+    """
+    raise NotImplementedError('FEATHER read is not implemented yet')
+
+
+def write(path: Path, data: JSONData) -> int:
+    """
+    Write ``data`` to FEATHER at ``path``.
+
+    Parameters
+    ----------
+    path : Path
+        Path to the FEATHER file on disk.
+    data : JSONData
+        Data to write.
+
+    Returns
+    -------
+    int
+        Number of records written.
+
+    Raises
+    ------
+    NotImplementedError
+        FEATHER :func:`write` is not implemented yet.
+    """
+    raise NotImplementedError('FEATHER write is not implemented yet')
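
Both stubs raise until a real backend is chosen. For illustration only, if pyarrow were that backend, the pair might look roughly like this sketch; pyarrow is an assumption, not a dependency declared in this diff, though read_table, Table.from_pylist, to_pylist, and write_feather are standard pyarrow APIs:

    import pyarrow as pa
    import pyarrow.feather as pf

    def read(path):
        # Feather table -> list of JSON-friendly row dicts.
        return pf.read_table(str(path)).to_pylist()

    def write(path, data):
        # Accept a single mapping or a list of mappings, mirroring csv.write.
        rows = data if isinstance(data, list) else [data]
        pf.write_feather(pa.Table.from_pylist(rows), str(path))
        return len(rows)
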