etlplus 0.10.5__py3-none-any.whl → 0.11.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
etlplus/file/core.py ADDED
@@ -0,0 +1,287 @@
1
+ """
2
+ :mod:`etlplus.file.core` module.
3
+
4
+ Shared helpers for reading and writing structured and semi-structured data
5
+ files.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+ from pathlib import Path
12
+
13
+ from ..types import JSONData
14
+ from ..types import StrPath
15
+ from . import csv
16
+ from . import json
17
+ from . import xml
18
+ from . import yaml
19
+ from .enums import FileFormat
20
+ from .enums import infer_file_format_and_compression
21
+
22
+ # SECTION: EXPORTS ========================================================== #
23
+
24
+
25
+ __all__ = ['File']
26
+
27
+
28
+ # SECTION: CLASSES ========================================================== #
29
+
30
+
31
+ @dataclass(slots=True)
32
+ class File:
33
+ """
34
+ Convenience wrapper around structured file IO.
35
+
36
+ This class encapsulates the one-off helpers in this module as convenient
37
+ instance methods while retaining the original function API for
38
+ backward compatibility (those functions delegate to this class).
39
+
40
+ Attributes
41
+ ----------
42
+ path : Path
43
+ Path to the file on disk.
44
+ file_format : FileFormat | None, optional
45
+ Explicit format. If omitted, the format is inferred from the file
46
+ extension (``.csv``, ``.json``, or ``.xml``).
47
+ """
48
+
49
+ # -- Attributes -- #
50
+
51
+ path: Path
52
+ file_format: FileFormat | None = None
53
+
54
+ # -- Magic Methods (Object Lifecycle) -- #
55
+
56
+ def __post_init__(self) -> None:
57
+ """
58
+ Auto-detect and set the file format on initialization.
59
+
60
+ If no explicit ``file_format`` is provided, attempt to infer it from
61
+ the file path's extension and update :attr:`file_format`. If the
62
+ extension is unknown, the attribute is left as ``None`` and will be
63
+ validated later by :meth:`_ensure_format`.
64
+ """
65
+ # Normalize incoming path (allow str in constructor) to Path.
66
+ if isinstance(self.path, str):
67
+ self.path = Path(self.path)
68
+
69
+ if self.file_format is None:
70
+ try:
71
+ self.file_format = self._guess_format()
72
+ except ValueError:
73
+ # Leave as None; _ensure_format() will raise on use if needed.
74
+ pass
75
+
76
+ # -- Internal Instance Methods -- #
77
+
78
+ def _assert_exists(self) -> None:
79
+ """
80
+ Raise FileNotFoundError if :attr:`path` does not exist.
81
+
82
+ This centralizes existence checks across multiple read methods.
83
+ """
84
+ if not self.path.exists():
85
+ raise FileNotFoundError(f'File not found: {self.path}')
86
+
87
+ def _ensure_format(self) -> FileFormat:
88
+ """
89
+ Resolve the active format, guessing from extension if needed.
90
+
91
+ Returns
92
+ -------
93
+ FileFormat
94
+ The resolved file format.
95
+ """
96
+ return (
97
+ self.file_format
98
+ if self.file_format is not None
99
+ else self._guess_format()
100
+ )
101
+
102
+ def _guess_format(self) -> FileFormat:
103
+ """
104
+ Infer the file format from the filename extension.
105
+
106
+ Returns
107
+ -------
108
+ FileFormat
109
+ The inferred file format based on the file extension.
110
+
111
+ Raises
112
+ ------
113
+ ValueError
114
+ If the extension is unknown or unsupported.
115
+ """
116
+ fmt, compression = infer_file_format_and_compression(self.path)
117
+ if fmt is not None:
118
+ return fmt
119
+ if compression is not None:
120
+ raise ValueError(
121
+ 'Cannot infer file format from compressed file '
122
+ f'{self.path!r} with compression {compression.value!r}',
123
+ )
124
+ raise ValueError(
125
+ f'Cannot infer file format from extension {self.path.suffix!r}',
126
+ )
127
+
128
+ # -- Instance Methods (Generic API) -- #
129
+
130
+ def read(self) -> JSONData:
131
+ """
132
+ Read structured data from :attr:`path` using :attr:`file_format`.
133
+
134
+ Returns
135
+ -------
136
+ JSONData
137
+ The structured data read from the file.
138
+
139
+ Raises
140
+ ------
141
+ ValueError
142
+ If the resolved file format is unsupported.
143
+ """
144
+ self._assert_exists()
145
+ fmt = self._ensure_format()
146
+ match fmt:
147
+ case FileFormat.CSV:
148
+ return csv.read(self.path)
149
+ case FileFormat.JSON:
150
+ return json.read(self.path)
151
+ case FileFormat.XML:
152
+ return xml.read(self.path)
153
+ case FileFormat.YAML:
154
+ return yaml.read(self.path)
155
+ raise ValueError(f'Unsupported format: {fmt}')
156
+
157
+ def write(
158
+ self,
159
+ data: JSONData,
160
+ *,
161
+ root_tag: str = xml.DEFAULT_XML_ROOT,
162
+ ) -> int:
163
+ """
164
+ Write ``data`` to :attr:`path` using :attr:`file_format`.
165
+
166
+ Parameters
167
+ ----------
168
+ data : JSONData
169
+ Data to write to the file.
170
+ root_tag : str, optional
171
+ Root tag name to use when writing XML files. Defaults to
172
+ ``'root'``.
173
+
174
+ Returns
175
+ -------
176
+ int
177
+ The number of records written.
178
+
179
+ Raises
180
+ ------
181
+ ValueError
182
+ If the resolved file format is unsupported.
183
+ """
184
+ fmt = self._ensure_format()
185
+ match fmt:
186
+ case FileFormat.CSV:
187
+ return csv.write(self.path, data)
188
+ case FileFormat.JSON:
189
+ return json.write(self.path, data)
190
+ case FileFormat.XML:
191
+ return xml.write(self.path, data, root_tag=root_tag)
192
+ case FileFormat.YAML:
193
+ return yaml.write(self.path, data)
194
+ raise ValueError(f'Unsupported format: {fmt}')
195
+
196
+ # -- Class Methods -- #
197
+
198
+ @classmethod
199
+ def from_path(
200
+ cls,
201
+ path: StrPath,
202
+ *,
203
+ file_format: FileFormat | str | None = None,
204
+ ) -> File:
205
+ """
206
+ Create a :class:`File` from any path-like and optional format.
207
+
208
+ Parameters
209
+ ----------
210
+ path : StrPath
211
+ Path to the file on disk.
212
+ file_format : FileFormat | str | None, optional
213
+ Explicit format. If omitted, the format is inferred from the file
214
+ extension (``.csv``, ``.json``, or ``.xml``).
215
+
216
+ Returns
217
+ -------
218
+ File
219
+ The constructed :class:`File` instance.
220
+ """
221
+ resolved = Path(path)
222
+ ff: FileFormat | None
223
+ if isinstance(file_format, str):
224
+ ff = FileFormat.coerce(file_format)
225
+ else:
226
+ ff = file_format
227
+
228
+ return cls(resolved, ff)
229
+
230
+ @classmethod
231
+ def read_file(
232
+ cls,
233
+ path: StrPath,
234
+ file_format: FileFormat | str | None = None,
235
+ ) -> JSONData:
236
+ """
237
+ Read structured data.
238
+
239
+ Parameters
240
+ ----------
241
+ path : StrPath
242
+ Path to the file on disk.
243
+ file_format : FileFormat | str | None, optional
244
+ Explicit format. If omitted, the format is inferred from the file
245
+ extension (``.csv``, ``.json``, or ``.xml``).
246
+
247
+ Returns
248
+ -------
249
+ JSONData
250
+ The structured data read from the file.
251
+ """
252
+ return cls.from_path(path, file_format=file_format).read()
253
+
254
+ @classmethod
255
+ def write_file(
256
+ cls,
257
+ path: StrPath,
258
+ data: JSONData,
259
+ file_format: FileFormat | str | None = None,
260
+ *,
261
+ root_tag: str = xml.DEFAULT_XML_ROOT,
262
+ ) -> int:
263
+ """
264
+ Write structured data and count written records.
265
+
266
+ Parameters
267
+ ----------
268
+ path : StrPath
269
+ Path to the file on disk.
270
+ data : JSONData
271
+ Data to write to the file.
272
+ file_format : FileFormat | str | None, optional
273
+ Explicit format. If omitted, the format is inferred from the file
274
+ extension (``.csv``, ``.json``, or ``.xml``).
275
+ root_tag : str, optional
276
+ Root tag name to use when writing XML files. Defaults to
277
+ ``'root'``.
278
+
279
+ Returns
280
+ -------
281
+ int
282
+ The number of records written to the file.
283
+ """
284
+ return cls.from_path(path, file_format=file_format).write(
285
+ data,
286
+ root_tag=root_tag,
287
+ )
etlplus/file/csv.py ADDED
@@ -0,0 +1,82 @@
1
+ """
2
+ :mod:`etlplus.file.csv` module.
3
+
4
+ CSV read/write helpers.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import csv
10
+ from pathlib import Path
11
+ from typing import cast
12
+
13
+ from ..types import JSONData
14
+ from ..types import JSONDict
15
+ from ..types import JSONList
16
+
17
+ # SECTION: FUNCTIONS ======================================================== #
18
+
19
+
20
def read(
    path: Path,
) -> JSONList:
    """
    Load CSV content as a list of dictionaries.

    Rows whose cells are all empty are skipped.

    Parameters
    ----------
    path : Path
        Path to the CSV file on disk.

    Returns
    -------
    JSONList
        The list of dictionaries read from the CSV file.
    """
    with path.open('r', encoding='utf-8', newline='') as stream:
        parsed: csv.DictReader[str] = csv.DictReader(stream)
        # Keep only rows that contain at least one non-empty cell; copy each
        # reader row into a plain dict.
        return [dict(record) for record in parsed if any(record.values())]
44
+
45
+
46
def write(
    path: Path,
    data: JSONData,
) -> int:
    """
    Write CSV rows to ``path`` and return the number of rows.

    Parameters
    ----------
    path : Path
        Path to the CSV file on disk.
    data : JSONData
        Data to write as CSV. Should be a list of dictionaries or a
        single dictionary.

    Returns
    -------
    int
        The number of rows written to the CSV file.
    """
    # Normalize input to a list of mapping rows; non-dict list items are
    # silently dropped.
    if isinstance(data, list):
        records = [item for item in data if isinstance(item, dict)]
    else:
        records = [data]

    if not records:
        # Nothing to write; the target file is left untouched.
        return 0

    # Header is the union of keys across all rows, in sorted order for
    # deterministic output.
    header = sorted({name for record in records for name in record})
    with path.open('w', encoding='utf-8', newline='') as stream:
        writer = csv.DictWriter(stream, fieldnames=header)
        writer.writeheader()
        writer.writerows(
            {name: record.get(name) for name in header}
            for record in records
        )

    return len(records)
etlplus/file/enums.py ADDED
@@ -0,0 +1,266 @@
1
+ """
2
+ :mod:`etlplus.file.enums` module.
3
+
4
+ File-specific enums and helpers.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import PurePath
10
+
11
+ from ..enums import CoercibleStrEnum
12
+ from ..types import StrStrMap
13
+
14
+ # SECTION: EXPORTS ========================================================= #
15
+
16
+ __all__ = [
17
+ 'CompressionFormat',
18
+ 'FileFormat',
19
+ 'coerce_compression_format',
20
+ 'coerce_file_format',
21
+ 'infer_file_format_and_compression',
22
+ ]
23
+
24
+
25
+ # SECTION: ENUMS ============================================================ #
26
+
27
+
28
class CompressionFormat(CoercibleStrEnum):
    """Supported compression formats."""

    # -- Constants -- #

    GZ = 'gz'
    ZIP = 'zip'

    # -- Class Methods -- #

    @classmethod
    def aliases(cls) -> StrStrMap:
        """
        Return a mapping of common aliases for each enum member.

        Returns
        -------
        StrStrMap
            A mapping of alias names to their corresponding enum member names.
        """
        # Extension-style aliases (leading dot included).
        extensions = {
            '.gz': 'gz',
            '.gzip': 'gz',
            '.zip': 'zip',
        }
        # MIME-type aliases.
        mime_types = {
            'application/gzip': 'gz',
            'application/x-gzip': 'gz',
            'application/zip': 'zip',
            'application/x-zip-compressed': 'zip',
        }
        return {**extensions, **mime_types}
59
+
60
+
61
class FileFormat(CoercibleStrEnum):
    """
    Supported file formats for extraction.

    Note that ``GZ`` and ``ZIP`` also appear in :class:`CompressionFormat`;
    :func:`infer_file_format_and_compression` reports them via the
    compression slot rather than as data formats.
    """

    # -- Constants -- #

    AVRO = 'avro'
    CSV = 'csv'
    FEATHER = 'feather'
    # Compression wrapper, not a data format (see _COMPRESSION_FILE_FORMATS).
    GZ = 'gz'
    JSON = 'json'
    NDJSON = 'ndjson'
    ORC = 'orc'
    PARQUET = 'parquet'
    TSV = 'tsv'
    TXT = 'txt'
    XLS = 'xls'
    XLSX = 'xlsx'
    # Compression wrapper, not a data format (see _COMPRESSION_FILE_FORMATS).
    ZIP = 'zip'
    XML = 'xml'
    YAML = 'yaml'

    # -- Class Methods -- #

    @classmethod
    def aliases(cls) -> StrStrMap:
        """
        Return a mapping of common aliases for each enum member.

        Aliases cover shorthand names, dotted file extensions, and MIME
        types; all values are enum member *values* (lowercase strings).

        Returns
        -------
        StrStrMap
            A mapping of alias names to their corresponding enum member names.
        """
        return {
            # Common shorthand
            'parq': 'parquet',
            'yml': 'yaml',
            # File extensions
            '.avro': 'avro',
            '.csv': 'csv',
            '.feather': 'feather',
            '.gz': 'gz',
            '.json': 'json',
            '.jsonl': 'ndjson',
            '.ndjson': 'ndjson',
            '.orc': 'orc',
            '.parquet': 'parquet',
            '.pq': 'parquet',
            '.tsv': 'tsv',
            '.txt': 'txt',
            '.xls': 'xls',
            '.xlsx': 'xlsx',
            '.zip': 'zip',
            '.xml': 'xml',
            '.yaml': 'yaml',
            '.yml': 'yaml',
            # MIME types
            'application/avro': 'avro',
            'application/csv': 'csv',
            'application/feather': 'feather',
            'application/gzip': 'gz',
            'application/json': 'json',
            'application/jsonlines': 'ndjson',
            'application/ndjson': 'ndjson',
            'application/orc': 'orc',
            'application/parquet': 'parquet',
            'application/vnd.apache.avro': 'avro',
            'application/vnd.apache.parquet': 'parquet',
            'application/vnd.apache.arrow.file': 'feather',
            'application/vnd.apache.orc': 'orc',
            'application/vnd.ms-excel': 'xls',
            # The xlsx MIME type exceeds the line limit, hence the
            # parenthesized implicit string concatenation.
            (
                'application/vnd.openxmlformats-'
                'officedocument.spreadsheetml.sheet'
            ): 'xlsx',
            'application/x-avro': 'avro',
            'application/x-csv': 'csv',
            'application/x-feather': 'feather',
            'application/x-orc': 'orc',
            'application/x-ndjson': 'ndjson',
            'application/x-parquet': 'parquet',
            'application/x-yaml': 'yaml',
            'application/xml': 'xml',
            'application/zip': 'zip',
            'text/csv': 'csv',
            'text/plain': 'txt',
            'text/tab-separated-values': 'tsv',
            'text/tsv': 'tsv',
            'text/xml': 'xml',
            'text/yaml': 'yaml',
        }
152
+
153
+
154
+ # SECTION: INTERNAL CONSTANTS =============================================== #
155
+
156
+
157
# Compression formats that are also file formats.
# Members listed here are reported as *compression* (never as a data format)
# by infer_file_format_and_compression().
_COMPRESSION_FILE_FORMATS: set[FileFormat] = {
    FileFormat.GZ,
    FileFormat.ZIP,
}
162
+
163
+
164
+ # SECTION: FUNCTIONS ======================================================== #
165
+
166
+
167
# TODO: Deprecate in favor of using the enum methods directly.
def coerce_compression_format(
    compression_format: CompressionFormat | str,
) -> CompressionFormat:
    """
    Normalize textual compression format values to :class:`CompressionFormat`.

    This thin wrapper is kept for backward compatibility; prefer
    :meth:`CompressionFormat.coerce` going forward.

    Parameters
    ----------
    compression_format : CompressionFormat | str
        An existing enum member, or any string accepted by
        :meth:`CompressionFormat.coerce` (presumably including the aliases
        declared in :meth:`CompressionFormat.aliases`).

    Returns
    -------
    CompressionFormat
        The coerced enum member.
    """
    return CompressionFormat.coerce(compression_format)
178
+
179
+
180
# TODO: Deprecate in favor of using the enum methods directly.
def coerce_file_format(
    file_format: FileFormat | str,
) -> FileFormat:
    """
    Normalize textual file format values to :class:`FileFormat`.

    This thin wrapper is kept for backward compatibility; prefer
    :meth:`FileFormat.coerce` going forward.

    Parameters
    ----------
    file_format : FileFormat | str
        An existing enum member, or any string accepted by
        :meth:`FileFormat.coerce` (presumably including the aliases
        declared in :meth:`FileFormat.aliases`).

    Returns
    -------
    FileFormat
        The coerced enum member.
    """
    return FileFormat.coerce(file_format)
191
+
192
+
193
# TODO: Convert to a method on FileFormat or CompressionFormat?
def infer_file_format_and_compression(
    value: object,
    filename: object | None = None,
) -> tuple[FileFormat | None, CompressionFormat | None]:
    """
    Infer data format and compression from a filename, extension, or MIME type.

    Parameters
    ----------
    value : object
        A filename, extension, MIME type, or existing enum member.
    filename : object | None, optional
        A filename to consult for extension-based inference (e.g. when
        ``value`` is ``application/octet-stream``).

    Returns
    -------
    tuple[FileFormat | None, CompressionFormat | None]
        The inferred data format and compression, if any.
    """
    # Fast paths: the caller already passed an enum member.
    if isinstance(value, FileFormat):
        # GZ/ZIP are listed in FileFormat but describe compression, so
        # report them via the compression slot instead.
        if value in _COMPRESSION_FILE_FORMATS:
            return None, CompressionFormat.coerce(value.value)
        return value, None
    if isinstance(value, CompressionFormat):
        return None, value

    text = str(value).strip()
    if not text:
        # Blank input: nothing to infer.
        return None, None

    normalized = text.casefold()
    # Strip MIME parameters ('text/csv; charset=utf-8' -> 'text/csv').
    mime = normalized.split(';', 1)[0].strip()

    # 'application/octet-stream' is generic, so never map it to a file
    # format directly; extension-based inference below may still apply.
    is_octet_stream = mime == 'application/octet-stream'
    # NOTE: try_coerce is used with `or`-fallbacks below, so it is expected
    # to return None for unrecognized values.
    compression = CompressionFormat.try_coerce(mime)
    fmt = None if is_octet_stream else FileFormat.try_coerce(mime)

    # Heuristic: 'type/subtype' strings are MIME types; a MIME `value` by
    # itself carries no usable filename extension.
    is_mime = mime.startswith(
        (
            'application/',
            'text/',
            'audio/',
            'image/',
            'video/',
            'multipart/',
        ),
    )
    # Prefer the explicit `filename` for suffix inference; fall back to the
    # raw `value` text only when it is not itself a MIME type.
    suffix_source: object | None = filename if filename is not None else text
    if is_mime and filename is None:
        suffix_source = None

    suffixes = (
        PurePath(str(suffix_source)).suffixes
        if suffix_source is not None
        else []
    )
    if suffixes:
        normalized_suffixes = [suffix.casefold() for suffix in suffixes]
        # The trailing suffix may denote compression ('data.csv.gz');
        # an extension-derived compression wins over the MIME-derived one.
        compression = (
            CompressionFormat.try_coerce(normalized_suffixes[-1])
            or compression
        )
        if compression is not None:
            # Drop the compression suffix so the data-format suffix (if
            # any) becomes the last element.
            normalized_suffixes = normalized_suffixes[:-1]
        if normalized_suffixes:
            fmt = FileFormat.try_coerce(normalized_suffixes[-1]) or fmt

    # Normalize GZ/ZIP inferred as a *format* into the compression slot.
    if fmt in _COMPRESSION_FILE_FORMATS:
        compression = compression or CompressionFormat.coerce(fmt.value)
        fmt = None

    return fmt, compression