etlplus 0.10.4__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
etlplus/file/csv.py ADDED
@@ -0,0 +1,82 @@
+ """
+ :mod:`etlplus.file.csv` module.
+
+ CSV read/write helpers.
+ """
+
+ from __future__ import annotations
+
+ import csv
+ from pathlib import Path
+ from typing import cast
+
+ from ..types import JSONData
+ from ..types import JSONDict
+ from ..types import JSONList
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def read(
+     path: Path,
+ ) -> JSONList:
+     """
+     Load CSV content as a list of dictionaries.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the CSV file on disk.
+
+     Returns
+     -------
+     JSONList
+         The list of dictionaries read from the CSV file.
+     """
+     with path.open('r', encoding='utf-8', newline='') as handle:
+         reader: csv.DictReader[str] = csv.DictReader(handle)
+         rows: JSONList = []
+         for row in reader:
+             if not any(row.values()):
+                 continue
+             rows.append(cast(JSONDict, dict(row)))
+         return rows
+
+
+ def write(
+     path: Path,
+     data: JSONData,
+ ) -> int:
+     """
+     Write CSV rows to ``path`` and return the number of rows.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the CSV file on disk.
+     data : JSONData
+         Data to write as CSV. Should be a list of dictionaries or a
+         single dictionary.
+
+     Returns
+     -------
+     int
+         The number of rows written to the CSV file.
+     """
+     rows: list[JSONDict]
+     if isinstance(data, list):
+         rows = [row for row in data if isinstance(row, dict)]
+     else:
+         rows = [data]
+
+     if not rows:
+         return 0
+
+     fieldnames = sorted({key for row in rows for key in row})
+     with path.open('w', encoding='utf-8', newline='') as handle:
+         writer = csv.DictWriter(handle, fieldnames=fieldnames)
+         writer.writeheader()
+         for row in rows:
+             writer.writerow({field: row.get(field) for field in fieldnames})
+
+     return len(rows)
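
A quick round-trip sketch of the new CSV helpers (the import alias and file name are illustrative only; note that csv.DictReader yields every value as a string, so rows read back are all-str dictionaries):

    from pathlib import Path

    from etlplus.file import csv as csv_file  # aliased to avoid shadowing stdlib csv

    path = Path('people.csv')  # hypothetical file

    # Keys across all rows are unioned and sorted into the header row;
    # fields missing from a row are written as empty cells.
    assert csv_file.write(path, [
        {'name': 'Ada', 'born': 1815},
        {'name': 'Grace'},
    ]) == 2

    # Fully empty rows are skipped on read; every value comes back as a str.
    assert csv_file.read(path) == [
        {'born': '1815', 'name': 'Ada'},
        {'born': '', 'name': 'Grace'},
    ]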
etlplus/file/enums.py ADDED
@@ -0,0 +1,266 @@
+ """
+ :mod:`etlplus.file.enums` module.
+
+ File-specific enums and helpers.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import PurePath
+
+ from ..enums import CoercibleStrEnum
+ from ..types import StrStrMap
+
+ # SECTION: EXPORTS ========================================================= #
+
+ __all__ = [
+     'CompressionFormat',
+     'FileFormat',
+     'coerce_compression_format',
+     'coerce_file_format',
+     'infer_file_format_and_compression',
+ ]
+
+
+ # SECTION: ENUMS ============================================================ #
+
+
+ class CompressionFormat(CoercibleStrEnum):
+     """Supported compression formats."""
+
+     # -- Constants -- #
+
+     GZ = 'gz'
+     ZIP = 'zip'
+
+     # -- Class Methods -- #
+
+     @classmethod
+     def aliases(cls) -> StrStrMap:
+         """
+         Return a mapping of common aliases for each enum member.
+
+         Returns
+         -------
+         StrStrMap
+             A mapping of alias names to their corresponding enum member names.
+         """
+         return {
+             # File extensions
+             '.gz': 'gz',
+             '.gzip': 'gz',
+             '.zip': 'zip',
+             # MIME types
+             'application/gzip': 'gz',
+             'application/x-gzip': 'gz',
+             'application/zip': 'zip',
+             'application/x-zip-compressed': 'zip',
+         }
+
+
+ class FileFormat(CoercibleStrEnum):
+     """Supported file formats for extraction."""
+
+     # -- Constants -- #
+
+     AVRO = 'avro'
+     CSV = 'csv'
+     FEATHER = 'feather'
+     GZ = 'gz'
+     JSON = 'json'
+     NDJSON = 'ndjson'
+     ORC = 'orc'
+     PARQUET = 'parquet'
+     TSV = 'tsv'
+     TXT = 'txt'
+     XLS = 'xls'
+     XLSX = 'xlsx'
+     ZIP = 'zip'
+     XML = 'xml'
+     YAML = 'yaml'
+
+     # -- Class Methods -- #
+
+     @classmethod
+     def aliases(cls) -> StrStrMap:
+         """
+         Return a mapping of common aliases for each enum member.
+
+         Returns
+         -------
+         StrStrMap
+             A mapping of alias names to their corresponding enum member names.
+         """
+         return {
+             # Common shorthand
+             'parq': 'parquet',
+             'yml': 'yaml',
+             # File extensions
+             '.avro': 'avro',
+             '.csv': 'csv',
+             '.feather': 'feather',
+             '.gz': 'gz',
+             '.json': 'json',
+             '.jsonl': 'ndjson',
+             '.ndjson': 'ndjson',
+             '.orc': 'orc',
+             '.parquet': 'parquet',
+             '.pq': 'parquet',
+             '.tsv': 'tsv',
+             '.txt': 'txt',
+             '.xls': 'xls',
+             '.xlsx': 'xlsx',
+             '.zip': 'zip',
+             '.xml': 'xml',
+             '.yaml': 'yaml',
+             '.yml': 'yaml',
+             # MIME types
+             'application/avro': 'avro',
+             'application/csv': 'csv',
+             'application/feather': 'feather',
+             'application/gzip': 'gz',
+             'application/json': 'json',
+             'application/jsonlines': 'ndjson',
+             'application/ndjson': 'ndjson',
+             'application/orc': 'orc',
+             'application/parquet': 'parquet',
+             'application/vnd.apache.avro': 'avro',
+             'application/vnd.apache.parquet': 'parquet',
+             'application/vnd.apache.arrow.file': 'feather',
+             'application/vnd.apache.orc': 'orc',
+             'application/vnd.ms-excel': 'xls',
+             (
+                 'application/vnd.openxmlformats-'
+                 'officedocument.spreadsheetml.sheet'
+             ): 'xlsx',
+             'application/x-avro': 'avro',
+             'application/x-csv': 'csv',
+             'application/x-feather': 'feather',
+             'application/x-orc': 'orc',
+             'application/x-ndjson': 'ndjson',
+             'application/x-parquet': 'parquet',
+             'application/x-yaml': 'yaml',
+             'application/xml': 'xml',
+             'application/zip': 'zip',
+             'text/csv': 'csv',
+             'text/plain': 'txt',
+             'text/tab-separated-values': 'tsv',
+             'text/tsv': 'tsv',
+             'text/xml': 'xml',
+             'text/yaml': 'yaml',
+         }
+
+
+ # SECTION: INTERNAL CONSTANTS =============================================== #
+
+
+ # Compression formats that are also file formats.
+ _COMPRESSION_FILE_FORMATS: set[FileFormat] = {
+     FileFormat.GZ,
+     FileFormat.ZIP,
+ }
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ # TODO: Deprecate in favor of using the enum methods directly.
+ def coerce_compression_format(
+     compression_format: CompressionFormat | str,
+ ) -> CompressionFormat:
+     """
+     Normalize textual compression format values to :class:`CompressionFormat`.
+
+     This thin wrapper is kept for backward compatibility; prefer
+     :meth:`CompressionFormat.coerce` going forward.
+     """
+     return CompressionFormat.coerce(compression_format)
+
+
+ # TODO: Deprecate in favor of using the enum methods directly.
+ def coerce_file_format(
+     file_format: FileFormat | str,
+ ) -> FileFormat:
+     """
+     Normalize textual file format values to :class:`FileFormat`.
+
+     This thin wrapper is kept for backward compatibility; prefer
+     :meth:`FileFormat.coerce` going forward.
+     """
+     return FileFormat.coerce(file_format)
+
+
+ # TODO: Convert to a method on FileFormat or CompressionFormat?
+ def infer_file_format_and_compression(
+     value: object,
+     filename: object | None = None,
+ ) -> tuple[FileFormat | None, CompressionFormat | None]:
+     """
+     Infer data format and compression from a filename, extension, or MIME type.
+
+     Parameters
+     ----------
+     value : object
+         A filename, extension, MIME type, or existing enum member.
+     filename : object | None, optional
+         A filename to consult for extension-based inference (e.g. when
+         ``value`` is ``application/octet-stream``).
+
+     Returns
+     -------
+     tuple[FileFormat | None, CompressionFormat | None]
+         The inferred data format and compression, if any.
+     """
+     if isinstance(value, FileFormat):
+         if value in _COMPRESSION_FILE_FORMATS:
+             return None, CompressionFormat.coerce(value.value)
+         return value, None
+     if isinstance(value, CompressionFormat):
+         return None, value
+
+     text = str(value).strip()
+     if not text:
+         return None, None
+
+     normalized = text.casefold()
+     mime = normalized.split(';', 1)[0].strip()
+
+     is_octet_stream = mime == 'application/octet-stream'
+     compression = CompressionFormat.try_coerce(mime)
+     fmt = None if is_octet_stream else FileFormat.try_coerce(mime)
+
+     is_mime = mime.startswith(
+         (
+             'application/',
+             'text/',
+             'audio/',
+             'image/',
+             'video/',
+             'multipart/',
+         ),
+     )
+     suffix_source: object | None = filename if filename is not None else text
+     if is_mime and filename is None:
+         suffix_source = None
+
+     suffixes = (
+         PurePath(str(suffix_source)).suffixes
+         if suffix_source is not None
+         else []
+     )
+     if suffixes:
+         normalized_suffixes = [suffix.casefold() for suffix in suffixes]
+         compression = (
+             CompressionFormat.try_coerce(normalized_suffixes[-1])
+             or compression
+         )
+         if compression is not None:
+             normalized_suffixes = normalized_suffixes[:-1]
+         if normalized_suffixes:
+             fmt = FileFormat.try_coerce(normalized_suffixes[-1]) or fmt
+
+     if fmt in _COMPRESSION_FILE_FORMATS:
+         compression = compression or CompressionFormat.coerce(fmt.value)
+         fmt = None
+
+     return fmt, compression
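
A sketch of infer_file_format_and_compression on the three input shapes it handles, assuming CoercibleStrEnum.try_coerce resolves the aliases() tables and returns None for unrecognized values (which is what the function's own fallback logic implies; that base class is outside this diff):

    from etlplus.file.enums import (
        CompressionFormat,
        FileFormat,
        infer_file_format_and_compression,
    )

    # Compound extension: the last suffix is the compression,
    # the one before it is the data format.
    assert infer_file_format_and_compression('data.csv.gz') == (
        FileFormat.CSV,
        CompressionFormat.GZ,
    )

    # MIME types resolve through the alias tables; no suffix probing is done.
    assert infer_file_format_and_compression('application/json') == (
        FileFormat.JSON,
        None,
    )

    # An opaque MIME type defers to the filename hint.
    assert infer_file_format_and_compression(
        'application/octet-stream',
        filename='dump.jsonl',
    ) == (FileFormat.NDJSON, None)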
etlplus/file/json.py ADDED
@@ -0,0 +1,87 @@
+ """
+ :mod:`etlplus.file.json` module.
+
+ JSON read/write helpers.
+ """
+
+ from __future__ import annotations
+
+ import json
+ from pathlib import Path
+ from typing import cast
+
+ from ..types import JSONData
+ from ..types import JSONDict
+ from ..types import JSONList
+ from ..utils import count_records
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def read(
+     path: Path,
+ ) -> JSONData:
+     """
+     Load and validate JSON payloads from ``path``.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the JSON file on disk.
+
+     Returns
+     -------
+     JSONData
+         The structured data read from the JSON file.
+
+     Raises
+     ------
+     TypeError
+         If the JSON root is not an object or an array of objects.
+     """
+     with path.open('r', encoding='utf-8') as handle:
+         loaded = json.load(handle)
+
+     if isinstance(loaded, dict):
+         return cast(JSONDict, loaded)
+     if isinstance(loaded, list):
+         if all(isinstance(item, dict) for item in loaded):
+             return cast(JSONList, loaded)
+         raise TypeError(
+             'JSON array must contain only objects (dicts) when loading file',
+         )
+     raise TypeError(
+         'JSON root must be an object or an array of objects when loading file',
+     )
+
+
+ def write(
+     path: Path,
+     data: JSONData,
+ ) -> int:
+     """
+     Write ``data`` as formatted JSON to ``path``.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the JSON file on disk.
+     data : JSONData
+         Data to serialize as JSON.
+
+     Returns
+     -------
+     int
+         The number of records written to the JSON file.
+     """
+     path.parent.mkdir(parents=True, exist_ok=True)
+     with path.open('w', encoding='utf-8') as handle:
+         json.dump(
+             data,
+             handle,
+             indent=2,
+             ensure_ascii=False,
+         )
+         handle.write('\n')
+
+     return count_records(data)
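
Because read rejects any root that is not an object or an array of objects, it doubles as a validating loader. A minimal sketch (file name illustrative; the exact count returned by write comes from etlplus.utils.count_records, which is outside this diff):

    import json as stdlib_json
    from pathlib import Path

    from etlplus.file import json as json_file  # aliased to avoid shadowing stdlib json

    path = Path('payload.json')  # hypothetical file
    json_file.write(path, [{'id': 1}, {'id': 2}])
    assert json_file.read(path) == [{'id': 1}, {'id': 2}]

    # A scalar root is valid JSON but is rejected on read.
    path.write_text(stdlib_json.dumps(42), encoding='utf-8')
    try:
        json_file.read(path)
    except TypeError as exc:
        print(exc)  # JSON root must be an object or an array of objects ...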
etlplus/file/xml.py ADDED
@@ -0,0 +1,165 @@
+ """
+ :mod:`etlplus.file.xml` module.
+
+ XML read/write helpers.
+ """
+
+ from __future__ import annotations
+
+ import xml.etree.ElementTree as ET
+ from pathlib import Path
+ from typing import Any
+
+ from ..types import JSONData
+ from ..types import JSONDict
+ from ..utils import count_records
+
+ # SECTION: CONSTANTS ======================================================== #
+
+
+ DEFAULT_XML_ROOT = 'root'
+
+
+ # SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+ def _dict_to_element(
+     name: str,
+     payload: Any,
+ ) -> ET.Element:
+     """
+     Convert a dictionary-like payload into an XML element.
+
+     Parameters
+     ----------
+     name : str
+         Name of the XML element.
+     payload : Any
+         The data to include in the XML element.
+
+     Returns
+     -------
+     ET.Element
+         The constructed XML element.
+     """
+     element = ET.Element(name)
+
+     if isinstance(payload, dict):
+         text = payload.get('text')
+         if text is not None:
+             element.text = str(text)
+
+         for key, value in payload.items():
+             if key == 'text':
+                 continue
+             if key.startswith('@'):
+                 element.set(key[1:], str(value))
+                 continue
+             if isinstance(value, list):
+                 for item in value:
+                     element.append(_dict_to_element(key, item))
+             else:
+                 element.append(_dict_to_element(key, value))
+     elif isinstance(payload, list):
+         for item in payload:
+             element.append(_dict_to_element('item', item))
+     elif payload is not None:
+         element.text = str(payload)
+
+     return element
+
+
+ def _element_to_dict(
+     element: ET.Element,
+ ) -> JSONDict:
+     """
+     Convert an XML element into a nested dictionary.
+
+     Parameters
+     ----------
+     element : ET.Element
+         XML element to convert.
+
+     Returns
+     -------
+     JSONDict
+         Nested dictionary representation of the XML element.
+     """
+     result: JSONDict = {}
+     text = (element.text or '').strip()
+     if text:
+         result['text'] = text
+
+     for child in element:
+         child_data = _element_to_dict(child)
+         tag = child.tag
+         if tag in result:
+             existing = result[tag]
+             if isinstance(existing, list):
+                 existing.append(child_data)
+             else:
+                 result[tag] = [existing, child_data]
+         else:
+             result[tag] = child_data
+
+     for key, value in element.attrib.items():
+         if key in result:
+             result[f'@{key}'] = value
+         else:
+             result[key] = value
+     return result
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def read(
+     path: Path,
+ ) -> JSONDict:
+     """
+     Parse the XML document at ``path`` into a nested dictionary.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the XML file on disk.
+
+     Returns
+     -------
+     JSONDict
+         Nested dictionary representation of the XML file.
+     """
+     tree = ET.parse(path)
+     root = tree.getroot()
+
+     return {root.tag: _element_to_dict(root)}
+
+
+ def write(path: Path, data: JSONData, *, root_tag: str) -> int:
+     """
+     Write ``data`` as XML to ``path`` and return the record count.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the XML file on disk.
+     data : JSONData
+         Data to write as XML.
+     root_tag : str
+         Root tag name to use when writing XML files.
+
+     Returns
+     -------
+     int
+         The number of records written to the XML file.
+     """
+     if isinstance(data, dict) and len(data) == 1:
+         root_name, payload = next(iter(data.items()))
+         root_element = _dict_to_element(str(root_name), payload)
+     else:
+         root_element = _dict_to_element(root_tag, data)
+
+     tree = ET.ElementTree(root_element)
+     tree.write(path, encoding='utf-8', xml_declaration=True)
+
+     return count_records(data)
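
The dict convention the XML helpers use: a 'text' key becomes element text, '@'-prefixed keys become attributes, and list values become repeated child elements. A round-trip sketch (file name illustrative; note the asymmetry that read only re-applies the '@' prefix when an attribute name collides with a child tag):

    from pathlib import Path

    from etlplus.file import xml as xml_file

    path = Path('book.xml')  # hypothetical file
    xml_file.write(
        path,
        {
            'title': 'Dune',
            'author': {'text': 'Frank Herbert', '@role': 'primary'},
        },
        root_tag='book',
    )
    # Produces (after the XML declaration):
    # <book><title>Dune</title><author role="primary">Frank Herbert</author></book>

    assert xml_file.read(path) == {
        'book': {
            'title': {'text': 'Dune'},
            'author': {'text': 'Frank Herbert', 'role': 'primary'},
        },
    }

When the payload is a single-key dict, that key becomes the root element and root_tag is ignored, per the len(data) == 1 branch in write.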