etlplus 0.9.0__py3-none-any.whl → 0.11.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
etlplus/file/enums.py ADDED
@@ -0,0 +1,238 @@
+"""
+:mod:`etlplus.file.enums` module.
+
+File-specific enums and helpers.
+"""
+
+from __future__ import annotations
+
+from pathlib import PurePath
+
+from ..enums import CoercibleStrEnum
+from ..types import StrStrMap
+
+# SECTION: EXPORTS ========================================================= #
+
+__all__ = [
+    'CompressionFormat',
+    'FileFormat',
+    'infer_file_format_and_compression',
+]
+
+
+# SECTION: ENUMS ============================================================ #
+
+
+class CompressionFormat(CoercibleStrEnum):
+    """Supported compression formats."""
+
+    # -- Constants -- #
+
+    GZ = 'gz'
+    ZIP = 'zip'
+
+    # -- Class Methods -- #
+
+    @classmethod
+    def aliases(cls) -> StrStrMap:
+        """
+        Return a mapping of common aliases for each enum member.
+
+        Returns
+        -------
+        StrStrMap
+            A mapping of alias names to their corresponding enum member names.
+        """
+        return {
+            # File extensions
+            '.gz': 'gz',
+            '.gzip': 'gz',
+            '.zip': 'zip',
+            # MIME types
+            'application/gzip': 'gz',
+            'application/x-gzip': 'gz',
+            'application/zip': 'zip',
+            'application/x-zip-compressed': 'zip',
+        }
+
+
+class FileFormat(CoercibleStrEnum):
+    """Supported file formats for extraction."""
+
+    # -- Constants -- #
+
+    AVRO = 'avro'
+    CSV = 'csv'
+    FEATHER = 'feather'
+    GZ = 'gz'
+    JSON = 'json'
+    NDJSON = 'ndjson'
+    ORC = 'orc'
+    PARQUET = 'parquet'
+    TSV = 'tsv'
+    TXT = 'txt'
+    XLS = 'xls'
+    XLSX = 'xlsx'
+    ZIP = 'zip'
+    XML = 'xml'
+    YAML = 'yaml'
+
+    # -- Class Methods -- #
+
+    @classmethod
+    def aliases(cls) -> StrStrMap:
+        """
+        Return a mapping of common aliases for each enum member.
+
+        Returns
+        -------
+        StrStrMap
+            A mapping of alias names to their corresponding enum member names.
+        """
+        return {
+            # Common shorthand
+            'parq': 'parquet',
+            'yml': 'yaml',
+            # File extensions
+            '.avro': 'avro',
+            '.csv': 'csv',
+            '.feather': 'feather',
+            '.gz': 'gz',
+            '.json': 'json',
+            '.jsonl': 'ndjson',
+            '.ndjson': 'ndjson',
+            '.orc': 'orc',
+            '.parquet': 'parquet',
+            '.pq': 'parquet',
+            '.tsv': 'tsv',
+            '.txt': 'txt',
+            '.xls': 'xls',
+            '.xlsx': 'xlsx',
+            '.zip': 'zip',
+            '.xml': 'xml',
+            '.yaml': 'yaml',
+            '.yml': 'yaml',
+            # MIME types
+            'application/avro': 'avro',
+            'application/csv': 'csv',
+            'application/feather': 'feather',
+            'application/gzip': 'gz',
+            'application/json': 'json',
+            'application/jsonlines': 'ndjson',
+            'application/ndjson': 'ndjson',
+            'application/orc': 'orc',
+            'application/parquet': 'parquet',
+            'application/vnd.apache.avro': 'avro',
+            'application/vnd.apache.parquet': 'parquet',
+            'application/vnd.apache.arrow.file': 'feather',
+            'application/vnd.apache.orc': 'orc',
+            'application/vnd.ms-excel': 'xls',
+            (
+                'application/vnd.openxmlformats-'
+                'officedocument.spreadsheetml.sheet'
+            ): 'xlsx',
+            'application/x-avro': 'avro',
+            'application/x-csv': 'csv',
+            'application/x-feather': 'feather',
+            'application/x-orc': 'orc',
+            'application/x-ndjson': 'ndjson',
+            'application/x-parquet': 'parquet',
+            'application/x-yaml': 'yaml',
+            'application/xml': 'xml',
+            'application/zip': 'zip',
+            'text/csv': 'csv',
+            'text/plain': 'txt',
+            'text/tab-separated-values': 'tsv',
+            'text/tsv': 'tsv',
+            'text/xml': 'xml',
+            'text/yaml': 'yaml',
+        }
+
+
+# SECTION: INTERNAL CONSTANTS =============================================== #
+
+
+# Compression formats that are also file formats.
+_COMPRESSION_FILE_FORMATS: set[FileFormat] = {
+    FileFormat.GZ,
+    FileFormat.ZIP,
+}
+
+
+# SECTION: FUNCTIONS ======================================================== #
+
+
+# TODO: Convert to a method on FileFormat or CompressionFormat?
+def infer_file_format_and_compression(
+    value: object,
+    filename: object | None = None,
+) -> tuple[FileFormat | None, CompressionFormat | None]:
+    """
+    Infer data format and compression from a filename, extension, or MIME type.
+
+    Parameters
+    ----------
+    value : object
+        A filename, extension, MIME type, or existing enum member.
+    filename : object | None, optional
+        A filename to consult for extension-based inference (e.g. when
+        ``value`` is ``application/octet-stream``).
+
+    Returns
+    -------
+    tuple[FileFormat | None, CompressionFormat | None]
+        The inferred data format and compression, if any.
+    """
+    if isinstance(value, FileFormat):
+        if value in _COMPRESSION_FILE_FORMATS:
+            return None, CompressionFormat.coerce(value.value)
+        return value, None
+    if isinstance(value, CompressionFormat):
+        return None, value
+
+    text = str(value).strip()
+    if not text:
+        return None, None
+
+    normalized = text.casefold()
+    mime = normalized.split(';', 1)[0].strip()
+
+    is_octet_stream = mime == 'application/octet-stream'
+    compression = CompressionFormat.try_coerce(mime)
+    fmt = None if is_octet_stream else FileFormat.try_coerce(mime)
+
+    is_mime = mime.startswith(
+        (
+            'application/',
+            'text/',
+            'audio/',
+            'image/',
+            'video/',
+            'multipart/',
+        ),
+    )
+    suffix_source: object | None = filename if filename is not None else text
+    if is_mime and filename is None:
+        suffix_source = None
+
+    suffixes = (
+        PurePath(str(suffix_source)).suffixes
+        if suffix_source is not None
+        else []
+    )
+    if suffixes:
+        normalized_suffixes = [suffix.casefold() for suffix in suffixes]
+        compression = (
+            CompressionFormat.try_coerce(normalized_suffixes[-1])
+            or compression
+        )
+        if compression is not None:
+            normalized_suffixes = normalized_suffixes[:-1]
+        if normalized_suffixes:
+            fmt = FileFormat.try_coerce(normalized_suffixes[-1]) or fmt
+
+    if fmt in _COMPRESSION_FILE_FORMATS:
+        compression = compression or CompressionFormat.coerce(fmt.value)
+        fmt = None
+
+    return fmt, compression
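
The helper leans entirely on the ``coerce``/``try_coerce`` classmethods that ``CoercibleStrEnum`` provides, trying the trailing suffix as a compression format before matching the file format. A minimal usage sketch, with results traced from the code above rather than from a live install (sample inputs are illustrative):

    from etlplus.file.enums import infer_file_format_and_compression

    # A trailing compression suffix is split off before format matching.
    infer_file_format_and_compression('events.csv.gz')
    # -> (FileFormat.CSV, CompressionFormat.GZ)

    # MIME parameters are stripped; aliases() resolves the base type.
    infer_file_format_and_compression('application/json; charset=utf-8')
    # -> (FileFormat.JSON, None)

    # Opaque MIME types defer to the optional filename hint.
    infer_file_format_and_compression(
        'application/octet-stream',
        filename='dump.ndjson',
    )
    # -> (FileFormat.NDJSON, None)

    # A bare compression token yields no file format.
    infer_file_format_and_compression('.zip')
    # -> (None, CompressionFormat.ZIP)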
etlplus/file/json.py ADDED
@@ -0,0 +1,87 @@
+"""
+:mod:`etlplus.file.json` module.
+
+JSON read/write helpers.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import cast
+
+from ..types import JSONData
+from ..types import JSONDict
+from ..types import JSONList
+from ..utils import count_records
+
+# SECTION: FUNCTIONS ======================================================== #
+
+
+def read(
+    path: Path,
+) -> JSONData:
+    """
+    Load and validate JSON payloads from ``path``.
+
+    Parameters
+    ----------
+    path : Path
+        Path to the JSON file on disk.
+
+    Returns
+    -------
+    JSONData
+        The structured data read from the JSON file.
+
+    Raises
+    ------
+    TypeError
+        If the JSON root is not an object or an array of objects.
+    """
+    with path.open('r', encoding='utf-8') as handle:
+        loaded = json.load(handle)
+
+    if isinstance(loaded, dict):
+        return cast(JSONDict, loaded)
+    if isinstance(loaded, list):
+        if all(isinstance(item, dict) for item in loaded):
+            return cast(JSONList, loaded)
+        raise TypeError(
+            'JSON array must contain only objects (dicts) when loading file',
+        )
+    raise TypeError(
+        'JSON root must be an object or an array of objects when loading file',
+    )
+
+
+def write(
+    path: Path,
+    data: JSONData,
+) -> int:
+    """
+    Write ``data`` as formatted JSON to ``path``.
+
+    Parameters
+    ----------
+    path : Path
+        Path to the JSON file on disk.
+    data : JSONData
+        Data to serialize as JSON.
+
+    Returns
+    -------
+    int
+        The number of records written to the JSON file.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open('w', encoding='utf-8') as handle:
+        json.dump(
+            data,
+            handle,
+            indent=2,
+            ensure_ascii=False,
+        )
+        handle.write('\n')
+
+    return count_records(data)
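
A short round-trip sketch for these helpers: ``write`` creates missing parent directories and appends a trailing newline, while ``read`` rejects any root that is not an object or an array of objects. Paths and sample rows are illustrative, and the return value of 2 assumes ``count_records`` counts the items of a list root:

    from pathlib import Path

    from etlplus.file import json as json_file

    rows = [{'id': 1, 'name': 'ada'}, {'id': 2, 'name': 'grace'}]
    written = json_file.write(Path('out/rows.json'), rows)  # 2, per count_records
    assert json_file.read(Path('out/rows.json')) == rows

    # A scalar root fails validation.
    Path('out/scalar.json').write_text('42\n', encoding='utf-8')
    json_file.read(Path('out/scalar.json'))  # raises TypeError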
etlplus/file/xml.py ADDED
@@ -0,0 +1,165 @@
+"""
+:mod:`etlplus.file.xml` module.
+
+XML read/write helpers.
+"""
+
+from __future__ import annotations
+
+import xml.etree.ElementTree as ET
+from pathlib import Path
+from typing import Any
+
+from ..types import JSONData
+from ..types import JSONDict
+from ..utils import count_records
+
+# SECTION: CONSTANTS ======================================================== #
+
+
+DEFAULT_XML_ROOT = 'root'
+
+
+# SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+def _dict_to_element(
+    name: str,
+    payload: Any,
+) -> ET.Element:
+    """
+    Convert a dictionary-like payload into an XML element.
+
+    Parameters
+    ----------
+    name : str
+        Name of the XML element.
+    payload : Any
+        The data to include in the XML element.
+
+    Returns
+    -------
+    ET.Element
+        The constructed XML element.
+    """
+    element = ET.Element(name)
+
+    if isinstance(payload, dict):
+        text = payload.get('text')
+        if text is not None:
+            element.text = str(text)
+
+        for key, value in payload.items():
+            if key == 'text':
+                continue
+            if key.startswith('@'):
+                element.set(key[1:], str(value))
+                continue
+            if isinstance(value, list):
+                for item in value:
+                    element.append(_dict_to_element(key, item))
+            else:
+                element.append(_dict_to_element(key, value))
+    elif isinstance(payload, list):
+        for item in payload:
+            element.append(_dict_to_element('item', item))
+    elif payload is not None:
+        element.text = str(payload)
+
+    return element
+
+
+def _element_to_dict(
+    element: ET.Element,
+) -> JSONDict:
+    """
+    Convert an XML element into a nested dictionary.
+
+    Parameters
+    ----------
+    element : ET.Element
+        XML element to convert.
+
+    Returns
+    -------
+    JSONDict
+        Nested dictionary representation of the XML element.
+    """
+    result: JSONDict = {}
+    text = (element.text or '').strip()
+    if text:
+        result['text'] = text
+
+    for child in element:
+        child_data = _element_to_dict(child)
+        tag = child.tag
+        if tag in result:
+            existing = result[tag]
+            if isinstance(existing, list):
+                existing.append(child_data)
+            else:
+                result[tag] = [existing, child_data]
+        else:
+            result[tag] = child_data
+
+    for key, value in element.attrib.items():
+        if key in result:
+            result[f'@{key}'] = value
+        else:
+            result[key] = value
+    return result
+
+
+# SECTION: FUNCTIONS ======================================================== #
+
+
+def read(
+    path: Path,
+) -> JSONDict:
+    """
+    Parse the XML document at ``path`` into a nested dictionary.
+
+    Parameters
+    ----------
+    path : Path
+        Path to the XML file on disk.
+
+    Returns
+    -------
+    JSONDict
+        Nested dictionary representation of the XML file.
+    """
+    tree = ET.parse(path)
+    root = tree.getroot()
+
+    return {root.tag: _element_to_dict(root)}
+
+
+def write(path: Path, data: JSONData, *, root_tag: str) -> int:
+    """
+    Write ``data`` as XML to ``path`` and return the record count.
+
+    Parameters
+    ----------
+    path : Path
+        Path to the XML file on disk.
+    data : JSONData
+        Data to write as XML.
+    root_tag : str
+        Root tag name to use when writing XML files.
+
+    Returns
+    -------
+    int
+        The number of records written to the XML file.
+    """
+    if isinstance(data, dict) and len(data) == 1:
+        root_name, payload = next(iter(data.items()))
+        root_element = _dict_to_element(str(root_name), payload)
+    else:
+        root_element = _dict_to_element(root_tag, data)
+
+    tree = ET.ElementTree(root_element)
+    tree.write(path, encoding='utf-8', xml_declaration=True)
+
+    return count_records(data)
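
The writer's payload conventions follow ``_dict_to_element``: a ``'text'`` key becomes element text, keys prefixed with ``@`` become attributes, and list values repeat the child tag. Note that ``read`` is not a strict inverse, since ``_element_to_dict`` stores attributes without the ``@`` prefix unless the name collides with a child tag. A sketch under those conventions (path and sample data are illustrative):

    from pathlib import Path

    from etlplus.file import xml as xml_file

    data = {
        'catalog': {
            'book': [
                {'@id': 'b1', 'title': 'Dune'},
                {'@id': 'b2', 'title': 'Hyperion'},
            ],
        },
    }
    # Single-key dict: 'catalog' becomes the root and root_tag is ignored.
    xml_file.write(Path('catalog.xml'), data, root_tag='root')
    # Output (wrapped for readability):
    # <?xml version='1.0' encoding='utf-8'?>
    # <catalog><book id="b1"><title>Dune</title></book>
    # <book id="b2"><title>Hyperion</title></book></catalog>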
etlplus/file/yaml.py ADDED
@@ -0,0 +1,125 @@
+"""
+:mod:`etlplus.file.yaml` module.
+
+Optional YAML read/write helpers.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+from typing import cast
+
+from ..types import JSONData
+from ..types import JSONDict
+from ..types import JSONList
+from ..utils import count_records
+
+# SECTION: INTERNAL CONSTANTS =============================================== #
+
+
+# Optional YAML support, lazy-loaded to avoid a hard dependency.
+# The module is cached in a dict to avoid ``global`` statements.
+_YAML_CACHE: dict[str, Any] = {}
+
+
+# SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+def _get_yaml() -> Any:
+    """
+    Return the PyYAML module, importing it on first use.
+
+    Raises an informative ImportError if the optional dependency is missing.
+    """
+    mod = _YAML_CACHE.get('mod')
+    if mod is not None:  # pragma: no cover - tiny branch
+        return mod
+    try:
+        _yaml_mod = __import__('yaml')  # type: ignore[assignment]
+    except ImportError as e:  # pragma: no cover
+        raise ImportError(
+            'YAML support requires optional dependency "PyYAML".\n'
+            'Install with: pip install PyYAML',
+        ) from e
+    _YAML_CACHE['mod'] = _yaml_mod
+
+    return _yaml_mod
+
+
+def _require_yaml() -> None:
+    """Ensure PyYAML is available or raise an informative error."""
+    _get_yaml()
+
+
+# SECTION: FUNCTIONS ======================================================== #
+
+
+def read(
+    path: Path,
+) -> JSONData:
+    """
+    Load and validate YAML payloads from ``path``.
+
+    Parameters
+    ----------
+    path : Path
+        Path to the YAML file on disk.
+
+    Returns
+    -------
+    JSONData
+        The structured data read from the YAML file.
+
+    Raises
+    ------
+    TypeError
+        If the YAML root is not an object or an array of objects.
+    """
+    _require_yaml()
+
+    with path.open('r', encoding='utf-8') as handle:
+        loaded = _get_yaml().safe_load(handle)
+
+    if isinstance(loaded, dict):
+        return cast(JSONDict, loaded)
+    if isinstance(loaded, list):
+        if all(isinstance(item, dict) for item in loaded):
+            return cast(JSONList, loaded)
+        raise TypeError(
+            'YAML array must contain only objects (dicts) when loading',
+        )
+    raise TypeError(
+        'YAML root must be an object or an array of objects when loading',
+    )
+
+
+def write(
+    path: Path,
+    data: JSONData,
+) -> int:
+    """
+    Write ``data`` as YAML to ``path`` and return the record count.
+
+    Parameters
+    ----------
+    path : Path
+        Path to the YAML file on disk.
+    data : JSONData
+        Data to write as YAML.
+
+    Returns
+    -------
+    int
+        The number of records written.
+    """
+    _require_yaml()
+    with path.open('w', encoding='utf-8') as handle:
+        _get_yaml().safe_dump(
+            data,
+            handle,
+            sort_keys=False,
+            allow_unicode=True,
+            default_flow_style=False,
+        )
+    return count_records(data)
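
Because PyYAML is imported only when a helper is first called, importing ``etlplus.file.yaml`` itself never requires the dependency. A usage sketch, assuming PyYAML is installed and that ``count_records`` behaves as in the JSON helpers (path and data are illustrative):

    from pathlib import Path

    from etlplus.file import yaml as yaml_file

    job = {'name': 'nightly-etl', 'steps': [{'op': 'extract'}, {'op': 'load'}]}
    yaml_file.write(Path('job.yaml'), job)  # safe_dump, insertion order kept
    assert yaml_file.read(Path('job.yaml')) == job
    # Without PyYAML, either call raises ImportError with install instructions.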
etlplus/load.py CHANGED
@@ -15,12 +15,9 @@ from typing import cast
 import requests  # type: ignore[import]

 from .enums import DataConnectorType
-from .enums import FileFormat
 from .enums import HttpMethod
-from .enums import coerce_data_connector_type
-from .enums import coerce_file_format
-from .enums import coerce_http_method
 from .file import File
+from .file import FileFormat
 from .types import JSONData
 from .types import JSONDict
 from .types import JSONList
@@ -101,7 +98,7 @@ def load_data(
         return cast(JSONData, source)

     if isinstance(source, Path):
-        return File(source, FileFormat.JSON).read_json()
+        return File(source, FileFormat.JSON).read()

     if isinstance(source, str):
         # Special case: '-' means read JSON from STDIN (Unix convention).
@@ -111,7 +108,7 @@ def load_data(
         candidate = Path(source)
         if candidate.exists():
             try:
-                return File(candidate, FileFormat.JSON).read_json()
+                return File(candidate, FileFormat.JSON).read()
             except (OSError, json.JSONDecodeError, ValueError):
                 # Fall back to treating the string as raw JSON content.
                 pass
@@ -155,9 +152,9 @@ def load_to_file(
     if file_format is None:
         records = File(path).write(data)
         ext = path.suffix.lstrip('.').lower()
-        fmt = coerce_file_format(ext) if ext else FileFormat.JSON
+        fmt = FileFormat.coerce(ext) if ext else FileFormat.JSON
     else:
-        fmt = coerce_file_format(file_format)
+        fmt = FileFormat.coerce(file_format)
         records = File(path, fmt).write(data)
     if fmt is FileFormat.CSV and records == 0:
         message = 'No data to write'
@@ -242,7 +239,7 @@ def load_to_api(
     TypeError
         If the session object is not valid.
     """
-    http_method = coerce_http_method(method)
+    http_method = HttpMethod.coerce(method)

     # Apply a conservative timeout to guard against hanging requests.
     timeout = kwargs.pop('timeout', 10.0)
@@ -316,7 +313,7 @@ def load(
     """
     data = load_data(source)

-    match coerce_data_connector_type(target_type):
+    match DataConnectorType.coerce(target_type):
         case DataConnectorType.FILE:
             # Prefer explicit format if provided, else infer from filename.
             return load_to_file(data, target, file_format)
@@ -331,6 +328,6 @@ def load(
                 **kwargs,
             )
         case _:
-            # `coerce_data_connector_type` covers invalid entries, but keep
-            # explicit guard.
+            # :meth:`coerce` already raises for invalid connector types, but
+            # keep explicit guard for defensive programming.
             raise ValueError(f'Invalid target type: {target_type}')
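
For callers, the visible change is that the free ``coerce_*`` functions from ``etlplus.enums`` are gone in favor of classmethods on the coercible enums, and ``File.read_json()`` collapses into the format-aware ``File.read()``. A migration sketch, assuming the raise-vs-``None`` split between ``coerce`` and ``try_coerce`` shown in ``enums.py`` above:

    # 0.9.0
    # from etlplus.enums import coerce_file_format
    # fmt = coerce_file_format('csv')

    # 0.11.5
    from etlplus.file import FileFormat

    fmt = FileFormat.coerce('csv')         # FileFormat.CSV; raises if unknown
    maybe = FileFormat.try_coerce('nope')  # None instead of raising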