etlplus 0.10.5__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
etlplus/cli/commands.py CHANGED
@@ -36,7 +36,7 @@ from typing import cast
36
36
  import typer
37
37
 
38
38
  from .. import __version__
39
- from ..enums import FileFormat
39
+ from ..file import FileFormat
40
40
  from . import handlers
41
41
  from .constants import CLI_DESCRIPTION
42
42
  from .constants import CLI_EPILOG
etlplus/cli/constants.py CHANGED
@@ -9,7 +9,7 @@ from __future__ import annotations
9
9
  from typing import Final
10
10
 
11
11
  from ..enums import DataConnectorType
12
- from ..enums import FileFormat
12
+ from ..file import FileFormat
13
13
 
14
14
  # SECTION: EXPORTS ========================================================== #
15
15
 
etlplus/cli/io.py CHANGED
@@ -15,8 +15,8 @@ from pathlib import Path
15
15
  from typing import Any
16
16
  from typing import cast
17
17
 
18
- from ..enums import FileFormat
19
18
  from ..file import File
19
+ from ..file import FileFormat
20
20
  from ..types import JSONData
21
21
  from ..utils import print_json
22
22
 
@@ -331,6 +331,6 @@ def write_json_output(
331
331
  """
332
332
  if not output_path or output_path == '-':
333
333
  return False
334
- File(Path(output_path), FileFormat.JSON).write_json(data)
334
+ File(Path(output_path), FileFormat.JSON).write(data)
335
335
  print(f'{success_message} {output_path}')
336
336
  return True
[NOTE(review): file header missing here — this hunk and the `PipelineConfig` hunk below belong to a different module than etlplus/cli/io.py (they import `..api.ApiConfig`); likely a config/pipeline module — confirm against the package.]
@@ -24,8 +24,8 @@ from typing import Any
24
24
  from typing import Self
25
25
 
26
26
  from ..api import ApiConfig
27
- from ..enums import FileFormat
28
27
  from ..file import File
28
+ from ..file import FileFormat
29
29
  from ..types import StrAnyMap
30
30
  from ..utils import coerce_dict
31
31
  from ..utils import maybe_mapping
@@ -246,7 +246,7 @@ class PipelineConfig:
246
246
  TypeError
247
247
  If the YAML root is not a mapping/object.
248
248
  """
249
- raw = File(Path(path), FileFormat.YAML).read_yaml()
249
+ raw = File(Path(path), FileFormat.YAML).read()
250
250
  if not isinstance(raw, dict):
251
251
  raise TypeError('Pipeline YAML must have a mapping/object root')
252
252
 
etlplus/enums.py CHANGED
@@ -8,7 +8,6 @@ from __future__ import annotations
8
8
 
9
9
  import enum
10
10
  import operator as _op
11
- from pathlib import PurePath
12
11
  from statistics import fmean
13
12
  from typing import Self
14
13
 
@@ -23,18 +22,13 @@ __all__ = [
23
22
  # Enums
24
23
  'AggregateName',
25
24
  'CoercibleStrEnum',
26
- 'CompressionFormat',
27
25
  'DataConnectorType',
28
- 'FileFormat',
29
26
  'HttpMethod',
30
27
  'OperatorName',
31
28
  'PipelineStep',
32
29
  # Functions
33
- 'coerce_compression_format',
34
30
  'coerce_data_connector_type',
35
- 'coerce_file_format',
36
31
  'coerce_http_method',
37
- 'infer_file_format_and_compression',
38
32
  ]
39
33
 
40
34
 
@@ -178,39 +172,6 @@ class AggregateName(CoercibleStrEnum):
178
172
  return lambda xs, n: (fmean(xs) if xs else 0.0)
179
173
 
180
174
 
181
- class CompressionFormat(CoercibleStrEnum):
182
- """Supported compression formats for data files."""
183
-
184
- # -- Constants -- #
185
-
186
- GZ = 'gz'
187
- ZIP = 'zip'
188
-
189
- # -- Class Methods -- #
190
-
191
- @classmethod
192
- def aliases(cls) -> StrStrMap:
193
- """
194
- Return a mapping of common aliases for each enum member.
195
-
196
- Returns
197
- -------
198
- StrStrMap
199
- A mapping of alias names to their corresponding enum member names.
200
- """
201
- return {
202
- # File extensions
203
- '.gz': 'gz',
204
- '.gzip': 'gz',
205
- '.zip': 'zip',
206
- # MIME types
207
- 'application/gzip': 'gz',
208
- 'application/x-gzip': 'gz',
209
- 'application/zip': 'zip',
210
- 'application/x-zip-compressed': 'zip',
211
- }
212
-
213
-
214
175
  class DataConnectorType(CoercibleStrEnum):
215
176
  """Supported data connector types."""
216
177
 
@@ -242,99 +203,6 @@ class DataConnectorType(CoercibleStrEnum):
242
203
  }
243
204
 
244
205
 
245
- class FileFormat(CoercibleStrEnum):
246
- """Supported file formats for extraction."""
247
-
248
- # -- Constants -- #
249
-
250
- AVRO = 'avro'
251
- CSV = 'csv'
252
- FEATHER = 'feather'
253
- GZ = 'gz'
254
- JSON = 'json'
255
- NDJSON = 'ndjson'
256
- ORC = 'orc'
257
- PARQUET = 'parquet'
258
- TSV = 'tsv'
259
- TXT = 'txt'
260
- XLS = 'xls'
261
- XLSX = 'xlsx'
262
- ZIP = 'zip'
263
- XML = 'xml'
264
- YAML = 'yaml'
265
-
266
- # -- Class Methods -- #
267
-
268
- @classmethod
269
- def aliases(cls) -> StrStrMap:
270
- """
271
- Return a mapping of common aliases for each enum member.
272
-
273
- Returns
274
- -------
275
- StrStrMap
276
- A mapping of alias names to their corresponding enum member names.
277
- """
278
- return {
279
- # Common shorthand
280
- 'parq': 'parquet',
281
- 'yml': 'yaml',
282
- # File extensions
283
- '.avro': 'avro',
284
- '.csv': 'csv',
285
- '.feather': 'feather',
286
- '.gz': 'gz',
287
- '.json': 'json',
288
- '.jsonl': 'ndjson',
289
- '.ndjson': 'ndjson',
290
- '.orc': 'orc',
291
- '.parquet': 'parquet',
292
- '.pq': 'parquet',
293
- '.tsv': 'tsv',
294
- '.txt': 'txt',
295
- '.xls': 'xls',
296
- '.xlsx': 'xlsx',
297
- '.zip': 'zip',
298
- '.xml': 'xml',
299
- '.yaml': 'yaml',
300
- '.yml': 'yaml',
301
- # MIME types
302
- 'application/avro': 'avro',
303
- 'application/csv': 'csv',
304
- 'application/feather': 'feather',
305
- 'application/gzip': 'gz',
306
- 'application/json': 'json',
307
- 'application/jsonlines': 'ndjson',
308
- 'application/ndjson': 'ndjson',
309
- 'application/orc': 'orc',
310
- 'application/parquet': 'parquet',
311
- 'application/vnd.apache.avro': 'avro',
312
- 'application/vnd.apache.parquet': 'parquet',
313
- 'application/vnd.apache.arrow.file': 'feather',
314
- 'application/vnd.apache.orc': 'orc',
315
- 'application/vnd.ms-excel': 'xls',
316
- (
317
- 'application/vnd.openxmlformats-'
318
- 'officedocument.spreadsheetml.sheet'
319
- ): 'xlsx',
320
- 'application/x-avro': 'avro',
321
- 'application/x-csv': 'csv',
322
- 'application/x-feather': 'feather',
323
- 'application/x-orc': 'orc',
324
- 'application/x-ndjson': 'ndjson',
325
- 'application/x-parquet': 'parquet',
326
- 'application/x-yaml': 'yaml',
327
- 'application/xml': 'xml',
328
- 'application/zip': 'zip',
329
- 'text/csv': 'csv',
330
- 'text/plain': 'txt',
331
- 'text/tab-separated-values': 'tsv',
332
- 'text/tsv': 'tsv',
333
- 'text/xml': 'xml',
334
- 'text/yaml': 'yaml',
335
- }
336
-
337
-
338
206
  class HttpMethod(CoercibleStrEnum):
339
207
  """Supported HTTP verbs that accept JSON payloads."""
340
208
 
@@ -360,8 +228,8 @@ class HttpMethod(CoercibleStrEnum):
360
228
  Notes
361
229
  -----
362
230
  - RFCs do not strictly forbid bodies on some other methods (e.g.,
363
- ``DELETE``), but many servers/clients do not expect them. We mark
364
- ``POST``, ``PUT``, and ``PATCH`` as True.
231
+ ``DELETE``), but many servers/clients do not expect them. We mark
232
+ ``POST``, ``PUT``, and ``PATCH`` as True.
365
233
  """
366
234
  return self in {HttpMethod.POST, HttpMethod.PUT, HttpMethod.PATCH}
367
235
 
@@ -465,13 +333,6 @@ class PipelineStep(CoercibleStrEnum):
465
333
  # SECTION: INTERNAL CONSTANTS ============================================== #
466
334
 
467
335
 
468
- # Compression formats that are also file formats.
469
- _COMPRESSION_FILE_FORMATS: set[FileFormat] = {
470
- FileFormat.GZ,
471
- FileFormat.ZIP,
472
- }
473
-
474
-
475
336
  # Precomputed order index for PipelineStep; avoids recomputing on each access.
476
337
  _PIPELINE_ORDER_INDEX: dict[PipelineStep, int] = {
477
338
  PipelineStep.FILTER: 0,
@@ -497,30 +358,6 @@ def coerce_data_connector_type(
497
358
  return DataConnectorType.coerce(connector)
498
359
 
499
360
 
500
- def coerce_file_format(
501
- file_format: FileFormat | str,
502
- ) -> FileFormat:
503
- """
504
- Normalize textual file format values to :class:`FileFormat`.
505
-
506
- This thin wrapper is kept for backward compatibility; prefer
507
- :meth:`FileFormat.coerce` going forward.
508
- """
509
- return FileFormat.coerce(file_format)
510
-
511
-
512
- def coerce_compression_format(
513
- compression_format: CompressionFormat | str,
514
- ) -> CompressionFormat:
515
- """
516
- Normalize textual compression format values to :class:`CompressionFormat`.
517
-
518
- This thin wrapper is kept for backward compatibility; prefer
519
- :meth:`CompressionFormat.coerce` going forward.
520
- """
521
- return CompressionFormat.coerce(compression_format)
522
-
523
-
524
361
  def coerce_http_method(
525
362
  http_method: HttpMethod | str,
526
363
  ) -> HttpMethod:
@@ -531,78 +368,3 @@ def coerce_http_method(
531
368
  :meth:`HttpMethod.coerce` going forward.
532
369
  """
533
370
  return HttpMethod.coerce(http_method)
534
-
535
-
536
- def infer_file_format_and_compression(
537
- value: object,
538
- filename: object | None = None,
539
- ) -> tuple[FileFormat | None, CompressionFormat | None]:
540
- """
541
- Infer data format and compression from a filename, extension, or MIME type.
542
-
543
- Parameters
544
- ----------
545
- value : object
546
- A filename, extension, MIME type, or existing enum member.
547
- filename : object | None, optional
548
- A filename to consult for extension-based inference (e.g. when
549
- ``value`` is ``application/octet-stream``).
550
-
551
- Returns
552
- -------
553
- tuple[FileFormat | None, CompressionFormat | None]
554
- The inferred data format and compression, if any.
555
- """
556
- if isinstance(value, FileFormat):
557
- if value in _COMPRESSION_FILE_FORMATS:
558
- return None, CompressionFormat.coerce(value.value)
559
- return value, None
560
- if isinstance(value, CompressionFormat):
561
- return None, value
562
-
563
- text = str(value).strip()
564
- if not text:
565
- return None, None
566
-
567
- normalized = text.casefold()
568
- mime = normalized.split(';', 1)[0].strip()
569
-
570
- is_octet_stream = mime == 'application/octet-stream'
571
- compression = CompressionFormat.try_coerce(mime)
572
- fmt = None if is_octet_stream else FileFormat.try_coerce(mime)
573
-
574
- is_mime = mime.startswith(
575
- (
576
- 'application/',
577
- 'text/',
578
- 'audio/',
579
- 'image/',
580
- 'video/',
581
- 'multipart/',
582
- ),
583
- )
584
- suffix_source: object | None = filename if filename is not None else text
585
- if is_mime and filename is None:
586
- suffix_source = None
587
-
588
- suffixes = (
589
- PurePath(str(suffix_source)).suffixes
590
- if suffix_source is not None
591
- else []
592
- )
593
- if suffixes:
594
- normalized_suffixes = [suffix.casefold() for suffix in suffixes]
595
- compression = (
596
- CompressionFormat.try_coerce(normalized_suffixes[-1])
597
- or compression
598
- )
599
- if compression is not None:
600
- normalized_suffixes = normalized_suffixes[:-1]
601
- if normalized_suffixes:
602
- fmt = FileFormat.try_coerce(normalized_suffixes[-1]) or fmt
603
-
604
- if fmt in _COMPRESSION_FILE_FORMATS:
605
- compression = compression or CompressionFormat.coerce(fmt.value)
606
- fmt = None
607
-
608
- return fmt, compression
etlplus/extract.py CHANGED
@@ -13,11 +13,11 @@ from typing import cast
13
13
  import requests # type: ignore[import]
14
14
 
15
15
  from .enums import DataConnectorType
16
- from .enums import FileFormat
17
16
  from .enums import HttpMethod
18
17
  from .enums import coerce_data_connector_type
19
- from .enums import coerce_file_format
20
18
  from .file import File
19
+ from .file import FileFormat
20
+ from .file import coerce_file_format
21
21
  from .types import JSONData
22
22
  from .types import JSONDict
23
23
  from .types import JSONList
etlplus/file/__init__.py ADDED
@@ -0,0 +1,27 @@
1
+ """
2
+ :mod:`etlplus.file` package.
3
+
4
+ Public file IO helpers.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from .core import File
10
+ from .enums import CompressionFormat
11
+ from .enums import FileFormat
12
+ from .enums import coerce_file_format
13
+ from .enums import infer_file_format_and_compression
14
+
15
+ # SECTION: EXPORTS ========================================================== #
16
+
17
+
18
+ __all__ = [
19
+ # Class
20
+ 'File',
21
+ # Enums
22
+ 'CompressionFormat',
23
+ 'FileFormat',
24
+ # Functions
25
+ 'coerce_file_format',
26
+ 'infer_file_format_and_compression',
27
+ ]
etlplus/file/core.py ADDED
@@ -0,0 +1,287 @@
1
+ """
2
+ :mod:`etlplus.file.core` module.
3
+
4
+ Shared helpers for reading and writing structured and semi-structured data
5
+ files.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+ from pathlib import Path
12
+
13
+ from ..types import JSONData
14
+ from ..types import StrPath
15
+ from . import csv
16
+ from . import json
17
+ from . import xml
18
+ from . import yaml
19
+ from .enums import FileFormat
20
+ from .enums import infer_file_format_and_compression
21
+
22
+ # SECTION: EXPORTS ========================================================== #
23
+
24
+
25
+ __all__ = ['File']
26
+
27
+
28
+ # SECTION: CLASSES ========================================================== #
29
+
30
+
31
+ @dataclass(slots=True)
32
+ class File:
33
+ """
34
+ Convenience wrapper around structured file IO.
35
+
36
+ This class encapsulates the one-off helpers in this module as convenient
37
+ instance methods while retaining the original function API for
38
+ backward compatibility (those functions delegate to this class).
39
+
40
+ Attributes
41
+ ----------
42
+ path : Path
43
+ Path to the file on disk.
44
+ file_format : FileFormat | None, optional
45
+ Explicit format. If omitted, the format is inferred from the file
46
+ extension (``.csv``, ``.json``, or ``.xml``).
47
+ """
48
+
49
+ # -- Attributes -- #
50
+
51
+ path: Path
52
+ file_format: FileFormat | None = None
53
+
54
+ # -- Magic Methods (Object Lifecycle) -- #
55
+
56
+ def __post_init__(self) -> None:
57
+ """
58
+ Auto-detect and set the file format on initialization.
59
+
60
+ If no explicit ``file_format`` is provided, attempt to infer it from
61
+ the file path's extension and update :attr:`file_format`. If the
62
+ extension is unknown, the attribute is left as ``None`` and will be
63
+ validated later by :meth:`_ensure_format`.
64
+ """
65
+ # Normalize incoming path (allow str in constructor) to Path.
66
+ if isinstance(self.path, str):
67
+ self.path = Path(self.path)
68
+
69
+ if self.file_format is None:
70
+ try:
71
+ self.file_format = self._guess_format()
72
+ except ValueError:
73
+ # Leave as None; _ensure_format() will raise on use if needed.
74
+ pass
75
+
76
+ # -- Internal Instance Methods -- #
77
+
78
+ def _assert_exists(self) -> None:
79
+ """
80
+ Raise FileNotFoundError if :attr:`path` does not exist.
81
+
82
+ This centralizes existence checks across multiple read methods.
83
+ """
84
+ if not self.path.exists():
85
+ raise FileNotFoundError(f'File not found: {self.path}')
86
+
87
+ def _ensure_format(self) -> FileFormat:
88
+ """
89
+ Resolve the active format, guessing from extension if needed.
90
+
91
+ Returns
92
+ -------
93
+ FileFormat
94
+ The resolved file format.
95
+ """
96
+ return (
97
+ self.file_format
98
+ if self.file_format is not None
99
+ else self._guess_format()
100
+ )
101
+
102
+ def _guess_format(self) -> FileFormat:
103
+ """
104
+ Infer the file format from the filename extension.
105
+
106
+ Returns
107
+ -------
108
+ FileFormat
109
+ The inferred file format based on the file extension.
110
+
111
+ Raises
112
+ ------
113
+ ValueError
114
+ If the extension is unknown or unsupported.
115
+ """
116
+ fmt, compression = infer_file_format_and_compression(self.path)
117
+ if fmt is not None:
118
+ return fmt
119
+ if compression is not None:
120
+ raise ValueError(
121
+ 'Cannot infer file format from compressed file '
122
+ f'{self.path!r} with compression {compression.value!r}',
123
+ )
124
+ raise ValueError(
125
+ f'Cannot infer file format from extension {self.path.suffix!r}',
126
+ )
127
+
128
+ # -- Instance Methods (Generic API) -- #
129
+
130
+ def read(self) -> JSONData:
131
+ """
132
+ Read structured data from :attr:`path` using :attr:`file_format`.
133
+
134
+ Returns
135
+ -------
136
+ JSONData
137
+ The structured data read from the file.
138
+
139
+ Raises
140
+ ------
141
+ ValueError
142
+ If the resolved file format is unsupported.
143
+ """
144
+ self._assert_exists()
145
+ fmt = self._ensure_format()
146
+ match fmt:
147
+ case FileFormat.CSV:
148
+ return csv.read(self.path)
149
+ case FileFormat.JSON:
150
+ return json.read(self.path)
151
+ case FileFormat.XML:
152
+ return xml.read(self.path)
153
+ case FileFormat.YAML:
154
+ return yaml.read(self.path)
155
+ raise ValueError(f'Unsupported format: {fmt}')
156
+
157
+ def write(
158
+ self,
159
+ data: JSONData,
160
+ *,
161
+ root_tag: str = xml.DEFAULT_XML_ROOT,
162
+ ) -> int:
163
+ """
164
+ Write ``data`` to :attr:`path` using :attr:`file_format`.
165
+
166
+ Parameters
167
+ ----------
168
+ data : JSONData
169
+ Data to write to the file.
170
+ root_tag : str, optional
171
+ Root tag name to use when writing XML files. Defaults to
172
+ ``'root'``.
173
+
174
+ Returns
175
+ -------
176
+ int
177
+ The number of records written.
178
+
179
+ Raises
180
+ ------
181
+ ValueError
182
+ If the resolved file format is unsupported.
183
+ """
184
+ fmt = self._ensure_format()
185
+ match fmt:
186
+ case FileFormat.CSV:
187
+ return csv.write(self.path, data)
188
+ case FileFormat.JSON:
189
+ return json.write(self.path, data)
190
+ case FileFormat.XML:
191
+ return xml.write(self.path, data, root_tag=root_tag)
192
+ case FileFormat.YAML:
193
+ return yaml.write(self.path, data)
194
+ raise ValueError(f'Unsupported format: {fmt}')
195
+
196
+ # -- Class Methods -- #
197
+
198
+ @classmethod
199
+ def from_path(
200
+ cls,
201
+ path: StrPath,
202
+ *,
203
+ file_format: FileFormat | str | None = None,
204
+ ) -> File:
205
+ """
206
+ Create a :class:`File` from any path-like and optional format.
207
+
208
+ Parameters
209
+ ----------
210
+ path : StrPath
211
+ Path to the file on disk.
212
+ file_format : FileFormat | str | None, optional
213
+ Explicit format. If omitted, the format is inferred from the file
214
+ extension (``.csv``, ``.json``, or ``.xml``).
215
+
216
+ Returns
217
+ -------
218
+ File
219
+ The constructed :class:`File` instance.
220
+ """
221
+ resolved = Path(path)
222
+ ff: FileFormat | None
223
+ if isinstance(file_format, str):
224
+ ff = FileFormat.coerce(file_format)
225
+ else:
226
+ ff = file_format
227
+
228
+ return cls(resolved, ff)
229
+
230
+ @classmethod
231
+ def read_file(
232
+ cls,
233
+ path: StrPath,
234
+ file_format: FileFormat | str | None = None,
235
+ ) -> JSONData:
236
+ """
237
+ Read structured data.
238
+
239
+ Parameters
240
+ ----------
241
+ path : StrPath
242
+ Path to the file on disk.
243
+ file_format : FileFormat | str | None, optional
244
+ Explicit format. If omitted, the format is inferred from the file
245
+ extension (``.csv``, ``.json``, or ``.xml``).
246
+
247
+ Returns
248
+ -------
249
+ JSONData
250
+ The structured data read from the file.
251
+ """
252
+ return cls.from_path(path, file_format=file_format).read()
253
+
254
+ @classmethod
255
+ def write_file(
256
+ cls,
257
+ path: StrPath,
258
+ data: JSONData,
259
+ file_format: FileFormat | str | None = None,
260
+ *,
261
+ root_tag: str = xml.DEFAULT_XML_ROOT,
262
+ ) -> int:
263
+ """
264
+ Write structured data and count written records.
265
+
266
+ Parameters
267
+ ----------
268
+ path : StrPath
269
+ Path to the file on disk.
270
+ data : JSONData
271
+ Data to write to the file.
272
+ file_format : FileFormat | str | None, optional
273
+ Explicit format. If omitted, the format is inferred from the file
274
+ extension (``.csv``, ``.json``, or ``.xml``).
275
+ root_tag : str, optional
276
+ Root tag name to use when writing XML files. Defaults to
277
+ ``'root'``.
278
+
279
+ Returns
280
+ -------
281
+ int
282
+ The number of records written to the file.
283
+ """
284
+ return cls.from_path(path, file_format=file_format).write(
285
+ data,
286
+ root_tag=root_tag,
287
+ )