etlplus 0.11.5__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
etlplus/file/core.py CHANGED
@@ -11,11 +11,21 @@ from dataclasses import dataclass
11
11
  from pathlib import Path
12
12
 
13
13
  from ..types import JSONData
14
- from ..types import StrPath
14
+ from . import avro
15
15
  from . import csv
16
+ from . import feather
17
+ from . import gz
16
18
  from . import json
19
+ from . import ndjson
20
+ from . import orc
21
+ from . import parquet
22
+ from . import tsv
23
+ from . import txt
24
+ from . import xls
25
+ from . import xlsx
17
26
  from . import xml
18
27
  from . import yaml
28
+ from . import zip
19
29
  from .enums import FileFormat
20
30
  from .enums import infer_file_format_and_compression
21
31
 
@@ -43,7 +53,15 @@ class File:
43
53
  Path to the file on disk.
44
54
  file_format : FileFormat | None, optional
45
55
  Explicit format. If omitted, the format is inferred from the file
46
- extension (``.csv``, ``.json``, or ``.xml``).
56
+ extension (``.csv``, ``.json``, etc.).
57
+
58
+ Parameters
59
+ ----------
60
+ path : StrPath
61
+ Path to the file on disk.
62
+ file_format : FileFormat | str | None, optional
63
+ Explicit format. If omitted, the format is inferred from the file
64
+ extension (``.csv``, ``.json``, etc.).
47
65
  """
48
66
 
49
67
  # -- Attributes -- #
@@ -62,16 +80,10 @@ class File:
62
80
  extension is unknown, the attribute is left as ``None`` and will be
63
81
  validated later by :meth:`_ensure_format`.
64
82
  """
65
- # Normalize incoming path (allow str in constructor) to Path.
66
- if isinstance(self.path, str):
67
- self.path = Path(self.path)
68
-
83
+ self.path = Path(self.path)
84
+ self.file_format = self._coerce_format(self.file_format)
69
85
  if self.file_format is None:
70
- try:
71
- self.file_format = self._guess_format()
72
- except ValueError:
73
- # Leave as None; _ensure_format() will raise on use if needed.
74
- pass
86
+ self.file_format = self._maybe_guess_format()
75
87
 
76
88
  # -- Internal Instance Methods -- #
77
89
 
@@ -84,6 +96,28 @@ class File:
84
96
  if not self.path.exists():
85
97
  raise FileNotFoundError(f'File not found: {self.path}')
86
98
 
99
+ def _coerce_format(
100
+ self,
101
+ file_format: FileFormat | str | None,
102
+ ) -> FileFormat | None:
103
+ """
104
+ Normalize the file format input.
105
+
106
+ Parameters
107
+ ----------
108
+ file_format : FileFormat | str | None
109
+ File format specifier. Strings are coerced into
110
+ :class:`FileFormat`.
111
+
112
+ Returns
113
+ -------
114
+ FileFormat | None
115
+ A normalized file format, or ``None`` when unspecified.
116
+ """
117
+ if file_format is None or isinstance(file_format, FileFormat):
118
+ return file_format
119
+ return FileFormat.coerce(file_format)
120
+
87
121
  def _ensure_format(self) -> FileFormat:
88
122
  """
89
123
  Resolve the active format, guessing from extension if needed.
@@ -125,7 +159,22 @@ class File:
125
159
  f'Cannot infer file format from extension {self.path.suffix!r}',
126
160
  )
127
161
 
128
- # -- Instance Methods (Generic API) -- #
162
+ def _maybe_guess_format(self) -> FileFormat | None:
163
+ """
164
+ Try to infer the format, returning ``None`` if it cannot be inferred.
165
+
166
+ Returns
167
+ -------
168
+ FileFormat | None
169
+ The inferred format, or ``None`` if inference fails.
170
+ """
171
+ try:
172
+ return self._guess_format()
173
+ except ValueError:
174
+ # Leave as None; _ensure_format() will raise on use if needed.
175
+ return None
176
+
177
+ # -- Instance Methods -- #
129
178
 
130
179
  def read(self) -> JSONData:
131
180
  """
@@ -144,14 +193,36 @@ class File:
144
193
  self._assert_exists()
145
194
  fmt = self._ensure_format()
146
195
  match fmt:
196
+ case FileFormat.AVRO:
197
+ return avro.read(self.path)
147
198
  case FileFormat.CSV:
148
199
  return csv.read(self.path)
200
+ case FileFormat.FEATHER:
201
+ return feather.read(self.path)
202
+ case FileFormat.GZ:
203
+ return gz.read(self.path)
149
204
  case FileFormat.JSON:
150
205
  return json.read(self.path)
206
+ case FileFormat.NDJSON:
207
+ return ndjson.read(self.path)
208
+ case FileFormat.ORC:
209
+ return orc.read(self.path)
210
+ case FileFormat.PARQUET:
211
+ return parquet.read(self.path)
212
+ case FileFormat.TSV:
213
+ return tsv.read(self.path)
214
+ case FileFormat.TXT:
215
+ return txt.read(self.path)
216
+ case FileFormat.XLS:
217
+ return xls.read(self.path)
218
+ case FileFormat.XLSX:
219
+ return xlsx.read(self.path)
151
220
  case FileFormat.XML:
152
221
  return xml.read(self.path)
153
222
  case FileFormat.YAML:
154
223
  return yaml.read(self.path)
224
+ case FileFormat.ZIP:
225
+ return zip.read(self.path)
155
226
  raise ValueError(f'Unsupported format: {fmt}')
156
227
 
157
228
  def write(
@@ -183,105 +254,34 @@ class File:
183
254
  """
184
255
  fmt = self._ensure_format()
185
256
  match fmt:
257
+ case FileFormat.AVRO:
258
+ return avro.write(self.path, data)
186
259
  case FileFormat.CSV:
187
260
  return csv.write(self.path, data)
261
+ case FileFormat.FEATHER:
262
+ return feather.write(self.path, data)
263
+ case FileFormat.GZ:
264
+ return gz.write(self.path, data)
188
265
  case FileFormat.JSON:
189
266
  return json.write(self.path, data)
267
+ case FileFormat.NDJSON:
268
+ return ndjson.write(self.path, data)
269
+ case FileFormat.ORC:
270
+ return orc.write(self.path, data)
271
+ case FileFormat.PARQUET:
272
+ return parquet.write(self.path, data)
273
+ case FileFormat.TSV:
274
+ return tsv.write(self.path, data)
275
+ case FileFormat.TXT:
276
+ return txt.write(self.path, data)
277
+ case FileFormat.XLS:
278
+ return xls.write(self.path, data)
279
+ case FileFormat.XLSX:
280
+ return xlsx.write(self.path, data)
190
281
  case FileFormat.XML:
191
282
  return xml.write(self.path, data, root_tag=root_tag)
192
283
  case FileFormat.YAML:
193
284
  return yaml.write(self.path, data)
285
+ case FileFormat.ZIP:
286
+ return zip.write(self.path, data)
194
287
  raise ValueError(f'Unsupported format: {fmt}')
195
-
196
- # -- Class Methods -- #
197
-
198
- @classmethod
199
- def from_path(
200
- cls,
201
- path: StrPath,
202
- *,
203
- file_format: FileFormat | str | None = None,
204
- ) -> File:
205
- """
206
- Create a :class:`File` from any path-like and optional format.
207
-
208
- Parameters
209
- ----------
210
- path : StrPath
211
- Path to the file on disk.
212
- file_format : FileFormat | str | None, optional
213
- Explicit format. If omitted, the format is inferred from the file
214
- extension (``.csv``, ``.json``, or ``.xml``).
215
-
216
- Returns
217
- -------
218
- File
219
- The constructed :class:`File` instance.
220
- """
221
- resolved = Path(path)
222
- ff: FileFormat | None
223
- if isinstance(file_format, str):
224
- ff = FileFormat.coerce(file_format)
225
- else:
226
- ff = file_format
227
-
228
- return cls(resolved, ff)
229
-
230
- @classmethod
231
- def read_file(
232
- cls,
233
- path: StrPath,
234
- file_format: FileFormat | str | None = None,
235
- ) -> JSONData:
236
- """
237
- Read structured data.
238
-
239
- Parameters
240
- ----------
241
- path : StrPath
242
- Path to the file on disk.
243
- file_format : FileFormat | str | None, optional
244
- Explicit format. If omitted, the format is inferred from the file
245
- extension (``.csv``, ``.json``, or ``.xml``).
246
-
247
- Returns
248
- -------
249
- JSONData
250
- The structured data read from the file.
251
- """
252
- return cls.from_path(path, file_format=file_format).read()
253
-
254
- @classmethod
255
- def write_file(
256
- cls,
257
- path: StrPath,
258
- data: JSONData,
259
- file_format: FileFormat | str | None = None,
260
- *,
261
- root_tag: str = xml.DEFAULT_XML_ROOT,
262
- ) -> int:
263
- """
264
- Write structured data and count written records.
265
-
266
- Parameters
267
- ----------
268
- path : StrPath
269
- Path to the file on disk.
270
- data : JSONData
271
- Data to write to the file.
272
- file_format : FileFormat | str | None, optional
273
- Explicit format. If omitted, the format is inferred from the file
274
- extension (``.csv``, ``.json``, or ``.xml``).
275
- root_tag : str, optional
276
- Root tag name to use when writing XML files. Defaults to
277
- ``'root'``.
278
-
279
- Returns
280
- -------
281
- int
282
- The number of records written to the file.
283
- """
284
- return cls.from_path(path, file_format=file_format).write(
285
- data,
286
- root_tag=root_tag,
287
- )
etlplus/file/csv.py CHANGED
@@ -1,7 +1,7 @@
1
1
  """
2
2
  :mod:`etlplus.file.csv` module.
3
3
 
4
- CSV read/write helpers.
4
+ Helpers for reading/writing CSV files.
5
5
  """
6
6
 
7
7
  from __future__ import annotations
@@ -14,6 +14,15 @@ from ..types import JSONData
14
14
  from ..types import JSONDict
15
15
  from ..types import JSONList
16
16
 
17
+ # SECTION: EXPORTS ========================================================== #
18
+
19
+
20
+ __all__ = [
21
+ 'read',
22
+ 'write',
23
+ ]
24
+
25
+
17
26
  # SECTION: FUNCTIONS ======================================================== #
18
27
 
19
28
 
@@ -21,7 +30,7 @@ def read(
21
30
  path: Path,
22
31
  ) -> JSONList:
23
32
  """
24
- Load CSV content as a list of dictionaries.
33
+ Read CSV content from ``path``.
25
34
 
26
35
  Parameters
27
36
  ----------
@@ -48,7 +57,7 @@ def write(
48
57
  data: JSONData,
49
58
  ) -> int:
50
59
  """
51
- Write CSV rows to ``path`` and return the number of rows.
60
+ Write ``data`` to CSV at ``path`` and return record count.
52
61
 
53
62
  Parameters
54
63
  ----------
etlplus/file/feather.py ADDED
@@ -0,0 +1,144 @@
1
+ """
2
+ :mod:`etlplus.file.feather` module.
3
+
4
+ Helpers for reading/writing Feather files.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+ from typing import Any
11
+ from typing import cast
12
+
13
+ from ..types import JSONData
14
+ from ..types import JSONDict
15
+ from ..types import JSONList
16
+
17
+ # SECTION: EXPORTS ========================================================== #
18
+
19
+
20
+ __all__ = [
21
+ 'read',
22
+ 'write',
23
+ ]
24
+
25
+
26
+ # SECTION: INTERNAL CONSTANTS =============================================== #
27
+
28
+
29
+ _PANDAS_CACHE: dict[str, Any] = {}
30
+
31
+
32
+ # SECTION: INTERNAL FUNCTIONS =============================================== #
33
+
34
+
35
+ def _get_pandas() -> Any:
36
+ """
37
+ Return the pandas module, importing it on first use.
38
+
39
+ Raises an informative ImportError if the optional dependency is missing.
40
+ """
41
+ mod = _PANDAS_CACHE.get('mod')
42
+ if mod is not None: # pragma: no cover - tiny branch
43
+ return mod
44
+ try:
45
+ _pd = __import__('pandas') # type: ignore[assignment]
46
+ except ImportError as e: # pragma: no cover
47
+ raise ImportError(
48
+ 'Feather support requires optional dependency "pandas".\n'
49
+ 'Install with: pip install pandas',
50
+ ) from e
51
+ _PANDAS_CACHE['mod'] = _pd
52
+
53
+ return _pd
54
+
55
+
56
+ def _normalize_records(data: JSONData) -> JSONList:
57
+ """
58
+ Normalize JSON payloads into a list of dictionaries.
59
+
60
+ Raises TypeError when payloads contain non-dict items.
61
+ """
62
+ if isinstance(data, list):
63
+ if not all(isinstance(item, dict) for item in data):
64
+ raise TypeError(
65
+ 'Feather payloads must contain only objects (dicts)',
66
+ )
67
+ return cast(JSONList, data)
68
+ return [cast(JSONDict, data)]
69
+
70
+
71
+ # SECTION: FUNCTIONS ======================================================== #
72
+
73
+
74
+ def read(
75
+ path: Path,
76
+ ) -> JSONList:
77
+ """
78
+ Read Feather content from ``path``.
79
+
80
+ Parameters
81
+ ----------
82
+ path : Path
83
+ Path to the Feather file on disk.
84
+
85
+ Returns
86
+ -------
87
+ JSONList
88
+ The list of dictionaries read from the Feather file.
89
+
90
+ Raises
91
+ ------
92
+ ImportError
93
+ When optional dependency "pyarrow" is missing.
94
+ """
95
+ pandas = _get_pandas()
96
+ try:
97
+ frame = pandas.read_feather(path)
98
+ except ImportError as e: # pragma: no cover
99
+ raise ImportError(
100
+ 'Feather support requires optional dependency "pyarrow".\n'
101
+ 'Install with: pip install pyarrow',
102
+ ) from e
103
+ return cast(JSONList, frame.to_dict(orient='records'))
104
+
105
+
106
+ def write(
107
+ path: Path,
108
+ data: JSONData,
109
+ ) -> int:
110
+ """
111
+ Write ``data`` to Feather at ``path`` and return record count.
112
+
113
+ Parameters
114
+ ----------
115
+ path : Path
116
+ Path to the Feather file on disk.
117
+ data : JSONData
118
+ Data to write.
119
+
120
+ Returns
121
+ -------
122
+ int
123
+ Number of records written.
124
+
125
+ Raises
126
+ ------
127
+ ImportError
128
+ When optional dependency "pyarrow" is missing.
129
+ """
130
+ records = _normalize_records(data)
131
+ if not records:
132
+ return 0
133
+
134
+ pandas = _get_pandas()
135
+ path.parent.mkdir(parents=True, exist_ok=True)
136
+ frame = pandas.DataFrame.from_records(records)
137
+ try:
138
+ frame.to_feather(path)
139
+ except ImportError as e: # pragma: no cover
140
+ raise ImportError(
141
+ 'Feather support requires optional dependency "pyarrow".\n'
142
+ 'Install with: pip install pyarrow',
143
+ ) from e
144
+ return len(records)
etlplus/file/gz.py ADDED
@@ -0,0 +1,123 @@
1
+ """
2
+ :mod:`etlplus.file.gz` module.
3
+
4
+ Helpers for reading/writing GZ files.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import gzip
10
+ import tempfile
11
+ from pathlib import Path
12
+
13
+ from ..types import JSONData
14
+ from .enums import CompressionFormat
15
+ from .enums import FileFormat
16
+ from .enums import infer_file_format_and_compression
17
+
18
+ # SECTION: EXPORTS ========================================================== #
19
+
20
+
21
+ __all__ = [
22
+ 'read',
23
+ 'write',
24
+ ]
25
+
26
+
27
+ # SECTION: INTERNAL FUNCTIONS =============================================== #
28
+
29
+
30
+ def _resolve_format(
31
+ path: Path,
32
+ ) -> FileFormat:
33
+ """
34
+ Resolve the inner file format from a .gz filename.
35
+
36
+ Parameters
37
+ ----------
38
+ path : Path
39
+ Path to the GZ file on disk.
40
+
41
+ Returns
42
+ -------
43
+ FileFormat
44
+ The inferred inner file format.
45
+
46
+ Raises
47
+ ------
48
+ ValueError
49
+ If the file format cannot be inferred from the filename.
50
+ """
51
+ fmt, compression = infer_file_format_and_compression(path)
52
+ if compression is not CompressionFormat.GZ:
53
+ raise ValueError(f'Not a gzip file: {path}')
54
+ if fmt is None:
55
+ raise ValueError(
56
+ f'Cannot infer file format from compressed file {path!r}',
57
+ )
58
+ return fmt
59
+
60
+
61
+ # SECTION: FUNCTIONS ======================================================== #
62
+
63
+
64
+ def read(
65
+ path: Path,
66
+ ) -> JSONData:
67
+ """
68
+ Read GZ content from ``path`` and parse the inner payload.
69
+
70
+ Parameters
71
+ ----------
72
+ path : Path
73
+ Path to the GZ file on disk.
74
+
75
+ Returns
76
+ -------
77
+ JSONData
78
+ Parsed payload.
79
+ """
80
+ fmt = _resolve_format(path)
81
+ with gzip.open(path, 'rb') as handle:
82
+ payload = handle.read()
83
+
84
+ with tempfile.TemporaryDirectory() as tmpdir:
85
+ tmp_path = Path(tmpdir) / f'payload.{fmt.value}'
86
+ tmp_path.write_bytes(payload)
87
+ from .core import File
88
+
89
+ return File(tmp_path, fmt).read()
90
+
91
+
92
+ def write(
93
+ path: Path,
94
+ data: JSONData,
95
+ ) -> int:
96
+ """
97
+ Write ``data`` to GZ at ``path`` and return record count.
98
+
99
+ Parameters
100
+ ----------
101
+ path : Path
102
+ Path to the GZ file on disk.
103
+ data : JSONData
104
+ Data to write.
105
+
106
+ Returns
107
+ -------
108
+ int
109
+ Number of records written.
110
+ """
111
+ fmt = _resolve_format(path)
112
+ with tempfile.TemporaryDirectory() as tmpdir:
113
+ tmp_path = Path(tmpdir) / f'payload.{fmt.value}'
114
+ from .core import File
115
+
116
+ count = File(tmp_path, fmt).write(data)
117
+ payload = tmp_path.read_bytes()
118
+
119
+ path.parent.mkdir(parents=True, exist_ok=True)
120
+ with gzip.open(path, 'wb') as handle:
121
+ handle.write(payload)
122
+
123
+ return count
etlplus/file/json.py CHANGED
@@ -1,7 +1,7 @@
1
1
  """
2
2
  :mod:`etlplus.file.json` module.
3
3
 
4
- JSON read/write helpers.
4
+ Helpers for reading/writing JSON files.
5
5
  """
6
6
 
7
7
  from __future__ import annotations
@@ -15,6 +15,15 @@ from ..types import JSONDict
15
15
  from ..types import JSONList
16
16
  from ..utils import count_records
17
17
 
18
+ # SECTION: EXPORTS ========================================================== #
19
+
20
+
21
+ __all__ = [
22
+ 'read',
23
+ 'write',
24
+ ]
25
+
26
+
18
27
  # SECTION: FUNCTIONS ======================================================== #
19
28
 
20
29
 
@@ -22,7 +31,9 @@ def read(
22
31
  path: Path,
23
32
  ) -> JSONData:
24
33
  """
25
- Load and validate JSON payloads from ``path``.
34
+ Read JSON content from ``path``.
35
+
36
+ Validates that the JSON root is a dict or a list of dicts.
26
37
 
27
38
  Parameters
28
39
  ----------