etlplus 0.10.4__py3-none-any.whl → 0.12.2__py3-none-any.whl

This diff shows the content of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
Files changed (45)
  1. etlplus/README.md +37 -0
  2. etlplus/api/README.md +20 -3
  3. etlplus/cli/README.md +40 -0
  4. etlplus/cli/commands.py +1 -1
  5. etlplus/cli/constants.py +1 -1
  6. etlplus/cli/handlers.py +1 -1
  7. etlplus/cli/io.py +2 -2
  8. etlplus/config/README.md +52 -0
  9. etlplus/config/pipeline.py +2 -2
  10. etlplus/database/README.md +48 -0
  11. etlplus/database/ddl.py +1 -1
  12. etlplus/database/engine.py +1 -1
  13. etlplus/database/schema.py +1 -1
  14. etlplus/enums.py +2 -270
  15. etlplus/extract.py +5 -7
  16. etlplus/file/README.md +105 -0
  17. etlplus/file/__init__.py +25 -0
  18. etlplus/file/avro.py +198 -0
  19. etlplus/file/core.py +287 -0
  20. etlplus/file/csv.py +91 -0
  21. etlplus/file/enums.py +238 -0
  22. etlplus/file/feather.py +144 -0
  23. etlplus/file/gz.py +123 -0
  24. etlplus/file/json.py +98 -0
  25. etlplus/file/ndjson.py +109 -0
  26. etlplus/file/orc.py +142 -0
  27. etlplus/file/parquet.py +146 -0
  28. etlplus/file/tsv.py +91 -0
  29. etlplus/file/txt.py +99 -0
  30. etlplus/file/xls.py +132 -0
  31. etlplus/file/xlsx.py +142 -0
  32. etlplus/file/xml.py +174 -0
  33. etlplus/file/yaml.py +136 -0
  34. etlplus/file/zip.py +175 -0
  35. etlplus/load.py +9 -12
  36. etlplus/run.py +6 -9
  37. etlplus/templates/README.md +46 -0
  38. etlplus/validation/README.md +50 -0
  39. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/METADATA +58 -14
  40. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/RECORD +44 -20
  41. etlplus/file.py +0 -652
  42. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/WHEEL +0 -0
  43. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/entry_points.txt +0 -0
  44. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/licenses/LICENSE +0 -0
  45. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/top_level.txt +0 -0
etlplus/file/enums.py ADDED
@@ -0,0 +1,238 @@
+ """
+ :mod:`etlplus.file.enums` module.
+
+ File-specific enums and helpers.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import PurePath
+
+ from ..enums import CoercibleStrEnum
+ from ..types import StrStrMap
+
+ # SECTION: EXPORTS ========================================================== #
+
+ __all__ = [
+     'CompressionFormat',
+     'FileFormat',
+     'infer_file_format_and_compression',
+ ]
+
+
+ # SECTION: ENUMS ============================================================ #
+
+
+ class CompressionFormat(CoercibleStrEnum):
+     """Supported compression formats."""
+
+     # -- Constants -- #
+
+     GZ = 'gz'
+     ZIP = 'zip'
+
+     # -- Class Methods -- #
+
+     @classmethod
+     def aliases(cls) -> StrStrMap:
+         """
+         Return a mapping of common aliases for each enum member.
+
+         Returns
+         -------
+         StrStrMap
+             A mapping of alias names to their corresponding enum member names.
+         """
+         return {
+             # File extensions
+             '.gz': 'gz',
+             '.gzip': 'gz',
+             '.zip': 'zip',
+             # MIME types
+             'application/gzip': 'gz',
+             'application/x-gzip': 'gz',
+             'application/zip': 'zip',
+             'application/x-zip-compressed': 'zip',
+         }
+
+
+ class FileFormat(CoercibleStrEnum):
+     """Supported file formats for extraction."""
+
+     # -- Constants -- #
+
+     AVRO = 'avro'
+     CSV = 'csv'
+     FEATHER = 'feather'
+     GZ = 'gz'
+     JSON = 'json'
+     NDJSON = 'ndjson'
+     ORC = 'orc'
+     PARQUET = 'parquet'
+     TSV = 'tsv'
+     TXT = 'txt'
+     XLS = 'xls'
+     XLSX = 'xlsx'
+     ZIP = 'zip'
+     XML = 'xml'
+     YAML = 'yaml'
+
+     # -- Class Methods -- #
+
+     @classmethod
+     def aliases(cls) -> StrStrMap:
+         """
+         Return a mapping of common aliases for each enum member.
+
+         Returns
+         -------
+         StrStrMap
+             A mapping of alias names to their corresponding enum member names.
+         """
+         return {
+             # Common shorthand
+             'parq': 'parquet',
+             'yml': 'yaml',
+             # File extensions
+             '.avro': 'avro',
+             '.csv': 'csv',
+             '.feather': 'feather',
+             '.gz': 'gz',
+             '.json': 'json',
+             '.jsonl': 'ndjson',
+             '.ndjson': 'ndjson',
+             '.orc': 'orc',
+             '.parquet': 'parquet',
+             '.pq': 'parquet',
+             '.tsv': 'tsv',
+             '.txt': 'txt',
+             '.xls': 'xls',
+             '.xlsx': 'xlsx',
+             '.zip': 'zip',
+             '.xml': 'xml',
+             '.yaml': 'yaml',
+             '.yml': 'yaml',
+             # MIME types
+             'application/avro': 'avro',
+             'application/csv': 'csv',
+             'application/feather': 'feather',
+             'application/gzip': 'gz',
+             'application/json': 'json',
+             'application/jsonlines': 'ndjson',
+             'application/ndjson': 'ndjson',
+             'application/orc': 'orc',
+             'application/parquet': 'parquet',
+             'application/vnd.apache.avro': 'avro',
+             'application/vnd.apache.parquet': 'parquet',
+             'application/vnd.apache.arrow.file': 'feather',
+             'application/vnd.apache.orc': 'orc',
+             'application/vnd.ms-excel': 'xls',
+             (
+                 'application/vnd.openxmlformats-'
+                 'officedocument.spreadsheetml.sheet'
+             ): 'xlsx',
+             'application/x-avro': 'avro',
+             'application/x-csv': 'csv',
+             'application/x-feather': 'feather',
+             'application/x-orc': 'orc',
+             'application/x-ndjson': 'ndjson',
+             'application/x-parquet': 'parquet',
+             'application/x-yaml': 'yaml',
+             'application/xml': 'xml',
+             'application/zip': 'zip',
+             'text/csv': 'csv',
+             'text/plain': 'txt',
+             'text/tab-separated-values': 'tsv',
+             'text/tsv': 'tsv',
+             'text/xml': 'xml',
+             'text/yaml': 'yaml',
+         }
+
+
+ # SECTION: INTERNAL CONSTANTS =============================================== #
+
+
+ # Compression formats that are also file formats.
+ _COMPRESSION_FILE_FORMATS: set[FileFormat] = {
+     FileFormat.GZ,
+     FileFormat.ZIP,
+ }
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ # TODO: Convert to a method on FileFormat or CompressionFormat?
+ def infer_file_format_and_compression(
+     value: object,
+     filename: object | None = None,
+ ) -> tuple[FileFormat | None, CompressionFormat | None]:
+     """
+     Infer data format and compression from a filename, extension, or MIME type.
+
+     Parameters
+     ----------
+     value : object
+         A filename, extension, MIME type, or existing enum member.
+     filename : object | None, optional
+         A filename to consult for extension-based inference (e.g. when
+         ``value`` is ``application/octet-stream``).
+
+     Returns
+     -------
+     tuple[FileFormat | None, CompressionFormat | None]
+         The inferred data format and compression, if any.
+     """
+     if isinstance(value, FileFormat):
+         if value in _COMPRESSION_FILE_FORMATS:
+             return None, CompressionFormat.coerce(value.value)
+         return value, None
+     if isinstance(value, CompressionFormat):
+         return None, value
+
+     text = str(value).strip()
+     if not text:
+         return None, None
+
+     normalized = text.casefold()
+     mime = normalized.split(';', 1)[0].strip()
+
+     is_octet_stream = mime == 'application/octet-stream'
+     compression = CompressionFormat.try_coerce(mime)
+     fmt = None if is_octet_stream else FileFormat.try_coerce(mime)
+
+     is_mime = mime.startswith(
+         (
+             'application/',
+             'text/',
+             'audio/',
+             'image/',
+             'video/',
+             'multipart/',
+         ),
+     )
+     suffix_source: object | None = filename if filename is not None else text
+     if is_mime and filename is None:
+         suffix_source = None
+
+     suffixes = (
+         PurePath(str(suffix_source)).suffixes
+         if suffix_source is not None
+         else []
+     )
+     if suffixes:
+         normalized_suffixes = [suffix.casefold() for suffix in suffixes]
+         compression = (
+             CompressionFormat.try_coerce(normalized_suffixes[-1])
+             or compression
+         )
+         if compression is not None:
+             normalized_suffixes = normalized_suffixes[:-1]
+         if normalized_suffixes:
+             fmt = FileFormat.try_coerce(normalized_suffixes[-1]) or fmt
+
+     if fmt in _COMPRESSION_FILE_FORMATS:
+         compression = compression or CompressionFormat.coerce(fmt.value)
+         fmt = None
+
+     return fmt, compression
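The new `infer_file_format_and_compression` helper is the pivot the rest of the `etlplus.file` package turns on. A minimal behavioral sketch, with expected results read off the alias tables and control flow above rather than from the package's test suite (`coerce`/`try_coerce` are inherited from `CoercibleStrEnum`):

```python
from etlplus.file.enums import infer_file_format_and_compression

# Extension-based inference, including double suffixes such as '.csv.gz'.
fmt, comp = infer_file_format_and_compression('data.csv.gz')
# expected: fmt is FileFormat.CSV, comp is CompressionFormat.GZ

# MIME parameters are stripped before the alias lookup.
fmt, comp = infer_file_format_and_compression('text/csv; charset=utf-8')
# expected: fmt is FileFormat.CSV, comp is None

# 'application/octet-stream' defers to the optional filename hint.
fmt, comp = infer_file_format_and_compression(
    'application/octet-stream',
    filename='dump.ndjson',
)
# expected: fmt is FileFormat.NDJSON, comp is None
```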
etlplus/file/feather.py ADDED
@@ -0,0 +1,144 @@
+ """
+ :mod:`etlplus.file.feather` module.
+
+ Helpers for reading/writing Feather files.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any
+ from typing import cast
+
+ from ..types import JSONData
+ from ..types import JSONDict
+ from ..types import JSONList
+
+ # SECTION: EXPORTS ========================================================== #
+
+
+ __all__ = [
+     'read',
+     'write',
+ ]
+
+
+ # SECTION: INTERNAL CONSTANTS =============================================== #
+
+
+ _PANDAS_CACHE: dict[str, Any] = {}
+
+
+ # SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+ def _get_pandas() -> Any:
+     """
+     Return the pandas module, importing it on first use.
+
+     Raises an informative ImportError if the optional dependency is missing.
+     """
+     mod = _PANDAS_CACHE.get('mod')
+     if mod is not None:  # pragma: no cover - tiny branch
+         return mod
+     try:
+         _pd = __import__('pandas')  # type: ignore[assignment]
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(
+             'Feather support requires optional dependency "pandas".\n'
+             'Install with: pip install pandas',
+         ) from e
+     _PANDAS_CACHE['mod'] = _pd
+
+     return _pd
+
+
+ def _normalize_records(data: JSONData) -> JSONList:
+     """
+     Normalize JSON payloads into a list of dictionaries.
+
+     Raises TypeError when payloads contain non-dict items.
+     """
+     if isinstance(data, list):
+         if not all(isinstance(item, dict) for item in data):
+             raise TypeError(
+                 'Feather payloads must contain only objects (dicts)',
+             )
+         return cast(JSONList, data)
+     return [cast(JSONDict, data)]
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def read(
+     path: Path,
+ ) -> JSONList:
+     """
+     Read Feather content from ``path``.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the Feather file on disk.
+
+     Returns
+     -------
+     JSONList
+         The list of dictionaries read from the Feather file.
+
+     Raises
+     ------
+     ImportError
+         When optional dependency "pyarrow" is missing.
+     """
+     pandas = _get_pandas()
+     try:
+         frame = pandas.read_feather(path)
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(
+             'Feather support requires optional dependency "pyarrow".\n'
+             'Install with: pip install pyarrow',
+         ) from e
+     return cast(JSONList, frame.to_dict(orient='records'))
+
+
+ def write(
+     path: Path,
+     data: JSONData,
+ ) -> int:
+     """
+     Write ``data`` to Feather at ``path`` and return record count.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the Feather file on disk.
+     data : JSONData
+         Data to write.
+
+     Returns
+     -------
+     int
+         Number of records written.
+
+     Raises
+     ------
+     ImportError
+         When optional dependency "pyarrow" is missing.
+     """
+     records = _normalize_records(data)
+     if not records:
+         return 0
+
+     pandas = _get_pandas()
+     path.parent.mkdir(parents=True, exist_ok=True)
+     frame = pandas.DataFrame.from_records(records)
+     try:
+         frame.to_feather(path)
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(
+             'Feather support requires optional dependency "pyarrow".\n'
+             'Install with: pip install pyarrow',
+         ) from e
+     return len(records)
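The Feather helpers defer the `pandas` import to first use and cache the module in `_PANDAS_CACHE`, so the optional-dependency error surfaces only when Feather I/O is actually exercised. A round-trip usage sketch, assuming `pandas` and `pyarrow` are installed (paths are illustrative):

```python
from pathlib import Path

from etlplus.file import feather

rows = [{'id': 1, 'name': 'a'}, {'id': 2, 'name': 'b'}]
written = feather.write(Path('out/data.feather'), rows)  # returns 2
loaded = feather.read(Path('out/data.feather'))          # list of dicts
assert written == len(loaded)
```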
etlplus/file/gz.py ADDED
@@ -0,0 +1,123 @@
+ """
+ :mod:`etlplus.file.gz` module.
+
+ Helpers for reading/writing GZ files.
+ """
+
+ from __future__ import annotations
+
+ import gzip
+ import tempfile
+ from pathlib import Path
+
+ from ..types import JSONData
+ from .enums import CompressionFormat
+ from .enums import FileFormat
+ from .enums import infer_file_format_and_compression
+
+ # SECTION: EXPORTS ========================================================== #
+
+
+ __all__ = [
+     'read',
+     'write',
+ ]
+
+
+ # SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+ def _resolve_format(
+     path: Path,
+ ) -> FileFormat:
+     """
+     Resolve the inner file format from a .gz filename.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the GZ file on disk.
+
+     Returns
+     -------
+     FileFormat
+         The inferred inner file format.
+
+     Raises
+     ------
+     ValueError
+         If the file format cannot be inferred from the filename.
+     """
+     fmt, compression = infer_file_format_and_compression(path)
+     if compression is not CompressionFormat.GZ:
+         raise ValueError(f'Not a gzip file: {path}')
+     if fmt is None:
+         raise ValueError(
+             f'Cannot infer file format from compressed file {path!r}',
+         )
+     return fmt
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def read(
+     path: Path,
+ ) -> JSONData:
+     """
+     Read GZ content from ``path`` and parse the inner payload.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the GZ file on disk.
+
+     Returns
+     -------
+     JSONData
+         Parsed payload.
+     """
+     fmt = _resolve_format(path)
+     with gzip.open(path, 'rb') as handle:
+         payload = handle.read()
+
+     with tempfile.TemporaryDirectory() as tmpdir:
+         tmp_path = Path(tmpdir) / f'payload.{fmt.value}'
+         tmp_path.write_bytes(payload)
+         from .core import File
+
+         return File(tmp_path, fmt).read()
+
+
+ def write(
+     path: Path,
+     data: JSONData,
+ ) -> int:
+     """
+     Write ``data`` to GZ at ``path`` and return record count.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the GZ file on disk.
+     data : JSONData
+         Data to write.
+
+     Returns
+     -------
+     int
+         Number of records written.
+     """
+     fmt = _resolve_format(path)
+     with tempfile.TemporaryDirectory() as tmpdir:
+         tmp_path = Path(tmpdir) / f'payload.{fmt.value}'
+         from .core import File
+
+         count = File(tmp_path, fmt).write(data)
+         payload = tmp_path.read_bytes()
+
+     path.parent.mkdir(parents=True, exist_ok=True)
+     with gzip.open(path, 'wb') as handle:
+         handle.write(payload)
+
+     return count
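Note the naming convention `_resolve_format` enforces: the target must carry a double suffix such as `events.json.gz`, since the inner format is recovered from the penultimate extension, and a bare `.gz` raises `ValueError`. Both helpers round-trip the payload through a temporary file and delegate parsing/serialization to `File` from `.core` (imported inside the function, presumably to avoid a circular import). A sketch with illustrative paths:

```python
from pathlib import Path

from etlplus.file import gz

count = gz.write(Path('out/events.json.gz'), {'id': 1})  # inner format: json
data = gz.read(Path('out/events.json.gz'))               # -> {'id': 1}
```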
etlplus/file/json.py ADDED
@@ -0,0 +1,98 @@
+ """
+ :mod:`etlplus.file.json` module.
+
+ Helpers for reading/writing JSON files.
+ """
+
+ from __future__ import annotations
+
+ import json
+ from pathlib import Path
+ from typing import cast
+
+ from ..types import JSONData
+ from ..types import JSONDict
+ from ..types import JSONList
+ from ..utils import count_records
+
+ # SECTION: EXPORTS ========================================================== #
+
+
+ __all__ = [
+     'read',
+     'write',
+ ]
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def read(
+     path: Path,
+ ) -> JSONData:
+     """
+     Read JSON content from ``path``.
+
+     Validates that the JSON root is a dict or a list of dicts.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the JSON file on disk.
+
+     Returns
+     -------
+     JSONData
+         The structured data read from the JSON file.
+
+     Raises
+     ------
+     TypeError
+         If the JSON root is not an object or an array of objects.
+     """
+     with path.open('r', encoding='utf-8') as handle:
+         loaded = json.load(handle)
+
+     if isinstance(loaded, dict):
+         return cast(JSONDict, loaded)
+     if isinstance(loaded, list):
+         if all(isinstance(item, dict) for item in loaded):
+             return cast(JSONList, loaded)
+         raise TypeError(
+             'JSON array must contain only objects (dicts) when loading file',
+         )
+     raise TypeError(
+         'JSON root must be an object or an array of objects when loading file',
+     )
+
+
+ def write(
+     path: Path,
+     data: JSONData,
+ ) -> int:
+     """
+     Write ``data`` as formatted JSON to ``path``.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the JSON file on disk.
+     data : JSONData
+         Data to serialize as JSON.
+
+     Returns
+     -------
+     int
+         The number of records written to the JSON file.
+     """
+     path.parent.mkdir(parents=True, exist_ok=True)
+     with path.open('w', encoding='utf-8') as handle:
+         json.dump(
+             data,
+             handle,
+             indent=2,
+             ensure_ascii=False,
+         )
+         handle.write('\n')
+
+     return count_records(data)
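Reads validate the JSON root shape before returning, so downstream code can rely on getting a dict or a list of dicts. A usage sketch (illustrative paths; the return count comes from `count_records`, which this diff does not show, but a single object is presumably one record):

```python
from pathlib import Path

from etlplus.file import json as json_file

json_file.write(Path('out/config.json'), {'env': 'dev'})  # presumably 1
json_file.read(Path('out/config.json'))                   # {'env': 'dev'}

# A scalar array such as [1, 2, 3] on disk raises TypeError on read.
```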
etlplus/file/ndjson.py ADDED
@@ -0,0 +1,109 @@
+ """
+ :mod:`etlplus.file.ndjson` module.
+
+ Helpers for reading/writing NDJSON files.
+ """
+
+ from __future__ import annotations
+
+ import json
+ from pathlib import Path
+ from typing import cast
+
+ from ..types import JSONData
+ from ..types import JSONDict
+ from ..types import JSONList
+ from ..utils import count_records
+
+ # SECTION: EXPORTS ========================================================== #
+
+
+ __all__ = [
+     'read',
+     'write',
+ ]
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def read(
+     path: Path,
+ ) -> JSONList:
+     """
+     Read NDJSON content from ``path``.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the NDJSON file on disk.
+
+     Returns
+     -------
+     JSONList
+         The list of dictionaries read from the NDJSON file.
+
+     Raises
+     ------
+     TypeError
+         If any line in the NDJSON file is not a JSON object (dict).
+     """
+     rows: JSONList = []
+     with path.open('r', encoding='utf-8') as handle:
+         for idx, line in enumerate(handle, start=1):
+             text = line.strip()
+             if not text:
+                 continue
+             payload = json.loads(text)
+             if not isinstance(payload, dict):
+                 raise TypeError(
+                     f'NDJSON lines must be objects (dicts) (line {idx})',
+                 )
+             rows.append(cast(JSONDict, payload))
+     return rows
+
+
+ def write(
+     path: Path,
+     data: JSONData,
+ ) -> int:
+     """
+     Write ``data`` to NDJSON at ``path``.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the NDJSON file on disk.
+     data : JSONData
+         Data to write.
+
+     Returns
+     -------
+     int
+         Number of records written.
+
+     Raises
+     ------
+     TypeError
+         If ``data`` is a list containing non-dict items.
+     """
+     rows: JSONList
+     if isinstance(data, list):
+         if not all(isinstance(item, dict) for item in data):
+             raise TypeError(
+                 'NDJSON payloads must contain only objects (dicts)',
+             )
+         rows = cast(JSONList, data)
+     else:
+         rows = [cast(JSONDict, data)]
+
+     if not rows:
+         return 0
+
+     path.parent.mkdir(parents=True, exist_ok=True)
+     with path.open('w', encoding='utf-8') as handle:
+         for row in rows:
+             handle.write(json.dumps(row, ensure_ascii=False))
+             handle.write('\n')
+
+     return count_records(rows)
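The NDJSON reader skips blank lines and rejects any line whose payload is not an object, reporting the offending line number; the writer accepts either a single dict or a list of dicts. A round-trip sketch (illustrative paths):

```python
from pathlib import Path

from etlplus.file import ndjson

ndjson.write(Path('out/rows.ndjson'), [{'id': 1}, {'id': 2}])
assert ndjson.read(Path('out/rows.ndjson')) == [{'id': 1}, {'id': 2}]
```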