etlplus 0.10.4__py3-none-any.whl → 0.12.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. etlplus/README.md +37 -0
  2. etlplus/api/README.md +20 -3
  3. etlplus/cli/README.md +40 -0
  4. etlplus/cli/commands.py +1 -1
  5. etlplus/cli/constants.py +1 -1
  6. etlplus/cli/handlers.py +1 -1
  7. etlplus/cli/io.py +2 -2
  8. etlplus/config/README.md +52 -0
  9. etlplus/config/pipeline.py +2 -2
  10. etlplus/database/README.md +48 -0
  11. etlplus/database/ddl.py +1 -1
  12. etlplus/database/engine.py +1 -1
  13. etlplus/database/schema.py +1 -1
  14. etlplus/enums.py +2 -270
  15. etlplus/extract.py +5 -7
  16. etlplus/file/README.md +105 -0
  17. etlplus/file/__init__.py +25 -0
  18. etlplus/file/avro.py +198 -0
  19. etlplus/file/core.py +287 -0
  20. etlplus/file/csv.py +91 -0
  21. etlplus/file/enums.py +238 -0
  22. etlplus/file/feather.py +144 -0
  23. etlplus/file/gz.py +123 -0
  24. etlplus/file/json.py +98 -0
  25. etlplus/file/ndjson.py +109 -0
  26. etlplus/file/orc.py +142 -0
  27. etlplus/file/parquet.py +146 -0
  28. etlplus/file/tsv.py +91 -0
  29. etlplus/file/txt.py +99 -0
  30. etlplus/file/xls.py +132 -0
  31. etlplus/file/xlsx.py +142 -0
  32. etlplus/file/xml.py +174 -0
  33. etlplus/file/yaml.py +136 -0
  34. etlplus/file/zip.py +175 -0
  35. etlplus/load.py +9 -12
  36. etlplus/run.py +6 -9
  37. etlplus/templates/README.md +46 -0
  38. etlplus/validation/README.md +50 -0
  39. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/METADATA +58 -14
  40. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/RECORD +44 -20
  41. etlplus/file.py +0 -652
  42. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/WHEEL +0 -0
  43. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/entry_points.txt +0 -0
  44. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/licenses/LICENSE +0 -0
  45. {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/top_level.txt +0 -0
etlplus/file/orc.py ADDED
@@ -0,0 +1,142 @@
+ """
+ :mod:`etlplus.file.orc` module.
+
+ Helpers for reading/writing ORC files.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any
+ from typing import cast
+
+ from ..types import JSONData
+ from ..types import JSONDict
+ from ..types import JSONList
+
+ # SECTION: EXPORTS ========================================================== #
+
+
+ __all__ = [
+     'read',
+     'write',
+ ]
+
+
+ # SECTION: INTERNAL CONSTANTS =============================================== #
+
+
+ _PANDAS_CACHE: dict[str, Any] = {}
+
+
+ # SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+ def _get_pandas() -> Any:
+     """
+     Return the pandas module, importing it on first use.
+
+     Raises an informative ImportError if the optional dependency is missing.
+     """
+     mod = _PANDAS_CACHE.get('mod')
+     if mod is not None:  # pragma: no cover - tiny branch
+         return mod
+     try:
+         _pd = __import__('pandas')  # type: ignore[assignment]
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(
+             'ORC support requires optional dependency "pandas".\n'
+             'Install with: pip install pandas',
+         ) from e
+     _PANDAS_CACHE['mod'] = _pd
+
+     return _pd
+
+
+ def _normalize_records(data: JSONData) -> JSONList:
+     """
+     Normalize JSON payloads into a list of dictionaries.
+
+     Raises TypeError when payloads contain non-dict items.
+     """
+     if isinstance(data, list):
+         if not all(isinstance(item, dict) for item in data):
+             raise TypeError('ORC payloads must contain only objects (dicts)')
+         return cast(JSONList, data)
+     return [cast(JSONDict, data)]
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def read(
+     path: Path,
+ ) -> JSONList:
+     """
+     Read ORC content from ``path``.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the ORC file on disk.
+
+     Returns
+     -------
+     JSONList
+         The list of dictionaries read from the ORC file.
+
+     Raises
+     ------
+     ImportError
+         When optional dependency "pyarrow" is missing.
+     """
+     pandas = _get_pandas()
+     try:
+         frame = pandas.read_orc(path)
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(
+             'ORC support requires optional dependency "pyarrow".\n'
+             'Install with: pip install pyarrow',
+         ) from e
+     return cast(JSONList, frame.to_dict(orient='records'))
+
+
+ def write(
+     path: Path,
+     data: JSONData,
+ ) -> int:
+     """
+     Write ``data`` to ORC at ``path`` and return record count.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the ORC file on disk.
+     data : JSONData
+         Data to write.
+
+     Returns
+     -------
+     int
+         Number of records written.
+
+     Raises
+     ------
+     ImportError
+         When optional dependency "pyarrow" is missing.
+     """
+     records = _normalize_records(data)
+     if not records:
+         return 0
+
+     pandas = _get_pandas()
+     path.parent.mkdir(parents=True, exist_ok=True)
+     frame = pandas.DataFrame.from_records(records)
+     try:
+         frame.to_orc(path, index=False)
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(
+             'ORC support requires optional dependency "pyarrow".\n'
+             'Install with: pip install pyarrow',
+         ) from e
+     return len(records)
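For orientation, a minimal round trip with these ORC helpers might look like the sketch below; it assumes the optional pandas and pyarrow dependencies are installed, and the file name and records are illustrative.

    from pathlib import Path

    from etlplus.file import orc

    # Illustrative records; write() wraps a bare dict into a one-item list.
    records = [{'id': 1, 'name': 'ada'}, {'id': 2, 'name': 'bob'}]
    written = orc.write(Path('sample.orc'), records)  # expected: 2
    rows = orc.read(Path('sample.orc'))               # list of dicts again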
etlplus/file/parquet.py ADDED
@@ -0,0 +1,146 @@
+ """
+ :mod:`etlplus.file.parquet` module.
+
+ Helpers for reading/writing Parquet files.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any
+ from typing import cast
+
+ from ..types import JSONData
+ from ..types import JSONDict
+ from ..types import JSONList
+
+ # SECTION: EXPORTS ========================================================== #
+
+
+ __all__ = [
+     'read',
+     'write',
+ ]
+
+
+ # SECTION: INTERNAL CONSTANTS =============================================== #
+
+
+ _PANDAS_CACHE: dict[str, Any] = {}
+
+
+ # SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+ def _get_pandas() -> Any:
+     """
+     Return the pandas module, importing it on first use.
+
+     Raises an informative ImportError if the optional dependency is missing.
+     """
+     mod = _PANDAS_CACHE.get('mod')
+     if mod is not None:  # pragma: no cover - tiny branch
+         return mod
+     try:
+         _pd = __import__('pandas')  # type: ignore[assignment]
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(
+             'Parquet support requires optional dependency "pandas".\n'
+             'Install with: pip install pandas',
+         ) from e
+     _PANDAS_CACHE['mod'] = _pd
+
+     return _pd
+
+
+ def _normalize_records(data: JSONData) -> JSONList:
+     """
+     Normalize JSON payloads into a list of dictionaries.
+
+     Raises TypeError when payloads contain non-dict items.
+     """
+     if isinstance(data, list):
+         if not all(isinstance(item, dict) for item in data):
+             raise TypeError(
+                 'Parquet payloads must contain only objects (dicts)',
+             )
+         return cast(JSONList, data)
+     return [cast(JSONDict, data)]
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def read(
+     path: Path,
+ ) -> JSONList:
+     """
+     Read Parquet content from ``path``.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the Parquet file on disk.
+
+     Returns
+     -------
+     JSONList
+         The list of dictionaries read from the Parquet file.
+
+     Raises
+     ------
+     ImportError
+         If optional dependencies for Parquet support are missing.
+     """
+     pandas = _get_pandas()
+     try:
+         frame = pandas.read_parquet(path)
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(
+             'Parquet support requires optional dependency '
+             '"pyarrow" or "fastparquet".\n'
+             'Install with: pip install pyarrow',
+         ) from e
+     return cast(JSONList, frame.to_dict(orient='records'))
+
+
+ def write(
+     path: Path,
+     data: JSONData,
+ ) -> int:
+     """
+     Write ``data`` to Parquet at ``path`` and return record count.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the Parquet file on disk.
+     data : JSONData
+         Data to write.
+
+     Returns
+     -------
+     int
+         Number of records written.
+
+     Raises
+     ------
+     ImportError
+         If optional dependencies for Parquet support are missing.
+     """
+     records = _normalize_records(data)
+     if not records:
+         return 0
+
+     pandas = _get_pandas()
+     path.parent.mkdir(parents=True, exist_ok=True)
+     frame = pandas.DataFrame.from_records(records)
+     try:
+         frame.to_parquet(path, index=False)
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(
+             'Parquet support requires optional dependency '
+             '"pyarrow" or "fastparquet".\n'
+             'Install with: pip install pyarrow',
+         ) from e
+     return len(records)
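An analogous sketch for the Parquet helpers, again assuming pandas plus pyarrow (or fastparquet) are installed; note that a bare dict is normalized into a single-row table.

    from pathlib import Path

    from etlplus.file import parquet

    # A bare dict goes through _normalize_records() and becomes one row.
    parquet.write(Path('one.parquet'), {'id': 1, 'name': 'ada'})  # expected: 1
    print(parquet.read(Path('one.parquet')))  # [{'id': 1, 'name': 'ada'}]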
etlplus/file/tsv.py ADDED
@@ -0,0 +1,91 @@
+ """
+ :mod:`etlplus.file.tsv` module.
+
+ Helpers for reading/writing TSV files.
+ """
+
+ from __future__ import annotations
+
+ import csv
+ from pathlib import Path
+ from typing import cast
+
+ from ..types import JSONData
+ from ..types import JSONDict
+ from ..types import JSONList
+
+ # SECTION: EXPORTS ========================================================== #
+
+
+ __all__ = [
+     'read',
+     'write',
+ ]
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def read(
+     path: Path,
+ ) -> JSONList:
+     """
+     Read TSV content from ``path``.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the TSV file on disk.
+
+     Returns
+     -------
+     JSONList
+         The list of dictionaries read from the TSV file.
+     """
+     with path.open('r', encoding='utf-8', newline='') as handle:
+         reader: csv.DictReader[str] = csv.DictReader(handle, delimiter='\t')
+         rows: JSONList = []
+         for row in reader:
+             if not any(row.values()):
+                 continue
+             rows.append(cast(JSONDict, dict(row)))
+     return rows
+
+
+ def write(
+     path: Path,
+     data: JSONData,
+ ) -> int:
+     """
+     Write ``data`` to TSV at ``path`` and return record count.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the TSV file on disk.
+     data : JSONData
+         Data to write as TSV. Should be a list of dictionaries or a
+         single dictionary.
+
+     Returns
+     -------
+     int
+         The number of rows written to the TSV file.
+     """
+     rows: list[JSONDict]
+     if isinstance(data, list):
+         rows = [row for row in data if isinstance(row, dict)]
+     else:
+         rows = [data]
+
+     if not rows:
+         return 0
+
+     fieldnames = sorted({key for row in rows for key in row})
+     with path.open('w', encoding='utf-8', newline='') as handle:
+         writer = csv.DictWriter(handle, fieldnames=fieldnames, delimiter='\t')
+         writer.writeheader()
+         for row in rows:
+             writer.writerow({field: row.get(field) for field in fieldnames})
+
+     return len(rows)
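Because write() takes the sorted union of all row keys and read() goes through csv.DictReader, a TSV round trip returns string values and fills missing fields with empty strings. An illustrative sketch:

    from pathlib import Path

    from etlplus.file import tsv

    rows = [{'b': 2, 'a': 1}, {'a': 3, 'c': 4}]
    tsv.write(Path('sample.tsv'), rows)  # header columns: a, b, c (sorted union)
    print(tsv.read(Path('sample.tsv')))
    # e.g. [{'a': '1', 'b': '2', 'c': ''}, {'a': '3', 'b': '', 'c': '4'}]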
etlplus/file/txt.py ADDED
@@ -0,0 +1,99 @@
+ """
+ :mod:`etlplus.file.txt` module.
+
+ Helpers for reading/writing text files.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import cast
+
+ from ..types import JSONData
+ from ..types import JSONDict
+ from ..types import JSONList
+ from ..utils import count_records
+
+ # SECTION: EXPORTS ========================================================== #
+
+
+ __all__ = [
+     'read',
+     'write',
+ ]
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def read(
+     path: Path,
+ ) -> JSONList:
+     """
+     Read TXT content from ``path``.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the TXT file on disk.
+
+     Returns
+     -------
+     JSONList
+         The list of dictionaries read from the TXT file.
+     """
+     rows: JSONList = []
+     with path.open('r', encoding='utf-8') as handle:
+         for line in handle:
+             text = line.rstrip('\n')
+             if text == '':
+                 continue
+             rows.append({'text': text})
+     return rows
+
+
+ def write(
+     path: Path,
+     data: JSONData,
+ ) -> int:
+     """
+     Write ``data`` to TXT at ``path`` and return record count.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the TXT file on disk.
+     data : JSONData
+         Data to write. Expects ``{'text': '...'}`` or a list of those.
+
+     Returns
+     -------
+     int
+         Number of records written.
+
+     Raises
+     ------
+     TypeError
+         If any item in ``data`` is not a dictionary or if any dictionary
+         does not contain a ``'text'`` key.
+     """
+     rows: JSONList
+     if isinstance(data, list):
+         if not all(isinstance(item, dict) for item in data):
+             raise TypeError('TXT payloads must contain only objects (dicts)')
+         rows = cast(JSONList, data)
+     else:
+         rows = [cast(JSONDict, data)]
+
+     if not rows:
+         return 0
+
+     path.parent.mkdir(parents=True, exist_ok=True)
+     with path.open('w', encoding='utf-8') as handle:
+         for row in rows:
+             if 'text' not in row:
+                 raise TypeError('TXT payloads must include a "text" key')
+             handle.write(str(row['text']))
+             handle.write('\n')
+
+     return count_records(rows)
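The TXT helpers round-trip line-oriented text as {'text': ...} records, skipping blank lines on read. A small sketch, assuming etlplus.utils.count_records simply counts the given records:

    from pathlib import Path

    from etlplus.file import txt

    txt.write(Path('notes.txt'), [{'text': 'first line'}, {'text': 'second line'}])  # expected: 2
    print(txt.read(Path('notes.txt')))  # [{'text': 'first line'}, {'text': 'second line'}]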
etlplus/file/xls.py ADDED
@@ -0,0 +1,132 @@
+ """
+ :mod:`etlplus.file.xls` module.
+
+ Helpers for reading/writing Excel XLS files.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any
+ from typing import cast
+
+ from ..types import JSONData
+ from ..types import JSONDict
+ from ..types import JSONList
+
+ # SECTION: EXPORTS ========================================================== #
+
+
+ __all__ = [
+     'read',
+     'write',
+ ]
+
+
+ # SECTION: INTERNAL CONSTANTS =============================================== #
+
+
+ _PANDAS_CACHE: dict[str, Any] = {}
+
+
+ # SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+ def _get_pandas() -> Any:
+     """
+     Return the pandas module, importing it on first use.
+
+     Raises an informative ImportError if the optional dependency is missing.
+     """
+     mod = _PANDAS_CACHE.get('mod')
+     if mod is not None:  # pragma: no cover - tiny branch
+         return mod
+     try:
+         _pd = __import__('pandas')  # type: ignore[assignment]
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(
+             'XLS support requires optional dependency "pandas".\n'
+             'Install with: pip install pandas',
+         ) from e
+     _PANDAS_CACHE['mod'] = _pd
+
+     return _pd
+
+
+ def _normalize_records(data: JSONData) -> JSONList:
+     """
+     Normalize JSON payloads into a list of dictionaries.
+
+     Raises TypeError when payloads contain non-dict items.
+     """
+     if isinstance(data, list):
+         if not all(isinstance(item, dict) for item in data):
+             raise TypeError('XLS payloads must contain only objects (dicts)')
+         return cast(JSONList, data)
+     return [cast(JSONDict, data)]
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def read(
+     path: Path,
+ ) -> JSONList:
+     """
+     Read XLS content from ``path``.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the XLS file on disk.
+
+     Returns
+     -------
+     JSONList
+         The list of dictionaries read from the XLS file.
+
+     Raises
+     ------
+     ImportError
+         If the optional dependency "xlrd" is not installed.
+     """
+     pandas = _get_pandas()
+     try:
+         frame = pandas.read_excel(path, engine='xlrd')
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(
+             'XLS support requires optional dependency "xlrd".\n'
+             'Install with: pip install xlrd',
+         ) from e
+     return cast(JSONList, frame.to_dict(orient='records'))
+
+
+ def write(
+     path: Path,
+     data: JSONData,
+ ) -> int:
+     """
+     Write ``data`` to XLS at ``path`` and return record count.
+
+     Notes
+     -----
+     XLS writing is not supported by pandas 2.x. Use XLSX for writes.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the XLS file on disk.
+     data : JSONData
+         Data to write.
+
+     Returns
+     -------
+     int
+         Number of records written.
+
+     Raises
+     ------
+     RuntimeError
+         Always; XLS writing is not supported. Use XLSX instead.
+     """
+     raise RuntimeError('XLS write is not supported; use XLSX instead')
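Since write() always raises, the XLS helpers are effectively read-only; writes are expected to go through the sibling xlsx module instead (an assumption based on the file list above). A hedged, read-only sketch, assuming pandas and xlrd are installed and legacy.xls exists:

    from pathlib import Path

    from etlplus.file import xls

    rows = xls.read(Path('legacy.xls'))  # requires pandas + xlrd
    # xls.write(...) raises RuntimeError('XLS write is not supported; use XLSX instead')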