etlplus 0.9.1__py3-none-any.whl → 0.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. etlplus/README.md +37 -0
  2. etlplus/__init__.py +1 -26
  3. etlplus/api/README.md +51 -3
  4. etlplus/api/__init__.py +10 -0
  5. etlplus/api/config.py +39 -28
  6. etlplus/api/endpoint_client.py +3 -3
  7. etlplus/api/enums.py +51 -0
  8. etlplus/api/pagination/client.py +1 -1
  9. etlplus/api/rate_limiting/config.py +13 -1
  10. etlplus/api/rate_limiting/rate_limiter.py +8 -11
  11. etlplus/api/request_manager.py +11 -6
  12. etlplus/api/transport.py +14 -2
  13. etlplus/api/types.py +96 -6
  14. etlplus/{run_helpers.py → api/utils.py} +209 -153
  15. etlplus/cli/README.md +40 -0
  16. etlplus/cli/commands.py +76 -43
  17. etlplus/cli/constants.py +1 -1
  18. etlplus/cli/handlers.py +40 -12
  19. etlplus/cli/io.py +2 -2
  20. etlplus/cli/main.py +1 -1
  21. etlplus/cli/state.py +4 -7
  22. etlplus/database/README.md +48 -0
  23. etlplus/database/ddl.py +1 -1
  24. etlplus/database/engine.py +19 -3
  25. etlplus/database/orm.py +2 -0
  26. etlplus/database/schema.py +1 -1
  27. etlplus/enums.py +1 -157
  28. etlplus/file/README.md +105 -0
  29. etlplus/file/__init__.py +25 -0
  30. etlplus/file/_imports.py +141 -0
  31. etlplus/file/_io.py +160 -0
  32. etlplus/file/accdb.py +78 -0
  33. etlplus/file/arrow.py +78 -0
  34. etlplus/file/avro.py +176 -0
  35. etlplus/file/bson.py +77 -0
  36. etlplus/file/cbor.py +78 -0
  37. etlplus/file/cfg.py +79 -0
  38. etlplus/file/conf.py +80 -0
  39. etlplus/file/core.py +322 -0
  40. etlplus/file/csv.py +79 -0
  41. etlplus/file/dat.py +78 -0
  42. etlplus/file/dta.py +77 -0
  43. etlplus/file/duckdb.py +78 -0
  44. etlplus/file/enums.py +343 -0
  45. etlplus/file/feather.py +111 -0
  46. etlplus/file/fwf.py +77 -0
  47. etlplus/file/gz.py +123 -0
  48. etlplus/file/hbs.py +78 -0
  49. etlplus/file/hdf5.py +78 -0
  50. etlplus/file/ini.py +79 -0
  51. etlplus/file/ion.py +78 -0
  52. etlplus/file/jinja2.py +78 -0
  53. etlplus/file/json.py +98 -0
  54. etlplus/file/log.py +78 -0
  55. etlplus/file/mat.py +78 -0
  56. etlplus/file/mdb.py +78 -0
  57. etlplus/file/msgpack.py +78 -0
  58. etlplus/file/mustache.py +78 -0
  59. etlplus/file/nc.py +78 -0
  60. etlplus/file/ndjson.py +108 -0
  61. etlplus/file/numbers.py +75 -0
  62. etlplus/file/ods.py +79 -0
  63. etlplus/file/orc.py +111 -0
  64. etlplus/file/parquet.py +113 -0
  65. etlplus/file/pb.py +78 -0
  66. etlplus/file/pbf.py +77 -0
  67. etlplus/file/properties.py +78 -0
  68. etlplus/file/proto.py +77 -0
  69. etlplus/file/psv.py +79 -0
  70. etlplus/file/rda.py +78 -0
  71. etlplus/file/rds.py +78 -0
  72. etlplus/file/sas7bdat.py +78 -0
  73. etlplus/file/sav.py +77 -0
  74. etlplus/file/sqlite.py +78 -0
  75. etlplus/file/stub.py +84 -0
  76. etlplus/file/sylk.py +77 -0
  77. etlplus/file/tab.py +81 -0
  78. etlplus/file/toml.py +78 -0
  79. etlplus/file/tsv.py +80 -0
  80. etlplus/file/txt.py +102 -0
  81. etlplus/file/vm.py +78 -0
  82. etlplus/file/wks.py +77 -0
  83. etlplus/file/xls.py +88 -0
  84. etlplus/file/xlsm.py +79 -0
  85. etlplus/file/xlsx.py +99 -0
  86. etlplus/file/xml.py +185 -0
  87. etlplus/file/xpt.py +78 -0
  88. etlplus/file/yaml.py +95 -0
  89. etlplus/file/zip.py +175 -0
  90. etlplus/file/zsav.py +77 -0
  91. etlplus/ops/README.md +50 -0
  92. etlplus/ops/__init__.py +61 -0
  93. etlplus/{extract.py → ops/extract.py} +81 -99
  94. etlplus/{load.py → ops/load.py} +78 -101
  95. etlplus/{run.py → ops/run.py} +159 -127
  96. etlplus/{transform.py → ops/transform.py} +75 -68
  97. etlplus/{validation → ops}/utils.py +53 -17
  98. etlplus/{validate.py → ops/validate.py} +22 -12
  99. etlplus/templates/README.md +46 -0
  100. etlplus/types.py +5 -4
  101. etlplus/utils.py +136 -2
  102. etlplus/workflow/README.md +52 -0
  103. etlplus/{config → workflow}/__init__.py +10 -23
  104. etlplus/{config → workflow}/connector.py +58 -44
  105. etlplus/workflow/dag.py +105 -0
  106. etlplus/{config → workflow}/jobs.py +105 -32
  107. etlplus/{config → workflow}/pipeline.py +59 -51
  108. etlplus/{config → workflow}/profile.py +8 -5
  109. etlplus/workflow/types.py +115 -0
  110. {etlplus-0.9.1.dist-info → etlplus-0.9.2.dist-info}/METADATA +210 -17
  111. etlplus-0.9.2.dist-info/RECORD +134 -0
  112. {etlplus-0.9.1.dist-info → etlplus-0.9.2.dist-info}/WHEEL +1 -1
  113. etlplus/config/types.py +0 -204
  114. etlplus/config/utils.py +0 -120
  115. etlplus/file.py +0 -657
  116. etlplus/validation/__init__.py +0 -44
  117. etlplus-0.9.1.dist-info/RECORD +0 -65
  118. {etlplus-0.9.1.dist-info → etlplus-0.9.2.dist-info}/entry_points.txt +0 -0
  119. {etlplus-0.9.1.dist-info → etlplus-0.9.2.dist-info}/licenses/LICENSE +0 -0
  120. {etlplus-0.9.1.dist-info → etlplus-0.9.2.dist-info}/top_level.txt +0 -0
etlplus/file/README.md ADDED
@@ -0,0 +1,105 @@
+ # `etlplus.file` Subpackage
+
+ Documentation for the `etlplus.file` subpackage: unified file format support and helpers for reading
+ and writing data files.
+
+ - Provides a consistent interface for reading and writing files in various formats
+ - Supports all formats defined in `FileFormat` (see below)
+ - Includes helpers for inferring file format and compression from filenames, extensions, or MIME
+   types
+ - Exposes a `File` class with instance methods for reading and writing data
+
+ Back to project overview: see the top-level [README](../../README.md).
+
+ - [`etlplus.file` Subpackage](#etlplusfile-subpackage)
+   - [Supported File Formats](#supported-file-formats)
+   - [Inferring File Format and Compression](#inferring-file-format-and-compression)
+   - [Reading and Writing Files](#reading-and-writing-files)
+     - [Reading a File](#reading-a-file)
+     - [Writing a File](#writing-a-file)
+   - [File Instance Methods](#file-instance-methods)
+   - [Example: Reading and Writing](#example-reading-and-writing)
+   - [See Also](#see-also)
+
+ ## Supported File Formats
+
+ The following formats are defined in `FileFormat` and supported for reading and writing:
+
+ | Format  | Description                             |
+ |---------|-----------------------------------------|
+ | avro    | Apache Avro binary serialization        |
+ | csv     | Comma-separated values text files       |
+ | feather | Apache Arrow Feather columnar format    |
+ | gz      | Gzip-compressed files (see Compression) |
+ | json    | Standard JSON files                     |
+ | ndjson  | Newline-delimited JSON (JSON Lines)     |
+ | orc     | Apache ORC columnar format              |
+ | parquet | Apache Parquet columnar format          |
+ | tsv     | Tab-separated values text files         |
+ | txt     | Plain text files                        |
+ | xls     | Microsoft Excel (legacy .xls)           |
+ | xlsx    | Microsoft Excel (modern .xlsx)          |
+ | xml     | XML files                               |
+ | yaml    | YAML files                              |
+ | zip     | ZIP-compressed files (see Compression)  |
+
+ Compression formats (gz, zip) are also supported as wrappers for other formats.
+
+ ## Inferring File Format and Compression
+
+ Use `infer_file_format_and_compression(value, filename=None)` to infer the file format and
+ compression from a filename, extension, or MIME type. Returns a tuple `(file_format,
+ compression_format)`.
+
+ ## Reading and Writing Files
+
+ The main entry point for file operations is the `File` class. To read or write files:
+
+ ### Reading a File
+
+ ```python
+ from etlplus.file import File
+
+ f = File("data/sample.csv")
+ data = f.read()
+ ```
+
+ - The `read()` method automatically detects the format and compression.
+ - Returns parsed data (e.g., list of dicts for tabular formats).
+
+ ### Writing a File
+
+ ```python
+ from etlplus.file import File
+
+ f = File("output.json")
+ f.write(data)
+ ```
+
+ - The `write()` method serializes and writes data in the appropriate format.
+ - Supports all formats listed above.
+
+ ## File Instance Methods
+
+ - `read()`: Reads and parses the file, returning structured data.
+ - `write(data)`: Writes structured data to the file in the detected format.
+
+ ## Example: Reading and Writing
+
+ ```python
+ from etlplus.file import File
+
+ # Read CSV
+ csv_file = File("data.csv")
+ rows = csv_file.read()
+
+ # Write JSON
+ json_file = File("output.json")
+ json_file.write(rows)
+ ```
+
+ ## See Also
+
+ - Top-level CLI and library usage in the main [README](../../README.md)
+ - File format enums in [enums.py](enums.py)
+ - Compression format enums in [enums.py](enums.py)
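A quick sketch of the inference helper documented in the README above. The printed results are illustrative assumptions; the concrete enum members returned come from `etlplus/file/enums.py`, which this diff adds but does not show in full:

```python
from etlplus.file import infer_file_format_and_compression

# Illustrative values only: the actual FileFormat/CompressionFormat
# members are defined in etlplus/file/enums.py (added in this release).
fmt, compression = infer_file_format_and_compression('events.csv.gz')
print(fmt, compression)   # expected: the csv format paired with gz compression

fmt, compression = infer_file_format_and_compression('report.xlsx')
print(fmt, compression)   # expected: the xlsx format, no compression wrapper
```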
etlplus/file/__init__.py ADDED
@@ -0,0 +1,25 @@
+ """
+ :mod:`etlplus.file` package.
+
+ Public file IO helpers.
+ """
+
+ from __future__ import annotations
+
+ from .core import File
+ from .enums import CompressionFormat
+ from .enums import FileFormat
+ from .enums import infer_file_format_and_compression
+
+ # SECTION: EXPORTS ========================================================== #
+
+
+ __all__ = [
+     # Class
+     'File',
+     # Enums
+     'CompressionFormat',
+     'FileFormat',
+     # Functions
+     'infer_file_format_and_compression',
+ ]
etlplus/file/_imports.py ADDED
@@ -0,0 +1,141 @@
+ """
+ :mod:`etlplus.file._imports` module.
+
+ Shared helpers for optional dependency imports.
+ """
+
+ from __future__ import annotations
+
+ from importlib import import_module
+ from typing import Any
+
+ # SECTION: INTERNAL CONSTANTS =============================================== #
+
+
+ _MODULE_CACHE: dict[str, Any] = {}
+
+
+ # SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+ def _error_message(
+     module_name: str,
+     format_name: str,
+ ) -> str:
+     """
+     Build an import error message for an optional dependency.
+
+     Parameters
+     ----------
+     module_name : str
+         Module name to look up.
+     format_name : str
+         Human-readable format name for templated messages.
+
+     Returns
+     -------
+     str
+         Formatted error message.
+     """
+     return (
+         f'{format_name} support requires '
+         f'optional dependency "{module_name}".\n'
+         f'Install with: pip install {module_name}'
+     )
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def get_optional_module(
+     module_name: str,
+     *,
+     error_message: str,
+ ) -> Any:
+     """
+     Return an optional dependency module, caching on first import.
+
+     Parameters
+     ----------
+     module_name : str
+         Name of the module to import.
+     error_message : str
+         Error message to surface when the module is missing.
+
+     Returns
+     -------
+     Any
+         The imported module.
+
+     Raises
+     ------
+     ImportError
+         If the optional dependency is missing.
+     """
+     cached = _MODULE_CACHE.get(module_name)
+     if cached is not None:  # pragma: no cover - tiny branch
+         return cached
+     try:
+         module = import_module(module_name)
+     except ImportError as e:  # pragma: no cover
+         raise ImportError(error_message) from e
+     _MODULE_CACHE[module_name] = module
+     return module
+
+
+ def get_fastavro() -> Any:
+     """
+     Return the fastavro module, importing it on first use.
+
+     Raises an informative ImportError if the optional dependency is missing.
+
+     Notes
+     -----
+     Prefer :func:`get_optional_module` for new call sites.
+     """
+     return get_optional_module(
+         'fastavro',
+         error_message=_error_message('fastavro', format_name='AVRO'),
+     )
+
+
+ def get_pandas(
+     format_name: str,
+ ) -> Any:
+     """
+     Return the pandas module, importing it on first use.
+
+     Parameters
+     ----------
+     format_name : str
+         Human-readable format name for error messages.
+
+     Returns
+     -------
+     Any
+         The pandas module.
+
+     Notes
+     -----
+     Prefer :func:`get_optional_module` for new call sites.
+     """
+     return get_optional_module(
+         'pandas',
+         error_message=_error_message('pandas', format_name=format_name),
+     )
+
+
+ def get_yaml() -> Any:
+     """
+     Return the PyYAML module, importing it on first use.
+
+     Raises an informative ImportError if the optional dependency is missing.
+
+     Notes
+     -----
+     Prefer :func:`get_optional_module` for new call sites.
+     """
+     return get_optional_module(
+         'yaml',
+         error_message=_error_message('PyYAML', format_name='YAML'),
+     )
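A usage sketch for the helpers above. The first call imports the module and stores it in `_MODULE_CACHE`; later lookups, including the `get_yaml()` wrapper, resolve to the same cached object. The happy path shown assumes PyYAML is installed:

```python
from etlplus.file._imports import get_optional_module, get_yaml

# First use imports and caches the module; later lookups are cache hits.
yaml = get_optional_module(
    'yaml',
    error_message=(
        'YAML support requires optional dependency "PyYAML".\n'
        'Install with: pip install PyYAML'
    ),
)
print(yaml.safe_load('key: value'))   # {'key': 'value'}

# The wrapper resolves through the same cache, so the objects are identical.
assert get_yaml() is yaml
```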
etlplus/file/_io.py ADDED
@@ -0,0 +1,160 @@
+ """
+ :mod:`etlplus.file._io` module.
+
+ Shared helpers for record normalization and delimited text formats.
+ """
+
+ from __future__ import annotations
+
+ import csv
+ from pathlib import Path
+ from typing import Any
+ from typing import cast
+
+ from ..types import JSONData
+ from ..types import JSONDict
+ from ..types import JSONList
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def coerce_record_payload(
+     payload: Any,
+     *,
+     format_name: str,
+ ) -> JSONData:
+     """
+     Validate that ``payload`` is an object or list of objects.
+
+     Parameters
+     ----------
+     payload : Any
+         Parsed payload to validate.
+     format_name : str
+         Human-readable format name for error messages.
+
+     Returns
+     -------
+     JSONData
+         ``payload`` when it is a dict or a list of dicts.
+
+     Raises
+     ------
+     TypeError
+         If the payload is not a dict or list of dicts.
+     """
+     if isinstance(payload, dict):
+         return cast(JSONDict, payload)
+     if isinstance(payload, list):
+         if all(isinstance(item, dict) for item in payload):
+             return cast(JSONList, payload)
+         raise TypeError(
+             f'{format_name} array must contain only objects (dicts)',
+         )
+     raise TypeError(
+         f'{format_name} root must be an object or an array of objects',
+     )
+
+
+ def normalize_records(
+     data: JSONData,
+     format_name: str,
+ ) -> JSONList:
+     """
+     Normalize payloads into a list of dictionaries.
+
+     Parameters
+     ----------
+     data : JSONData
+         Input payload to normalize.
+     format_name : str
+         Human-readable format name for error messages.
+
+     Returns
+     -------
+     JSONList
+         Normalized list of dictionaries.
+
+     Raises
+     ------
+     TypeError
+         If a list payload contains non-dict items.
+     """
+     if isinstance(data, list):
+         if not all(isinstance(item, dict) for item in data):
+             raise TypeError(
+                 f'{format_name} payloads must contain only objects (dicts)',
+             )
+         return cast(JSONList, data)
+     return [cast(JSONDict, data)]
+
+
+ def read_delimited(path: Path, *, delimiter: str) -> JSONList:
+     """
+     Read delimited content from ``path``.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the delimited file on disk.
+     delimiter : str
+         Delimiter character for parsing.
+
+     Returns
+     -------
+     JSONList
+         The list of dictionaries read from the delimited file.
+     """
+     with path.open('r', encoding='utf-8', newline='') as handle:
+         reader: csv.DictReader[str] = csv.DictReader(
+             handle,
+             delimiter=delimiter,
+         )
+         rows: JSONList = []
+         for row in reader:
+             if not any(row.values()):
+                 continue
+             rows.append(cast(JSONDict, dict(row)))
+     return rows
+
+
+ def write_delimited(path: Path, data: JSONData, *, delimiter: str) -> int:
+     """
+     Write ``data`` to a delimited file and return record count.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the delimited file on disk.
+     data : JSONData
+         Data to write as delimited rows.
+     delimiter : str
+         Delimiter character for writing.
+
+     Returns
+     -------
+     int
+         The number of rows written.
+     """
+     rows: list[JSONDict]
+     if isinstance(data, list):
+         rows = [row for row in data if isinstance(row, dict)]
+     else:
+         rows = [data]
+
+     if not rows:
+         return 0
+
+     fieldnames = sorted({key for row in rows for key in row})
+     path.parent.mkdir(parents=True, exist_ok=True)
+     with path.open('w', encoding='utf-8', newline='') as handle:
+         writer = csv.DictWriter(
+             handle,
+             fieldnames=fieldnames,
+             delimiter=delimiter,
+         )
+         writer.writeheader()
+         for row in rows:
+             writer.writerow({field: row.get(field) for field in fieldnames})
+
+     return len(rows)
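A round-trip sketch for the delimited helpers above, assuming a writable working directory. `write_delimited` builds its header from the sorted union of keys across rows, so ragged records serialize with blank cells, and `csv.DictReader` hands every value back as a string:

```python
from pathlib import Path

from etlplus.file._io import normalize_records, read_delimited, write_delimited

rows = normalize_records({'id': 1, 'name': 'ada'}, 'PSV')  # dict -> [dict]
rows.append({'id': 2, 'city': 'london'})                   # ragged record

count = write_delimited(Path('people.psv'), rows, delimiter='|')
print(count)  # 2; the header row is the sorted key union: city|id|name

# Values come back as strings, and missing cells read as ''.
print(read_delimited(Path('people.psv'), delimiter='|'))
```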
etlplus/file/accdb.py ADDED
@@ -0,0 +1,78 @@
+ """
+ :mod:`etlplus.file.accdb` module.
+
+ Helpers for reading/writing newer Microsoft Access database (ACCDB) files.
+
+ Notes
+ -----
+ - An ACCDB file is a proprietary database file format used by Microsoft Access
+   2007 and later.
+ - Common cases:
+     - Storing relational data for small to medium-sized applications.
+     - Desktop database applications.
+     - Data management for non-enterprise solutions.
+ - Rule of thumb:
+     - If the file follows the ACCDB specification, use this module for reading
+       and writing.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ from ..types import JSONData
+ from ..types import JSONList
+ from . import stub
+
+ # SECTION: EXPORTS ========================================================== #
+
+
+ __all__ = [
+     'read',
+     'write',
+ ]
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def read(
+     path: Path,
+ ) -> JSONList:
+     """
+     Read ACCDB content from ``path``.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the ACCDB file on disk.
+
+     Returns
+     -------
+     JSONList
+         The list of dictionaries read from the ACCDB file.
+     """
+     return stub.read(path, format_name='ACCDB')
+
+
+ def write(
+     path: Path,
+     data: JSONData,
+ ) -> int:
+     """
+     Write ``data`` to ACCDB at ``path`` and return record count.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the ACCDB file on disk.
+     data : JSONData
+         Data to write as ACCDB. Should be a list of dictionaries or a single
+         dictionary.
+
+     Returns
+     -------
+     int
+         The number of rows written to the ACCDB file.
+     """
+     return stub.write(path, data, format_name='ACCDB')
etlplus/file/arrow.py ADDED
@@ -0,0 +1,78 @@
+ """
+ :mod:`etlplus.file.arrow` module.
+
+ Helpers for reading/writing Apache Arrow (ARROW) files.
+
+ Notes
+ -----
+ - An ARROW file is a binary file format designed for efficient
+   columnar data storage and processing.
+ - Common cases:
+     - High-performance data analytics.
+     - Interoperability between different data processing systems.
+     - In-memory data representation for fast computations.
+ - Rule of thumb:
+     - If the file follows the Apache Arrow specification, use this module for
+       reading and writing.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ from ..types import JSONData
+ from ..types import JSONList
+ from . import stub
+
+ # SECTION: EXPORTS ========================================================== #
+
+
+ __all__ = [
+     'read',
+     'write',
+ ]
+
+
+ # SECTION: FUNCTIONS ======================================================== #
+
+
+ def read(
+     path: Path,
+ ) -> JSONList:
+     """
+     Read ARROW content from ``path``.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the Apache Arrow file on disk.
+
+     Returns
+     -------
+     JSONList
+         The list of dictionaries read from the Apache Arrow file.
+     """
+     return stub.read(path, format_name='ARROW')
+
+
+ def write(
+     path: Path,
+     data: JSONData,
+ ) -> int:
+     """
+     Write ``data`` to ARROW at ``path`` and return record count.
+
+     Parameters
+     ----------
+     path : Path
+         Path to the ARROW file on disk.
+     data : JSONData
+         Data to write as ARROW. Should be a list of dictionaries or a
+         single dictionary.
+
+     Returns
+     -------
+     int
+         The number of rows written to the ARROW file.
+     """
+     return stub.write(path, data, format_name='ARROW')
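Both modules above, like the other per-format modules added in this release, expose the same two-function surface (`read(path) -> JSONList`, `write(path, data) -> int`) and delegate to `etlplus.file.stub`, which this diff adds but does not show. A shape sketch only; whether these calls parse the file or raise is determined by `stub.py`:

```python
from pathlib import Path

from etlplus.file import accdb, arrow

# Uniform surface across format modules; the concrete behavior is whatever
# stub.read/stub.write implement for these formats (stub.py not shown here).
rows = arrow.read(Path('events.arrow'))
arrow.write(Path('events_copy.arrow'), rows)
accdb.write(Path('events_copy.accdb'), rows)
```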