etlplus 0.9.2__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. etlplus/__init__.py +26 -1
  2. etlplus/api/README.md +3 -51
  3. etlplus/api/__init__.py +0 -10
  4. etlplus/api/config.py +28 -39
  5. etlplus/api/endpoint_client.py +3 -3
  6. etlplus/api/pagination/client.py +1 -1
  7. etlplus/api/rate_limiting/config.py +1 -13
  8. etlplus/api/rate_limiting/rate_limiter.py +11 -8
  9. etlplus/api/request_manager.py +6 -11
  10. etlplus/api/transport.py +2 -14
  11. etlplus/api/types.py +6 -96
  12. etlplus/cli/commands.py +43 -76
  13. etlplus/cli/constants.py +1 -1
  14. etlplus/cli/handlers.py +12 -40
  15. etlplus/cli/io.py +2 -2
  16. etlplus/cli/main.py +1 -1
  17. etlplus/cli/state.py +7 -4
  18. etlplus/{workflow → config}/__init__.py +23 -10
  19. etlplus/{workflow → config}/connector.py +44 -58
  20. etlplus/{workflow → config}/jobs.py +32 -105
  21. etlplus/{workflow → config}/pipeline.py +51 -59
  22. etlplus/{workflow → config}/profile.py +5 -8
  23. etlplus/config/types.py +204 -0
  24. etlplus/config/utils.py +120 -0
  25. etlplus/database/ddl.py +1 -1
  26. etlplus/database/engine.py +3 -19
  27. etlplus/database/orm.py +0 -2
  28. etlplus/database/schema.py +1 -1
  29. etlplus/enums.py +266 -0
  30. etlplus/{ops/extract.py → extract.py} +99 -81
  31. etlplus/file.py +652 -0
  32. etlplus/{ops/load.py → load.py} +101 -78
  33. etlplus/{ops/run.py → run.py} +127 -159
  34. etlplus/{api/utils.py → run_helpers.py} +153 -209
  35. etlplus/{ops/transform.py → transform.py} +68 -75
  36. etlplus/types.py +4 -5
  37. etlplus/utils.py +2 -136
  38. etlplus/{ops/validate.py → validate.py} +12 -22
  39. etlplus/validation/__init__.py +44 -0
  40. etlplus/{ops → validation}/utils.py +17 -53
  41. {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/METADATA +17 -210
  42. etlplus-0.10.1.dist-info/RECORD +65 -0
  43. {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/WHEEL +1 -1
  44. etlplus/README.md +0 -37
  45. etlplus/api/enums.py +0 -51
  46. etlplus/cli/README.md +0 -40
  47. etlplus/database/README.md +0 -48
  48. etlplus/file/README.md +0 -105
  49. etlplus/file/__init__.py +0 -25
  50. etlplus/file/_imports.py +0 -141
  51. etlplus/file/_io.py +0 -160
  52. etlplus/file/accdb.py +0 -78
  53. etlplus/file/arrow.py +0 -78
  54. etlplus/file/avro.py +0 -176
  55. etlplus/file/bson.py +0 -77
  56. etlplus/file/cbor.py +0 -78
  57. etlplus/file/cfg.py +0 -79
  58. etlplus/file/conf.py +0 -80
  59. etlplus/file/core.py +0 -322
  60. etlplus/file/csv.py +0 -79
  61. etlplus/file/dat.py +0 -78
  62. etlplus/file/dta.py +0 -77
  63. etlplus/file/duckdb.py +0 -78
  64. etlplus/file/enums.py +0 -343
  65. etlplus/file/feather.py +0 -111
  66. etlplus/file/fwf.py +0 -77
  67. etlplus/file/gz.py +0 -123
  68. etlplus/file/hbs.py +0 -78
  69. etlplus/file/hdf5.py +0 -78
  70. etlplus/file/ini.py +0 -79
  71. etlplus/file/ion.py +0 -78
  72. etlplus/file/jinja2.py +0 -78
  73. etlplus/file/json.py +0 -98
  74. etlplus/file/log.py +0 -78
  75. etlplus/file/mat.py +0 -78
  76. etlplus/file/mdb.py +0 -78
  77. etlplus/file/msgpack.py +0 -78
  78. etlplus/file/mustache.py +0 -78
  79. etlplus/file/nc.py +0 -78
  80. etlplus/file/ndjson.py +0 -108
  81. etlplus/file/numbers.py +0 -75
  82. etlplus/file/ods.py +0 -79
  83. etlplus/file/orc.py +0 -111
  84. etlplus/file/parquet.py +0 -113
  85. etlplus/file/pb.py +0 -78
  86. etlplus/file/pbf.py +0 -77
  87. etlplus/file/properties.py +0 -78
  88. etlplus/file/proto.py +0 -77
  89. etlplus/file/psv.py +0 -79
  90. etlplus/file/rda.py +0 -78
  91. etlplus/file/rds.py +0 -78
  92. etlplus/file/sas7bdat.py +0 -78
  93. etlplus/file/sav.py +0 -77
  94. etlplus/file/sqlite.py +0 -78
  95. etlplus/file/stub.py +0 -84
  96. etlplus/file/sylk.py +0 -77
  97. etlplus/file/tab.py +0 -81
  98. etlplus/file/toml.py +0 -78
  99. etlplus/file/tsv.py +0 -80
  100. etlplus/file/txt.py +0 -102
  101. etlplus/file/vm.py +0 -78
  102. etlplus/file/wks.py +0 -77
  103. etlplus/file/xls.py +0 -88
  104. etlplus/file/xlsm.py +0 -79
  105. etlplus/file/xlsx.py +0 -99
  106. etlplus/file/xml.py +0 -185
  107. etlplus/file/xpt.py +0 -78
  108. etlplus/file/yaml.py +0 -95
  109. etlplus/file/zip.py +0 -175
  110. etlplus/file/zsav.py +0 -77
  111. etlplus/ops/README.md +0 -50
  112. etlplus/ops/__init__.py +0 -61
  113. etlplus/templates/README.md +0 -46
  114. etlplus/workflow/README.md +0 -52
  115. etlplus/workflow/dag.py +0 -105
  116. etlplus/workflow/types.py +0 -115
  117. etlplus-0.9.2.dist-info/RECORD +0 -134
  118. {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/entry_points.txt +0 -0
  119. {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/licenses/LICENSE +0 -0
  120. {etlplus-0.9.2.dist-info → etlplus-0.10.1.dist-info}/top_level.txt +0 -0
etlplus/file/__init__.py DELETED
@@ -1,25 +0,0 @@
1
- """
2
- :mod:`etlplus.file` package.
3
-
4
- Public file IO helpers.
5
- """
6
-
7
- from __future__ import annotations
8
-
9
- from .core import File
10
- from .enums import CompressionFormat
11
- from .enums import FileFormat
12
- from .enums import infer_file_format_and_compression
13
-
14
- # SECTION: EXPORTS ========================================================== #
15
-
16
-
17
- __all__ = [
18
- # Class
19
- 'File',
20
- # Enums
21
- 'CompressionFormat',
22
- 'FileFormat',
23
- # Functions
24
- 'infer_file_format_and_compression',
25
- ]
etlplus/file/_imports.py DELETED
@@ -1,141 +0,0 @@
1
- """
2
- :mod:`etlplus.file._imports` module.
3
-
4
- Shared helpers for optional dependency imports.
5
- """
6
-
7
- from __future__ import annotations
8
-
9
- from importlib import import_module
10
- from typing import Any
11
-
12
- # SECTION: INTERNAL CONSTANTS =============================================== #
13
-
14
-
15
- _MODULE_CACHE: dict[str, Any] = {}
16
-
17
-
18
- # SECTION: INTERNAL FUNCTIONS =============================================== #
19
-
20
-
21
- def _error_message(
22
- module_name: str,
23
- format_name: str,
24
- ) -> str:
25
- """
26
- Build an import error message for an optional dependency.
27
-
28
- Parameters
29
- ----------
30
- module_name : str
31
- Module name to look up.
32
- format_name : str
33
- Human-readable format name for templated messages.
34
-
35
- Returns
36
- -------
37
- str
38
- Formatted error message.
39
- """
40
- return (
41
- f'{format_name} support requires '
42
- f'optional dependency "{module_name}".\n'
43
- f'Install with: pip install {module_name}'
44
- )
45
-
46
-
47
- # SECTION: FUNCTIONS ======================================================== #
48
-
49
-
50
- def get_optional_module(
51
- module_name: str,
52
- *,
53
- error_message: str,
54
- ) -> Any:
55
- """
56
- Return an optional dependency module, caching on first import.
57
-
58
- Parameters
59
- ----------
60
- module_name : str
61
- Name of the module to import.
62
- error_message : str
63
- Error message to surface when the module is missing.
64
-
65
- Returns
66
- -------
67
- Any
68
- The imported module.
69
-
70
- Raises
71
- ------
72
- ImportError
73
- If the optional dependency is missing.
74
- """
75
- cached = _MODULE_CACHE.get(module_name)
76
- if cached is not None: # pragma: no cover - tiny branch
77
- return cached
78
- try:
79
- module = import_module(module_name)
80
- except ImportError as e: # pragma: no cover
81
- raise ImportError(error_message) from e
82
- _MODULE_CACHE[module_name] = module
83
- return module
84
-
85
-
86
- def get_fastavro() -> Any:
87
- """
88
- Return the fastavro module, importing it on first use.
89
-
90
- Raises an informative ImportError if the optional dependency is missing.
91
-
92
- Notes
93
- -----
94
- Prefer :func:`get_optional_module` for new call sites.
95
- """
96
- return get_optional_module(
97
- 'fastavro',
98
- error_message=_error_message('fastavro', format_name='AVRO'),
99
- )
100
-
101
-
102
- def get_pandas(
103
- format_name: str,
104
- ) -> Any:
105
- """
106
- Return the pandas module, importing it on first use.
107
-
108
- Parameters
109
- ----------
110
- format_name : str
111
- Human-readable format name for error messages.
112
-
113
- Returns
114
- -------
115
- Any
116
- The pandas module.
117
-
118
- Notes
119
- -----
120
- Prefer :func:`get_optional_module` for new call sites.
121
- """
122
- return get_optional_module(
123
- 'pandas',
124
- error_message=_error_message('pandas', format_name=format_name),
125
- )
126
-
127
-
128
- def get_yaml() -> Any:
129
- """
130
- Return the PyYAML module, importing it on first use.
131
-
132
- Raises an informative ImportError if the optional dependency is missing.
133
-
134
- Notes
135
- -----
136
- Prefer :func:`get_optional_module` for new call sites.
137
- """
138
- return get_optional_module(
139
- 'yaml',
140
- error_message=_error_message('PyYAML', format_name='YAML'),
141
- )
etlplus/file/_io.py DELETED
@@ -1,160 +0,0 @@
1
- """
2
- :mod:`etlplus.file._io` module.
3
-
4
- Shared helpers for record normalization and delimited text formats.
5
- """
6
-
7
- from __future__ import annotations
8
-
9
- import csv
10
- from pathlib import Path
11
- from typing import Any
12
- from typing import cast
13
-
14
- from ..types import JSONData
15
- from ..types import JSONDict
16
- from ..types import JSONList
17
-
18
- # SECTION: FUNCTIONS ======================================================== #
19
-
20
-
21
- def coerce_record_payload(
22
- payload: Any,
23
- *,
24
- format_name: str,
25
- ) -> JSONData:
26
- """
27
- Validate that ``payload`` is an object or list of objects.
28
-
29
- Parameters
30
- ----------
31
- payload : Any
32
- Parsed payload to validate.
33
- format_name : str
34
- Human-readable format name for error messages.
35
-
36
- Returns
37
- -------
38
- JSONData
39
- ``payload`` when it is a dict or a list of dicts.
40
-
41
- Raises
42
- ------
43
- TypeError
44
- If the payload is not a dict or list of dicts.
45
- """
46
- if isinstance(payload, dict):
47
- return cast(JSONDict, payload)
48
- if isinstance(payload, list):
49
- if all(isinstance(item, dict) for item in payload):
50
- return cast(JSONList, payload)
51
- raise TypeError(
52
- f'{format_name} array must contain only objects (dicts)',
53
- )
54
- raise TypeError(
55
- f'{format_name} root must be an object or an array of objects',
56
- )
57
-
58
-
59
- def normalize_records(
60
- data: JSONData,
61
- format_name: str,
62
- ) -> JSONList:
63
- """
64
- Normalize payloads into a list of dictionaries.
65
-
66
- Parameters
67
- ----------
68
- data : JSONData
69
- Input payload to normalize.
70
- format_name : str
71
- Human-readable format name for error messages.
72
-
73
- Returns
74
- -------
75
- JSONList
76
- Normalized list of dictionaries.
77
-
78
- Raises
79
- ------
80
- TypeError
81
- If a list payload contains non-dict items.
82
- """
83
- if isinstance(data, list):
84
- if not all(isinstance(item, dict) for item in data):
85
- raise TypeError(
86
- f'{format_name} payloads must contain only objects (dicts)',
87
- )
88
- return cast(JSONList, data)
89
- return [cast(JSONDict, data)]
90
-
91
-
92
- def read_delimited(path: Path, *, delimiter: str) -> JSONList:
93
- """
94
- Read delimited content from ``path``.
95
-
96
- Parameters
97
- ----------
98
- path : Path
99
- Path to the delimited file on disk.
100
- delimiter : str
101
- Delimiter character for parsing.
102
-
103
- Returns
104
- -------
105
- JSONList
106
- The list of dictionaries read from the delimited file.
107
- """
108
- with path.open('r', encoding='utf-8', newline='') as handle:
109
- reader: csv.DictReader[str] = csv.DictReader(
110
- handle,
111
- delimiter=delimiter,
112
- )
113
- rows: JSONList = []
114
- for row in reader:
115
- if not any(row.values()):
116
- continue
117
- rows.append(cast(JSONDict, dict(row)))
118
- return rows
119
-
120
-
121
- def write_delimited(path: Path, data: JSONData, *, delimiter: str) -> int:
122
- """
123
- Write ``data`` to a delimited file and return record count.
124
-
125
- Parameters
126
- ----------
127
- path : Path
128
- Path to the delimited file on disk.
129
- data : JSONData
130
- Data to write as delimited rows.
131
- delimiter : str
132
- Delimiter character for writing.
133
-
134
- Returns
135
- -------
136
- int
137
- The number of rows written.
138
- """
139
- rows: list[JSONDict]
140
- if isinstance(data, list):
141
- rows = [row for row in data if isinstance(row, dict)]
142
- else:
143
- rows = [data]
144
-
145
- if not rows:
146
- return 0
147
-
148
- fieldnames = sorted({key for row in rows for key in row})
149
- path.parent.mkdir(parents=True, exist_ok=True)
150
- with path.open('w', encoding='utf-8', newline='') as handle:
151
- writer = csv.DictWriter(
152
- handle,
153
- fieldnames=fieldnames,
154
- delimiter=delimiter,
155
- )
156
- writer.writeheader()
157
- for row in rows:
158
- writer.writerow({field: row.get(field) for field in fieldnames})
159
-
160
- return len(rows)
etlplus/file/accdb.py DELETED
@@ -1,78 +0,0 @@
1
- """
2
- :mod:`etlplus.file.accdb` module.
3
-
4
- Helpers for reading/writing newer Microsoft Access database (ACCDB) files.
5
-
6
- Notes
7
- -----
8
- - An ACCDB file is a proprietary database file format used by Microsoft Access
9
- 2007 and later.
10
- - Common cases:
11
- - Storing relational data for small to medium-sized applications.
12
- - Desktop database applications.
13
- - Data management for non-enterprise solutions.
14
- - Rule of thumb:
15
- - If the file follows the ACCDB specification, use this module for reading
16
- and writing.
17
- """
18
-
19
- from __future__ import annotations
20
-
21
- from pathlib import Path
22
-
23
- from ..types import JSONData
24
- from ..types import JSONList
25
- from . import stub
26
-
27
- # SECTION: EXPORTS ========================================================== #
28
-
29
-
30
- __all__ = [
31
- 'read',
32
- 'write',
33
- ]
34
-
35
-
36
- # SECTION: FUNCTIONS ======================================================== #
37
-
38
-
39
- def read(
40
- path: Path,
41
- ) -> JSONList:
42
- """
43
- Read ACCDB content from ``path``.
44
-
45
- Parameters
46
- ----------
47
- path : Path
48
- Path to the ACCDB file on disk.
49
-
50
- Returns
51
- -------
52
- JSONList
53
- The list of dictionaries read from the ACCDB file.
54
- """
55
- return stub.read(path, format_name='ACCDB')
56
-
57
-
58
- def write(
59
- path: Path,
60
- data: JSONData,
61
- ) -> int:
62
- """
63
- Write ``data`` to ACCDB at ``path`` and return record count.
64
-
65
- Parameters
66
- ----------
67
- path : Path
68
- Path to the ACCDB file on disk.
69
- data : JSONData
70
- Data to write as ACCDB. Should be a list of dictionaries or a single
71
- dictionary.
72
-
73
- Returns
74
- -------
75
- int
76
- The number of rows written to the ACCDB file.
77
- """
78
- return stub.write(path, data, format_name='ACCDB')
etlplus/file/arrow.py DELETED
@@ -1,78 +0,0 @@
1
- """
2
- :mod:`etlplus.file.arrow` module.
3
-
4
- Helpers for reading/writing Apache Arrow (ARROW) files.
5
-
6
- Notes
7
- -----
8
- - An ARROW file is a binary file format designed for efficient
9
- columnar data storage and processing.
10
- - Common cases:
11
- - High-performance data analytics.
12
- - Interoperability between different data processing systems.
13
- - In-memory data representation for fast computations.
14
- - Rule of thumb:
15
- - If the file follows the Apache Arrow specification, use this module for
16
- reading and writing.
17
- """
18
-
19
- from __future__ import annotations
20
-
21
- from pathlib import Path
22
-
23
- from ..types import JSONData
24
- from ..types import JSONList
25
- from . import stub
26
-
27
- # SECTION: EXPORTS ========================================================== #
28
-
29
-
30
- __all__ = [
31
- 'read',
32
- 'write',
33
- ]
34
-
35
-
36
- # SECTION: FUNCTIONS ======================================================== #
37
-
38
-
39
- def read(
40
- path: Path,
41
- ) -> JSONList:
42
- """
43
- Read ARROW content from ``path``.
44
-
45
- Parameters
46
- ----------
47
- path : Path
48
- Path to the Apache Arrow file on disk.
49
-
50
- Returns
51
- -------
52
- JSONList
53
- The list of dictionaries read from the Apache Arrow file.
54
- """
55
- return stub.read(path, format_name='ARROW')
56
-
57
-
58
- def write(
59
- path: Path,
60
- data: JSONData,
61
- ) -> int:
62
- """
63
- Write ``data`` to ARROW at ``path`` and return record count.
64
-
65
- Parameters
66
- ----------
67
- path : Path
68
- Path to the ARROW file on disk.
69
- data : JSONData
70
- Data to write as ARROW. Should be a list of dictionaries or a
71
- single dictionary.
72
-
73
- Returns
74
- -------
75
- int
76
- The number of rows written to the ARROW file.
77
- """
78
- return stub.write(path, data, format_name='ARROW')
etlplus/file/avro.py DELETED
@@ -1,176 +0,0 @@
1
- """
2
- :mod:`etlplus.file.avro` module.
3
-
4
- Helpers for reading/writing Apache Avro (AVRO) files.
5
-
6
- Notes
7
- -----
8
- - An AVRO file is a binary file format designed for efficient
9
- on-disk storage of data, with a schema definition.
10
- - Common cases:
11
- - Data serialization for distributed systems.
12
- - Interoperability between different programming languages.
13
- - Storage of large datasets with schema evolution support.
14
- - Rule of thumb:
15
- - If the file follows the Apache Avro specification, use this module for
16
- reading and writing.
17
- """
18
-
19
- from __future__ import annotations
20
-
21
- from pathlib import Path
22
- from typing import Any
23
- from typing import cast
24
-
25
- from etlplus.file._imports import get_fastavro
26
-
27
- from ..types import JSONData
28
- from ..types import JSONDict
29
- from ..types import JSONList
30
- from ._io import normalize_records
31
-
32
- # SECTION: EXPORTS ========================================================== #
33
-
34
-
35
- __all__ = [
36
- 'read',
37
- 'write',
38
- ]
39
-
40
-
41
- # SECTION: INTERNAL CONSTANTS =============================================== #
42
-
43
-
44
- _PRIMITIVE_TYPES: tuple[type, ...] = (
45
- bool,
46
- int,
47
- float,
48
- str,
49
- bytes,
50
- bytearray,
51
- )
52
-
53
-
54
- # SECTION: INTERNAL FUNCTIONS =============================================== #
55
-
56
-
57
- def _infer_schema(records: JSONList) -> dict[str, Any]:
58
- """
59
- Infer a basic Avro schema from record payloads.
60
-
61
- Only primitive field values are supported; complex values raise TypeError.
62
- """
63
- field_names = sorted({key for record in records for key in record})
64
- fields: list[dict[str, Any]] = []
65
- for name in field_names:
66
- types: list[str] = []
67
- for record in records:
68
- value = record.get(name)
69
- if value is None:
70
- types.append('null')
71
- continue
72
- if isinstance(value, dict | list):
73
- raise TypeError(
74
- 'AVRO payloads must contain only primitive values',
75
- )
76
- if not isinstance(value, _PRIMITIVE_TYPES):
77
- raise TypeError(
78
- 'AVRO payloads must contain only primitive values',
79
- )
80
- types.append(cast(str, _infer_value_type(value)))
81
- fields.append({'name': name, 'type': _merge_types(types)})
82
-
83
- return {
84
- 'name': 'etlplus_record',
85
- 'type': 'record',
86
- 'fields': fields,
87
- }
88
-
89
-
90
- def _infer_value_type(value: object) -> str | list[str]:
91
- """
92
- Infer the Avro type for a primitive value.
93
-
94
- Raises TypeError for unsupported types.
95
- """
96
- if value is None:
97
- return 'null'
98
- if isinstance(value, bool):
99
- return 'boolean'
100
- if isinstance(value, int):
101
- return 'long'
102
- if isinstance(value, float):
103
- return 'double'
104
- if isinstance(value, str):
105
- return 'string'
106
- if isinstance(value, (bytes, bytearray)):
107
- return 'bytes'
108
- raise TypeError('AVRO payloads must contain only primitive values')
109
-
110
-
111
- def _merge_types(types: list[str]) -> str | list[str]:
112
- """Return a stable Avro type union for a list of types."""
113
- unique = list(dict.fromkeys(types))
114
- if len(unique) == 1:
115
- return unique[0]
116
- ordered = ['null'] + sorted(t for t in unique if t != 'null')
117
- return ordered
118
-
119
-
120
- # SECTION: FUNCTIONS ======================================================== #
121
-
122
-
123
- def read(
124
- path: Path,
125
- ) -> JSONList:
126
- """
127
- Read AVRO content from ``path``.
128
-
129
- Parameters
130
- ----------
131
- path : Path
132
- Path to the AVRO file on disk.
133
-
134
- Returns
135
- -------
136
- JSONList
137
- The list of dictionaries read from the AVRO file.
138
- """
139
- fastavro = get_fastavro()
140
- with path.open('rb') as handle:
141
- reader = fastavro.reader(handle)
142
- return [cast(JSONDict, record) for record in reader]
143
-
144
-
145
- def write(
146
- path: Path,
147
- data: JSONData,
148
- ) -> int:
149
- """
150
- Write ``data`` to AVRO at ``path`` and return record count.
151
-
152
- Parameters
153
- ----------
154
- path : Path
155
- Path to the AVRO file on disk.
156
- data : JSONData
157
- Data to write.
158
-
159
- Returns
160
- -------
161
- int
162
- Number of records written.
163
- """
164
- records = normalize_records(data, 'AVRO')
165
- if not records:
166
- return 0
167
-
168
- fastavro = get_fastavro()
169
- schema = _infer_schema(records)
170
- parsed_schema = fastavro.parse_schema(schema)
171
-
172
- path.parent.mkdir(parents=True, exist_ok=True)
173
- with path.open('wb') as handle:
174
- fastavro.writer(handle, parsed_schema, records)
175
-
176
- return len(records)