etlplus 0.11.11__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
etlplus/README.md ADDED
@@ -0,0 +1,37 @@
1
+ # etlplus package
2
+
3
+ The `etlplus` package provides a unified Python API and CLI for ETL operations: extraction,
4
+ validation, transformation, and loading of data from files, APIs, and databases.
5
+
6
+ - Top-level entry points for extract, validate, transform, and load
7
+ - Utilities for pipeline orchestration and helpers
8
+ - Exposes all subpackages for advanced usage
9
+
10
+ Back to project overview: see the top-level [README](../README.md).
11
+
12
+ ## Subpackages
13
+
14
+ - [etlplus.api](api/README.md): Lightweight HTTP client and paginated REST helpers
15
+ - [etlplus.file](file/README.md): Unified file format support and helpers
16
+ - [etlplus.config](config/README.md): Configuration helpers for connectors, pipelines, jobs, and
17
+ profiles
18
+ - [etlplus.cli](cli/README.md): Command-line interface for ETLPlus workflows
19
+ - [etlplus.database](database/README.md): Database engine, schema, and ORM helpers
20
+ - [etlplus.templates](templates/README.md): SQL and DDL template helpers
21
+ - [etlplus.validation](validation/README.md): Data validation utilities and helpers
22
+
23
+ ## Quickstart
24
+
25
+ ```python
26
+ from etlplus import extract, validate, transform, load
27
+
28
+ data = extract("file", "input.csv")
29
+ filtered = transform(data, {"filter": {"field": "age", "op": "gt", "value": 25}})
30
+ assert validate(filtered, {"age": {"type": "number", "min": 0}})["valid"]
31
+ load(filtered, "file", "output.json", file_format="json")
32
+ ```
33
+
34
+ ## See Also
35
+
36
+ - [Top-level project README](../README.md)
37
+ - [API reference](../docs/README.md)
etlplus/api/README.md CHANGED
@@ -1,7 +1,7 @@
1
- # etlplus.api module.
1
+ # etlplus.api subpackage
2
2
 
3
- Focused documentation for the `etlplus.api` subpackage: a lightweight HTTP client and helpers for
4
- paginated REST endpoints.
3
+ Documentation for the `etlplus.api` subpackage: a lightweight HTTP client and helpers for paginated
4
+ REST endpoints.
5
5
 
6
6
  - Provides a small `EndpointClient` for calling JSON APIs
7
7
  - Supports page-, offset-, and cursor-based pagination via `PaginationConfig`
@@ -12,6 +12,20 @@ paginated REST endpoints.
12
12
 
13
13
  Back to project overview: see the top-level [README](../../README.md).
14
14
 
15
+ - [etlplus.api subpackage](#etlplusapi-subpackage)
16
+ - [Installation](#installation)
17
+ - [Quickstart](#quickstart)
18
+ - [Overriding Rate Limits Per Call](#overriding-rate-limits-per-call)
19
+ - [Choosing `records_path` and `cursor_path`](#choosing-records_path-and-cursor_path)
20
+ - [Cursor-Based Pagination Example](#cursor-based-pagination-example)
21
+ - [Offset-based pagination example](#offset-based-pagination-example)
22
+ - [Authentication](#authentication)
23
+ - [Errors and Rate Limiting](#errors-and-rate-limiting)
24
+ - [Types and Transport](#types-and-transport)
25
+ - [Supporting Modules](#supporting-modules)
26
+ - [Minimal Contract](#minimal-contract)
27
+ - [See also](#see-also)
28
+
15
29
  ## Installation
16
30
 
17
31
  `etlplus.api` ships as part of the `etlplus` package. Install the package as usual:
@@ -233,3 +247,6 @@ providers can fall back to their own defaults. If you already possess a static t
233
247
  ## See also
234
248
 
235
249
  - Top-level CLI and library usage in the main [README](../../README.md)
250
+
251
+
252
+ [def]: #installation
etlplus/cli/README.md ADDED
@@ -0,0 +1,40 @@
1
+ # etlplus.cli subpackage
2
+
3
+ Documentation for the `etlplus.cli` subpackage: command-line interface for ETLPlus workflows.
4
+
5
+ - Provides a CLI for running ETL pipelines, jobs, and utilities
6
+ - Supports commands for running, validating, and inspecting pipelines
7
+ - Includes options for configuration, state, and output control
8
+ - Exposes handlers for custom command integration
9
+
10
+ Back to project overview: see the top-level [README](../../README.md).
11
+
12
+ - [etlplus.cli subpackage](#etlpluscli-subpackage)
13
+ - [Available Commands](#available-commands)
14
+ - [Command Options](#command-options)
15
+ - [Example: Running a Pipeline](#example-running-a-pipeline)
16
+ - [See Also](#see-also)
17
+
18
+ ## Available Commands
19
+
20
+ - **run**: Execute a pipeline or job
21
+ - **validate**: Validate pipeline or config files
22
+ - **inspect**: Show pipeline/job details
23
+
24
+ ## Command Options
25
+
26
+ - `--config`: Path to config file
27
+ - `--state`: Path to state file
28
+ - `--output`: Output file or format
29
+
30
+ ## Example: Running a Pipeline
31
+
32
+ ```bash
33
+ etlplus run --config configs/pipeline.yml --output results.json
34
+ ```
35
+
36
+ ## See Also
37
+
38
+ - Top-level CLI and library usage in the main [README](../../README.md)
39
+ - Command handlers in [handlers.py](handlers.py)
40
+ - Command options in [options.py](options.py)
etlplus/config/README.md ADDED
@@ -0,0 +1,52 @@
1
+ # etlplus.config subpackage
2
+
3
+ Documentation for the `etlplus.config` subpackage: configuration helpers for connectors, pipelines,
4
+ jobs, and profiles.
5
+
6
+ - Provides classes and utilities for managing ETL pipeline configuration
7
+ - Supports YAML/JSON config loading and validation
8
+ - Includes helpers for connectors, jobs, pipelines, and profiles
9
+ - Exposes type definitions for config schemas
10
+
11
+ Back to project overview: see the top-level [README](../../README.md).
12
+
13
+ - [etlplus.config subpackage](#etlplusconfig-subpackage)
14
+ - [Supported Configuration Types](#supported-configuration-types)
15
+ - [Loading and Validating Configs](#loading-and-validating-configs)
16
+ - [Example: Loading a Pipeline Config](#example-loading-a-pipeline-config)
17
+ - [See Also](#see-also)
18
+
19
+ ## Supported Configuration Types
20
+
21
+ - **Connector**: Connection details for databases, files, or APIs
22
+ - **Job**: ETL job definitions and scheduling
23
+ - **Pipeline**: End-to-end pipeline configuration
24
+ - **Profile**: User or environment-specific settings
25
+
26
+ ## Loading and Validating Configs
27
+
28
+ Use the provided classes to load and validate configuration files:
29
+
30
+ ```python
31
+ from etlplus.config import PipelineConfig
32
+
33
+ cfg = PipelineConfig.from_yaml("pipeline.yml")
34
+ ```
35
+
36
+ - Supports YAML and JSON formats
37
+ - Validates against expected schema
38
+
39
+ ## Example: Loading a Pipeline Config
40
+
41
+ ```python
42
+ from etlplus.config import PipelineConfig
43
+
44
+ pipeline = PipelineConfig.from_yaml("configs/pipeline.yml")
45
+ print(pipeline)
46
+ ```
47
+
48
+ ## See Also
49
+
50
+ - Top-level CLI and library usage in the main [README](../../README.md)
51
+ - Config type definitions in [types.py](types.py)
52
+ - Config utilities in [utils.py](utils.py)
etlplus/database/README.md ADDED
@@ -0,0 +1,48 @@
1
+ # etlplus.database subpackage
2
+
3
+ Documentation for the `etlplus.database` subpackage: database engine, schema, and ORM helpers.
4
+
5
+ - Provides database engine and connection management
6
+ - Supports schema definition and DDL generation
7
+ - Includes lightweight ORM utilities for tabular data
8
+ - Exposes type definitions for database objects
9
+
10
+ Back to project overview: see the top-level [README](../../README.md).
11
+
12
+ - [etlplus.database subpackage](#etlplusdatabase-subpackage)
13
+ - [Database Engine and Connections](#database-engine-and-connections)
14
+ - [Schema and DDL Helpers](#schema-and-ddl-helpers)
15
+ - [ORM Utilities](#orm-utilities)
16
+ - [Example: Creating a Table](#example-creating-a-table)
17
+ - [See Also](#see-also)
18
+
19
+ ## Database Engine and Connections
20
+
21
+ - Manage connections to supported databases
22
+ - Configure engines for different backends
23
+
24
+ ## Schema and DDL Helpers
25
+
26
+ - Define table schemas and columns
27
+ - Generate DDL statements for supported databases
28
+
29
+ ## ORM Utilities
30
+
31
+ - Map rows to Python objects
32
+ - Simple CRUD helpers for tabular data
33
+
34
+ ## Example: Creating a Table
35
+
36
+ ```python
37
+ from etlplus.database import Schema, Engine
38
+
39
+ engine = Engine.connect("sqlite:///example.db")
40
+ schema = Schema.from_dict({"name": "users", "columns": [ ... ]})
41
+ engine.create_table(schema)
42
+ ```
43
+
44
+ ## See Also
45
+
46
+ - Top-level CLI and library usage in the main [README](../../README.md)
47
+ - Schema helpers in [schema.py](schema.py)
48
+ - ORM utilities in [orm.py](orm.py)
etlplus/file/README.md ADDED
@@ -0,0 +1,105 @@
1
+ # etlplus.file subpackage
2
+
3
+ Documentation for the `etlplus.file` subpackage: unified file format support and helpers for reading
4
+ and writing data files.
5
+
6
+ - Provides a consistent interface for reading and writing files in various formats
7
+ - Supports all formats defined in `FileFormat` (see below)
8
+ - Includes helpers for inferring file format and compression from filenames, extensions, or MIME
9
+ types
10
+ - Exposes a `File` class with instance methods for reading and writing data
11
+
12
+ Back to project overview: see the top-level [README](../../README.md).
13
+
14
+ - [etlplus.file subpackage](#etlplusfile-subpackage)
15
+ - [Supported File Formats](#supported-file-formats)
16
+ - [Inferring File Format and Compression](#inferring-file-format-and-compression)
17
+ - [Reading and Writing Files](#reading-and-writing-files)
18
+ - [Reading a File](#reading-a-file)
19
+ - [Writing a File](#writing-a-file)
20
+ - [File Instance Methods](#file-instance-methods)
21
+ - [Example: Reading and Writing](#example-reading-and-writing)
22
+ - [See Also](#see-also)
23
+
24
+ ## Supported File Formats
25
+
26
+ The following formats are defined in `FileFormat` and supported for reading and writing:
27
+
28
+ | Format | Description |
29
+ |-----------|---------------------------------------------|
30
+ | avro | Apache Avro binary serialization |
31
+ | csv | Comma-separated values text files |
32
+ | feather | Apache Arrow Feather columnar format |
33
+ | gz | Gzip-compressed files (see Compression) |
34
+ | json | Standard JSON files |
35
+ | ndjson | Newline-delimited JSON (JSON Lines) |
36
+ | orc | Apache ORC columnar format |
37
+ | parquet | Apache Parquet columnar format |
38
+ | tsv | Tab-separated values text files |
39
+ | txt | Plain text files |
40
+ | xls | Microsoft Excel (legacy .xls) |
41
+ | xlsx | Microsoft Excel (modern .xlsx) |
42
+ | zip | ZIP-compressed files (see Compression) |
43
+ | xml | XML files |
44
+ | yaml | YAML files |
45
+
46
+ Compression formats (gz, zip) are also supported as wrappers for other formats.
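As a quick illustration, a compressed file can be read through the same `File` interface described below; the sketch follows the behaviour stated in this README, and the filename-to-format mapping shown is an assumption rather than quoted package output.

```python
from etlplus.file import File

# A gzip-wrapped CSV: format and compression are expected to be inferred
# from the ".csv.gz" extension (see the table above).
rows = File("data/events.csv.gz").read()
```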
47
+
48
+ ## Inferring File Format and Compression
49
+
50
+ Use `infer_file_format_and_compression(value, filename=None)` to infer the file format and
51
+ compression from a filename, extension, or MIME type. Returns a tuple `(file_format,
52
+ compression_format)`.
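A minimal usage sketch (the import location and the concrete return values are assumptions; only the signature above comes from this README):

```python
from etlplus.file import infer_file_format_and_compression

# Expected to report CSV content wrapped in gzip compression for this name;
# the exact values depend on the FileFormat and compression enums.
file_format, compression_format = infer_file_format_and_compression("report.csv.gz")
print(file_format, compression_format)
```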
53
+
54
+ ## Reading and Writing Files
55
+
56
+ The main entry point for file operations is the `File` class. To read or write files:
57
+
58
+ ### Reading a File
59
+
60
+ ```python
61
+ from etlplus.file import File
62
+
63
+ f = File("data/sample.csv")
64
+ data = f.read()
65
+ ```
66
+
67
+ - The `read()` method automatically detects the format and compression.
68
+ - Returns parsed data (e.g., list of dicts for tabular formats).
69
+
70
+ ### Writing a File
71
+
72
+ ```python
73
+ from etlplus.file import File
74
+
75
+ f = File("output.json")
76
+ f.write(data)
77
+ ```
78
+
79
+ - The `write()` method serializes and writes data in the appropriate format.
80
+ - Supports all formats listed above.
81
+
82
+ ## File Instance Methods
83
+
84
+ - `read()`: Reads and parses the file, returning structured data.
85
+ - `write(data)`: Writes structured data to the file in the detected format.
86
+
87
+ ## Example: Reading and Writing
88
+
89
+ ```python
90
+ from etlplus.file import File
91
+
92
+ # Read CSV
93
+ csv_file = File("data.csv")
94
+ rows = csv_file.read()
95
+
96
+ # Write JSON
97
+ json_file = File("output.json")
98
+ json_file.write(rows)
99
+ ```
100
+
101
+ ## See Also
102
+
103
+ - Top-level CLI and library usage in the main [README](../../README.md)
104
+ - File format enums in [enums.py](enums.py)
105
+ - Compression format enums in [enums.py](enums.py)
etlplus/file/avro.py CHANGED
@@ -1,19 +1,150 @@
1
1
  """
2
2
  :mod:`etlplus.file.avro` module.
3
3
 
4
- Stub helpers for AVRO read/write.
4
+ Helpers for reading/writing Avro files.
5
5
  """
6
6
 
7
7
  from __future__ import annotations
8
8
 
9
9
  from pathlib import Path
10
+ from typing import Any
11
+ from typing import cast
10
12
 
11
13
  from ..types import JSONData
14
+ from ..types import JSONDict
15
+ from ..types import JSONList
12
16
 
13
17
  # SECTION: EXPORTS ========================================================== #
14
18
 
15
19
 
16
- def read(path: Path) -> JSONData:
20
+ __all__ = [
21
+ 'read',
22
+ 'write',
23
+ ]
24
+
25
+
26
+ # SECTION: INTERNAL CONSTANTS =============================================== #
27
+
28
+
29
+ _FASTAVRO_CACHE: dict[str, Any] = {}
30
+
31
+
32
+ _PRIMITIVE_TYPES: tuple[type, ...] = (
33
+ bool,
34
+ int,
35
+ float,
36
+ str,
37
+ bytes,
38
+ bytearray,
39
+ )
40
+
41
+
42
+ # SECTION: INTERNAL FUNCTIONS =============================================== #
43
+
44
+
45
+ def _get_fastavro() -> Any:
46
+ """
47
+ Return the fastavro module, importing it on first use.
48
+
49
+ Raises an informative ImportError if the optional dependency is missing.
50
+ """
51
+ mod = _FASTAVRO_CACHE.get('mod')
52
+ if mod is not None: # pragma: no cover - tiny branch
53
+ return mod
54
+ try:
55
+ _fastavro = __import__('fastavro') # type: ignore[assignment]
56
+ except ImportError as e: # pragma: no cover
57
+ raise ImportError(
58
+ 'AVRO support requires optional dependency "fastavro".\n'
59
+ 'Install with: pip install fastavro',
60
+ ) from e
61
+ _FASTAVRO_CACHE['mod'] = _fastavro
62
+
63
+ return _fastavro
64
+
65
+
66
+ def _normalize_records(data: JSONData) -> JSONList:
67
+ """
68
+ Normalize JSON payloads into a list of dictionaries.
69
+
70
+ Raises TypeError when payloads contain non-dict items.
71
+ """
72
+ if isinstance(data, list):
73
+ if not all(isinstance(item, dict) for item in data):
74
+ raise TypeError('AVRO payloads must contain only objects (dicts)')
75
+ return cast(JSONList, data)
76
+ return [cast(JSONDict, data)]
77
+
78
+
79
+ def _infer_value_type(value: object) -> str | list[str]:
80
+ """
81
+ Infer the Avro type for a primitive value.
82
+
83
+ Raises TypeError for unsupported types.
84
+ """
85
+ if value is None:
86
+ return 'null'
87
+ if isinstance(value, bool):
88
+ return 'boolean'
89
+ if isinstance(value, int):
90
+ return 'long'
91
+ if isinstance(value, float):
92
+ return 'double'
93
+ if isinstance(value, str):
94
+ return 'string'
95
+ if isinstance(value, (bytes, bytearray)):
96
+ return 'bytes'
97
+ raise TypeError('AVRO payloads must contain only primitive values')
98
+
99
+
100
+ def _merge_types(types: list[str]) -> str | list[str]:
101
+ """Return a stable Avro type union for a list of types."""
102
+ unique = list(dict.fromkeys(types))
103
+ if len(unique) == 1:
104
+ return unique[0]
105
+ ordered = ['null'] + sorted(t for t in unique if t != 'null')
106
+ return ordered
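# For illustration: _merge_types(['string', 'null', 'string']) returns ['null', 'string'],
# while a single repeated type such as ['long', 'long'] collapses to 'long'.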
107
+
108
+
109
+ def _infer_schema(records: JSONList) -> dict[str, Any]:
110
+ """
111
+ Infer a basic Avro schema from record payloads.
112
+
113
+ Only primitive field values are supported; complex values raise TypeError.
114
+ """
115
+ field_names = sorted({key for record in records for key in record})
116
+ fields: list[dict[str, Any]] = []
117
+ for name in field_names:
118
+ types: list[str] = []
119
+ for record in records:
120
+ value = record.get(name)
121
+ if value is None:
122
+ types.append('null')
123
+ continue
124
+ if isinstance(value, dict | list):
125
+ raise TypeError(
126
+ 'AVRO payloads must contain only primitive values',
127
+ )
128
+ if not isinstance(value, _PRIMITIVE_TYPES):
129
+ raise TypeError(
130
+ 'AVRO payloads must contain only primitive values',
131
+ )
132
+ types.append(cast(str, _infer_value_type(value)))
133
+ fields.append({'name': name, 'type': _merge_types(types)})
134
+
135
+ return {
136
+ 'name': 'etlplus_record',
137
+ 'type': 'record',
138
+ 'fields': fields,
139
+ }
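# For illustration, records like [{'id': 1, 'name': 'Ada'}, {'id': 2, 'name': None}]
# yield the inferred schema:
#   {'name': 'etlplus_record', 'type': 'record',
#    'fields': [{'name': 'id', 'type': 'long'},
#               {'name': 'name', 'type': ['null', 'string']}]}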
140
+
141
+
142
+ # SECTION: FUNCTIONS ======================================================== #
143
+
144
+
145
+ def read(
146
+ path: Path,
147
+ ) -> JSONList:
17
148
  """
18
149
  Read AVRO content from ``path``.
19
150
 
@@ -24,20 +155,21 @@ def read(path: Path) -> JSONData:
24
155
 
25
156
  Returns
26
157
  -------
27
- JSONData
28
- Parsed payload.
29
-
30
- Raises
31
- ------
32
- NotImplementedError
33
- AVRO :func:`read` is not implemented yet.
158
+ JSONList
159
+ The list of dictionaries read from the AVRO file.
34
160
  """
35
- raise NotImplementedError('AVRO read is not implemented yet')
161
+ fastavro = _get_fastavro()
162
+ with path.open('rb') as handle:
163
+ reader = fastavro.reader(handle)
164
+ return [cast(JSONDict, record) for record in reader]
36
165
 
37
166
 
38
- def write(path: Path, data: JSONData) -> int:
167
+ def write(
168
+ path: Path,
169
+ data: JSONData,
170
+ ) -> int:
39
171
  """
40
- Write ``data`` to AVRO at ``path``.
172
+ Write ``data`` to AVRO at ``path`` and return record count.
41
173
 
42
174
  Parameters
43
175
  ----------
@@ -50,10 +182,17 @@ def write(path: Path, data: JSONData) -> int:
50
182
  -------
51
183
  int
52
184
  Number of records written.
53
-
54
- Raises
55
- ------
56
- NotImplementedError
57
- AVRO :func:`write` is not implemented yet.
58
185
  """
59
- raise NotImplementedError('AVRO write is not implemented yet')
186
+ records = _normalize_records(data)
187
+ if not records:
188
+ return 0
189
+
190
+ fastavro = _get_fastavro()
191
+ schema = _infer_schema(records)
192
+ parsed_schema = fastavro.parse_schema(schema)
193
+
194
+ path.parent.mkdir(parents=True, exist_ok=True)
195
+ with path.open('wb') as handle:
196
+ fastavro.writer(handle, parsed_schema, records)
197
+
198
+ return len(records)
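Taken together, the new helpers support a simple round trip. The sketch below assumes the optional `fastavro` dependency is installed and that the module is importable as `etlplus.file.avro`; the signatures themselves are the ones shown in this diff.

```python
from pathlib import Path

from etlplus.file import avro

records = [
    {'id': 1, 'name': 'Ada'},
    {'id': 2, 'name': None},  # None produces a ['null', 'string'] union in the inferred schema
]

assert avro.write(Path('people.avro'), records) == 2
assert avro.read(Path('people.avro')) == records
```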
etlplus/file/csv.py CHANGED
@@ -1,7 +1,7 @@
1
1
  """
2
2
  :mod:`etlplus.file.csv` module.
3
3
 
4
- CSV read/write helpers.
4
+ Helpers for reading/writing CSV files.
5
5
  """
6
6
 
7
7
  from __future__ import annotations
@@ -14,6 +14,15 @@ from ..types import JSONData
14
14
  from ..types import JSONDict
15
15
  from ..types import JSONList
16
16
 
17
+ # SECTION: EXPORTS ========================================================== #
18
+
19
+
20
+ __all__ = [
21
+ 'read',
22
+ 'write',
23
+ ]
24
+
25
+
17
26
  # SECTION: FUNCTIONS ======================================================== #
18
27
 
19
28
 
@@ -21,7 +30,7 @@ def read(
21
30
  path: Path,
22
31
  ) -> JSONList:
23
32
  """
24
- Load CSV content as a list of dictionaries.
33
+ Read CSV content from ``path``.
25
34
 
26
35
  Parameters
27
36
  ----------
@@ -48,7 +57,7 @@ def write(
48
57
  data: JSONData,
49
58
  ) -> int:
50
59
  """
51
- Write CSV rows to ``path`` and return the number of rows.
60
+ Write ``data`` to CSV at ``path`` and return record count.
52
61
 
53
62
  Parameters
54
63
  ----------