etlplus 0.11.11__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/README.md +37 -0
- etlplus/api/README.md +20 -3
- etlplus/cli/README.md +40 -0
- etlplus/config/README.md +52 -0
- etlplus/database/README.md +48 -0
- etlplus/file/README.md +105 -0
- etlplus/file/avro.py +157 -18
- etlplus/file/csv.py +12 -3
- etlplus/file/feather.py +100 -15
- etlplus/file/gz.py +80 -16
- etlplus/file/json.py +13 -2
- etlplus/file/ndjson.py +61 -11
- etlplus/file/orc.py +95 -12
- etlplus/file/parquet.py +100 -13
- etlplus/file/tsv.py +52 -20
- etlplus/file/txt.py +56 -16
- etlplus/file/xls.py +85 -12
- etlplus/file/xlsx.py +95 -12
- etlplus/file/xml.py +12 -3
- etlplus/file/yaml.py +13 -2
- etlplus/file/zip.py +133 -7
- etlplus/templates/README.md +46 -0
- etlplus/validation/README.md +50 -0
- {etlplus-0.11.11.dist-info → etlplus-0.12.1.dist-info}/METADATA +58 -14
- {etlplus-0.11.11.dist-info → etlplus-0.12.1.dist-info}/RECORD +29 -22
- {etlplus-0.11.11.dist-info → etlplus-0.12.1.dist-info}/WHEEL +0 -0
- {etlplus-0.11.11.dist-info → etlplus-0.12.1.dist-info}/entry_points.txt +0 -0
- {etlplus-0.11.11.dist-info → etlplus-0.12.1.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.11.11.dist-info → etlplus-0.12.1.dist-info}/top_level.txt +0 -0
etlplus/README.md
ADDED
@@ -0,0 +1,37 @@
+# etlplus package
+
+The `etlplus` package provides a unified Python API and CLI for ETL operations: extraction,
+validation, transformation, and loading of data from files, APIs, and databases.
+
+- Top-level entry points for extract, validate, transform, and load
+- Utilities for pipeline orchestration and helpers
+- Exposes all subpackages for advanced usage
+
+Back to project overview: see the top-level [README](../README.md).
+
+## Subpackages
+
+- [etlplus.api](api/README.md): Lightweight HTTP client and paginated REST helpers
+- [etlplus.file](file/README.md): Unified file format support and helpers
+- [etlplus.config](config/README.md): Configuration helpers for connectors, pipelines, jobs, and
+  profiles
+- [etlplus.cli](cli/README.md): Command-line interface for ETLPlus workflows
+- [etlplus.database](database/README.md): Database engine, schema, and ORM helpers
+- [etlplus.templates](templates/README.md): SQL and DDL template helpers
+- [etlplus.validation](validation/README.md): Data validation utilities and helpers
+
+## Quickstart
+
+```python
+from etlplus import extract, validate, transform, load
+
+data = extract("file", "input.csv")
+filtered = transform(data, {"filter": {"field": "age", "op": "gt", "value": 25}})
+assert validate(filtered, {"age": {"type": "number", "min": 0}})["valid"]
+load(filtered, "file", "output.json", file_format="json")
+```
+
+## See Also
+
+- [Top-level project README](../README.md)
+- [API reference](../docs/README.md)
etlplus/api/README.md
CHANGED
@@ -1,7 +1,7 @@
-# etlplus.api
+# etlplus.api subpackage
 
-
-
+Documentation for the `etlplus.api` subpackage: a lightweight HTTP client and helpers for paginated
+REST endpoints.
 
 - Provides a small `EndpointClient` for calling JSON APIs
 - Supports page-, offset-, and cursor-based pagination via `PaginationConfig`
@@ -12,6 +12,20 @@ paginated REST endpoints.
 
 Back to project overview: see the top-level [README](../../README.md).
 
+- [etlplus.api subpackage](#etlplusapi-subpackage)
+- [Installation](#installation)
+- [Quickstart](#quickstart)
+- [Overriding Rate Limits Per Call](#overriding-rate-limits-per-call)
+- [Choosing `records_path` and `cursor_path`](#choosing-records_path-and-cursor_path)
+- [Cursor-Based Pagination Example](#cursor-based-pagination-example)
+- [Offset-based pagination example](#offset-based-pagination-example)
+- [Authentication](#authentication)
+- [Errors and Rate Limiting](#errors-and-rate-limiting)
+- [Types and Transport](#types-and-transport)
+- [Supporting Modules](#supporting-modules)
+- [Minimal Contract](#minimal-contract)
+- [See also](#see-also)
+
 ## Installation
 
 `etlplus.api` ships as part of the `etlplus` package. Install the package as usual:
@@ -233,3 +247,6 @@ providers can fall back to their own defaults. If you already possess a static t
 ## See also
 
 - Top-level CLI and library usage in the main [README](../../README.md)
+
+
+[def]: #installation
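The updated `etlplus.api` README names `EndpointClient` and `PaginationConfig` and describes page-, offset-, and cursor-based pagination driven by `records_path` and `cursor_path`. As a rough illustration of how those pieces could fit together, here is a minimal sketch; the constructor arguments, the `mode`/`records_path`/`cursor_path` parameter names, and the `paginate` method are assumed spellings inferred from the section titles, not signatures confirmed by this diff.

```python
# Illustrative sketch only: EndpointClient and PaginationConfig are named in
# the README, but every argument and method used below is an assumption.
from etlplus.api import EndpointClient, PaginationConfig

# Cursor-based pagination: records live under "data.items" in each response,
# and the next-page cursor under "meta.next_cursor" (hypothetical paths).
pagination = PaginationConfig(
    mode="cursor",
    records_path="data.items",
    cursor_path="meta.next_cursor",
)

client = EndpointClient("https://api.example.com")  # assumed constructor
for record in client.paginate("/v1/users", pagination=pagination):  # assumed method
    print(record["id"])
```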
etlplus/cli/README.md
ADDED
@@ -0,0 +1,40 @@
+# etlplus.cli subpackage
+
+Documentation for the `etlplus.cli` subpackage: command-line interface for ETLPlus workflows.
+
+- Provides a CLI for running ETL pipelines, jobs, and utilities
+- Supports commands for running, validating, and inspecting pipelines
+- Includes options for configuration, state, and output control
+- Exposes handlers for custom command integration
+
+Back to project overview: see the top-level [README](../../README.md).
+
+- [etlplus.cli subpackage](#etlpluscli-subpackage)
+- [Available Commands](#available-commands)
+- [Command Options](#command-options)
+- [Example: Running a Pipeline](#example-running-a-pipeline)
+- [See Also](#see-also)
+
+## Available Commands
+
+- **run**: Execute a pipeline or job
+- **validate**: Validate pipeline or config files
+- **inspect**: Show pipeline/job details
+
+## Command Options
+
+- `--config`: Path to config file
+- `--state`: Path to state file
+- `--output`: Output file or format
+
+## Example: Running a Pipeline
+
+```bash
+etlplus run --config configs/pipeline.yml --output results.json
+```
+
+## See Also
+
+- Top-level CLI and library usage in the main [README](../../README.md)
+- Command handlers in [handlers.py](handlers.py)
+- Command options in [options.py](options.py)
etlplus/config/README.md
ADDED
@@ -0,0 +1,52 @@
+# etlplus.config subpackage
+
+Documentation for the `etlplus.config` subpackage: configuration helpers for connectors, pipelines,
+jobs, and profiles.
+
+- Provides classes and utilities for managing ETL pipeline configuration
+- Supports YAML/JSON config loading and validation
+- Includes helpers for connectors, jobs, pipelines, and profiles
+- Exposes type definitions for config schemas
+
+Back to project overview: see the top-level [README](../../README.md).
+
+- [etlplus.config subpackage](#etlplusconfig-subpackage)
+- [Supported Configuration Types](#supported-configuration-types)
+- [Loading and Validating Configs](#loading-and-validating-configs)
+- [Example: Loading a Pipeline Config](#example-loading-a-pipeline-config)
+- [See Also](#see-also)
+
+## Supported Configuration Types
+
+- **Connector**: Connection details for databases, files, or APIs
+- **Job**: ETL job definitions and scheduling
+- **Pipeline**: End-to-end pipeline configuration
+- **Profile**: User or environment-specific settings
+
+## Loading and Validating Configs
+
+Use the provided classes to load and validate configuration files:
+
+```python
+from etlplus.config import PipelineConfig
+
+cfg = PipelineConfig.from_yaml("pipeline.yml")
+```
+
+- Supports YAML and JSON formats
+- Validates against expected schema
+
+## Example: Loading a Pipeline Config
+
+```python
+from etlplus.config import PipelineConfig
+
+pipeline = PipelineConfig.from_yaml("configs/pipeline.yml")
+print(pipeline)
+```
+
+## See Also
+
+- Top-level CLI and library usage in the main [README](../../README.md)
+- Config type definitions in [types.py](types.py)
+- Config utilities in [utils.py](utils.py)
etlplus/database/README.md
ADDED
@@ -0,0 +1,48 @@
+# etlplus.database subpackage
+
+Documentation for the `etlplus.database` subpackage: database engine, schema, and ORM helpers.
+
+- Provides database engine and connection management
+- Supports schema definition and DDL generation
+- Includes lightweight ORM utilities for tabular data
+- Exposes type definitions for database objects
+
+Back to project overview: see the top-level [README](../../README.md).
+
+- [etlplus.database subpackage](#etlplusdatabase-subpackage)
+- [Database Engine and Connections](#database-engine-and-connections)
+- [Schema and DDL Helpers](#schema-and-ddl-helpers)
+- [ORM Utilities](#orm-utilities)
+- [Example: Creating a Table](#example-creating-a-table)
+- [See Also](#see-also)
+
+## Database Engine and Connections
+
+- Manage connections to supported databases
+- Configure engines for different backends
+
+## Schema and DDL Helpers
+
+- Define table schemas and columns
+- Generate DDL statements for supported databases
+
+## ORM Utilities
+
+- Map rows to Python objects
+- Simple CRUD helpers for tabular data
+
+## Example: Creating a Table
+
+```python
+from etlplus.database import Schema, Engine
+
+engine = Engine.connect("sqlite:///example.db")
+schema = Schema.from_dict({"name": "users", "columns": [ ... ]})
+engine.create_table(schema)
+```
+
+## See Also
+
+- Top-level CLI and library usage in the main [README](../../README.md)
+- Schema helpers in [schema.py](schema.py)
+- ORM utilities in [orm.py](orm.py)
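The `Schema.from_dict` example in the new README elides the column definitions (`"columns": [ ... ]`). Purely as a hypothetical illustration of what that list might contain, here is a sketch; the column dict keys (`name`, `type`, `nullable`) are guesses and are not confirmed by this diff.

```python
# Hypothetical column layout: the "name"/"type"/"nullable" keys are assumed,
# not taken from etlplus itself. Only Engine, Schema, Engine.connect,
# Schema.from_dict, and create_table appear in the README above.
from etlplus.database import Engine, Schema

engine = Engine.connect("sqlite:///example.db")
schema = Schema.from_dict(
    {
        "name": "users",
        "columns": [
            {"name": "id", "type": "integer", "nullable": False},
            {"name": "email", "type": "text", "nullable": True},
        ],
    }
)
engine.create_table(schema)
```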
etlplus/file/README.md
ADDED
@@ -0,0 +1,105 @@
+# etlplus.file subpackage
+
+Documentation for the `etlplus.file` subpackage: unified file format support and helpers for reading
+and writing data files.
+
+- Provides a consistent interface for reading and writing files in various formats
+- Supports all formats defined in `FileFormat` (see below)
+- Includes helpers for inferring file format and compression from filenames, extensions, or MIME
+  types
+- Exposes a `File` class with instance methods for reading and writing data
+
+Back to project overview: see the top-level [README](../../README.md).
+
+- [etlplus.file subpackage](#etlplusfile-subpackage)
+- [Supported File Formats](#supported-file-formats)
+- [Inferring File Format and Compression](#inferring-file-format-and-compression)
+- [Reading and Writing Files](#reading-and-writing-files)
+- [Reading a File](#reading-a-file)
+- [Writing a File](#writing-a-file)
+- [File Instance Methods](#file-instance-methods)
+- [Example: Reading and Writing](#example-reading-and-writing)
+- [See Also](#see-also)
+
+## Supported File Formats
+
+The following formats are defined in `FileFormat` and supported for reading and writing:
+
+| Format  | Description                             |
+|---------|-----------------------------------------|
+| avro    | Apache Avro binary serialization        |
+| csv     | Comma-separated values text files       |
+| feather | Apache Arrow Feather columnar format    |
+| gz      | Gzip-compressed files (see Compression) |
+| json    | Standard JSON files                     |
+| ndjson  | Newline-delimited JSON (JSON Lines)     |
+| orc     | Apache ORC columnar format              |
+| parquet | Apache Parquet columnar format          |
+| tsv     | Tab-separated values text files         |
+| txt     | Plain text files                        |
+| xls     | Microsoft Excel (legacy .xls)           |
+| xlsx    | Microsoft Excel (modern .xlsx)          |
+| zip     | ZIP-compressed files (see Compression)  |
+| xml     | XML files                               |
+| yaml    | YAML files                              |
+
+Compression formats (gz, zip) are also supported as wrappers for other formats.
+
+## Inferring File Format and Compression
+
+Use `infer_file_format_and_compression(value, filename=None)` to infer the file format and
+compression from a filename, extension, or MIME type. Returns a tuple `(file_format,
+compression_format)`.
+
+## Reading and Writing Files
+
+The main entry point for file operations is the `File` class. To read or write files:
+
+### Reading a File
+
+```python
+from etlplus.file import File
+
+f = File("data/sample.csv")
+data = f.read()
+```
+
+- The `read()` method automatically detects the format and compression.
+- Returns parsed data (e.g., list of dicts for tabular formats).
+
+### Writing a File
+
+```python
+from etlplus.file import File
+
+f = File("output.json")
+f.write(data)
+```
+
+- The `write()` method serializes and writes data in the appropriate format.
+- Supports all formats listed above.
+
+## File Instance Methods
+
+- `read()`: Reads and parses the file, returning structured data.
+- `write(data)`: Writes structured data to the file in the detected format.
+
+## Example: Reading and Writing
+
+```python
+from etlplus.file import File
+
+# Read CSV
+csv_file = File("data.csv")
+rows = csv_file.read()
+
+# Write JSON
+json_file = File("output.json")
+json_file.write(rows)
+```
+
+## See Also
+
+- Top-level CLI and library usage in the main [README](../../README.md)
+- File format enums in [enums.py](enums.py)
+- Compression format enums in [enums.py](enums.py)
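The README above documents `infer_file_format_and_compression(value, filename=None)` returning a `(file_format, compression_format)` tuple, and a `File` class whose `read()`/`write()` detect format and compression automatically. A small sketch of combining the two follows; the import location of the helper and the exact return types (enum members versus strings) are assumptions not confirmed by this diff.

```python
# Sketch based on the README above; the helper's import path and return types
# are assumptions.
from etlplus.file import File, infer_file_format_and_compression

# A gzip-compressed NDJSON file should yield both a format and a compression.
file_format, compression = infer_file_format_and_compression("events.ndjson.gz")
print(file_format, compression)

# File.read()/write() route through the same detection, so a CSV can be
# re-serialized as Parquet without naming either format explicitly.
rows = File("data/input.csv").read()
File("data/output.parquet").write(rows)
```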
etlplus/file/avro.py
CHANGED
@@ -1,19 +1,150 @@
 """
 :mod:`etlplus.file.avro` module.
 
-
+Helpers for reading/writing Avro files.
 """
 
 from __future__ import annotations
 
 from pathlib import Path
+from typing import Any
+from typing import cast
 
 from ..types import JSONData
+from ..types import JSONDict
+from ..types import JSONList
 
 # SECTION: EXPORTS ========================================================== #
 
 
-def read(path: Path) -> JSONData:
+__all__ = [
+    'read',
+    'write',
+]
+
+
+# SECTION: INTERNAL CONSTANTS =============================================== #
+
+
+_FASTAVRO_CACHE: dict[str, Any] = {}
+
+
+_PRIMITIVE_TYPES: tuple[type, ...] = (
+    bool,
+    int,
+    float,
+    str,
+    bytes,
+    bytearray,
+)
+
+
+# SECTION: INTERNAL FUNCTIONS =============================================== #
+
+
+def _get_fastavro() -> Any:
+    """
+    Return the fastavro module, importing it on first use.
+
+    Raises an informative ImportError if the optional dependency is missing.
+    """
+    mod = _FASTAVRO_CACHE.get('mod')
+    if mod is not None:  # pragma: no cover - tiny branch
+        return mod
+    try:
+        _fastavro = __import__('fastavro')  # type: ignore[assignment]
+    except ImportError as e:  # pragma: no cover
+        raise ImportError(
+            'AVRO support requires optional dependency "fastavro".\n'
+            'Install with: pip install fastavro',
+        ) from e
+    _FASTAVRO_CACHE['mod'] = _fastavro
+
+    return _fastavro
+
+
+def _normalize_records(data: JSONData) -> JSONList:
+    """
+    Normalize JSON payloads into a list of dictionaries.
+
+    Raises TypeError when payloads contain non-dict items.
+    """
+    if isinstance(data, list):
+        if not all(isinstance(item, dict) for item in data):
+            raise TypeError('AVRO payloads must contain only objects (dicts)')
+        return cast(JSONList, data)
+    return [cast(JSONDict, data)]
+
+
+def _infer_value_type(value: object) -> str | list[str]:
+    """
+    Infer the Avro type for a primitive value.
+
+    Raises TypeError for unsupported types.
+    """
+    if value is None:
+        return 'null'
+    if isinstance(value, bool):
+        return 'boolean'
+    if isinstance(value, int):
+        return 'long'
+    if isinstance(value, float):
+        return 'double'
+    if isinstance(value, str):
+        return 'string'
+    if isinstance(value, (bytes, bytearray)):
+        return 'bytes'
+    raise TypeError('AVRO payloads must contain only primitive values')
+
+
+def _merge_types(types: list[str]) -> str | list[str]:
+    """Return a stable Avro type union for a list of types."""
+    unique = list(dict.fromkeys(types))
+    if len(unique) == 1:
+        return unique[0]
+    ordered = ['null'] + sorted(t for t in unique if t != 'null')
+    return ordered
+
+
+def _infer_schema(records: JSONList) -> dict[str, Any]:
+    """
+    Infer a basic Avro schema from record payloads.
+
+    Only primitive field values are supported; complex values raise TypeError.
+    """
+    field_names = sorted({key for record in records for key in record})
+    fields: list[dict[str, Any]] = []
+    for name in field_names:
+        types: list[str] = []
+        for record in records:
+            value = record.get(name)
+            if value is None:
+                types.append('null')
+                continue
+            if isinstance(value, dict | list):
+                raise TypeError(
+                    'AVRO payloads must contain only primitive values',
+                )
+            if not isinstance(value, _PRIMITIVE_TYPES):
+                raise TypeError(
+                    'AVRO payloads must contain only primitive values',
+                )
+            types.append(cast(str, _infer_value_type(value)))
+        fields.append({'name': name, 'type': _merge_types(types)})
+
+    return {
+        'name': 'etlplus_record',
+        'type': 'record',
+        'fields': fields,
+    }
+
+
+# SECTION: FUNCTIONS ======================================================== #
+
+
+def read(
+    path: Path,
+) -> JSONList:
     """
     Read AVRO content from ``path``.
 
@@ -24,20 +155,21 @@ def read(path: Path) -> JSONData:
 
     Returns
     -------
-
-
-
-    Raises
-    ------
-    NotImplementedError
-        AVRO :func:`read` is not implemented yet.
+    JSONList
+        The list of dictionaries read from the AVRO file.
     """
-
+    fastavro = _get_fastavro()
+    with path.open('rb') as handle:
+        reader = fastavro.reader(handle)
+        return [cast(JSONDict, record) for record in reader]
 
 
-def write(path: Path, data: JSONData) -> int:
+def write(
+    path: Path,
+    data: JSONData,
+) -> int:
     """
-    Write ``data`` to AVRO at ``path
+    Write ``data`` to AVRO at ``path`` and return record count.
 
     Parameters
     ----------
@@ -50,10 +182,17 @@ def write(path: Path, data: JSONData) -> int:
     -------
     int
         Number of records written.
-
-    Raises
-    ------
-    NotImplementedError
-        AVRO :func:`write` is not implemented yet.
     """
-
+    records = _normalize_records(data)
+    if not records:
+        return 0
+
+    fastavro = _get_fastavro()
+    schema = _infer_schema(records)
+    parsed_schema = fastavro.parse_schema(schema)
+
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open('wb') as handle:
+        fastavro.writer(handle, parsed_schema, records)
+
+    return len(records)
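Taken together, the new `read(path) -> JSONList` and `write(path, data) -> int` helpers support a simple round trip. The sketch below follows the signatures added in this diff; it assumes the module is imported via its file path as `etlplus.file.avro` and that the optional `fastavro` dependency is installed.

```python
from pathlib import Path

from etlplus.file import avro

records = [
    {"id": 1, "name": "ada", "score": 9.5},
    {"id": 2, "name": "grace", "score": None},  # null value -> ["null", "double"] union field
]

# write() infers an Avro schema from the records and returns the record count.
written = avro.write(Path("users.avro"), records)
assert written == 2

# read() returns the records back as a list of dicts.
round_trip = avro.read(Path("users.avro"))
assert round_trip[0]["name"] == "ada"
```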
etlplus/file/csv.py
CHANGED
@@ -1,7 +1,7 @@
 """
 :mod:`etlplus.file.csv` module.
 
-
+Helpers for reading/writing CSV files.
 """
 
 from __future__ import annotations
@@ -14,6 +14,15 @@ from ..types import JSONData
 from ..types import JSONDict
 from ..types import JSONList
 
+# SECTION: EXPORTS ========================================================== #
+
+
+__all__ = [
+    'read',
+    'write',
+]
+
+
 # SECTION: FUNCTIONS ======================================================== #
 
 
@@ -21,7 +30,7 @@ def read(
     path: Path,
 ) -> JSONList:
     """
-
+    Read CSV content from ``path``.
 
     Parameters
     ----------
@@ -48,7 +57,7 @@ def write(
     data: JSONData,
 ) -> int:
     """
-    Write CSV
+    Write ``data`` to CSV at ``path`` and return record count.
 
     Parameters
     ----------