etlplus 0.10.4__py3-none-any.whl → 0.12.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/README.md +37 -0
- etlplus/api/README.md +20 -3
- etlplus/cli/README.md +40 -0
- etlplus/cli/commands.py +1 -1
- etlplus/cli/constants.py +1 -1
- etlplus/cli/handlers.py +1 -1
- etlplus/cli/io.py +2 -2
- etlplus/config/README.md +52 -0
- etlplus/config/pipeline.py +2 -2
- etlplus/database/README.md +48 -0
- etlplus/database/ddl.py +1 -1
- etlplus/database/engine.py +1 -1
- etlplus/database/schema.py +1 -1
- etlplus/enums.py +2 -270
- etlplus/extract.py +5 -7
- etlplus/file/README.md +105 -0
- etlplus/file/__init__.py +25 -0
- etlplus/file/avro.py +198 -0
- etlplus/file/core.py +287 -0
- etlplus/file/csv.py +91 -0
- etlplus/file/enums.py +238 -0
- etlplus/file/feather.py +144 -0
- etlplus/file/gz.py +123 -0
- etlplus/file/json.py +98 -0
- etlplus/file/ndjson.py +109 -0
- etlplus/file/orc.py +142 -0
- etlplus/file/parquet.py +146 -0
- etlplus/file/tsv.py +91 -0
- etlplus/file/txt.py +99 -0
- etlplus/file/xls.py +132 -0
- etlplus/file/xlsx.py +142 -0
- etlplus/file/xml.py +174 -0
- etlplus/file/yaml.py +136 -0
- etlplus/file/zip.py +175 -0
- etlplus/load.py +9 -12
- etlplus/run.py +6 -9
- etlplus/templates/README.md +46 -0
- etlplus/validation/README.md +50 -0
- {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/METADATA +58 -14
- {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/RECORD +44 -20
- etlplus/file.py +0 -652
- {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/WHEEL +0 -0
- {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/entry_points.txt +0 -0
- {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.10.4.dist-info → etlplus-0.12.2.dist-info}/top_level.txt +0 -0
etlplus/README.md
ADDED
@@ -0,0 +1,37 @@
+# etlplus package
+
+The `etlplus` package provides a unified Python API and CLI for ETL operations: extraction,
+validation, transformation, and loading of data from files, APIs, and databases.
+
+- Top-level entry points for extract, validate, transform, and load
+- Utilities for pipeline orchestration and helpers
+- Exposes all subpackages for advanced usage
+
+Back to project overview: see the top-level [README](../README.md).
+
+## Subpackages
+
+- [etlplus.api](api/README.md): Lightweight HTTP client and paginated REST helpers
+- [etlplus.file](file/README.md): Unified file format support and helpers
+- [etlplus.config](config/README.md): Configuration helpers for connectors, pipelines, jobs, and
+  profiles
+- [etlplus.cli](cli/README.md): Command-line interface for ETLPlus workflows
+- [etlplus.database](database/README.md): Database engine, schema, and ORM helpers
+- [etlplus.templates](templates/README.md): SQL and DDL template helpers
+- [etlplus.validation](validation/README.md): Data validation utilities and helpers
+
+## Quickstart
+
+```python
+from etlplus import extract, validate, transform, load
+
+data = extract("file", "input.csv")
+filtered = transform(data, {"filter": {"field": "age", "op": "gt", "value": 25}})
+assert validate(filtered, {"age": {"type": "number", "min": 0}})["valid"]
+load(filtered, "file", "output.json", file_format="json")
+```
+
+## See Also
+
+- [Top-level project README](../README.md)
+- [API reference](../docs/README.md)
etlplus/api/README.md
CHANGED
@@ -1,7 +1,7 @@
-# etlplus.api
+# etlplus.api subpackage
 
-
-
+Documentation for the `etlplus.api` subpackage: a lightweight HTTP client and helpers for paginated
+REST endpoints.
 
 - Provides a small `EndpointClient` for calling JSON APIs
 - Supports page-, offset-, and cursor-based pagination via `PaginationConfig`
@@ -12,6 +12,20 @@ paginated REST endpoints.
 
 Back to project overview: see the top-level [README](../../README.md).
 
+- [etlplus.api subpackage](#etlplusapi-subpackage)
+- [Installation](#installation)
+- [Quickstart](#quickstart)
+- [Overriding Rate Limits Per Call](#overriding-rate-limits-per-call)
+- [Choosing `records_path` and `cursor_path`](#choosing-records_path-and-cursor_path)
+- [Cursor-Based Pagination Example](#cursor-based-pagination-example)
+- [Offset-based pagination example](#offset-based-pagination-example)
+- [Authentication](#authentication)
+- [Errors and Rate Limiting](#errors-and-rate-limiting)
+- [Types and Transport](#types-and-transport)
+- [Supporting Modules](#supporting-modules)
+- [Minimal Contract](#minimal-contract)
+- [See also](#see-also)
+
 ## Installation
 
 `etlplus.api` ships as part of the `etlplus` package. Install the package as usual:
@@ -233,3 +247,6 @@ providers can fall back to their own defaults. If you already possess a static t
 ## See also
 
 - Top-level CLI and library usage in the main [README](../../README.md)
+
+
+[def]: #installation
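A sketch of how those pieces might fit together. `EndpointClient`, `PaginationConfig`, and the `records_path`/`cursor_path` knobs are named in the README above; every constructor argument and method below is an illustrative assumption, not a signature confirmed by this diff.

```python
# Hypothetical sketch only: EndpointClient and PaginationConfig exist per the
# README above, but all argument and method names here are assumptions.
from etlplus.api import EndpointClient, PaginationConfig

client = EndpointClient(base_url="https://api.example.com")  # assumed kwarg
pagination = PaginationConfig(
    type="cursor",            # page-, offset-, or cursor-based (per the README)
    records_path="data",      # where each response page keeps its records
    cursor_path="meta.next",  # where the next-page cursor lives
)
records = client.fetch_all("/users", pagination=pagination)  # assumed method
```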
etlplus/cli/README.md
ADDED
@@ -0,0 +1,40 @@
+# etlplus.cli subpackage
+
+Documentation for the `etlplus.cli` subpackage: command-line interface for ETLPlus workflows.
+
+- Provides a CLI for running ETL pipelines, jobs, and utilities
+- Supports commands for running, validating, and inspecting pipelines
+- Includes options for configuration, state, and output control
+- Exposes handlers for custom command integration
+
+Back to project overview: see the top-level [README](../../README.md).
+
+- [etlplus.cli subpackage](#etlpluscli-subpackage)
+- [Available Commands](#available-commands)
+- [Command Options](#command-options)
+- [Example: Running a Pipeline](#example-running-a-pipeline)
+- [See Also](#see-also)
+
+## Available Commands
+
+- **run**: Execute a pipeline or job
+- **validate**: Validate pipeline or config files
+- **inspect**: Show pipeline/job details
+
+## Command Options
+
+- `--config`: Path to config file
+- `--state`: Path to state file
+- `--output`: Output file or format
+
+## Example: Running a Pipeline
+
+```bash
+etlplus run --config configs/pipeline.yml --output results.json
+```
+
+## See Also
+
+- Top-level CLI and library usage in the main [README](../../README.md)
+- Command handlers in [handlers.py](handlers.py)
+- Command options in [options.py](options.py)
etlplus/cli/commands.py
CHANGED
etlplus/cli/constants.py
CHANGED
etlplus/cli/handlers.py
CHANGED
@@ -570,7 +570,7 @@ def transform_handler(
     data = transform(payload, cast(TransformOperations, operations_payload))
 
     if target and target != '-':
-        File
+        File(target, file_format=target_format).write(data)
         print(f'Data transformed and saved to {target}')
         return 0
 
etlplus/cli/io.py
CHANGED
@@ -15,8 +15,8 @@ from pathlib import Path
 from typing import Any
 from typing import cast
 
-from ..enums import FileFormat
 from ..file import File
+from ..file import FileFormat
 from ..types import JSONData
 from ..utils import print_json
 
@@ -331,6 +331,6 @@ def write_json_output(
     """
     if not output_path or output_path == '-':
        return False
-    File(Path(output_path), FileFormat.JSON).
+    File(Path(output_path), FileFormat.JSON).write(data)
    print(f'{success_message} {output_path}')
    return True
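Both CLI fixes above complete truncated `File(...)` call chains and move the `FileFormat` import from `etlplus.enums` to the new `etlplus.file` subpackage. A minimal round-trip sketch of that consolidated API; the payload and path are illustrative, and the read-back equality is assumed behavior:

```python
# Sketch of the consolidated etlplus.file API used in the diffs above.
from pathlib import Path

from etlplus.file import File
from etlplus.file import FileFormat

data = [{"id": 1, "name": "Ada"}]
File(Path("out.json"), FileFormat.JSON).write(data)  # explicit format
roundtrip = File(Path("out.json")).read()            # format inferred from suffix
assert roundtrip == data                             # assumed round-trip behavior
```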
etlplus/config/README.md
ADDED
@@ -0,0 +1,52 @@
+# etlplus.config subpackage
+
+Documentation for the `etlplus.config` subpackage: configuration helpers for connectors, pipelines,
+jobs, and profiles.
+
+- Provides classes and utilities for managing ETL pipeline configuration
+- Supports YAML/JSON config loading and validation
+- Includes helpers for connectors, jobs, pipelines, and profiles
+- Exposes type definitions for config schemas
+
+Back to project overview: see the top-level [README](../../README.md).
+
+- [etlplus.config subpackage](#etlplusconfig-subpackage)
+- [Supported Configuration Types](#supported-configuration-types)
+- [Loading and Validating Configs](#loading-and-validating-configs)
+- [Example: Loading a Pipeline Config](#example-loading-a-pipeline-config)
+- [See Also](#see-also)
+
+## Supported Configuration Types
+
+- **Connector**: Connection details for databases, files, or APIs
+- **Job**: ETL job definitions and scheduling
+- **Pipeline**: End-to-end pipeline configuration
+- **Profile**: User or environment-specific settings
+
+## Loading and Validating Configs
+
+Use the provided classes to load and validate configuration files:
+
+```python
+from etlplus.config import PipelineConfig
+
+cfg = PipelineConfig.from_yaml("pipeline.yml")
+```
+
+- Supports YAML and JSON formats
+- Validates against expected schema
+
+## Example: Loading a Pipeline Config
+
+```python
+from etlplus.config import PipelineConfig
+
+pipeline = PipelineConfig.from_yaml("configs/pipeline.yml")
+print(pipeline)
+```
+
+## See Also
+
+- Top-level CLI and library usage in the main [README](../../README.md)
+- Config type definitions in [types.py](types.py)
+- Config utilities in [utils.py](utils.py)
etlplus/config/pipeline.py
CHANGED
@@ -24,8 +24,8 @@ from typing import Any
 from typing import Self
 
 from ..api import ApiConfig
-from ..enums import FileFormat
 from ..file import File
+from ..file import FileFormat
 from ..types import StrAnyMap
 from ..utils import coerce_dict
 from ..utils import maybe_mapping
@@ -246,7 +246,7 @@ class PipelineConfig:
         TypeError
             If the YAML root is not a mapping/object.
         """
-        raw = File(Path(path), FileFormat.YAML).
+        raw = File(Path(path), FileFormat.YAML).read()
         if not isinstance(raw, dict):
             raise TypeError('Pipeline YAML must have a mapping/object root')
 
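The rewritten `from_yaml` body reads the file through `File(...).read()` and then enforces a mapping root. Callers use it as the config README above shows; the path here is illustrative:

```python
# Per the config README in this diff; the path is illustrative.
from etlplus.config import PipelineConfig

pipeline = PipelineConfig.from_yaml("configs/pipeline.yml")
# A YAML file with a list root would instead raise:
#   TypeError: Pipeline YAML must have a mapping/object root
```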
etlplus/database/README.md
ADDED
@@ -0,0 +1,48 @@
+# etlplus.database subpackage
+
+Documentation for the `etlplus.database` subpackage: database engine, schema, and ORM helpers.
+
+- Provides database engine and connection management
+- Supports schema definition and DDL generation
+- Includes lightweight ORM utilities for tabular data
+- Exposes type definitions for database objects
+
+Back to project overview: see the top-level [README](../../README.md).
+
+- [etlplus.database subpackage](#etlplusdatabase-subpackage)
+- [Database Engine and Connections](#database-engine-and-connections)
+- [Schema and DDL Helpers](#schema-and-ddl-helpers)
+- [ORM Utilities](#orm-utilities)
+- [Example: Creating a Table](#example-creating-a-table)
+- [See Also](#see-also)
+
+## Database Engine and Connections
+
+- Manage connections to supported databases
+- Configure engines for different backends
+
+## Schema and DDL Helpers
+
+- Define table schemas and columns
+- Generate DDL statements for supported databases
+
+## ORM Utilities
+
+- Map rows to Python objects
+- Simple CRUD helpers for tabular data
+
+## Example: Creating a Table
+
+```python
+from etlplus.database import Schema, Engine
+
+engine = Engine.connect("sqlite:///example.db")
+schema = Schema.from_dict({"name": "users", "columns": [ ... ]})
+engine.create_table(schema)
+```
+
+## See Also
+
+- Top-level CLI and library usage in the main [README](../../README.md)
+- Schema helpers in [schema.py](schema.py)
+- ORM utilities in [orm.py](orm.py)
etlplus/database/ddl.py
CHANGED
@@ -203,7 +203,7 @@ def load_table_spec(
         raise ValueError('Spec must be .json, .yml, or .yaml')
 
     try:
-        spec = File
+        spec = File(spec_path).read()
     except ImportError as e:
         if suffix in {'.yml', '.yaml'}:
             raise RuntimeError(
etlplus/database/engine.py
CHANGED
@@ -113,7 +113,7 @@ def load_database_url_from_config(
     ValueError
         If no connection string/URL/DSN is found for the specified entry.
     """
-    cfg = File
+    cfg = File(Path(path)).read()
    if not isinstance(cfg, Mapping):
        raise TypeError('Database config must be a mapping')
 
etlplus/database/schema.py
CHANGED
etlplus/enums.py
CHANGED
@@ -8,7 +8,6 @@ from __future__ import annotations
 
 import enum
 import operator as _op
-from pathlib import PurePath
 from statistics import fmean
 from typing import Self
 
@@ -23,18 +22,10 @@ __all__ = [
     # Enums
     'AggregateName',
     'CoercibleStrEnum',
-    'CompressionFormat',
     'DataConnectorType',
-    'FileFormat',
     'HttpMethod',
     'OperatorName',
     'PipelineStep',
-    # Functions
-    'coerce_compression_format',
-    'coerce_data_connector_type',
-    'coerce_file_format',
-    'coerce_http_method',
-    'infer_file_format_and_compression',
 ]
 
 
@@ -178,39 +169,6 @@ class AggregateName(CoercibleStrEnum):
         return lambda xs, n: (fmean(xs) if xs else 0.0)
 
 
-class CompressionFormat(CoercibleStrEnum):
-    """Supported compression formats for data files."""
-
-    # -- Constants -- #
-
-    GZ = 'gz'
-    ZIP = 'zip'
-
-    # -- Class Methods -- #
-
-    @classmethod
-    def aliases(cls) -> StrStrMap:
-        """
-        Return a mapping of common aliases for each enum member.
-
-        Returns
-        -------
-        StrStrMap
-            A mapping of alias names to their corresponding enum member names.
-        """
-        return {
-            # File extensions
-            '.gz': 'gz',
-            '.gzip': 'gz',
-            '.zip': 'zip',
-            # MIME types
-            'application/gzip': 'gz',
-            'application/x-gzip': 'gz',
-            'application/zip': 'zip',
-            'application/x-zip-compressed': 'zip',
-        }
-
-
 class DataConnectorType(CoercibleStrEnum):
     """Supported data connector types."""
 
@@ -242,99 +200,6 @@ class DataConnectorType(CoercibleStrEnum):
     }
 
 
-class FileFormat(CoercibleStrEnum):
-    """Supported file formats for extraction."""
-
-    # -- Constants -- #
-
-    AVRO = 'avro'
-    CSV = 'csv'
-    FEATHER = 'feather'
-    GZ = 'gz'
-    JSON = 'json'
-    NDJSON = 'ndjson'
-    ORC = 'orc'
-    PARQUET = 'parquet'
-    TSV = 'tsv'
-    TXT = 'txt'
-    XLS = 'xls'
-    XLSX = 'xlsx'
-    ZIP = 'zip'
-    XML = 'xml'
-    YAML = 'yaml'
-
-    # -- Class Methods -- #
-
-    @classmethod
-    def aliases(cls) -> StrStrMap:
-        """
-        Return a mapping of common aliases for each enum member.
-
-        Returns
-        -------
-        StrStrMap
-            A mapping of alias names to their corresponding enum member names.
-        """
-        return {
-            # Common shorthand
-            'parq': 'parquet',
-            'yml': 'yaml',
-            # File extensions
-            '.avro': 'avro',
-            '.csv': 'csv',
-            '.feather': 'feather',
-            '.gz': 'gz',
-            '.json': 'json',
-            '.jsonl': 'ndjson',
-            '.ndjson': 'ndjson',
-            '.orc': 'orc',
-            '.parquet': 'parquet',
-            '.pq': 'parquet',
-            '.tsv': 'tsv',
-            '.txt': 'txt',
-            '.xls': 'xls',
-            '.xlsx': 'xlsx',
-            '.zip': 'zip',
-            '.xml': 'xml',
-            '.yaml': 'yaml',
-            '.yml': 'yaml',
-            # MIME types
-            'application/avro': 'avro',
-            'application/csv': 'csv',
-            'application/feather': 'feather',
-            'application/gzip': 'gz',
-            'application/json': 'json',
-            'application/jsonlines': 'ndjson',
-            'application/ndjson': 'ndjson',
-            'application/orc': 'orc',
-            'application/parquet': 'parquet',
-            'application/vnd.apache.avro': 'avro',
-            'application/vnd.apache.parquet': 'parquet',
-            'application/vnd.apache.arrow.file': 'feather',
-            'application/vnd.apache.orc': 'orc',
-            'application/vnd.ms-excel': 'xls',
-            (
-                'application/vnd.openxmlformats-'
-                'officedocument.spreadsheetml.sheet'
-            ): 'xlsx',
-            'application/x-avro': 'avro',
-            'application/x-csv': 'csv',
-            'application/x-feather': 'feather',
-            'application/x-orc': 'orc',
-            'application/x-ndjson': 'ndjson',
-            'application/x-parquet': 'parquet',
-            'application/x-yaml': 'yaml',
-            'application/xml': 'xml',
-            'application/zip': 'zip',
-            'text/csv': 'csv',
-            'text/plain': 'txt',
-            'text/tab-separated-values': 'tsv',
-            'text/tsv': 'tsv',
-            'text/xml': 'xml',
-            'text/yaml': 'yaml',
-        }
-
-
 class HttpMethod(CoercibleStrEnum):
     """Supported HTTP verbs that accept JSON payloads."""
 
@@ -360,8 +225,8 @@ class HttpMethod(CoercibleStrEnum):
         Notes
         -----
         - RFCs do not strictly forbid bodies on some other methods (e.g.,
-
-
+          ``DELETE``), but many servers/clients do not expect them. We mark
+          ``POST``, ``PUT``, and ``PATCH`` as True.
         """
         return self in {HttpMethod.POST, HttpMethod.PUT, HttpMethod.PATCH}
 
@@ -465,13 +330,6 @@ class PipelineStep(CoercibleStrEnum):
 # SECTION: INTERNAL CONSTANTS ============================================== #
 
 
-# Compression formats that are also file formats.
-_COMPRESSION_FILE_FORMATS: set[FileFormat] = {
-    FileFormat.GZ,
-    FileFormat.ZIP,
-}
-
-
 # Precomputed order index for PipelineStep; avoids recomputing on each access.
 _PIPELINE_ORDER_INDEX: dict[PipelineStep, int] = {
     PipelineStep.FILTER: 0,
@@ -480,129 +338,3 @@ _PIPELINE_ORDER_INDEX: dict[PipelineStep, int] = {
     PipelineStep.SORT: 3,
     PipelineStep.AGGREGATE: 4,
 }
-
-
-# SECTION: FUNCTIONS ======================================================== #
-
-
-def coerce_data_connector_type(
-    connector: DataConnectorType | str,
-) -> DataConnectorType:
-    """
-    Normalize textual data connector values to :class:`DataConnectorType`.
-
-    This thin wrapper is kept for backward compatibility; prefer
-    :meth:`DataConnectorType.coerce` going forward.
-    """
-    return DataConnectorType.coerce(connector)
-
-
-def coerce_file_format(
-    file_format: FileFormat | str,
-) -> FileFormat:
-    """
-    Normalize textual file format values to :class:`FileFormat`.
-
-    This thin wrapper is kept for backward compatibility; prefer
-    :meth:`FileFormat.coerce` going forward.
-    """
-    return FileFormat.coerce(file_format)
-
-
-def coerce_compression_format(
-    compression_format: CompressionFormat | str,
-) -> CompressionFormat:
-    """
-    Normalize textual compression format values to :class:`CompressionFormat`.
-
-    This thin wrapper is kept for backward compatibility; prefer
-    :meth:`CompressionFormat.coerce` going forward.
-    """
-    return CompressionFormat.coerce(compression_format)
-
-
-def coerce_http_method(
-    http_method: HttpMethod | str,
-) -> HttpMethod:
-    """
-    Normalize textual HTTP method values to :class:`HttpMethod`.
-
-    This thin wrapper is kept for backward compatibility; prefer
-    :meth:`HttpMethod.coerce` going forward.
-    """
-    return HttpMethod.coerce(http_method)
-
-
-def infer_file_format_and_compression(
-    value: object,
-    filename: object | None = None,
-) -> tuple[FileFormat | None, CompressionFormat | None]:
-    """
-    Infer data format and compression from a filename, extension, or MIME type.
-
-    Parameters
-    ----------
-    value : object
-        A filename, extension, MIME type, or existing enum member.
-    filename : object | None, optional
-        A filename to consult for extension-based inference (e.g. when
-        ``value`` is ``application/octet-stream``).
-
-    Returns
-    -------
-    tuple[FileFormat | None, CompressionFormat | None]
-        The inferred data format and compression, if any.
-    """
-    if isinstance(value, FileFormat):
-        if value in _COMPRESSION_FILE_FORMATS:
-            return None, CompressionFormat.coerce(value.value)
-        return value, None
-    if isinstance(value, CompressionFormat):
-        return None, value
-
-    text = str(value).strip()
-    if not text:
-        return None, None
-
-    normalized = text.casefold()
-    mime = normalized.split(';', 1)[0].strip()
-
-    is_octet_stream = mime == 'application/octet-stream'
-    compression = CompressionFormat.try_coerce(mime)
-    fmt = None if is_octet_stream else FileFormat.try_coerce(mime)
-
-    is_mime = mime.startswith(
-        (
-            'application/',
-            'text/',
-            'audio/',
-            'image/',
-            'video/',
-            'multipart/',
-        ),
-    )
-    suffix_source: object | None = filename if filename is not None else text
-    if is_mime and filename is None:
-        suffix_source = None
-
-    suffixes = (
-        PurePath(str(suffix_source)).suffixes
-        if suffix_source is not None
-        else []
-    )
-    if suffixes:
-        normalized_suffixes = [suffix.casefold() for suffix in suffixes]
-        compression = (
-            CompressionFormat.try_coerce(normalized_suffixes[-1])
-            or compression
-        )
-        if compression is not None:
-            normalized_suffixes = normalized_suffixes[:-1]
-        if normalized_suffixes:
-            fmt = FileFormat.try_coerce(normalized_suffixes[-1]) or fmt
-
-    if fmt in _COMPRESSION_FILE_FORMATS:
-        compression = compression or CompressionFormat.coerce(fmt.value)
-        fmt = None
-
-    return fmt, compression
etlplus/extract.py
CHANGED
@@ -13,11 +13,9 @@ from typing import cast
 import requests  # type: ignore[import]
 
 from .enums import DataConnectorType
-from .enums import FileFormat
 from .enums import HttpMethod
-from .enums import coerce_data_connector_type
-from .enums import coerce_file_format
 from .file import File
+from .file import FileFormat
 from .types import JSONData
 from .types import JSONDict
 from .types import JSONList
@@ -55,7 +53,7 @@ def extract_from_file(
     # If no explicit format is provided, let File infer from extension.
     if file_format is None:
         return File(path, None).read()
-    fmt = coerce_file_format(file_format)
+    fmt = FileFormat.coerce(file_format)
 
     # Let file module perform existence and format validation.
     return File(path, fmt).read()
@@ -202,7 +200,7 @@ def extract(
     ValueError
         If `source_type` is not one of the supported values.
     """
-    match coerce_data_connector_type(source_type):
+    match DataConnectorType.coerce(source_type):
         case DataConnectorType.FILE:
             # Prefer explicit format if provided, else infer from filename.
             return extract_from_file(source, file_format)
@@ -213,6 +211,6 @@ def extract(
             # ``file_format`` is ignored for APIs.
             return extract_from_api(str(source), **kwargs)
         case _:
-            #
-            # explicit guard for defensive programming.
+            # :meth:`coerce` already raises for invalid connector types, but
+            # keep explicit guard for defensive programming.
             raise ValueError(f'Invalid source type: {source_type}')