dataenginex 0.3.4__tar.gz → 0.4.11__tar.gz
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- dataenginex-0.4.11/.gitignore +81 -0
- dataenginex-0.4.11/CHANGELOG.md +138 -0
- dataenginex-0.4.11/PKG-INFO +71 -0
- {dataenginex-0.3.4 → dataenginex-0.4.11}/pyproject.toml +19 -9
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/README.md +1 -1
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/RELEASE_NOTES.md +10 -0
- dataenginex-0.4.11/src/dataenginex/__init__.py +34 -0
- dataenginex-0.4.11/src/dataenginex/api/__init__.py +54 -0
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/api/auth.py +0 -0
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/api/errors.py +0 -0
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/api/health.py +10 -1
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/api/pagination.py +0 -0
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/api/rate_limit.py +0 -0
- dataenginex-0.4.11/src/dataenginex/api/routers/__init__.py +3 -0
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/api/routers/v1.py +35 -19
- dataenginex-0.4.11/src/dataenginex/core/__init__.py +81 -0
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/core/medallion_architecture.py +97 -72
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/core/pipeline_config.py +11 -7
- dataenginex-0.4.11/src/dataenginex/core/quality.py +305 -0
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/core/schemas.py +27 -40
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/core/validators.py +101 -106
- dataenginex-0.4.11/src/dataenginex/data/__init__.py +33 -0
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/data/connectors.py +24 -8
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/data/profiler.py +11 -4
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/data/registry.py +13 -7
- dataenginex-0.4.11/src/dataenginex/lakehouse/__init__.py +38 -0
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/lakehouse/catalog.py +20 -2
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/lakehouse/partitioning.py +0 -0
- dataenginex-0.4.11/src/dataenginex/lakehouse/storage.py +381 -0
- dataenginex-0.4.11/src/dataenginex/middleware/__init__.py +35 -0
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/middleware/logging_config.py +13 -6
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/middleware/metrics.py +20 -4
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/middleware/metrics_middleware.py +31 -9
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/middleware/request_logging.py +8 -1
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/middleware/tracing.py +12 -1
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/ml/__init__.py +20 -12
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/ml/drift.py +14 -1
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/ml/registry.py +17 -2
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/ml/serving.py +19 -4
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/ml/training.py +62 -21
- dataenginex-0.4.11/src/dataenginex/warehouse/__init__.py +40 -0
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/warehouse/lineage.py +18 -1
- {dataenginex-0.3.4 → dataenginex-0.4.11}/src/dataenginex/warehouse/transforms.py +27 -9
- dataenginex-0.3.4/PKG-INFO +0 -66
- dataenginex-0.3.4/src/dataenginex/__init__.py +0 -16
- dataenginex-0.3.4/src/dataenginex/api/__init__.py +0 -11
- dataenginex-0.3.4/src/dataenginex/api/routers/__init__.py +0 -1
- dataenginex-0.3.4/src/dataenginex/core/__init__.py +0 -36
- dataenginex-0.3.4/src/dataenginex/data/__init__.py +0 -22
- dataenginex-0.3.4/src/dataenginex/lakehouse/__init__.py +0 -22
- dataenginex-0.3.4/src/dataenginex/lakehouse/storage.py +0 -177
- dataenginex-0.3.4/src/dataenginex/middleware/__init__.py +0 -19
- dataenginex-0.3.4/src/dataenginex/warehouse/__init__.py +0 -19
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# Byte-compiled / cache
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# Distribution / packaging
|
|
7
|
+
build/
|
|
8
|
+
dist/
|
|
9
|
+
*.egg-info/
|
|
10
|
+
.eggs/
|
|
11
|
+
|
|
12
|
+
# Virtual environments
|
|
13
|
+
.env
|
|
14
|
+
.venv/
|
|
15
|
+
venv/
|
|
16
|
+
ENV/
|
|
17
|
+
env/
|
|
18
|
+
|
|
19
|
+
# TODO
|
|
20
|
+
TODO.md
|
|
21
|
+
|
|
22
|
+
# Installer logs
|
|
23
|
+
pip-log.txt
|
|
24
|
+
pip-delete-this-directory.txt
|
|
25
|
+
|
|
26
|
+
# Unit test / coverage
|
|
27
|
+
.coverage
|
|
28
|
+
coverage.xml
|
|
29
|
+
htmlcov/
|
|
30
|
+
.tox/
|
|
31
|
+
.nox/
|
|
32
|
+
.pytest_cache/
|
|
33
|
+
|
|
34
|
+
# PyInstaller
|
|
35
|
+
*.manifest
|
|
36
|
+
*.spec
|
|
37
|
+
|
|
38
|
+
# Type checkers
|
|
39
|
+
.mypy_cache/
|
|
40
|
+
.pyre/
|
|
41
|
+
.pytype/
|
|
42
|
+
|
|
43
|
+
# Linter cache
|
|
44
|
+
.ruff_cache/
|
|
45
|
+
|
|
46
|
+
# Tool / build cache
|
|
47
|
+
.cache/
|
|
48
|
+
|
|
49
|
+
# IDEs and editors
|
|
50
|
+
.vscode/
|
|
51
|
+
.idea/
|
|
52
|
+
*.sublime-project
|
|
53
|
+
*.sublime-workspace
|
|
54
|
+
|
|
55
|
+
# Track VS Code MCP config only
|
|
56
|
+
!.vscode/
|
|
57
|
+
.vscode/*
|
|
58
|
+
!.vscode/mcp.json
|
|
59
|
+
|
|
60
|
+
# OS files
|
|
61
|
+
.DS_Store
|
|
62
|
+
Thumbs.db
|
|
63
|
+
desktop.ini
|
|
64
|
+
|
|
65
|
+
# Logs and databases
|
|
66
|
+
*.log
|
|
67
|
+
*.sqlite3
|
|
68
|
+
logs/
|
|
69
|
+
|
|
70
|
+
# Wheel metadata
|
|
71
|
+
pip-wheel-metadata/
|
|
72
|
+
|
|
73
|
+
# Python egg
|
|
74
|
+
*.egg
|
|
75
|
+
*.egg-info/
|
|
76
|
+
|
|
77
|
+
# Temporary
|
|
78
|
+
*.tmp
|
|
79
|
+
|
|
80
|
+
# Generated site (Zensical / MkDocs)
|
|
81
|
+
site/
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to `dataenginex` will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.4.11] - 2026-02-27
|
|
11
|
+
|
|
12
|
+
### Changed
|
|
13
|
+
|
|
14
|
+
- Added `environment` label support across HTTP metrics counters/histograms/gauges and middleware emission.
|
|
15
|
+
- Aligned alert rule histogram quantile expressions with explicit bucket aggregation by `le` and `environment`.
|
|
16
|
+
- Standardized docs and release prep metadata for CSV-canonical roadmap and setup workflow updates.
|
|
17
|
+
|
|
18
|
+
## [0.4.10] - 2026-02-21
|
|
19
|
+
|
|
20
|
+
### Added
|
|
21
|
+
|
|
22
|
+
- `examples/` directory with 4 runnable quickstart scripts
|
|
23
|
+
- `01_hello_pipeline.py` — profiler, schema validation, medallion config
|
|
24
|
+
- `02_api_quickstart.py` — FastAPI app with health, v1 router, metrics
|
|
25
|
+
- `03_quality_gate.py` — QualityGate evaluations against layer thresholds
|
|
26
|
+
- `04_ml_training.py` — SklearnTrainer, ModelRegistry, DriftDetector demo
|
|
27
|
+
- `examples/README.md` with table of examples and run instructions
|
|
28
|
+
|
|
29
|
+
## [0.4.8] - 2026-02-21
|
|
30
|
+
|
|
31
|
+
### Added
|
|
32
|
+
|
|
33
|
+
- PySpark local-mode test fixtures in `tests/conftest.py` (session-scoped `spark` session)
|
|
34
|
+
- Sample DataFrame fixtures: `spark_df_jobs`, `spark_df_weather`, `spark_df_empty`
|
|
35
|
+
- `requires_pyspark` skip marker — tests auto-skip when PySpark is not installed
|
|
36
|
+
- `tests/fixtures/sample_data.py` — factory helpers for job, user, and weather records
|
|
37
|
+
- `tests/unit/test_spark_fixtures.py` — validates PySpark fixture behaviour
|
|
38
|
+
|
|
39
|
+
## [0.4.6] - 2026-02-21
|
|
40
|
+
|
|
41
|
+
### Added
|
|
42
|
+
|
|
43
|
+
- `QualityGate` — orchestrates quality checks at medallion layer transitions
|
|
44
|
+
- `QualityStore` — in-memory store accumulating per-layer quality metrics
|
|
45
|
+
- `QualityResult` — immutable dataclass capturing evaluation outcomes
|
|
46
|
+
- `QualityDimension` — StrEnum for named quality dimensions
|
|
47
|
+
- `/api/v1/data/quality/{layer}` endpoint for per-layer quality history
|
|
48
|
+
- `set_quality_store()` / `get_quality_store()` for wiring quality at app startup
|
|
49
|
+
- New exports in `dataenginex.core` and `dataenginex.api`
|
|
50
|
+
|
|
51
|
+
### Changed
|
|
52
|
+
|
|
53
|
+
- `/api/v1/data/quality` now returns live metrics from `QualityStore` (was placeholder zeros)
|
|
54
|
+
- Wired `DataProfiler`, `DataQualityChecks`, and `QualityScorer` into `QualityGate` pipeline
|
|
55
|
+
|
|
56
|
+
## [0.4.5] - 2026-02-21
|
|
57
|
+
|
|
58
|
+
### Added
|
|
59
|
+
|
|
60
|
+
- `StorageBackend` ABC with proper `@abstractmethod` contracts
|
|
61
|
+
- `S3Storage` backend for AWS S3 (requires `boto3`)
|
|
62
|
+
- `GCSStorage` backend for Google Cloud Storage (requires `google-cloud-storage`)
|
|
63
|
+
- Re-exported `StorageBackend` from `dataenginex.lakehouse`
|
|
64
|
+
|
|
65
|
+
### Changed
|
|
66
|
+
|
|
67
|
+
- Refactored `StorageBackend` from plain class to proper `ABC` subclass
|
|
68
|
+
- Updated `lakehouse.__init__` to export all 4 storage backends + ABC
|
|
69
|
+
|
|
70
|
+
## [0.4.3] - 2026-02-21
|
|
71
|
+
|
|
72
|
+
### Added
|
|
73
|
+
|
|
74
|
+
- Comprehensive attribute-level docstrings on all public dataclasses
|
|
75
|
+
- `from __future__ import annotations` in all source modules
|
|
76
|
+
- Module-level class/function inventory docstrings
|
|
77
|
+
- mkdocs API reference configuration with `mkdocstrings` plugin
|
|
78
|
+
- API reference pages for all 7 subpackages under `docs/api-reference/`
|
|
79
|
+
|
|
80
|
+
### Changed
|
|
81
|
+
|
|
82
|
+
- Upgraded mkdocs theme from `mkdocs` to `material`
|
|
83
|
+
- Enhanced module docstrings in middleware, core, and validators
|
|
84
|
+
|
|
85
|
+
## [0.4.1] - 2026-02-21
|
|
86
|
+
|
|
87
|
+
### Added
|
|
88
|
+
|
|
89
|
+
- CHANGELOG.md with Keep a Changelog format
|
|
90
|
+
- Release workflow extracts changelog notes for GitHub Releases automatically
|
|
91
|
+
|
|
92
|
+
### Changed
|
|
93
|
+
|
|
94
|
+
- `release.yml` now reads `packages/dataenginex/CHANGELOG.md` for release notes
|
|
95
|
+
|
|
96
|
+
## [0.4.0] - 2026-02-21
|
|
97
|
+
|
|
98
|
+
### Added
|
|
99
|
+
|
|
100
|
+
- Stable `__all__` exports in every subpackage `__init__.py`
|
|
101
|
+
- `from __future__ import annotations` in all public modules
|
|
102
|
+
- Comprehensive module-level docstrings with usage examples
|
|
103
|
+
- New public API exports: `ComponentHealth`, `AuthMiddleware`, `AuthUser`,
|
|
104
|
+
`create_token`, `decode_token`, `BadRequestError`, `NotFoundError`,
|
|
105
|
+
`PaginationMeta`, `RateLimiter`, `RateLimitMiddleware`,
|
|
106
|
+
`ConnectorStatus`, `FetchResult`, `ColumnProfile`, `get_logger`, `get_tracer`
|
|
107
|
+
|
|
108
|
+
### Changed
|
|
109
|
+
|
|
110
|
+
- Reorganized `__all__` in all subpackages for logical grouping
|
|
111
|
+
- Updated package version to 0.4.0
|
|
112
|
+
|
|
113
|
+
## [0.3.5] - 2026-02-13
|
|
114
|
+
|
|
115
|
+
### Added
|
|
116
|
+
|
|
117
|
+
- Production hardening: structured logging, Prometheus/OTel, health probes
|
|
118
|
+
- Data connectors: `RestConnector`, `FileConnector` with async interface
|
|
119
|
+
- Schema registry with versioned schema management
|
|
120
|
+
- Data profiler with automated dataset statistics
|
|
121
|
+
- Lakehouse catalog, partitioning, and storage backends
|
|
122
|
+
- ML framework: trainer, model registry, drift detection, serving
|
|
123
|
+
- Warehouse transforms and persistent lineage tracking
|
|
124
|
+
- JWT authentication middleware
|
|
125
|
+
- Rate limiting middleware
|
|
126
|
+
- Cursor-based pagination utilities
|
|
127
|
+
- Versioned API router (`/api/v1/`)
|
|
128
|
+
|
|
129
|
+
[Unreleased]: https://github.com/TheDataEngineX/DEX/compare/v0.4.11...HEAD
|
|
130
|
+
[0.4.11]: https://github.com/TheDataEngineX/DEX/compare/v0.4.10...v0.4.11
|
|
131
|
+
[0.4.10]: https://github.com/TheDataEngineX/DEX/compare/v0.4.8...v0.4.10
|
|
132
|
+
[0.4.8]: https://github.com/TheDataEngineX/DEX/compare/v0.4.6...v0.4.8
|
|
133
|
+
[0.4.6]: https://github.com/TheDataEngineX/DEX/compare/v0.4.5...v0.4.6
|
|
134
|
+
[0.4.5]: https://github.com/TheDataEngineX/DEX/compare/v0.4.3...v0.4.5
|
|
135
|
+
[0.4.3]: https://github.com/TheDataEngineX/DEX/compare/v0.4.1...v0.4.3
|
|
136
|
+
[0.4.1]: https://github.com/TheDataEngineX/DEX/compare/v0.4.0...v0.4.1
|
|
137
|
+
[0.4.0]: https://github.com/TheDataEngineX/DEX/compare/v0.3.5...v0.4.0
|
|
138
|
+
[0.3.5]: https://github.com/TheDataEngineX/DEX/releases/tag/v0.3.5
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dataenginex
|
|
3
|
+
Version: 0.4.11
|
|
4
|
+
Summary: DataEngineX - Core framework for data engineering projects
|
|
5
|
+
Author-email: Jay <jayapal.myaka99@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.11
|
|
8
|
+
Requires-Dist: email-validator>=2.0.0
|
|
9
|
+
Requires-Dist: fastapi>=0.128.4
|
|
10
|
+
Requires-Dist: httpx>=0.28.0
|
|
11
|
+
Requires-Dist: loguru>=0.7.3
|
|
12
|
+
Requires-Dist: opentelemetry-api>=1.39.0
|
|
13
|
+
Requires-Dist: opentelemetry-exporter-otlp>=1.39.0
|
|
14
|
+
Requires-Dist: opentelemetry-instrumentation-fastapi>=0.60b1
|
|
15
|
+
Requires-Dist: opentelemetry-sdk>=1.39.0
|
|
16
|
+
Requires-Dist: prometheus-client>=0.24.0
|
|
17
|
+
Requires-Dist: python-dotenv>=1.2.0
|
|
18
|
+
Requires-Dist: python-json-logger>=4.0.0
|
|
19
|
+
Requires-Dist: pyyaml>=6.0.2
|
|
20
|
+
Requires-Dist: structlog>=25.5.0
|
|
21
|
+
Requires-Dist: uvicorn>=0.40.0
|
|
22
|
+
Provides-Extra: all
|
|
23
|
+
Requires-Dist: boto3>=1.35.0; extra == 'all'
|
|
24
|
+
Requires-Dist: google-cloud-storage>=2.18.0; extra == 'all'
|
|
25
|
+
Requires-Dist: pyarrow>=18.0.0; extra == 'all'
|
|
26
|
+
Provides-Extra: cloud
|
|
27
|
+
Requires-Dist: boto3>=1.35.0; extra == 'cloud'
|
|
28
|
+
Requires-Dist: google-cloud-storage>=2.18.0; extra == 'cloud'
|
|
29
|
+
Provides-Extra: gcs
|
|
30
|
+
Requires-Dist: google-cloud-storage>=2.18.0; extra == 'gcs'
|
|
31
|
+
Provides-Extra: parquet
|
|
32
|
+
Requires-Dist: pyarrow>=18.0.0; extra == 'parquet'
|
|
33
|
+
Provides-Extra: s3
|
|
34
|
+
Requires-Dist: boto3>=1.35.0; extra == 's3'
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
|
|
37
|
+
# dataenginex
|
|
38
|
+
|
|
39
|
+
`dataenginex` is the core DataEngineX framework package for building observable, production-ready data and API services.
|
|
40
|
+
|
|
41
|
+
It provides:
|
|
42
|
+
- FastAPI application primitives and API extensions
|
|
43
|
+
- Middleware for structured logging, metrics, and tracing
|
|
44
|
+
- Data quality and validation utilities
|
|
45
|
+
- Lakehouse and warehouse building blocks
|
|
46
|
+
- Reusable ML support modules for model-serving workflows
|
|
47
|
+
|
|
48
|
+
## Install
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install dataenginex
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Package Scope
|
|
55
|
+
|
|
56
|
+
This package is the core library from the DEX monorepo.
|
|
57
|
+
`careerdex` and `weatherdex` are maintained in the same repository but are not part of this package release flow.
|
|
58
|
+
|
|
59
|
+
## Quick Usage
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from dataenginex import __version__
|
|
63
|
+
|
|
64
|
+
print(__version__)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Source and Docs
|
|
68
|
+
|
|
69
|
+
- Repository: https://github.com/TheDataEngineX/DEX
|
|
70
|
+
- CI/CD guide: `docs/CI_CD.md`
|
|
71
|
+
- Release notes: `packages/dataenginex/src/dataenginex/RELEASE_NOTES.md`
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "dataenginex"
|
|
3
|
-
version = "0.3.4"
|
|
3
|
+
version = "0.4.11"
|
|
4
4
|
description = "DataEngineX - Core framework for data engineering projects"
|
|
5
5
|
authors = [
|
|
6
6
|
{name = "Jay", email = "jayapal.myaka99@gmail.com"}
|
|
@@ -25,13 +25,23 @@ dependencies = [
|
|
|
25
25
|
"httpx>=0.28.0",
|
|
26
26
|
]
|
|
27
27
|
|
|
28
|
-
[
|
|
29
|
-
|
|
30
|
-
|
|
28
|
+
[project.optional-dependencies]
|
|
29
|
+
s3 = ["boto3>=1.35.0"]
|
|
30
|
+
gcs = ["google-cloud-storage>=2.18.0"]
|
|
31
|
+
cloud = [
|
|
32
|
+
"boto3>=1.35.0",
|
|
33
|
+
"google-cloud-storage>=2.18.0",
|
|
34
|
+
]
|
|
35
|
+
parquet = ["pyarrow>=18.0.0"]
|
|
36
|
+
all = [
|
|
37
|
+
"boto3>=1.35.0",
|
|
38
|
+
"google-cloud-storage>=2.18.0",
|
|
39
|
+
"pyarrow>=18.0.0",
|
|
40
|
+
]
|
|
31
41
|
|
|
32
|
-
[
|
|
33
|
-
|
|
42
|
+
[build-system]
|
|
43
|
+
requires = ["hatchling>=1.25.0"]
|
|
44
|
+
build-backend = "hatchling.build"
|
|
34
45
|
|
|
35
|
-
[
|
|
36
|
-
|
|
37
|
-
from = "src"
|
|
46
|
+
[tool.hatch.build.targets.wheel]
|
|
47
|
+
packages = ["src/dataenginex"]
|
|
@@ -30,6 +30,6 @@ print(__version__)
|
|
|
30
30
|
|
|
31
31
|
## Source and Docs
|
|
32
32
|
|
|
33
|
-
- Repository: https://github.com/
|
|
33
|
+
- Repository: https://github.com/TheDataEngineX/DEX
|
|
34
34
|
- CI/CD guide: `docs/CI_CD.md`
|
|
35
35
|
- Release notes: `packages/dataenginex/src/dataenginex/RELEASE_NOTES.md`
|
|
@@ -3,6 +3,16 @@
|
|
|
3
3
|
This document tracks published package releases for `dataenginex` only.
|
|
4
4
|
Only include changes that modify files under `packages/dataenginex/src/dataenginex/**`.
|
|
5
5
|
|
|
6
|
+
## v0.3.5 - 2026-02-20
|
|
7
|
+
|
|
8
|
+
- Released package version `0.3.5`.
|
|
9
|
+
- Tag: `v0.3.5`
|
|
10
|
+
- Release title: `Release v0.3.5`
|
|
11
|
+
- Changes in this release:
|
|
12
|
+
- Hardened `PyPI Publish` workflow for trusted publishing.
|
|
13
|
+
- Added release-only publish gating and build-only behavior for manual dispatch.
|
|
14
|
+
- Switched publish job environment names to repo vars for stricter workflow validation compatibility.
|
|
15
|
+
|
|
6
16
|
## v0.3.4 - 2026-02-20
|
|
7
17
|
|
|
8
18
|
- Released package version `0.3.4`.
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DataEngineX (DEX) — Core framework for data engineering projects.
|
|
3
|
+
|
|
4
|
+
Public API surface. Import from top-level or from subpackages:
|
|
5
|
+
|
|
6
|
+
from dataenginex import __version__
|
|
7
|
+
from dataenginex.api import HealthChecker, HealthStatus
|
|
8
|
+
from dataenginex.core import MedallionArchitecture, DataLayer
|
|
9
|
+
from dataenginex.data import DataConnector, DataProfiler, SchemaRegistry
|
|
10
|
+
from dataenginex.lakehouse import DataCatalog, ParquetStorage
|
|
11
|
+
from dataenginex.middleware import configure_logging, configure_tracing
|
|
12
|
+
from dataenginex.ml import ModelRegistry, SklearnTrainer, DriftDetector
|
|
13
|
+
from dataenginex.warehouse import PersistentLineage, TransformPipeline
|
|
14
|
+
|
|
15
|
+
Submodules:
|
|
16
|
+
api – FastAPI application, health checks, error handling, pagination
|
|
17
|
+
core – Schemas, validators, medallion architecture, pipeline config
|
|
18
|
+
data – Data connectors, profiler, schema registry
|
|
19
|
+
lakehouse – Storage backends, data catalog, partitioning
|
|
20
|
+
middleware – Logging, metrics, tracing, request middleware
|
|
21
|
+
ml – ML training, model registry, drift detection, serving
|
|
22
|
+
warehouse – Transforms, persistent lineage tracking
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
28
|
+
|
|
29
|
+
try:
|
|
30
|
+
__version__ = version("dataenginex")
|
|
31
|
+
except PackageNotFoundError:
|
|
32
|
+
__version__ = "0.4.11"
|
|
33
|
+
|
|
34
|
+
__all__ = ["__version__"]
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Reusable API components — auth, health, errors, pagination, rate limiting, quality.
|
|
2
|
+
|
|
3
|
+
Public API::
|
|
4
|
+
|
|
5
|
+
from dataenginex.api import (
|
|
6
|
+
HealthChecker, HealthStatus, ComponentHealth,
|
|
7
|
+
APIHTTPException, BadRequestError, NotFoundError, ServiceUnavailableError,
|
|
8
|
+
PaginatedResponse, PaginationMeta, paginate,
|
|
9
|
+
AuthMiddleware, AuthUser, create_token, decode_token,
|
|
10
|
+
RateLimiter, RateLimitMiddleware,
|
|
11
|
+
get_quality_store, set_quality_store,
|
|
12
|
+
)
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from .auth import AuthMiddleware, AuthUser, create_token, decode_token
|
|
18
|
+
from .errors import (
|
|
19
|
+
APIHTTPException,
|
|
20
|
+
BadRequestError,
|
|
21
|
+
NotFoundError,
|
|
22
|
+
ServiceUnavailableError,
|
|
23
|
+
)
|
|
24
|
+
from .health import ComponentHealth, HealthChecker, HealthStatus
|
|
25
|
+
from .pagination import PaginatedResponse, PaginationMeta, paginate
|
|
26
|
+
from .rate_limit import RateLimiter, RateLimitMiddleware
|
|
27
|
+
from .routers.v1 import get_quality_store, set_quality_store
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
# Auth
|
|
31
|
+
"AuthMiddleware",
|
|
32
|
+
"AuthUser",
|
|
33
|
+
"create_token",
|
|
34
|
+
"decode_token",
|
|
35
|
+
# Errors
|
|
36
|
+
"APIHTTPException",
|
|
37
|
+
"BadRequestError",
|
|
38
|
+
"NotFoundError",
|
|
39
|
+
"ServiceUnavailableError",
|
|
40
|
+
# Health
|
|
41
|
+
"ComponentHealth",
|
|
42
|
+
"HealthChecker",
|
|
43
|
+
"HealthStatus",
|
|
44
|
+
# Pagination
|
|
45
|
+
"PaginatedResponse",
|
|
46
|
+
"PaginationMeta",
|
|
47
|
+
"paginate",
|
|
48
|
+
# Quality store
|
|
49
|
+
"get_quality_store",
|
|
50
|
+
"set_quality_store",
|
|
51
|
+
# Rate limiting
|
|
52
|
+
"RateLimiter",
|
|
53
|
+
"RateLimitMiddleware",
|
|
54
|
+
]
|
|
File without changes
|
|
File without changes
|
|
@@ -22,6 +22,15 @@ class HealthStatus(StrEnum):
|
|
|
22
22
|
|
|
23
23
|
@dataclass(frozen=True)
|
|
24
24
|
class ComponentHealth:
|
|
25
|
+
"""Health status of a single dependency component.
|
|
26
|
+
|
|
27
|
+
Attributes:
|
|
28
|
+
name: Component identifier (e.g. ``"database"``, ``"cache"``).
|
|
29
|
+
status: Current health status.
|
|
30
|
+
message: Optional human-readable message.
|
|
31
|
+
duration_ms: Time taken for the health check in milliseconds.
|
|
32
|
+
"""
|
|
33
|
+
|
|
25
34
|
name: str
|
|
26
35
|
status: HealthStatus
|
|
27
36
|
message: str | None = None
|
|
@@ -129,5 +138,5 @@ class HealthChecker:
|
|
|
129
138
|
asyncio.open_connection(host, port), timeout=self.timeout_seconds
|
|
130
139
|
)
|
|
131
140
|
return True, "reachable"
|
|
132
|
-
except (TimeoutError, OSError) as exc:
|
|
141
|
+
except (TimeoutError, ConnectionRefusedError, OSError) as exc:
|
|
133
142
|
return False, f"error={exc.__class__.__name__}"
|
|
File without changes
|
|
File without changes
|
|
@@ -12,9 +12,25 @@ from typing import Any
|
|
|
12
12
|
from fastapi import APIRouter
|
|
13
13
|
|
|
14
14
|
from dataenginex.api.pagination import PaginatedResponse, paginate
|
|
15
|
+
from dataenginex.core.quality import QualityStore
|
|
15
16
|
|
|
16
17
|
router = APIRouter(prefix="/api/v1", tags=["v1"])
|
|
17
18
|
|
|
19
|
+
# Module-level quality store — shared across requests.
|
|
20
|
+
# Populate via ``set_quality_store()`` from application startup.
|
|
21
|
+
_quality_store: QualityStore = QualityStore()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def set_quality_store(store: QualityStore) -> None:
|
|
25
|
+
"""Replace the module-level quality store (call at app startup)."""
|
|
26
|
+
global _quality_store # noqa: PLW0603
|
|
27
|
+
_quality_store = store
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def get_quality_store() -> QualityStore:
|
|
31
|
+
"""Return the active quality store."""
|
|
32
|
+
return _quality_store
|
|
33
|
+
|
|
18
34
|
|
|
19
35
|
# ---------------------------------------------------------------------------
|
|
20
36
|
# Data pipeline endpoints
|
|
@@ -35,28 +51,28 @@ def list_data_sources(cursor: str | None = None, limit: int = 20) -> PaginatedRe
|
|
|
35
51
|
|
|
36
52
|
@router.get("/data/quality")
|
|
37
53
|
def data_quality_summary() -> dict[str, Any]:
|
|
38
|
-
"""Return a summary of data quality metrics.
|
|
54
|
+
"""Return a summary of data quality metrics from the quality store.
|
|
55
|
+
|
|
56
|
+
Returns live metrics when a ``QualityStore`` has been populated via
|
|
57
|
+
``QualityGate.evaluate()``. Falls back to zeros when no evaluations
|
|
58
|
+
have been recorded yet.
|
|
59
|
+
"""
|
|
60
|
+
return _quality_store.summary()
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@router.get("/data/quality/{layer}")
|
|
64
|
+
def data_quality_layer(layer: str, limit: int = 10) -> dict[str, Any]:
|
|
65
|
+
"""Return quality history for a specific medallion layer.
|
|
39
66
|
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
67
|
+
Args:
|
|
68
|
+
layer: One of ``bronze``, ``silver``, ``gold``.
|
|
69
|
+
limit: Maximum number of history entries to return.
|
|
43
70
|
"""
|
|
44
|
-
|
|
71
|
+
latest = _quality_store.latest(layer)
|
|
45
72
|
return {
|
|
46
|
-
"
|
|
47
|
-
"
|
|
48
|
-
|
|
49
|
-
"accuracy": 0.0,
|
|
50
|
-
"consistency": 0.0,
|
|
51
|
-
"timeliness": 0.0,
|
|
52
|
-
"uniqueness": 0.0,
|
|
53
|
-
},
|
|
54
|
-
"layer_scores": {
|
|
55
|
-
"bronze": 0.0,
|
|
56
|
-
"silver": 0.0,
|
|
57
|
-
"gold": 0.0,
|
|
58
|
-
},
|
|
59
|
-
"_note": "Placeholder — connect quality store for live data",
|
|
73
|
+
"layer": layer,
|
|
74
|
+
"latest": latest.to_dict() if latest else None,
|
|
75
|
+
"history": _quality_store.history(layer, limit=limit),
|
|
60
76
|
}
|
|
61
77
|
|
|
62
78
|
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Core framework — schemas, validators, medallion architecture, pipeline config, quality.
|
|
2
|
+
|
|
3
|
+
Public API::
|
|
4
|
+
|
|
5
|
+
from dataenginex.core import (
|
|
6
|
+
# Medallion
|
|
7
|
+
MedallionArchitecture, DataLayer, StorageFormat, LayerConfiguration,
|
|
8
|
+
# Pipeline
|
|
9
|
+
PipelineConfig, PipelineMetrics,
|
|
10
|
+
# Quality
|
|
11
|
+
QualityGate, QualityStore, QualityResult, QualityDimension,
|
|
12
|
+
# Schemas
|
|
13
|
+
JobPosting, JobSourceEnum, UserProfile,
|
|
14
|
+
ErrorDetail, ErrorResponse, RootResponse, HealthResponse,
|
|
15
|
+
DataQualityReport, PipelineExecutionMetadata,
|
|
16
|
+
# Validators
|
|
17
|
+
SchemaValidator, DataQualityChecks, DataHash,
|
|
18
|
+
QualityScorer, ValidationReport,
|
|
19
|
+
)
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
from .medallion_architecture import (
|
|
25
|
+
DataLayer,
|
|
26
|
+
LayerConfiguration,
|
|
27
|
+
MedallionArchitecture,
|
|
28
|
+
StorageFormat,
|
|
29
|
+
)
|
|
30
|
+
from .pipeline_config import PipelineConfig, PipelineMetrics
|
|
31
|
+
from .quality import QualityDimension, QualityGate, QualityResult, QualityStore
|
|
32
|
+
from .schemas import (
|
|
33
|
+
DataQualityReport,
|
|
34
|
+
ErrorDetail,
|
|
35
|
+
ErrorResponse,
|
|
36
|
+
HealthResponse,
|
|
37
|
+
JobPosting,
|
|
38
|
+
JobSourceEnum,
|
|
39
|
+
PipelineExecutionMetadata,
|
|
40
|
+
RootResponse,
|
|
41
|
+
UserProfile,
|
|
42
|
+
)
|
|
43
|
+
from .validators import (
|
|
44
|
+
DataHash,
|
|
45
|
+
DataQualityChecks,
|
|
46
|
+
QualityScorer,
|
|
47
|
+
SchemaValidator,
|
|
48
|
+
ValidationReport,
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
__all__ = [
|
|
52
|
+
# Medallion architecture
|
|
53
|
+
"DataLayer",
|
|
54
|
+
"LayerConfiguration",
|
|
55
|
+
"MedallionArchitecture",
|
|
56
|
+
"StorageFormat",
|
|
57
|
+
# Pipeline
|
|
58
|
+
"PipelineConfig",
|
|
59
|
+
"PipelineMetrics",
|
|
60
|
+
# Quality gate
|
|
61
|
+
"QualityDimension",
|
|
62
|
+
"QualityGate",
|
|
63
|
+
"QualityResult",
|
|
64
|
+
"QualityStore",
|
|
65
|
+
# Schemas
|
|
66
|
+
"DataQualityReport",
|
|
67
|
+
"ErrorDetail",
|
|
68
|
+
"ErrorResponse",
|
|
69
|
+
"HealthResponse",
|
|
70
|
+
"JobPosting",
|
|
71
|
+
"JobSourceEnum",
|
|
72
|
+
"PipelineExecutionMetadata",
|
|
73
|
+
"RootResponse",
|
|
74
|
+
"UserProfile",
|
|
75
|
+
# Validators
|
|
76
|
+
"DataHash",
|
|
77
|
+
"DataQualityChecks",
|
|
78
|
+
"QualityScorer",
|
|
79
|
+
"SchemaValidator",
|
|
80
|
+
"ValidationReport",
|
|
81
|
+
]
|