dataenginex 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataenginex/README.md +35 -0
- dataenginex/RELEASE_NOTES.md +38 -0
- dataenginex/__init__.py +16 -0
- dataenginex/api/__init__.py +11 -0
- dataenginex/api/auth.py +173 -0
- dataenginex/api/errors.py +70 -0
- dataenginex/api/health.py +133 -0
- dataenginex/api/pagination.py +94 -0
- dataenginex/api/rate_limit.py +122 -0
- dataenginex/api/routers/__init__.py +1 -0
- dataenginex/api/routers/v1.py +113 -0
- dataenginex/core/__init__.py +36 -0
- dataenginex/core/medallion_architecture.py +414 -0
- dataenginex/core/pipeline_config.py +111 -0
- dataenginex/core/schemas.py +304 -0
- dataenginex/core/validators.py +394 -0
- dataenginex/data/__init__.py +22 -0
- dataenginex/data/connectors.py +332 -0
- dataenginex/data/profiler.py +217 -0
- dataenginex/data/registry.py +148 -0
- dataenginex/lakehouse/__init__.py +22 -0
- dataenginex/lakehouse/catalog.py +145 -0
- dataenginex/lakehouse/partitioning.py +99 -0
- dataenginex/lakehouse/storage.py +177 -0
- dataenginex/middleware/__init__.py +19 -0
- dataenginex/middleware/logging_config.py +137 -0
- dataenginex/middleware/metrics.py +45 -0
- dataenginex/middleware/metrics_middleware.py +61 -0
- dataenginex/middleware/request_logging.py +77 -0
- dataenginex/middleware/tracing.py +87 -0
- dataenginex/ml/__init__.py +28 -0
- dataenginex/ml/drift.py +165 -0
- dataenginex/ml/registry.py +156 -0
- dataenginex/ml/serving.py +141 -0
- dataenginex/ml/training.py +205 -0
- dataenginex/warehouse/__init__.py +19 -0
- dataenginex/warehouse/lineage.py +164 -0
- dataenginex/warehouse/transforms.py +206 -0
- dataenginex-0.3.4.dist-info/METADATA +66 -0
- dataenginex-0.3.4.dist-info/RECORD +41 -0
- dataenginex-0.3.4.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Transform framework — declarative, composable data transformations.
|
|
3
|
+
|
|
4
|
+
Each ``Transform`` is a named, reusable function that maps a record dict to a
|
|
5
|
+
new record dict. ``TransformPipeline`` chains multiple transforms and collects
|
|
6
|
+
per-step metrics for observability.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import contextlib
|
|
12
|
+
import time
|
|
13
|
+
from abc import ABC, abstractmethod
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from datetime import UTC, datetime
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
from loguru import logger
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
|
|
22
|
+
class TransformResult:
|
|
23
|
+
"""Outcome of running a pipeline over a batch of records."""
|
|
24
|
+
|
|
25
|
+
input_count: int
|
|
26
|
+
output_count: int
|
|
27
|
+
dropped_count: int = 0
|
|
28
|
+
error_count: int = 0
|
|
29
|
+
duration_ms: float = 0.0
|
|
30
|
+
records: list[dict[str, Any]] = field(default_factory=list)
|
|
31
|
+
step_metrics: list[dict[str, Any]] = field(default_factory=list)
|
|
32
|
+
completed_at: datetime = field(default_factory=lambda: datetime.now(tz=UTC))
|
|
33
|
+
|
|
34
|
+
@property
|
|
35
|
+
def success_rate(self) -> float:
|
|
36
|
+
return self.output_count / self.input_count if self.input_count else 0.0
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
# Single transform
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
|
|
43
|
+
class Transform(ABC):
|
|
44
|
+
"""Base class for a single data transform step.
|
|
45
|
+
|
|
46
|
+
Subclass and implement ``apply`` which receives one record and returns
|
|
47
|
+
either a transformed record or *None* to drop it.
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
def __init__(self, name: str, description: str = "") -> None:
|
|
51
|
+
self.name = name
|
|
52
|
+
self.description = description
|
|
53
|
+
|
|
54
|
+
@abstractmethod
|
|
55
|
+
def apply(self, record: dict[str, Any]) -> dict[str, Any] | None:
|
|
56
|
+
"""Transform *record* in place or return a new dict.
|
|
57
|
+
|
|
58
|
+
Return *None* to drop the record from the pipeline.
|
|
59
|
+
"""
|
|
60
|
+
...
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# ---------------------------------------------------------------------------
|
|
64
|
+
# Built-in transforms
|
|
65
|
+
# ---------------------------------------------------------------------------
|
|
66
|
+
|
|
67
|
+
class RenameFieldsTransform(Transform):
    """Rename record keys according to an old-name -> new-name mapping."""

    def __init__(self, mapping: dict[str, str]) -> None:
        super().__init__(name="rename_fields", description="Rename record fields")
        self.mapping = mapping

    def apply(self, record: dict[str, Any]) -> dict[str, Any]:
        renamed = dict(record)
        # pop-then-set: the source key is removed, and when the target key
        # already exists it is overwritten with the source value
        for src, dst in self.mapping.items():
            if src not in renamed:
                continue
            renamed[dst] = renamed.pop(src)
        return renamed
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class DropNullsTransform(Transform):
    """Filter out records where any required field is missing or *None*."""

    def __init__(self, required_fields: list[str]) -> None:
        super().__init__(name="drop_nulls", description="Drop records with null required fields")
        self.required_fields = required_fields

    def apply(self, record: dict[str, Any]) -> dict[str, Any] | None:
        # record.get(name) is None covers both an absent key and an explicit null
        has_null = any(record.get(name) is None for name in self.required_fields)
        return None if has_null else record
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class CastTypesTransform(Transform):
    """Cast selected fields to a supported scalar type.

    Supported target names: ``int``, ``float``, ``str``, ``bool``. Missing
    keys, *None* values, unknown target names, and values that fail to cast
    are all left untouched.
    """

    _CASTERS: dict[str, type] = {"int": int, "float": float, "str": str, "bool": bool}

    def __init__(self, type_map: dict[str, str]) -> None:
        super().__init__(name="cast_types", description="Cast field types")
        self.type_map = type_map

    def apply(self, record: dict[str, Any]) -> dict[str, Any]:
        out = dict(record)
        for key, type_name in self.type_map.items():
            value = out.get(key)
            if value is None:
                continue
            caster = self._CASTERS.get(type_name)
            if caster is None:
                continue
            # NOTE(review): bool("false") is True — string-to-bool casting may
            # surprise callers; behavior deliberately preserved as-is.
            with contextlib.suppress(ValueError, TypeError):
                out[key] = caster(value)
        return out
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class AddTimestampTransform(Transform):
    """Stamp every record with the current UTC time as an ISO-8601 string."""

    def __init__(self, field_name: str = "processed_at") -> None:
        super().__init__(name="add_timestamp", description="Add processing timestamp")
        self.field_name = field_name

    def apply(self, record: dict[str, Any]) -> dict[str, Any]:
        stamp = datetime.now(tz=UTC).isoformat()
        # dict-splat copy; an existing timestamp field is overwritten in place
        return {**record, self.field_name: stamp}
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class FilterTransform(Transform):
    """Keep only records for which the predicate returns a truthy value.

    ``predicate`` is any callable taking a record dict and returning ``True``
    (or any truthy value) to keep the record.
    """

    def __init__(self, name: str, predicate: Any) -> None:
        super().__init__(name=name, description="Filter records by predicate")
        self._predicate = predicate

    def apply(self, record: dict[str, Any]) -> dict[str, Any] | None:
        return record if self._predicate(record) else None
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# ---------------------------------------------------------------------------
|
|
146
|
+
# Pipeline
|
|
147
|
+
# ---------------------------------------------------------------------------
|
|
148
|
+
|
|
149
|
+
class TransformPipeline:
    """Execute an ordered chain of ``Transform`` steps over a batch.

    Records flow through the steps in insertion order. A step drops a record
    by returning *None*; a step that raises on a record has the exception
    logged, the record removed from the batch, and the failure counted in
    ``error_count`` / the per-step ``"errors"`` metric.

    Example::

        pipeline = TransformPipeline("bronze_to_silver")
        pipeline.add(DropNullsTransform(["job_id", "company_name"]))
        pipeline.add(CastTypesTransform({"salary_min": "float"}))
        result = pipeline.run(records)
    """

    def __init__(self, name: str) -> None:
        self.name = name
        self._steps: list[Transform] = []

    def add(self, transform: Transform) -> TransformPipeline:
        """Append *transform* to the chain; returns ``self`` for fluent chaining."""
        self._steps.append(transform)
        return self

    def run(self, records: list[dict[str, Any]]) -> TransformResult:
        """Run every step over *records*; return counts, per-step metrics, survivors.

        An exception raised by a step on an individual record is caught and
        logged, and the failing record is excluded from downstream steps.
        """
        start = time.perf_counter()
        current = list(records)
        step_metrics: list[dict[str, Any]] = []
        total_errors = 0

        for step in self._steps:
            step_start = time.perf_counter()
            before_count = len(current)
            step_errors = 0
            output: list[dict[str, Any]] = []

            for rec in current:
                try:
                    result = step.apply(rec)
                except Exception as exc:
                    # Loguru formats with str.format-style "{}" placeholders,
                    # not printf "%s" — the previous call emitted the literal
                    # "%s" text and silently discarded the arguments.
                    step_errors += 1
                    logger.warning(
                        "Transform {} failed on record: {}", step.name, exc,
                    )
                else:
                    if result is not None:
                        output.append(result)

            total_errors += step_errors
            step_duration = (time.perf_counter() - step_start) * 1000
            step_metrics.append({
                "step": step.name,
                "input_count": before_count,
                "output_count": len(output),
                "dropped": before_count - len(output),  # includes errored records
                "errors": step_errors,
                "duration_ms": round(step_duration, 2),
            })
            current = output

        total_duration = (time.perf_counter() - start) * 1000
        return TransformResult(
            input_count=len(records),
            output_count=len(current),
            dropped_count=len(records) - len(current),
            # Previously always left at its default 0: exceptions were caught
            # but never counted, so callers could not see failure volume.
            error_count=total_errors,
            duration_ms=round(total_duration, 2),
            records=current,
            step_metrics=step_metrics,
        )
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dataenginex
|
|
3
|
+
Version: 0.3.4
|
|
4
|
+
Summary: DataEngineX - Core framework for data engineering projects
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: Jay
|
|
7
|
+
Author-email: jayapal.myaka99@gmail.com
|
|
8
|
+
Requires-Python: >=3.11
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
15
|
+
Requires-Dist: email-validator (>=2.0.0)
|
|
16
|
+
Requires-Dist: fastapi (>=0.128.4)
|
|
17
|
+
Requires-Dist: httpx (>=0.28.0)
|
|
18
|
+
Requires-Dist: loguru (>=0.7.3)
|
|
19
|
+
Requires-Dist: opentelemetry-api (>=1.39.0)
|
|
20
|
+
Requires-Dist: opentelemetry-exporter-otlp (>=1.39.0)
|
|
21
|
+
Requires-Dist: opentelemetry-instrumentation-fastapi (>=0.60b1)
|
|
22
|
+
Requires-Dist: opentelemetry-sdk (>=1.39.0)
|
|
23
|
+
Requires-Dist: prometheus-client (>=0.24.0)
|
|
24
|
+
Requires-Dist: python-dotenv (>=1.2.0)
|
|
25
|
+
Requires-Dist: python-json-logger (>=4.0.0)
|
|
26
|
+
Requires-Dist: pyyaml (>=6.0.2)
|
|
27
|
+
Requires-Dist: structlog (>=25.5.0)
|
|
28
|
+
Requires-Dist: uvicorn (>=0.40.0)
|
|
29
|
+
Description-Content-Type: text/markdown
|
|
30
|
+
|
|
31
|
+
# dataenginex
|
|
32
|
+
|
|
33
|
+
`dataenginex` is the core DataEngineX framework package for building observable, production-ready data and API services.
|
|
34
|
+
|
|
35
|
+
It provides:
|
|
36
|
+
- FastAPI application primitives and API extensions
|
|
37
|
+
- Middleware for structured logging, metrics, and tracing
|
|
38
|
+
- Data quality and validation utilities
|
|
39
|
+
- Lakehouse and warehouse building blocks
|
|
40
|
+
- Reusable ML support modules for model-serving workflows
|
|
41
|
+
|
|
42
|
+
## Install
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install dataenginex
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Package Scope
|
|
49
|
+
|
|
50
|
+
This package is the core library from the DEX monorepo.
|
|
51
|
+
`careerdex` and `weatherdex` are maintained in the same repository but are not part of this package release flow.
|
|
52
|
+
|
|
53
|
+
## Quick Usage
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from dataenginex import __version__
|
|
57
|
+
|
|
58
|
+
print(__version__)
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Source and Docs
|
|
62
|
+
|
|
63
|
+
- Repository: https://github.com/data-literate/DEX
|
|
64
|
+
- CI/CD guide: `docs/CI_CD.md`
|
|
65
|
+
- Release notes: `packages/dataenginex/src/dataenginex/RELEASE_NOTES.md`
|
|
66
|
+
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
dataenginex/README.md,sha256=CA8sEQ6QCFmmx5vE6lkRvGtf5N6zGG6aRi4vz7WpFsA,901
|
|
2
|
+
dataenginex/RELEASE_NOTES.md,sha256=-jqu-JCcbRfa8Yry0qZ6yEl2nZ7P5PBzbYGpHj8GiNY,1812
|
|
3
|
+
dataenginex/__init__.py,sha256=Y1UcBcb91_9R1emZaCoObxDovJkCo3e2umfjfMe6zZs,469
|
|
4
|
+
dataenginex/api/__init__.py,sha256=z9YbGR8qY9qmJ4Gnx2mknT7UsncsUlG5aK5gbAYGM84,336
|
|
5
|
+
dataenginex/api/auth.py,sha256=0MrtA6imcwAVynrXWUvEhQhaTuTocR8hxVc-FaSVBxw,5759
|
|
6
|
+
dataenginex/api/errors.py,sha256=hDO4LsAiQqjyglccSj53FDSkLjo9lhjR2Twr9TDzvME,1843
|
|
7
|
+
dataenginex/api/health.py,sha256=OTbY0kmy14KYoX463pFY0HYxhqtHfhZpwReY9GpIT20,4337
|
|
8
|
+
dataenginex/api/pagination.py,sha256=dzHSXDXsjYiL8HzOXQR5wNTRYVqCzbCrb-ZE17EpWEQ,2693
|
|
9
|
+
dataenginex/api/rate_limit.py,sha256=OajTPGiLCptoRnlH1-rX2cqTJjJJxe0QGd5R3wKumlI,4017
|
|
10
|
+
dataenginex/api/routers/__init__.py,sha256=6pzSoxY7glXyKAfCChT-ui-N64qJkdEUlyR1ibN1Kyw,28
|
|
11
|
+
dataenginex/api/routers/v1.py,sha256=HimY-vbmI8JoLSkQZxVqs5KGWsx_WLdUEp3xNGj6E9g,3831
|
|
12
|
+
dataenginex/core/__init__.py,sha256=VMw1YlZoHSXarP5FyftrvgF0-7pJ-SUQvb3ZonlroZM,860
|
|
13
|
+
dataenginex/core/medallion_architecture.py,sha256=Ai2wDxbPYAU4I8h61zk8skw7RkFLbZr5iLp7ZpXPXiI,14562
|
|
14
|
+
dataenginex/core/pipeline_config.py,sha256=DM7v0-b4DBmXd8Bn4RqsQdWJlLsIxjytEBeasnYCmxQ,3860
|
|
15
|
+
dataenginex/core/schemas.py,sha256=hyIvim0rRzAk0OceqXbAhMp0i8QM_AUq4zFzVy07Q7g,9599
|
|
16
|
+
dataenginex/core/validators.py,sha256=btaOKhrAan_RWY7cD9Kdhr99EFz_khIqy7LKy-Et8dg,12872
|
|
17
|
+
dataenginex/data/__init__.py,sha256=EyZ1_MRGQqbgE0Ss1EtJOa7n2Ga9n8LmPpb6d92XNaY,637
|
|
18
|
+
dataenginex/data/connectors.py,sha256=5uQ2TAUGsoLwywnrhnNL-PFqpBZPcgZKlSninH9LI1M,10586
|
|
19
|
+
dataenginex/data/profiler.py,sha256=PnYDavjHfSoBGInyd2WN_uFkASzTDTlbUHZIDrPABEc,7175
|
|
20
|
+
dataenginex/data/registry.py,sha256=Ljlra7PV2-lq6vy7OXSs1vHkZ5rZ9Pm1JTLgv0FqYfk,5673
|
|
21
|
+
dataenginex/lakehouse/__init__.py,sha256=lEEorD6i0cjZeGLcfDb89rFGRb3XvSDzbzIdrjgDf4Q,678
|
|
22
|
+
dataenginex/lakehouse/catalog.py,sha256=UF9iAunQx9WVdtCzifoojs4RN9eQsgLSOaS_AmeU31g,5130
|
|
23
|
+
dataenginex/lakehouse/partitioning.py,sha256=6pQcbALqB5gqyoT_HHtcTGq2EYh2FR1FVy5wiZ9z51Y,3444
|
|
24
|
+
dataenginex/lakehouse/storage.py,sha256=wnzylQjeH8tZf0yb3DF0d4Bg2xVYzbVXrkQVDOd2VNI,6231
|
|
25
|
+
dataenginex/middleware/__init__.py,sha256=ylCRjVgIzV8tusa6Ip8gTkbEpeyEdO7A_9Ud22uCzc4,621
|
|
26
|
+
dataenginex/middleware/logging_config.py,sha256=QUnBxFtt3ROkuNR7OWxo_50HnSeV8Tn9dYUb36F2uX4,4578
|
|
27
|
+
dataenginex/middleware/metrics.py,sha256=UKKZlK-vwOL3EA2QEkw5h65bUsQAD0spX_js8k76zUE,1028
|
|
28
|
+
dataenginex/middleware/metrics_middleware.py,sha256=YnUaNUoMd48Q8q8yPylDfjDrTZPmeJqKLMimh7lp6mI,1879
|
|
29
|
+
dataenginex/middleware/request_logging.py,sha256=kO7tSX-oAeBfVoCKwD4jV-TPYLqqsEJQZcOXu5Io9L4,2358
|
|
30
|
+
dataenginex/middleware/tracing.py,sha256=bz4AttXSy75Rxf9SKkpomzKWPHnrD3-KKuWf_4SUE30,2575
|
|
31
|
+
dataenginex/ml/__init__.py,sha256=_kGIalASJYSKosTOEySB7d17R1HryFmDOEGZM6AZ1o4,913
|
|
32
|
+
dataenginex/ml/drift.py,sha256=2sY4xHZ4Mi9fOXiP-KmFvN9uBkxwWSpejTyg6ngMadY,5469
|
|
33
|
+
dataenginex/ml/registry.py,sha256=dp1TN-saDnzPvDrhFIzvppphb2an97NPV7tu7yumC-I,5684
|
|
34
|
+
dataenginex/ml/serving.py,sha256=WML6mJNaPuEy_tUr6ouiDK9EKX9CyP8BYcD1kS9CO7o,4636
|
|
35
|
+
dataenginex/ml/training.py,sha256=-P72dZYD59IcY054rX4r0Z044yxpbfbIIdpZiET4TdI,6471
|
|
36
|
+
dataenginex/warehouse/__init__.py,sha256=j3UTpKSVCA-JtFe_R_jlNCMhcsJLcT5wOWpCxm5Nghg,587
|
|
37
|
+
dataenginex/warehouse/lineage.py,sha256=5oHlPnn7ogVaKpi7BKhBxXE7tLIqzE5xdW-nnTcUSnA,5570
|
|
38
|
+
dataenginex/warehouse/transforms.py,sha256=5_rLWvsCOD-ZySEwyiRq0kkgj7KHMOK6EwPYYYiWPQc,7262
|
|
39
|
+
dataenginex-0.3.4.dist-info/METADATA,sha256=EcWqEM6-XgsLFcJuAfcnBFNvrjXHjcnajs0b02kWx_M,2033
|
|
40
|
+
dataenginex-0.3.4.dist-info/WHEEL,sha256=kJCRJT_g0adfAJzTx2GUMmS80rTJIVHRCfG0DQgLq3o,88
|
|
41
|
+
dataenginex-0.3.4.dist-info/RECORD,,
|