dataenginex 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. dataenginex/README.md +35 -0
  2. dataenginex/RELEASE_NOTES.md +38 -0
  3. dataenginex/__init__.py +16 -0
  4. dataenginex/api/__init__.py +11 -0
  5. dataenginex/api/auth.py +173 -0
  6. dataenginex/api/errors.py +70 -0
  7. dataenginex/api/health.py +133 -0
  8. dataenginex/api/pagination.py +94 -0
  9. dataenginex/api/rate_limit.py +122 -0
  10. dataenginex/api/routers/__init__.py +1 -0
  11. dataenginex/api/routers/v1.py +113 -0
  12. dataenginex/core/__init__.py +36 -0
  13. dataenginex/core/medallion_architecture.py +414 -0
  14. dataenginex/core/pipeline_config.py +111 -0
  15. dataenginex/core/schemas.py +304 -0
  16. dataenginex/core/validators.py +394 -0
  17. dataenginex/data/__init__.py +22 -0
  18. dataenginex/data/connectors.py +332 -0
  19. dataenginex/data/profiler.py +217 -0
  20. dataenginex/data/registry.py +148 -0
  21. dataenginex/lakehouse/__init__.py +22 -0
  22. dataenginex/lakehouse/catalog.py +145 -0
  23. dataenginex/lakehouse/partitioning.py +99 -0
  24. dataenginex/lakehouse/storage.py +177 -0
  25. dataenginex/middleware/__init__.py +19 -0
  26. dataenginex/middleware/logging_config.py +137 -0
  27. dataenginex/middleware/metrics.py +45 -0
  28. dataenginex/middleware/metrics_middleware.py +61 -0
  29. dataenginex/middleware/request_logging.py +77 -0
  30. dataenginex/middleware/tracing.py +87 -0
  31. dataenginex/ml/__init__.py +28 -0
  32. dataenginex/ml/drift.py +165 -0
  33. dataenginex/ml/registry.py +156 -0
  34. dataenginex/ml/serving.py +141 -0
  35. dataenginex/ml/training.py +205 -0
  36. dataenginex/warehouse/__init__.py +19 -0
  37. dataenginex/warehouse/lineage.py +164 -0
  38. dataenginex/warehouse/transforms.py +206 -0
  39. dataenginex-0.3.4.dist-info/METADATA +66 -0
  40. dataenginex-0.3.4.dist-info/RECORD +41 -0
  41. dataenginex-0.3.4.dist-info/WHEEL +4 -0
@@ -0,0 +1,206 @@
1
+ """
2
+ Transform framework — declarative, composable data transformations.
3
+
4
+ Each ``Transform`` is a named, reusable function that maps a record dict to a
5
+ new record dict. ``TransformPipeline`` chains multiple transforms and collects
6
+ per-step metrics for observability.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import contextlib
12
+ import time
13
+ from abc import ABC, abstractmethod
14
+ from dataclasses import dataclass, field
15
+ from datetime import UTC, datetime
16
+ from typing import Any
17
+
18
+ from loguru import logger
19
+
20
+
21
+ @dataclass
22
+ class TransformResult:
23
+ """Outcome of running a pipeline over a batch of records."""
24
+
25
+ input_count: int
26
+ output_count: int
27
+ dropped_count: int = 0
28
+ error_count: int = 0
29
+ duration_ms: float = 0.0
30
+ records: list[dict[str, Any]] = field(default_factory=list)
31
+ step_metrics: list[dict[str, Any]] = field(default_factory=list)
32
+ completed_at: datetime = field(default_factory=lambda: datetime.now(tz=UTC))
33
+
34
+ @property
35
+ def success_rate(self) -> float:
36
+ return self.output_count / self.input_count if self.input_count else 0.0
37
+
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # Single transform
41
+ # ---------------------------------------------------------------------------
42
+
43
class Transform(ABC):
    """Abstract base for a single transform step.

    Implementations override :meth:`apply`, which maps one record dict to a
    transformed dict; returning ``None`` tells the pipeline to drop the
    record.
    """

    def __init__(self, name: str, description: str = "") -> None:
        # Identifiers surfaced in step metrics and log messages.
        self.name = name
        self.description = description

    @abstractmethod
    def apply(self, record: dict[str, Any]) -> dict[str, Any] | None:
        """Return the transformed record, or ``None`` to drop *record*."""
        ...
61
+
62
+
63
+ # ---------------------------------------------------------------------------
64
+ # Built-in transforms
65
+ # ---------------------------------------------------------------------------
66
+
67
class RenameFieldsTransform(Transform):
    """Rename record keys according to an old-name -> new-name mapping."""

    def __init__(self, mapping: dict[str, str]) -> None:
        super().__init__(name="rename_fields", description="Rename record fields")
        self.mapping = mapping

    def apply(self, record: dict[str, Any]) -> dict[str, Any]:
        # Work on a copy; pop-then-assign moves each renamed key and lets a
        # renamed value overwrite an existing key of the target name.
        renamed = dict(record)
        for src, dst in self.mapping.items():
            if src not in renamed:
                continue
            renamed[dst] = renamed.pop(src)
        return renamed
80
+
81
+
82
class DropNullsTransform(Transform):
    """Drop any record whose required fields are ``None`` (or missing)."""

    def __init__(self, required_fields: list[str]) -> None:
        super().__init__(name="drop_nulls", description="Drop records with null required fields")
        self.required_fields = required_fields

    def apply(self, record: dict[str, Any]) -> dict[str, Any] | None:
        # A missing key counts as null, since dict.get defaults to None.
        keep = all(record.get(name) is not None for name in self.required_fields)
        return record if keep else None
94
+
95
+
96
class CastTypesTransform(Transform):
    """Cast fields to target types (``int``, ``float``, ``str``, ``bool``).

    Values that cannot be cast are left unchanged rather than raising, so a
    single malformed field never errors out the whole record.
    """

    _CASTERS: dict[str, type] = {"int": int, "float": float, "str": str, "bool": bool}

    # String spellings recognized by the "bool" caster (case-insensitive).
    _TRUE_STRINGS = frozenset({"true", "1", "yes", "y", "t", "on"})
    _FALSE_STRINGS = frozenset({"false", "0", "no", "n", "f", "off", ""})

    def __init__(self, type_map: dict[str, str]) -> None:
        """*type_map* maps field name -> target type name (a ``_CASTERS`` key)."""
        super().__init__(name="cast_types", description="Cast field types")
        self.type_map = type_map

    @classmethod
    def _cast_bool(cls, value: Any) -> bool:
        """Cast *value* to bool, parsing common string spellings.

        Bug fix: plain ``bool("false")`` is ``True`` because every non-empty
        string is truthy, so string data such as ``"false"``/``"no"``/``"0"``
        was silently cast to ``True``. Unrecognized strings raise
        ``ValueError``, which ``apply`` suppresses — the value is then kept
        unchanged instead of being mangled.
        """
        if isinstance(value, str):
            lowered = value.strip().lower()
            if lowered in cls._TRUE_STRINGS:
                return True
            if lowered in cls._FALSE_STRINGS:
                return False
            raise ValueError(f"cannot interpret {value!r} as bool")
        return bool(value)

    def apply(self, record: dict[str, Any]) -> dict[str, Any]:
        """Return a copy of *record* with mapped fields cast; nulls/missing keys skipped."""
        out = dict(record)
        for field_name, target in self.type_map.items():
            if field_name not in out or out[field_name] is None:
                continue
            caster = self._cast_bool if target == "bool" else self._CASTERS.get(target)
            if caster:
                # Uncastable values (e.g. int("abc")) are kept as-is.
                with contextlib.suppress(ValueError, TypeError):
                    out[field_name] = caster(out[field_name])
        return out
114
+
115
+
116
class AddTimestampTransform(Transform):
    """Stamp every record with the UTC time at which it was processed."""

    def __init__(self, field_name: str = "processed_at") -> None:
        super().__init__(name="add_timestamp", description="Add processing timestamp")
        self.field_name = field_name

    def apply(self, record: dict[str, Any]) -> dict[str, Any]:
        # Copy first so the caller's dict is never mutated.
        stamped = dict(record)
        stamped[self.field_name] = datetime.now(tz=UTC).isoformat()
        return stamped
127
+
128
+
129
class FilterTransform(Transform):
    """Keep only records matching a predicate.

    ``predicate`` is called with the record dict and should return a truthy
    value to keep it.
    """

    def __init__(self, name: str, predicate: Any) -> None:
        super().__init__(name=name, description="Filter records by predicate")
        self._predicate = predicate

    def apply(self, record: dict[str, Any]) -> dict[str, Any] | None:
        # None signals the pipeline to drop the record.
        return record if self._predicate(record) else None
143
+
144
+
145
+ # ---------------------------------------------------------------------------
146
+ # Pipeline
147
+ # ---------------------------------------------------------------------------
148
+
149
class TransformPipeline:
    """Execute an ordered chain of ``Transform`` steps over a batch.

    Records flow through the steps in insertion order; each step may
    transform, pass through, or drop every record. Per-step counts, error
    tallies, and timings are collected into ``TransformResult.step_metrics``.

    Example::

        pipeline = TransformPipeline("bronze_to_silver")
        pipeline.add(DropNullsTransform(["job_id", "company_name"]))
        pipeline.add(CastTypesTransform({"salary_min": "float"}))
        result = pipeline.run(records)
    """

    def __init__(self, name: str) -> None:
        self.name = name
        self._steps: list[Transform] = []

    def add(self, transform: Transform) -> TransformPipeline:
        """Append *transform* to the chain; returns *self* for fluent chaining."""
        self._steps.append(transform)
        return self

    def run(self, records: list[dict[str, Any]]) -> TransformResult:
        """Run every step over *records* and return a ``TransformResult``.

        A record that raises inside a step is logged, counted in
        ``error_count``, and removed from the batch. ``dropped_count`` is the
        total attrition (deliberate drops plus errored records), matching the
        previous behavior.
        """
        start = time.perf_counter()
        current = list(records)
        step_metrics: list[dict[str, Any]] = []
        total_errors = 0

        for step in self._steps:
            step_start = time.perf_counter()
            before_count = len(current)
            step_errors = 0
            output: list[dict[str, Any]] = []

            for rec in current:
                try:
                    result = step.apply(rec)
                    if result is not None:
                        output.append(result)
                except Exception as exc:
                    # Bug fix: loguru formats with {} placeholders, not
                    # %-style — the old "%s" message was emitted verbatim
                    # with the arguments never interpolated.
                    logger.warning(
                        "Transform {} failed on record: {}", step.name, exc,
                    )
                    step_errors += 1

            total_errors += step_errors
            step_duration = (time.perf_counter() - step_start) * 1000
            step_metrics.append({
                "step": step.name,
                "input_count": before_count,
                "output_count": len(output),
                "dropped": before_count - len(output),
                "errors": step_errors,  # new additive metric key
                "duration_ms": round(step_duration, 2),
            })
            current = output

        total_duration = (time.perf_counter() - start) * 1000
        return TransformResult(
            input_count=len(records),
            output_count=len(current),
            dropped_count=len(records) - len(current),
            # Bug fix: error_count existed on TransformResult but was never
            # populated, so exceptions were indistinguishable from drops.
            error_count=total_errors,
            duration_ms=round(total_duration, 2),
            records=current,
            step_metrics=step_metrics,
        )
@@ -0,0 +1,66 @@
1
+ Metadata-Version: 2.4
2
+ Name: dataenginex
3
+ Version: 0.3.4
4
+ Summary: DataEngineX - Core framework for data engineering projects
5
+ License: MIT
6
+ Author: Jay
7
+ Author-email: jayapal.myaka99@gmail.com
8
+ Requires-Python: >=3.11
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: Programming Language :: Python :: 3.14
15
+ Requires-Dist: email-validator (>=2.0.0)
16
+ Requires-Dist: fastapi (>=0.128.4)
17
+ Requires-Dist: httpx (>=0.28.0)
18
+ Requires-Dist: loguru (>=0.7.3)
19
+ Requires-Dist: opentelemetry-api (>=1.39.0)
20
+ Requires-Dist: opentelemetry-exporter-otlp (>=1.39.0)
21
+ Requires-Dist: opentelemetry-instrumentation-fastapi (>=0.60b1)
22
+ Requires-Dist: opentelemetry-sdk (>=1.39.0)
23
+ Requires-Dist: prometheus-client (>=0.24.0)
24
+ Requires-Dist: python-dotenv (>=1.2.0)
25
+ Requires-Dist: python-json-logger (>=4.0.0)
26
+ Requires-Dist: pyyaml (>=6.0.2)
27
+ Requires-Dist: structlog (>=25.5.0)
28
+ Requires-Dist: uvicorn (>=0.40.0)
29
+ Description-Content-Type: text/markdown
30
+
31
+ # dataenginex
32
+
33
+ `dataenginex` is the core DataEngineX framework package for building observable, production-ready data and API services.
34
+
35
+ It provides:
36
+ - FastAPI application primitives and API extensions
37
+ - Middleware for structured logging, metrics, and tracing
38
+ - Data quality and validation utilities
39
+ - Lakehouse and warehouse building blocks
40
+ - Reusable ML support modules for model-serving workflows
41
+
42
+ ## Install
43
+
44
+ ```bash
45
+ pip install dataenginex
46
+ ```
47
+
48
+ ## Package Scope
49
+
50
+ This package is the core library from the DEX monorepo.
51
+ `careerdex` and `weatherdex` are maintained in the same repository but are not part of this package release flow.
52
+
53
+ ## Quick Usage
54
+
55
+ ```python
56
+ from dataenginex import __version__
57
+
58
+ print(__version__)
59
+ ```
60
+
61
+ ## Source and Docs
62
+
63
+ - Repository: https://github.com/data-literate/DEX
64
+ - CI/CD guide: `docs/CI_CD.md`
65
+ - Release notes: `packages/dataenginex/src/dataenginex/RELEASE_NOTES.md`
66
+
@@ -0,0 +1,41 @@
1
+ dataenginex/README.md,sha256=CA8sEQ6QCFmmx5vE6lkRvGtf5N6zGG6aRi4vz7WpFsA,901
2
+ dataenginex/RELEASE_NOTES.md,sha256=-jqu-JCcbRfa8Yry0qZ6yEl2nZ7P5PBzbYGpHj8GiNY,1812
3
+ dataenginex/__init__.py,sha256=Y1UcBcb91_9R1emZaCoObxDovJkCo3e2umfjfMe6zZs,469
4
+ dataenginex/api/__init__.py,sha256=z9YbGR8qY9qmJ4Gnx2mknT7UsncsUlG5aK5gbAYGM84,336
5
+ dataenginex/api/auth.py,sha256=0MrtA6imcwAVynrXWUvEhQhaTuTocR8hxVc-FaSVBxw,5759
6
+ dataenginex/api/errors.py,sha256=hDO4LsAiQqjyglccSj53FDSkLjo9lhjR2Twr9TDzvME,1843
7
+ dataenginex/api/health.py,sha256=OTbY0kmy14KYoX463pFY0HYxhqtHfhZpwReY9GpIT20,4337
8
+ dataenginex/api/pagination.py,sha256=dzHSXDXsjYiL8HzOXQR5wNTRYVqCzbCrb-ZE17EpWEQ,2693
9
+ dataenginex/api/rate_limit.py,sha256=OajTPGiLCptoRnlH1-rX2cqTJjJJxe0QGd5R3wKumlI,4017
10
+ dataenginex/api/routers/__init__.py,sha256=6pzSoxY7glXyKAfCChT-ui-N64qJkdEUlyR1ibN1Kyw,28
11
+ dataenginex/api/routers/v1.py,sha256=HimY-vbmI8JoLSkQZxVqs5KGWsx_WLdUEp3xNGj6E9g,3831
12
+ dataenginex/core/__init__.py,sha256=VMw1YlZoHSXarP5FyftrvgF0-7pJ-SUQvb3ZonlroZM,860
13
+ dataenginex/core/medallion_architecture.py,sha256=Ai2wDxbPYAU4I8h61zk8skw7RkFLbZr5iLp7ZpXPXiI,14562
14
+ dataenginex/core/pipeline_config.py,sha256=DM7v0-b4DBmXd8Bn4RqsQdWJlLsIxjytEBeasnYCmxQ,3860
15
+ dataenginex/core/schemas.py,sha256=hyIvim0rRzAk0OceqXbAhMp0i8QM_AUq4zFzVy07Q7g,9599
16
+ dataenginex/core/validators.py,sha256=btaOKhrAan_RWY7cD9Kdhr99EFz_khIqy7LKy-Et8dg,12872
17
+ dataenginex/data/__init__.py,sha256=EyZ1_MRGQqbgE0Ss1EtJOa7n2Ga9n8LmPpb6d92XNaY,637
18
+ dataenginex/data/connectors.py,sha256=5uQ2TAUGsoLwywnrhnNL-PFqpBZPcgZKlSninH9LI1M,10586
19
+ dataenginex/data/profiler.py,sha256=PnYDavjHfSoBGInyd2WN_uFkASzTDTlbUHZIDrPABEc,7175
20
+ dataenginex/data/registry.py,sha256=Ljlra7PV2-lq6vy7OXSs1vHkZ5rZ9Pm1JTLgv0FqYfk,5673
21
+ dataenginex/lakehouse/__init__.py,sha256=lEEorD6i0cjZeGLcfDb89rFGRb3XvSDzbzIdrjgDf4Q,678
22
+ dataenginex/lakehouse/catalog.py,sha256=UF9iAunQx9WVdtCzifoojs4RN9eQsgLSOaS_AmeU31g,5130
23
+ dataenginex/lakehouse/partitioning.py,sha256=6pQcbALqB5gqyoT_HHtcTGq2EYh2FR1FVy5wiZ9z51Y,3444
24
+ dataenginex/lakehouse/storage.py,sha256=wnzylQjeH8tZf0yb3DF0d4Bg2xVYzbVXrkQVDOd2VNI,6231
25
+ dataenginex/middleware/__init__.py,sha256=ylCRjVgIzV8tusa6Ip8gTkbEpeyEdO7A_9Ud22uCzc4,621
26
+ dataenginex/middleware/logging_config.py,sha256=QUnBxFtt3ROkuNR7OWxo_50HnSeV8Tn9dYUb36F2uX4,4578
27
+ dataenginex/middleware/metrics.py,sha256=UKKZlK-vwOL3EA2QEkw5h65bUsQAD0spX_js8k76zUE,1028
28
+ dataenginex/middleware/metrics_middleware.py,sha256=YnUaNUoMd48Q8q8yPylDfjDrTZPmeJqKLMimh7lp6mI,1879
29
+ dataenginex/middleware/request_logging.py,sha256=kO7tSX-oAeBfVoCKwD4jV-TPYLqqsEJQZcOXu5Io9L4,2358
30
+ dataenginex/middleware/tracing.py,sha256=bz4AttXSy75Rxf9SKkpomzKWPHnrD3-KKuWf_4SUE30,2575
31
+ dataenginex/ml/__init__.py,sha256=_kGIalASJYSKosTOEySB7d17R1HryFmDOEGZM6AZ1o4,913
32
+ dataenginex/ml/drift.py,sha256=2sY4xHZ4Mi9fOXiP-KmFvN9uBkxwWSpejTyg6ngMadY,5469
33
+ dataenginex/ml/registry.py,sha256=dp1TN-saDnzPvDrhFIzvppphb2an97NPV7tu7yumC-I,5684
34
+ dataenginex/ml/serving.py,sha256=WML6mJNaPuEy_tUr6ouiDK9EKX9CyP8BYcD1kS9CO7o,4636
35
+ dataenginex/ml/training.py,sha256=-P72dZYD59IcY054rX4r0Z044yxpbfbIIdpZiET4TdI,6471
36
+ dataenginex/warehouse/__init__.py,sha256=j3UTpKSVCA-JtFe_R_jlNCMhcsJLcT5wOWpCxm5Nghg,587
37
+ dataenginex/warehouse/lineage.py,sha256=5oHlPnn7ogVaKpi7BKhBxXE7tLIqzE5xdW-nnTcUSnA,5570
38
+ dataenginex/warehouse/transforms.py,sha256=5_rLWvsCOD-ZySEwyiRq0kkgj7KHMOK6EwPYYYiWPQc,7262
39
+ dataenginex-0.3.4.dist-info/METADATA,sha256=EcWqEM6-XgsLFcJuAfcnBFNvrjXHjcnajs0b02kWx_M,2033
40
+ dataenginex-0.3.4.dist-info/WHEEL,sha256=kJCRJT_g0adfAJzTx2GUMmS80rTJIVHRCfG0DQgLq3o,88
41
+ dataenginex-0.3.4.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 2.3.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any