aptdata-0.0.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aptdata/__init__.py +3 -0
- aptdata/cli/__init__.py +5 -0
- aptdata/cli/app.py +247 -0
- aptdata/cli/commands/__init__.py +9 -0
- aptdata/cli/commands/config_cmd.py +128 -0
- aptdata/cli/commands/mesh_cmd.py +435 -0
- aptdata/cli/commands/plugin_cmd.py +107 -0
- aptdata/cli/commands/system_cmd.py +90 -0
- aptdata/cli/commands/telemetry_cmd.py +57 -0
- aptdata/cli/completions.py +56 -0
- aptdata/cli/interactive.py +269 -0
- aptdata/cli/rendering/__init__.py +31 -0
- aptdata/cli/rendering/console.py +119 -0
- aptdata/cli/rendering/logger.py +26 -0
- aptdata/cli/rendering/panels.py +87 -0
- aptdata/cli/rendering/tables.py +81 -0
- aptdata/cli/scaffold.py +1089 -0
- aptdata/config/__init__.py +13 -0
- aptdata/config/parser.py +136 -0
- aptdata/config/schema.py +27 -0
- aptdata/config/secrets.py +60 -0
- aptdata/core/__init__.py +46 -0
- aptdata/core/context.py +31 -0
- aptdata/core/dataset.py +39 -0
- aptdata/core/lineage.py +213 -0
- aptdata/core/state.py +27 -0
- aptdata/core/system.py +317 -0
- aptdata/core/workflow.py +372 -0
- aptdata/mcp/__init__.py +5 -0
- aptdata/mcp/server.py +198 -0
- aptdata/plugins/__init__.py +77 -0
- aptdata/plugins/ai/__init__.py +6 -0
- aptdata/plugins/ai/chunking.py +66 -0
- aptdata/plugins/ai/embeddings.py +56 -0
- aptdata/plugins/base.py +57 -0
- aptdata/plugins/dataset.py +62 -0
- aptdata/plugins/governance/__init__.py +32 -0
- aptdata/plugins/governance/catalog.py +115 -0
- aptdata/plugins/governance/classification.py +44 -0
- aptdata/plugins/governance/lineage_store.py +49 -0
- aptdata/plugins/governance/rules.py +180 -0
- aptdata/plugins/local_fs.py +241 -0
- aptdata/plugins/manager.py +142 -0
- aptdata/plugins/postgres.py +113 -0
- aptdata/plugins/quality/__init__.py +39 -0
- aptdata/plugins/quality/contract.py +128 -0
- aptdata/plugins/quality/expectations.py +310 -0
- aptdata/plugins/quality/report.py +94 -0
- aptdata/plugins/quality/validator.py +139 -0
- aptdata/plugins/rest.py +135 -0
- aptdata/plugins/transform/__init__.py +14 -0
- aptdata/plugins/transform/pandas.py +129 -0
- aptdata/plugins/transform/spark.py +134 -0
- aptdata/plugins/vector/__init__.py +6 -0
- aptdata/plugins/vector/base.py +19 -0
- aptdata/plugins/vector/qdrant.py +41 -0
- aptdata/telemetry/__init__.py +5 -0
- aptdata/telemetry/instrumentation.py +164 -0
- aptdata/tui/__init__.py +5 -0
- aptdata/tui/monitor.py +279 -0
- aptdata-0.0.2.dist-info/METADATA +330 -0
- aptdata-0.0.2.dist-info/RECORD +65 -0
- aptdata-0.0.2.dist-info/WHEEL +4 -0
- aptdata-0.0.2.dist-info/entry_points.txt +3 -0
- aptdata-0.0.2.dist-info/licenses/LICENSE +21 -0
aptdata/plugins/postgres.py
@@ -0,0 +1,113 @@
+"""PostgreSQL reader / writer plugin.
+
+Provides :class:`PostgresReader` and :class:`PostgresWriter` for
+interacting with PostgreSQL databases via **SQLAlchemy**.
+
+Both ``sqlalchemy`` and a PostgreSQL driver (e.g. ``psycopg2-binary``)
+are required. A friendly
+:class:`~aptdata.plugins.manager.PluginDependencyError` is raised
+when either is missing.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from aptdata.core.dataset import BaseDataset
+from aptdata.plugins.base import BaseReader, BaseWriter
+from aptdata.plugins.dataset import InMemoryDataset
+from aptdata.plugins.manager import PluginDependencyError
+
+
+def _require_sqlalchemy() -> Any:
+    """Import and return the ``sqlalchemy`` module, or raise a friendly error."""
+    try:
+        import sqlalchemy  # noqa: WPS433
+    except ImportError:
+        raise PluginDependencyError("postgres", "sqlalchemy") from None
+    return sqlalchemy
+
+
+class PostgresReader(BaseReader):
+    """Execute a SQL query against a PostgreSQL database and return the result
+    as an :class:`~aptdata.plugins.dataset.InMemoryDataset`.
+
+    Parameters
+    ----------
+    connection_url:
+        SQLAlchemy connection URL, e.g.
+        ``"postgresql+psycopg2://user:pass@host:5432/db"``.
+    query:
+        Raw SQL ``SELECT`` query to execute.
+    """
+
+    def __init__(self, connection_url: str, query: str) -> None:
+        self.connection_url = connection_url
+        self.query = query
+
+    def read(self, **kwargs: Any) -> InMemoryDataset:
+        sa = _require_sqlalchemy()
+        engine = sa.create_engine(self.connection_url)
+        with engine.connect() as conn:
+            result = conn.execute(sa.text(self.query))
+            columns = list(result.keys())
+            records = [dict(zip(columns, row)) for row in result.fetchall()]
+
+        ds = InMemoryDataset(uri=self.connection_url)
+        ds.write(records)
+        return ds
+
+
+class PostgresWriter(BaseWriter):
+    """Write an :class:`~aptdata.plugins.dataset.InMemoryDataset` to a
+    PostgreSQL table.
+
+    Parameters
+    ----------
+    connection_url:
+        SQLAlchemy connection URL.
+    table:
+        Target table name.
+    if_exists:
+        Behaviour when the table already exists: ``"append"`` (default),
+        ``"replace"``, or ``"fail"``.
+    """
+
+    def __init__(
+        self,
+        connection_url: str,
+        table: str,
+        *,
+        if_exists: str = "append",
+    ) -> None:
+        self.connection_url = connection_url
+        self.table = table
+        self.if_exists = if_exists
+
+    def write(self, dataset: BaseDataset, **kwargs: Any) -> None:
+        sa = _require_sqlalchemy()
+        records: list[dict[str, Any]] = dataset.read()
+        if not records:
+            return
+
+        engine = sa.create_engine(self.connection_url)
+        meta = sa.MetaData()
+
+        with engine.connect() as conn:
+            if self.if_exists == "replace":
+                # Use SQLAlchemy DDL to avoid raw SQL interpolation
+                tbl = sa.Table(self.table, sa.MetaData())
+                tbl.drop(engine, checkfirst=True)
+
+            # Auto-create a simple text-column table when it doesn't exist
+            if not sa.inspect(engine).has_table(self.table):
+                columns = [sa.Column(k, sa.Text) for k in records[0]]
+                sa.Table(self.table, meta, *columns)
+                meta.create_all(engine)
+
+            table_obj = sa.Table(self.table, sa.MetaData(), autoload_with=engine)
+            conn.execute(table_obj.insert(), records)
+            conn.commit()
+
+
+__all__ = ["PostgresReader", "PostgresWriter"]
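
A minimal usage sketch of the two plugins above, assuming a reachable PostgreSQL instance with the optional ``sqlalchemy`` and ``psycopg2-binary`` dependencies installed; the connection URL, query, and table names are placeholders:

```python
from aptdata.plugins.postgres import PostgresReader, PostgresWriter

URL = "postgresql+psycopg2://user:pass@localhost:5432/appdb"  # placeholder URL

# Run a SELECT and materialise the rows as an InMemoryDataset.
reader = PostgresReader(URL, "SELECT id, email FROM users")
dataset = reader.read()

# Append the records to another table; PostgresWriter auto-creates it
# with all-Text columns when it does not already exist.
writer = PostgresWriter(URL, "users_copy", if_exists="append")
writer.write(dataset)
```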

aptdata/plugins/quality/__init__.py
@@ -0,0 +1,39 @@
+"""Data quality plugin package.
+
+Re-exports the main public API for data contracts, quality expectations,
+validation, and reporting.
+"""
+
+from __future__ import annotations
+
+from aptdata.plugins.quality.contract import (
+    ColumnClassification,
+    ColumnContract,
+    EnforcementMode,
+    SchemaContract,
+)
+from aptdata.plugins.quality.expectations import (
+    BaseExpectation,
+    ExpectColumnToNotBeNull,
+    ExpectColumnValuesInRange,
+    ExpectColumnValuesToBeUnique,
+    ExpectColumnValuesToMatchRegex,
+)
+from aptdata.plugins.quality.report import CheckResult, CheckStatus, QualityReport
+from aptdata.plugins.quality.validator import QualityValidator
+
+__all__ = [
+    "ColumnClassification",
+    "ColumnContract",
+    "EnforcementMode",
+    "SchemaContract",
+    "BaseExpectation",
+    "ExpectColumnToNotBeNull",
+    "ExpectColumnValuesInRange",
+    "ExpectColumnValuesToBeUnique",
+    "ExpectColumnValuesToMatchRegex",
+    "CheckResult",
+    "CheckStatus",
+    "QualityReport",
+    "QualityValidator",
+]
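
Because everything is re-exported at the package level, consumers can import the quality API in one statement rather than reaching into the individual submodules, e.g.:

```python
from aptdata.plugins.quality import (
    ExpectColumnToNotBeNull,
    QualityValidator,
    SchemaContract,
)
```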

aptdata/plugins/quality/contract.py
@@ -0,0 +1,128 @@
+"""Schema contracts and column classification definitions.
+
+Provides :class:`SchemaContract` for declaring the expected schema of a
+dataset, including column types, nullability, classification, and PII
+annotations.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any
+
+
+class ColumnClassification(str, Enum):
+    """Data sensitivity classification for a column or dataset."""
+
+    PUBLIC = "PUBLIC"
+    INTERNAL = "INTERNAL"
+    CONFIDENTIAL = "CONFIDENTIAL"
+    PII = "PII"
+    PHI = "PHI"
+    FINANCIAL = "FINANCIAL"
+    SENSITIVE = "SENSITIVE"
+
+
+class EnforcementMode(str, Enum):
+    """How the quality framework reacts when a contract is violated.
+
+    ABORT
+        Raise an exception immediately.
+    WARN
+        Log a warning but continue processing.
+    TAG
+        Annotate the data with quality metadata and continue.
+    """
+
+    ABORT = "ABORT"
+    WARN = "WARN"
+    TAG = "TAG"
+
+
+@dataclass
+class ColumnContract:
+    """Contract specification for a single column.
+
+    Parameters
+    ----------
+    name:
+        Column name as it appears in the dataset.
+    dtype:
+        Expected data type (e.g. ``"int64"``, ``"str"``).
+    nullable:
+        Whether ``null`` / ``None`` values are allowed.
+    classification:
+        Sensitivity classification (see :class:`ColumnClassification`).
+    description:
+        Human-readable description.
+    pii:
+        Shorthand flag indicating the column contains personally
+        identifiable information.
+    retention_days:
+        Number of days the column's data must be retained.
+    metadata:
+        Arbitrary extra metadata.
+    """
+
+    name: str
+    dtype: str = ""
+    nullable: bool = True
+    classification: ColumnClassification = ColumnClassification.INTERNAL
+    description: str = ""
+    pii: bool = False
+    retention_days: int = 0
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class SchemaContract:
+    """Contract specification for an entire dataset schema.
+
+    Parameters
+    ----------
+    name:
+        Contract identifier.
+    version:
+        Semantic version string (e.g. ``"1.0.0"``).
+    owner:
+        Team or person responsible for this contract.
+    description:
+        Human-readable description.
+    columns:
+        Ordered list of :class:`ColumnContract` definitions.
+    enforcement:
+        How violations are handled (see :class:`EnforcementMode`).
+    metadata:
+        Arbitrary extra metadata.
+    """
+
+    name: str
+    version: str = "1.0.0"
+    owner: str = ""
+    description: str = ""
+    columns: list[ColumnContract] = field(default_factory=list)
+    enforcement: EnforcementMode = EnforcementMode.ABORT
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+    def get_pii_columns(self) -> list[ColumnContract]:
+        """Return columns that are flagged as PII."""
+        return [
+            c
+            for c in self.columns
+            if c.pii or c.classification == ColumnClassification.PII
+        ]
+
+    def get_columns_by_classification(
+        self, classification: ColumnClassification
+    ) -> list[ColumnContract]:
+        """Return columns whose classification matches *classification*."""
+        return [c for c in self.columns if c.classification == classification]
+
+
+__all__ = [
+    "ColumnClassification",
+    "EnforcementMode",
+    "ColumnContract",
+    "SchemaContract",
+]
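
A sketch of how these dataclasses compose; the contract, column names, and values are illustrative:

```python
from aptdata.plugins.quality.contract import (
    ColumnClassification,
    ColumnContract,
    EnforcementMode,
    SchemaContract,
)

contract = SchemaContract(
    name="users",
    version="1.0.0",
    owner="data-platform",
    enforcement=EnforcementMode.WARN,  # log violations instead of raising
    columns=[
        ColumnContract(name="id", dtype="int64", nullable=False),
        ColumnContract(name="email", dtype="str", pii=True),
        ColumnContract(
            name="ssn",
            dtype="str",
            classification=ColumnClassification.PII,
            retention_days=365,
        ),
    ],
)

# Both the `pii` shorthand flag and the PII classification are picked up.
assert [c.name for c in contract.get_pii_columns()] == ["email", "ssn"]
# Classification lookup matches only the explicitly classified column.
assert contract.get_columns_by_classification(ColumnClassification.PII)[0].name == "ssn"
```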

aptdata/plugins/quality/expectations.py
@@ -0,0 +1,310 @@
+"""Data quality expectations — engine-agnostic validators.
+
+Each :class:`BaseExpectation` automatically dispatches to either
+:meth:`validate_pandas` or :meth:`validate_spark` based on the type of the
+DataFrame passed to :meth:`validate`.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any
+
+from aptdata.plugins.quality.report import CheckResult, CheckStatus
+
+
+def _is_spark_df(df: Any) -> bool:
+    """Return ``True`` when *df* looks like a PySpark DataFrame."""
+    return "pyspark" in type(df).__module__
+
+
+class BaseExpectation(ABC):
+    """Abstract base for all data quality expectations.
+
+    Subclasses implement :meth:`validate_pandas` and :meth:`validate_spark`.
+    The concrete :meth:`validate` method dispatches automatically.
+    """
+
+    @abstractmethod
+    def validate_pandas(self, df: Any) -> CheckResult:
+        """Validate a ``pd.DataFrame`` and return a :class:`CheckResult`."""
+
+    @abstractmethod
+    def validate_spark(self, df: Any) -> CheckResult:
+        """Validate a PySpark ``DataFrame`` and return a :class:`CheckResult`."""
+
+    def validate(self, df: Any) -> CheckResult:
+        """Validate *df* dispatching to the appropriate engine implementation."""
+        if _is_spark_df(df):
+            return self.validate_spark(df)
+        return self.validate_pandas(df)
+
+
+@dataclass
+class ExpectColumnToNotBeNull(BaseExpectation):
+    """Expect that a column contains no null values.
+
+    Parameters
+    ----------
+    column:
+        Name of the column to check.
+    """
+
+    column: str
+
+    def validate_pandas(self, df: Any) -> CheckResult:
+        """Check for null values using pandas."""
+        if self.column not in df.columns:
+            return CheckResult(
+                expectation_name="ExpectColumnToNotBeNull",
+                column=self.column,
+                status=CheckStatus.FAILED,
+                message=f"Column '{self.column}' not found in DataFrame.",
+                rows_evaluated=len(df),
+                rows_failed=len(df),
+            )
+        null_count = int(df[self.column].isnull().sum())
+        status = CheckStatus.PASSED if null_count == 0 else CheckStatus.FAILED
+        return CheckResult(
+            expectation_name="ExpectColumnToNotBeNull",
+            column=self.column,
+            status=status,
+            message=(
+                f"Column '{self.column}' has {null_count} null value(s)."
+                if null_count > 0
+                else f"Column '{self.column}' has no null values."
+            ),
+            rows_evaluated=len(df),
+            rows_failed=null_count,
+        )
+
+    def validate_spark(self, df: Any) -> CheckResult:
+        """Check for null values using PySpark."""
+        from pyspark.sql import functions as F  # noqa: N812
+
+        total = df.count()
+        null_count = df.filter(F.col(self.column).isNull()).count()
+        status = CheckStatus.PASSED if null_count == 0 else CheckStatus.FAILED
+        return CheckResult(
+            expectation_name="ExpectColumnToNotBeNull",
+            column=self.column,
+            status=status,
+            message=(
+                f"Column '{self.column}' has {null_count} null value(s)."
+                if null_count > 0
+                else f"Column '{self.column}' has no null values."
+            ),
+            rows_evaluated=total,
+            rows_failed=null_count,
+        )
+
+
+@dataclass
+class ExpectColumnValuesInRange(BaseExpectation):
+    """Expect that all values in a column fall within [min_val, max_val].
+
+    Parameters
+    ----------
+    column:
+        Name of the column to check.
+    min_val:
+        Inclusive lower bound.
+    max_val:
+        Inclusive upper bound.
+    """
+
+    column: str
+    min_val: float
+    max_val: float
+
+    def validate_pandas(self, df: Any) -> CheckResult:
+        """Check numeric range using pandas."""
+        if self.column not in df.columns:
+            return CheckResult(
+                expectation_name="ExpectColumnValuesInRange",
+                column=self.column,
+                status=CheckStatus.FAILED,
+                message=f"Column '{self.column}' not found in DataFrame.",
+                rows_evaluated=len(df),
+                rows_failed=len(df),
+            )
+        series = df[self.column]
+        out_of_range = int(((series < self.min_val) | (series > self.max_val)).sum())
+        status = CheckStatus.PASSED if out_of_range == 0 else CheckStatus.FAILED
+        return CheckResult(
+            expectation_name="ExpectColumnValuesInRange",
+            column=self.column,
+            status=status,
+            message=(
+                f"Column '{self.column}' has {out_of_range} value(s) outside "
+                f"[{self.min_val}, {self.max_val}]."
+                if out_of_range > 0
+                else f"All values in '{self.column}' are within range."
+            ),
+            rows_evaluated=len(df),
+            rows_failed=out_of_range,
+            metadata={"min_val": self.min_val, "max_val": self.max_val},
+        )
+
+    def validate_spark(self, df: Any) -> CheckResult:
+        """Check numeric range using PySpark."""
+        from pyspark.sql import functions as F  # noqa: N812
+
+        total = df.count()
+        out_of_range = df.filter(
+            (F.col(self.column) < self.min_val) | (F.col(self.column) > self.max_val)
+        ).count()
+        status = CheckStatus.PASSED if out_of_range == 0 else CheckStatus.FAILED
+        return CheckResult(
+            expectation_name="ExpectColumnValuesInRange",
+            column=self.column,
+            status=status,
+            message=(
+                f"Column '{self.column}' has {out_of_range} value(s) outside "
+                f"[{self.min_val}, {self.max_val}]."
+                if out_of_range > 0
+                else f"All values in '{self.column}' are within range."
+            ),
+            rows_evaluated=total,
+            rows_failed=out_of_range,
+            metadata={"min_val": self.min_val, "max_val": self.max_val},
+        )
+
+
+@dataclass
+class ExpectColumnValuesToBeUnique(BaseExpectation):
+    """Expect that all values in a column are unique.
+
+    Parameters
+    ----------
+    column:
+        Name of the column to check.
+    """
+
+    column: str
+
+    def validate_pandas(self, df: Any) -> CheckResult:
+        """Check uniqueness using pandas."""
+        if self.column not in df.columns:
+            return CheckResult(
+                expectation_name="ExpectColumnValuesToBeUnique",
+                column=self.column,
+                status=CheckStatus.FAILED,
+                message=f"Column '{self.column}' not found in DataFrame.",
+                rows_evaluated=len(df),
+                rows_failed=len(df),
+            )
+        duplicates = int(df[self.column].duplicated().sum())
+        status = CheckStatus.PASSED if duplicates == 0 else CheckStatus.FAILED
+        return CheckResult(
+            expectation_name="ExpectColumnValuesToBeUnique",
+            column=self.column,
+            status=status,
+            message=(
+                f"Column '{self.column}' has {duplicates} duplicate value(s)."
+                if duplicates > 0
+                else f"All values in '{self.column}' are unique."
+            ),
+            rows_evaluated=len(df),
+            rows_failed=duplicates,
+        )
+
+    def validate_spark(self, df: Any) -> CheckResult:
+        """Check uniqueness using PySpark."""
+        from pyspark.sql import functions as F  # noqa: N812, F401
+
+        total = df.count()
+        unique_count = df.select(self.column).distinct().count()
+        duplicates = total - unique_count
+        status = CheckStatus.PASSED if duplicates == 0 else CheckStatus.FAILED
+        return CheckResult(
+            expectation_name="ExpectColumnValuesToBeUnique",
+            column=self.column,
+            status=status,
+            message=(
+                f"Column '{self.column}' has {duplicates} duplicate value(s)."
+                if duplicates > 0
+                else f"All values in '{self.column}' are unique."
+            ),
+            rows_evaluated=total,
+            rows_failed=duplicates,
+        )
+
+
+@dataclass
+class ExpectColumnValuesToMatchRegex(BaseExpectation):
+    """Expect that all values in a column match a regular expression.
+
+    Parameters
+    ----------
+    column:
+        Name of the column to check.
+    pattern:
+        Regular expression pattern (Python :mod:`re` syntax).
+    """
+
+    column: str
+    pattern: str
+
+    def validate_pandas(self, df: Any) -> CheckResult:
+        """Check regex match using pandas."""
+        if self.column not in df.columns:
+            return CheckResult(
+                expectation_name="ExpectColumnValuesToMatchRegex",
+                column=self.column,
+                status=CheckStatus.FAILED,
+                message=f"Column '{self.column}' not found in DataFrame.",
+                rows_evaluated=len(df),
+                rows_failed=len(df),
+            )
+        series = df[self.column].astype(str)
+        non_matching = int((~series.str.match(self.pattern)).sum())
+        status = CheckStatus.PASSED if non_matching == 0 else CheckStatus.FAILED
+        return CheckResult(
+            expectation_name="ExpectColumnValuesToMatchRegex",
+            column=self.column,
+            status=status,
+            message=(
+                f"Column '{self.column}' has {non_matching} value(s) not matching "
+                f"pattern '{self.pattern}'."
+                if non_matching > 0
+                else f"All values in '{self.column}' match the pattern."
+            ),
+            rows_evaluated=len(df),
+            rows_failed=non_matching,
+            metadata={"pattern": self.pattern},
+        )
+
+    def validate_spark(self, df: Any) -> CheckResult:
+        """Check regex match using PySpark."""
+        from pyspark.sql import functions as F  # noqa: N812
+
+        total = df.count()
+        non_matching = df.filter(
+            ~F.col(self.column).cast("string").rlike(self.pattern)
+        ).count()
+        status = CheckStatus.PASSED if non_matching == 0 else CheckStatus.FAILED
+        return CheckResult(
+            expectation_name="ExpectColumnValuesToMatchRegex",
+            column=self.column,
+            status=status,
+            message=(
+                f"Column '{self.column}' has {non_matching} value(s) not matching "
+                f"pattern '{self.pattern}'."
+                if non_matching > 0
+                else f"All values in '{self.column}' match the pattern."
+            ),
+            rows_evaluated=total,
+            rows_failed=non_matching,
+            metadata={"pattern": self.pattern},
+        )
+
+
+__all__ = [
+    "BaseExpectation",
+    "ExpectColumnToNotBeNull",
+    "ExpectColumnValuesInRange",
+    "ExpectColumnValuesToBeUnique",
+    "ExpectColumnValuesToMatchRegex",
+]
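
Because ``validate`` sniffs the DataFrame's module to dispatch, the same expectation objects work on pandas and PySpark inputs. A minimal pandas sketch (pandas is assumed to be installed; the frame is made up):

```python
import pandas as pd

from aptdata.plugins.quality.expectations import (
    ExpectColumnToNotBeNull,
    ExpectColumnValuesInRange,
)
from aptdata.plugins.quality.report import CheckStatus

df = pd.DataFrame({"age": [25, 31, None]})

null_check = ExpectColumnToNotBeNull(column="age").validate(df)
range_check = ExpectColumnValuesInRange(column="age", min_val=0, max_val=120).validate(df)

assert null_check.status is CheckStatus.FAILED   # one null in 'age'
assert null_check.rows_failed == 1
assert range_check.status is CheckStatus.PASSED  # NaN compares False on both bounds
```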

aptdata/plugins/quality/report.py
@@ -0,0 +1,94 @@
+"""Quality check result and report types."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Any
+
+
+class CheckStatus(str, Enum):
+    """Status of a single quality check."""
+
+    PASSED = "PASSED"
+    FAILED = "FAILED"
+    WARNING = "WARNING"
+
+
+@dataclass
+class CheckResult:
+    """Result of a single data quality expectation.
+
+    Parameters
+    ----------
+    expectation_name:
+        Class name or human-readable label of the expectation.
+    column:
+        Column name the check was applied to (empty for table-level checks).
+    status:
+        Whether the check passed, failed, or issued a warning.
+    message:
+        Human-readable description of the outcome.
+    rows_evaluated:
+        Total number of rows considered by the check.
+    rows_failed:
+        Number of rows that violated the expectation.
+    metadata:
+        Arbitrary extra metadata from the expectation.
+    """
+
+    expectation_name: str
+    column: str = ""
+    status: CheckStatus = CheckStatus.PASSED
+    message: str = ""
+    rows_evaluated: int = 0
+    rows_failed: int = 0
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class QualityReport:
+    """Aggregated quality report for a single dataset validation run.
+
+    Parameters
+    ----------
+    dataset_uri:
+        URI of the dataset that was validated.
+    workflow_name:
+        Name of the workflow that triggered validation.
+    trace_id:
+        OpenTelemetry trace identifier.
+    timestamp:
+        UTC ISO-8601 timestamp of when the report was generated.
+    checks:
+        Individual :class:`CheckResult` objects.
+    """
+
+    dataset_uri: str
+    workflow_name: str = ""
+    trace_id: str = ""
+    timestamp: str = field(
+        default_factory=lambda: datetime.now(timezone.utc).isoformat()
+    )
+    checks: list[CheckResult] = field(default_factory=list)
+
+    @property
+    def passed(self) -> bool:
+        """Return ``True`` when no check has a FAILED status."""
+        return all(c.status != CheckStatus.FAILED for c in self.checks)
+
+    @property
+    def summary(self) -> dict[str, int]:
+        """Return counts of PASSED, FAILED, and WARNING results."""
+        counts: dict[str, int] = {
+            CheckStatus.PASSED: 0,
+            CheckStatus.FAILED: 0,
+            CheckStatus.WARNING: 0,
+        }
+        for check in self.checks:
+            counts[check.status] += 1
+        return counts
+
+
+__all__ = ["CheckStatus", "CheckResult", "QualityReport"]
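
A sketch of how the report aggregation behaves; the dataset URI and check results are illustrative:

```python
from aptdata.plugins.quality.report import CheckResult, CheckStatus, QualityReport

report = QualityReport(
    dataset_uri="memory://users",
    checks=[
        CheckResult(expectation_name="ExpectColumnToNotBeNull", column="id"),
        CheckResult(
            expectation_name="ExpectColumnValuesToBeUnique",
            column="email",
            status=CheckStatus.FAILED,
            rows_evaluated=100,
            rows_failed=3,
        ),
    ],
)

assert not report.passed                      # any FAILED check fails the run
assert report.summary[CheckStatus.FAILED] == 1
assert report.summary[CheckStatus.PASSED] == 1
```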