dqtlib 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dqt/__init__.py +49 -0
- dqt/adapters/__init__.py +0 -0
- dqt/adapters/_protocol.py +47 -0
- dqt/adapters/local/__init__.py +3 -0
- dqt/adapters/local/adapter.py +118 -0
- dqt/adapters/postgres/__init__.py +4 -0
- dqt/adapters/postgres/adapter.py +154 -0
- dqt/adapters/postgres/config.py +17 -0
- dqt/agent/__init__.py +0 -0
- dqt/algorithms/__init__.py +0 -0
- dqt/algorithms/_base.py +70 -0
- dqt/algorithms/_registry.py +28 -0
- dqt/algorithms/_scales.py +62 -0
- dqt/algorithms/basic/__init__.py +35 -0
- dqt/algorithms/basic/_helpers.py +18 -0
- dqt/algorithms/basic/column_pairs.py +80 -0
- dqt/algorithms/basic/completeness.py +40 -0
- dqt/algorithms/basic/date_part.py +52 -0
- dqt/algorithms/basic/freshness.py +53 -0
- dqt/algorithms/basic/monotonicity.py +42 -0
- dqt/algorithms/basic/null_fraction.py +37 -0
- dqt/algorithms/basic/numeric.py +42 -0
- dqt/algorithms/basic/numeric_bounds.py +155 -0
- dqt/algorithms/basic/sql_assertion.py +40 -0
- dqt/algorithms/basic/string_case.py +51 -0
- dqt/algorithms/basic/uniqueness.py +40 -0
- dqt/algorithms/basic/validity.py +46 -0
- dqt/algorithms/basic/value_checks.py +174 -0
- dqt/algorithms/basic/volume.py +35 -0
- dqt/algorithms/distribution/__init__.py +7 -0
- dqt/algorithms/distribution/profiler.py +127 -0
- dqt/algorithms/drift/__init__.py +3 -0
- dqt/algorithms/drift/ks2sample.py +40 -0
- dqt/algorithms/outliers_multi/__init__.py +3 -0
- dqt/algorithms/outliers_multi/isolation_forest.py +41 -0
- dqt/algorithms/outliers_uni/__init__.py +14 -0
- dqt/algorithms/outliers_uni/adjusted_boxplot.py +59 -0
- dqt/algorithms/outliers_uni/auto_outlier.py +110 -0
- dqt/algorithms/outliers_uni/mad.py +84 -0
- dqt/algorithms/outliers_uni/outlier_fraction_range.py +116 -0
- dqt/algorithms/outliers_uni/zscore.py +35 -0
- dqt/algorithms/referential/__init__.py +2 -0
- dqt/algorithms/referential/referential.py +52 -0
- dqt/algorithms/schema/__init__.py +2 -0
- dqt/algorithms/schema/schema_checks.py +56 -0
- dqt/algorithms/timeseries/__init__.py +3 -0
- dqt/algorithms/timeseries/stl.py +51 -0
- dqt/causality/__init__.py +0 -0
- dqt/checks/__init__.py +0 -0
- dqt/checks/loader.py +70 -0
- dqt/checks/models.py +42 -0
- dqt/checks/schema/__init__.py +0 -0
- dqt/checks/schema/check.schema.json +74 -0
- dqt/compat/__init__.py +0 -0
- dqt/governance/__init__.py +0 -0
- dqt/hitl/__init__.py +0 -0
- dqt/lineage/__init__.py +4 -0
- dqt/lineage/models.py +42 -0
- dqt/lineage/vault.py +273 -0
- dqt/profiling/__init__.py +12 -0
- dqt/profiling/models.py +79 -0
- dqt/profiling/profiler.py +236 -0
- dqt/reporting/__init__.py +5 -0
- dqt/reporting/_charts.py +173 -0
- dqt/reporting/html_report.py +480 -0
- dqt/runner/__init__.py +0 -0
- dqt/runner/runner.py +121 -0
- dqt/semantic/__init__.py +9 -0
- dqt/semantic/loader.py +10 -0
- dqt/semantic/models.py +35 -0
- dqt/store/__init__.py +0 -0
- dqt/store/_protocol.py +43 -0
- dqt/store/memory.py +27 -0
- dqt/utils/__init__.py +0 -0
- dqt/utils/logging.py +26 -0
- dqtlib-0.1.0.dist-info/METADATA +64 -0
- dqtlib-0.1.0.dist-info/RECORD +78 -0
- dqtlib-0.1.0.dist-info/WHEEL +4 -0
dqt/__init__.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# packages/dqt/src/dqt/__init__.py
"""dqt — open-source data quality, observability, and causality library."""
from __future__ import annotations

__version__ = "0.1.0"

# Core detector result/verdict types re-exported at package root.
from dqt.algorithms._base import (
    BaseAggregateDetector,
    BaseDetector,
    DetectorResult,
    Verdict,
    compute_verdict,
)
from dqt.adapters._protocol import AggExpr, ColumnMeta, HealthCheckResult, WarehouseAdapter
from dqt.store._protocol import Incident, ResultsStore, RunResult
from dqt.store.memory import MemoryStore
from dqt.checks.models import BaselineConfig, Check, CheckFilter, CheckScope
from dqt.runner.runner import Runner

# Import all detector groups to trigger @registry.register side effects.
# NOTE: these imports exist purely for their side effects — removing one
# silently unregisters that whole detector group.
import dqt.algorithms.basic  # noqa: F401
import dqt.algorithms.schema  # noqa: F401
import dqt.algorithms.referential  # noqa: F401
import dqt.algorithms.drift  # noqa: F401
import dqt.algorithms.outliers_uni  # noqa: F401
import dqt.algorithms.outliers_multi  # noqa: F401
import dqt.algorithms.timeseries  # noqa: F401

# Public API surface of the top-level package.
__all__ = [
    "__version__",
    "Verdict",
    "DetectorResult",
    "BaseDetector",
    "BaseAggregateDetector",
    "compute_verdict",
    "AggExpr",
    "ColumnMeta",
    "HealthCheckResult",
    "WarehouseAdapter",
    "ResultsStore",
    "RunResult",
    "Incident",
    "MemoryStore",
    "Check",
    "CheckScope",
    "CheckFilter",
    "BaselineConfig",
    "Runner",
]
|
dqt/adapters/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Literal, Protocol, runtime_checkable

import pandas as pd


@dataclass
class AggExpr:
    """A single SQL aggregate expression (`sql`) aliased as `name` in results."""

    name: str
    sql: str


@dataclass
class HealthCheckStep:
    """Outcome of one named step of an adapter health check."""

    name: str
    status: Literal["pass", "fail", "skip"]
    latency_ms: float
    detail: str


@dataclass
class HealthCheckResult:
    """Ordered collection of health-check steps for one adapter."""

    steps: list[HealthCheckStep] = field(default_factory=list)

    @property
    def passed(self) -> bool:
        """True when no step failed (skipped steps do not count as failures)."""
        return all(s.status in ("pass", "skip") for s in self.steps)


@dataclass
class ColumnMeta:
    """Column metadata as reported by an adapter (1-based ordinal position)."""

    name: str
    data_type: str
    nullable: bool
    position: int


@runtime_checkable
class WarehouseAdapter(Protocol):
    """Structural interface every warehouse/file adapter must satisfy."""

    def health_check(self) -> HealthCheckResult: ...
    def sample(self, schema: str, table: str, n: int = 100_000) -> pd.DataFrame: ...
    def aggregate(self, schema: str, table: str, exprs: list[AggExpr]) -> dict[str, object]: ...
    def describe_columns(self, schema: str, table: str) -> list[ColumnMeta]: ...
    def list_schemas(self) -> list[str]: ...
    def list_tables(self, schema: str) -> list[str]: ...
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# Ref: https://duckdb.org/docs/api/python/overview — used for SQL aggregations on DataFrames
from __future__ import annotations

import pathlib
import time
from typing import Any

import pandas as pd

from dqt.adapters._protocol import (
    AggExpr,
    ColumnMeta,
    HealthCheckResult,
    HealthCheckStep,
)
from dqt.utils.logging import get_logger

_log = get_logger(__name__)

# Map of supported file suffixes to pandas reader callables.
# Each callable takes a path and returns a DataFrame.
_READERS: dict[str, Any] = {
    ".csv": lambda p: pd.read_csv(p),
    ".tsv": lambda p: pd.read_csv(p, sep="\t"),
    ".xlsx": lambda p: pd.read_excel(p),
    ".xls": lambda p: pd.read_excel(p),
    ".parquet": lambda p: pd.read_parquet(p),
    ".json": lambda p: pd.read_json(p),
    ".jsonl": lambda p: pd.read_json(p, lines=True),
    ".ndjson": lambda p: pd.read_json(p, lines=True),
    ".feather": lambda p: pd.read_feather(p),
    ".arrow": lambda p: pd.read_feather(p),
}

# Step names emitted as "skip" when the file does not exist (file_exists failed).
_HEALTH_STEPS = ("readable", "parseable", "columns", "sample_read", "row_count")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class LocalFileAdapter:
    """Reads a local file and exposes it as a single-table WarehouseAdapter.

    The only schema is ``"default"`` and the only table is the file stem.

    Raises:
        ValueError: at construction, if the file suffix is not in ``_READERS``.
    """

    def __init__(self, path: str | pathlib.Path) -> None:
        self._path = pathlib.Path(path)
        self._suffix = self._path.suffix.lower()
        if self._suffix not in _READERS:
            supported = ", ".join(sorted(_READERS))
            raise ValueError(f"Unsupported format '{self._suffix}'. Supported: {supported}")
        self._table_name = self._path.stem

    def _read(self) -> pd.DataFrame:
        """Load the entire file into a DataFrame via the suffix-matched reader."""
        return _READERS[self._suffix](self._path)

    def health_check(self) -> HealthCheckResult:
        """Run the step pipeline; once a step fails, the remaining steps are 'skip'."""
        steps: list[HealthCheckStep] = []

        t0 = time.perf_counter()
        if not self._path.exists():
            steps.append(HealthCheckStep("file_exists", "fail", 0.0, f"not found: {self._path}"))
            for name in _HEALTH_STEPS:
                steps.append(HealthCheckStep(name, "skip", 0.0, "skipped"))
            return HealthCheckResult(steps=steps)
        steps.append(HealthCheckStep("file_exists", "pass", (time.perf_counter() - t0) * 1000, str(self._path)))

        t0 = time.perf_counter()
        try:
            # FIX: read only the first KiB to prove readability. The previous
            # `read_bytes()[:1024]` loaded the whole file into memory first.
            with self._path.open("rb") as fh:
                fh.read(1024)
            steps.append(HealthCheckStep("readable", "pass", (time.perf_counter() - t0) * 1000, "ok"))
        except Exception as exc:
            steps.append(HealthCheckStep("readable", "fail", 0.0, str(exc)))
            for name in ("parseable", "columns", "sample_read", "row_count"):
                steps.append(HealthCheckStep(name, "skip", 0.0, "skipped"))
            return HealthCheckResult(steps=steps)

        t0 = time.perf_counter()
        try:
            df = self._read()
        except Exception as exc:
            steps.append(HealthCheckStep("parseable", "fail", 0.0, str(exc)))
            for name in ("columns", "sample_read", "row_count"):
                steps.append(HealthCheckStep(name, "skip", 0.0, "skipped"))
            return HealthCheckResult(steps=steps)
        steps.append(HealthCheckStep("parseable", "pass", (time.perf_counter() - t0) * 1000, f"{len(df.columns)} columns"))

        steps.append(HealthCheckStep("columns", "pass", 0.0, str(list(df.columns)[:5])))
        steps.append(HealthCheckStep("sample_read", "pass", 0.0, "ok"))
        steps.append(HealthCheckStep("row_count", "pass", 0.0, f"{len(df)} rows"))
        return HealthCheckResult(steps=steps)

    def list_schemas(self) -> list[str]:
        """Single pseudo-schema."""
        return ["default"]

    def list_tables(self, schema: str) -> list[str]:
        """Single pseudo-table named after the file stem."""
        return [self._table_name]

    def describe_columns(self, schema: str, table: str) -> list[ColumnMeta]:
        """Infer column metadata from the data; nullable means 'any NaN observed'."""
        df = self._read()
        return [
            ColumnMeta(
                name=col,
                data_type=str(df[col].dtype),
                nullable=bool(df[col].isna().any()),
                position=i + 1,
            )
            for i, col in enumerate(df.columns)
        ]

    def sample(self, schema: str, table: str, n: int = 100_000) -> pd.DataFrame:
        """Return up to *n* rows; sampling is deterministic (random_state=42)."""
        df = self._read()
        if len(df) <= n:
            return df.reset_index(drop=True)
        return df.sample(n=n, random_state=42).reset_index(drop=True)

    def aggregate(self, schema: str, table: str, exprs: list[AggExpr]) -> dict[str, Any]:
        """Evaluate SQL aggregate expressions over the file via in-memory DuckDB."""
        import duckdb

        df = self._read()
        con = duckdb.connect()
        try:
            # FIX: close the connection even if registration/execution raises.
            con.register("_data", df)
            cols = ", ".join(f"{e.sql} AS {e.name}" for e in exprs)
            row = con.execute(f"SELECT {cols} FROM _data").fetchone()  # noqa: S608
        finally:
            con.close()
        return dict(zip([e.name for e in exprs], row))
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# PostgresAdapter wraps SQLAlchemy for all warehouse operations.
# Sampling uses LIMIT for portable random rows; TABLESAMPLE BERNOULLI available as an option.
from __future__ import annotations

import datetime
import time
from typing import Any

import pandas as pd
import sqlalchemy as sa

from dqt.adapters._protocol import (
    AggExpr,
    ColumnMeta,
    HealthCheckResult,
    HealthCheckStep,
)
from dqt.utils.logging import get_logger

# Module-level logger following the package convention.
_log = get_logger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class PostgresAdapter:
    """WarehouseAdapter backed by a PostgreSQL database via SQLAlchemy."""

    def __init__(self, conn_str: str) -> None:
        self._conn_str = conn_str
        # pool_pre_ping transparently replaces stale pooled connections.
        self._engine = sa.create_engine(
            conn_str,
            pool_pre_ping=True,
            execution_options={"isolation_level": "READ COMMITTED"},
        )

    @staticmethod
    def _quote_ident(name: str) -> str:
        """Return *name* as a safely double-quoted SQL identifier.

        Embedded double quotes are doubled per the SQL standard, so a schema or
        table name containing '"' cannot break out of the identifier and inject SQL.
        """
        return '"' + name.replace('"', '""') + '"'

    def health_check(self) -> HealthCheckResult:
        """Run connectivity steps in order; if TCP fails, the rest are skipped."""
        steps: list[HealthCheckStep] = []
        steps.append(self._step_tcp())
        if steps[-1].status == "fail":
            for name in ("auth", "info_schema", "sample_select", "latency_probe", "clock_skew"):
                steps.append(HealthCheckStep(name=name, status="skip", latency_ms=0.0, detail="skipped"))
            return HealthCheckResult(steps=steps)
        steps.append(self._step_auth())
        steps.append(self._step_info_schema())
        steps.append(self._step_sample_select())
        steps.append(self._step_latency())
        steps.append(self._step_clock_skew())
        return HealthCheckResult(steps=steps)

    def _step_tcp(self) -> HealthCheckStep:
        """Verify the server is reachable with a trivial SELECT 1."""
        t0 = time.perf_counter()
        try:
            with self._engine.connect() as conn:
                conn.execute(sa.text("SELECT 1"))
            return HealthCheckStep("tcp_reach", "pass", (time.perf_counter() - t0) * 1000, "ok")
        except Exception as exc:
            return HealthCheckStep("tcp_reach", "fail", 0.0, str(exc))

    def _step_auth(self) -> HealthCheckStep:
        """Verify credentials by resolving current_user."""
        t0 = time.perf_counter()
        try:
            with self._engine.connect() as conn:
                user = conn.execute(sa.text("SELECT current_user")).scalar()
            return HealthCheckStep("auth", "pass", (time.perf_counter() - t0) * 1000, f"user={user}")
        except Exception as exc:
            return HealthCheckStep("auth", "fail", 0.0, str(exc))

    def _step_info_schema(self) -> HealthCheckStep:
        """Verify information_schema is readable (needed by list/describe methods)."""
        t0 = time.perf_counter()
        try:
            with self._engine.connect() as conn:
                conn.execute(sa.text(
                    "SELECT COUNT(*) FROM information_schema.tables "
                    "WHERE table_schema NOT IN ('pg_catalog','information_schema')"
                )).scalar()
            return HealthCheckStep("info_schema", "pass", (time.perf_counter() - t0) * 1000, "readable")
        except Exception as exc:
            return HealthCheckStep("info_schema", "fail", 0.0, str(exc))

    def _step_sample_select(self) -> HealthCheckStep:
        """Verify a trivial catalog SELECT succeeds."""
        t0 = time.perf_counter()
        try:
            with self._engine.connect() as conn:
                conn.execute(sa.text(
                    "SELECT table_name FROM information_schema.tables "
                    "WHERE table_schema NOT IN ('pg_catalog','information_schema') LIMIT 1"
                )).fetchone()
            return HealthCheckStep("sample_select", "pass", (time.perf_counter() - t0) * 1000, "ok")
        except Exception as exc:
            return HealthCheckStep("sample_select", "fail", 0.0, str(exc))

    def _step_latency(self) -> HealthCheckStep:
        """Measure round-trip latency of a minimal query."""
        t0 = time.perf_counter()
        try:
            with self._engine.connect() as conn:
                conn.execute(sa.text("SELECT 1"))
            latency = (time.perf_counter() - t0) * 1000
            return HealthCheckStep("latency_probe", "pass", latency, f"{latency:.1f}ms")
        except Exception as exc:
            return HealthCheckStep("latency_probe", "fail", 0.0, str(exc))

    def _step_clock_skew(self) -> HealthCheckStep:
        """Compare DB NOW() to local UTC time; fail when skew >= 60 seconds."""
        t0 = time.perf_counter()
        try:
            with self._engine.connect() as conn:
                db_now = conn.execute(sa.text("SELECT NOW()")).scalar()
            local_now = datetime.datetime.now(datetime.timezone.utc)
            if db_now.tzinfo is None:
                # Naive timestamps are assumed UTC — TODO confirm against server tz config.
                db_now = db_now.replace(tzinfo=datetime.timezone.utc)
            skew_s = abs((db_now - local_now).total_seconds())
            status = "pass" if skew_s < 60 else "fail"
            return HealthCheckStep("clock_skew", status, (time.perf_counter() - t0) * 1000, f"skew={skew_s:.1f}s")
        except Exception as exc:
            return HealthCheckStep("clock_skew", "fail", 0.0, str(exc))

    def list_schemas(self) -> list[str]:
        """Return user schemas, excluding the system catalogs."""
        with self._engine.connect() as conn:
            rows = conn.execute(sa.text(
                "SELECT DISTINCT table_schema FROM information_schema.tables "
                "WHERE table_schema NOT IN ('pg_catalog','information_schema') ORDER BY 1"
            )).fetchall()
        return [r[0] for r in rows]

    def list_tables(self, schema: str) -> list[str]:
        """Return table names in *schema*, alphabetically."""
        with self._engine.connect() as conn:
            rows = conn.execute(sa.text(
                "SELECT table_name FROM information_schema.tables "
                "WHERE table_schema = :schema ORDER BY 1"
            ), {"schema": schema}).fetchall()
        return [r[0] for r in rows]

    def describe_columns(self, schema: str, table: str) -> list[ColumnMeta]:
        """Return column metadata from information_schema.columns, in ordinal order."""
        with self._engine.connect() as conn:
            rows = conn.execute(sa.text(
                "SELECT column_name, data_type, is_nullable, ordinal_position "
                "FROM information_schema.columns "
                "WHERE table_schema = :schema AND table_name = :table "
                "ORDER BY ordinal_position"
            ), {"schema": schema, "table": table}).fetchall()
        return [
            ColumnMeta(name=r[0], data_type=r[1], nullable=(r[2] == "YES"), position=r[3])
            for r in rows
        ]

    def sample(self, schema: str, table: str, n: int = 100_000) -> pd.DataFrame:
        """Return up to *n* uniformly random rows.

        ORDER BY random() avoids TABLESAMPLE bias on small tables. Identifiers are
        escaped via _quote_ident (FIX: a raw f-string quote let embedded '"' break out).
        """
        query = sa.text(
            f"SELECT * FROM {self._quote_ident(schema)}.{self._quote_ident(table)} "
            "ORDER BY random() LIMIT :n"
        )
        with self._engine.connect() as conn:
            return pd.read_sql(query, conn, params={"n": n})

    def aggregate(self, schema: str, table: str, exprs: list[AggExpr]) -> dict[str, Any]:
        """Evaluate the given aggregate expressions in one pass over the table.

        AggExpr.sql fragments are caller-controlled SQL by design; schema/table are
        escaped identifiers (see _quote_ident).
        """
        cols = ", ".join(f"{e.sql} AS {e.name}" for e in exprs)
        query = sa.text(
            f"SELECT {cols} FROM {self._quote_ident(schema)}.{self._quote_ident(table)}"
        )
        with self._engine.connect() as conn:
            row = conn.execute(query).fetchone()
        return dict(zip([e.name for e in exprs], row))
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
@dataclass
class PostgresConfig:
    """Connection settings for PostgresAdapter, assembled into a SQLAlchemy URL."""

    host: str = "localhost"
    port: int = 5432
    database: str = "postgres"
    username: str = "postgres"
    password: str = ""
    ssl_mode: str = "prefer"

    def to_conn_str(self) -> str:
        """Return a ``postgresql+psycopg2`` connection URL.

        FIX: username and password are percent-encoded, so credentials containing
        URL-reserved characters ('@', ':', '/', '#', '%') no longer corrupt the URL.
        Plain alphanumeric credentials produce the exact same string as before.
        """
        from urllib.parse import quote

        user = quote(self.username, safe="")
        pwd = quote(self.password, safe="")
        return (
            f"postgresql+psycopg2://{user}:{pwd}"
            f"@{self.host}:{self.port}/{self.database}?sslmode={self.ssl_mode}"
        )
|
dqt/agent/__init__.py
ADDED
|
File without changes
|
|
File without changes
|
dqt/algorithms/_base.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# Base classes for all detectors. StatScale and STAT_SCALES live in _scales.py (no dqt imports there).
# compute_verdict defers the _scales import to break any potential circular dependency.
from __future__ import annotations

from dataclasses import dataclass, field
from enum import Enum
from typing import TYPE_CHECKING, Any, ClassVar

import pandas as pd

if TYPE_CHECKING:
    from dqt.adapters._protocol import AggExpr


class Verdict(str, Enum):
    """Three-level outcome of a detector run; str-valued for easy serialization."""

    pass_ = "pass"  # trailing underscore: `pass` is a Python keyword
    warn = "warn"
    fail = "fail"


@dataclass
class DetectorResult:
    """Score plus verdict and a human-readable summary produced by a detector."""

    score: float
    verdict: Verdict
    plain_english: str
    details: dict[str, Any] = field(default_factory=dict)


# Opaque per-detector fitted state passed from fit() to score().
DetectorState = Any
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def compute_verdict(score: float, slug: str) -> Verdict:
    """Map a raw detector score onto pass/warn/fail using the slug's StatScale.

    Raises:
        KeyError: when no StatScale is registered for *slug*.
    """
    from dqt.algorithms._scales import STAT_SCALES  # deferred to avoid circular deps

    scale = STAT_SCALES.get(slug)
    if scale is None:
        raise KeyError(f"No STAT_SCALE entry for slug '{slug}'. Add it to _scales.py.")

    # Pick the comparison once based on direction, then apply it to both thresholds.
    if scale.direction == "lower_is_better":
        breaches = lambda threshold: score >= threshold  # noqa: E731
    else:
        breaches = lambda threshold: score <= threshold  # noqa: E731

    if breaches(scale.fail_threshold):
        return Verdict.fail
    if breaches(scale.warn_threshold):
        return Verdict.warn
    return Verdict.pass_
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class BaseDetector:
    """Abstract base for sample-based detectors; subclasses set slug/group and
    implement fit()/score()."""

    slug: ClassVar[str]   # registry key; must match a STAT_SCALES entry
    group: ClassVar[str]  # detector family, e.g. "basic", "drift"
    kind: ClassVar[str] = "sample"  # operates on sampled DataFrames by default

    def fit(self, reference: pd.DataFrame) -> DetectorState:
        """Learn baseline state from a reference sample; returned state feeds score()."""
        raise NotImplementedError

    def score(self, current: pd.DataFrame, state: DetectorState) -> DetectorResult:
        """Score the current sample against the fitted state."""
        raise NotImplementedError

    def _verdict(self, score: float) -> Verdict:
        """Convenience: threshold this detector's score via its registered scale."""
        return compute_verdict(score, self.slug)


class BaseAggregateDetector(BaseDetector):
    """Detector variant that works on warehouse-side aggregates instead of samples."""

    kind: ClassVar[str] = "aggregate"

    def get_aggregations(self, col: str) -> list[AggExpr]:
        """Return the SQL aggregate expressions the adapter should compute for *col*."""
        raise NotImplementedError
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Detector registry for slug-based lookup.
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from dqt.algorithms._base import BaseDetector
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Registry:
    """Process-wide mapping from detector slug to detector class.

    Detector modules call ``@registry.register`` at import time, which is why
    the package ``__init__`` imports every detector group for its side effects.
    """

    def __init__(self) -> None:
        # Insertion-ordered: slugs() reflects registration order.
        self._map: dict[str, type[BaseDetector]] = {}

    def register(self, cls: type[BaseDetector]) -> type[BaseDetector]:
        """Class decorator: record *cls* under its ``slug`` and return it unchanged.

        A later registration with the same slug silently replaces the earlier one.
        """
        self._map[cls.slug] = cls
        return cls

    def get(self, slug: str) -> type[BaseDetector]:
        """Return the detector class registered under *slug*.

        Raises:
            KeyError: if no detector with that slug has been imported/registered.
        """
        try:
            return self._map[slug]
        except KeyError:
            # FIX: `from None` suppresses the redundant chained inner KeyError
            # so callers see only the explanatory message.
            raise KeyError(
                f"Detector slug '{slug}' not registered. Import the detector module first."
            ) from None

    def slugs(self) -> list[str]:
        """Return all registered slugs in registration order."""
        return list(self._map.keys())


# Singleton used by all detector modules.
registry = Registry()
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Single source of truth for stat scale definitions.
# Frontend reads the TS version generated by `make stats-scales`.
# ZERO imports from dqt — this file must import nothing from this package.
from typing import Literal, NamedTuple


class StatScale(NamedTuple):
    """Thresholding metadata for one detector score.

    For "lower_is_better" a score at/above warn_threshold warns and at/above
    fail_threshold fails; for "higher_is_better" the comparisons are reversed
    (see compute_verdict in _base.py). `max` is the display ceiling for charts.
    """

    slug: str
    max: float
    warn_threshold: float
    fail_threshold: float
    direction: Literal["lower_is_better", "higher_is_better"]
    plain_english_label: str
    hint: str


# Keyed by slug; every BaseDetector.slug must have an entry here.
STAT_SCALES: dict[str, StatScale] = {
    s.slug: s for s in [
        StatScale("completeness_rate", 1.0, 0.95, 0.90, "higher_is_better", "Completeness", "Fraction of non-null values"),
        StatScale("uniqueness_rate", 1.0, 0.95, 0.80, "higher_is_better", "Uniqueness", "Fraction of distinct values"),
        StatScale("validity_rate", 1.0, 0.95, 0.90, "higher_is_better", "Validity", "Fraction of values matching the rule"),
        StatScale("numeric_mean_shift", 10.0, 2.0, 3.0, "lower_is_better", "Mean shift (σ)", "Z-score of mean deviation from baseline"),
        StatScale("volume_change_ratio", 1.0, 0.10, 0.25, "lower_is_better", "Row-count change", "Fractional deviation from baseline row count"),
        # Binary 0/1 scores: warn and fail thresholds coincide at 0.5.
        StatScale("schema_change", 1.0, 0.5, 0.5, "lower_is_better", "Schema change", "1.0 if schema changed, 0.0 if unchanged"),
        StatScale("max_in_range", 1.0, 0.5, 0.5, "lower_is_better", "Max in bounds", "1.0 when MAX(col) outside [min, max]; 0.0 otherwise"),
        StatScale("min_in_range", 1.0, 0.5, 0.5, "lower_is_better", "Min in bounds", "1.0 when MIN(col) outside [min, max]; 0.0 otherwise"),
        StatScale("median_in_range", 1.0, 0.5, 0.5, "lower_is_better", "Median in bounds", "1.0 when median outside [min, max]"),
        StatScale("stddev_in_range", 1.0, 0.5, 0.5, "lower_is_better", "Stddev in bounds", "1.0 when STDDEV outside [min, max]"),
        StatScale("sum_in_range", 1.0, 0.5, 0.5, "lower_is_better", "Sum in bounds", "1.0 when SUM outside [min, max]"),
        StatScale("cardinality_in_range", 1.0, 0.5, 0.5, "lower_is_better", "Cardinality in bounds", "1.0 when COUNT(DISTINCT col) outside [min, max]"),
        StatScale("quantile_in_range", 1.0, 0.5, 0.5, "lower_is_better", "Quantile in bounds", "1.0 when specified quantile outside [min, max]"),
        StatScale("value_in_range_violation", 0.10, 0.001, 0.01, "lower_is_better", "Values in range", "Fraction of values outside [min, max]"),
        StatScale("set_membership_violation", 0.10, 0.001, 0.01, "lower_is_better", "Set membership", "Fraction of values not in the allowed set"),
        StatScale("set_exclusion_violation", 0.10, 0.001, 0.01, "lower_is_better", "Set exclusion", "Fraction of values in the forbidden set"),
        StatScale("regex_match_violation", 0.10, 0.001, 0.01, "lower_is_better", "Regex format", "Fraction of values not matching any regex pattern"),
        StatScale("string_length_violation", 0.10, 0.001, 0.01, "lower_is_better", "String length", "Fraction of values with length outside [min_len, max_len]"),
        StatScale("date_format_violation", 0.10, 0.001, 0.01, "lower_is_better", "Date format", "Fraction of values not parseable as the given date format"),
        StatScale("monotonicity_violation", 1.0, 0.5, 0.5, "lower_is_better", "Monotonicity", "1.0 if ordering violated; 0.0 if sequence is monotonic"),
        StatScale("column_pair_violation", 0.10, 0.001, 0.01, "lower_is_better", "Column pair rule", "Fraction of rows where the pair comparison rule is violated"),
        StatScale("composite_uniqueness_violation", 0.10, 0.001, 0.01, "lower_is_better","Composite key uniqueness", "Fraction of rows that are duplicates on the composite key"),
        StatScale("referential_integrity_rate", 1.0, 0.99, 0.95, "higher_is_better", "Referential integrity", "Fraction of FK values present in parent table"),
        StatScale("ks_pvalue", 1.0, 0.95, 0.99, "lower_is_better", "KS drift (1−p)", "1 − p-value from two-sample KS test; warn p<0.05, fail p<0.01"),
        StatScale("mad_outlier_fraction", 0.20, 0.01, 0.05, "lower_is_better", "Outlier fraction (MAD)", "Fraction of values with |modified Z| > 3.5"),
        StatScale("double_mad_outlier_fraction", 0.20, 0.01, 0.05, "lower_is_better", "Outlier fraction (double-MAD)", "Fraction flagged by asymmetric double-MAD; robust on skewed distributions"),
        StatScale("zscore_outlier_fraction", 0.10, 0.01, 0.05, "lower_is_better", "Outlier fraction (Z-score)", "Fraction of values with |Z| > threshold; valid only under normality"),
        StatScale("adjusted_boxplot_fraction", 0.20, 0.01, 0.05, "lower_is_better", "Outlier fraction (adj. boxplot)", "Fraction outside medcouple-adjusted Tukey fences; Hubert & Vandervieren 2008"),
        StatScale("isolation_forest_fraction", 0.20, 0.05, 0.10, "lower_is_better", "Outlier fraction (IF)", "Fraction of rows classified as anomalies by Isolation Forest"),
        StatScale("stl_residual_zscore", 10.0, 3.0, 5.0, "lower_is_better", "STL residual Z-score", "Max absolute Z-score of STL residuals over the current window"),
        # Freshness thresholds are in seconds: warn after 1 hour, fail after 1 day.
        StatScale("freshness_seconds_behind", 86400*7, 3600, 86400, "lower_is_better",
                  "Data freshness", "Seconds since the most recent row timestamp"),
        StatScale("null_fraction", 1.0, 0.01, 0.05, "lower_is_better",
                  "Null fraction", "Fraction of rows where the column is NULL"),
        StatScale("string_case_violation", 1.0, 0.001, 0.01, "lower_is_better",
                  "String case violation", "Fraction of rows with wrong case"),
        StatScale("sql_assertion_violation", 1.0, 0.001, 0.01, "lower_is_better",
                  "SQL assertion violation", "Fraction of rows failing the custom SQL condition"),
        StatScale("date_part_missing_fraction", 1.0, 0.01, 0.05, "lower_is_better",
                  "Date-part completeness", "Fraction of expected date buckets with no data"),
        StatScale("outlier_fraction_drift", 1.0, 0.001, 0.01, "lower_is_better",
                  "Outlier fraction drift", "Deviation of current outlier fraction from historical baseline range"),
    ]
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Basic detector group: importing this module registers every basic detector
(via @registry.register side effects in each submodule) and re-exports them."""
from dqt.algorithms.basic.completeness import CompletenessDetector
from dqt.algorithms.basic.uniqueness import UniquenessDetector
from dqt.algorithms.basic.validity import ValidityDetector
from dqt.algorithms.basic.numeric import NumericMeanDetector
from dqt.algorithms.basic.volume import VolumeDetector
from dqt.algorithms.basic.numeric_bounds import (
    MaxInRangeDetector, MinInRangeDetector, MedianInRangeDetector,
    StdDevInRangeDetector, SumInRangeDetector, CardinalityInRangeDetector,
    QuantileInRangeDetector,
)
from dqt.algorithms.basic.value_checks import (
    ValueInRangeDetector, SetMembershipDetector, SetExclusionDetector,
    RegexMatchDetector, StringLengthRangeDetector, DateFormatDetector,
)
from dqt.algorithms.basic.monotonicity import MonotonicityDetector
from dqt.algorithms.basic.column_pairs import ColumnPairComparisonDetector, CompositeUniquenessDetector
from dqt.algorithms.basic.freshness import FreshnessDetector
from dqt.algorithms.basic.null_fraction import NullFractionDetector
from dqt.algorithms.basic.string_case import StringCaseDetector
from dqt.algorithms.basic.sql_assertion import SqlAssertionDetector
from dqt.algorithms.basic.date_part import DatePartCompletenessDetector

# Explicit public API for `from dqt.algorithms.basic import *`.
__all__ = [
    "CompletenessDetector", "UniquenessDetector", "ValidityDetector",
    "NumericMeanDetector", "VolumeDetector",
    "MaxInRangeDetector", "MinInRangeDetector", "MedianInRangeDetector",
    "StdDevInRangeDetector", "SumInRangeDetector", "CardinalityInRangeDetector",
    "QuantileInRangeDetector",
    "ValueInRangeDetector", "SetMembershipDetector", "SetExclusionDetector",
    "RegexMatchDetector", "StringLengthRangeDetector", "DateFormatDetector",
    "MonotonicityDetector",
    "ColumnPairComparisonDetector", "CompositeUniquenessDetector",
    "FreshnessDetector", "NullFractionDetector", "StringCaseDetector",
    "SqlAssertionDetector", "DatePartCompletenessDetector",
]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from dqt.algorithms._base import DetectorResult
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def fraction_result(df: pd.DataFrame, slug: str, label: str) -> DetectorResult:
    """Build a DetectorResult from the first row's violation_count / total_count.

    The score is the violation fraction (0.0 when the table is empty), and the
    verdict comes from the slug's registered StatScale.
    """
    from dqt.algorithms._base import compute_verdict

    first = df.iloc[0]
    violations = int(first["violation_count"])
    total = int(first["total_count"])
    if total > 0:
        fraction = violations / total
    else:
        fraction = 0.0
    return DetectorResult(
        score=fraction,
        verdict=compute_verdict(fraction, slug),
        plain_english=f"{fraction:.2%} of values violate {label}",
        details={"violation_fraction": fraction, "violation_count": violations, "total": total},
    )
|