dqtlib-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. dqt/__init__.py +49 -0
  2. dqt/adapters/__init__.py +0 -0
  3. dqt/adapters/_protocol.py +47 -0
  4. dqt/adapters/local/__init__.py +3 -0
  5. dqt/adapters/local/adapter.py +118 -0
  6. dqt/adapters/postgres/__init__.py +4 -0
  7. dqt/adapters/postgres/adapter.py +154 -0
  8. dqt/adapters/postgres/config.py +17 -0
  9. dqt/agent/__init__.py +0 -0
  10. dqt/algorithms/__init__.py +0 -0
  11. dqt/algorithms/_base.py +70 -0
  12. dqt/algorithms/_registry.py +28 -0
  13. dqt/algorithms/_scales.py +62 -0
  14. dqt/algorithms/basic/__init__.py +35 -0
  15. dqt/algorithms/basic/_helpers.py +18 -0
  16. dqt/algorithms/basic/column_pairs.py +80 -0
  17. dqt/algorithms/basic/completeness.py +40 -0
  18. dqt/algorithms/basic/date_part.py +52 -0
  19. dqt/algorithms/basic/freshness.py +53 -0
  20. dqt/algorithms/basic/monotonicity.py +42 -0
  21. dqt/algorithms/basic/null_fraction.py +37 -0
  22. dqt/algorithms/basic/numeric.py +42 -0
  23. dqt/algorithms/basic/numeric_bounds.py +155 -0
  24. dqt/algorithms/basic/sql_assertion.py +40 -0
  25. dqt/algorithms/basic/string_case.py +51 -0
  26. dqt/algorithms/basic/uniqueness.py +40 -0
  27. dqt/algorithms/basic/validity.py +46 -0
  28. dqt/algorithms/basic/value_checks.py +174 -0
  29. dqt/algorithms/basic/volume.py +35 -0
  30. dqt/algorithms/distribution/__init__.py +7 -0
  31. dqt/algorithms/distribution/profiler.py +127 -0
  32. dqt/algorithms/drift/__init__.py +3 -0
  33. dqt/algorithms/drift/ks2sample.py +40 -0
  34. dqt/algorithms/outliers_multi/__init__.py +3 -0
  35. dqt/algorithms/outliers_multi/isolation_forest.py +41 -0
  36. dqt/algorithms/outliers_uni/__init__.py +14 -0
  37. dqt/algorithms/outliers_uni/adjusted_boxplot.py +59 -0
  38. dqt/algorithms/outliers_uni/auto_outlier.py +110 -0
  39. dqt/algorithms/outliers_uni/mad.py +84 -0
  40. dqt/algorithms/outliers_uni/outlier_fraction_range.py +116 -0
  41. dqt/algorithms/outliers_uni/zscore.py +35 -0
  42. dqt/algorithms/referential/__init__.py +2 -0
  43. dqt/algorithms/referential/referential.py +52 -0
  44. dqt/algorithms/schema/__init__.py +2 -0
  45. dqt/algorithms/schema/schema_checks.py +56 -0
  46. dqt/algorithms/timeseries/__init__.py +3 -0
  47. dqt/algorithms/timeseries/stl.py +51 -0
  48. dqt/causality/__init__.py +0 -0
  49. dqt/checks/__init__.py +0 -0
  50. dqt/checks/loader.py +70 -0
  51. dqt/checks/models.py +42 -0
  52. dqt/checks/schema/__init__.py +0 -0
  53. dqt/checks/schema/check.schema.json +74 -0
  54. dqt/compat/__init__.py +0 -0
  55. dqt/governance/__init__.py +0 -0
  56. dqt/hitl/__init__.py +0 -0
  57. dqt/lineage/__init__.py +4 -0
  58. dqt/lineage/models.py +42 -0
  59. dqt/lineage/vault.py +273 -0
  60. dqt/profiling/__init__.py +12 -0
  61. dqt/profiling/models.py +79 -0
  62. dqt/profiling/profiler.py +236 -0
  63. dqt/reporting/__init__.py +5 -0
  64. dqt/reporting/_charts.py +173 -0
  65. dqt/reporting/html_report.py +480 -0
  66. dqt/runner/__init__.py +0 -0
  67. dqt/runner/runner.py +121 -0
  68. dqt/semantic/__init__.py +9 -0
  69. dqt/semantic/loader.py +10 -0
  70. dqt/semantic/models.py +35 -0
  71. dqt/store/__init__.py +0 -0
  72. dqt/store/_protocol.py +43 -0
  73. dqt/store/memory.py +27 -0
  74. dqt/utils/__init__.py +0 -0
  75. dqt/utils/logging.py +26 -0
  76. dqtlib-0.1.0.dist-info/METADATA +64 -0
  77. dqtlib-0.1.0.dist-info/RECORD +78 -0
  78. dqtlib-0.1.0.dist-info/WHEEL +4 -0
dqt/__init__.py ADDED
@@ -0,0 +1,49 @@
+ # packages/dqt/src/dqt/__init__.py
+ """dqt — open-source data quality, observability, and causality library."""
+ from __future__ import annotations
+
+ __version__ = "0.1.0"
+
+ from dqt.algorithms._base import (
+     BaseAggregateDetector,
+     BaseDetector,
+     DetectorResult,
+     Verdict,
+     compute_verdict,
+ )
+ from dqt.adapters._protocol import AggExpr, ColumnMeta, HealthCheckResult, WarehouseAdapter
+ from dqt.store._protocol import Incident, ResultsStore, RunResult
+ from dqt.store.memory import MemoryStore
+ from dqt.checks.models import BaselineConfig, Check, CheckFilter, CheckScope
+ from dqt.runner.runner import Runner
+
+ # Import all detector groups to trigger @registry.register side effects
+ import dqt.algorithms.basic  # noqa: F401
+ import dqt.algorithms.schema  # noqa: F401
+ import dqt.algorithms.referential  # noqa: F401
+ import dqt.algorithms.drift  # noqa: F401
+ import dqt.algorithms.outliers_uni  # noqa: F401
+ import dqt.algorithms.outliers_multi  # noqa: F401
+ import dqt.algorithms.timeseries  # noqa: F401
+
+ __all__ = [
+     "__version__",
+     "Verdict",
+     "DetectorResult",
+     "BaseDetector",
+     "BaseAggregateDetector",
+     "compute_verdict",
+     "AggExpr",
+     "ColumnMeta",
+     "HealthCheckResult",
+     "WarehouseAdapter",
+     "ResultsStore",
+     "RunResult",
+     "Incident",
+     "MemoryStore",
+     "Check",
+     "CheckScope",
+     "CheckFilter",
+     "BaselineConfig",
+     "Runner",
+ ]
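For orientation, a hypothetical smoke test of the flat API this `__init__.py` exposes (the slug check assumes `CompletenessDetector` registers as `completeness_rate`, matching its `_scales.py` entry):

```python
import dqt
from dqt.algorithms._registry import registry

print(dqt.__version__)                           # "0.1.0"
print("completeness_rate" in registry.slugs())   # True, via the side-effect imports above
```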
dqt/adapters/__init__.py ADDED
File without changes
dqt/adapters/_protocol.py ADDED
@@ -0,0 +1,47 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from typing import Literal, Protocol, runtime_checkable
+
+ import pandas as pd
+
+
+ @dataclass
+ class AggExpr:
+     name: str
+     sql: str
+
+
+ @dataclass
+ class HealthCheckStep:
+     name: str
+     status: Literal["pass", "fail", "skip"]
+     latency_ms: float
+     detail: str
+
+
+ @dataclass
+ class HealthCheckResult:
+     steps: list[HealthCheckStep] = field(default_factory=list)
+
+     @property
+     def passed(self) -> bool:
+         return all(s.status in ("pass", "skip") for s in self.steps)
+
+
+ @dataclass
+ class ColumnMeta:
+     name: str
+     data_type: str
+     nullable: bool
+     position: int
+
+
+ @runtime_checkable
+ class WarehouseAdapter(Protocol):
+     def health_check(self) -> HealthCheckResult: ...
+     def sample(self, schema: str, table: str, n: int = 100_000) -> pd.DataFrame: ...
+     def aggregate(self, schema: str, table: str, exprs: list[AggExpr]) -> dict[str, object]: ...
+     def describe_columns(self, schema: str, table: str) -> list[ColumnMeta]: ...
+     def list_schemas(self) -> list[str]: ...
+     def list_tables(self, schema: str) -> list[str]: ...
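Since `WarehouseAdapter` is a `runtime_checkable` `Protocol`, adapters satisfy it structurally rather than by inheritance. A minimal sketch (the file name is invented):

```python
# runtime_checkable Protocols let isinstance() verify method *names* only —
# signatures are not checked — so this passes for any duck-typed adapter.
from dqt.adapters._protocol import WarehouseAdapter
from dqt.adapters.local import LocalFileAdapter

adapter = LocalFileAdapter("events.csv")  # hypothetical path
assert isinstance(adapter, WarehouseAdapter)
```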
dqt/adapters/local/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from dqt.adapters.local.adapter import LocalFileAdapter
+
+ __all__ = ["LocalFileAdapter"]
dqt/adapters/local/adapter.py ADDED
@@ -0,0 +1,118 @@
+ # Ref: https://duckdb.org/docs/api/python/overview — used for SQL aggregations on DataFrames
+ from __future__ import annotations
+
+ import pathlib
+ import time
+ from typing import Any
+
+ import pandas as pd
+
+ from dqt.adapters._protocol import (
+     AggExpr,
+     ColumnMeta,
+     HealthCheckResult,
+     HealthCheckStep,
+ )
+ from dqt.utils.logging import get_logger
+
+ _log = get_logger(__name__)
+
+ _READERS: dict[str, Any] = {
+     ".csv": lambda p: pd.read_csv(p),
+     ".tsv": lambda p: pd.read_csv(p, sep="\t"),
+     ".xlsx": lambda p: pd.read_excel(p),
+     ".xls": lambda p: pd.read_excel(p),
+     ".parquet": lambda p: pd.read_parquet(p),
+     ".json": lambda p: pd.read_json(p),
+     ".jsonl": lambda p: pd.read_json(p, lines=True),
+     ".ndjson": lambda p: pd.read_json(p, lines=True),
+     ".feather": lambda p: pd.read_feather(p),
+     ".arrow": lambda p: pd.read_feather(p),
+ }
+
+ _HEALTH_STEPS = ("readable", "parseable", "columns", "sample_read", "row_count")
+
+
+ class LocalFileAdapter:
+     """Reads a local file and exposes it as a single-table WarehouseAdapter."""
+
+     def __init__(self, path: str | pathlib.Path) -> None:
+         self._path = pathlib.Path(path)
+         self._suffix = self._path.suffix.lower()
+         if self._suffix not in _READERS:
+             supported = ", ".join(sorted(_READERS))
+             raise ValueError(f"Unsupported format '{self._suffix}'. Supported: {supported}")
+         self._table_name = self._path.stem
+
+     def _read(self) -> pd.DataFrame:
+         return _READERS[self._suffix](self._path)
+
+     def health_check(self) -> HealthCheckResult:
+         steps: list[HealthCheckStep] = []
+
+         t0 = time.perf_counter()
+         if not self._path.exists():
+             steps.append(HealthCheckStep("file_exists", "fail", 0.0, f"not found: {self._path}"))
+             for name in _HEALTH_STEPS:
+                 steps.append(HealthCheckStep(name, "skip", 0.0, "skipped"))
+             return HealthCheckResult(steps=steps)
+         steps.append(HealthCheckStep("file_exists", "pass", (time.perf_counter() - t0) * 1000, str(self._path)))
+
+         t0 = time.perf_counter()
+         try:
+             self._path.read_bytes()[:1024]
+             steps.append(HealthCheckStep("readable", "pass", (time.perf_counter() - t0) * 1000, "ok"))
+         except Exception as exc:
+             steps.append(HealthCheckStep("readable", "fail", 0.0, str(exc)))
+             for name in ("parseable", "columns", "sample_read", "row_count"):
+                 steps.append(HealthCheckStep(name, "skip", 0.0, "skipped"))
+             return HealthCheckResult(steps=steps)
+
+         t0 = time.perf_counter()
+         try:
+             df = self._read()
+         except Exception as exc:
+             steps.append(HealthCheckStep("parseable", "fail", 0.0, str(exc)))
+             for name in ("columns", "sample_read", "row_count"):
+                 steps.append(HealthCheckStep(name, "skip", 0.0, "skipped"))
+             return HealthCheckResult(steps=steps)
+         steps.append(HealthCheckStep("parseable", "pass", (time.perf_counter() - t0) * 1000, f"{len(df.columns)} columns"))
+
+         steps.append(HealthCheckStep("columns", "pass", 0.0, str(list(df.columns)[:5])))
+         steps.append(HealthCheckStep("sample_read", "pass", 0.0, "ok"))
+         steps.append(HealthCheckStep("row_count", "pass", 0.0, f"{len(df)} rows"))
+         return HealthCheckResult(steps=steps)
+
+     def list_schemas(self) -> list[str]:
+         return ["default"]
+
+     def list_tables(self, schema: str) -> list[str]:
+         return [self._table_name]
+
+     def describe_columns(self, schema: str, table: str) -> list[ColumnMeta]:
+         df = self._read()
+         return [
+             ColumnMeta(
+                 name=col,
+                 data_type=str(df[col].dtype),
+                 nullable=bool(df[col].isna().any()),
+                 position=i + 1,
+             )
+             for i, col in enumerate(df.columns)
+         ]
+
+     def sample(self, schema: str, table: str, n: int = 100_000) -> pd.DataFrame:
+         df = self._read()
+         if len(df) <= n:
+             return df.reset_index(drop=True)
+         return df.sample(n=n, random_state=42).reset_index(drop=True)
+
+     def aggregate(self, schema: str, table: str, exprs: list[AggExpr]) -> dict[str, Any]:
+         import duckdb
+         df = self._read()
+         con = duckdb.connect()
+         con.register("_data", df)
+         cols = ", ".join(f"{e.sql} AS {e.name}" for e in exprs)
+         row = con.execute(f"SELECT {cols} FROM _data").fetchone()  # noqa: S608
+         con.close()
+         return dict(zip([e.name for e in exprs], row))
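A minimal usage sketch for `LocalFileAdapter`, assuming a hypothetical `orders.parquet` with an `amount` column in the working directory:

```python
from dqt.adapters._protocol import AggExpr
from dqt.adapters.local import LocalFileAdapter

adapter = LocalFileAdapter("orders.parquet")   # table name becomes the file stem: "orders"
if adapter.health_check().passed:
    stats = adapter.aggregate(
        "default", "orders",
        [AggExpr(name="row_count", sql="COUNT(*)"),
         AggExpr(name="max_amount", sql='MAX("amount")')],  # "amount" is an assumed column
    )
    print(stats)   # {"row_count": ..., "max_amount": ...}
```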
dqt/adapters/postgres/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from dqt.adapters.postgres.adapter import PostgresAdapter
+ from dqt.adapters.postgres.config import PostgresConfig
+
+ __all__ = ["PostgresAdapter", "PostgresConfig"]
dqt/adapters/postgres/adapter.py ADDED
@@ -0,0 +1,154 @@
+ # PostgresAdapter wraps SQLAlchemy for all warehouse operations.
+ # Sampling uses ORDER BY random() with LIMIT for portable random rows; TABLESAMPLE BERNOULLI is a possible faster alternative.
+ from __future__ import annotations
+
+ import datetime
+ import time
+ from typing import Any
+
+ import pandas as pd
+ import sqlalchemy as sa
+
+ from dqt.adapters._protocol import (
+     AggExpr,
+     ColumnMeta,
+     HealthCheckResult,
+     HealthCheckStep,
+ )
+ from dqt.utils.logging import get_logger
+
+ _log = get_logger(__name__)
+
+
+ class PostgresAdapter:
+     def __init__(self, conn_str: str) -> None:
+         self._conn_str = conn_str
+         self._engine = sa.create_engine(
+             conn_str,
+             pool_pre_ping=True,
+             execution_options={"isolation_level": "READ COMMITTED"},
+         )
+
+     def health_check(self) -> HealthCheckResult:
+         steps: list[HealthCheckStep] = []
+         steps.append(self._step_tcp())
+         if steps[-1].status == "fail":
+             for name in ("auth", "info_schema", "sample_select", "latency_probe", "clock_skew"):
+                 steps.append(HealthCheckStep(name=name, status="skip", latency_ms=0.0, detail="skipped"))
+             return HealthCheckResult(steps=steps)
+         steps.append(self._step_auth())
+         steps.append(self._step_info_schema())
+         steps.append(self._step_sample_select())
+         steps.append(self._step_latency())
+         steps.append(self._step_clock_skew())
+         return HealthCheckResult(steps=steps)
+
+     def _step_tcp(self) -> HealthCheckStep:
+         t0 = time.perf_counter()
+         try:
+             with self._engine.connect() as conn:
+                 conn.execute(sa.text("SELECT 1"))
+                 return HealthCheckStep("tcp_reach", "pass", (time.perf_counter() - t0) * 1000, "ok")
+         except Exception as exc:
+             return HealthCheckStep("tcp_reach", "fail", 0.0, str(exc))
+
+     def _step_auth(self) -> HealthCheckStep:
+         t0 = time.perf_counter()
+         try:
+             with self._engine.connect() as conn:
+                 user = conn.execute(sa.text("SELECT current_user")).scalar()
+                 return HealthCheckStep("auth", "pass", (time.perf_counter() - t0) * 1000, f"user={user}")
+         except Exception as exc:
+             return HealthCheckStep("auth", "fail", 0.0, str(exc))
+
+     def _step_info_schema(self) -> HealthCheckStep:
+         t0 = time.perf_counter()
+         try:
+             with self._engine.connect() as conn:
+                 conn.execute(sa.text(
+                     "SELECT COUNT(*) FROM information_schema.tables "
+                     "WHERE table_schema NOT IN ('pg_catalog','information_schema')"
+                 )).scalar()
+                 return HealthCheckStep("info_schema", "pass", (time.perf_counter() - t0) * 1000, "readable")
+         except Exception as exc:
+             return HealthCheckStep("info_schema", "fail", 0.0, str(exc))
+
+     def _step_sample_select(self) -> HealthCheckStep:
+         t0 = time.perf_counter()
+         try:
+             with self._engine.connect() as conn:
+                 conn.execute(sa.text(
+                     "SELECT table_name FROM information_schema.tables "
+                     "WHERE table_schema NOT IN ('pg_catalog','information_schema') LIMIT 1"
+                 )).fetchone()
+                 return HealthCheckStep("sample_select", "pass", (time.perf_counter() - t0) * 1000, "ok")
+         except Exception as exc:
+             return HealthCheckStep("sample_select", "fail", 0.0, str(exc))
+
+     def _step_latency(self) -> HealthCheckStep:
+         t0 = time.perf_counter()
+         try:
+             with self._engine.connect() as conn:
+                 conn.execute(sa.text("SELECT 1"))
+                 latency = (time.perf_counter() - t0) * 1000
+                 return HealthCheckStep("latency_probe", "pass", latency, f"{latency:.1f}ms")
+         except Exception as exc:
+             return HealthCheckStep("latency_probe", "fail", 0.0, str(exc))
+
+     def _step_clock_skew(self) -> HealthCheckStep:
+         t0 = time.perf_counter()
+         try:
+             with self._engine.connect() as conn:
+                 db_now = conn.execute(sa.text("SELECT NOW()")).scalar()
+                 local_now = datetime.datetime.now(datetime.timezone.utc)
+                 if db_now.tzinfo is None:
+                     db_now = db_now.replace(tzinfo=datetime.timezone.utc)
+                 skew_s = abs((db_now - local_now).total_seconds())
+                 status = "pass" if skew_s < 60 else "fail"
+                 return HealthCheckStep("clock_skew", status, (time.perf_counter() - t0) * 1000, f"skew={skew_s:.1f}s")
+         except Exception as exc:
+             return HealthCheckStep("clock_skew", "fail", 0.0, str(exc))
+
+     def list_schemas(self) -> list[str]:
+         with self._engine.connect() as conn:
+             rows = conn.execute(sa.text(
+                 "SELECT DISTINCT table_schema FROM information_schema.tables "
+                 "WHERE table_schema NOT IN ('pg_catalog','information_schema') ORDER BY 1"
+             )).fetchall()
+         return [r[0] for r in rows]
+
+     def list_tables(self, schema: str) -> list[str]:
+         with self._engine.connect() as conn:
+             rows = conn.execute(sa.text(
+                 "SELECT table_name FROM information_schema.tables "
+                 "WHERE table_schema = :schema ORDER BY 1"
+             ), {"schema": schema}).fetchall()
+         return [r[0] for r in rows]
+
+     def describe_columns(self, schema: str, table: str) -> list[ColumnMeta]:
+         with self._engine.connect() as conn:
+             rows = conn.execute(sa.text(
+                 "SELECT column_name, data_type, is_nullable, ordinal_position "
+                 "FROM information_schema.columns "
+                 "WHERE table_schema = :schema AND table_name = :table "
+                 "ORDER BY ordinal_position"
+             ), {"schema": schema, "table": table}).fetchall()
+         return [
+             ColumnMeta(name=r[0], data_type=r[1], nullable=(r[2] == "YES"), position=r[3])
+             for r in rows
+         ]
+
+     def sample(self, schema: str, table: str, n: int = 100_000) -> pd.DataFrame:
+         # Use ORDER BY random() to get a genuine random sample without TABLESAMPLE bias on small tables.
+         # schema/table are double-quoted identifiers, not user values in SQL context.
+         query = sa.text(f'SELECT * FROM "{schema}"."{table}" ORDER BY random() LIMIT :n')
+         with self._engine.connect() as conn:
+             return pd.read_sql(query, conn, params={"n": n})
+
+     def aggregate(self, schema: str, table: str, exprs: list[AggExpr]) -> dict[str, Any]:
+         cols = ", ".join(f"{e.sql} AS {e.name}" for e in exprs)
+         # schema/table are double-quoted identifiers; cols are built from AggExpr.sql (caller-controlled).
+         query = sa.text(f'SELECT {cols} FROM "{schema}"."{table}"')
+         with self._engine.connect() as conn:
+             row = conn.execute(query).fetchone()
+         return dict(zip([e.name for e in exprs], row))
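A sketch of running the six-step health check (the connection string is a placeholder); if `tcp_reach` fails, the adapter records the remaining steps as `skip` rather than raising:

```python
from dqt.adapters.postgres import PostgresAdapter

adapter = PostgresAdapter("postgresql+psycopg2://dq:secret@localhost:5432/analytics")
result = adapter.health_check()
for step in result.steps:
    print(f"{step.name:14} {step.status:5} {step.latency_ms:8.1f}ms  {step.detail}")
print("overall:", result.passed)
```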
dqt/adapters/postgres/config.py ADDED
@@ -0,0 +1,17 @@
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class PostgresConfig:
+     host: str = "localhost"
+     port: int = 5432
+     database: str = "postgres"
+     username: str = "postgres"
+     password: str = ""
+     ssl_mode: str = "prefer"
+
+     def to_conn_str(self) -> str:
+         return (
+             f"postgresql+psycopg2://{self.username}:{self.password}"
+             f"@{self.host}:{self.port}/{self.database}?sslmode={self.ssl_mode}"
+         )
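`PostgresConfig` only assembles the SQLAlchemy DSN; a sketch with placeholder credentials:

```python
from dqt.adapters.postgres import PostgresAdapter, PostgresConfig

cfg = PostgresConfig(host="db.internal", database="analytics",
                     username="dq_reader", password="s3cret")
adapter = PostgresAdapter(cfg.to_conn_str())
# to_conn_str() -> "postgresql+psycopg2://dq_reader:s3cret@db.internal:5432/analytics?sslmode=prefer"
```

Note that the password is interpolated verbatim, so credentials containing `@`, `:`, or `/` would need URL-encoding before being passed in.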
dqt/agent/__init__.py ADDED
File without changes
dqt/algorithms/__init__.py ADDED
File without changes
dqt/algorithms/_base.py ADDED
@@ -0,0 +1,70 @@
+ # Base classes for all detectors. StatScale and STAT_SCALES live in _scales.py (no dqt imports there).
+ # compute_verdict defers the _scales import to break any potential circular dependency.
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from enum import Enum
+ from typing import TYPE_CHECKING, Any, ClassVar
+
+ import pandas as pd
+
+ if TYPE_CHECKING:
+     from dqt.adapters._protocol import AggExpr
+
+
+ class Verdict(str, Enum):
+     pass_ = "pass"
+     warn = "warn"
+     fail = "fail"
+
+
+ @dataclass
+ class DetectorResult:
+     score: float
+     verdict: Verdict
+     plain_english: str
+     details: dict[str, Any] = field(default_factory=dict)
+
+
+ DetectorState = Any
+
+
+ def compute_verdict(score: float, slug: str) -> Verdict:
+     from dqt.algorithms._scales import STAT_SCALES  # deferred to avoid circular deps
+     scale = STAT_SCALES.get(slug)
+     if scale is None:
+         raise KeyError(f"No STAT_SCALE entry for slug '{slug}'. Add it to _scales.py.")
+     if scale.direction == "lower_is_better":
+         if score >= scale.fail_threshold:
+             return Verdict.fail
+         if score >= scale.warn_threshold:
+             return Verdict.warn
+         return Verdict.pass_
+     else:
+         if score <= scale.fail_threshold:
+             return Verdict.fail
+         if score <= scale.warn_threshold:
+             return Verdict.warn
+         return Verdict.pass_
+
+
+ class BaseDetector:
+     slug: ClassVar[str]
+     group: ClassVar[str]
+     kind: ClassVar[str] = "sample"
+
+     def fit(self, reference: pd.DataFrame) -> DetectorState:
+         raise NotImplementedError
+
+     def score(self, current: pd.DataFrame, state: DetectorState) -> DetectorResult:
+         raise NotImplementedError
+
+     def _verdict(self, score: float) -> Verdict:
+         return compute_verdict(score, self.slug)
+
+
+ class BaseAggregateDetector(BaseDetector):
+     kind: ClassVar[str] = "aggregate"
+
+     def get_aggregations(self, col: str) -> list[AggExpr]:
+         raise NotImplementedError
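The `fit`/`score` split is the whole detector contract. A hypothetical subclass for illustration (the class name and state shape are invented; the `null_fraction` slug is real and defined in `_scales.py`):

```python
import pandas as pd

from dqt.algorithms._base import BaseDetector, DetectorResult


class NullSpikeDetector(BaseDetector):
    slug = "null_fraction"   # must have a STAT_SCALES entry for _verdict() to work
    group = "basic"

    def fit(self, reference: pd.DataFrame):
        return {"col": reference.columns[0]}   # remember which column to watch

    def score(self, current: pd.DataFrame, state) -> DetectorResult:
        frac = float(current[state["col"]].isna().mean())
        return DetectorResult(
            score=frac,
            verdict=self._verdict(frac),   # null_fraction: warn at >= 0.01, fail at >= 0.05
            plain_english=f"{frac:.2%} of values are NULL",
        )
```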
dqt/algorithms/_registry.py ADDED
@@ -0,0 +1,28 @@
+ # Detector registry for slug-based lookup.
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from dqt.algorithms._base import BaseDetector
+
+
+ class Registry:
+     def __init__(self) -> None:
+         self._map: dict[str, type[BaseDetector]] = {}
+
+     def register(self, cls: type[BaseDetector]) -> type[BaseDetector]:
+         self._map[cls.slug] = cls
+         return cls
+
+     def get(self, slug: str) -> type[BaseDetector]:
+         try:
+             return self._map[slug]
+         except KeyError:
+             raise KeyError(f"Detector slug '{slug}' not registered. Import the detector module first.")
+
+     def slugs(self) -> list[str]:
+         return list(self._map.keys())
+
+
+ registry = Registry()
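`register()` returns the class unchanged, so it doubles as a class decorator; this is the side effect the detector-group imports in `dqt/__init__.py` rely on. A sketch with an invented detector:

```python
from dqt.algorithms._base import BaseDetector
from dqt.algorithms._registry import registry


@registry.register
class MyDetector(BaseDetector):
    slug = "my_detector"   # registration needs only a slug; scoring would also need a _scales.py entry
    group = "custom"


assert registry.get("my_detector") is MyDetector
assert "my_detector" in registry.slugs()
```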
dqt/algorithms/_scales.py ADDED
@@ -0,0 +1,62 @@
+ # Single source of truth for stat scale definitions.
+ # Frontend reads the TS version generated by `make stats-scales`.
+ # ZERO imports from dqt — this file must import nothing from this package.
+ from typing import Literal, NamedTuple
+
+
+ class StatScale(NamedTuple):
+     slug: str
+     max: float
+     warn_threshold: float
+     fail_threshold: float
+     direction: Literal["lower_is_better", "higher_is_better"]
+     plain_english_label: str
+     hint: str
+
+
+ STAT_SCALES: dict[str, StatScale] = {
+     s.slug: s for s in [
+         StatScale("completeness_rate", 1.0, 0.95, 0.90, "higher_is_better", "Completeness", "Fraction of non-null values"),
+         StatScale("uniqueness_rate", 1.0, 0.95, 0.80, "higher_is_better", "Uniqueness", "Fraction of distinct values"),
+         StatScale("validity_rate", 1.0, 0.95, 0.90, "higher_is_better", "Validity", "Fraction of values matching the rule"),
+         StatScale("numeric_mean_shift", 10.0, 2.0, 3.0, "lower_is_better", "Mean shift (σ)", "Z-score of mean deviation from baseline"),
+         StatScale("volume_change_ratio", 1.0, 0.10, 0.25, "lower_is_better", "Row-count change", "Fractional deviation from baseline row count"),
+         StatScale("schema_change", 1.0, 0.5, 0.5, "lower_is_better", "Schema change", "1.0 if schema changed, 0.0 if unchanged"),
+         StatScale("max_in_range", 1.0, 0.5, 0.5, "lower_is_better", "Max in bounds", "1.0 when MAX(col) outside [min, max]; 0.0 otherwise"),
+         StatScale("min_in_range", 1.0, 0.5, 0.5, "lower_is_better", "Min in bounds", "1.0 when MIN(col) outside [min, max]; 0.0 otherwise"),
+         StatScale("median_in_range", 1.0, 0.5, 0.5, "lower_is_better", "Median in bounds", "1.0 when median outside [min, max]"),
+         StatScale("stddev_in_range", 1.0, 0.5, 0.5, "lower_is_better", "Stddev in bounds", "1.0 when STDDEV outside [min, max]"),
+         StatScale("sum_in_range", 1.0, 0.5, 0.5, "lower_is_better", "Sum in bounds", "1.0 when SUM outside [min, max]"),
+         StatScale("cardinality_in_range", 1.0, 0.5, 0.5, "lower_is_better", "Cardinality in bounds", "1.0 when COUNT(DISTINCT col) outside [min, max]"),
+         StatScale("quantile_in_range", 1.0, 0.5, 0.5, "lower_is_better", "Quantile in bounds", "1.0 when specified quantile outside [min, max]"),
+         StatScale("value_in_range_violation", 0.10, 0.001, 0.01, "lower_is_better", "Values in range", "Fraction of values outside [min, max]"),
+         StatScale("set_membership_violation", 0.10, 0.001, 0.01, "lower_is_better", "Set membership", "Fraction of values not in the allowed set"),
+         StatScale("set_exclusion_violation", 0.10, 0.001, 0.01, "lower_is_better", "Set exclusion", "Fraction of values in the forbidden set"),
+         StatScale("regex_match_violation", 0.10, 0.001, 0.01, "lower_is_better", "Regex format", "Fraction of values not matching any regex pattern"),
+         StatScale("string_length_violation", 0.10, 0.001, 0.01, "lower_is_better", "String length", "Fraction of values with length outside [min_len, max_len]"),
+         StatScale("date_format_violation", 0.10, 0.001, 0.01, "lower_is_better", "Date format", "Fraction of values not parseable as the given date format"),
+         StatScale("monotonicity_violation", 1.0, 0.5, 0.5, "lower_is_better", "Monotonicity", "1.0 if ordering violated; 0.0 if sequence is monotonic"),
+         StatScale("column_pair_violation", 0.10, 0.001, 0.01, "lower_is_better", "Column pair rule", "Fraction of rows where the pair comparison rule is violated"),
+         StatScale("composite_uniqueness_violation", 0.10, 0.001, 0.01, "lower_is_better", "Composite key uniqueness", "Fraction of rows that are duplicates on the composite key"),
+         StatScale("referential_integrity_rate", 1.0, 0.99, 0.95, "higher_is_better", "Referential integrity", "Fraction of FK values present in parent table"),
+         StatScale("ks_pvalue", 1.0, 0.95, 0.99, "lower_is_better", "KS drift (1−p)", "1 − p-value from two-sample KS test; warn p<0.05, fail p<0.01"),
+         StatScale("mad_outlier_fraction", 0.20, 0.01, 0.05, "lower_is_better", "Outlier fraction (MAD)", "Fraction of values with |modified Z| > 3.5"),
+         StatScale("double_mad_outlier_fraction", 0.20, 0.01, 0.05, "lower_is_better", "Outlier fraction (double-MAD)", "Fraction flagged by asymmetric double-MAD; robust on skewed distributions"),
+         StatScale("zscore_outlier_fraction", 0.10, 0.01, 0.05, "lower_is_better", "Outlier fraction (Z-score)", "Fraction of values with |Z| > threshold; valid only under normality"),
+         StatScale("adjusted_boxplot_fraction", 0.20, 0.01, 0.05, "lower_is_better", "Outlier fraction (adj. boxplot)", "Fraction outside medcouple-adjusted Tukey fences; Hubert & Vandervieren 2008"),
+         StatScale("isolation_forest_fraction", 0.20, 0.05, 0.10, "lower_is_better", "Outlier fraction (IF)", "Fraction of rows classified as anomalies by Isolation Forest"),
+         StatScale("stl_residual_zscore", 10.0, 3.0, 5.0, "lower_is_better", "STL residual Z-score", "Max absolute Z-score of STL residuals over the current window"),
+         StatScale("freshness_seconds_behind", 86400*7, 3600, 86400, "lower_is_better",
+                   "Data freshness", "Seconds since the most recent row timestamp"),
+         StatScale("null_fraction", 1.0, 0.01, 0.05, "lower_is_better",
+                   "Null fraction", "Fraction of rows where the column is NULL"),
+         StatScale("string_case_violation", 1.0, 0.001, 0.01, "lower_is_better",
+                   "String case violation", "Fraction of rows with wrong case"),
+         StatScale("sql_assertion_violation", 1.0, 0.001, 0.01, "lower_is_better",
+                   "SQL assertion violation", "Fraction of rows failing the custom SQL condition"),
+         StatScale("date_part_missing_fraction", 1.0, 0.01, 0.05, "lower_is_better",
+                   "Date-part completeness", "Fraction of expected date buckets with no data"),
+         StatScale("outlier_fraction_drift", 1.0, 0.001, 0.01, "lower_is_better",
+                   "Outlier fraction drift", "Deviation of current outlier fraction from historical baseline range"),
+     ]
+ }
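A worked example of how these thresholds interact with `compute_verdict` from `_base.py` (values copied from the table above):

```python
# completeness_rate is higher_is_better with warn_threshold=0.95, fail_threshold=0.90.
from dqt import Verdict, compute_verdict

assert compute_verdict(0.97, "completeness_rate") is Verdict.pass_  # above 0.95
assert compute_verdict(0.93, "completeness_rate") is Verdict.warn   # in (0.90, 0.95]
assert compute_verdict(0.88, "completeness_rate") is Verdict.fail   # at or below 0.90
# lower_is_better flips the comparisons: null_fraction warns at >= 0.01, fails at >= 0.05.
assert compute_verdict(0.03, "null_fraction") is Verdict.warn
```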
dqt/algorithms/basic/__init__.py ADDED
@@ -0,0 +1,35 @@
+ from dqt.algorithms.basic.completeness import CompletenessDetector
+ from dqt.algorithms.basic.uniqueness import UniquenessDetector
+ from dqt.algorithms.basic.validity import ValidityDetector
+ from dqt.algorithms.basic.numeric import NumericMeanDetector
+ from dqt.algorithms.basic.volume import VolumeDetector
+ from dqt.algorithms.basic.numeric_bounds import (
+     MaxInRangeDetector, MinInRangeDetector, MedianInRangeDetector,
+     StdDevInRangeDetector, SumInRangeDetector, CardinalityInRangeDetector,
+     QuantileInRangeDetector,
+ )
+ from dqt.algorithms.basic.value_checks import (
+     ValueInRangeDetector, SetMembershipDetector, SetExclusionDetector,
+     RegexMatchDetector, StringLengthRangeDetector, DateFormatDetector,
+ )
+ from dqt.algorithms.basic.monotonicity import MonotonicityDetector
+ from dqt.algorithms.basic.column_pairs import ColumnPairComparisonDetector, CompositeUniquenessDetector
+ from dqt.algorithms.basic.freshness import FreshnessDetector
+ from dqt.algorithms.basic.null_fraction import NullFractionDetector
+ from dqt.algorithms.basic.string_case import StringCaseDetector
+ from dqt.algorithms.basic.sql_assertion import SqlAssertionDetector
+ from dqt.algorithms.basic.date_part import DatePartCompletenessDetector
+
+ __all__ = [
+     "CompletenessDetector", "UniquenessDetector", "ValidityDetector",
+     "NumericMeanDetector", "VolumeDetector",
+     "MaxInRangeDetector", "MinInRangeDetector", "MedianInRangeDetector",
+     "StdDevInRangeDetector", "SumInRangeDetector", "CardinalityInRangeDetector",
+     "QuantileInRangeDetector",
+     "ValueInRangeDetector", "SetMembershipDetector", "SetExclusionDetector",
+     "RegexMatchDetector", "StringLengthRangeDetector", "DateFormatDetector",
+     "MonotonicityDetector",
+     "ColumnPairComparisonDetector", "CompositeUniquenessDetector",
+     "FreshnessDetector", "NullFractionDetector", "StringCaseDetector",
+     "SqlAssertionDetector", "DatePartCompletenessDetector",
+ ]
dqt/algorithms/basic/_helpers.py ADDED
@@ -0,0 +1,18 @@
+ from __future__ import annotations
+
+ import pandas as pd
+
+ from dqt.algorithms._base import DetectorResult
+
+
+ def fraction_result(df: pd.DataFrame, slug: str, label: str) -> DetectorResult:
+     from dqt.algorithms._base import compute_verdict
+     row = df.iloc[0]
+     total = int(row["total_count"])
+     frac = int(row["violation_count"]) / total if total > 0 else 0.0
+     return DetectorResult(
+         score=frac,
+         verdict=compute_verdict(frac, slug),
+         plain_english=f"{frac:.2%} of values violate {label}",
+         details={"violation_fraction": frac, "violation_count": int(row["violation_count"]), "total": total},
+     )
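A sketch of how an aggregate detector would feed this helper: a one-row DataFrame with `violation_count` and `total_count` columns, the shape the adapter's aggregation produces (the counts here are invented):

```python
import pandas as pd

from dqt.algorithms.basic._helpers import fraction_result

agg = pd.DataFrame([{"violation_count": 12, "total_count": 10_000}])
res = fraction_result(agg, slug="regex_match_violation", label="the email pattern")
print(res.plain_english)    # "0.12% of values violate the email pattern"
print(res.verdict.value)    # "warn" — 0.0012 is past the 0.001 warn threshold, below the 0.01 fail
```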