dqtlib 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dqtlib-0.1.0/.gitignore +128 -0
- dqtlib-0.1.0/PKG-INFO +64 -0
- dqtlib-0.1.0/README.md +17 -0
- dqtlib-0.1.0/pyproject.toml +44 -0
- dqtlib-0.1.0/src/dqt/__init__.py +49 -0
- dqtlib-0.1.0/src/dqt/adapters/__init__.py +0 -0
- dqtlib-0.1.0/src/dqt/adapters/_protocol.py +47 -0
- dqtlib-0.1.0/src/dqt/adapters/local/__init__.py +3 -0
- dqtlib-0.1.0/src/dqt/adapters/local/adapter.py +118 -0
- dqtlib-0.1.0/src/dqt/adapters/postgres/__init__.py +4 -0
- dqtlib-0.1.0/src/dqt/adapters/postgres/adapter.py +154 -0
- dqtlib-0.1.0/src/dqt/adapters/postgres/config.py +17 -0
- dqtlib-0.1.0/src/dqt/agent/__init__.py +0 -0
- dqtlib-0.1.0/src/dqt/algorithms/__init__.py +0 -0
- dqtlib-0.1.0/src/dqt/algorithms/_base.py +70 -0
- dqtlib-0.1.0/src/dqt/algorithms/_registry.py +28 -0
- dqtlib-0.1.0/src/dqt/algorithms/_scales.py +62 -0
- dqtlib-0.1.0/src/dqt/algorithms/basic/__init__.py +35 -0
- dqtlib-0.1.0/src/dqt/algorithms/basic/_helpers.py +18 -0
- dqtlib-0.1.0/src/dqt/algorithms/basic/column_pairs.py +80 -0
- dqtlib-0.1.0/src/dqt/algorithms/basic/completeness.py +40 -0
- dqtlib-0.1.0/src/dqt/algorithms/basic/date_part.py +52 -0
- dqtlib-0.1.0/src/dqt/algorithms/basic/freshness.py +53 -0
- dqtlib-0.1.0/src/dqt/algorithms/basic/monotonicity.py +42 -0
- dqtlib-0.1.0/src/dqt/algorithms/basic/null_fraction.py +37 -0
- dqtlib-0.1.0/src/dqt/algorithms/basic/numeric.py +42 -0
- dqtlib-0.1.0/src/dqt/algorithms/basic/numeric_bounds.py +155 -0
- dqtlib-0.1.0/src/dqt/algorithms/basic/sql_assertion.py +40 -0
- dqtlib-0.1.0/src/dqt/algorithms/basic/string_case.py +51 -0
- dqtlib-0.1.0/src/dqt/algorithms/basic/uniqueness.py +40 -0
- dqtlib-0.1.0/src/dqt/algorithms/basic/validity.py +46 -0
- dqtlib-0.1.0/src/dqt/algorithms/basic/value_checks.py +174 -0
- dqtlib-0.1.0/src/dqt/algorithms/basic/volume.py +35 -0
- dqtlib-0.1.0/src/dqt/algorithms/distribution/__init__.py +7 -0
- dqtlib-0.1.0/src/dqt/algorithms/distribution/profiler.py +127 -0
- dqtlib-0.1.0/src/dqt/algorithms/drift/__init__.py +3 -0
- dqtlib-0.1.0/src/dqt/algorithms/drift/ks2sample.py +40 -0
- dqtlib-0.1.0/src/dqt/algorithms/outliers_multi/__init__.py +3 -0
- dqtlib-0.1.0/src/dqt/algorithms/outliers_multi/isolation_forest.py +41 -0
- dqtlib-0.1.0/src/dqt/algorithms/outliers_uni/__init__.py +14 -0
- dqtlib-0.1.0/src/dqt/algorithms/outliers_uni/adjusted_boxplot.py +59 -0
- dqtlib-0.1.0/src/dqt/algorithms/outliers_uni/auto_outlier.py +110 -0
- dqtlib-0.1.0/src/dqt/algorithms/outliers_uni/mad.py +84 -0
- dqtlib-0.1.0/src/dqt/algorithms/outliers_uni/outlier_fraction_range.py +116 -0
- dqtlib-0.1.0/src/dqt/algorithms/outliers_uni/zscore.py +35 -0
- dqtlib-0.1.0/src/dqt/algorithms/referential/__init__.py +2 -0
- dqtlib-0.1.0/src/dqt/algorithms/referential/referential.py +52 -0
- dqtlib-0.1.0/src/dqt/algorithms/schema/__init__.py +2 -0
- dqtlib-0.1.0/src/dqt/algorithms/schema/schema_checks.py +56 -0
- dqtlib-0.1.0/src/dqt/algorithms/timeseries/__init__.py +3 -0
- dqtlib-0.1.0/src/dqt/algorithms/timeseries/stl.py +51 -0
- dqtlib-0.1.0/src/dqt/causality/__init__.py +0 -0
- dqtlib-0.1.0/src/dqt/checks/__init__.py +0 -0
- dqtlib-0.1.0/src/dqt/checks/loader.py +70 -0
- dqtlib-0.1.0/src/dqt/checks/models.py +42 -0
- dqtlib-0.1.0/src/dqt/checks/schema/__init__.py +0 -0
- dqtlib-0.1.0/src/dqt/checks/schema/check.schema.json +74 -0
- dqtlib-0.1.0/src/dqt/compat/__init__.py +0 -0
- dqtlib-0.1.0/src/dqt/governance/__init__.py +0 -0
- dqtlib-0.1.0/src/dqt/hitl/__init__.py +0 -0
- dqtlib-0.1.0/src/dqt/lineage/__init__.py +4 -0
- dqtlib-0.1.0/src/dqt/lineage/models.py +42 -0
- dqtlib-0.1.0/src/dqt/lineage/vault.py +273 -0
- dqtlib-0.1.0/src/dqt/profiling/__init__.py +12 -0
- dqtlib-0.1.0/src/dqt/profiling/models.py +79 -0
- dqtlib-0.1.0/src/dqt/profiling/profiler.py +236 -0
- dqtlib-0.1.0/src/dqt/reporting/__init__.py +5 -0
- dqtlib-0.1.0/src/dqt/reporting/_charts.py +173 -0
- dqtlib-0.1.0/src/dqt/reporting/html_report.py +480 -0
- dqtlib-0.1.0/src/dqt/runner/__init__.py +0 -0
- dqtlib-0.1.0/src/dqt/runner/runner.py +121 -0
- dqtlib-0.1.0/src/dqt/semantic/__init__.py +9 -0
- dqtlib-0.1.0/src/dqt/semantic/loader.py +10 -0
- dqtlib-0.1.0/src/dqt/semantic/models.py +35 -0
- dqtlib-0.1.0/src/dqt/store/__init__.py +0 -0
- dqtlib-0.1.0/src/dqt/store/_protocol.py +43 -0
- dqtlib-0.1.0/src/dqt/store/memory.py +27 -0
- dqtlib-0.1.0/src/dqt/utils/__init__.py +0 -0
- dqtlib-0.1.0/src/dqt/utils/logging.py +26 -0
- dqtlib-0.1.0/tests/__init__.py +0 -0
- dqtlib-0.1.0/tests/adapters/__init__.py +0 -0
- dqtlib-0.1.0/tests/adapters/test_local_adapter.py +141 -0
- dqtlib-0.1.0/tests/adapters/test_postgres_adapter.py +77 -0
- dqtlib-0.1.0/tests/algorithms/__init__.py +0 -0
- dqtlib-0.1.0/tests/algorithms/basic/__init__.py +0 -0
- dqtlib-0.1.0/tests/algorithms/basic/test_column_pairs.py +86 -0
- dqtlib-0.1.0/tests/algorithms/basic/test_completeness.py +77 -0
- dqtlib-0.1.0/tests/algorithms/basic/test_date_part.py +33 -0
- dqtlib-0.1.0/tests/algorithms/basic/test_freshness.py +40 -0
- dqtlib-0.1.0/tests/algorithms/basic/test_monotonicity.py +70 -0
- dqtlib-0.1.0/tests/algorithms/basic/test_null_fraction.py +34 -0
- dqtlib-0.1.0/tests/algorithms/basic/test_numeric.py +68 -0
- dqtlib-0.1.0/tests/algorithms/basic/test_numeric_bounds.py +124 -0
- dqtlib-0.1.0/tests/algorithms/basic/test_sql_assertion.py +27 -0
- dqtlib-0.1.0/tests/algorithms/basic/test_string_case.py +34 -0
- dqtlib-0.1.0/tests/algorithms/basic/test_uniqueness.py +62 -0
- dqtlib-0.1.0/tests/algorithms/basic/test_validity.py +70 -0
- dqtlib-0.1.0/tests/algorithms/basic/test_value_checks.py +138 -0
- dqtlib-0.1.0/tests/algorithms/basic/test_volume.py +67 -0
- dqtlib-0.1.0/tests/algorithms/distribution/__init__.py +0 -0
- dqtlib-0.1.0/tests/algorithms/distribution/test_profiler.py +57 -0
- dqtlib-0.1.0/tests/algorithms/drift/__init__.py +0 -0
- dqtlib-0.1.0/tests/algorithms/drift/test_ks2sample.py +75 -0
- dqtlib-0.1.0/tests/algorithms/outliers_multi/__init__.py +0 -0
- dqtlib-0.1.0/tests/algorithms/outliers_multi/test_isolation_forest.py +54 -0
- dqtlib-0.1.0/tests/algorithms/outliers_uni/__init__.py +0 -0
- dqtlib-0.1.0/tests/algorithms/outliers_uni/test_adjusted_boxplot.py +70 -0
- dqtlib-0.1.0/tests/algorithms/outliers_uni/test_auto_outlier.py +52 -0
- dqtlib-0.1.0/tests/algorithms/outliers_uni/test_mad.py +134 -0
- dqtlib-0.1.0/tests/algorithms/outliers_uni/test_zscore.py +70 -0
- dqtlib-0.1.0/tests/algorithms/referential/__init__.py +0 -0
- dqtlib-0.1.0/tests/algorithms/referential/test_referential.py +72 -0
- dqtlib-0.1.0/tests/algorithms/schema/__init__.py +0 -0
- dqtlib-0.1.0/tests/algorithms/schema/test_schema_checks.py +74 -0
- dqtlib-0.1.0/tests/algorithms/test_outlier_fraction_range.py +83 -0
- dqtlib-0.1.0/tests/algorithms/test_registry.py +87 -0
- dqtlib-0.1.0/tests/algorithms/timeseries/__init__.py +0 -0
- dqtlib-0.1.0/tests/algorithms/timeseries/test_stl.py +87 -0
- dqtlib-0.1.0/tests/checks/__init__.py +0 -0
- dqtlib-0.1.0/tests/checks/test_loader.py +135 -0
- dqtlib-0.1.0/tests/conftest.py +35 -0
- dqtlib-0.1.0/tests/profiling/__init__.py +0 -0
- dqtlib-0.1.0/tests/profiling/test_profiler.py +323 -0
- dqtlib-0.1.0/tests/reporting/__init__.py +0 -0
- dqtlib-0.1.0/tests/reporting/test_html_report.py +45 -0
- dqtlib-0.1.0/tests/runner/__init__.py +0 -0
- dqtlib-0.1.0/tests/runner/test_runner.py +191 -0
- dqtlib-0.1.0/tests/store/__init__.py +0 -0
- dqtlib-0.1.0/tests/store/test_memory_store.py +95 -0
- dqtlib-0.1.0/tests/test_core_types.py +112 -0
- dqtlib-0.1.0/tests/test_public_api.py +61 -0
- dqtlib-0.1.0/tests/test_vault.py +41 -0
dqtlib-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# --- Python ---
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
share/python-wheels/
|
|
20
|
+
*.egg-info/
|
|
21
|
+
.installed.cfg
|
|
22
|
+
*.egg
|
|
23
|
+
MANIFEST
|
|
24
|
+
|
|
25
|
+
# uv
|
|
26
|
+
.venv/
|
|
27
|
+
.uv-cache/
|
|
28
|
+
|
|
29
|
+
# pip
|
|
30
|
+
pip-log.txt
|
|
31
|
+
pip-delete-this-directory.txt
|
|
32
|
+
|
|
33
|
+
# pytest
|
|
34
|
+
.pytest_cache/
|
|
35
|
+
.cache/
|
|
36
|
+
.coverage
|
|
37
|
+
.coverage.*
|
|
38
|
+
htmlcov/
|
|
39
|
+
.tox/
|
|
40
|
+
.nox/
|
|
41
|
+
coverage.xml
|
|
42
|
+
*.cover
|
|
43
|
+
*.py,cover
|
|
44
|
+
.hypothesis/
|
|
45
|
+
|
|
46
|
+
# mypy / ruff
|
|
47
|
+
.mypy_cache/
|
|
48
|
+
.ruff_cache/
|
|
49
|
+
.dmypy.json
|
|
50
|
+
dmypy.json
|
|
51
|
+
|
|
52
|
+
# --- Node / Next.js ---
|
|
53
|
+
node_modules/
|
|
54
|
+
.next/
|
|
55
|
+
.turbo/
|
|
56
|
+
.swc/
|
|
57
|
+
out/
|
|
58
|
+
dist/
|
|
59
|
+
build/
|
|
60
|
+
.npm/
|
|
61
|
+
.pnpm-store/
|
|
62
|
+
pnpm-debug.log*
|
|
63
|
+
yarn-debug.log*
|
|
64
|
+
npm-debug.log*
|
|
65
|
+
|
|
66
|
+
# --- Generated artifacts (NEVER commit) ---
|
|
67
|
+
packages/dqt-types/
|
|
68
|
+
shared/generated/
|
|
69
|
+
apps/web/src/generated/
|
|
70
|
+
apps/web/src/lib/stats.generated.ts
|
|
71
|
+
apps/web/src/components/connections/engines.generated.ts
|
|
72
|
+
packages/dqt/src/dqt/generated/
|
|
73
|
+
apps/server/openapi.json
|
|
74
|
+
|
|
75
|
+
# --- Env / secrets ---
|
|
76
|
+
.env
|
|
77
|
+
.env.local
|
|
78
|
+
.env.*.local
|
|
79
|
+
*.pem
|
|
80
|
+
*.key
|
|
81
|
+
*.p12
|
|
82
|
+
*.crt
|
|
83
|
+
service-account*.json
|
|
84
|
+
|
|
85
|
+
# --- OS / editor ---
|
|
86
|
+
.DS_Store
|
|
87
|
+
Thumbs.db
|
|
88
|
+
.idea/
|
|
89
|
+
.vscode/
|
|
90
|
+
!.vscode/settings.json.example
|
|
91
|
+
*.swp
|
|
92
|
+
*.swo
|
|
93
|
+
*~
|
|
94
|
+
.history/
|
|
95
|
+
|
|
96
|
+
# --- Local dev ---
|
|
97
|
+
tmp/
|
|
98
|
+
*.log
|
|
99
|
+
logs/
|
|
100
|
+
|
|
101
|
+
# --- Generated demo reports ---
|
|
102
|
+
examples/*/reports/
|
|
103
|
+
apps/web/tsconfig.tsbuildinfo
|
|
104
|
+
|
|
105
|
+
# --- Unrelated legacy code ---
|
|
106
|
+
dql/
|
|
107
|
+
.cache/
|
|
108
|
+
|
|
109
|
+
# --- Docker / DB volumes ---
|
|
110
|
+
run_local/data/
|
|
111
|
+
run_local/postgres-data/
|
|
112
|
+
run_local/redis-data/
|
|
113
|
+
|
|
114
|
+
# --- Test artifacts ---
|
|
115
|
+
playwright-report/
|
|
116
|
+
test-results/
|
|
117
|
+
e2e-screenshots/
|
|
118
|
+
*.pyc
|
|
119
|
+
.benchmarks/
|
|
120
|
+
|
|
121
|
+
# --- Built docs ---
|
|
122
|
+
docs/_build/
|
|
123
|
+
site/
|
|
124
|
+
|
|
125
|
+
# --- Archives (created by build scripts) ---
|
|
126
|
+
*.zip
|
|
127
|
+
*.tar.gz
|
|
128
|
+
!reference_data/**/*.zip
|
dqtlib-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dqtlib
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Data quality, observability, semantic, and causality library
|
|
5
|
+
Project-URL: Homepage, https://github.com/antonbarr-data/dqt
|
|
6
|
+
Project-URL: Repository, https://github.com/antonbarr-data/dqt
|
|
7
|
+
Author-email: Anton Barr <antonbar@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
Keywords: anomaly-detection,causal-inference,causality,data-drift,data-quality,observability
|
|
10
|
+
Requires-Python: >=3.12
|
|
11
|
+
Requires-Dist: diptest>=0.6
|
|
12
|
+
Requires-Dist: duckdb>=0.9
|
|
13
|
+
Requires-Dist: ibis-framework>=9.0
|
|
14
|
+
Requires-Dist: jsonschema>=4.22
|
|
15
|
+
Requires-Dist: numpy>=1.26
|
|
16
|
+
Requires-Dist: pandas>=2.2
|
|
17
|
+
Requires-Dist: pydantic>=2.7
|
|
18
|
+
Requires-Dist: pyod>=1.1
|
|
19
|
+
Requires-Dist: pyyaml>=6.0
|
|
20
|
+
Requires-Dist: river>=0.21
|
|
21
|
+
Requires-Dist: scikit-learn>=1.5
|
|
22
|
+
Requires-Dist: scipy>=1.13
|
|
23
|
+
Requires-Dist: statsmodels>=0.14
|
|
24
|
+
Requires-Dist: structlog>=24.0
|
|
25
|
+
Provides-Extra: causal
|
|
26
|
+
Requires-Dist: causal-learn>=0.1; extra == 'causal'
|
|
27
|
+
Requires-Dist: dowhy>=0.11; extra == 'causal'
|
|
28
|
+
Requires-Dist: tigramite>=0.7; extra == 'causal'
|
|
29
|
+
Provides-Extra: deep
|
|
30
|
+
Requires-Dist: pyod[deep]>=1.1; extra == 'deep'
|
|
31
|
+
Requires-Dist: torch>=2.3; extra == 'deep'
|
|
32
|
+
Provides-Extra: explain
|
|
33
|
+
Requires-Dist: pgmpy>=0.1; extra == 'explain'
|
|
34
|
+
Requires-Dist: shap>=0.45; extra == 'explain'
|
|
35
|
+
Provides-Extra: files
|
|
36
|
+
Requires-Dist: openpyxl>=3.0; extra == 'files'
|
|
37
|
+
Requires-Dist: pyarrow>=14.0; extra == 'files'
|
|
38
|
+
Provides-Extra: forecast
|
|
39
|
+
Requires-Dist: prophet>=1.1; extra == 'forecast'
|
|
40
|
+
Requires-Dist: stumpy>=1.4; extra == 'forecast'
|
|
41
|
+
Provides-Extra: postgres
|
|
42
|
+
Requires-Dist: ibis-framework[postgres]>=9.0; extra == 'postgres'
|
|
43
|
+
Requires-Dist: psycopg2-binary>=2.9; extra == 'postgres'
|
|
44
|
+
Provides-Extra: reports
|
|
45
|
+
Requires-Dist: matplotlib>=3.8; extra == 'reports'
|
|
46
|
+
Description-Content-Type: text/markdown
|
|
47
|
+
|
|
48
|
+
# dqtlib
|
|
49
|
+
|
|
50
|
+
**Open-source data quality, lineage, semantic layer & causality — for dbt, warehouses and data lakes.**
|
|
51
|
+
|
|
52
|
+
pip-installable Python library for watching dbt-built warehouses and any SQL warehouse for statistical drift, anomalies, silent regressions, and explaining *why* metrics moved.
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install dqtlib
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
The import name is `dqt`:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from dqt import Check, Runner, MemoryStore
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Full documentation and examples: https://github.com/antonbarr-data/dqt
|
dqtlib-0.1.0/README.md
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# dqtlib
|
|
2
|
+
|
|
3
|
+
**Open-source data quality, lineage, semantic layer & causality — for dbt, warehouses and data lakes.**
|
|
4
|
+
|
|
5
|
+
pip-installable Python library for watching dbt-built warehouses and any SQL warehouse for statistical drift, anomalies, silent regressions, and explaining *why* metrics moved.
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install dqtlib
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
The import name is `dqt`:
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from dqt import Check, Runner, MemoryStore
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
Full documentation and examples: https://github.com/antonbarr-data/dqt
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "dqtlib"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Data quality, observability, semantic, and causality library"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.12"
|
|
7
|
+
license = { text = "MIT" }
|
|
8
|
+
keywords = ["data-quality", "observability", "anomaly-detection", "causality", "data-drift", "causal-inference"]
|
|
9
|
+
authors = [
|
|
10
|
+
{ name = "Anton Barr", email = "antonbar@gmail.com" },
|
|
11
|
+
]
|
|
12
|
+
urls = { Homepage = "https://github.com/antonbarr-data/dqt", Repository = "https://github.com/antonbarr-data/dqt" }
|
|
13
|
+
dependencies = [
|
|
14
|
+
"numpy>=1.26",
|
|
15
|
+
"scipy>=1.13",
|
|
16
|
+
"pandas>=2.2",
|
|
17
|
+
"statsmodels>=0.14",
|
|
18
|
+
"scikit-learn>=1.5",
|
|
19
|
+
"pyod>=1.1",
|
|
20
|
+
"diptest>=0.6",
|
|
21
|
+
"river>=0.21",
|
|
22
|
+
"structlog>=24.0",
|
|
23
|
+
"pydantic>=2.7",
|
|
24
|
+
"ibis-framework>=9.0",
|
|
25
|
+
"duckdb>=0.9",
|
|
26
|
+
"jsonschema>=4.22",
|
|
27
|
+
"pyyaml>=6.0",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.optional-dependencies]
|
|
31
|
+
postgres = ["psycopg2-binary>=2.9", "ibis-framework[postgres]>=9.0"]
|
|
32
|
+
causal = ["tigramite>=0.7", "dowhy>=0.11", "causal-learn>=0.1"]
|
|
33
|
+
forecast = ["prophet>=1.1", "stumpy>=1.4"]
|
|
34
|
+
deep = ["torch>=2.3", "pyod[deep]>=1.1"]
|
|
35
|
+
explain = ["shap>=0.45", "pgmpy>=0.1"]
|
|
36
|
+
files = ["openpyxl>=3.0", "pyarrow>=14.0"]
|
|
37
|
+
reports = ["matplotlib>=3.8"]
|
|
38
|
+
|
|
39
|
+
[build-system]
|
|
40
|
+
requires = ["hatchling"]
|
|
41
|
+
build-backend = "hatchling.build"
|
|
42
|
+
|
|
43
|
+
[tool.hatch.build.targets.wheel]
|
|
44
|
+
packages = ["src/dqt"]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# packages/dqt/src/dqt/__init__.py
|
|
2
|
+
"""dqt — open-source data quality, observability, and causality library."""
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
__version__ = "0.1.0"
|
|
6
|
+
|
|
7
|
+
from dqt.algorithms._base import (
|
|
8
|
+
BaseAggregateDetector,
|
|
9
|
+
BaseDetector,
|
|
10
|
+
DetectorResult,
|
|
11
|
+
Verdict,
|
|
12
|
+
compute_verdict,
|
|
13
|
+
)
|
|
14
|
+
from dqt.adapters._protocol import AggExpr, ColumnMeta, HealthCheckResult, WarehouseAdapter
|
|
15
|
+
from dqt.store._protocol import Incident, ResultsStore, RunResult
|
|
16
|
+
from dqt.store.memory import MemoryStore
|
|
17
|
+
from dqt.checks.models import BaselineConfig, Check, CheckFilter, CheckScope
|
|
18
|
+
from dqt.runner.runner import Runner
|
|
19
|
+
|
|
20
|
+
# Import all detector groups to trigger @registry.register side effects
|
|
21
|
+
import dqt.algorithms.basic # noqa: F401
|
|
22
|
+
import dqt.algorithms.schema # noqa: F401
|
|
23
|
+
import dqt.algorithms.referential # noqa: F401
|
|
24
|
+
import dqt.algorithms.drift # noqa: F401
|
|
25
|
+
import dqt.algorithms.outliers_uni # noqa: F401
|
|
26
|
+
import dqt.algorithms.outliers_multi # noqa: F401
|
|
27
|
+
import dqt.algorithms.timeseries # noqa: F401
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
"__version__",
|
|
31
|
+
"Verdict",
|
|
32
|
+
"DetectorResult",
|
|
33
|
+
"BaseDetector",
|
|
34
|
+
"BaseAggregateDetector",
|
|
35
|
+
"compute_verdict",
|
|
36
|
+
"AggExpr",
|
|
37
|
+
"ColumnMeta",
|
|
38
|
+
"HealthCheckResult",
|
|
39
|
+
"WarehouseAdapter",
|
|
40
|
+
"ResultsStore",
|
|
41
|
+
"RunResult",
|
|
42
|
+
"Incident",
|
|
43
|
+
"MemoryStore",
|
|
44
|
+
"Check",
|
|
45
|
+
"CheckScope",
|
|
46
|
+
"CheckFilter",
|
|
47
|
+
"BaselineConfig",
|
|
48
|
+
"Runner",
|
|
49
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Literal, Protocol, runtime_checkable
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class AggExpr:
    """A named SQL aggregate expression, rendered by adapters as ``<sql> AS <name>``."""

    # Output alias; adapters key the aggregate() result dict on this.
    name: str
    # Raw SQL expression text, e.g. "COUNT(*)" or "AVG(amount)".
    sql: str
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class HealthCheckStep:
    """Outcome of one probe performed during an adapter health check."""

    # Probe identifier, e.g. "tcp_reach", "parseable", "row_count".
    name: str
    # "pass"/"fail" for executed probes; "skip" when an earlier probe failed.
    status: Literal["pass", "fail", "skip"]
    # Wall-clock probe duration in milliseconds (adapters record 0.0 on fail/skip).
    latency_ms: float
    # Human-readable context: error text, row counts, user name, etc.
    detail: str
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class HealthCheckResult:
    """Aggregated outcome of an adapter health check."""

    # Ordered probe results; empty by default (and an empty check counts as passed).
    steps: list[HealthCheckStep] = field(default_factory=list)

    @property
    def passed(self) -> bool:
        """True when every step is "pass" or "skip" — i.e. nothing failed."""
        return all(step.status in {"pass", "skip"} for step in self.steps)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class ColumnMeta:
    """Metadata describing a single column of a warehouse table."""

    # Column name as reported by the source.
    name: str
    # Source-native type string (information_schema data_type, or pandas dtype for files).
    data_type: str
    # Whether NULLs may occur (for files: whether any NaN was observed).
    nullable: bool
    # 1-based ordinal position within the table.
    position: int
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@runtime_checkable
class WarehouseAdapter(Protocol):
    """Structural interface every data-source adapter must satisfy.

    Being @runtime_checkable, implementations (e.g. a local-file or Postgres
    adapter) can be validated with isinstance() without inheriting from this
    protocol.
    """

    def health_check(self) -> HealthCheckResult: ...

    def sample(self, schema: str, table: str, n: int = 100_000) -> pd.DataFrame: ...

    def aggregate(self, schema: str, table: str, exprs: list[AggExpr]) -> dict[str, object]: ...

    def describe_columns(self, schema: str, table: str) -> list[ColumnMeta]: ...

    def list_schemas(self) -> list[str]: ...

    def list_tables(self, schema: str) -> list[str]: ...
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# Ref: https://duckdb.org/docs/api/python/overview — used for SQL aggregations on DataFrames
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import pathlib
|
|
5
|
+
import time
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from dqt.adapters._protocol import (
|
|
11
|
+
AggExpr,
|
|
12
|
+
ColumnMeta,
|
|
13
|
+
HealthCheckResult,
|
|
14
|
+
HealthCheckStep,
|
|
15
|
+
)
|
|
16
|
+
from dqt.utils.logging import get_logger
|
|
17
|
+
|
|
18
|
+
_log = get_logger(__name__)
|
|
19
|
+
|
|
20
|
+
_READERS: dict[str, Any] = {
|
|
21
|
+
".csv": lambda p: pd.read_csv(p),
|
|
22
|
+
".tsv": lambda p: pd.read_csv(p, sep="\t"),
|
|
23
|
+
".xlsx": lambda p: pd.read_excel(p),
|
|
24
|
+
".xls": lambda p: pd.read_excel(p),
|
|
25
|
+
".parquet": lambda p: pd.read_parquet(p),
|
|
26
|
+
".json": lambda p: pd.read_json(p),
|
|
27
|
+
".jsonl": lambda p: pd.read_json(p, lines=True),
|
|
28
|
+
".ndjson": lambda p: pd.read_json(p, lines=True),
|
|
29
|
+
".feather": lambda p: pd.read_feather(p),
|
|
30
|
+
".arrow": lambda p: pd.read_feather(p),
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
_HEALTH_STEPS = ("readable", "parseable", "columns", "sample_read", "row_count")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class LocalFileAdapter:
    """Reads a local file and exposes it as a single-table WarehouseAdapter.

    The file's stem becomes the table name and the single pseudo-schema is
    "default". Every operation re-reads the file, so results always reflect
    the current on-disk contents.
    """

    def __init__(self, path: str | pathlib.Path) -> None:
        self._path = pathlib.Path(path)
        self._suffix = self._path.suffix.lower()
        if self._suffix not in _READERS:
            supported = ", ".join(sorted(_READERS))
            raise ValueError(f"Unsupported format '{self._suffix}'. Supported: {supported}")
        self._table_name = self._path.stem

    def _read(self) -> pd.DataFrame:
        """Load the whole file into a DataFrame using the suffix-matched reader."""
        return _READERS[self._suffix](self._path)

    def health_check(self) -> HealthCheckResult:
        """Probe existence, readability, and parseability, short-circuiting on failure."""
        steps: list[HealthCheckStep] = []

        t0 = time.perf_counter()
        if not self._path.exists():
            steps.append(HealthCheckStep("file_exists", "fail", 0.0, f"not found: {self._path}"))
            steps.extend(HealthCheckStep(name, "skip", 0.0, "skipped") for name in _HEALTH_STEPS)
            return HealthCheckResult(steps=steps)
        steps.append(HealthCheckStep("file_exists", "pass", (time.perf_counter() - t0) * 1000, str(self._path)))

        t0 = time.perf_counter()
        try:
            # Read only the first KiB to prove readability; slurping the whole
            # file with read_bytes() just to slice 1024 bytes is wasteful on
            # large files.
            with self._path.open("rb") as fh:
                fh.read(1024)
            steps.append(HealthCheckStep("readable", "pass", (time.perf_counter() - t0) * 1000, "ok"))
        except Exception as exc:
            steps.append(HealthCheckStep("readable", "fail", 0.0, str(exc)))
            steps.extend(
                HealthCheckStep(name, "skip", 0.0, "skipped")
                for name in ("parseable", "columns", "sample_read", "row_count")
            )
            return HealthCheckResult(steps=steps)

        t0 = time.perf_counter()
        try:
            df = self._read()
        except Exception as exc:
            steps.append(HealthCheckStep("parseable", "fail", 0.0, str(exc)))
            steps.extend(
                HealthCheckStep(name, "skip", 0.0, "skipped")
                for name in ("columns", "sample_read", "row_count")
            )
            return HealthCheckResult(steps=steps)
        steps.append(HealthCheckStep("parseable", "pass", (time.perf_counter() - t0) * 1000, f"{len(df.columns)} columns"))

        # Once the file parses, the remaining probes are trivially satisfied.
        steps.append(HealthCheckStep("columns", "pass", 0.0, str(list(df.columns)[:5])))
        steps.append(HealthCheckStep("sample_read", "pass", 0.0, "ok"))
        steps.append(HealthCheckStep("row_count", "pass", 0.0, f"{len(df)} rows"))
        return HealthCheckResult(steps=steps)

    def list_schemas(self) -> list[str]:
        """A local file exposes exactly one pseudo-schema."""
        return ["default"]

    def list_tables(self, schema: str) -> list[str]:
        """The single table is named after the file stem; *schema* is ignored."""
        return [self._table_name]

    def describe_columns(self, schema: str, table: str) -> list[ColumnMeta]:
        """Describe columns from pandas dtypes; "nullable" means a NaN was observed."""
        df = self._read()
        return [
            ColumnMeta(
                name=col,
                data_type=str(df[col].dtype),
                nullable=bool(df[col].isna().any()),
                position=i + 1,  # 1-based, mirroring information_schema.ordinal_position
            )
            for i, col in enumerate(df.columns)
        ]

    def sample(self, schema: str, table: str, n: int = 100_000) -> pd.DataFrame:
        """Return up to *n* rows; sampling is seeded so repeated runs agree."""
        df = self._read()
        if len(df) <= n:
            return df.reset_index(drop=True)
        return df.sample(n=n, random_state=42).reset_index(drop=True)

    def aggregate(self, schema: str, table: str, exprs: list[AggExpr]) -> dict[str, Any]:
        """Evaluate SQL aggregate expressions over the file via an in-memory DuckDB."""
        import duckdb

        df = self._read()
        con = duckdb.connect()
        try:
            con.register("_data", df)
            cols = ", ".join(f"{e.sql} AS {e.name}" for e in exprs)
            row = con.execute(f"SELECT {cols} FROM _data").fetchone()  # noqa: S608
        finally:
            # The original leaked the connection when execution raised.
            con.close()
        return dict(zip([e.name for e in exprs], row))
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# PostgresAdapter wraps SQLAlchemy for all warehouse operations.
|
|
2
|
+
# Sampling uses LIMIT for portable random rows; TABLESAMPLE BERNOULLI available as an option.
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import datetime
|
|
6
|
+
import time
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
import sqlalchemy as sa
|
|
11
|
+
|
|
12
|
+
from dqt.adapters._protocol import (
|
|
13
|
+
AggExpr,
|
|
14
|
+
ColumnMeta,
|
|
15
|
+
HealthCheckResult,
|
|
16
|
+
HealthCheckStep,
|
|
17
|
+
)
|
|
18
|
+
from dqt.utils.logging import get_logger
|
|
19
|
+
|
|
20
|
+
_log = get_logger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class PostgresAdapter:
    """WarehouseAdapter implementation for PostgreSQL via SQLAlchemy.

    Each operation opens a fresh connection from an engine with pool
    pre-ping enabled, so stale pooled connections are recycled transparently.
    """

    def __init__(self, conn_str: str) -> None:
        self._conn_str = conn_str
        self._engine = sa.create_engine(
            conn_str,
            pool_pre_ping=True,
            execution_options={"isolation_level": "READ COMMITTED"},
        )

    def health_check(self) -> HealthCheckResult:
        """Run connectivity probes; if TCP reachability fails, the rest are skipped."""
        steps: list[HealthCheckStep] = [self._step_tcp()]
        if steps[-1].status == "fail":
            steps.extend(
                HealthCheckStep(name=name, status="skip", latency_ms=0.0, detail="skipped")
                for name in ("auth", "info_schema", "sample_select", "latency_probe", "clock_skew")
            )
            return HealthCheckResult(steps=steps)
        steps.append(self._step_auth())
        steps.append(self._step_info_schema())
        steps.append(self._step_sample_select())
        steps.append(self._step_latency())
        steps.append(self._step_clock_skew())
        return HealthCheckResult(steps=steps)

    def _probe(self, name: str, query_fn) -> HealthCheckStep:
        """Shared probe runner: time *query_fn* against a fresh connection.

        *query_fn* receives the open connection and returns the detail string;
        any exception becomes a "fail" step. Factored out because the original
        probe methods repeated this try/except/timing boilerplate verbatim.
        """
        t0 = time.perf_counter()
        try:
            with self._engine.connect() as conn:
                detail = query_fn(conn)
            return HealthCheckStep(name, "pass", (time.perf_counter() - t0) * 1000, detail)
        except Exception as exc:
            return HealthCheckStep(name, "fail", 0.0, str(exc))

    def _step_tcp(self) -> HealthCheckStep:
        def run(conn):
            conn.execute(sa.text("SELECT 1"))
            return "ok"

        return self._probe("tcp_reach", run)

    def _step_auth(self) -> HealthCheckStep:
        def run(conn):
            user = conn.execute(sa.text("SELECT current_user")).scalar()
            return f"user={user}"

        return self._probe("auth", run)

    def _step_info_schema(self) -> HealthCheckStep:
        def run(conn):
            conn.execute(sa.text(
                "SELECT COUNT(*) FROM information_schema.tables "
                "WHERE table_schema NOT IN ('pg_catalog','information_schema')"
            )).scalar()
            return "readable"

        return self._probe("info_schema", run)

    def _step_sample_select(self) -> HealthCheckStep:
        def run(conn):
            conn.execute(sa.text(
                "SELECT table_name FROM information_schema.tables "
                "WHERE table_schema NOT IN ('pg_catalog','information_schema') LIMIT 1"
            )).fetchone()
            return "ok"

        return self._probe("sample_select", run)

    def _step_latency(self) -> HealthCheckStep:
        # Not routed through _probe: the measured latency is itself the detail string.
        t0 = time.perf_counter()
        try:
            with self._engine.connect() as conn:
                conn.execute(sa.text("SELECT 1"))
            latency = (time.perf_counter() - t0) * 1000
            return HealthCheckStep("latency_probe", "pass", latency, f"{latency:.1f}ms")
        except Exception as exc:
            return HealthCheckStep("latency_probe", "fail", 0.0, str(exc))

    def _step_clock_skew(self) -> HealthCheckStep:
        # Not routed through _probe: a large skew must become "fail" without an exception.
        t0 = time.perf_counter()
        try:
            with self._engine.connect() as conn:
                db_now = conn.execute(sa.text("SELECT NOW()")).scalar()
                local_now = datetime.datetime.now(datetime.timezone.utc)
            if db_now.tzinfo is None:
                # assumes a naive NOW() is UTC — TODO confirm against server timezone
                db_now = db_now.replace(tzinfo=datetime.timezone.utc)
            skew_s = abs((db_now - local_now).total_seconds())
            status = "pass" if skew_s < 60 else "fail"
            return HealthCheckStep("clock_skew", status, (time.perf_counter() - t0) * 1000, f"skew={skew_s:.1f}s")
        except Exception as exc:
            return HealthCheckStep("clock_skew", "fail", 0.0, str(exc))

    def list_schemas(self) -> list[str]:
        """List user schemas, excluding Postgres system schemas."""
        with self._engine.connect() as conn:
            rows = conn.execute(sa.text(
                "SELECT DISTINCT table_schema FROM information_schema.tables "
                "WHERE table_schema NOT IN ('pg_catalog','information_schema') ORDER BY 1"
            )).fetchall()
        return [r[0] for r in rows]

    def list_tables(self, schema: str) -> list[str]:
        """List table names in *schema*, alphabetically."""
        with self._engine.connect() as conn:
            rows = conn.execute(sa.text(
                "SELECT table_name FROM information_schema.tables "
                "WHERE table_schema = :schema ORDER BY 1"
            ), {"schema": schema}).fetchall()
        return [r[0] for r in rows]

    def describe_columns(self, schema: str, table: str) -> list[ColumnMeta]:
        """Describe columns from information_schema, in ordinal order."""
        with self._engine.connect() as conn:
            rows = conn.execute(sa.text(
                "SELECT column_name, data_type, is_nullable, ordinal_position "
                "FROM information_schema.columns "
                "WHERE table_schema = :schema AND table_name = :table "
                "ORDER BY ordinal_position"
            ), {"schema": schema, "table": table}).fetchall()
        return [
            ColumnMeta(name=r[0], data_type=r[1], nullable=(r[2] == "YES"), position=r[3])
            for r in rows
        ]

    def sample(self, schema: str, table: str, n: int = 100_000) -> pd.DataFrame:
        """Return up to *n* random rows from *schema*.*table*.

        Uses ORDER BY random() to get a genuine random sample without
        TABLESAMPLE bias on small tables. schema/table are double-quoted
        identifiers, not user values in SQL context.
        """
        query = sa.text(f'SELECT * FROM "{schema}"."{table}" ORDER BY random() LIMIT :n')
        with self._engine.connect() as conn:
            return pd.read_sql(query, conn, params={"n": n})

    def aggregate(self, schema: str, table: str, exprs: list[AggExpr]) -> dict[str, Any]:
        """Evaluate the given aggregate expressions in one SELECT over the table."""
        cols = ", ".join(f"{e.sql} AS {e.name}" for e in exprs)
        # schema/table are double-quoted identifiers; cols are built from AggExpr.sql (caller-controlled).
        query = sa.text(f'SELECT {cols} FROM "{schema}"."{table}"')
        with self._engine.connect() as conn:
            row = conn.execute(query).fetchone()
        return dict(zip([e.name for e in exprs], row))
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
@dataclass
class PostgresConfig:
    """Connection settings for a Postgres adapter, rendered as a SQLAlchemy URL."""

    host: str = "localhost"
    port: int = 5432
    database: str = "postgres"
    username: str = "postgres"
    password: str = ""
    ssl_mode: str = "prefer"  # forwarded as libpq's sslmode query parameter

    def to_conn_str(self) -> str:
        """Build a postgresql+psycopg2 connection URL.

        Username and password are percent-encoded so credentials containing
        URL-reserved characters (e.g. '@', ':', '/') do not corrupt the URL —
        the original interpolated them verbatim.
        """
        from urllib.parse import quote_plus

        user = quote_plus(self.username)
        pwd = quote_plus(self.password)
        return (
            f"postgresql+psycopg2://{user}:{pwd}"
            f"@{self.host}:{self.port}/{self.database}?sslmode={self.ssl_mode}"
        )
|
File without changes
|
|
File without changes
|