dqtlib 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. dqtlib-0.1.0/.gitignore +128 -0
  2. dqtlib-0.1.0/PKG-INFO +64 -0
  3. dqtlib-0.1.0/README.md +17 -0
  4. dqtlib-0.1.0/pyproject.toml +44 -0
  5. dqtlib-0.1.0/src/dqt/__init__.py +49 -0
  6. dqtlib-0.1.0/src/dqt/adapters/__init__.py +0 -0
  7. dqtlib-0.1.0/src/dqt/adapters/_protocol.py +47 -0
  8. dqtlib-0.1.0/src/dqt/adapters/local/__init__.py +3 -0
  9. dqtlib-0.1.0/src/dqt/adapters/local/adapter.py +118 -0
  10. dqtlib-0.1.0/src/dqt/adapters/postgres/__init__.py +4 -0
  11. dqtlib-0.1.0/src/dqt/adapters/postgres/adapter.py +154 -0
  12. dqtlib-0.1.0/src/dqt/adapters/postgres/config.py +17 -0
  13. dqtlib-0.1.0/src/dqt/agent/__init__.py +0 -0
  14. dqtlib-0.1.0/src/dqt/algorithms/__init__.py +0 -0
  15. dqtlib-0.1.0/src/dqt/algorithms/_base.py +70 -0
  16. dqtlib-0.1.0/src/dqt/algorithms/_registry.py +28 -0
  17. dqtlib-0.1.0/src/dqt/algorithms/_scales.py +62 -0
  18. dqtlib-0.1.0/src/dqt/algorithms/basic/__init__.py +35 -0
  19. dqtlib-0.1.0/src/dqt/algorithms/basic/_helpers.py +18 -0
  20. dqtlib-0.1.0/src/dqt/algorithms/basic/column_pairs.py +80 -0
  21. dqtlib-0.1.0/src/dqt/algorithms/basic/completeness.py +40 -0
  22. dqtlib-0.1.0/src/dqt/algorithms/basic/date_part.py +52 -0
  23. dqtlib-0.1.0/src/dqt/algorithms/basic/freshness.py +53 -0
  24. dqtlib-0.1.0/src/dqt/algorithms/basic/monotonicity.py +42 -0
  25. dqtlib-0.1.0/src/dqt/algorithms/basic/null_fraction.py +37 -0
  26. dqtlib-0.1.0/src/dqt/algorithms/basic/numeric.py +42 -0
  27. dqtlib-0.1.0/src/dqt/algorithms/basic/numeric_bounds.py +155 -0
  28. dqtlib-0.1.0/src/dqt/algorithms/basic/sql_assertion.py +40 -0
  29. dqtlib-0.1.0/src/dqt/algorithms/basic/string_case.py +51 -0
  30. dqtlib-0.1.0/src/dqt/algorithms/basic/uniqueness.py +40 -0
  31. dqtlib-0.1.0/src/dqt/algorithms/basic/validity.py +46 -0
  32. dqtlib-0.1.0/src/dqt/algorithms/basic/value_checks.py +174 -0
  33. dqtlib-0.1.0/src/dqt/algorithms/basic/volume.py +35 -0
  34. dqtlib-0.1.0/src/dqt/algorithms/distribution/__init__.py +7 -0
  35. dqtlib-0.1.0/src/dqt/algorithms/distribution/profiler.py +127 -0
  36. dqtlib-0.1.0/src/dqt/algorithms/drift/__init__.py +3 -0
  37. dqtlib-0.1.0/src/dqt/algorithms/drift/ks2sample.py +40 -0
  38. dqtlib-0.1.0/src/dqt/algorithms/outliers_multi/__init__.py +3 -0
  39. dqtlib-0.1.0/src/dqt/algorithms/outliers_multi/isolation_forest.py +41 -0
  40. dqtlib-0.1.0/src/dqt/algorithms/outliers_uni/__init__.py +14 -0
  41. dqtlib-0.1.0/src/dqt/algorithms/outliers_uni/adjusted_boxplot.py +59 -0
  42. dqtlib-0.1.0/src/dqt/algorithms/outliers_uni/auto_outlier.py +110 -0
  43. dqtlib-0.1.0/src/dqt/algorithms/outliers_uni/mad.py +84 -0
  44. dqtlib-0.1.0/src/dqt/algorithms/outliers_uni/outlier_fraction_range.py +116 -0
  45. dqtlib-0.1.0/src/dqt/algorithms/outliers_uni/zscore.py +35 -0
  46. dqtlib-0.1.0/src/dqt/algorithms/referential/__init__.py +2 -0
  47. dqtlib-0.1.0/src/dqt/algorithms/referential/referential.py +52 -0
  48. dqtlib-0.1.0/src/dqt/algorithms/schema/__init__.py +2 -0
  49. dqtlib-0.1.0/src/dqt/algorithms/schema/schema_checks.py +56 -0
  50. dqtlib-0.1.0/src/dqt/algorithms/timeseries/__init__.py +3 -0
  51. dqtlib-0.1.0/src/dqt/algorithms/timeseries/stl.py +51 -0
  52. dqtlib-0.1.0/src/dqt/causality/__init__.py +0 -0
  53. dqtlib-0.1.0/src/dqt/checks/__init__.py +0 -0
  54. dqtlib-0.1.0/src/dqt/checks/loader.py +70 -0
  55. dqtlib-0.1.0/src/dqt/checks/models.py +42 -0
  56. dqtlib-0.1.0/src/dqt/checks/schema/__init__.py +0 -0
  57. dqtlib-0.1.0/src/dqt/checks/schema/check.schema.json +74 -0
  58. dqtlib-0.1.0/src/dqt/compat/__init__.py +0 -0
  59. dqtlib-0.1.0/src/dqt/governance/__init__.py +0 -0
  60. dqtlib-0.1.0/src/dqt/hitl/__init__.py +0 -0
  61. dqtlib-0.1.0/src/dqt/lineage/__init__.py +4 -0
  62. dqtlib-0.1.0/src/dqt/lineage/models.py +42 -0
  63. dqtlib-0.1.0/src/dqt/lineage/vault.py +273 -0
  64. dqtlib-0.1.0/src/dqt/profiling/__init__.py +12 -0
  65. dqtlib-0.1.0/src/dqt/profiling/models.py +79 -0
  66. dqtlib-0.1.0/src/dqt/profiling/profiler.py +236 -0
  67. dqtlib-0.1.0/src/dqt/reporting/__init__.py +5 -0
  68. dqtlib-0.1.0/src/dqt/reporting/_charts.py +173 -0
  69. dqtlib-0.1.0/src/dqt/reporting/html_report.py +480 -0
  70. dqtlib-0.1.0/src/dqt/runner/__init__.py +0 -0
  71. dqtlib-0.1.0/src/dqt/runner/runner.py +121 -0
  72. dqtlib-0.1.0/src/dqt/semantic/__init__.py +9 -0
  73. dqtlib-0.1.0/src/dqt/semantic/loader.py +10 -0
  74. dqtlib-0.1.0/src/dqt/semantic/models.py +35 -0
  75. dqtlib-0.1.0/src/dqt/store/__init__.py +0 -0
  76. dqtlib-0.1.0/src/dqt/store/_protocol.py +43 -0
  77. dqtlib-0.1.0/src/dqt/store/memory.py +27 -0
  78. dqtlib-0.1.0/src/dqt/utils/__init__.py +0 -0
  79. dqtlib-0.1.0/src/dqt/utils/logging.py +26 -0
  80. dqtlib-0.1.0/tests/__init__.py +0 -0
  81. dqtlib-0.1.0/tests/adapters/__init__.py +0 -0
  82. dqtlib-0.1.0/tests/adapters/test_local_adapter.py +141 -0
  83. dqtlib-0.1.0/tests/adapters/test_postgres_adapter.py +77 -0
  84. dqtlib-0.1.0/tests/algorithms/__init__.py +0 -0
  85. dqtlib-0.1.0/tests/algorithms/basic/__init__.py +0 -0
  86. dqtlib-0.1.0/tests/algorithms/basic/test_column_pairs.py +86 -0
  87. dqtlib-0.1.0/tests/algorithms/basic/test_completeness.py +77 -0
  88. dqtlib-0.1.0/tests/algorithms/basic/test_date_part.py +33 -0
  89. dqtlib-0.1.0/tests/algorithms/basic/test_freshness.py +40 -0
  90. dqtlib-0.1.0/tests/algorithms/basic/test_monotonicity.py +70 -0
  91. dqtlib-0.1.0/tests/algorithms/basic/test_null_fraction.py +34 -0
  92. dqtlib-0.1.0/tests/algorithms/basic/test_numeric.py +68 -0
  93. dqtlib-0.1.0/tests/algorithms/basic/test_numeric_bounds.py +124 -0
  94. dqtlib-0.1.0/tests/algorithms/basic/test_sql_assertion.py +27 -0
  95. dqtlib-0.1.0/tests/algorithms/basic/test_string_case.py +34 -0
  96. dqtlib-0.1.0/tests/algorithms/basic/test_uniqueness.py +62 -0
  97. dqtlib-0.1.0/tests/algorithms/basic/test_validity.py +70 -0
  98. dqtlib-0.1.0/tests/algorithms/basic/test_value_checks.py +138 -0
  99. dqtlib-0.1.0/tests/algorithms/basic/test_volume.py +67 -0
  100. dqtlib-0.1.0/tests/algorithms/distribution/__init__.py +0 -0
  101. dqtlib-0.1.0/tests/algorithms/distribution/test_profiler.py +57 -0
  102. dqtlib-0.1.0/tests/algorithms/drift/__init__.py +0 -0
  103. dqtlib-0.1.0/tests/algorithms/drift/test_ks2sample.py +75 -0
  104. dqtlib-0.1.0/tests/algorithms/outliers_multi/__init__.py +0 -0
  105. dqtlib-0.1.0/tests/algorithms/outliers_multi/test_isolation_forest.py +54 -0
  106. dqtlib-0.1.0/tests/algorithms/outliers_uni/__init__.py +0 -0
  107. dqtlib-0.1.0/tests/algorithms/outliers_uni/test_adjusted_boxplot.py +70 -0
  108. dqtlib-0.1.0/tests/algorithms/outliers_uni/test_auto_outlier.py +52 -0
  109. dqtlib-0.1.0/tests/algorithms/outliers_uni/test_mad.py +134 -0
  110. dqtlib-0.1.0/tests/algorithms/outliers_uni/test_zscore.py +70 -0
  111. dqtlib-0.1.0/tests/algorithms/referential/__init__.py +0 -0
  112. dqtlib-0.1.0/tests/algorithms/referential/test_referential.py +72 -0
  113. dqtlib-0.1.0/tests/algorithms/schema/__init__.py +0 -0
  114. dqtlib-0.1.0/tests/algorithms/schema/test_schema_checks.py +74 -0
  115. dqtlib-0.1.0/tests/algorithms/test_outlier_fraction_range.py +83 -0
  116. dqtlib-0.1.0/tests/algorithms/test_registry.py +87 -0
  117. dqtlib-0.1.0/tests/algorithms/timeseries/__init__.py +0 -0
  118. dqtlib-0.1.0/tests/algorithms/timeseries/test_stl.py +87 -0
  119. dqtlib-0.1.0/tests/checks/__init__.py +0 -0
  120. dqtlib-0.1.0/tests/checks/test_loader.py +135 -0
  121. dqtlib-0.1.0/tests/conftest.py +35 -0
  122. dqtlib-0.1.0/tests/profiling/__init__.py +0 -0
  123. dqtlib-0.1.0/tests/profiling/test_profiler.py +323 -0
  124. dqtlib-0.1.0/tests/reporting/__init__.py +0 -0
  125. dqtlib-0.1.0/tests/reporting/test_html_report.py +45 -0
  126. dqtlib-0.1.0/tests/runner/__init__.py +0 -0
  127. dqtlib-0.1.0/tests/runner/test_runner.py +191 -0
  128. dqtlib-0.1.0/tests/store/__init__.py +0 -0
  129. dqtlib-0.1.0/tests/store/test_memory_store.py +95 -0
  130. dqtlib-0.1.0/tests/test_core_types.py +112 -0
  131. dqtlib-0.1.0/tests/test_public_api.py +61 -0
  132. dqtlib-0.1.0/tests/test_vault.py +41 -0
@@ -0,0 +1,128 @@
1
+ # --- Python ---
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ share/python-wheels/
20
+ *.egg-info/
21
+ .installed.cfg
22
+ *.egg
23
+ MANIFEST
24
+
25
+ # uv
26
+ .venv/
27
+ .uv-cache/
28
+
29
+ # pip
30
+ pip-log.txt
31
+ pip-delete-this-directory.txt
32
+
33
+ # pytest
34
+ .pytest_cache/
35
+ .cache/
36
+ .coverage
37
+ .coverage.*
38
+ htmlcov/
39
+ .tox/
40
+ .nox/
41
+ coverage.xml
42
+ *.cover
43
+ *.py,cover
44
+ .hypothesis/
45
+
46
+ # mypy / ruff
47
+ .mypy_cache/
48
+ .ruff_cache/
49
+ .dmypy.json
50
+ dmypy.json
51
+
52
+ # --- Node / Next.js ---
53
+ node_modules/
54
+ .next/
55
+ .turbo/
56
+ .swc/
57
+ out/
58
+ dist/
59
+ build/
60
+ .npm/
61
+ .pnpm-store/
62
+ pnpm-debug.log*
63
+ yarn-debug.log*
64
+ npm-debug.log*
65
+
66
+ # --- Generated artifacts (NEVER commit) ---
67
+ packages/dqt-types/
68
+ shared/generated/
69
+ apps/web/src/generated/
70
+ apps/web/src/lib/stats.generated.ts
71
+ apps/web/src/components/connections/engines.generated.ts
72
+ packages/dqt/src/dqt/generated/
73
+ apps/server/openapi.json
74
+
75
+ # --- Env / secrets ---
76
+ .env
77
+ .env.local
78
+ .env.*.local
79
+ *.pem
80
+ *.key
81
+ *.p12
82
+ *.crt
83
+ service-account*.json
84
+
85
+ # --- OS / editor ---
86
+ .DS_Store
87
+ Thumbs.db
88
+ .idea/
89
+ .vscode/
90
+ !.vscode/settings.json.example
91
+ *.swp
92
+ *.swo
93
+ *~
94
+ .history/
95
+
96
+ # --- Local dev ---
97
+ tmp/
98
+ *.log
99
+ logs/
100
+
101
+ # --- Generated demo reports ---
102
+ examples/*/reports/
103
+ apps/web/tsconfig.tsbuildinfo
104
+
105
+ # --- Unrelated legacy code ---
106
+ dql/
107
+ .cache/
108
+
109
+ # --- Docker / DB volumes ---
110
+ run_local/data/
111
+ run_local/postgres-data/
112
+ run_local/redis-data/
113
+
114
+ # --- Test artifacts ---
115
+ playwright-report/
116
+ test-results/
117
+ e2e-screenshots/
118
+ *.pyc
119
+ .benchmarks/
120
+
121
+ # --- Built docs ---
122
+ docs/_build/
123
+ site/
124
+
125
+ # --- Archives (created by build scripts) ---
126
+ *.zip
127
+ *.tar.gz
128
+ !reference_data/**/*.zip
dqtlib-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,64 @@
1
+ Metadata-Version: 2.4
2
+ Name: dqtlib
3
+ Version: 0.1.0
4
+ Summary: Data quality, observability, semantic, and causality library
5
+ Project-URL: Homepage, https://github.com/antonbarr-data/dqt
6
+ Project-URL: Repository, https://github.com/antonbarr-data/dqt
7
+ Author-email: Anton Barr <antonbar@gmail.com>
8
+ License: MIT
9
+ Keywords: anomaly-detection,causal-inference,causality,data-drift,data-quality,observability
10
+ Requires-Python: >=3.12
11
+ Requires-Dist: diptest>=0.6
12
+ Requires-Dist: duckdb>=0.9
13
+ Requires-Dist: ibis-framework>=9.0
14
+ Requires-Dist: jsonschema>=4.22
15
+ Requires-Dist: numpy>=1.26
16
+ Requires-Dist: pandas>=2.2
17
+ Requires-Dist: pydantic>=2.7
18
+ Requires-Dist: pyod>=1.1
19
+ Requires-Dist: pyyaml>=6.0
20
+ Requires-Dist: river>=0.21
21
+ Requires-Dist: scikit-learn>=1.5
22
+ Requires-Dist: scipy>=1.13
23
+ Requires-Dist: statsmodels>=0.14
24
+ Requires-Dist: structlog>=24.0
25
+ Provides-Extra: causal
26
+ Requires-Dist: causal-learn>=0.1; extra == 'causal'
27
+ Requires-Dist: dowhy>=0.11; extra == 'causal'
28
+ Requires-Dist: tigramite>=0.7; extra == 'causal'
29
+ Provides-Extra: deep
30
+ Requires-Dist: pyod[deep]>=1.1; extra == 'deep'
31
+ Requires-Dist: torch>=2.3; extra == 'deep'
32
+ Provides-Extra: explain
33
+ Requires-Dist: pgmpy>=0.1; extra == 'explain'
34
+ Requires-Dist: shap>=0.45; extra == 'explain'
35
+ Provides-Extra: files
36
+ Requires-Dist: openpyxl>=3.0; extra == 'files'
37
+ Requires-Dist: pyarrow>=14.0; extra == 'files'
38
+ Provides-Extra: forecast
39
+ Requires-Dist: prophet>=1.1; extra == 'forecast'
40
+ Requires-Dist: stumpy>=1.4; extra == 'forecast'
41
+ Provides-Extra: postgres
42
+ Requires-Dist: ibis-framework[postgres]>=9.0; extra == 'postgres'
43
+ Requires-Dist: psycopg2-binary>=2.9; extra == 'postgres'
44
+ Provides-Extra: reports
45
+ Requires-Dist: matplotlib>=3.8; extra == 'reports'
46
+ Description-Content-Type: text/markdown
47
+
48
+ # dqtlib
49
+
50
+ **Open-source data quality, lineage, semantic layer & causality — for dbt, warehouses and data lakes.**
51
+
52
+ A pip-installable Python library that watches dbt-built warehouses — or any other SQL warehouse — for statistical drift, anomalies, and silent regressions, and explains *why* metrics moved.
53
+
54
+ ```bash
55
+ pip install dqtlib
56
+ ```
57
+
58
+ The import name is `dqt`:
59
+
60
+ ```python
61
+ from dqt import Check, Runner, MemoryStore
62
+ ```
63
+
64
+ Full documentation and examples: https://github.com/antonbarr-data/dqt
dqtlib-0.1.0/README.md ADDED
@@ -0,0 +1,17 @@
1
+ # dqtlib
2
+
3
+ **Open-source data quality, lineage, semantic layer & causality — for dbt, warehouses and data lakes.**
4
+
5
+ A pip-installable Python library that watches dbt-built warehouses — or any other SQL warehouse — for statistical drift, anomalies, and silent regressions, and explains *why* metrics moved.
6
+
7
+ ```bash
8
+ pip install dqtlib
9
+ ```
10
+
11
+ The import name is `dqt`:
12
+
13
+ ```python
14
+ from dqt import Check, Runner, MemoryStore
15
+ ```
16
+
17
+ Full documentation and examples: https://github.com/antonbarr-data/dqt
@@ -0,0 +1,44 @@
1
+ [project]
2
+ name = "dqtlib"
3
+ version = "0.1.0"
4
+ description = "Data quality, observability, semantic, and causality library"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ license = { text = "MIT" }
8
+ keywords = ["data-quality", "observability", "anomaly-detection", "causality", "data-drift", "causal-inference"]
9
+ authors = [
10
+ { name = "Anton Barr", email = "antonbar@gmail.com" },
11
+ ]
12
+ urls = { Homepage = "https://github.com/antonbarr-data/dqt", Repository = "https://github.com/antonbarr-data/dqt" }
13
+ dependencies = [
14
+ "numpy>=1.26",
15
+ "scipy>=1.13",
16
+ "pandas>=2.2",
17
+ "statsmodels>=0.14",
18
+ "scikit-learn>=1.5",
19
+ "pyod>=1.1",
20
+ "diptest>=0.6",
21
+ "river>=0.21",
22
+ "structlog>=24.0",
23
+ "pydantic>=2.7",
24
+ "ibis-framework>=9.0",
25
+ "duckdb>=0.9",
26
+ "jsonschema>=4.22",
27
+ "pyyaml>=6.0",
28
+ ]
29
+
30
+ [project.optional-dependencies]
31
+ postgres = ["psycopg2-binary>=2.9", "ibis-framework[postgres]>=9.0"]
32
+ causal = ["tigramite>=0.7", "dowhy>=0.11", "causal-learn>=0.1"]
33
+ forecast = ["prophet>=1.1", "stumpy>=1.4"]
34
+ deep = ["torch>=2.3", "pyod[deep]>=1.1"]
35
+ explain = ["shap>=0.45", "pgmpy>=0.1"]
36
+ files = ["openpyxl>=3.0", "pyarrow>=14.0"]
37
+ reports = ["matplotlib>=3.8"]
38
+
39
+ [build-system]
40
+ requires = ["hatchling"]
41
+ build-backend = "hatchling.build"
42
+
43
+ [tool.hatch.build.targets.wheel]
44
+ packages = ["src/dqt"]
@@ -0,0 +1,49 @@
1
# packages/dqt/src/dqt/__init__.py
"""dqt — open-source data quality, observability, and causality library."""
from __future__ import annotations

__version__ = "0.1.0"

# Core detector primitives (verdict enum, result container, base classes).
from dqt.algorithms._base import (
    BaseAggregateDetector,
    BaseDetector,
    DetectorResult,
    Verdict,
    compute_verdict,
)
# Warehouse-adapter protocol and its supporting value types.
from dqt.adapters._protocol import AggExpr, ColumnMeta, HealthCheckResult, WarehouseAdapter
# Results-store protocol plus the in-memory reference implementation.
from dqt.store._protocol import Incident, ResultsStore, RunResult
from dqt.store.memory import MemoryStore
# Check configuration models and the orchestrating runner.
from dqt.checks.models import BaselineConfig, Check, CheckFilter, CheckScope
from dqt.runner.runner import Runner

# Import all detector groups to trigger @registry.register side effects
# (each module registers its detectors on import; keep these imports even
# though the names are unused).
import dqt.algorithms.basic  # noqa: F401
import dqt.algorithms.schema  # noqa: F401
import dqt.algorithms.referential  # noqa: F401
import dqt.algorithms.drift  # noqa: F401
import dqt.algorithms.outliers_uni  # noqa: F401
import dqt.algorithms.outliers_multi  # noqa: F401
import dqt.algorithms.timeseries  # noqa: F401

# Public API surface re-exported at package level.
__all__ = [
    "__version__",
    "Verdict",
    "DetectorResult",
    "BaseDetector",
    "BaseAggregateDetector",
    "compute_verdict",
    "AggExpr",
    "ColumnMeta",
    "HealthCheckResult",
    "WarehouseAdapter",
    "ResultsStore",
    "RunResult",
    "Incident",
    "MemoryStore",
    "Check",
    "CheckScope",
    "CheckFilter",
    "BaselineConfig",
    "Runner",
]
File without changes
@@ -0,0 +1,47 @@
1
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Literal, Protocol, runtime_checkable

import pandas as pd


@dataclass
class AggExpr:
    """A named SQL aggregate expression evaluated by an adapter's ``aggregate``."""

    name: str  # key under which the adapter reports this aggregate's value
    sql: str  # SQL expression, e.g. "COUNT(*)" — executed verbatim by the adapter


@dataclass
class HealthCheckStep:
    """Outcome of a single probe within an adapter health check."""

    name: str  # probe identifier, e.g. "tcp_reach", "parseable"
    status: Literal["pass", "fail", "skip"]
    latency_ms: float  # wall-clock duration of the probe; 0.0 on fail/skip
    detail: str  # human-readable outcome or error text


@dataclass
class HealthCheckResult:
    """Ordered collection of health-check steps."""

    steps: list[HealthCheckStep] = field(default_factory=list)

    @property
    def passed(self) -> bool:
        """True when no step failed (skipped steps do not count as failures)."""
        return all(s.status in ("pass", "skip") for s in self.steps)


@dataclass
class ColumnMeta:
    """Column metadata as reported by an adapter's ``describe_columns``."""

    name: str
    data_type: str  # adapter-native type string (e.g. pandas dtype or SQL type)
    nullable: bool
    position: int  # 1-based ordinal position within the table


@runtime_checkable
class WarehouseAdapter(Protocol):
    """Structural interface every warehouse/file adapter must satisfy.

    runtime_checkable allows isinstance() checks against concrete adapters.
    """

    def health_check(self) -> HealthCheckResult: ...
    def sample(self, schema: str, table: str, n: int = 100_000) -> pd.DataFrame: ...
    def aggregate(self, schema: str, table: str, exprs: list[AggExpr]) -> dict[str, object]: ...
    def describe_columns(self, schema: str, table: str) -> list[ColumnMeta]: ...
    def list_schemas(self) -> list[str]: ...
    def list_tables(self, schema: str) -> list[str]: ...
@@ -0,0 +1,3 @@
1
+ from dqt.adapters.local.adapter import LocalFileAdapter
2
+
3
+ __all__ = ["LocalFileAdapter"]
@@ -0,0 +1,118 @@
1
+ # Ref: https://duckdb.org/docs/api/python/overview — used for SQL aggregations on DataFrames
2
+ from __future__ import annotations
3
+
4
+ import pathlib
5
+ import time
6
+ from typing import Any
7
+
8
+ import pandas as pd
9
+
10
+ from dqt.adapters._protocol import (
11
+ AggExpr,
12
+ ColumnMeta,
13
+ HealthCheckResult,
14
+ HealthCheckStep,
15
+ )
16
+ from dqt.utils.logging import get_logger
17
+
18
+ _log = get_logger(__name__)
19
+
20
+ _READERS: dict[str, Any] = {
21
+ ".csv": lambda p: pd.read_csv(p),
22
+ ".tsv": lambda p: pd.read_csv(p, sep="\t"),
23
+ ".xlsx": lambda p: pd.read_excel(p),
24
+ ".xls": lambda p: pd.read_excel(p),
25
+ ".parquet": lambda p: pd.read_parquet(p),
26
+ ".json": lambda p: pd.read_json(p),
27
+ ".jsonl": lambda p: pd.read_json(p, lines=True),
28
+ ".ndjson": lambda p: pd.read_json(p, lines=True),
29
+ ".feather": lambda p: pd.read_feather(p),
30
+ ".arrow": lambda p: pd.read_feather(p),
31
+ }
32
+
33
+ _HEALTH_STEPS = ("readable", "parseable", "columns", "sample_read", "row_count")
34
+
35
+
36
+ class LocalFileAdapter:
37
+ """Reads a local file and exposes it as a single-table WarehouseAdapter."""
38
+
39
+ def __init__(self, path: str | pathlib.Path) -> None:
40
+ self._path = pathlib.Path(path)
41
+ self._suffix = self._path.suffix.lower()
42
+ if self._suffix not in _READERS:
43
+ supported = ", ".join(sorted(_READERS))
44
+ raise ValueError(f"Unsupported format '{self._suffix}'. Supported: {supported}")
45
+ self._table_name = self._path.stem
46
+
47
+ def _read(self) -> pd.DataFrame:
48
+ return _READERS[self._suffix](self._path)
49
+
50
+ def health_check(self) -> HealthCheckResult:
51
+ steps: list[HealthCheckStep] = []
52
+
53
+ t0 = time.perf_counter()
54
+ if not self._path.exists():
55
+ steps.append(HealthCheckStep("file_exists", "fail", 0.0, f"not found: {self._path}"))
56
+ for name in _HEALTH_STEPS:
57
+ steps.append(HealthCheckStep(name, "skip", 0.0, "skipped"))
58
+ return HealthCheckResult(steps=steps)
59
+ steps.append(HealthCheckStep("file_exists", "pass", (time.perf_counter() - t0) * 1000, str(self._path)))
60
+
61
+ t0 = time.perf_counter()
62
+ try:
63
+ self._path.read_bytes()[:1024]
64
+ steps.append(HealthCheckStep("readable", "pass", (time.perf_counter() - t0) * 1000, "ok"))
65
+ except Exception as exc:
66
+ steps.append(HealthCheckStep("readable", "fail", 0.0, str(exc)))
67
+ for name in ("parseable", "columns", "sample_read", "row_count"):
68
+ steps.append(HealthCheckStep(name, "skip", 0.0, "skipped"))
69
+ return HealthCheckResult(steps=steps)
70
+
71
+ t0 = time.perf_counter()
72
+ try:
73
+ df = self._read()
74
+ except Exception as exc:
75
+ steps.append(HealthCheckStep("parseable", "fail", 0.0, str(exc)))
76
+ for name in ("columns", "sample_read", "row_count"):
77
+ steps.append(HealthCheckStep(name, "skip", 0.0, "skipped"))
78
+ return HealthCheckResult(steps=steps)
79
+ steps.append(HealthCheckStep("parseable", "pass", (time.perf_counter() - t0) * 1000, f"{len(df.columns)} columns"))
80
+
81
+ steps.append(HealthCheckStep("columns", "pass", 0.0, str(list(df.columns)[:5])))
82
+ steps.append(HealthCheckStep("sample_read", "pass", 0.0, "ok"))
83
+ steps.append(HealthCheckStep("row_count", "pass", 0.0, f"{len(df)} rows"))
84
+ return HealthCheckResult(steps=steps)
85
+
86
+ def list_schemas(self) -> list[str]:
87
+ return ["default"]
88
+
89
+ def list_tables(self, schema: str) -> list[str]:
90
+ return [self._table_name]
91
+
92
+ def describe_columns(self, schema: str, table: str) -> list[ColumnMeta]:
93
+ df = self._read()
94
+ return [
95
+ ColumnMeta(
96
+ name=col,
97
+ data_type=str(df[col].dtype),
98
+ nullable=bool(df[col].isna().any()),
99
+ position=i + 1,
100
+ )
101
+ for i, col in enumerate(df.columns)
102
+ ]
103
+
104
+ def sample(self, schema: str, table: str, n: int = 100_000) -> pd.DataFrame:
105
+ df = self._read()
106
+ if len(df) <= n:
107
+ return df.reset_index(drop=True)
108
+ return df.sample(n=n, random_state=42).reset_index(drop=True)
109
+
110
+ def aggregate(self, schema: str, table: str, exprs: list[AggExpr]) -> dict[str, Any]:
111
+ import duckdb
112
+ df = self._read()
113
+ con = duckdb.connect()
114
+ con.register("_data", df)
115
+ cols = ", ".join(f"{e.sql} AS {e.name}" for e in exprs)
116
+ row = con.execute(f"SELECT {cols} FROM _data").fetchone() # noqa: S608
117
+ con.close()
118
+ return dict(zip([e.name for e in exprs], row))
@@ -0,0 +1,4 @@
1
+ from dqt.adapters.postgres.adapter import PostgresAdapter
2
+ from dqt.adapters.postgres.config import PostgresConfig
3
+
4
+ __all__ = ["PostgresAdapter", "PostgresConfig"]
@@ -0,0 +1,154 @@
1
+ # PostgresAdapter wraps SQLAlchemy for all warehouse operations.
2
+ # Sampling uses LIMIT for portable random rows; TABLESAMPLE BERNOULLI available as an option.
3
+ from __future__ import annotations
4
+
5
+ import datetime
6
+ import time
7
+ from typing import Any
8
+
9
+ import pandas as pd
10
+ import sqlalchemy as sa
11
+
12
+ from dqt.adapters._protocol import (
13
+ AggExpr,
14
+ ColumnMeta,
15
+ HealthCheckResult,
16
+ HealthCheckStep,
17
+ )
18
+ from dqt.utils.logging import get_logger
19
+
20
+ _log = get_logger(__name__)
21
+
22
+
23
+ class PostgresAdapter:
24
+ def __init__(self, conn_str: str) -> None:
25
+ self._conn_str = conn_str
26
+ self._engine = sa.create_engine(
27
+ conn_str,
28
+ pool_pre_ping=True,
29
+ execution_options={"isolation_level": "READ COMMITTED"},
30
+ )
31
+
32
+ def health_check(self) -> HealthCheckResult:
33
+ steps: list[HealthCheckStep] = []
34
+ steps.append(self._step_tcp())
35
+ if steps[-1].status == "fail":
36
+ for name in ("auth", "info_schema", "sample_select", "latency_probe", "clock_skew"):
37
+ steps.append(HealthCheckStep(name=name, status="skip", latency_ms=0.0, detail="skipped"))
38
+ return HealthCheckResult(steps=steps)
39
+ steps.append(self._step_auth())
40
+ steps.append(self._step_info_schema())
41
+ steps.append(self._step_sample_select())
42
+ steps.append(self._step_latency())
43
+ steps.append(self._step_clock_skew())
44
+ return HealthCheckResult(steps=steps)
45
+
46
+ def _step_tcp(self) -> HealthCheckStep:
47
+ t0 = time.perf_counter()
48
+ try:
49
+ with self._engine.connect() as conn:
50
+ conn.execute(sa.text("SELECT 1"))
51
+ return HealthCheckStep("tcp_reach", "pass", (time.perf_counter() - t0) * 1000, "ok")
52
+ except Exception as exc:
53
+ return HealthCheckStep("tcp_reach", "fail", 0.0, str(exc))
54
+
55
+ def _step_auth(self) -> HealthCheckStep:
56
+ t0 = time.perf_counter()
57
+ try:
58
+ with self._engine.connect() as conn:
59
+ user = conn.execute(sa.text("SELECT current_user")).scalar()
60
+ return HealthCheckStep("auth", "pass", (time.perf_counter() - t0) * 1000, f"user={user}")
61
+ except Exception as exc:
62
+ return HealthCheckStep("auth", "fail", 0.0, str(exc))
63
+
64
+ def _step_info_schema(self) -> HealthCheckStep:
65
+ t0 = time.perf_counter()
66
+ try:
67
+ with self._engine.connect() as conn:
68
+ conn.execute(sa.text(
69
+ "SELECT COUNT(*) FROM information_schema.tables "
70
+ "WHERE table_schema NOT IN ('pg_catalog','information_schema')"
71
+ )).scalar()
72
+ return HealthCheckStep("info_schema", "pass", (time.perf_counter() - t0) * 1000, "readable")
73
+ except Exception as exc:
74
+ return HealthCheckStep("info_schema", "fail", 0.0, str(exc))
75
+
76
+ def _step_sample_select(self) -> HealthCheckStep:
77
+ t0 = time.perf_counter()
78
+ try:
79
+ with self._engine.connect() as conn:
80
+ conn.execute(sa.text(
81
+ "SELECT table_name FROM information_schema.tables "
82
+ "WHERE table_schema NOT IN ('pg_catalog','information_schema') LIMIT 1"
83
+ )).fetchone()
84
+ return HealthCheckStep("sample_select", "pass", (time.perf_counter() - t0) * 1000, "ok")
85
+ except Exception as exc:
86
+ return HealthCheckStep("sample_select", "fail", 0.0, str(exc))
87
+
88
+ def _step_latency(self) -> HealthCheckStep:
89
+ t0 = time.perf_counter()
90
+ try:
91
+ with self._engine.connect() as conn:
92
+ conn.execute(sa.text("SELECT 1"))
93
+ latency = (time.perf_counter() - t0) * 1000
94
+ return HealthCheckStep("latency_probe", "pass", latency, f"{latency:.1f}ms")
95
+ except Exception as exc:
96
+ return HealthCheckStep("latency_probe", "fail", 0.0, str(exc))
97
+
98
+ def _step_clock_skew(self) -> HealthCheckStep:
99
+ t0 = time.perf_counter()
100
+ try:
101
+ with self._engine.connect() as conn:
102
+ db_now = conn.execute(sa.text("SELECT NOW()")).scalar()
103
+ local_now = datetime.datetime.now(datetime.timezone.utc)
104
+ if db_now.tzinfo is None:
105
+ db_now = db_now.replace(tzinfo=datetime.timezone.utc)
106
+ skew_s = abs((db_now - local_now).total_seconds())
107
+ status = "pass" if skew_s < 60 else "fail"
108
+ return HealthCheckStep("clock_skew", status, (time.perf_counter() - t0) * 1000, f"skew={skew_s:.1f}s")
109
+ except Exception as exc:
110
+ return HealthCheckStep("clock_skew", "fail", 0.0, str(exc))
111
+
112
+ def list_schemas(self) -> list[str]:
113
+ with self._engine.connect() as conn:
114
+ rows = conn.execute(sa.text(
115
+ "SELECT DISTINCT table_schema FROM information_schema.tables "
116
+ "WHERE table_schema NOT IN ('pg_catalog','information_schema') ORDER BY 1"
117
+ )).fetchall()
118
+ return [r[0] for r in rows]
119
+
120
+ def list_tables(self, schema: str) -> list[str]:
121
+ with self._engine.connect() as conn:
122
+ rows = conn.execute(sa.text(
123
+ "SELECT table_name FROM information_schema.tables "
124
+ "WHERE table_schema = :schema ORDER BY 1"
125
+ ), {"schema": schema}).fetchall()
126
+ return [r[0] for r in rows]
127
+
128
+ def describe_columns(self, schema: str, table: str) -> list[ColumnMeta]:
129
+ with self._engine.connect() as conn:
130
+ rows = conn.execute(sa.text(
131
+ "SELECT column_name, data_type, is_nullable, ordinal_position "
132
+ "FROM information_schema.columns "
133
+ "WHERE table_schema = :schema AND table_name = :table "
134
+ "ORDER BY ordinal_position"
135
+ ), {"schema": schema, "table": table}).fetchall()
136
+ return [
137
+ ColumnMeta(name=r[0], data_type=r[1], nullable=(r[2] == "YES"), position=r[3])
138
+ for r in rows
139
+ ]
140
+
141
+ def sample(self, schema: str, table: str, n: int = 100_000) -> pd.DataFrame:
142
+ # Use ORDER BY random() to get a genuine random sample without TABLESAMPLE bias on small tables.
143
+ # schema/table are double-quoted identifiers, not user values in SQL context.
144
+ query = sa.text(f'SELECT * FROM "{schema}"."{table}" ORDER BY random() LIMIT :n')
145
+ with self._engine.connect() as conn:
146
+ return pd.read_sql(query, conn, params={"n": n})
147
+
148
+ def aggregate(self, schema: str, table: str, exprs: list[AggExpr]) -> dict[str, Any]:
149
+ cols = ", ".join(f"{e.sql} AS {e.name}" for e in exprs)
150
+ # schema/table are double-quoted identifiers; cols are built from AggExpr.sql (caller-controlled).
151
+ query = sa.text(f'SELECT {cols} FROM "{schema}"."{table}"')
152
+ with self._engine.connect() as conn:
153
+ row = conn.execute(query).fetchone()
154
+ return dict(zip([e.name for e in exprs], row))
@@ -0,0 +1,17 @@
1
from dataclasses import dataclass
from urllib.parse import quote


@dataclass
class PostgresConfig:
    """Connection settings for a Postgres warehouse.

    Defaults match a stock local Postgres install.
    """

    host: str = "localhost"
    port: int = 5432
    database: str = "postgres"
    username: str = "postgres"
    password: str = ""
    ssl_mode: str = "prefer"

    def to_conn_str(self) -> str:
        """Build a SQLAlchemy psycopg2 connection URL.

        Username and password are percent-encoded so reserved characters
        (e.g. '@', '/', ':') in credentials cannot corrupt the URL; plain
        alphanumeric credentials produce the same string as before.
        """
        user = quote(self.username, safe="")
        pwd = quote(self.password, safe="")
        return (
            f"postgresql+psycopg2://{user}:{pwd}"
            f"@{self.host}:{self.port}/{self.database}?sslmode={self.ssl_mode}"
        )
File without changes
File without changes