driftguard-prathvi 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- driftguard_prathvi-0.1.0/LICENSE +21 -0
- driftguard_prathvi-0.1.0/PKG-INFO +114 -0
- driftguard_prathvi-0.1.0/README.md +86 -0
- driftguard_prathvi-0.1.0/driftguard/__init__.py +8 -0
- driftguard_prathvi-0.1.0/driftguard/checks/__init__.py +7 -0
- driftguard_prathvi-0.1.0/driftguard/checks/drift.py +196 -0
- driftguard_prathvi-0.1.0/driftguard/checks/nulls.py +71 -0
- driftguard_prathvi-0.1.0/driftguard/checks/schema.py +95 -0
- driftguard_prathvi-0.1.0/driftguard/report.py +1061 -0
- driftguard_prathvi-0.1.0/driftguard/utils.py +61 -0
- driftguard_prathvi-0.1.0/driftguard/validator.py +82 -0
- driftguard_prathvi-0.1.0/driftguard_prathvi.egg-info/PKG-INFO +114 -0
- driftguard_prathvi-0.1.0/driftguard_prathvi.egg-info/SOURCES.txt +19 -0
- driftguard_prathvi-0.1.0/driftguard_prathvi.egg-info/dependency_links.txt +1 -0
- driftguard_prathvi-0.1.0/driftguard_prathvi.egg-info/requires.txt +3 -0
- driftguard_prathvi-0.1.0/driftguard_prathvi.egg-info/top_level.txt +1 -0
- driftguard_prathvi-0.1.0/pyproject.toml +41 -0
- driftguard_prathvi-0.1.0/setup.cfg +4 -0
- driftguard_prathvi-0.1.0/tests/test_drift.py +102 -0
- driftguard_prathvi-0.1.0/tests/test_nulls.py +49 -0
- driftguard_prathvi-0.1.0/tests/test_schema.py +65 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 DriftGuard Developers
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: driftguard-prathvi
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A robust data drift detection and schema validation library for machine learning pipelines.
|
|
5
|
+
Author-email: DriftGuard Developer <developer@driftguard.org>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/developer/driftguard
|
|
8
|
+
Project-URL: Documentation, https://github.com/developer/driftguard#readme
|
|
9
|
+
Project-URL: Bug-Tracker, https://github.com/developer/driftguard/issues
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Requires-Python: >=3.8
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: numpy>=1.18.0
|
|
25
|
+
Requires-Dist: pandas>=1.0.0
|
|
26
|
+
Requires-Dist: scipy>=1.5.0
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# DriftGuard
|
|
30
|
+
|
|
31
|
+
DriftGuard is a lightweight, robust, and publish-ready Python library designed to automate dataset validation and detect statistical data drift in machine learning pipelines.
|
|
32
|
+
|
|
33
|
+
By comparing new incoming datasets against a trusted baseline reference dataset, DriftGuard alerts you to schema changes, increases in missing values, or shifts in feature distributions before they affect downstream model performance.
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## Features
|
|
38
|
+
|
|
39
|
+
- **Schema Validation**: Detect missing columns, new columns, and data type mismatches.
|
|
40
|
+
- **Null Rate Analysis**: Monitor and flag columns where the rate of missing values increases beyond a configurable threshold.
|
|
41
|
+
- **Statistical Drift Detection**:
|
|
42
|
+
- **Kolmogorov-Smirnov (KS) Test** (`scipy.stats.ks_2samp`) for numerical columns.
|
|
43
|
+
- **Chi-Square Test** (`scipy.stats.chi2_contingency`) for categorical columns.
|
|
44
|
+
- **Severity Tagging**: Categorizes issues as `INFO`, `WARNING`, or `CRITICAL` for pipeline routing or CI/CD gate checks.
|
|
45
|
+
- **Interactive Reports**:
|
|
46
|
+
- An ASCII summary table output directly to console.
|
|
47
|
+
- Machine-readable JSON output for automated pipelines.
|
|
48
|
+
- A beautiful, self-contained interactive HTML dashboard with per-column breakdowns and interactive searching/filtering.
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## Installation
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install driftguard-prathvi
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
*Note: Depends on `numpy`, `pandas`, and `scipy` only.*
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Quickstart
|
|
63
|
+
|
|
64
|
+
Validate your production features in real-time or as part of a batch training/inference pipeline:
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
import numpy as np
|
|
68
|
+
import pandas as pd
|
|
69
|
+
import driftguard as dg
|
|
70
|
+
|
|
71
|
+
# 1. Create a reference dataset (baseline)
|
|
72
|
+
np.random.seed(42)
|
|
73
|
+
ref_data = {
|
|
74
|
+
"age": np.random.normal(35, 10, 1000),
|
|
75
|
+
"income": np.random.uniform(30000, 120000, 1000),
|
|
76
|
+
"city": np.random.choice(["New York", "Chicago", "San Francisco"], 1000),
|
|
77
|
+
"target": np.random.choice([0, 1], 1000, p=[0.7, 0.3])
|
|
78
|
+
}
|
|
79
|
+
reference_df = pd.DataFrame(ref_data)
|
|
80
|
+
|
|
81
|
+
# 2. Create a new dataset (with some drift and schema issues)
|
|
82
|
+
new_data = {
|
|
83
|
+
"age": np.random.normal(38, 10, 1000), # Slight shift
|
|
84
|
+
"income": np.random.uniform(30000, 120000, 1000),
|
|
85
|
+
"city": np.random.choice(["New York", "Chicago", "Boston"], 1000), # "Boston" is a new category
|
|
86
|
+
"target": np.random.choice([0, 1], 1000, p=[0.7, 0.3]),
|
|
87
|
+
"extra_col": np.random.random(1000) # New column
|
|
88
|
+
}
|
|
89
|
+
new_df = pd.DataFrame(new_data)
|
|
90
|
+
# Add some null values to 'income'
|
|
91
|
+
new_df.loc[np.random.choice(1000, 150, replace=False), "income"] = np.nan
|
|
92
|
+
|
|
93
|
+
# 3. Instantiate Validator and run checks
|
|
94
|
+
# p_threshold matches the significance alpha for KS/Chi2
|
|
95
|
+
# null_threshold is the maximum allowed null rate increase
|
|
96
|
+
validator = dg.Validator(reference_df, p_threshold=0.05, null_threshold=0.10)
|
|
97
|
+
report = validator.check(new_df)
|
|
98
|
+
|
|
99
|
+
# 4. Consume the report
|
|
100
|
+
# Prints a formatted ASCII table of all issues
|
|
101
|
+
report.summary()
|
|
102
|
+
|
|
103
|
+
# Export interactive HTML dashboard (saves report.html)
|
|
104
|
+
report.export("html")
|
|
105
|
+
|
|
106
|
+
# Export machine-readable JSON (saves report.json)
|
|
107
|
+
report.export("json")
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
---
|
|
111
|
+
|
|
112
|
+
## License
|
|
113
|
+
|
|
114
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# DriftGuard
|
|
2
|
+
|
|
3
|
+
DriftGuard is a lightweight, robust, and publish-ready Python library designed to automate dataset validation and detect statistical data drift in machine learning pipelines.
|
|
4
|
+
|
|
5
|
+
By comparing new incoming datasets against a trusted baseline reference dataset, DriftGuard alerts you to schema changes, increases in missing values, or shifts in feature distributions before they affect downstream model performance.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- **Schema Validation**: Detect missing columns, new columns, and data type mismatches.
|
|
12
|
+
- **Null Rate Analysis**: Monitor and flag columns where the rate of missing values increases beyond a configurable threshold.
|
|
13
|
+
- **Statistical Drift Detection**:
|
|
14
|
+
- **Kolmogorov-Smirnov (KS) Test** (`scipy.stats.ks_2samp`) for numerical columns.
|
|
15
|
+
- **Chi-Square Test** (`scipy.stats.chi2_contingency`) for categorical columns.
|
|
16
|
+
- **Severity Tagging**: Categorizes issues as `INFO`, `WARNING`, or `CRITICAL` for pipeline routing or CI/CD gate checks.
|
|
17
|
+
- **Interactive Reports**:
|
|
18
|
+
- An ASCII summary table output directly to console.
|
|
19
|
+
- Machine-readable JSON output for automated pipelines.
|
|
20
|
+
- A beautiful, self-contained interactive HTML dashboard with per-column breakdowns and interactive searching/filtering.
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install driftguard-prathvi
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
*Note: Depends on `numpy`, `pandas`, and `scipy` only.*
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## Quickstart
|
|
35
|
+
|
|
36
|
+
Validate your production features in real-time or as part of a batch training/inference pipeline:
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
import numpy as np
|
|
40
|
+
import pandas as pd
|
|
41
|
+
import driftguard as dg
|
|
42
|
+
|
|
43
|
+
# 1. Create a reference dataset (baseline)
|
|
44
|
+
np.random.seed(42)
|
|
45
|
+
ref_data = {
|
|
46
|
+
"age": np.random.normal(35, 10, 1000),
|
|
47
|
+
"income": np.random.uniform(30000, 120000, 1000),
|
|
48
|
+
"city": np.random.choice(["New York", "Chicago", "San Francisco"], 1000),
|
|
49
|
+
"target": np.random.choice([0, 1], 1000, p=[0.7, 0.3])
|
|
50
|
+
}
|
|
51
|
+
reference_df = pd.DataFrame(ref_data)
|
|
52
|
+
|
|
53
|
+
# 2. Create a new dataset (with some drift and schema issues)
|
|
54
|
+
new_data = {
|
|
55
|
+
"age": np.random.normal(38, 10, 1000), # Slight shift
|
|
56
|
+
"income": np.random.uniform(30000, 120000, 1000),
|
|
57
|
+
"city": np.random.choice(["New York", "Chicago", "Boston"], 1000), # "Boston" is a new category
|
|
58
|
+
"target": np.random.choice([0, 1], 1000, p=[0.7, 0.3]),
|
|
59
|
+
"extra_col": np.random.random(1000) # New column
|
|
60
|
+
}
|
|
61
|
+
new_df = pd.DataFrame(new_data)
|
|
62
|
+
# Add some null values to 'income'
|
|
63
|
+
new_df.loc[np.random.choice(1000, 150, replace=False), "income"] = np.nan
|
|
64
|
+
|
|
65
|
+
# 3. Instantiate Validator and run checks
|
|
66
|
+
# p_threshold matches the significance alpha for KS/Chi2
|
|
67
|
+
# null_threshold is the maximum allowed null rate increase
|
|
68
|
+
validator = dg.Validator(reference_df, p_threshold=0.05, null_threshold=0.10)
|
|
69
|
+
report = validator.check(new_df)
|
|
70
|
+
|
|
71
|
+
# 4. Consume the report
|
|
72
|
+
# Prints a formatted ASCII table of all issues
|
|
73
|
+
report.summary()
|
|
74
|
+
|
|
75
|
+
# Export interactive HTML dashboard (saves report.html)
|
|
76
|
+
report.export("html")
|
|
77
|
+
|
|
78
|
+
# Export machine-readable JSON (saves report.json)
|
|
79
|
+
report.export("json")
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## License
|
|
85
|
+
|
|
86
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""DriftGuard: Data drift detection and schema validation for ML pipelines."""
|
|
2
|
+
|
|
3
|
+
from driftguard.validator import Validator
|
|
4
|
+
from driftguard.report import Report
|
|
5
|
+
from driftguard.utils import CheckResult
|
|
6
|
+
|
|
7
|
+
__version__ = "0.1.0"
|
|
8
|
+
__all__ = ["Validator", "Report", "CheckResult"]
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""Checks sub-module for data validation and drift detection."""
|
|
2
|
+
|
|
3
|
+
from driftguard.checks.schema import check_schema
|
|
4
|
+
from driftguard.checks.nulls import check_nulls
|
|
5
|
+
from driftguard.checks.drift import check_drift
|
|
6
|
+
|
|
7
|
+
__all__ = ["check_schema", "check_nulls", "check_drift"]
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""Statistical drift detection checks for DriftGuard."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from scipy.stats import ks_2samp, chi2_contingency
|
|
6
|
+
from typing import List
|
|
7
|
+
from driftguard.utils import CheckResult
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def check_drift(
    ref_df: pd.DataFrame,
    new_df: pd.DataFrame,
    columns: List[str],
    p_threshold: float,
) -> List[CheckResult]:
    """Run a statistical drift test on each of the specified columns.

    The test is selected per column:

    - Kolmogorov-Smirnov two-sample test when the column is numeric (and not
      boolean) in both DataFrames.
    - Chi-Square test of independence on the category counts otherwise, i.e.
      for categorical/object/boolean columns and for columns that are numeric
      in only one of the two DataFrames.

    Null values are dropped before testing. A column with no non-null values
    on either side, or a categorical column with fewer than two distinct
    categories overall, is reported as a skipped (PASSED/INFO) check with a
    p-value of 1.0.

    Args:
        ref_df: The reference pandas DataFrame.
        new_df: The new pandas DataFrame.
        columns: The list of common columns to check.
        p_threshold: The significance level (alpha) below which drift is detected.

    Returns:
        A list with one CheckResult per entry of ``columns``, in the same
        order. Drift is reported as FAILED/WARNING, everything else as
        PASSED/INFO.
    """
    results = []

    for col in columns:
        ref_series = ref_df[col]
        new_series = new_df[col]

        if _is_ks_eligible(ref_series) and _is_ks_eligible(new_series):
            result = _numerical_drift(col, ref_series, new_series, p_threshold)
        else:
            result = _categorical_drift(col, ref_series, new_series, p_threshold)
        results.append(result)

    return results


def _is_ks_eligible(series: pd.Series) -> bool:
    """Return True when ``series`` should be compared with the KS test."""
    # pandas counts bool dtypes as numeric; booleans are documented to use the
    # Chi-Square test, so they are excluded from the KS path explicitly.
    return pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series)


def _drift_result(
    col,
    metric_name: str,
    drifted: bool,
    message: str,
    reference_value,
    new_value,
    p_value: float,
) -> CheckResult:
    """Build the CheckResult for one drift check (FAILED/WARNING when drifted)."""
    return CheckResult(
        column=col,
        check_type="drift",
        metric_name=metric_name,
        status="FAILED" if drifted else "PASSED",
        severity="WARNING" if drifted else "INFO",
        message=message,
        reference_value=reference_value,
        new_value=new_value,
        p_value=p_value,
    )


def _numerical_drift(
    col,
    ref_series: pd.Series,
    new_series: pd.Series,
    p_threshold: float,
) -> CheckResult:
    """Compare two numeric columns with the Kolmogorov-Smirnov two-sample test."""
    metric_name = "numerical_drift"
    ref_data = ref_series.dropna()
    new_data = new_series.dropna()

    if ref_data.empty or new_data.empty:
        return _drift_result(
            col,
            metric_name,
            False,
            f"Skipped drift check for column '{col}': insufficient data (all nulls).",
            None,
            None,
            1.0,
        )

    # Only the p-value is used; the KS statistic itself is not reported.
    _, p_value = ks_2samp(ref_data, new_data)
    p_value = float(p_value)

    ref_stats = {"mean": float(ref_data.mean()), "std": float(ref_data.std())}
    new_stats = {"mean": float(new_data.mean()), "std": float(new_data.std())}

    drifted = p_value < p_threshold
    if drifted:
        message = (
            f"Numerical drift detected in column '{col}' "
            f"(KS test p-value: {p_value:.4e} < threshold {p_threshold})."
        )
    else:
        message = (
            f"No numerical drift detected in column '{col}' "
            f"(KS test p-value: {p_value:.4f})."
        )
    return _drift_result(col, metric_name, drifted, message, ref_stats, new_stats, p_value)


def _categorical_drift(
    col,
    ref_series: pd.Series,
    new_series: pd.Series,
    p_threshold: float,
) -> CheckResult:
    """Compare two columns' category frequencies with the Chi-Square test."""
    metric_name = "categorical_drift"
    # Values are compared by their string form so mixed/object columns work.
    ref_data = ref_series.dropna().astype(str)
    new_data = new_series.dropna().astype(str)

    if ref_data.empty or new_data.empty:
        return _drift_result(
            col,
            metric_name,
            False,
            f"Skipped drift check for column '{col}': insufficient data (all nulls).",
            None,
            None,
            1.0,
        )

    ref_counts = ref_data.value_counts()
    new_counts = new_data.value_counts()
    # Sorted so the contingency table has a reproducible row order.
    all_cats = sorted(set(ref_counts.index).union(new_counts.index))

    if len(all_cats) < 2:
        return _drift_result(
            col,
            metric_name,
            False,
            (
                f"Skipped drift check for column '{col}': "
                "less than 2 unique categories found."
            ),
            ref_counts.to_dict(),
            new_counts.to_dict(),
            1.0,
        )

    # One row per category, columns = (reference count, new count).
    contingency_table = np.array(
        [[ref_counts.get(cat, 0), new_counts.get(cat, 0)] for cat in all_cats]
    )

    try:
        # chi2_contingency returns: chi2, p, dof, expected
        _, p_value, _, _ = chi2_contingency(contingency_table)
        p_value = float(p_value)
    except ValueError:
        # Raised for a degenerate table; treat it as "no drift detected"
        # instead of aborting the whole validation run.
        p_value = 1.0

    ref_dist = (ref_counts / len(ref_data)).to_dict()
    new_dist = (new_counts / len(new_data)).to_dict()

    drifted = p_value < p_threshold
    if drifted:
        message = (
            f"Categorical drift detected in column '{col}' "
            f"(Chi-Square test p-value: {p_value:.4e} < threshold {p_threshold})."
        )
    else:
        message = (
            f"No categorical drift detected in column '{col}' "
            f"(Chi-Square test p-value: {p_value:.4f})."
        )
    return _drift_result(col, metric_name, drifted, message, ref_dist, new_dist, p_value)
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Null rate validation checks for DriftGuard."""
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from typing import List
|
|
5
|
+
from driftguard.utils import CheckResult
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def check_nulls(
    ref_df: pd.DataFrame,
    new_df: pd.DataFrame,
    columns: List[str],
    null_threshold: float,
) -> List[CheckResult]:
    """Report, per column, whether the null rate grew by more than allowed.

    For every column the fraction of null values is computed in both
    DataFrames. A column fails when the new rate exceeds the reference rate
    by strictly more than ``null_threshold``; a failing column whose new rate
    is exactly 100% is tagged CRITICAL, any other failing column WARNING.

    Args:
        ref_df: The reference pandas DataFrame.
        new_df: The new pandas DataFrame.
        columns: The list of common columns to check.
        null_threshold: The maximum allowed increase in null rate (absolute).

    Returns:
        A list with one CheckResult per entry of ``columns``, in the same
        order, carrying the reference and new null rates.
    """
    findings = []

    for name in columns:
        baseline_rate = float(ref_df[name].isnull().mean())
        current_rate = float(new_df[name].isnull().mean())
        increase = current_rate - baseline_rate

        # Guard clause: anything at or below the threshold is a pass.
        if not increase > null_threshold:
            findings.append(
                CheckResult(
                    column=name,
                    check_type="nulls",
                    metric_name="null_rate_check",
                    status="PASSED",
                    severity="INFO",
                    message=(
                        f"Null rate for column '{name}' is within acceptable limits "
                        f"(increase: {increase:.2%})."
                    ),
                    reference_value=baseline_rate,
                    new_value=current_rate,
                )
            )
            continue

        # A column that became entirely null is escalated to CRITICAL.
        if current_rate == 1.0:
            level = "CRITICAL"
        else:
            level = "WARNING"

        findings.append(
            CheckResult(
                column=name,
                check_type="nulls",
                metric_name="null_rate_increase",
                status="FAILED",
                severity=level,
                message=(
                    f"Null rate for column '{name}' increased from {baseline_rate:.2%} "
                    f"to {current_rate:.2%} (increase of {increase:.2%} exceeds "
                    f"threshold of {null_threshold:.2%})."
                ),
                reference_value=baseline_rate,
                new_value=current_rate,
            )
        )

    return findings
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""Schema validation checks for DriftGuard."""
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from typing import List
|
|
5
|
+
from driftguard.utils import CheckResult
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def check_schema(ref_df: pd.DataFrame, new_df: pd.DataFrame) -> List[CheckResult]:
    """Validates the schema of the new DataFrame against the reference DataFrame.

    Checks for:
    - Missing columns: Columns present in ref_df but not in new_df
      (FAILED / CRITICAL).
    - New columns: Columns present in new_df but not in ref_df
      (FAILED / INFO).
    - Data type mismatches: Columns present in both but with different dtypes
      (FAILED / CRITICAL); matching columns yield a PASSED / INFO result.

    Results are grouped in that order (missing, new, common). Within each
    group they follow the DataFrame's own column order, so the output is
    reproducible from run to run.

    Args:
        ref_df: The reference pandas DataFrame.
        new_df: The new pandas DataFrame to validate.

    Returns:
        A list of CheckResult objects containing the validation details.
    """
    results = []

    # dict.fromkeys de-duplicates labels while keeping column order; the sets
    # are used only for membership tests, never for iteration, because set
    # iteration order is not stable across runs.
    ref_order = list(dict.fromkeys(ref_df.columns))
    new_order = list(dict.fromkeys(new_df.columns))
    ref_cols = set(ref_order)
    new_cols = set(new_order)

    # Detect missing columns
    for col in ref_order:
        if col in new_cols:
            continue
        results.append(
            CheckResult(
                column=col,
                check_type="schema",
                metric_name="missing_column",
                status="FAILED",
                severity="CRITICAL",
                message=f"Column '{col}' is missing in the new dataset.",
                reference_value=str(ref_df[col].dtype),
                new_value=None,
            )
        )

    # Detect new columns
    for col in new_order:
        if col in ref_cols:
            continue
        results.append(
            CheckResult(
                column=col,
                check_type="schema",
                metric_name="new_column",
                status="FAILED",
                severity="INFO",
                message=f"New column '{col}' detected in the new dataset.",
                reference_value=None,
                new_value=str(new_df[col].dtype),
            )
        )

    # Detect dtype mismatches among the columns present in both frames
    for col in ref_order:
        if col not in new_cols:
            continue

        ref_dtype = ref_df[col].dtype
        new_dtype = new_df[col].dtype

        if ref_dtype != new_dtype:
            results.append(
                CheckResult(
                    column=col,
                    check_type="schema",
                    metric_name="dtype_mismatch",
                    status="FAILED",
                    severity="CRITICAL",
                    message=(
                        f"Data type mismatch for column '{col}': "
                        f"expected {ref_dtype}, got {new_dtype}."
                    ),
                    reference_value=str(ref_dtype),
                    new_value=str(new_dtype),
                )
            )
        else:
            results.append(
                CheckResult(
                    column=col,
                    check_type="schema",
                    metric_name="schema_validation",
                    status="PASSED",
                    severity="INFO",
                    message=f"Schema and data type validated for column '{col}'.",
                    reference_value=str(ref_dtype),
                    new_value=str(new_dtype),
                )
            )

    return results
|