driftguard-prathvi 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 DriftGuard Developers
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,114 @@
1
+ Metadata-Version: 2.4
2
+ Name: driftguard-prathvi
3
+ Version: 0.1.0
4
+ Summary: A robust data drift detection and schema validation library for machine learning pipelines.
5
+ Author-email: DriftGuard Developer <developer@driftguard.org>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/developer/driftguard
8
+ Project-URL: Documentation, https://github.com/developer/driftguard#readme
9
+ Project-URL: Bug-Tracker, https://github.com/developer/driftguard/issues
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.8
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Requires-Python: >=3.8
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: numpy>=1.18.0
25
+ Requires-Dist: pandas>=1.0.0
26
+ Requires-Dist: scipy>=1.5.0
27
+ Dynamic: license-file
28
+
29
+ # DriftGuard
30
+
31
+ DriftGuard is a lightweight, robust, and publish-ready Python library designed to automate dataset validation and detect statistical data drift in machine learning pipelines.
32
+
33
+ By comparing new incoming datasets against a trusted baseline reference dataset, DriftGuard alerts you to schema changes, increases in missing values, or shifts in feature distributions before they affect downstream model performance.
34
+
35
+ ---
36
+
37
+ ## Features
38
+
39
+ - **Schema Validation**: Detect missing columns, new columns, and data type mismatches.
40
+ - **Null Rate Analysis**: Monitor and flag columns where the rate of missing values increases beyond a configurable threshold.
41
+ - **Statistical Drift Detection**:
42
+ - **Kolmogorov-Smirnov (KS) Test** (`scipy.stats.ks_2samp`) for numerical columns.
43
+ - **Chi-Square Test** (`scipy.stats.chi2_contingency`) for categorical columns.
44
+ - **Severity Tagging**: Categorizes issues as `INFO`, `WARNING`, or `CRITICAL` for pipeline routing or CI/CD gate checks.
45
+ - **Interactive Reports**:
46
+ - An ASCII summary table output directly to console.
47
+ - Machine-readable JSON output for automated pipelines.
48
+ - A beautiful, self-contained interactive HTML dashboard with per-column breakdowns and interactive searching/filtering.
49
+
50
+ ---
51
+
52
+ ## Installation
53
+
54
+ ```bash
55
+ pip install driftguard-prathvi
56
+ ```
57
+
58
+ *Note: Depends on `numpy`, `pandas`, and `scipy` only.*
59
+
60
+ ---
61
+
62
+ ## Quickstart
63
+
64
+ Validate your production features in real-time or as part of a batch training/inference pipeline:
65
+
66
+ ```python
67
+ import numpy as np
68
+ import pandas as pd
69
+ import driftguard as dg
70
+
71
+ # 1. Create a reference dataset (baseline)
72
+ np.random.seed(42)
73
+ ref_data = {
74
+ "age": np.random.normal(35, 10, 1000),
75
+ "income": np.random.uniform(30000, 120000, 1000),
76
+ "city": np.random.choice(["New York", "Chicago", "San Francisco"], 1000),
77
+ "target": np.random.choice([0, 1], 1000, p=[0.7, 0.3])
78
+ }
79
+ reference_df = pd.DataFrame(ref_data)
80
+
81
+ # 2. Create a new dataset (with some drift and schema issues)
82
+ new_data = {
83
+ "age": np.random.normal(38, 10, 1000), # Slight shift
84
+ "income": np.random.uniform(30000, 120000, 1000),
85
+ "city": np.random.choice(["New York", "Chicago", "Boston"], 1000), # "Boston" is a new category
86
+ "target": np.random.choice([0, 1], 1000, p=[0.7, 0.3]),
87
+ "extra_col": np.random.random(1000) # New column
88
+ }
89
+ new_df = pd.DataFrame(new_data)
90
+ # Add some null values to 'income'
91
+ new_df.loc[np.random.choice(1000, 150, replace=False), "income"] = np.nan
92
+
93
+ # 3. Instantiate Validator and run checks
94
+ # p_threshold matches the significance alpha for KS/Chi2
95
+ # null_threshold is the maximum allowed null rate increase
96
+ validator = dg.Validator(reference_df, p_threshold=0.05, null_threshold=0.10)
97
+ report = validator.check(new_df)
98
+
99
+ # 4. Consume the report
100
+ # Prints a formatted ASCII table of all issues
101
+ report.summary()
102
+
103
+ # Export interactive HTML dashboard (saves report.html)
104
+ report.export("html")
105
+
106
+ # Export machine-readable JSON (saves report.json)
107
+ report.export("json")
108
+ ```
109
+
110
+ ---
111
+
112
+ ## License
113
+
114
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,86 @@
1
+ # DriftGuard
2
+
3
+ DriftGuard is a lightweight, robust, and publish-ready Python library designed to automate dataset validation and detect statistical data drift in machine learning pipelines.
4
+
5
+ By comparing new incoming datasets against a trusted baseline reference dataset, DriftGuard alerts you to schema changes, increases in missing values, or shifts in feature distributions before they affect downstream model performance.
6
+
7
+ ---
8
+
9
+ ## Features
10
+
11
+ - **Schema Validation**: Detect missing columns, new columns, and data type mismatches.
12
+ - **Null Rate Analysis**: Monitor and flag columns where the rate of missing values increases beyond a configurable threshold.
13
+ - **Statistical Drift Detection**:
14
+ - **Kolmogorov-Smirnov (KS) Test** (`scipy.stats.ks_2samp`) for numerical columns.
15
+ - **Chi-Square Test** (`scipy.stats.chi2_contingency`) for categorical columns.
16
+ - **Severity Tagging**: Categorizes issues as `INFO`, `WARNING`, or `CRITICAL` for pipeline routing or CI/CD gate checks.
17
+ - **Interactive Reports**:
18
+ - An ASCII summary table output directly to console.
19
+ - Machine-readable JSON output for automated pipelines.
20
+ - A beautiful, self-contained interactive HTML dashboard with per-column breakdowns and interactive searching/filtering.
21
+
22
+ ---
23
+
24
+ ## Installation
25
+
26
+ ```bash
27
+ pip install driftguard-prathvi
28
+ ```
29
+
30
+ *Note: Depends on `numpy`, `pandas`, and `scipy` only.*
31
+
32
+ ---
33
+
34
+ ## Quickstart
35
+
36
+ Validate your production features in real-time or as part of a batch training/inference pipeline:
37
+
38
+ ```python
39
+ import numpy as np
40
+ import pandas as pd
41
+ import driftguard as dg
42
+
43
+ # 1. Create a reference dataset (baseline)
44
+ np.random.seed(42)
45
+ ref_data = {
46
+ "age": np.random.normal(35, 10, 1000),
47
+ "income": np.random.uniform(30000, 120000, 1000),
48
+ "city": np.random.choice(["New York", "Chicago", "San Francisco"], 1000),
49
+ "target": np.random.choice([0, 1], 1000, p=[0.7, 0.3])
50
+ }
51
+ reference_df = pd.DataFrame(ref_data)
52
+
53
+ # 2. Create a new dataset (with some drift and schema issues)
54
+ new_data = {
55
+ "age": np.random.normal(38, 10, 1000), # Slight shift
56
+ "income": np.random.uniform(30000, 120000, 1000),
57
+ "city": np.random.choice(["New York", "Chicago", "Boston"], 1000), # "Boston" is a new category
58
+ "target": np.random.choice([0, 1], 1000, p=[0.7, 0.3]),
59
+ "extra_col": np.random.random(1000) # New column
60
+ }
61
+ new_df = pd.DataFrame(new_data)
62
+ # Add some null values to 'income'
63
+ new_df.loc[np.random.choice(1000, 150, replace=False), "income"] = np.nan
64
+
65
+ # 3. Instantiate Validator and run checks
66
+ # p_threshold matches the significance alpha for KS/Chi2
67
+ # null_threshold is the maximum allowed null rate increase
68
+ validator = dg.Validator(reference_df, p_threshold=0.05, null_threshold=0.10)
69
+ report = validator.check(new_df)
70
+
71
+ # 4. Consume the report
72
+ # Prints a formatted ASCII table of all issues
73
+ report.summary()
74
+
75
+ # Export interactive HTML dashboard (saves report.html)
76
+ report.export("html")
77
+
78
+ # Export machine-readable JSON (saves report.json)
79
+ report.export("json")
80
+ ```
81
+
82
+ ---
83
+
84
+ ## License
85
+
86
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,8 @@
1
+ """DriftGuard: Data drift detection and schema validation for ML pipelines."""
2
+
3
+ from driftguard.validator import Validator
4
+ from driftguard.report import Report
5
+ from driftguard.utils import CheckResult
6
+
7
+ __version__ = "0.1.0"
8
+ __all__ = ["Validator", "Report", "CheckResult"]
@@ -0,0 +1,7 @@
1
+ """Checks sub-module for data validation and drift detection."""
2
+
3
+ from driftguard.checks.schema import check_schema
4
+ from driftguard.checks.nulls import check_nulls
5
+ from driftguard.checks.drift import check_drift
6
+
7
+ __all__ = ["check_schema", "check_nulls", "check_drift"]
@@ -0,0 +1,196 @@
1
+ """Statistical drift detection checks for DriftGuard."""
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from scipy.stats import ks_2samp, chi2_contingency
6
+ from typing import List
7
+ from driftguard.utils import CheckResult
8
+
9
+
10
def _is_continuous(series: pd.Series) -> bool:
    """Return True when *series* should be compared with the KS test."""
    # pandas classifies bool (and nullable "boolean") as a numeric dtype, but
    # a two-valued flag is categorical data; keep it out of the KS branch so
    # it is handled by the Chi-Square test as the public docstring promises.
    return pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(
        series
    )


def _drift_skipped(
    col: str,
    kind: str,
    reason: str,
    reference_value=None,
    new_value=None,
) -> CheckResult:
    """Build the informational result emitted when no test could be run."""
    return CheckResult(
        column=col,
        check_type="drift",
        metric_name=f"{kind}_drift",
        status="PASSED",
        severity="INFO",
        message=f"Skipped drift check for column '{col}': {reason}",
        reference_value=reference_value,
        new_value=new_value,
        p_value=1.0,
    )


def _drift_verdict(
    col: str,
    kind: str,
    test_name: str,
    p_value: float,
    p_threshold: float,
    reference_value,
    new_value,
) -> CheckResult:
    """Turn a p-value into a FAILED/WARNING or PASSED/INFO result."""
    # A NaN p-value compares False here and is therefore reported as PASSED.
    drifted = p_value < p_threshold
    if drifted:
        message = (
            f"{kind.capitalize()} drift detected in column '{col}' "
            f"({test_name} test p-value: {p_value:.4e} < threshold {p_threshold})."
        )
    else:
        message = (
            f"No {kind} drift detected in column '{col}' "
            f"({test_name} test p-value: {p_value:.4f})."
        )
    return CheckResult(
        column=col,
        check_type="drift",
        metric_name=f"{kind}_drift",
        status="FAILED" if drifted else "PASSED",
        severity="WARNING" if drifted else "INFO",
        message=message,
        reference_value=reference_value,
        new_value=new_value,
        p_value=p_value,
    )


def _numerical_drift(
    col: str,
    ref_series: pd.Series,
    new_series: pd.Series,
    p_threshold: float,
) -> CheckResult:
    """Compare two numeric columns with the two-sample Kolmogorov-Smirnov test."""
    ref_data = ref_series.dropna()
    new_data = new_series.dropna()

    if ref_data.empty or new_data.empty:
        return _drift_skipped(col, "numerical", "insufficient data (all nulls).")

    _, p_value = ks_2samp(ref_data, new_data)

    # Mean/std are attached to the result for reporting only; they do not
    # influence the pass/fail decision.
    ref_stats = {"mean": float(ref_data.mean()), "std": float(ref_data.std())}
    new_stats = {"mean": float(new_data.mean()), "std": float(new_data.std())}

    return _drift_verdict(
        col, "numerical", "KS", float(p_value), p_threshold, ref_stats, new_stats
    )


def _categorical_drift(
    col: str,
    ref_series: pd.Series,
    new_series: pd.Series,
    p_threshold: float,
) -> CheckResult:
    """Compare two categorical columns with a Chi-Square test of independence."""
    # Values are stringified so mixed-type and boolean columns share one
    # category namespace.
    ref_data = ref_series.dropna().astype(str)
    new_data = new_series.dropna().astype(str)

    if ref_data.empty or new_data.empty:
        return _drift_skipped(col, "categorical", "insufficient data (all nulls).")

    ref_counts = ref_data.value_counts()
    new_counts = new_data.value_counts()
    # Sorted so the contingency table has a stable row order between runs.
    categories = sorted(set(ref_counts.index).union(new_counts.index))

    if len(categories) < 2:
        return _drift_skipped(
            col,
            "categorical",
            "less than 2 unique categories found.",
            reference_value=ref_counts.to_dict(),
            new_value=new_counts.to_dict(),
        )

    # One row per category, columns are (reference count, new count).
    contingency_table = np.array(
        [[ref_counts.get(cat, 0), new_counts.get(cat, 0)] for cat in categories]
    )

    try:
        # chi2_contingency returns: chi2, p, dof, expected
        _, p_value, _, _ = chi2_contingency(contingency_table)
        p_value = float(p_value)
    except ValueError:
        # Raised by scipy for a degenerate table; fall back to "no drift"
        # instead of aborting the whole validation run. Anything else is a
        # genuine bug and is allowed to propagate.
        p_value = 1.0

    ref_dist = (ref_counts / len(ref_data)).to_dict()
    new_dist = (new_counts / len(new_data)).to_dict()

    return _drift_verdict(
        col, "categorical", "Chi-Square", p_value, p_threshold, ref_dist, new_dist
    )


def check_drift(
    ref_df: pd.DataFrame,
    new_df: pd.DataFrame,
    columns: List[str],
    p_threshold: float,
) -> List[CheckResult]:
    """Performs statistical drift tests on the specified columns.

    Uses:
    - Kolmogorov-Smirnov test when the column is numeric (and not boolean)
      in both DataFrames.
    - Chi-Square test of independence for every other column, including
      boolean, object and categorical data.

    Nulls are dropped before testing. A column that is entirely null on
    either side, or that has fewer than two distinct categories overall, is
    reported as PASSED/INFO with a "Skipped" message and a p-value of 1.0.

    Args:
        ref_df: The reference pandas DataFrame.
        new_df: The new pandas DataFrame.
        columns: The list of common columns to check. Each must exist in both
            DataFrames.
        p_threshold: The significance level (alpha) below which drift is detected.

    Returns:
        A list with one CheckResult per entry in ``columns``, in the same order.
    """
    results = []

    for col in columns:
        ref_series = ref_df[col]
        new_series = new_df[col]

        if _is_continuous(ref_series) and _is_continuous(new_series):
            handler = _numerical_drift
        else:
            handler = _categorical_drift

        results.append(handler(col, ref_series, new_series, p_threshold))

    return results
@@ -0,0 +1,71 @@
1
+ """Null rate validation checks for DriftGuard."""
2
+
3
+ import pandas as pd
4
+ from typing import List
5
+ from driftguard.utils import CheckResult
6
+
7
+
8
def check_nulls(
    ref_df: pd.DataFrame,
    new_df: pd.DataFrame,
    columns: List[str],
    null_threshold: float,
) -> List[CheckResult]:
    """Report how each column's null rate moved between the two DataFrames.

    A column fails when its null rate in ``new_df`` exceeds its null rate in
    ``ref_df`` by more than ``null_threshold``. Failures are CRITICAL when the
    new column is entirely null and WARNING otherwise.

    Args:
        ref_df: The reference pandas DataFrame.
        new_df: The new pandas DataFrame.
        columns: The list of common columns to check.
        null_threshold: The maximum allowed increase in null rate (absolute).

    Returns:
        One CheckResult per entry in ``columns``, in the same order.
    """

    def _assess(col: str) -> CheckResult:
        baseline_rate = float(ref_df[col].isnull().mean())
        incoming_rate = float(new_df[col].isnull().mean())
        delta = incoming_rate - baseline_rate

        if delta > null_threshold:
            # A fully-null column is escalated; any other breach is a warning.
            if incoming_rate == 1.0:
                level = "CRITICAL"
            else:
                level = "WARNING"
            return CheckResult(
                column=col,
                check_type="nulls",
                metric_name="null_rate_increase",
                status="FAILED",
                severity=level,
                message=(
                    f"Null rate for column '{col}' increased from {baseline_rate:.2%} "
                    f"to {incoming_rate:.2%} (increase of {delta:.2%} exceeds "
                    f"threshold of {null_threshold:.2%})."
                ),
                reference_value=baseline_rate,
                new_value=incoming_rate,
            )

        return CheckResult(
            column=col,
            check_type="nulls",
            metric_name="null_rate_check",
            status="PASSED",
            severity="INFO",
            message=(
                f"Null rate for column '{col}' is within acceptable limits "
                f"(increase: {delta:.2%})."
            ),
            reference_value=baseline_rate,
            new_value=incoming_rate,
        )

    return [_assess(col) for col in columns]
@@ -0,0 +1,95 @@
1
+ """Schema validation checks for DriftGuard."""
2
+
3
+ import pandas as pd
4
+ from typing import List
5
+ from driftguard.utils import CheckResult
6
+
7
+
8
def check_schema(ref_df: pd.DataFrame, new_df: pd.DataFrame) -> List[CheckResult]:
    """Validates the schema of the new DataFrame against the reference DataFrame.

    Checks for:
    - Missing columns: Columns present in ref_df but not in new_df.
    - New columns: Columns present in new_df but not in ref_df.
    - Data type mismatches: Columns present in both but with different dtypes.

    Results are grouped as missing columns, then new columns, then one result
    per shared column. Within each group the entries follow the column order
    of the DataFrame they come from, so repeated runs on the same data
    produce identically ordered output.

    Args:
        ref_df: The reference pandas DataFrame.
        new_df: The new pandas DataFrame to validate.

    Returns:
        A list of CheckResult objects containing the validation details.
    """
    results = []

    # Sets give O(1) membership tests; the ordered lists drive iteration so
    # the output does not depend on hash ordering. dict.fromkeys drops
    # repeated labels while keeping first-seen order.
    ref_cols = set(ref_df.columns)
    new_cols = set(new_df.columns)
    ref_order = list(dict.fromkeys(ref_df.columns))
    new_order = list(dict.fromkeys(new_df.columns))

    # Detect missing columns
    for col in ref_order:
        if col in new_cols:
            continue
        results.append(
            CheckResult(
                column=col,
                check_type="schema",
                metric_name="missing_column",
                status="FAILED",
                severity="CRITICAL",
                message=f"Column '{col}' is missing in the new dataset.",
                reference_value=str(ref_df[col].dtype),
                new_value=None,
            )
        )

    # Detect new columns
    for col in new_order:
        if col in ref_cols:
            continue
        results.append(
            CheckResult(
                column=col,
                check_type="schema",
                metric_name="new_column",
                status="FAILED",
                severity="INFO",
                message=f"New column '{col}' detected in the new dataset.",
                reference_value=None,
                new_value=str(new_df[col].dtype),
            )
        )

    # Detect dtype mismatches on the columns both DataFrames share
    for col in ref_order:
        if col not in new_cols:
            continue

        ref_dtype = ref_df[col].dtype
        new_dtype = new_df[col].dtype

        if ref_dtype != new_dtype:
            results.append(
                CheckResult(
                    column=col,
                    check_type="schema",
                    metric_name="dtype_mismatch",
                    status="FAILED",
                    severity="CRITICAL",
                    message=(
                        f"Data type mismatch for column '{col}': "
                        f"expected {ref_dtype}, got {new_dtype}."
                    ),
                    reference_value=str(ref_dtype),
                    new_value=str(new_dtype),
                )
            )
        else:
            results.append(
                CheckResult(
                    column=col,
                    check_type="schema",
                    metric_name="schema_validation",
                    status="PASSED",
                    severity="INFO",
                    message=f"Schema and data type validated for column '{col}'.",
                    reference_value=str(ref_dtype),
                    new_value=str(new_dtype),
                )
            )

    return results