datasentry 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: datasentry
3
+ Version: 0.1.0
4
+ Summary: Data-centric ML inspection and auto-remediation toolkit
5
+ Author: Ankush Sharma
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: numpy
9
+ Requires-Dist: pandas
10
+ Requires-Dist: scikit-learn
11
+ Requires-Dist: imbalanced-learn
12
+ Provides-Extra: dev
13
+ Requires-Dist: pytest; extra == "dev"
14
+ Requires-Dist: black; extra == "dev"
15
+ Requires-Dist: flake8; extra == "dev"
File without changes
@@ -0,0 +1,22 @@
1
+ """
2
+ DataSentry
3
+ ==========
4
+
5
+ A lightweight, production-ready, data-centric machine learning inspection library.
6
+
7
+ Main entry point:
8
+ analyze()
9
+
10
+ Example:
11
+ --------
12
+ >>> from datasentry import analyze
13
+ >>> report = analyze(X, y)
14
+ >>> report.show()
15
+ """
16
+
17
+ from .analyzer import analyze
18
+ from .config import DataSentryConfig
19
+
20
+ __all__ = ["analyze", "DataSentryConfig"]
21
+
22
+ __version__ = "0.1.0"
@@ -0,0 +1,57 @@
1
+ from typing import Union
2
+ import numpy as np
3
+ import pandas as pd
4
+
5
+
6
+ ArrayLike = Union[np.ndarray, pd.DataFrame, pd.Series]
7
+
8
+
9
def to_numpy(X: ArrayLike) -> np.ndarray:
    """
    Convert input features to a validated 2-D NumPy array.

    Parameters
    ----------
    X : array-like
        Features as an ndarray, DataFrame, or Series.

    Returns
    -------
    np.ndarray
        2-D numeric array; 1-D input is reshaped to a single column.

    Raises
    ------
    ValueError
        If the data is non-numeric or has more than 2 dimensions.
    """
    if isinstance(X, (pd.DataFrame, pd.Series)):
        # .to_numpy() is the recommended accessor; .values is legacy.
        X = X.to_numpy()

    X = np.asarray(X)

    if X.ndim == 1:
        X = X.reshape(-1, 1)
    elif X.ndim != 2:
        # The contract promises a 2-D result; fail loudly instead of
        # silently passing higher-dimensional data to the detectors.
        raise ValueError("Features must be 1- or 2-dimensional.")

    if not np.issubdtype(X.dtype, np.number):
        raise ValueError("All features must be numeric.")

    return X
35
+
36
+
37
def validate_y(y: ArrayLike) -> np.ndarray:
    """
    Validate the target vector and return it as a NumPy array.

    Guarantees the returned array is one-dimensional and non-empty,
    raising ``ValueError`` otherwise.
    """
    raw = y.values if isinstance(y, pd.Series) else y
    arr = np.asarray(raw)

    if arr.ndim != 1:
        raise ValueError("Target y must be 1-dimensional.")

    if len(arr) == 0:
        raise ValueError("Target y cannot be empty.")

    return arr
@@ -0,0 +1,41 @@
1
+ from typing import Optional
2
+ from .config import DataSentryConfig
3
+ from .detectors import imbalance, outliers, shift, leakage, label_noise
4
+ from .report import Report
5
+ from .fixer import AutoFixer
6
+
7
+
8
def analyze(X, y, X_test=None, config: Optional[DataSentryConfig] = None) -> Report:
    """
    Run the full data inspection pipeline.

    Parameters
    ----------
    X : array-like
        Training features.
    y : array-like
        Target labels.
    X_test : array-like, optional
        Test features used for drift detection.
    config : DataSentryConfig, optional
        Custom configuration; defaults are used when omitted.

    Returns
    -------
    Report
        Aggregated findings with an attached AutoFixer.
    """
    if config is None:
        config = DataSentryConfig()

    findings = {}
    findings["imbalance"] = imbalance.detect(y, config.imbalance_threshold)
    findings["outliers"] = outliers.detect(
        X, config.outlier_contamination, config.random_state
    )
    findings["distribution_shift"] = shift.detect(X, X_test, config.drift_threshold)
    findings["data_leakage"] = leakage.detect(X, y, config.leakage_threshold)
    findings["label_noise"] = label_noise.detect(
        X, y, config.noise_threshold, config.random_state
    )

    report = Report(findings)
    report.fixer = AutoFixer(findings, config)

    return report
@@ -0,0 +1,40 @@
1
+ from dataclasses import dataclass
2
+
3
+
4
@dataclass
class DataSentryConfig:
    """
    Configuration object for DataSentry.

    Parameters
    ----------
    imbalance_threshold : float
        Majority/minority class ratio above which imbalance is flagged.
    outlier_contamination : float
        Expected proportion of outliers (0 < value < 0.5).
    drift_threshold : float
        PSI threshold for distribution shift detection.
    leakage_threshold : float
        Mutual information threshold for leakage detection.
    noise_threshold : float
        Label noise ratio threshold.
    random_state : int
        Random seed for reproducibility.

    Raises
    ------
    ValueError
        If any threshold is outside its valid range.
    """

    imbalance_threshold: float = 3.0
    outlier_contamination: float = 0.05
    drift_threshold: float = 0.2
    leakage_threshold: float = 0.5
    noise_threshold: float = 0.2
    random_state: int = 42

    def __post_init__(self) -> None:
        # Validate every threshold up front so misconfiguration fails
        # fast instead of producing misleading detector output.
        if not (0 < self.outlier_contamination < 0.5):
            raise ValueError("outlier_contamination must be between 0 and 0.5.")

        if self.imbalance_threshold <= 0:
            raise ValueError("imbalance_threshold must be positive.")

        if self.noise_threshold < 0:
            raise ValueError("noise_threshold must be >= 0.")

        # Previously unvalidated; negative values would flag every
        # dataset as shifted/leaky, which is never intended.
        if self.drift_threshold < 0:
            raise ValueError("drift_threshold must be >= 0.")

        if self.leakage_threshold < 0:
            raise ValueError("leakage_threshold must be >= 0.")
@@ -0,0 +1,37 @@
1
+ import numpy as np
2
+ from typing import Dict
3
+ from .._utils import validate_y
4
+
5
+
6
def detect(y: np.ndarray, threshold: float = 3.0) -> Dict[str, float]:
    """
    Detect class imbalance via the normalized majority/minority ratio.

    The ``imbalance_score`` lies in [0, 1]:
        0  -> perfectly balanced classes
        ~1 -> highly imbalanced classes
    """
    y = validate_y(y)

    counts = np.unique(y, return_counts=True)[1]

    # A single class (or an empty minority) is maximally imbalanced.
    if len(counts) < 2 or counts.min() == 0:
        ratio, score = float("inf"), 1.0
    else:
        ratio = counts.max() / counts.min()
        score = (ratio - 1) / ratio  # maps ratio in [1, inf) onto [0, 1)

    return {
        "imbalance_score": float(score),
        "is_problematic": bool(ratio > threshold),
        "severity": float(score),
    }
@@ -0,0 +1,33 @@
1
+ from typing import Dict
2
+ import numpy as np
3
+ from sklearn.ensemble import RandomForestClassifier
4
+ from sklearn.model_selection import StratifiedKFold
5
+ from .._utils import to_numpy, validate_y
6
+
7
+
8
def detect(X, y, threshold: float, random_state: int) -> Dict[str, float]:
    """
    Detect label noise using cross-validated prediction disagreement.

    A random forest is trained out-of-fold; the fraction of samples
    whose out-of-fold prediction disagrees with the given label is
    reported as ``noise_ratio``.

    Parameters
    ----------
    X, y : array-like
        Features and labels.
    threshold : float
        Noise ratio above which the dataset is flagged.
    random_state : int
        Seed for the CV splitter and the forest.
    """
    X = to_numpy(X)
    y = validate_y(y)

    unique, counts = np.unique(y, return_counts=True)
    if len(unique) < 2:
        # Disagreement is meaningless with a single class.
        return {"noise_ratio": 0.0, "is_problematic": False, "severity": 0.0}

    # StratifiedKFold requires every class to appear in every fold;
    # cap n_splits at the smallest class count so rare classes do not
    # raise a hard error.
    n_splits = int(min(5, counts.min()))
    if n_splits < 2:
        # Cannot cross-validate; report a clean result rather than crash.
        return {"noise_ratio": 0.0, "is_problematic": False, "severity": 0.0}

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Same dtype as y so non-numeric labels (e.g. strings) round-trip;
    # np.zeros(len(y)) would force float and break on such labels.
    preds = np.empty(len(y), dtype=y.dtype)

    for train_idx, val_idx in skf.split(X, y):
        clf = RandomForestClassifier(random_state=random_state)
        clf.fit(X[train_idx], y[train_idx])
        preds[val_idx] = clf.predict(X[val_idx])

    noise_ratio = float((preds != y).mean())

    return {
        "noise_ratio": noise_ratio,
        "is_problematic": bool(noise_ratio > threshold),
        "severity": noise_ratio,
    }
@@ -0,0 +1,22 @@
1
+ from typing import Dict
2
+ import numpy as np
3
+ from sklearn.feature_selection import mutual_info_classif
4
+ from .._utils import to_numpy, validate_y
5
+
6
+
7
def detect(X, y, threshold: float, random_state=None) -> Dict[str, float]:
    """
    Detect potential feature leakage via mutual information.

    A feature whose mutual information with the target exceeds
    ``threshold`` is suspiciously predictive and may leak the label.

    Parameters
    ----------
    X, y : array-like
        Features and labels.
    threshold : float
        Mutual information value above which leakage is flagged.
    random_state : int, optional
        Seed forwarded to the MI estimator. The kNN-based estimate is
        nondeterministic without it; default ``None`` preserves the
        original behavior.
    """
    X = to_numpy(X)
    y = validate_y(y)

    mi_scores = mutual_info_classif(X, y, random_state=random_state)
    max_mi = float(np.max(mi_scores))

    return {
        "max_mutual_information": max_mi,
        "is_problematic": bool(max_mi > threshold),
        "severity": max_mi,
    }
@@ -0,0 +1,25 @@
1
+ from typing import Dict
2
+ from sklearn.ensemble import IsolationForest
3
+ from .._utils import to_numpy
4
+
5
+
6
def detect(X, contamination: float, random_state: int) -> Dict[str, float]:
    """
    Detect outliers with an Isolation Forest.

    The reported ``outlier_ratio`` is the fraction of samples the
    model labels as anomalous (prediction == -1).
    """
    features = to_numpy(X)

    forest = IsolationForest(
        contamination=contamination,
        random_state=random_state,
    )
    labels = forest.fit_predict(features)

    ratio = float((labels == -1).mean())

    return {
        "outlier_ratio": ratio,
        "is_problematic": bool(ratio > contamination),
        "severity": ratio,
    }
@@ -0,0 +1,46 @@
1
+ import numpy as np
2
+ from typing import Dict
3
+ from .._utils import to_numpy
4
+
5
+
6
+ def _psi(expected: np.ndarray, actual: np.ndarray, bins: int = 10) -> float:
7
+ """
8
+ Compute Population Stability Index.
9
+ """
10
+
11
+ breakpoints = np.percentile(expected, np.linspace(0, 100, bins + 1))
12
+
13
+ expected_counts = np.histogram(expected, bins=breakpoints)[0]
14
+ actual_counts = np.histogram(actual, bins=breakpoints)[0]
15
+
16
+ expected_perc = expected_counts / len(expected)
17
+ actual_perc = actual_counts / len(actual)
18
+
19
+ psi = np.sum(
20
+ (expected_perc - actual_perc)
21
+ * np.log((expected_perc + 1e-12) / (actual_perc + 1e-12))
22
+ )
23
+
24
+ return float(psi)
25
+
26
+
27
def detect(X_train, X_test, threshold: float) -> Dict[str, float]:
    """
    Detect distribution shift between train and test features.

    Computes the PSI per feature and reports the mean. When no test
    set is supplied, drift detection is skipped and a clean result is
    returned.
    """
    if X_test is None:
        return {"psi": 0.0, "is_problematic": False, "severity": 0.0}

    train = to_numpy(X_train)
    test = to_numpy(X_test)

    per_feature = []
    for col in range(train.shape[1]):
        per_feature.append(_psi(train[:, col], test[:, col]))

    mean_psi = float(np.mean(per_feature))

    return {
        "psi": mean_psi,
        "is_problematic": bool(mean_psi > threshold),
        "severity": mean_psi,
    }
@@ -0,0 +1,28 @@
1
+ from typing import Tuple
2
+ import numpy as np
3
+ from imblearn.over_sampling import SMOTE
4
+ from ._utils import to_numpy, validate_y
5
+
6
+
7
class AutoFixer:
    """
    Apply automatic corrections for issues flagged by the detectors.

    Currently only class imbalance is remediated (SMOTE oversampling);
    other issue types are reported but left untouched.
    """

    def __init__(self, issues: dict, config):
        # Findings produced by analyze() plus the active configuration.
        self.issues = issues
        self.config = config

    def fix(self, X, y) -> Tuple[np.ndarray, np.ndarray]:
        """
        Return a corrected (X, y) pair.

        When the imbalance detector flagged the dataset, minority
        classes are oversampled with SMOTE; otherwise the validated
        inputs are returned unchanged.
        """
        X = to_numpy(X)
        y = validate_y(y)

        imbalance_result = self.issues.get("imbalance", {})
        if imbalance_result.get("is_problematic"):
            sampler = SMOTE(random_state=self.config.random_state)
            X, y = sampler.fit_resample(X, y)

        return X, y
@@ -0,0 +1,36 @@
1
+ from typing import Dict
2
+
3
+
4
class Report:
    """
    Inspection report returned by ``analyze()``.

    Attributes
    ----------
    issues : dict
        Mapping of issue name -> detector result dict; each result is
        expected to carry ``is_problematic`` and ``severity`` keys.
    score : int
        Overall data health score in [0, 100].
    fixer : AutoFixer or None
        Remediation helper, injected by ``analyze()`` after creation.
    """

    def __init__(self, issues: Dict):
        self.issues = issues
        self.score = self._compute_score()
        self.fixer = None  # injected later by analyze()

    def _compute_score(self) -> int:
        """
        Derive a 0-100 health score from the detected issues.

        Each problematic issue subtracts up to 30 points, scaled by
        its severity; the score never drops below zero.
        """
        score = 100

        for issue in self.issues.values():
            # Tolerate partially populated detector results instead of
            # raising KeyError on a missing field.
            if issue.get("is_problematic"):
                score -= min(30, issue.get("severity", 0.0) * 50)

        return max(int(score), 0)

    def show(self) -> None:
        """
        Pretty-print the report to the console.
        """
        print("\n=== DataSentry Report ===\n")

        for name, result in self.issues.items():
            print(f"[{name.upper()}]")
            for k, v in result.items():
                print(f"  {k}: {v}")
            print()

        print(f"Overall Health Score: {self.score}/100\n")
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: datasentry
3
+ Version: 0.1.0
4
+ Summary: Data-centric ML inspection and auto-remediation toolkit
5
+ Author: Ankush Sharma
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: numpy
9
+ Requires-Dist: pandas
10
+ Requires-Dist: scikit-learn
11
+ Requires-Dist: imbalanced-learn
12
+ Provides-Extra: dev
13
+ Requires-Dist: pytest; extra == "dev"
14
+ Requires-Dist: black; extra == "dev"
15
+ Requires-Dist: flake8; extra == "dev"
@@ -0,0 +1,21 @@
1
+ README.md
2
+ pyproject.toml
3
+ datasentry/__init__.py
4
+ datasentry/_utils.py
5
+ datasentry/analyzer.py
6
+ datasentry/config.py
7
+ datasentry/fixer.py
8
+ datasentry/report.py
9
+ datasentry.egg-info/PKG-INFO
10
+ datasentry.egg-info/SOURCES.txt
11
+ datasentry.egg-info/dependency_links.txt
12
+ datasentry.egg-info/requires.txt
13
+ datasentry.egg-info/top_level.txt
14
+ datasentry/detectors/imbalance.py
15
+ datasentry/detectors/label_noise.py
16
+ datasentry/detectors/leakage.py
17
+ datasentry/detectors/outliers.py
18
+ datasentry/detectors/shift.py
19
+ tests/test_analyzer.py
20
+ tests/test_config.py
21
+ tests/test_detectors.py
@@ -0,0 +1,9 @@
1
+ numpy
2
+ pandas
3
+ scikit-learn
4
+ imbalanced-learn
5
+
6
+ [dev]
7
+ pytest
8
+ black
9
+ flake8
@@ -0,0 +1 @@
1
+ datasentry
@@ -0,0 +1,30 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "datasentry"
7
+ version = "0.1.0"
8
+ description = "Data-centric ML inspection and auto-remediation toolkit"
9
+ authors = [
10
+ {name = "Ankush Sharma"}
11
+ ]
12
+ readme = "README.md"
13
+ requires-python = ">=3.9"
14
+
15
+ dependencies = [
16
+ "numpy",
17
+ "pandas",
18
+ "scikit-learn",
19
+ "imbalanced-learn"
20
+ ]
21
+
22
+ [project.optional-dependencies]
23
+ dev = [
24
+ "pytest",
25
+ "black",
26
+ "flake8"
27
+ ]
28
+
29
+ [tool.pytest.ini_options]
30
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,11 @@
1
+ import numpy as np
2
+ from datasentry import analyze
3
+
4
+
5
def test_analyze_runs():
    """Smoke test: analyze() produces a report with a bounded score."""
    features = np.random.rand(100, 5)
    labels = np.random.randint(0, 2, 100)

    report = analyze(features, labels)

    assert hasattr(report, "score")
    assert report.score <= 100
@@ -0,0 +1,12 @@
1
+ import pytest
2
+ from datasentry.config import DataSentryConfig
3
+
4
+
5
def test_default_config():
    """The default configuration exposes its documented defaults."""
    cfg = DataSentryConfig()
    assert cfg.outlier_contamination == 0.05
8
+
9
+
10
def test_invalid_contamination():
    """Out-of-range contamination is rejected at construction time."""
    with pytest.raises(ValueError):
        DataSentryConfig(outlier_contamination=0.9)
@@ -0,0 +1,14 @@
1
+ import numpy as np
2
+ from datasentry.detectors import imbalance
3
+
4
+
5
def test_imbalance_detect_balanced():
    """A perfectly balanced target yields a low imbalance score."""
    target = np.array([0, 1, 0, 1])
    result = imbalance.detect(target, threshold=3.0)
    assert result["imbalance_score"] < 0.5
9
+
10
+
11
def test_imbalance_detect_imbalanced():
    """A 4:1 class ratio exceeds the 3.0 threshold and is flagged."""
    target = np.array([0, 0, 0, 0, 1])
    result = imbalance.detect(target, threshold=3.0)
    assert result["is_problematic"] is True