datasentry 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datasentry-0.1.0/PKG-INFO +15 -0
- datasentry-0.1.0/README.md +0 -0
- datasentry-0.1.0/datasentry/__init__.py +22 -0
- datasentry-0.1.0/datasentry/_utils.py +57 -0
- datasentry-0.1.0/datasentry/analyzer.py +41 -0
- datasentry-0.1.0/datasentry/config.py +40 -0
- datasentry-0.1.0/datasentry/detectors/imbalance.py +37 -0
- datasentry-0.1.0/datasentry/detectors/label_noise.py +33 -0
- datasentry-0.1.0/datasentry/detectors/leakage.py +22 -0
- datasentry-0.1.0/datasentry/detectors/outliers.py +25 -0
- datasentry-0.1.0/datasentry/detectors/shift.py +46 -0
- datasentry-0.1.0/datasentry/fixer.py +28 -0
- datasentry-0.1.0/datasentry/report.py +36 -0
- datasentry-0.1.0/datasentry.egg-info/PKG-INFO +15 -0
- datasentry-0.1.0/datasentry.egg-info/SOURCES.txt +21 -0
- datasentry-0.1.0/datasentry.egg-info/dependency_links.txt +1 -0
- datasentry-0.1.0/datasentry.egg-info/requires.txt +9 -0
- datasentry-0.1.0/datasentry.egg-info/top_level.txt +1 -0
- datasentry-0.1.0/pyproject.toml +30 -0
- datasentry-0.1.0/setup.cfg +4 -0
- datasentry-0.1.0/tests/test_analyzer.py +11 -0
- datasentry-0.1.0/tests/test_config.py +12 -0
- datasentry-0.1.0/tests/test_detectors.py +14 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datasentry
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Data-centric ML inspection and auto-remediation toolkit
|
|
5
|
+
Author: Ankush Sharma
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: numpy
|
|
9
|
+
Requires-Dist: pandas
|
|
10
|
+
Requires-Dist: scikit-learn
|
|
11
|
+
Requires-Dist: imbalanced-learn
|
|
12
|
+
Provides-Extra: dev
|
|
13
|
+
Requires-Dist: pytest; extra == "dev"
|
|
14
|
+
Requires-Dist: black; extra == "dev"
|
|
15
|
+
Requires-Dist: flake8; extra == "dev"
|
|
File without changes
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DataSentry
|
|
3
|
+
==========
|
|
4
|
+
|
|
5
|
+
A lightweight, production-ready, data-centric machine learning inspection library.
|
|
6
|
+
|
|
7
|
+
Main entry point:
|
|
8
|
+
analyze()
|
|
9
|
+
|
|
10
|
+
Example:
|
|
11
|
+
--------
|
|
12
|
+
>>> from datasentry import analyze
|
|
13
|
+
>>> report = analyze(X, y)
|
|
14
|
+
>>> report.show()
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from .analyzer import analyze
|
|
18
|
+
from .config import DataSentryConfig
|
|
19
|
+
|
|
20
|
+
__all__ = ["analyze", "DataSentryConfig"]
|
|
21
|
+
|
|
22
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from typing import Union
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
ArrayLike = Union[np.ndarray, pd.DataFrame, pd.Series]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def to_numpy(X: ArrayLike) -> np.ndarray:
    """
    Convert input features to a validated 2D NumPy array.

    Ensures:
    - Numeric dtype
    - 2D shape (scalars and 1D inputs are promoted to a single column)

    Parameters
    ----------
    X : array-like
        Feature matrix (ndarray, DataFrame, Series, or nested sequence).

    Returns
    -------
    np.ndarray
        A 2D numeric array.

    Raises
    ------
    ValueError
        If data is non-numeric or has more than 2 dimensions.
    """

    if isinstance(X, (pd.DataFrame, pd.Series)):
        X = X.values

    X = np.asarray(X)

    # Promote scalars and 1D vectors to a single-column 2D matrix.
    if X.ndim == 0:
        X = X.reshape(1, 1)
    elif X.ndim == 1:
        X = X.reshape(-1, 1)
    elif X.ndim > 2:
        # Bug fix: the original silently accepted >2D arrays despite
        # documenting a 2D guarantee; reject them explicitly.
        raise ValueError("Features must be at most 2-dimensional.")

    if not np.issubdtype(X.dtype, np.number):
        raise ValueError("All features must be numeric.")

    return X
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def validate_y(y: ArrayLike) -> np.ndarray:
    """
    Validate a target vector and return it as a NumPy array.

    The input must be one-dimensional and contain at least one element.

    Raises
    ------
    ValueError
        If y is not 1D or is empty.
    """

    values = y.values if isinstance(y, pd.Series) else y
    values = np.asarray(values)

    # Guard clauses: check shape first, then emptiness.
    if values.ndim != 1:
        raise ValueError("Target y must be 1-dimensional.")
    if len(values) == 0:
        raise ValueError("Target y cannot be empty.")

    return values
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
from .config import DataSentryConfig
|
|
3
|
+
from .detectors import imbalance, outliers, shift, leakage, label_noise
|
|
4
|
+
from .report import Report
|
|
5
|
+
from .fixer import AutoFixer
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def analyze(X, y, X_test=None, config: Optional[DataSentryConfig] = None) -> Report:
    """
    Run the full data inspection pipeline.

    Parameters
    ----------
    X : array-like
        Training features.
    y : array-like
        Target labels.
    X_test : array-like, optional
        Test features for drift detection.
    config : DataSentryConfig, optional
        Custom configuration; defaults are used when omitted.

    Returns
    -------
    Report
        Aggregated detector results with an attached AutoFixer.
    """

    if config is None:
        config = DataSentryConfig()

    # Run every detector; each returns a small result dict with at least
    # "is_problematic" and "severity" keys that Report relies on.
    issues = {}
    issues["imbalance"] = imbalance.detect(y, config.imbalance_threshold)
    issues["outliers"] = outliers.detect(X, config.outlier_contamination, config.random_state)
    issues["distribution_shift"] = shift.detect(X, X_test, config.drift_threshold)
    issues["data_leakage"] = leakage.detect(X, y, config.leakage_threshold)
    issues["label_noise"] = label_noise.detect(X, y, config.noise_threshold, config.random_state)

    report = Report(issues)
    report.fixer = AutoFixer(issues, config)
    return report
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
@dataclass
class DataSentryConfig:
    """
    Configuration object for DataSentry.

    Parameters
    ----------
    imbalance_threshold : float
        Majority/minority class ratio above which imbalance is flagged.
    outlier_contamination : float
        Expected proportion of outliers (0 < value < 0.5).
    drift_threshold : float
        PSI threshold for distribution shift detection.
    leakage_threshold : float
        Mutual information threshold for leakage detection.
    noise_threshold : float
        Label noise ratio threshold.
    random_state : int
        Random seed for reproducibility.

    Raises
    ------
    ValueError
        If any threshold is outside its valid range.
    """

    imbalance_threshold: float = 3.0
    outlier_contamination: float = 0.05
    drift_threshold: float = 0.2
    leakage_threshold: float = 0.5
    noise_threshold: float = 0.2
    random_state: int = 42

    def __post_init__(self) -> None:
        # Fail fast at construction time rather than deep inside a detector.
        if not (0 < self.outlier_contamination < 0.5):
            raise ValueError("outlier_contamination must be between 0 and 0.5.")

        if self.imbalance_threshold <= 0:
            raise ValueError("imbalance_threshold must be positive.")

        if self.noise_threshold < 0:
            raise ValueError("noise_threshold must be >= 0.")

        # Consistency fix: these two fields were previously unvalidated,
        # unlike noise_threshold; negative values are nonsensical.
        if self.drift_threshold < 0:
            raise ValueError("drift_threshold must be >= 0.")

        if self.leakage_threshold < 0:
            raise ValueError("leakage_threshold must be >= 0.")
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from typing import Dict
|
|
3
|
+
from .._utils import validate_y
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def detect(y: np.ndarray, threshold: float = 3.0) -> Dict[str, float]:
    """
    Detect class imbalance using a normalized majority/minority ratio.

    The imbalance_score is scaled between 0 and 1:
        0  -> perfectly balanced
        ~1 -> highly imbalanced
    """

    y = validate_y(y)

    _, counts = np.unique(y, return_counts=True)

    # A single observed class (or an empty class) counts as maximal imbalance.
    if len(counts) < 2 or np.min(counts) == 0:
        ratio = float("inf")
        imbalance_score = 1.0
    else:
        ratio = np.max(counts) / np.min(counts)
        imbalance_score = (ratio - 1) / ratio  # normalized 0-1

    return {
        "imbalance_score": float(imbalance_score),
        "is_problematic": bool(ratio > threshold),
        "severity": float(imbalance_score),
    }
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
import numpy as np
|
|
3
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
4
|
+
from sklearn.model_selection import StratifiedKFold
|
|
5
|
+
from .._utils import to_numpy, validate_y
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def detect(X, y, threshold: float, random_state: int) -> Dict[str, float]:
    """
    Detect label noise using cross-validated disagreement.

    A RandomForest is trained on each of 5 stratified folds; out-of-fold
    predictions are compared against the given labels and the fraction of
    disagreements is reported as the noise ratio.

    Returns
    -------
    dict
        noise_ratio, is_problematic, severity.
    """

    X = to_numpy(X)
    y = validate_y(y)

    # With a single class there is nothing to disagree about.
    if len(np.unique(y)) < 2:
        return {"noise_ratio": 0.0, "is_problematic": False, "severity": 0.0}

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    # Bug fix: the original used np.zeros(len(y)) (float64), which raises
    # for non-numeric label dtypes (e.g. strings) and silently casts
    # otherwise. Allocate with the label dtype instead.
    preds = np.empty_like(y)

    for train_idx, val_idx in skf.split(X, y):
        clf = RandomForestClassifier(random_state=random_state)
        clf.fit(X[train_idx], y[train_idx])
        preds[val_idx] = clf.predict(X[val_idx])

    noise_ratio = float((preds != y).mean())

    return {
        "noise_ratio": noise_ratio,
        "is_problematic": bool(noise_ratio > threshold),
        "severity": noise_ratio,
    }
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
import numpy as np
|
|
3
|
+
from sklearn.feature_selection import mutual_info_classif
|
|
4
|
+
from .._utils import to_numpy, validate_y
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def detect(X, y, threshold: float, random_state=None) -> Dict[str, float]:
    """
    Detect potential feature leakage via mutual information.

    A feature whose mutual information with the target exceeds the
    threshold is suspiciously predictive and may leak target data.

    Parameters
    ----------
    X, y : array-like
        Features and target.
    threshold : float
        Mutual information value above which leakage is flagged.
    random_state : int, optional
        Seed for the nearest-neighbor MI estimator. Reproducibility fix:
        the original call omitted it, so results varied run to run even
        though the package threads config.random_state everywhere else.
    """

    X = to_numpy(X)
    y = validate_y(y)

    mi_scores = mutual_info_classif(X, y, random_state=random_state)
    max_mi = float(np.max(mi_scores))

    return {
        "max_mutual_information": max_mi,
        "is_problematic": bool(max_mi > threshold),
        "severity": max_mi,
    }
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
from sklearn.ensemble import IsolationForest
|
|
3
|
+
from .._utils import to_numpy
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def detect(X, contamination: float, random_state: int) -> Dict[str, float]:
    """
    Detect outliers using an Isolation Forest.

    NOTE(review): the model is fitted with the same `contamination` used
    as the flagging threshold, so the reported ratio will typically sit
    at (not above) that threshold — confirm this sensitivity is intended.
    """

    X = to_numpy(X)

    forest = IsolationForest(
        contamination=contamination,
        random_state=random_state,
    )

    # IsolationForest marks outliers with -1, inliers with +1.
    labels = forest.fit_predict(X)
    outlier_ratio = float((labels == -1).mean())

    return {
        "outlier_ratio": outlier_ratio,
        "is_problematic": bool(outlier_ratio > contamination),
        "severity": outlier_ratio,
    }
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from typing import Dict
|
|
3
|
+
from .._utils import to_numpy
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _psi(expected: np.ndarray, actual: np.ndarray, bins: int = 10) -> float:
|
|
7
|
+
"""
|
|
8
|
+
Compute Population Stability Index.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
breakpoints = np.percentile(expected, np.linspace(0, 100, bins + 1))
|
|
12
|
+
|
|
13
|
+
expected_counts = np.histogram(expected, bins=breakpoints)[0]
|
|
14
|
+
actual_counts = np.histogram(actual, bins=breakpoints)[0]
|
|
15
|
+
|
|
16
|
+
expected_perc = expected_counts / len(expected)
|
|
17
|
+
actual_perc = actual_counts / len(actual)
|
|
18
|
+
|
|
19
|
+
psi = np.sum(
|
|
20
|
+
(expected_perc - actual_perc)
|
|
21
|
+
* np.log((expected_perc + 1e-12) / (actual_perc + 1e-12))
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
return float(psi)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def detect(X_train, X_test, threshold: float) -> Dict[str, float]:
    """
    Detect distribution shift between train and test features.

    The mean per-feature PSI is compared against `threshold`. When no
    test set is supplied the check is skipped and reported as healthy.
    """

    # Nothing to compare against: report a clean result.
    if X_test is None:
        return {"psi": 0.0, "is_problematic": False, "severity": 0.0}

    X_train = to_numpy(X_train)
    X_test = to_numpy(X_test)

    psi_per_feature = []
    for col in range(X_train.shape[1]):
        psi_per_feature.append(_psi(X_train[:, col], X_test[:, col]))

    mean_psi = float(np.mean(psi_per_feature))

    return {
        "psi": mean_psi,
        "is_problematic": bool(mean_psi > threshold),
        "severity": mean_psi,
    }
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from typing import Tuple
|
|
2
|
+
import numpy as np
|
|
3
|
+
from imblearn.over_sampling import SMOTE
|
|
4
|
+
from ._utils import to_numpy, validate_y
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class AutoFixer:
    """
    Applies automatic corrections for detected data issues.

    Currently only class imbalance is remediated (SMOTE oversampling);
    other issue types are reported but left untouched.
    """

    def __init__(self, issues: dict, config):
        # Detector results keyed by issue name, plus the active config
        # (used for the random seed of the resampler).
        self.issues = issues
        self.config = config

    def fix(self, X, y) -> Tuple[np.ndarray, np.ndarray]:
        """
        Return (X, y) with automatic corrections applied.
        """

        X = to_numpy(X)
        y = validate_y(y)

        imbalance_result = self.issues.get("imbalance", {})
        if imbalance_result.get("is_problematic"):
            # Oversample minority classes to rebalance the target.
            resampler = SMOTE(random_state=self.config.random_state)
            X, y = resampler.fit_resample(X, y)

        return X, y
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class Report:
    """
    Inspection report returned by analyze().

    Holds per-detector results, an overall health score in [0, 100],
    and (once injected by the analyzer) an AutoFixer instance.
    """

    def __init__(self, issues: Dict):
        self.issues = issues
        self.score = self._compute_score()
        self.fixer = None  # injected later

    def _compute_score(self) -> int:
        """
        Start from 100 and deduct up to 30 points per problematic issue.
        """
        score = 100

        flagged = (res for res in self.issues.values() if res["is_problematic"])
        for res in flagged:
            # Severity is mapped onto points but capped so a single issue
            # can cost at most 30.
            score -= min(30, res["severity"] * 50)

        return max(int(score), 0)

    def show(self) -> None:
        """
        Pretty-print report to console.
        """

        print("\n=== DataSentry Report ===\n")

        for issue_name, details in self.issues.items():
            print(f"[{issue_name.upper()}]")
            for key, value in details.items():
                print(f"  {key}: {value}")
            print()

        print(f"Overall Health Score: {self.score}/100\n")
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datasentry
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Data-centric ML inspection and auto-remediation toolkit
|
|
5
|
+
Author: Ankush Sharma
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: numpy
|
|
9
|
+
Requires-Dist: pandas
|
|
10
|
+
Requires-Dist: scikit-learn
|
|
11
|
+
Requires-Dist: imbalanced-learn
|
|
12
|
+
Provides-Extra: dev
|
|
13
|
+
Requires-Dist: pytest; extra == "dev"
|
|
14
|
+
Requires-Dist: black; extra == "dev"
|
|
15
|
+
Requires-Dist: flake8; extra == "dev"
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
datasentry/__init__.py
|
|
4
|
+
datasentry/_utils.py
|
|
5
|
+
datasentry/analyzer.py
|
|
6
|
+
datasentry/config.py
|
|
7
|
+
datasentry/fixer.py
|
|
8
|
+
datasentry/report.py
|
|
9
|
+
datasentry.egg-info/PKG-INFO
|
|
10
|
+
datasentry.egg-info/SOURCES.txt
|
|
11
|
+
datasentry.egg-info/dependency_links.txt
|
|
12
|
+
datasentry.egg-info/requires.txt
|
|
13
|
+
datasentry.egg-info/top_level.txt
|
|
14
|
+
datasentry/detectors/imbalance.py
|
|
15
|
+
datasentry/detectors/label_noise.py
|
|
16
|
+
datasentry/detectors/leakage.py
|
|
17
|
+
datasentry/detectors/outliers.py
|
|
18
|
+
datasentry/detectors/shift.py
|
|
19
|
+
tests/test_analyzer.py
|
|
20
|
+
tests/test_config.py
|
|
21
|
+
tests/test_detectors.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
datasentry
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "datasentry"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Data-centric ML inspection and auto-remediation toolkit"
|
|
9
|
+
authors = [
|
|
10
|
+
{name = "Ankush Sharma"}
|
|
11
|
+
]
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.9"
|
|
14
|
+
|
|
15
|
+
dependencies = [
|
|
16
|
+
"numpy",
|
|
17
|
+
"pandas",
|
|
18
|
+
"scikit-learn",
|
|
19
|
+
"imbalanced-learn"
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[project.optional-dependencies]
|
|
23
|
+
dev = [
|
|
24
|
+
"pytest",
|
|
25
|
+
"black",
|
|
26
|
+
"flake8"
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
[tool.pytest.ini_options]
|
|
30
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from datasentry.config import DataSentryConfig
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def test_default_config():
    """Default construction succeeds and keeps the documented contamination."""
    cfg = DataSentryConfig()
    assert cfg.outlier_contamination == 0.05
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_invalid_contamination():
    """Out-of-range contamination must be rejected at construction time."""
    with pytest.raises(ValueError):
        DataSentryConfig(outlier_contamination=0.9)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from datasentry.detectors import imbalance
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def test_imbalance_detect_balanced():
    """A perfectly balanced target should score well below 0.5."""
    labels = np.array([0, 1] * 2)
    outcome = imbalance.detect(labels, threshold=3.0)
    assert outcome["imbalance_score"] < 0.5
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_imbalance_detect_imbalanced():
    """A 4:1 class ratio exceeds the threshold and must be flagged."""
    labels = np.array([0] * 4 + [1])
    outcome = imbalance.detect(labels, threshold=3.0)
    assert outcome["is_problematic"] is True
|