datasentry 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: datasentry
3
+ Version: 0.1.0
4
+ Summary: Data-centric ML inspection and auto-remediation toolkit
5
+ Author: Ankush Sharma
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: numpy
9
+ Requires-Dist: pandas
10
+ Requires-Dist: scikit-learn
11
+ Requires-Dist: imbalanced-learn
12
+ Provides-Extra: dev
13
+ Requires-Dist: pytest; extra == "dev"
14
+ Requires-Dist: black; extra == "dev"
15
+ Requires-Dist: flake8; extra == "dev"
File without changes
@@ -0,0 +1,22 @@
1
+ """
2
+ DataSentry
3
+ ==========
4
+
5
+ A lightweight, production-ready, data-centric machine learning inspection library.
6
+
7
+ Main entry point:
8
+ analyze()
9
+
10
+ Example:
11
+ --------
12
+ >>> from datasentry import analyze
13
+ >>> report = analyze(X, y)
14
+ >>> report.show()
15
+ """
16
+
17
+ from .analyzer import analyze
18
+ from .config import DataSentryConfig
19
+
20
+ __all__ = ["analyze", "DataSentryConfig"]
21
+
22
+ __version__ = "0.1.0"
@@ -0,0 +1,57 @@
1
+ from typing import Union
2
+ import numpy as np
3
+ import pandas as pd
4
+
5
+
6
+ ArrayLike = Union[np.ndarray, pd.DataFrame, pd.Series]
7
+
8
+
9
def to_numpy(X: ArrayLike) -> np.ndarray:
    """
    Convert input features to a validated 2-D NumPy array.

    Parameters
    ----------
    X : array-like
        Features as an ndarray, DataFrame, or Series.

    Returns
    -------
    np.ndarray
        2-D numeric array; 1-D input is reshaped to a single column.

    Raises
    ------
    ValueError
        If the data is non-numeric or has more than 2 dimensions.
    """
    if isinstance(X, (pd.DataFrame, pd.Series)):
        # .to_numpy() is the recommended accessor; .values is legacy.
        X = X.to_numpy()

    X = np.asarray(X)

    if X.ndim == 1:
        X = X.reshape(-1, 1)
    elif X.ndim != 2:
        # The contract promises a 2-D result; fail loudly instead of
        # silently passing higher-dimensional data to the detectors.
        raise ValueError("Features must be 1- or 2-dimensional.")

    if not np.issubdtype(X.dtype, np.number):
        raise ValueError("All features must be numeric.")

    return X
35
+
36
+
37
def validate_y(y: ArrayLike) -> np.ndarray:
    """
    Validate the target vector and return it as a NumPy array.

    Guarantees the returned array is one-dimensional and non-empty,
    raising ``ValueError`` otherwise.
    """
    raw = y.values if isinstance(y, pd.Series) else y
    arr = np.asarray(raw)

    if arr.ndim != 1:
        raise ValueError("Target y must be 1-dimensional.")

    if len(arr) == 0:
        raise ValueError("Target y cannot be empty.")

    return arr
@@ -0,0 +1,41 @@
1
+ from typing import Optional
2
+ from .config import DataSentryConfig
3
+ from .detectors import imbalance, outliers, shift, leakage, label_noise
4
+ from .report import Report
5
+ from .fixer import AutoFixer
6
+
7
+
8
def analyze(X, y, X_test=None, config: Optional[DataSentryConfig] = None) -> Report:
    """
    Run the full data inspection pipeline.

    Parameters
    ----------
    X : array-like
        Training features.
    y : array-like
        Target labels.
    X_test : array-like, optional
        Test features used for drift detection.
    config : DataSentryConfig, optional
        Custom configuration; defaults are used when omitted.

    Returns
    -------
    Report
        Aggregated findings with an attached AutoFixer.
    """
    if config is None:
        config = DataSentryConfig()

    findings = {}
    findings["imbalance"] = imbalance.detect(y, config.imbalance_threshold)
    findings["outliers"] = outliers.detect(
        X, config.outlier_contamination, config.random_state
    )
    findings["distribution_shift"] = shift.detect(X, X_test, config.drift_threshold)
    findings["data_leakage"] = leakage.detect(X, y, config.leakage_threshold)
    findings["label_noise"] = label_noise.detect(
        X, y, config.noise_threshold, config.random_state
    )

    report = Report(findings)
    report.fixer = AutoFixer(findings, config)

    return report
@@ -0,0 +1,40 @@
1
+ from dataclasses import dataclass
2
+
3
+
4
@dataclass
class DataSentryConfig:
    """
    Configuration object for DataSentry.

    Parameters
    ----------
    imbalance_threshold : float
        Majority/minority class ratio above which imbalance is flagged.
    outlier_contamination : float
        Expected proportion of outliers (0 < value < 0.5).
    drift_threshold : float
        PSI threshold for distribution shift detection.
    leakage_threshold : float
        Mutual information threshold for leakage detection.
    noise_threshold : float
        Label noise ratio threshold.
    random_state : int
        Random seed for reproducibility.

    Raises
    ------
    ValueError
        If any threshold is outside its valid range.
    """

    imbalance_threshold: float = 3.0
    outlier_contamination: float = 0.05
    drift_threshold: float = 0.2
    leakage_threshold: float = 0.5
    noise_threshold: float = 0.2
    random_state: int = 42

    def __post_init__(self) -> None:
        # Validate every threshold up front so misconfiguration fails
        # fast instead of producing misleading detector output.
        if not (0 < self.outlier_contamination < 0.5):
            raise ValueError("outlier_contamination must be between 0 and 0.5.")

        if self.imbalance_threshold <= 0:
            raise ValueError("imbalance_threshold must be positive.")

        if self.noise_threshold < 0:
            raise ValueError("noise_threshold must be >= 0.")

        # Previously unvalidated; negative values would flag every
        # dataset as shifted/leaky, which is never intended.
        if self.drift_threshold < 0:
            raise ValueError("drift_threshold must be >= 0.")

        if self.leakage_threshold < 0:
            raise ValueError("leakage_threshold must be >= 0.")
@@ -0,0 +1,37 @@
1
+ import numpy as np
2
+ from typing import Dict
3
+ from .._utils import validate_y
4
+
5
+
6
def detect(y: np.ndarray, threshold: float = 3.0) -> Dict[str, float]:
    """
    Detect class imbalance via the normalized majority/minority ratio.

    The ``imbalance_score`` lies in [0, 1]:
        0  -> perfectly balanced classes
        ~1 -> highly imbalanced classes
    """
    y = validate_y(y)

    counts = np.unique(y, return_counts=True)[1]

    # A single class (or an empty minority) is maximally imbalanced.
    if len(counts) < 2 or counts.min() == 0:
        ratio, score = float("inf"), 1.0
    else:
        ratio = counts.max() / counts.min()
        score = (ratio - 1) / ratio  # maps ratio in [1, inf) onto [0, 1)

    return {
        "imbalance_score": float(score),
        "is_problematic": bool(ratio > threshold),
        "severity": float(score),
    }
@@ -0,0 +1,33 @@
1
+ from typing import Dict
2
+ import numpy as np
3
+ from sklearn.ensemble import RandomForestClassifier
4
+ from sklearn.model_selection import StratifiedKFold
5
+ from .._utils import to_numpy, validate_y
6
+
7
+
8
def detect(X, y, threshold: float, random_state: int) -> Dict[str, float]:
    """
    Detect label noise using cross-validated prediction disagreement.

    A random forest is trained out-of-fold; the fraction of samples
    whose out-of-fold prediction disagrees with the given label is
    reported as ``noise_ratio``.

    Parameters
    ----------
    X, y : array-like
        Features and labels.
    threshold : float
        Noise ratio above which the dataset is flagged.
    random_state : int
        Seed for the CV splitter and the forest.
    """
    X = to_numpy(X)
    y = validate_y(y)

    unique, counts = np.unique(y, return_counts=True)
    if len(unique) < 2:
        # Disagreement is meaningless with a single class.
        return {"noise_ratio": 0.0, "is_problematic": False, "severity": 0.0}

    # StratifiedKFold requires every class to appear in every fold;
    # cap n_splits at the smallest class count so rare classes do not
    # raise a hard error.
    n_splits = int(min(5, counts.min()))
    if n_splits < 2:
        # Cannot cross-validate; report a clean result rather than crash.
        return {"noise_ratio": 0.0, "is_problematic": False, "severity": 0.0}

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Same dtype as y so non-numeric labels (e.g. strings) round-trip;
    # np.zeros(len(y)) would force float and break on such labels.
    preds = np.empty(len(y), dtype=y.dtype)

    for train_idx, val_idx in skf.split(X, y):
        clf = RandomForestClassifier(random_state=random_state)
        clf.fit(X[train_idx], y[train_idx])
        preds[val_idx] = clf.predict(X[val_idx])

    noise_ratio = float((preds != y).mean())

    return {
        "noise_ratio": noise_ratio,
        "is_problematic": bool(noise_ratio > threshold),
        "severity": noise_ratio,
    }
@@ -0,0 +1,22 @@
1
+ from typing import Dict
2
+ import numpy as np
3
+ from sklearn.feature_selection import mutual_info_classif
4
+ from .._utils import to_numpy, validate_y
5
+
6
+
7
def detect(X, y, threshold: float, random_state=None) -> Dict[str, float]:
    """
    Detect potential feature leakage via mutual information.

    A feature whose mutual information with the target exceeds
    ``threshold`` is suspiciously predictive and may leak the label.

    Parameters
    ----------
    X, y : array-like
        Features and labels.
    threshold : float
        Mutual information value above which leakage is flagged.
    random_state : int, optional
        Seed forwarded to the MI estimator. The kNN-based estimate is
        nondeterministic without it; default ``None`` preserves the
        original behavior.
    """
    X = to_numpy(X)
    y = validate_y(y)

    mi_scores = mutual_info_classif(X, y, random_state=random_state)
    max_mi = float(np.max(mi_scores))

    return {
        "max_mutual_information": max_mi,
        "is_problematic": bool(max_mi > threshold),
        "severity": max_mi,
    }
@@ -0,0 +1,25 @@
1
+ from typing import Dict
2
+ from sklearn.ensemble import IsolationForest
3
+ from .._utils import to_numpy
4
+
5
+
6
def detect(X, contamination: float, random_state: int) -> Dict[str, float]:
    """
    Detect outliers with an Isolation Forest.

    The reported ``outlier_ratio`` is the fraction of samples the
    model labels as anomalous (prediction == -1).
    """
    features = to_numpy(X)

    forest = IsolationForest(
        contamination=contamination,
        random_state=random_state,
    )
    labels = forest.fit_predict(features)

    ratio = float((labels == -1).mean())

    return {
        "outlier_ratio": ratio,
        "is_problematic": bool(ratio > contamination),
        "severity": ratio,
    }
@@ -0,0 +1,46 @@
1
+ import numpy as np
2
+ from typing import Dict
3
+ from .._utils import to_numpy
4
+
5
+
6
+ def _psi(expected: np.ndarray, actual: np.ndarray, bins: int = 10) -> float:
7
+ """
8
+ Compute Population Stability Index.
9
+ """
10
+
11
+ breakpoints = np.percentile(expected, np.linspace(0, 100, bins + 1))
12
+
13
+ expected_counts = np.histogram(expected, bins=breakpoints)[0]
14
+ actual_counts = np.histogram(actual, bins=breakpoints)[0]
15
+
16
+ expected_perc = expected_counts / len(expected)
17
+ actual_perc = actual_counts / len(actual)
18
+
19
+ psi = np.sum(
20
+ (expected_perc - actual_perc)
21
+ * np.log((expected_perc + 1e-12) / (actual_perc + 1e-12))
22
+ )
23
+
24
+ return float(psi)
25
+
26
+
27
def detect(X_train, X_test, threshold: float) -> Dict[str, float]:
    """
    Detect distribution shift between train and test features.

    Computes the PSI per feature and reports the mean. When no test
    set is supplied, drift detection is skipped and a clean result is
    returned.
    """
    if X_test is None:
        return {"psi": 0.0, "is_problematic": False, "severity": 0.0}

    train = to_numpy(X_train)
    test = to_numpy(X_test)

    per_feature = []
    for col in range(train.shape[1]):
        per_feature.append(_psi(train[:, col], test[:, col]))

    mean_psi = float(np.mean(per_feature))

    return {
        "psi": mean_psi,
        "is_problematic": bool(mean_psi > threshold),
        "severity": mean_psi,
    }
@@ -0,0 +1,28 @@
1
+ from typing import Tuple
2
+ import numpy as np
3
+ from imblearn.over_sampling import SMOTE
4
+ from ._utils import to_numpy, validate_y
5
+
6
+
7
class AutoFixer:
    """
    Apply automatic corrections for issues flagged by the detectors.

    Currently only class imbalance is remediated (SMOTE oversampling);
    other issue types are reported but left untouched.
    """

    def __init__(self, issues: dict, config):
        # Findings produced by analyze() plus the active configuration.
        self.issues = issues
        self.config = config

    def fix(self, X, y) -> Tuple[np.ndarray, np.ndarray]:
        """
        Return a corrected (X, y) pair.

        When the imbalance detector flagged the dataset, minority
        classes are oversampled with SMOTE; otherwise the validated
        inputs are returned unchanged.
        """
        X = to_numpy(X)
        y = validate_y(y)

        imbalance_result = self.issues.get("imbalance", {})
        if imbalance_result.get("is_problematic"):
            sampler = SMOTE(random_state=self.config.random_state)
            X, y = sampler.fit_resample(X, y)

        return X, y
@@ -0,0 +1,36 @@
1
+ from typing import Dict
2
+
3
+
4
class Report:
    """
    Inspection report returned by ``analyze()``.

    Attributes
    ----------
    issues : dict
        Mapping of issue name -> detector result dict; each result is
        expected to carry ``is_problematic`` and ``severity`` keys.
    score : int
        Overall data health score in [0, 100].
    fixer : AutoFixer or None
        Remediation helper, injected by ``analyze()`` after creation.
    """

    def __init__(self, issues: Dict):
        self.issues = issues
        self.score = self._compute_score()
        self.fixer = None  # injected later by analyze()

    def _compute_score(self) -> int:
        """
        Derive a 0-100 health score from the detected issues.

        Each problematic issue subtracts up to 30 points, scaled by
        its severity; the score never drops below zero.
        """
        score = 100

        for issue in self.issues.values():
            # Tolerate partially populated detector results instead of
            # raising KeyError on a missing field.
            if issue.get("is_problematic"):
                score -= min(30, issue.get("severity", 0.0) * 50)

        return max(int(score), 0)

    def show(self) -> None:
        """
        Pretty-print the report to the console.
        """
        print("\n=== DataSentry Report ===\n")

        for name, result in self.issues.items():
            print(f"[{name.upper()}]")
            for k, v in result.items():
                print(f"  {k}: {v}")
            print()

        print(f"Overall Health Score: {self.score}/100\n")
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: datasentry
3
+ Version: 0.1.0
4
+ Summary: Data-centric ML inspection and auto-remediation toolkit
5
+ Author: Ankush Sharma
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: numpy
9
+ Requires-Dist: pandas
10
+ Requires-Dist: scikit-learn
11
+ Requires-Dist: imbalanced-learn
12
+ Provides-Extra: dev
13
+ Requires-Dist: pytest; extra == "dev"
14
+ Requires-Dist: black; extra == "dev"
15
+ Requires-Dist: flake8; extra == "dev"
@@ -0,0 +1,21 @@
1
+ README.md
2
+ pyproject.toml
3
+ datasentry/__init__.py
4
+ datasentry/_utils.py
5
+ datasentry/analyzer.py
6
+ datasentry/config.py
7
+ datasentry/fixer.py
8
+ datasentry/report.py
9
+ datasentry.egg-info/PKG-INFO
10
+ datasentry.egg-info/SOURCES.txt
11
+ datasentry.egg-info/dependency_links.txt
12
+ datasentry.egg-info/requires.txt
13
+ datasentry.egg-info/top_level.txt
14
+ datasentry/detectors/imbalance.py
15
+ datasentry/detectors/label_noise.py
16
+ datasentry/detectors/leakage.py
17
+ datasentry/detectors/outliers.py
18
+ datasentry/detectors/shift.py
19
+ tests/test_analyzer.py
20
+ tests/test_config.py
21
+ tests/test_detectors.py
@@ -0,0 +1,9 @@
1
+ numpy
2
+ pandas
3
+ scikit-learn
4
+ imbalanced-learn
5
+
6
+ [dev]
7
+ pytest
8
+ black
9
+ flake8
@@ -0,0 +1 @@
1
+ datasentry
@@ -0,0 +1,30 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "datasentry"
7
+ version = "0.1.0"
8
+ description = "Data-centric ML inspection and auto-remediation toolkit"
9
+ authors = [
10
+ {name = "Ankush Sharma"}
11
+ ]
12
+ readme = "README.md"
13
+ requires-python = ">=3.9"
14
+
15
+ dependencies = [
16
+ "numpy",
17
+ "pandas",
18
+ "scikit-learn",
19
+ "imbalanced-learn"
20
+ ]
21
+
22
+ [project.optional-dependencies]
23
+ dev = [
24
+ "pytest",
25
+ "black",
26
+ "flake8"
27
+ ]
28
+
29
+ [tool.pytest.ini_options]
30
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,11 @@
1
+ import numpy as np
2
+ from datasentry import analyze
3
+
4
+
5
def test_analyze_runs():
    """Smoke test: analyze() produces a report with a bounded score."""
    features = np.random.rand(100, 5)
    labels = np.random.randint(0, 2, 100)

    report = analyze(features, labels)

    assert hasattr(report, "score")
    assert report.score <= 100
@@ -0,0 +1,12 @@
1
+ import pytest
2
+ from datasentry.config import DataSentryConfig
3
+
4
+
5
def test_default_config():
    """The default configuration exposes its documented defaults."""
    cfg = DataSentryConfig()
    assert cfg.outlier_contamination == 0.05
8
+
9
+
10
def test_invalid_contamination():
    """Out-of-range contamination is rejected at construction time."""
    with pytest.raises(ValueError):
        DataSentryConfig(outlier_contamination=0.9)
@@ -0,0 +1,14 @@
1
+ import numpy as np
2
+ from datasentry.detectors import imbalance
3
+
4
+
5
def test_imbalance_detect_balanced():
    """A perfectly balanced target yields a low imbalance score."""
    target = np.array([0, 1, 0, 1])
    result = imbalance.detect(target, threshold=3.0)
    assert result["imbalance_score"] < 0.5
9
+
10
+
11
def test_imbalance_detect_imbalanced():
    """A 4:1 class ratio exceeds the 3.0 threshold and is flagged."""
    target = np.array([0, 0, 0, 0, 1])
    result = imbalance.detect(target, threshold=3.0)
    assert result["is_problematic"] is True