ml4t-diagnostic 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ml4t/diagnostic/AGENT.md +25 -0
- ml4t/diagnostic/__init__.py +166 -0
- ml4t/diagnostic/backends/__init__.py +10 -0
- ml4t/diagnostic/backends/adapter.py +192 -0
- ml4t/diagnostic/backends/polars_backend.py +899 -0
- ml4t/diagnostic/caching/__init__.py +40 -0
- ml4t/diagnostic/caching/cache.py +331 -0
- ml4t/diagnostic/caching/decorators.py +131 -0
- ml4t/diagnostic/caching/smart_cache.py +339 -0
- ml4t/diagnostic/config/AGENT.md +24 -0
- ml4t/diagnostic/config/README.md +267 -0
- ml4t/diagnostic/config/__init__.py +219 -0
- ml4t/diagnostic/config/barrier_config.py +277 -0
- ml4t/diagnostic/config/base.py +301 -0
- ml4t/diagnostic/config/event_config.py +148 -0
- ml4t/diagnostic/config/feature_config.py +404 -0
- ml4t/diagnostic/config/multi_signal_config.py +55 -0
- ml4t/diagnostic/config/portfolio_config.py +215 -0
- ml4t/diagnostic/config/report_config.py +391 -0
- ml4t/diagnostic/config/sharpe_config.py +202 -0
- ml4t/diagnostic/config/signal_config.py +206 -0
- ml4t/diagnostic/config/trade_analysis_config.py +310 -0
- ml4t/diagnostic/config/validation.py +279 -0
- ml4t/diagnostic/core/__init__.py +29 -0
- ml4t/diagnostic/core/numba_utils.py +315 -0
- ml4t/diagnostic/core/purging.py +372 -0
- ml4t/diagnostic/core/sampling.py +471 -0
- ml4t/diagnostic/errors/__init__.py +205 -0
- ml4t/diagnostic/evaluation/AGENT.md +26 -0
- ml4t/diagnostic/evaluation/__init__.py +437 -0
- ml4t/diagnostic/evaluation/autocorrelation.py +531 -0
- ml4t/diagnostic/evaluation/barrier_analysis.py +1050 -0
- ml4t/diagnostic/evaluation/binary_metrics.py +910 -0
- ml4t/diagnostic/evaluation/dashboard.py +715 -0
- ml4t/diagnostic/evaluation/diagnostic_plots.py +1037 -0
- ml4t/diagnostic/evaluation/distribution/__init__.py +499 -0
- ml4t/diagnostic/evaluation/distribution/moments.py +299 -0
- ml4t/diagnostic/evaluation/distribution/tails.py +777 -0
- ml4t/diagnostic/evaluation/distribution/tests.py +470 -0
- ml4t/diagnostic/evaluation/drift/__init__.py +139 -0
- ml4t/diagnostic/evaluation/drift/analysis.py +432 -0
- ml4t/diagnostic/evaluation/drift/domain_classifier.py +517 -0
- ml4t/diagnostic/evaluation/drift/population_stability_index.py +310 -0
- ml4t/diagnostic/evaluation/drift/wasserstein.py +388 -0
- ml4t/diagnostic/evaluation/event_analysis.py +647 -0
- ml4t/diagnostic/evaluation/excursion.py +390 -0
- ml4t/diagnostic/evaluation/feature_diagnostics.py +873 -0
- ml4t/diagnostic/evaluation/feature_outcome.py +666 -0
- ml4t/diagnostic/evaluation/framework.py +935 -0
- ml4t/diagnostic/evaluation/metric_registry.py +255 -0
- ml4t/diagnostic/evaluation/metrics/AGENT.md +23 -0
- ml4t/diagnostic/evaluation/metrics/__init__.py +133 -0
- ml4t/diagnostic/evaluation/metrics/basic.py +160 -0
- ml4t/diagnostic/evaluation/metrics/conditional_ic.py +469 -0
- ml4t/diagnostic/evaluation/metrics/feature_outcome.py +475 -0
- ml4t/diagnostic/evaluation/metrics/ic_statistics.py +446 -0
- ml4t/diagnostic/evaluation/metrics/importance_analysis.py +338 -0
- ml4t/diagnostic/evaluation/metrics/importance_classical.py +375 -0
- ml4t/diagnostic/evaluation/metrics/importance_mda.py +371 -0
- ml4t/diagnostic/evaluation/metrics/importance_shap.py +715 -0
- ml4t/diagnostic/evaluation/metrics/information_coefficient.py +527 -0
- ml4t/diagnostic/evaluation/metrics/interactions.py +772 -0
- ml4t/diagnostic/evaluation/metrics/monotonicity.py +226 -0
- ml4t/diagnostic/evaluation/metrics/risk_adjusted.py +324 -0
- ml4t/diagnostic/evaluation/multi_signal.py +550 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/__init__.py +83 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/analysis.py +734 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/metrics.py +589 -0
- ml4t/diagnostic/evaluation/portfolio_analysis/results.py +334 -0
- ml4t/diagnostic/evaluation/report_generation.py +824 -0
- ml4t/diagnostic/evaluation/signal_selector.py +452 -0
- ml4t/diagnostic/evaluation/stat_registry.py +139 -0
- ml4t/diagnostic/evaluation/stationarity/__init__.py +97 -0
- ml4t/diagnostic/evaluation/stationarity/analysis.py +518 -0
- ml4t/diagnostic/evaluation/stationarity/augmented_dickey_fuller.py +296 -0
- ml4t/diagnostic/evaluation/stationarity/kpss_test.py +308 -0
- ml4t/diagnostic/evaluation/stationarity/phillips_perron.py +365 -0
- ml4t/diagnostic/evaluation/stats/AGENT.md +43 -0
- ml4t/diagnostic/evaluation/stats/__init__.py +191 -0
- ml4t/diagnostic/evaluation/stats/backtest_overfitting.py +219 -0
- ml4t/diagnostic/evaluation/stats/bootstrap.py +228 -0
- ml4t/diagnostic/evaluation/stats/deflated_sharpe_ratio.py +591 -0
- ml4t/diagnostic/evaluation/stats/false_discovery_rate.py +295 -0
- ml4t/diagnostic/evaluation/stats/hac_standard_errors.py +108 -0
- ml4t/diagnostic/evaluation/stats/minimum_track_record.py +408 -0
- ml4t/diagnostic/evaluation/stats/moments.py +164 -0
- ml4t/diagnostic/evaluation/stats/rademacher_adjustment.py +436 -0
- ml4t/diagnostic/evaluation/stats/reality_check.py +155 -0
- ml4t/diagnostic/evaluation/stats/sharpe_inference.py +219 -0
- ml4t/diagnostic/evaluation/themes.py +330 -0
- ml4t/diagnostic/evaluation/threshold_analysis.py +957 -0
- ml4t/diagnostic/evaluation/trade_analysis.py +1136 -0
- ml4t/diagnostic/evaluation/trade_dashboard/__init__.py +32 -0
- ml4t/diagnostic/evaluation/trade_dashboard/app.py +315 -0
- ml4t/diagnostic/evaluation/trade_dashboard/export/__init__.py +18 -0
- ml4t/diagnostic/evaluation/trade_dashboard/export/csv.py +82 -0
- ml4t/diagnostic/evaluation/trade_dashboard/export/html.py +276 -0
- ml4t/diagnostic/evaluation/trade_dashboard/io.py +166 -0
- ml4t/diagnostic/evaluation/trade_dashboard/normalize.py +304 -0
- ml4t/diagnostic/evaluation/trade_dashboard/stats.py +386 -0
- ml4t/diagnostic/evaluation/trade_dashboard/style.py +79 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/__init__.py +21 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/patterns.py +354 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/shap_analysis.py +280 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/stat_validation.py +186 -0
- ml4t/diagnostic/evaluation/trade_dashboard/tabs/worst_trades.py +236 -0
- ml4t/diagnostic/evaluation/trade_dashboard/types.py +129 -0
- ml4t/diagnostic/evaluation/trade_shap/__init__.py +102 -0
- ml4t/diagnostic/evaluation/trade_shap/alignment.py +188 -0
- ml4t/diagnostic/evaluation/trade_shap/characterize.py +413 -0
- ml4t/diagnostic/evaluation/trade_shap/cluster.py +302 -0
- ml4t/diagnostic/evaluation/trade_shap/explain.py +208 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/__init__.py +23 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/generator.py +290 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/matcher.py +251 -0
- ml4t/diagnostic/evaluation/trade_shap/hypotheses/templates.yaml +467 -0
- ml4t/diagnostic/evaluation/trade_shap/models.py +386 -0
- ml4t/diagnostic/evaluation/trade_shap/normalize.py +116 -0
- ml4t/diagnostic/evaluation/trade_shap/pipeline.py +263 -0
- ml4t/diagnostic/evaluation/trade_shap_dashboard.py +283 -0
- ml4t/diagnostic/evaluation/trade_shap_diagnostics.py +588 -0
- ml4t/diagnostic/evaluation/validated_cv.py +535 -0
- ml4t/diagnostic/evaluation/visualization.py +1050 -0
- ml4t/diagnostic/evaluation/volatility/__init__.py +45 -0
- ml4t/diagnostic/evaluation/volatility/analysis.py +351 -0
- ml4t/diagnostic/evaluation/volatility/arch.py +258 -0
- ml4t/diagnostic/evaluation/volatility/garch.py +460 -0
- ml4t/diagnostic/integration/__init__.py +48 -0
- ml4t/diagnostic/integration/backtest_contract.py +671 -0
- ml4t/diagnostic/integration/data_contract.py +316 -0
- ml4t/diagnostic/integration/engineer_contract.py +226 -0
- ml4t/diagnostic/logging/__init__.py +77 -0
- ml4t/diagnostic/logging/logger.py +245 -0
- ml4t/diagnostic/logging/performance.py +234 -0
- ml4t/diagnostic/logging/progress.py +234 -0
- ml4t/diagnostic/logging/wandb.py +412 -0
- ml4t/diagnostic/metrics/__init__.py +9 -0
- ml4t/diagnostic/metrics/percentiles.py +128 -0
- ml4t/diagnostic/py.typed +1 -0
- ml4t/diagnostic/reporting/__init__.py +43 -0
- ml4t/diagnostic/reporting/base.py +130 -0
- ml4t/diagnostic/reporting/html_renderer.py +275 -0
- ml4t/diagnostic/reporting/json_renderer.py +51 -0
- ml4t/diagnostic/reporting/markdown_renderer.py +117 -0
- ml4t/diagnostic/results/AGENT.md +24 -0
- ml4t/diagnostic/results/__init__.py +105 -0
- ml4t/diagnostic/results/barrier_results/__init__.py +36 -0
- ml4t/diagnostic/results/barrier_results/hit_rate.py +304 -0
- ml4t/diagnostic/results/barrier_results/precision_recall.py +266 -0
- ml4t/diagnostic/results/barrier_results/profit_factor.py +297 -0
- ml4t/diagnostic/results/barrier_results/tearsheet.py +397 -0
- ml4t/diagnostic/results/barrier_results/time_to_target.py +305 -0
- ml4t/diagnostic/results/barrier_results/validation.py +38 -0
- ml4t/diagnostic/results/base.py +177 -0
- ml4t/diagnostic/results/event_results.py +349 -0
- ml4t/diagnostic/results/feature_results.py +787 -0
- ml4t/diagnostic/results/multi_signal_results.py +431 -0
- ml4t/diagnostic/results/portfolio_results.py +281 -0
- ml4t/diagnostic/results/sharpe_results.py +448 -0
- ml4t/diagnostic/results/signal_results/__init__.py +74 -0
- ml4t/diagnostic/results/signal_results/ic.py +581 -0
- ml4t/diagnostic/results/signal_results/irtc.py +110 -0
- ml4t/diagnostic/results/signal_results/quantile.py +392 -0
- ml4t/diagnostic/results/signal_results/tearsheet.py +456 -0
- ml4t/diagnostic/results/signal_results/turnover.py +213 -0
- ml4t/diagnostic/results/signal_results/validation.py +147 -0
- ml4t/diagnostic/signal/AGENT.md +17 -0
- ml4t/diagnostic/signal/__init__.py +69 -0
- ml4t/diagnostic/signal/_report.py +152 -0
- ml4t/diagnostic/signal/_utils.py +261 -0
- ml4t/diagnostic/signal/core.py +275 -0
- ml4t/diagnostic/signal/quantile.py +148 -0
- ml4t/diagnostic/signal/result.py +214 -0
- ml4t/diagnostic/signal/signal_ic.py +129 -0
- ml4t/diagnostic/signal/turnover.py +182 -0
- ml4t/diagnostic/splitters/AGENT.md +19 -0
- ml4t/diagnostic/splitters/__init__.py +36 -0
- ml4t/diagnostic/splitters/base.py +501 -0
- ml4t/diagnostic/splitters/calendar.py +421 -0
- ml4t/diagnostic/splitters/calendar_config.py +91 -0
- ml4t/diagnostic/splitters/combinatorial.py +1064 -0
- ml4t/diagnostic/splitters/config.py +322 -0
- ml4t/diagnostic/splitters/cpcv/__init__.py +57 -0
- ml4t/diagnostic/splitters/cpcv/combinations.py +119 -0
- ml4t/diagnostic/splitters/cpcv/partitioning.py +263 -0
- ml4t/diagnostic/splitters/cpcv/purge_engine.py +379 -0
- ml4t/diagnostic/splitters/cpcv/windows.py +190 -0
- ml4t/diagnostic/splitters/group_isolation.py +329 -0
- ml4t/diagnostic/splitters/persistence.py +316 -0
- ml4t/diagnostic/splitters/utils.py +207 -0
- ml4t/diagnostic/splitters/walk_forward.py +757 -0
- ml4t/diagnostic/utils/__init__.py +42 -0
- ml4t/diagnostic/utils/config.py +542 -0
- ml4t/diagnostic/utils/dependencies.py +318 -0
- ml4t/diagnostic/utils/sessions.py +127 -0
- ml4t/diagnostic/validation/__init__.py +54 -0
- ml4t/diagnostic/validation/dataframe.py +274 -0
- ml4t/diagnostic/validation/returns.py +280 -0
- ml4t/diagnostic/validation/timeseries.py +299 -0
- ml4t/diagnostic/visualization/AGENT.md +19 -0
- ml4t/diagnostic/visualization/__init__.py +223 -0
- ml4t/diagnostic/visualization/backtest/__init__.py +98 -0
- ml4t/diagnostic/visualization/backtest/cost_attribution.py +762 -0
- ml4t/diagnostic/visualization/backtest/executive_summary.py +895 -0
- ml4t/diagnostic/visualization/backtest/interactive_controls.py +673 -0
- ml4t/diagnostic/visualization/backtest/statistical_validity.py +874 -0
- ml4t/diagnostic/visualization/backtest/tearsheet.py +565 -0
- ml4t/diagnostic/visualization/backtest/template_system.py +373 -0
- ml4t/diagnostic/visualization/backtest/trade_plots.py +1172 -0
- ml4t/diagnostic/visualization/barrier_plots.py +782 -0
- ml4t/diagnostic/visualization/core.py +1060 -0
- ml4t/diagnostic/visualization/dashboards/__init__.py +36 -0
- ml4t/diagnostic/visualization/dashboards/base.py +582 -0
- ml4t/diagnostic/visualization/dashboards/importance.py +801 -0
- ml4t/diagnostic/visualization/dashboards/interaction.py +263 -0
- ml4t/diagnostic/visualization/dashboards.py +43 -0
- ml4t/diagnostic/visualization/data_extraction/__init__.py +48 -0
- ml4t/diagnostic/visualization/data_extraction/importance.py +649 -0
- ml4t/diagnostic/visualization/data_extraction/interaction.py +504 -0
- ml4t/diagnostic/visualization/data_extraction/types.py +113 -0
- ml4t/diagnostic/visualization/data_extraction/validation.py +66 -0
- ml4t/diagnostic/visualization/feature_plots.py +888 -0
- ml4t/diagnostic/visualization/interaction_plots.py +618 -0
- ml4t/diagnostic/visualization/portfolio/__init__.py +41 -0
- ml4t/diagnostic/visualization/portfolio/dashboard.py +514 -0
- ml4t/diagnostic/visualization/portfolio/drawdown_plots.py +341 -0
- ml4t/diagnostic/visualization/portfolio/returns_plots.py +487 -0
- ml4t/diagnostic/visualization/portfolio/risk_plots.py +301 -0
- ml4t/diagnostic/visualization/report_generation.py +1343 -0
- ml4t/diagnostic/visualization/signal/__init__.py +103 -0
- ml4t/diagnostic/visualization/signal/dashboard.py +911 -0
- ml4t/diagnostic/visualization/signal/event_plots.py +514 -0
- ml4t/diagnostic/visualization/signal/ic_plots.py +635 -0
- ml4t/diagnostic/visualization/signal/multi_signal_dashboard.py +974 -0
- ml4t/diagnostic/visualization/signal/multi_signal_plots.py +603 -0
- ml4t/diagnostic/visualization/signal/quantile_plots.py +625 -0
- ml4t/diagnostic/visualization/signal/turnover_plots.py +400 -0
- ml4t/diagnostic/visualization/trade_shap/__init__.py +90 -0
- ml4t_diagnostic-0.1.0a1.dist-info/METADATA +1044 -0
- ml4t_diagnostic-0.1.0a1.dist-info/RECORD +242 -0
- ml4t_diagnostic-0.1.0a1.dist-info/WHEEL +4 -0
- ml4t_diagnostic-0.1.0a1.dist-info/licenses/LICENSE +21 -0
ml4t/diagnostic/evaluation/drift/population_stability_index.py

@@ -0,0 +1,310 @@
"""Population Stability Index (PSI) for distribution drift detection.

PSI measures the distribution shift between a reference dataset (e.g., training)
and a test dataset (e.g., production).

PSI Interpretation:
- PSI < 0.1: No significant change (green)
- 0.1 ≤ PSI < 0.2: Small change, monitor (yellow)
- PSI ≥ 0.2: Significant change, investigate (red)

References:
- Yurdakul, B. (2018). Statistical Properties of Population Stability Index.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Literal

import numpy as np
import polars as pl

@dataclass
class PSIResult:
    """Result of Population Stability Index calculation.

    Attributes:
        psi: Overall PSI value (sum of bin-level PSI contributions)
        bin_psi: PSI contribution per bin
        bin_edges: Bin boundaries (continuous) or category labels (categorical)
        reference_counts: Number of samples per bin in reference distribution
        test_counts: Number of samples per bin in test distribution
        reference_percents: Proportion of samples per bin in reference
        test_percents: Proportion of samples per bin in test
        n_bins: Number of bins used
        is_categorical: Whether feature is categorical
        alert_level: Alert level based on PSI thresholds
            - "green": PSI < 0.1 (no significant change)
            - "yellow": 0.1 ≤ PSI < 0.2 (small change, monitor)
            - "red": PSI ≥ 0.2 (significant change, investigate)
        interpretation: Human-readable interpretation
    """

    psi: float
    bin_psi: np.ndarray
    bin_edges: np.ndarray | list[str]
    reference_counts: np.ndarray
    test_counts: np.ndarray
    reference_percents: np.ndarray
    test_percents: np.ndarray
    n_bins: int
    is_categorical: bool
    alert_level: Literal["green", "yellow", "red"]
    interpretation: str

    def summary(self) -> str:
        """Return formatted summary of PSI results."""
        lines = [
            "Population Stability Index (PSI) Report",
            "=" * 50,
            f"PSI Value: {self.psi:.4f}",
            f"Alert Level: {self.alert_level.upper()}",
            f"Feature Type: {'Categorical' if self.is_categorical else 'Continuous'}",
            f"Number of Bins: {self.n_bins}",
            "",
            f"Interpretation: {self.interpretation}",
            "",
            "Bin-Level Analysis:",
            "-" * 50,
        ]

        # Add bin-level details
        for i in range(self.n_bins):
            if self.is_categorical:
                bin_label = self.bin_edges[i]
            else:
                if i == 0:
                    bin_label = f"(-inf, {self.bin_edges[i + 1]:.3f}]"
                elif i == self.n_bins - 1:
                    bin_label = f"({self.bin_edges[i]:.3f}, +inf)"
                else:
                    bin_label = f"({self.bin_edges[i]:.3f}, {self.bin_edges[i + 1]:.3f}]"

            lines.append(
                f"Bin {i + 1:2d} {bin_label:20s}: "
                f"Ref={self.reference_percents[i]:6.2%} "
                f"Test={self.test_percents[i]:6.2%} "
                f"PSI={self.bin_psi[i]:.4f}"
            )

        return "\n".join(lines)

def compute_psi(
    reference: np.ndarray | pl.Series,
    test: np.ndarray | pl.Series,
    n_bins: int = 10,
    is_categorical: bool = False,
    missing_category_handling: Literal["ignore", "separate", "error"] = "separate",
    psi_threshold_yellow: float = 0.1,
    psi_threshold_red: float = 0.2,
) -> PSIResult:
    """Compute Population Stability Index (PSI) between two distributions.

    PSI measures the distribution shift between a reference dataset (e.g., training)
    and a test dataset (e.g., production). It quantifies how much the distribution
    has changed.

    Formula:
        PSI = Σ (test_% - ref_%) × ln(test_% / ref_%)

    For each bin i:
        PSI_i = (P_test[i] - P_ref[i]) × ln(P_test[i] / P_ref[i])

    Args:
        reference: Reference distribution (e.g., training data)
        test: Test distribution (e.g., production data)
        n_bins: Number of quantile bins for continuous features (default: 10)
        is_categorical: Whether feature is categorical (default: False)
        missing_category_handling: How to handle categories in test not in reference:
            - "ignore": Skip missing categories (not recommended)
            - "separate": Create separate bin for missing categories (default)
            - "error": Raise error if new categories found
        psi_threshold_yellow: Threshold for yellow alert (default: 0.1)
        psi_threshold_red: Threshold for red alert (default: 0.2)

    Returns:
        PSIResult with overall PSI, bin-level contributions, and interpretation

    Raises:
        ValueError: If inputs are invalid or new categories are found with "error" handling

    Example:
        >>> # Continuous feature
        >>> ref = np.random.normal(0, 1, 1000)
        >>> test = np.random.normal(0.5, 1, 1000)  # Mean shifted
        >>> result = compute_psi(ref, test, n_bins=10)
        >>> print(result.summary())
        >>>
        >>> # Categorical feature
        >>> ref_cat = np.array(['A', 'B', 'C'] * 100)
        >>> test_cat = np.array(['A', 'A', 'B'] * 100)  # Distribution changed
        >>> result = compute_psi(ref_cat, test_cat, is_categorical=True)
        >>> print(f"PSI: {result.psi:.4f}, Alert: {result.alert_level}")
    """
    # Convert to numpy arrays
    if isinstance(reference, pl.Series):
        reference = reference.to_numpy()
    if isinstance(test, pl.Series):
        test = test.to_numpy()

    reference = np.asarray(reference)
    test = np.asarray(test)

    # Validate inputs
    if len(reference) == 0 or len(test) == 0:
        raise ValueError("Reference and test arrays must not be empty")

    # Variables with union types for both branches
    bin_labels: np.ndarray | list[str]
    bin_edges: np.ndarray | list[str]

    if not is_categorical:
        # Continuous feature: quantile binning
        bin_edges, ref_counts, test_counts = _bin_continuous(reference, test, n_bins)
        bin_labels = bin_edges  # Will be formatted in summary()
        # Duplicate quantile edges can yield fewer bins than requested,
        # so keep n_bins in sync with the actual bin count.
        n_bins = len(ref_counts)
    else:
        # Categorical feature: category-based binning
        bin_labels, ref_counts, test_counts = _bin_categorical(
            reference, test, missing_category_handling
        )
        bin_edges = bin_labels
        n_bins = len(bin_labels)

    # Convert counts to proportions
    ref_percents = ref_counts / ref_counts.sum()
    test_percents = test_counts / test_counts.sum()

    # Compute PSI per bin with numerical stability:
    # clamp proportions away from zero to avoid log(0) and division by zero
    epsilon = 1e-10
    ref_percents_safe = np.maximum(ref_percents, epsilon)
    test_percents_safe = np.maximum(test_percents, epsilon)

    # PSI formula: (test% - ref%) * ln(test% / ref%)
    bin_psi = (test_percents_safe - ref_percents_safe) * np.log(
        test_percents_safe / ref_percents_safe
    )

    # Total PSI is sum of bin contributions
    psi = float(np.sum(bin_psi))

    # Determine alert level
    alert_level: Literal["green", "yellow", "red"]
    if psi < psi_threshold_yellow:
        alert_level = "green"
        interpretation = (
            f"No significant distribution change detected (PSI={psi:.4f} < {psi_threshold_yellow}). "
            "Feature distribution is stable."
        )
    elif psi < psi_threshold_red:
        alert_level = "yellow"
        interpretation = (
            f"Small distribution change detected ({psi_threshold_yellow} ≤ PSI={psi:.4f} < {psi_threshold_red}). "
            "Monitor feature closely but no immediate action required."
        )
    else:
        alert_level = "red"
        interpretation = (
            f"Significant distribution change detected (PSI={psi:.4f} ≥ {psi_threshold_red}). "
            "Investigate cause and consider model retraining."
        )

    return PSIResult(
        psi=psi,
        bin_psi=bin_psi,
        bin_edges=bin_edges,
        reference_counts=ref_counts,
        test_counts=test_counts,
        reference_percents=ref_percents,
        test_percents=test_percents,
        n_bins=n_bins,
        is_categorical=is_categorical,
        alert_level=alert_level,
        interpretation=interpretation,
    )

def _bin_continuous(
    reference: np.ndarray, test: np.ndarray, n_bins: int
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Bin continuous features using quantiles from the reference distribution.

    Uses quantile binning to ensure roughly equal-sized bins in the reference
    distribution. The test distribution is binned using the same bin edges.

    Args:
        reference: Reference data (used to compute quantiles)
        test: Test data (binned using reference quantiles)
        n_bins: Number of bins

    Returns:
        Tuple of (bin_edges, reference_counts, test_counts)
    """
    # Compute quantiles from the reference distribution:
    # n_bins + 1 edges delimit n_bins bins
    quantiles = np.linspace(0, 100, n_bins + 1)
    bin_edges = np.percentile(reference, quantiles)

    # Ensure edges are unique (handle constant features)
    bin_edges = np.unique(bin_edges)

    # If all values are the same, create a single bin
    if len(bin_edges) == 1:
        return bin_edges, np.array([len(reference)]), np.array([len(test)])

    # Bin both distributions using the same edges;
    # digitize against interior edges so the outermost bins are open-ended
    ref_bins = np.digitize(reference, bin_edges[1:-1])
    test_bins = np.digitize(test, bin_edges[1:-1])

    # Count samples per bin
    ref_counts = np.bincount(ref_bins, minlength=len(bin_edges) - 1)
    test_counts = np.bincount(test_bins, minlength=len(bin_edges) - 1)

    return bin_edges, ref_counts, test_counts


def _bin_categorical(
    reference: np.ndarray,
    test: np.ndarray,
    missing_handling: Literal["ignore", "separate", "error"],
) -> tuple[list[str], np.ndarray, np.ndarray]:
    """Bin categorical features by category labels.

    Args:
        reference: Reference categories
        test: Test categories
        missing_handling: How to handle new categories in test

    Returns:
        Tuple of (category_labels, reference_counts, test_counts)

    Raises:
        ValueError: If new categories are found and missing_handling="error"
    """
    # Get unique categories from reference
    ref_categories = sorted(set(reference))
    test_categories = set(test)

    # Check for new categories in test
    new_categories = test_categories - set(ref_categories)

    if new_categories:
        if missing_handling == "error":
            raise ValueError(
                f"New categories found in test set: {new_categories}. "
                "These categories were not present in the reference distribution."
            )
        elif missing_handling == "separate":
            # Add new categories to the end
            ref_categories.extend(sorted(new_categories))
        # else "ignore": new categories are dropped

    # Count occurrences per category
    ref_counts = np.array([np.sum(reference == cat) for cat in ref_categories])
    test_counts = np.array([np.sum(test == cat) for cat in ref_categories])

    return ref_categories, ref_counts, test_counts
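A quick, hand-checkable usage sketch (illustrative only, not part of the packaged source; the import path is taken from the file listing above):

    import numpy as np
    from ml4t.diagnostic.evaluation.drift.population_stability_index import compute_psi

    # Two categories, reference split 50/50, test split 70/30:
    #   PSI = (0.7 - 0.5) * ln(0.7 / 0.5) + (0.3 - 0.5) * ln(0.3 / 0.5)
    #       ≈ 0.2 * 0.3365 + (-0.2) * (-0.5108) ≈ 0.1695  ->  "yellow" alert
    ref = np.array(["A"] * 50 + ["B"] * 50)
    test = np.array(["A"] * 70 + ["B"] * 30)
    result = compute_psi(ref, test, is_categorical=True)
    print(f"PSI={result.psi:.4f}, alert={result.alert_level}")  # PSI=0.1695, alert=yellow

For continuous features the bin edges are derived from the reference sample, so argument order matters: pass the training-time (baseline) data as reference.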
ml4t/diagnostic/evaluation/drift/wasserstein.py

@@ -0,0 +1,388 @@
"""Wasserstein distance for continuous distribution drift detection.

The Wasserstein distance (Earth Mover's Distance) measures the minimum cost
to transform one probability distribution into another.

Properties:
- True metric: non-negative, symmetric, triangle inequality
- More sensitive to small shifts than PSI
- Natural interpretation as "transport cost"
- No binning artifacts

References:
- Villani, C. (2009). Optimal Transport: Old and New. Springer.
- Ramdas, A., et al. (2017). On Wasserstein Two-Sample Testing.
  Entropy, 19(2), 47.
"""

from __future__ import annotations

import time
from dataclasses import dataclass
from typing import Any

import numpy as np
import polars as pl
from scipy.stats import wasserstein_distance

@dataclass
class WassersteinResult:
    """Result of Wasserstein distance calculation.

    The Wasserstein distance (also called Earth Mover's Distance) measures the
    minimum "cost" to transform one distribution into another. It's a true metric
    and doesn't require binning, making it ideal for continuous features.

    Attributes:
        distance: Wasserstein distance value (W_p)
        p: Order of Wasserstein distance (1 or 2)
        threshold: Calibrated threshold from permutation test (if calibrated)
        p_value: Statistical significance p-value (if calibrated)
        drifted: Whether drift was detected (distance > threshold)
        n_reference: Number of samples in reference distribution
        n_test: Number of samples in test distribution
        reference_stats: Summary statistics of reference distribution
        test_stats: Summary statistics of test distribution
        threshold_calibration_config: Configuration used for threshold calibration
        interpretation: Human-readable interpretation
        computation_time: Time taken to compute (seconds)
    """

    distance: float
    p: int
    threshold: float | None
    p_value: float | None
    drifted: bool
    n_reference: int
    n_test: int
    reference_stats: dict[str, float]
    test_stats: dict[str, float]
    threshold_calibration_config: dict[str, Any] | None
    interpretation: str
    computation_time: float

    def summary(self) -> str:
        """Return formatted summary of Wasserstein distance results."""
        lines = [
            "Wasserstein Distance Drift Detection Report",
            "=" * 60,
            f"Wasserstein-{self.p} Distance: {self.distance:.6f}",
            f"Drift Detected: {'YES' if self.drifted else 'NO'}",
            "",
            "Sample Sizes:",
            f"  Reference: {self.n_reference:,}",
            f"  Test: {self.n_test:,}",
            "",
        ]

        if self.threshold is not None:
            lines.extend(
                [
                    "Threshold Calibration:",
                    f"  Threshold: {self.threshold:.6f}",
                    # Compare against None explicitly so a p-value of 0.0 still prints
                    (
                        f"  P-value: {self.p_value:.4f}"
                        if self.p_value is not None
                        else "  P-value: N/A"
                    ),
                    f"  Config: {self.threshold_calibration_config}",
                    "",
                ]
            )

        lines.extend(
            [
                "Distribution Statistics:",
                "-" * 60,
                f"Reference: Mean={self.reference_stats['mean']:.4f}, "
                f"Std={self.reference_stats['std']:.4f}, "
                f"Min={self.reference_stats['min']:.4f}, "
                f"Max={self.reference_stats['max']:.4f}",
                f"Test: Mean={self.test_stats['mean']:.4f}, "
                f"Std={self.test_stats['std']:.4f}, "
                f"Min={self.test_stats['min']:.4f}, "
                f"Max={self.test_stats['max']:.4f}",
                "",
                f"Interpretation: {self.interpretation}",
                "",
                f"Computation Time: {self.computation_time:.3f}s",
            ]
        )

        return "\n".join(lines)

def compute_wasserstein_distance(
    reference: np.ndarray | pl.Series,
    test: np.ndarray | pl.Series,
    p: int = 1,
    threshold_calibration: bool = True,
    n_permutations: int = 1000,
    alpha: float = 0.05,
    n_samples: int | None = None,
    random_state: int | None = None,
) -> WassersteinResult:
    """Compute Wasserstein distance between reference and test distributions.

    The Wasserstein distance (Earth Mover's Distance) measures the minimum cost
    to transform one probability distribution into another. Unlike PSI, it doesn't
    require binning and provides a true metric with desirable properties:
    - Metric properties: non-negative, symmetric, triangle inequality
    - More sensitive to small shifts than PSI
    - Natural interpretation as "transport cost"
    - No binning artifacts

    The p-Wasserstein distance is defined as:
        W_p(P, Q) = (∫|F_P^{-1}(u) - F_Q^{-1}(u)|^p du)^{1/p}

    For empirical distributions with sorted samples x_1 ≤ ... ≤ x_n:
        W_1(P, Q) = (1/n) Σ|x_i^P - x_i^Q|

    Threshold calibration uses a permutation test:
        H0: reference and test come from the same distribution
        H1: distributions differ

    Args:
        reference: Reference distribution (e.g., training data)
        test: Test distribution (e.g., production data)
        p: Order of Wasserstein distance (1 or 2). Default: 1
            - p=1: More robust, easier to interpret
            - p=2: More sensitive to tail differences
        threshold_calibration: Whether to calibrate threshold via permutation test
        n_permutations: Number of permutations for threshold calibration
        alpha: Significance level for threshold (default: 0.05)
        n_samples: Subsample to this many samples if provided (for large datasets)
        random_state: Random seed for reproducibility

    Returns:
        WassersteinResult with distance, threshold, p-value, and interpretation

    Raises:
        ValueError: If inputs are invalid or p not in {1, 2}

    Example:
        >>> # Detect mean shift
        >>> ref = np.random.normal(0, 1, 1000)
        >>> test = np.random.normal(0.5, 1, 1000)  # Mean shifted by 0.5
        >>> result = compute_wasserstein_distance(ref, test)
        >>> print(result.summary())
        >>>
        >>> # Detect variance shift
        >>> test_var = np.random.normal(0, 2, 1000)  # Std doubled
        >>> result = compute_wasserstein_distance(ref, test_var)
        >>> print(f"Distance: {result.distance:.4f}, Drifted: {result.drifted}")
        >>>
        >>> # Without threshold calibration (faster)
        >>> result = compute_wasserstein_distance(
        ...     ref, test, threshold_calibration=False
        ... )
    """
    start_time = time.time()

    # Convert to numpy arrays
    if isinstance(reference, pl.Series):
        reference = reference.to_numpy()
    if isinstance(test, pl.Series):
        test = test.to_numpy()

    reference = np.asarray(reference, dtype=np.float64)
    test = np.asarray(test, dtype=np.float64)

    # Validate inputs
    if len(reference) == 0 or len(test) == 0:
        raise ValueError("Reference and test arrays must not be empty")

    if p not in [1, 2]:
        raise ValueError(f"p must be 1 or 2, got {p}")

    # Set random state
    if random_state is not None:
        np.random.seed(random_state)

    # Subsample if requested
    if n_samples is not None and len(reference) > n_samples:
        indices_ref = np.random.choice(len(reference), n_samples, replace=False)
        reference = reference[indices_ref]
    if n_samples is not None and len(test) > n_samples:
        indices_test = np.random.choice(len(test), n_samples, replace=False)
        test = test[indices_test]

    n_reference = len(reference)
    n_test = len(test)

    # Compute distribution statistics
    reference_stats = {
        "mean": float(np.mean(reference)),
        "std": float(np.std(reference)),
        "min": float(np.min(reference)),
        "max": float(np.max(reference)),
        "median": float(np.median(reference)),
        "q25": float(np.percentile(reference, 25)),
        "q75": float(np.percentile(reference, 75)),
    }

    test_stats = {
        "mean": float(np.mean(test)),
        "std": float(np.std(test)),
        "min": float(np.min(test)),
        "max": float(np.max(test)),
        "median": float(np.median(test)),
        "q25": float(np.percentile(test, 25)),
        "q75": float(np.percentile(test, 75)),
    }

    # Compute Wasserstein distance
    if p == 1:
        distance = float(wasserstein_distance(reference, test))
    else:  # p == 2
        # scipy's wasserstein_distance computes W_1;
        # for W_2, compute it manually from sorted samples
        distance = _wasserstein_2(reference, test)

    # Threshold calibration via permutation test
    threshold = None
    p_value = None
    calibration_config = None

    if threshold_calibration:
        threshold, p_value = _calibrate_wasserstein_threshold(
            reference, test, distance, n_permutations, alpha, p
        )
        calibration_config = {
            "n_permutations": n_permutations,
            "alpha": alpha,
            "method": "permutation",
        }

    # Determine drift status
    if threshold is not None:
        drifted = distance > threshold
    else:
        # Without calibration, use a heuristic based on distribution statistics:
        # drift if distance > 0.5 * std of reference
        drifted = distance > 0.5 * reference_stats["std"]
        threshold = 0.5 * reference_stats["std"]

    # Generate interpretation
    if drifted:
        if p_value is not None:
            interpretation = (
                f"Distribution drift detected (W_{p}={distance:.6f} > {threshold:.6f}, "
                f"p={p_value:.4f}). The test distribution differs significantly from "
                f"the reference distribution."
            )
        else:
            interpretation = (
                f"Distribution drift detected (W_{p}={distance:.6f} > {threshold:.6f}). "
                f"The test distribution differs from the reference distribution."
            )
    else:
        if p_value is not None:
            interpretation = (
                f"No significant drift detected (W_{p}={distance:.6f} ≤ {threshold:.6f}, "
                f"p={p_value:.4f}). Distributions are consistent."
            )
        else:
            interpretation = (
                f"No significant drift detected (W_{p}={distance:.6f} ≤ {threshold:.6f}). "
                "Distributions are consistent."
            )

    computation_time = time.time() - start_time

    return WassersteinResult(
        distance=distance,
        p=p,
        threshold=threshold,
        p_value=p_value,
        drifted=drifted,
        n_reference=n_reference,
        n_test=n_test,
        reference_stats=reference_stats,
        test_stats=test_stats,
        threshold_calibration_config=calibration_config,
        interpretation=interpretation,
        computation_time=computation_time,
    )

def _wasserstein_2(u_values: np.ndarray, v_values: np.ndarray) -> float:
    """Compute Wasserstein-2 distance between two 1D distributions.

        W_2(P, Q) = sqrt(∫|F_P^{-1}(u) - F_Q^{-1}(u)|^2 du)

    For empirical distributions, this is computed as:
        W_2 = sqrt((1/n) Σ(x_i - y_i)^2)
    where x, y are the sorted samples.

    Args:
        u_values: First distribution samples
        v_values: Second distribution samples

    Returns:
        Wasserstein-2 distance
    """
    u_sorted = np.sort(u_values)
    v_sorted = np.sort(v_values)

    # Align both samples to a common length via linear interpolation
    # of the empirical quantile functions
    n = min(len(u_sorted), len(v_sorted))
    u_quantiles = np.interp(np.linspace(0, 1, n), np.linspace(0, 1, len(u_sorted)), u_sorted)
    v_quantiles = np.interp(np.linspace(0, 1, n), np.linspace(0, 1, len(v_sorted)), v_sorted)

    # Compute L2 distance between the quantile functions
    return float(np.sqrt(np.mean((u_quantiles - v_quantiles) ** 2)))

def _calibrate_wasserstein_threshold(
    reference: np.ndarray,
    test: np.ndarray,
    observed_distance: float,
    n_permutations: int,
    alpha: float,
    p: int,
) -> tuple[float, float]:
    """Calibrate Wasserstein distance threshold via permutation test.

    Tests the null hypothesis that reference and test come from the same
    distribution by computing the null distribution of Wasserstein distances
    under random permutations.

        H0: P_ref = P_test (no drift)
        H1: P_ref ≠ P_test (drift detected)

    Args:
        reference: Reference distribution samples
        test: Test distribution samples
        observed_distance: Observed Wasserstein distance
        n_permutations: Number of permutations
        alpha: Significance level
        p: Order of Wasserstein distance

    Returns:
        Tuple of (threshold, p_value)
        - threshold: (1 - alpha) quantile of null distribution
        - p_value: Fraction of null distances >= observed
    """
    # Pool all samples
    pooled = np.concatenate([reference, test])
    n_ref = len(reference)

    # Compute null distribution
    null_distances = np.zeros(n_permutations)

    for i in range(n_permutations):
        # Random permutation
        np.random.shuffle(pooled)

        # Split into two groups of the original sizes
        ref_perm = pooled[:n_ref]
        test_perm = pooled[n_ref:]

        # Compute distance under the null
        if p == 1:
            null_distances[i] = wasserstein_distance(ref_perm, test_perm)
        else:  # p == 2
            null_distances[i] = _wasserstein_2(ref_perm, test_perm)

    # Compute threshold as the (1 - alpha) quantile of the null distribution
    threshold = float(np.percentile(null_distances, (1 - alpha) * 100))

    # Compute p-value: fraction of null distances at least as large as observed
    p_value = float(np.mean(null_distances >= observed_distance))

    return threshold, p_value
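A matching usage sketch for the Wasserstein detector (illustrative only, not part of the packaged source; the import path is taken from the file listing above). For a pure location shift, W_1 equals the size of the shift, which makes the output easy to sanity-check:

    import numpy as np
    from ml4t.diagnostic.evaluation.drift.wasserstein import compute_wasserstein_distance

    ref = np.random.normal(0.0, 1.0, 5_000)
    test = ref + 0.5  # pure location shift, so the true W_1 is exactly 0.5
    result = compute_wasserstein_distance(
        ref, test, p=1, n_permutations=200, random_state=42
    )
    print(f"W_1={result.distance:.3f}, drifted={result.drifted}, p={result.p_value:.4f}")
    # W_1=0.500, drifted=True, p=0.0000

Since the permutation test recomputes the distance n_permutations times, reducing n_permutations or subsampling via n_samples is the main runtime lever on large arrays.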