tanml 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tanml might be problematic.
- tanml/__init__.py +1 -1
- tanml/check_runners/cleaning_repro_runner.py +2 -2
- tanml/check_runners/correlation_runner.py +49 -12
- tanml/check_runners/explainability_runner.py +12 -22
- tanml/check_runners/logistic_stats_runner.py +196 -17
- tanml/check_runners/performance_runner.py +82 -26
- tanml/check_runners/raw_data_runner.py +29 -14
- tanml/check_runners/regression_metrics_runner.py +195 -0
- tanml/check_runners/stress_test_runner.py +23 -6
- tanml/check_runners/vif_runner.py +33 -27
- tanml/checks/correlation.py +241 -41
- tanml/checks/explainability/shap_check.py +261 -29
- tanml/checks/logit_stats.py +186 -54
- tanml/checks/performance_classification.py +305 -0
- tanml/checks/raw_data.py +58 -23
- tanml/checks/regression_metrics.py +167 -0
- tanml/checks/stress_test.py +157 -53
- tanml/cli/main.py +99 -27
- tanml/engine/check_agent_registry.py +20 -10
- tanml/engine/core_engine_agent.py +199 -37
- tanml/models/registry.py +329 -0
- tanml/report/report_builder.py +1180 -147
- tanml/report/templates/report_template_cls.docx +0 -0
- tanml/report/templates/report_template_reg.docx +0 -0
- tanml/ui/app.py +1205 -0
- tanml/utils/data_loader.py +105 -15
- tanml-0.1.7.dist-info/METADATA +164 -0
- tanml-0.1.7.dist-info/RECORD +54 -0
- tanml/cli/arg_parser.py +0 -31
- tanml/cli/init_cmd.py +0 -8
- tanml/cli/validate_cmd.py +0 -7
- tanml/config_templates/rules_multiple_models_datasets.yaml +0 -144
- tanml/config_templates/rules_one_dataset_segment_column.yaml +0 -140
- tanml/config_templates/rules_one_model_one_dataset.yaml +0 -143
- tanml/engine/segmentation_agent.py +0 -118
- tanml/engine/validation_agent.py +0 -91
- tanml/report/templates/report_template.docx +0 -0
- tanml/utils/model_loader.py +0 -35
- tanml/utils/r_loader.py +0 -30
- tanml/utils/sas_loader.py +0 -50
- tanml/utils/yaml_generator.py +0 -34
- tanml/utils/yaml_loader.py +0 -5
- tanml/validate.py +0 -209
- tanml-0.1.6.dist-info/METADATA +0 -317
- tanml-0.1.6.dist-info/RECORD +0 -62
- {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/WHEEL +0 -0
- {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/entry_points.txt +0 -0
- {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/top_level.txt +0 -0
tanml/check_runners/regression_metrics_runner.py
ADDED
@@ -0,0 +1,195 @@
+# tanml/check_runners/regression_metrics_runner.py
+from __future__ import annotations
+
+import os
+from typing import Any, Dict, Optional
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from tanml.checks.regression_metrics import RegressionMetricsCheck
+
+try:
+    from scipy import stats as _scipy_stats
+    _HAS_SCIPY = True
+except Exception:
+    _HAS_SCIPY = False
+
+
+# ---------- utils ----------
+def _ensure_outdir(config: Dict[str, Any]) -> str:
+    base = (config.get("options") or {}).get("save_artifacts_dir") or "reports"
+    outdir = os.path.join(base, "regression_metrics")
+    os.makedirs(outdir, exist_ok=True)
+    return outdir
+
+
+def _to_1d(x: Any) -> np.ndarray:
+    return np.asarray(x).reshape(-1)
+
+
+def _residuals(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
+    return y_true - y_pred
+
+
+def _plot_pred_vs_actual(y_true: np.ndarray, y_pred: np.ndarray, save_path: str) -> str:
+    plt.figure()
+    plt.scatter(y_true, y_pred, s=12, alpha=0.75)
+    mn = float(min(np.min(y_true), np.min(y_pred)))
+    mx = float(max(np.max(y_true), np.max(y_pred)))
+    plt.plot([mn, mx], [mn, mx])  # reference y=x
+    plt.xlabel("Actual")
+    plt.ylabel("Predicted")
+    plt.title("Predicted vs Actual")
+    plt.tight_layout()
+    plt.savefig(save_path, dpi=160)
+    plt.close()
+    return save_path
+
+
+def _plot_residuals_vs_pred(y_pred: np.ndarray, resid: np.ndarray, save_path: str) -> str:
+    plt.figure()
+    plt.scatter(y_pred, resid, s=12, alpha=0.75)
+    plt.axhline(0.0)
+    plt.xlabel("Predicted")
+    plt.ylabel("Residual (y_true - y_pred)")
+    plt.title("Residuals vs Predicted")
+    plt.tight_layout()
+    plt.savefig(save_path, dpi=160)
+    plt.close()
+    return save_path
+
+
+def _plot_residual_hist(resid: np.ndarray, save_path: str) -> str:
+    plt.figure()
+    plt.hist(resid, bins=30, alpha=0.9)
+    plt.xlabel("Residual")
+    plt.ylabel("Count")
+    plt.title("Residual Distribution")
+    plt.tight_layout()
+    plt.savefig(save_path, dpi=160)
+    plt.close()
+    return save_path
+
+
+def _plot_qq(resid: np.ndarray, save_path: str) -> str:
+    osm, osr = _scipy_stats.probplot(resid, dist="norm", fit=False)
+    plt.figure()
+    plt.scatter(osm, osr, s=12, alpha=0.8)
+    mn = float(min(np.min(osm), np.min(osr)))
+    mx = float(max(np.max(osm), np.max(osr)))
+    plt.plot([mn, mx], [mn, mx])
+    plt.xlabel("Theoretical Quantiles (Normal)")
+    plt.ylabel("Ordered Residuals")
+    plt.title("Residuals Q–Q Plot")
+    plt.tight_layout()
+    plt.savefig(save_path, dpi=160)
+    plt.close()
+    return save_path
+
+
+def _plot_abs_error_box(abs_err: np.ndarray, save_path: str) -> str:
+    plt.figure()
+    plt.boxplot(abs_err, vert=True, showfliers=True)
+    plt.ylabel("|Residual|")
+    plt.title("Absolute Error — Box Plot")
+    plt.tight_layout()
+    plt.savefig(save_path, dpi=160)
+    plt.close()
+    return save_path
+
+
+def _plot_abs_error_violin(abs_err: np.ndarray, save_path: str) -> str:
+    plt.figure()
+    plt.violinplot(abs_err, showmeans=True, showmedians=True)
+    plt.ylabel("|Residual|")
+    plt.title("Absolute Error — Violin Plot")
+    plt.tight_layout()
+    plt.savefig(save_path, dpi=160)
+    plt.close()
+    return save_path
+
+
+def RegressionMetricsCheckRunner(
+    model: Any,
+    X_train: Any,
+    X_test: Any,
+    y_train: Any,
+    y_test: Any,
+    config: Dict[str, Any],
+    cleaned_df: Optional[Any] = None,
+    raw_df: Optional[Any] = None,
+    ctx: Optional[Dict[str, Any]] = None,
+) -> Dict[str, Any]:
+    """
+    1) Predict on X_test
+    2) Compute frozen regression metrics
+    3) Save 5 standard charts (Q–Q skipped if SciPy missing)
+    4) Return structured results for engine/report
+    """
+    # 1) predictions
+    try:
+        y_pred = model.predict(X_test)
+    except Exception as e:
+        raise RuntimeError(f"Model prediction failed in RegressionMetricsCheckRunner: {e}")
+
+    y_true = _to_1d(y_test)
+    y_pred = _to_1d(y_pred)
+
+    # n_features for Adjusted R²
+    try:
+        n_features = int(getattr(X_train, "shape", [None, None])[1])
+    except Exception:
+        n_features = None
+
+    # 2) metrics
+    chk = RegressionMetricsCheck(
+        y_true=y_true,
+        y_pred=y_pred,
+        n_features=n_features,
+        config=(config or {}),
+    )
+    metrics = chk.run()
+
+    # 3) plots
+    outdir = _ensure_outdir(config)
+    resid = _residuals(y_true, y_pred)
+    abs_err = np.abs(resid)
+
+    p1 = os.path.join(outdir, "pred_vs_actual.png")
+    p2 = os.path.join(outdir, "residuals_vs_pred.png")
+    p3 = os.path.join(outdir, "residual_hist.png")
+    p4 = os.path.join(outdir, "qq_plot.png")
+    b1 = os.path.join(outdir, "abs_error_box.png")
+    v1 = os.path.join(outdir, "abs_error_violin.png")
+
+    try:
+        _plot_pred_vs_actual(y_true, y_pred, p1)
+        _plot_residuals_vs_pred(y_pred, resid, p2)
+        _plot_residual_hist(resid, p3)
+
+        if _HAS_SCIPY:
+            _plot_qq(resid, p4)
+        else:
+            (metrics.get("notes") or []).append("Q–Q plot skipped: SciPy not available.")
+            p4 = None
+
+        _plot_abs_error_box(abs_err, b1)
+        _plot_abs_error_violin(abs_err, v1)
+    except Exception as e:
+        (metrics.get("notes") or []).append(f"Plotting failed: {e}")
+
+    # 4) return
+    return {
+        "RegressionMetrics": {
+            **metrics,
+            "artifacts": {
+                "pred_vs_actual": p1,
+                "residuals_vs_pred": p2,
+                "residual_hist": p3,
+                "qq_plot": p4,
+                "abs_error_box": b1,
+                "abs_error_violin": v1,
+            },
+        }
+    }
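For context, a minimal standalone sketch of how this new runner could be exercised outside the TanML engine, assuming scikit-learn is installed and that calling the runner directly with a fitted estimator is supported; the metric keys inside the returned "RegressionMetrics" block come from RegressionMetricsCheck, which is not shown in this diff, so only the "artifacts" key is relied on below.

import numpy as np
from sklearn.linear_model import LinearRegression
from tanml.check_runners.regression_metrics_runner import RegressionMetricsCheckRunner

# Toy regression data: y is a noisy linear function of three features
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = X @ np.array([3.0, -1.0, 0.5]) + rng.normal(scale=0.1, size=200)

model = LinearRegression().fit(X[:150], y[:150])

result = RegressionMetricsCheckRunner(
    model=model,
    X_train=X[:150], X_test=X[150:],
    y_train=y[:150], y_test=y[150:],
    config={"options": {"save_artifacts_dir": "reports"}},  # charts land in reports/regression_metrics/
)

block = result["RegressionMetrics"]
print(block["artifacts"]["pred_vs_actual"])  # path to the saved Predicted vs Actual chart

The chart paths mirror _ensure_outdir above: everything is written under <save_artifacts_dir>/regression_metrics/, falling back to reports/ when no directory is configured.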
tanml/check_runners/stress_test_runner.py
CHANGED
@@ -1,26 +1,43 @@
+from __future__ import annotations
+from typing import Any, Dict
+import pandas as pd
 from tanml.checks.stress_test import StressTestCheck

 def run_stress_test_check(model, X_train, X_test, y_train, y_test, rule_config, cleaned_df, *args, **kwargs):
-    cfg = rule_config.get("StressTestCheck", {})
+    cfg = (rule_config or {}).get("StressTestCheck", {})
     if not cfg.get("enabled", True):
         print("ℹ️ Skipping StressTestCheck (disabled in rules.yaml)")
         return {"StressTestCheck": {"skipped": True}}

     try:
-        epsilon = cfg.get("epsilon", 0.01)
+        epsilon = cfg.get("epsilon", 0.01)
         perturb_fraction = cfg.get("perturb_fraction", 0.2)

-
+
+        cols_test = getattr(X_test, "columns", None)
+        cols_train = getattr(X_train, "columns", None)
+
+        if cols_test is not None:
+            columns = list(cols_test)
+        elif cols_train is not None:
+            columns = list(cols_train)
+        else:
+            columns = None
+
+        X_test_df = pd.DataFrame(X_test, columns=columns)
+
+        checker = StressTestCheck(model, X_test_df, y_test, epsilon, perturb_fraction)
         result = checker.run()

-        # Ensure output is always a dictionary
         if isinstance(result, list):
-
+            table = result
         elif hasattr(result, "to_dict"):
-
+            table = result.to_dict(orient="records")
         else:
             return {"StressTestCheck": {"output": result}}

+        return {"StressTestCheck": {"table": table}}
+
     except Exception as e:
         print(f"⚠️ StressTestCheck failed: {e}")
         return {"StressTestCheck": {"error": str(e)}}
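A small sketch of how the updated runner might be driven with an in-memory config, assuming a fitted scikit-learn classifier; the shape of the per-feature results depends on StressTestCheck, whose internals are not part of this diff, and the epsilon / perturb_fraction values below are purely illustrative.

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from tanml.check_runners.stress_test_runner import run_stress_test_check

rng = np.random.default_rng(1)
X = pd.DataFrame(rng.normal(size=(300, 4)), columns=["f1", "f2", "f3", "f4"])
y = (X["f1"] + X["f2"] > 0).astype(int)

model = LogisticRegression().fit(X.iloc[:200], y.iloc[:200])

rule_config = {
    "StressTestCheck": {
        "enabled": True,
        "epsilon": 0.05,          # size of the perturbation
        "perturb_fraction": 0.2,  # share of rows perturbed
    }
}

out = run_stress_test_check(
    model, X.iloc[:200], X.iloc[200:], y.iloc[:200], y.iloc[200:], rule_config, None
)
print(out["StressTestCheck"])  # {"table": [...]} on success, {"error": ...} otherwise

Because the runner now coerces X_test to a DataFrame (reusing X_test or X_train column names when available), plain NumPy arrays should also be accepted.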
tanml/check_runners/vif_runner.py
CHANGED
@@ -1,27 +1,40 @@
 # tanml/check_runners/vif_runner.py
+from __future__ import annotations

-
-import pandas as pd
+import os
 from pathlib import Path
+import pandas as pd
+
+from tanml.utils.data_loader import load_dataframe
+from tanml.checks.vif import VIFCheck
+

 def VIFCheckRunner(
     model, X_train, X_test, y_train, y_test,
     rule_config, cleaned_df, *args, **kwargs
 ):
-
-
-
-
-
-
-
-
+    """
+    Ensure cleaned_df is a DataFrame; if a path (csv/xlsx/parquet/etc.),
+    load it via the universal loader, then run VIFCheck.
+    """
+    # 1) Normalize cleaned_df to a DataFrame
+    try:
+        if isinstance(cleaned_df, (str, bytes, os.PathLike, Path)):
+            cleaned_df = load_dataframe(cleaned_df)
+        elif not isinstance(cleaned_df, pd.DataFrame):
+            msg = "cleaned_df is not a DataFrame or loadable path; skipping VIF."
+            print(f"ℹ️ {msg}")
+            return {"vif_table": [], "high_vif_features": [], "error": msg}
+    except Exception as e:
+        err = f"Could not load cleaned_df: {e}"
+        print(f"⚠️ {err}")
+        return {"vif_table": [], "high_vif_features": [], "error": err}

+    # 2) Run the check
     try:
         check = VIFCheck(model, X_train, X_test, y_train, y_test, rule_config, cleaned_df)
-        result = check.run()
-
-        # Normalize result regardless of format
+        result = check.run()
+        # 3) Normalize result
         if isinstance(result, dict) and "vif_table" in result:
             vif_rows = result["vif_table"]
         elif isinstance(result, list):
@@ -29,25 +42,18 @@ def VIFCheckRunner(
         else:
             raise ValueError("Unexpected VIFCheck return shape")

-        #
+        # 4) Canonicalize keys and values
         for row in vif_rows:
             if "Feature" not in row and "feature" in row:
                 row["Feature"] = row.pop("feature")
-
+            if "VIF" in row and row["VIF"] is not None:
+                row["VIF"] = round(float(row["VIF"]), 2)

-        # Identify high VIF features
+        # 5) Identify high VIF features
         threshold = rule_config.get("vif_threshold", 5)
-        high_vif = [
-
-
-        ]
-
-        # Return final output
-        return {
-            "vif_table": vif_rows,
-            "high_vif_features": high_vif,
-            "error": None,
-        }
+        high_vif = [r["Feature"] for r in vif_rows if r.get("VIF") is not None and r["VIF"] > threshold]
+
+        return {"vif_table": vif_rows, "high_vif_features": high_vif, "error": None}

     except Exception as e:
         print(f"⚠️ VIFCheck failed: {e}")
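A brief sketch of the path-or-DataFrame handling the rewritten runner adds, assuming VIFCheck computes one VIF value per feature of cleaned_df and can tolerate None for the model and split arguments (neither assumption is verifiable from this diff); the threshold key it reads is "vif_threshold", defaulting to 5 as shown above.

import numpy as np
import pandas as pd
from tanml.check_runners.vif_runner import VIFCheckRunner

rng = np.random.default_rng(2)
df = pd.DataFrame({"x1": rng.normal(size=500), "x2": rng.normal(size=500)})
df["x3"] = 0.9 * df["x1"] + rng.normal(scale=0.1, size=500)  # nearly collinear with x1

# cleaned_df passed as an in-memory DataFrame ...
res = VIFCheckRunner(None, None, None, None, None, {"vif_threshold": 5}, df)
print(res["vif_table"], res["high_vif_features"])

# ... or as a file path, which the runner now loads via load_dataframe()
df.to_csv("cleaned.csv", index=False)
res2 = VIFCheckRunner(None, None, None, None, None, {"vif_threshold": 5}, "cleaned.csv")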
tanml/checks/correlation.py
CHANGED
@@ -1,61 +1,261 @@
-
+# checks/correlation.py
+from __future__ import annotations
+
+import os
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import numpy as np
 import pandas as pd
-import seaborn as sns
 import matplotlib.pyplot as plt
-
+
+try:
+    from scipy.cluster.hierarchy import linkage, leaves_list
+    from scipy.spatial.distance import squareform
+    _HAS_SCIPY = True
+except Exception:
+    _HAS_SCIPY = False
+
+from .base import BaseCheck
+
+
+DEFAULT_CFG = {
+    "method": "pearson",                  # "pearson" | "spearman"
+    "high_corr_threshold": 0.80,          # |r| >= threshold flagged
+    "top_pairs_max": 200,                 # rows in the "main" table CSV
+    "heatmap_max_features_default": 20,   # default plotted features
+    "heatmap_max_features_limit": 60,     # max allowed via UI/slider
+    "subset_strategy": "cluster",         # "cluster" | "degree"
+    "sample_rows": 150_000,               # downsample for speed on huge data
+    "seed": 42,
+    "save_csv": True,
+    "save_fig": True,
+    "appendix_csv_cap": None,             # None = no cap; or int (e.g., 5000)
+}
+
+
+def _ensure_dir(p: Path) -> None:
+    p.mkdir(parents=True, exist_ok=True)
+
+
+def _numeric_columns(df: pd.DataFrame) -> List[str]:
+    return [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
+
+
+def _drop_constant_columns(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
+    keep, dropped = [], []
+    for c in df.columns:
+        s = df[c]
+        if s.dropna().nunique() <= 1:
+            dropped.append(c)
+        else:
+            keep.append(c)
+    return df[keep], dropped
+
+
+def _subset_by_degree(corr_abs: pd.DataFrame, max_feats: int) -> List[str]:
+    if corr_abs.shape[0] <= max_feats:
+        return list(corr_abs.index)
+    scores = corr_abs.sum().sort_values(ascending=False)
+    return list(scores.head(max_feats).index)
+
+
+def _subset_by_cluster(corr_abs: pd.DataFrame, max_feats: int) -> List[str]:
+    if corr_abs.shape[0] <= max_feats:
+        return list(corr_abs.index)
+    if not _HAS_SCIPY:
+        return _subset_by_degree(corr_abs, max_feats)
+    # distance = 1 - |corr|
+    dist = 1.0 - corr_abs
+    dist = (dist + dist.T) / 2.0
+    np.fill_diagonal(dist.values, 0.0)
+    Z = linkage(squareform(dist.values, checks=False), method="average")
+    order = leaves_list(Z)
+    ordered = corr_abs.index[order]
+    step = max(1, len(ordered) // max_feats)
+    return list(ordered[::step][:max_feats])
+
+
+def _render_heatmap(corr: pd.DataFrame, out_path: Path, title: str) -> None:
+    fig, ax = plt.subplots(figsize=(10, 8))
+    im = ax.imshow(corr.values, vmin=-1, vmax=1)
+    ax.set_xticks(range(corr.shape[1]))
+    ax.set_yticks(range(corr.shape[0]))
+    ax.set_xticklabels(corr.columns, rotation=90, fontsize=6)
+    ax.set_yticklabels(corr.index, fontsize=6)
+    ax.set_title(title)
+    cbar = plt.colorbar(im, ax=ax)
+    cbar.set_label("Correlation")
+    fig.tight_layout()
+    fig.savefig(out_path, dpi=200)
+    plt.close(fig)
+

 class CorrelationCheck(BaseCheck):
-
-
-
-
-
+    """
+    Numeric-only correlation analysis:
+      • Pearson or Spearman (pairwise complete obs)
+      • Heatmap on ≤20 features by default (clustered subset up to 60 max)
+      • CSV of high-correlation pairs (|r| ≥ threshold), sorted by |r|
+      • Handles constant/all-NA columns, optional sampling for speed
+    """
+
+    def __init__(
+        self,
+        cleaned_data: pd.DataFrame,
+        cfg: Dict | None = None,
+        output_dir: str = "reports/correlation",
+    ):
         self.cleaned_data = cleaned_data
+        self.cfg = {**DEFAULT_CFG, **(cfg or {})}
         self.output_dir = output_dir
         os.makedirs(self.output_dir, exist_ok=True)

+    def _top_corr_pairs(self, corr: pd.DataFrame, thr: float) -> pd.DataFrame:
+        a = corr.copy()
+        np.fill_diagonal(a.values, np.nan)
+        s = a.stack().reset_index()
+        s.columns = ["feature_i", "feature_j", "corr"]
+        s = s.dropna()
+        # remove duplicate symmetric pairs
+        s["pair"] = s.apply(lambda r: tuple(sorted([r["feature_i"], r["feature_j"]])), axis=1)
+        s = s.drop_duplicates(subset=["pair"]).drop(columns=["pair"])
+        s["abs_corr"] = s["corr"].abs()
+        s = s[s["abs_corr"] >= thr].sort_values("abs_corr", ascending=False)
+        return s
+
     def run(self):
-
-
+        cfg = self.cfg
+        method = cfg["method"]
+        thr = float(cfg["high_corr_threshold"])
+        top_cap = int(cfg["top_pairs_max"])
+        default_cap = int(cfg["heatmap_max_features_default"])
+        max_cap = int(cfg["heatmap_max_features_limit"])
+        subset_strategy = cfg["subset_strategy"]
+        sample_rows = int(cfg["sample_rows"])
+        seed = int(cfg["seed"])
+        save_csv = bool(cfg["save_csv"])
+        save_fig = bool(cfg["save_fig"])
+        appendix_cap = cfg.get("appendix_csv_cap", None)
+
+        # ===== 1) Select numeric & (optional) sample rows =====
+        X = self.cleaned_data.copy()
+        if len(X) > sample_rows:
+            X = X.sample(sample_rows, random_state=seed)
+
+        num_cols = _numeric_columns(X)
+        X = X[num_cols]
+        X, dropped_constants = _drop_constant_columns(X)

-        if
-
+        if X.shape[1] < 2:
+            msg = "⚠️ Not enough numeric features for correlation."
+            print(msg)
             return {
                 "pearson_csv": None,
                 "spearman_csv": None,
                 "heatmap_path": None,
+                "top_pairs_csv": None,
+                "summary": {"n_numeric_features": X.shape[1]},
+                "notes": [msg, f"Dropped constant/all-NA columns: {dropped_constants}"] if dropped_constants else [msg],
                 "error": "Not enough numeric features for correlation",
             }

-        #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # ===== 2) Correlation matrix =====
+        # Compute both; pick one to drive plotting/threshold logic
+        corr_pearson = X.corr(method="pearson")
+        corr_spearman = X.corr(method="spearman")
+        corr = corr_pearson if method == "pearson" else corr_spearman
+        corr_abs = corr.abs()
+
+        # ===== 3) High-correlation pairs CSV =====
+        pairs = self._top_corr_pairs(corr, thr)
+        # augment with pairwise n_used and feature missingness %
+        non_null_counts = X.notna().sum()
+        total_rows = len(X)
+        if not pairs.empty:
+            pairs["n_used"] = pairs.apply(
+                lambda r: X[[r["feature_i"], r["feature_j"]]].dropna().shape[0], axis=1
+            )
+            pairs["pct_missing_i"] = pairs.apply(
+                lambda r: 1 - non_null_counts[r["feature_i"]] / total_rows, axis=1
+            )
+            pairs["pct_missing_j"] = pairs.apply(
+                lambda r: 1 - non_null_counts[r["feature_j"]] / total_rows, axis=1
+            )
+
+        artifacts: Dict[str, str] = {}
+        outdir = Path(self.output_dir)
+        _ensure_dir(outdir)
+
+        # Save full correlation matrices (if enabled)
+        pearson_csv_path = outdir / "pearson_corr.csv"
+        spearman_csv_path = outdir / "spearman_corr.csv"
+        if save_csv:
+            corr_pearson.to_csv(pearson_csv_path, index=True)
+            corr_spearman.to_csv(spearman_csv_path, index=True)
+
+        # Save top-pairs CSVs (main + full/appendix)
+        if save_csv:
+            full_csv = outdir / "correlation_top_pairs.csv"
+            if appendix_cap is not None:
+                pairs.head(int(appendix_cap)).to_csv(full_csv, index=False)
+            else:
+                pairs.to_csv(full_csv, index=False)
+            artifacts["top_pairs_csv"] = str(full_csv)
+
+            main_csv = outdir / "correlation_top_pairs_main.csv"
+            pairs.head(top_cap).to_csv(main_csv, index=False)
+            artifacts["top_pairs_main_csv"] = str(main_csv)
+
+        # ===== 4) Adaptive heatmap =====
+        n_features_total = X.shape[1]
+        plotted_full_matrix = n_features_total <= default_cap
+
+        if not plotted_full_matrix:
+            cap = min(max_cap, n_features_total)
+            if subset_strategy == "cluster" and _HAS_SCIPY:
+                subset = _subset_by_cluster(corr_abs, cap)
+            else:
+                subset = _subset_by_degree(corr_abs, cap)
+            corr_plot = corr.loc[subset, subset]
+            title = f"Correlation Heatmap ({method}) — {len(subset)}/{n_features_total} features (subset)"
+        else:
+            corr_plot = corr
+            title = f"Correlation Heatmap ({method}) — full matrix ({n_features_total} features)"
+
+        heatmap_path = None
+        if save_fig:
+            heatmap_path = outdir / "heatmap.png"
+            _render_heatmap(corr_plot, heatmap_path, title)
+            artifacts["heatmap_path"] = str(heatmap_path)
+
+        # ===== 5) Summary/notes =====
+        n_pairs_total = n_features_total * (n_features_total - 1) // 2
+        n_pairs_flagged = int(pairs.shape[0]) if not pairs.empty else 0
+        notes = []
+        if dropped_constants:
+            notes.append(f"Dropped constant/all-NA columns: {sorted(dropped_constants)}")
+        if len(self.cleaned_data) > sample_rows:
+            notes.append(f"Computed on a {sample_rows}-row sample (seed={seed}).")
+        if not plotted_full_matrix:
+            notes.append(
+                f"Heatmap shows a subset ({corr_plot.shape[0]}/{n_features_total}); see CSV for full list of pairs."
+            )

         return {
-            "pearson_csv":
-            "spearman_csv":
-            "heatmap_path": heatmap_path,
+            "pearson_csv": str(pearson_csv_path) if save_csv else None,
+            "spearman_csv": str(spearman_csv_path) if save_csv else None,
+            "heatmap_path": str(heatmap_path) if heatmap_path else None,
+            "top_pairs_csv": artifacts.get("top_pairs_csv"),
+            "top_pairs_main_csv": artifacts.get("top_pairs_main_csv"),
+            "summary": {
+                "n_numeric_features": int(n_features_total),
+                "n_pairs_total": int(n_pairs_total),
+                "n_pairs_flagged_ge_threshold": int(n_pairs_flagged),
+                "threshold": float(thr),
+                "method": method,
+                "plotted_features": int(corr_plot.shape[0]),
+                "plotted_full_matrix": bool(plotted_full_matrix),
+            },
+            "notes": notes,
         }
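Finally, a short sketch of invoking the rewritten check directly, assuming only pandas, numpy and matplotlib are present (SciPy merely enables the "cluster" subset strategy); the column names and the Spearman override below are illustrative.

import numpy as np
import pandas as pd
from tanml.checks.correlation import CorrelationCheck

rng = np.random.default_rng(3)
df = pd.DataFrame(rng.normal(size=(1_000, 5)), columns=list("abcde"))
df["f"] = 0.95 * df["a"] + rng.normal(scale=0.05, size=1_000)  # one strongly correlated pair

check = CorrelationCheck(
    df,
    cfg={"method": "spearman", "high_corr_threshold": 0.9},  # overrides merged onto DEFAULT_CFG
    output_dir="reports/correlation",
)
result = check.run()

print(result["summary"]["n_pairs_flagged_ge_threshold"])  # pairs with |r| >= 0.9
print(result["heatmap_path"], result["top_pairs_csv"])

Because overrides are merged onto DEFAULT_CFG, any key omitted from cfg keeps its default, so a plain CorrelationCheck(df) run uses Pearson, the 0.80 threshold, and the 20-feature heatmap cap.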