model-eval-toolkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
evalreport/__init__.py ADDED
@@ -0,0 +1,28 @@
1
"""Public API for the evalreport package.

Re-exports the per-task report classes and the ``generate_report``
entry point so everything can be imported from the top level.
"""

from .core.entrypoints import generate_report
from .classification.report import ClassificationReport
from .regression.report import RegressionReport
from .__version__ import __version__
from .clustering.report import ClusteringReport
from .timeseries.report import TimeSeriesReport
from .nlp.text_classification import TextClassificationReport
from .nlp.text_generation import TextGenerationReport
from .vision.segmentation import SegmentationReport
from .vision.detection import DetectionReport
from .vision.image_classification import ImageClassificationReport
from .ranking.report import RankingReport

# Explicit public surface; keep in sync with the imports above.
__all__ = [
    "generate_report",
    "__version__",
    "ClassificationReport",
    "RegressionReport",
    "ClusteringReport",
    "TimeSeriesReport",
    "TextClassificationReport",
    "TextGenerationReport",
    "SegmentationReport",
    "DetectionReport",
    "ImageClassificationReport",
    "RankingReport",
]
28
+
@@ -0,0 +1,2 @@
1
# Package version string; re-exported from evalreport/__init__.py.
__version__ = "0.1.0"
2
+
@@ -0,0 +1,4 @@
1
"""Classification subpackage; exposes :class:`ClassificationReport`."""

from .report import ClassificationReport

__all__ = ["ClassificationReport"]
4
+
@@ -0,0 +1,319 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Any, Iterable, List, Optional, Sequence
6
+
7
+ import matplotlib
8
+ matplotlib.use("Agg")
9
+ import matplotlib.pyplot as plt
10
+ import seaborn as sns
11
+ import numpy as np
12
+ from sklearn.metrics import (
13
+ accuracy_score,
14
+ cohen_kappa_score,
15
+ confusion_matrix,
16
+ f1_score,
17
+ log_loss,
18
+ matthews_corrcoef,
19
+ precision_score,
20
+ recall_score,
21
+ roc_auc_score,
22
+ average_precision_score,
23
+ roc_curve,
24
+ precision_recall_curve,
25
+ )
26
+
27
+ from ..core.base_report import BaseReport
28
+
29
+
30
+ def _as_array(x: Optional[Iterable[Any]]) -> Optional[np.ndarray]:
31
+ if x is None:
32
+ return None
33
+ arr = np.asarray(list(x))
34
+ return arr
35
+
36
+
37
+ def _safe_float(x: Any) -> Any:
38
+ try:
39
+ if isinstance(x, (np.floating, np.integer)):
40
+ return x.item()
41
+ if isinstance(x, float) and (np.isnan(x) or np.isinf(x)):
42
+ return None
43
+ return float(x)
44
+ except Exception:
45
+ return x
46
+
47
+
48
@dataclass
class ClassificationReport(BaseReport):
    """Evaluation report for single-label classification.

    Computes accuracy / precision / recall / F1 (micro, macro, weighted),
    MCC, Cohen's kappa, probabilistic metrics when ``y_prob`` is supplied,
    renders confusion-matrix / ROC / PR plots, and derives simple insights.
    """

    # Ground-truth labels.
    y_true: Optional[Iterable[Any]] = None
    # Predicted labels.
    y_pred: Optional[Iterable[Any]] = None
    # Predicted scores: shape (n,) positive-class probabilities for binary,
    # or (n, n_classes) per-class probabilities.
    y_prob: Optional[Iterable[Any]] = None
    # Optional explicit class ordering; assumed to match y_prob columns.
    labels: Optional[Sequence[Any]] = None

    @staticmethod
    def _auc_scores(y_prob: np.ndarray) -> np.ndarray:
        """Reduce a probability array to the score array for AUC metrics.

        A two-column binary array is reduced to its positive-class column
        (column 1 by convention); 1-D binary scores and multiclass arrays
        pass through unchanged. Previously this logic was duplicated inline
        for both ROC-AUC and PR-AUC.
        """
        if y_prob.ndim == 2 and y_prob.shape[1] == 2:
            return y_prob[:, 1]
        return y_prob

    def _compute_metrics(self) -> None:
        """Populate ``self.metrics``.

        Raises:
            ValueError: if ``y_true`` or ``y_pred`` is missing.
        """
        y_true = _as_array(self.y_true)
        y_pred = _as_array(self.y_pred)
        if y_true is None or y_pred is None:
            raise ValueError("ClassificationReport requires y_true and y_pred.")

        self.metrics["accuracy"] = _safe_float(accuracy_score(y_true, y_pred))

        # zero_division=0 avoids warnings/NaNs for classes with no predictions.
        for avg in ("micro", "macro", "weighted"):
            self.metrics[f"precision_{avg}"] = _safe_float(
                precision_score(y_true, y_pred, average=avg, zero_division=0)
            )
            self.metrics[f"recall_{avg}"] = _safe_float(
                recall_score(y_true, y_pred, average=avg, zero_division=0)
            )
            self.metrics[f"f1_{avg}"] = _safe_float(
                f1_score(y_true, y_pred, average=avg, zero_division=0)
            )

        # Extras
        self.metrics["mcc"] = _safe_float(matthews_corrcoef(y_true, y_pred))
        try:
            self.metrics["cohen_kappa"] = _safe_float(cohen_kappa_score(y_true, y_pred))
        except Exception:
            self.metrics["cohen_kappa"] = None

        # Probabilistic metrics (best-effort; entries may end up None)
        y_prob = _as_array(self.y_prob)
        if y_prob is not None:
            try:
                self.metrics["log_loss"] = _safe_float(log_loss(y_true, y_prob, labels=self.labels))
            except Exception:
                self.metrics["log_loss"] = None

            y_score = self._auc_scores(y_prob)

            try:
                # 2-D scores here mean a true multiclass problem -> one-vs-rest;
                # "raise" keeps sklearn's strict binary behavior for 1-D scores.
                self.metrics["roc_auc"] = _safe_float(
                    roc_auc_score(
                        y_true,
                        y_score,
                        multi_class="ovr" if y_score.ndim == 2 else "raise",
                    )
                )
            except Exception:
                self.metrics["roc_auc"] = None

            try:
                # average_precision_score supports binary directly; for
                # multiclass score matrices "macro" averages over classes.
                # (The original code had a dead ternary choosing "macro" in
                # both branches.)
                self.metrics["pr_auc"] = _safe_float(
                    average_precision_score(y_true, y_score, average="macro")
                )
            except Exception:
                self.metrics["pr_auc"] = None

        # Confusion matrix as a small, JSON-serializable payload
        try:
            cm = confusion_matrix(y_true, y_pred, labels=self.labels)
            self.metrics["confusion_matrix"] = cm.tolist()
        except Exception:
            self.metrics["confusion_matrix"] = None

    def _generate_plots(self) -> None:
        """Render plot PNGs under ``<output_dir>/evalreport_plots``.

        Stores path strings in ``self.plots``. Each plot is best-effort:
        failures are swallowed so the report still builds without figures.
        """
        y_true = _as_array(self.y_true)
        y_pred = _as_array(self.y_pred)
        if y_true is None or y_pred is None:
            self.plots = {}
            return

        root = self.output_dir or Path("reports")
        plot_dir = root / "evalreport_plots"
        plot_dir.mkdir(parents=True, exist_ok=True)
        plots: dict[str, str] = {}

        # Confusion matrix heatmap
        try:
            labels = self.labels
            if labels is None:
                labels = list(np.unique(np.concatenate([y_true, y_pred])))
            cm = confusion_matrix(y_true, y_pred, labels=labels)
            plt.figure(figsize=(4, 3))
            sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
            plt.xlabel("Predicted")
            plt.ylabel("True")
            plt.title("Confusion Matrix")
            path = plot_dir / "classification_confusion_matrix.png"
            plt.tight_layout()
            plt.savefig(path)
            plt.close()
            plots["confusion_matrix"] = str(path)
        except Exception:
            pass

        # ROC and PR curves with probabilities
        y_prob = _as_array(self.y_prob)
        if y_prob is not None:
            try:
                y_prob_arr = y_prob
                n_classes = len(np.unique(y_true))
                # Determine class ordering for column mapping:
                # - if `labels` provided, assume columns follow that order
                # - else assume columns follow sorted unique labels from y_true
                class_order = list(self.labels) if self.labels is not None else sorted(np.unique(y_true).tolist())

                # handle probability input shape
                y_score_for_binary = None
                if y_prob_arr.ndim == 1:
                    # binary: P(positive class)
                    y_score_for_binary = y_prob_arr
                elif y_prob_arr.ndim == 2:
                    # For binary, accept both (n,2) and (n,) variants.
                    if n_classes == 2 and y_prob_arr.shape[1] == 2:
                        # Use column 1 as "positive" by convention
                        y_score_for_binary = y_prob_arr[:, 1]

                # Binary case ------------------------------------------------
                if n_classes == 2 and y_score_for_binary is not None:
                    # ROC
                    fpr, tpr, _ = roc_curve(y_true, y_score_for_binary)
                    plt.figure(figsize=(4, 3))
                    plt.plot(fpr, tpr, label="ROC curve")
                    plt.plot([0, 1], [0, 1], "k--", label="Random")
                    plt.xlabel("False Positive Rate")
                    plt.ylabel("True Positive Rate")
                    plt.title("ROC Curve (binary)")
                    plt.legend()
                    path = plot_dir / "classification_roc_curve.png"
                    plt.tight_layout()
                    plt.savefig(path)
                    plt.close()
                    plots["roc_curve"] = str(path)

                    # PR
                    prec, rec, _ = precision_recall_curve(y_true, y_score_for_binary)
                    plt.figure(figsize=(4, 3))
                    plt.plot(rec, prec, label="PR curve")
                    plt.xlabel("Recall")
                    plt.ylabel("Precision")
                    plt.title("Precision-Recall Curve (binary)")
                    plt.legend()
                    path = plot_dir / "classification_pr_curve.png"
                    plt.tight_layout()
                    plt.savefig(path)
                    plt.close()
                    plots["pr_curve"] = str(path)

                # Multiclass case ------------------------------------------
                if n_classes > 2 and y_prob_arr.ndim == 2:
                    # Only proceed if columns match the class order we will use.
                    if y_prob_arr.shape[1] == len(class_order):
                        # One-vs-rest ROC
                        plt.figure(figsize=(5, 4))
                        for col_idx, cls in enumerate(class_order):
                            y_bin = (y_true == cls).astype(int)
                            fpr, tpr, _ = roc_curve(y_bin, y_prob_arr[:, col_idx])
                            plt.plot(fpr, tpr, linewidth=2, label=str(cls))
                        plt.plot([0, 1], [0, 1], "k--", linewidth=1, label="Random")
                        plt.xlabel("False Positive Rate")
                        plt.ylabel("True Positive Rate")
                        plt.title("ROC Curve (multiclass, one-vs-rest)")
                        plt.legend(title="Class", fontsize=8)
                        path = plot_dir / "classification_roc_curve_multiclass.png"
                        plt.tight_layout()
                        plt.savefig(path)
                        plt.close()
                        plots["roc_curve_multiclass"] = str(path)

                        # One-vs-rest PR
                        plt.figure(figsize=(5, 4))
                        for col_idx, cls in enumerate(class_order):
                            y_bin = (y_true == cls).astype(int)
                            prec, rec, _ = precision_recall_curve(y_bin, y_prob_arr[:, col_idx])
                            plt.plot(rec, prec, linewidth=2, label=str(cls))
                        plt.xlabel("Recall")
                        plt.ylabel("Precision")
                        plt.title("Precision-Recall Curve (multiclass, one-vs-rest)")
                        plt.legend(title="Class", fontsize=8)
                        path = plot_dir / "classification_pr_curve_multiclass.png"
                        plt.tight_layout()
                        plt.savefig(path)
                        plt.close()
                        plots["pr_curve_multiclass"] = str(path)
            except Exception:
                pass

        self.plots = plots

    def _generate_insights(self) -> None:
        """Populate ``self.insights`` and ``self.metric_descriptions``.

        Insights are simple heuristics: class-imbalance detection on y_true
        and the single most frequent off-diagonal confusion. No-op when
        y_true/y_pred are missing.
        """
        y_true = _as_array(self.y_true)
        y_pred = _as_array(self.y_pred)
        if y_true is None or y_pred is None:
            return

        insights: List[str] = []

        # Class imbalance detection (simple heuristic: max/min count >= 5)
        try:
            values, counts = np.unique(y_true, return_counts=True)
            if len(counts) > 1:
                ratio = counts.max() / max(1, counts.min())
                if ratio >= 5:
                    minority = values[np.argmin(counts)]
                    majority = values[np.argmax(counts)]
                    insights.append(
                        f"Class imbalance detected (majority={majority!r}, minority={minority!r}, ratio≈{ratio:.1f})."
                    )
        except Exception:
            pass

        # Misclassification trends: top confusion off-diagonal
        try:
            labels = self.labels
            if labels is None:
                labels = list(np.unique(np.concatenate([y_true, y_pred])))
            cm = confusion_matrix(y_true, y_pred, labels=labels)
            cm_off = cm.copy()
            np.fill_diagonal(cm_off, 0)
            if cm_off.sum() > 0:
                i, j = np.unravel_index(np.argmax(cm_off), cm_off.shape)
                if cm_off[i, j] > 0:
                    insights.append(
                        f"Most common confusion: true={labels[i]!r} predicted={labels[j]!r} ({int(cm_off[i, j])} samples)."
                    )
        except Exception:
            pass

        self.insights = insights

        # Descriptions for key metrics shown in HTML/PDF
        self.metric_descriptions.update(
            {
                "accuracy": "Overall fraction of correct predictions.",
                "precision_micro": "Precision aggregated over all classes (micro-average).",
                "recall_micro": "Recall aggregated over all classes (micro-average).",
                "f1_micro": "F1 score aggregated over all classes (micro-average).",
                "precision_macro": "Unweighted mean of per-class precision.",
                "recall_macro": "Unweighted mean of per-class recall.",
                "f1_macro": "Unweighted mean of per-class F1 score.",
                "precision_weighted": "Precision averaged over classes, weighted by support.",
                "recall_weighted": "Recall averaged over classes, weighted by support.",
                "f1_weighted": "F1 averaged over classes, weighted by support.",
                "mcc": "Matthews correlation coefficient; balanced measure even under class imbalance.",
                "cohen_kappa": "Cohen’s kappa; agreement between predictions and truth beyond chance.",
                "log_loss": "Logarithmic loss; lower values indicate better calibrated probabilities.",
                "roc_auc": "Area under the ROC curve; trade-off between TPR and FPR.",
                "pr_auc": "Area under the precision–recall curve; useful for imbalanced data.",
                "confusion_matrix": "Counts of predictions vs true labels for each class pair.",
            }
        )
319
+
@@ -0,0 +1,4 @@
1
"""Clustering subpackage; exposes :class:`ClusteringReport`."""

from .report import ClusteringReport

__all__ = ["ClusteringReport"]
4
+
@@ -0,0 +1,174 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Any, Iterable, List, Optional, Sequence
6
+
7
+ import matplotlib
8
+
9
+ matplotlib.use("Agg")
10
+ import matplotlib.pyplot as plt
11
+ import numpy as np
12
+ import seaborn as sns
13
+ from sklearn.cluster import KMeans
14
+ from sklearn.decomposition import PCA
15
+ from sklearn.metrics import (
16
+ calinski_harabasz_score,
17
+ davies_bouldin_score,
18
+ silhouette_score,
19
+ )
20
+
21
+ from ..core.base_report import BaseReport
22
+
23
+
24
+ def _as_2d_array(x: Optional[Iterable[Any]]) -> Optional[np.ndarray]:
25
+ if x is None:
26
+ return None
27
+ arr = np.asarray(list(x))
28
+ if arr.ndim == 1:
29
+ arr = arr.reshape(-1, 1)
30
+ return arr
31
+
32
+
33
+ def _as_array(x: Optional[Iterable[Any]]) -> Optional[np.ndarray]:
34
+ if x is None:
35
+ return None
36
+ return np.asarray(list(x))
37
+
38
+
39
@dataclass
class ClusteringReport(BaseReport):
    """Evaluation report for clustering assignments.

    Scores a clustering (silhouette, Davies-Bouldin, Calinski-Harabasz),
    plots a PCA projection and the cluster-size distribution, and derives
    simple imbalance/separability insights.
    """

    # Feature matrix; a 1-D input is treated as a single feature column.
    X: Optional[Iterable[Any]] = None
    # Cluster assignment per sample.
    labels: Optional[Iterable[Any]] = None
    # NOTE(review): the original comment claimed a KMeans model is fitted
    # from n_clusters, but neither n_clusters nor the KMeans import is used
    # anywhere in this class. Field kept for interface compatibility.
    n_clusters: Optional[int] = None
    # Seed forwarded to the PCA projection used for plotting.
    random_state: int = 0

    def _compute_metrics(self) -> None:
        """Populate ``self.metrics``.

        Raises:
            ValueError: if ``X`` or ``labels`` is missing.
        """
        X = _as_2d_array(self.X)
        labels = _as_array(self.labels)
        if X is None or labels is None:
            raise ValueError("ClusteringReport requires X and labels (cluster assignments).")

        unique = np.unique(labels)
        if unique.size < 2:
            # All three scores are undefined for fewer than two clusters.
            self.metrics.update(
                {
                    "silhouette_score": None,
                    "davies_bouldin_index": None,
                    "calinski_harabasz_score": None,
                }
            )
        else:
            self.metrics["silhouette_score"] = float(silhouette_score(X, labels))
            self.metrics["davies_bouldin_index"] = float(davies_bouldin_score(X, labels))
            self.metrics["calinski_harabasz_score"] = float(calinski_harabasz_score(X, labels))

        self.metrics["num_clusters"] = int(unique.size)
        # Cluster size distribution as a JSON-friendly {label: count} mapping.
        vals, counts = np.unique(labels, return_counts=True)
        self.metrics["cluster_sizes"] = {str(v): int(c) for v, c in zip(vals, counts)}

    def _generate_plots(self) -> None:
        """Render PCA scatter and cluster-size plots under
        ``<output_dir>/evalreport_plots``; each plot is best-effort."""
        X = _as_2d_array(self.X)
        labels = _as_array(self.labels)
        if X is None or labels is None:
            self.plots = {}
            return

        root = self.output_dir or Path("reports")
        plot_dir = root / "evalreport_plots"
        plot_dir.mkdir(parents=True, exist_ok=True)

        plots: dict[str, str] = {}

        # Scatter (PCA projection to 2D; degrades to 1D for tiny inputs)
        try:
            pca_dim = 2 if X.shape[1] >= 2 and X.shape[0] >= 3 else 1
            # n_components cannot exceed min(n_features, n_samples - 1).
            pca_dim = min(pca_dim, X.shape[1], max(1, X.shape[0] - 1))
            pca = PCA(n_components=pca_dim, random_state=self.random_state)
            X2 = pca.fit_transform(X)

            plt.figure(figsize=(5, 4))
            if X2.shape[1] == 1:
                plt.scatter(X2[:, 0], np.zeros_like(X2[:, 0]), c=labels, cmap="tab10", alpha=0.8)
            else:
                plt.scatter(X2[:, 0], X2[:, 1], c=labels, cmap="tab10", alpha=0.8)
            plt.title("Cluster scatter (PCA)")
            plt.xlabel("PC1")
            plt.ylabel("PC2" if X2.shape[1] > 1 else "")
            path = plot_dir / "clustering_scatter_pca.png"
            plt.tight_layout()
            plt.savefig(path)
            plt.close()
            plots["cluster_scatter_pca"] = str(path)
        except Exception:
            pass

        # Cluster size distribution
        try:
            vals, counts = np.unique(labels, return_counts=True)
            plt.figure(figsize=(5, 3.5))
            sns.barplot(x=[str(v) for v in vals], y=counts, color="#4C78A8")
            plt.xlabel("Cluster")
            plt.ylabel("Count")
            plt.title("Cluster size distribution")
            for i, c in enumerate(counts):
                plt.text(i, c, str(int(c)), ha="center", va="bottom", fontsize=8)
            path = plot_dir / "clustering_cluster_sizes.png"
            plt.tight_layout()
            plt.savefig(path)
            plt.close()
            plots["cluster_size_distribution"] = str(path)
        except Exception:
            pass

        self.plots = plots

    def _generate_insights(self) -> None:
        """Populate ``self.insights`` and ``self.metric_descriptions``.

        Heuristics: cluster imbalance (max/min size ratio >= 5), silhouette
        separability thresholds, and a high Davies-Bouldin warning.
        """
        labels = _as_array(self.labels)
        if labels is None:
            self.insights = []
            return

        insights: List[str] = []
        unique, counts = np.unique(labels, return_counts=True)
        if unique.size >= 2:
            ratio = float(counts.max()) / float(max(1, counts.min()))
            if ratio >= 5:
                # report most/least dominant clusters
                maj = unique[np.argmax(counts)]
                min_label = unique[np.argmin(counts)]
                insights.append(f"Cluster imbalance detected (majority={maj!r}, minority={min_label!r}, ratio≈{ratio:.1f}).")

        # Separability heuristics using silhouette.
        # isinstance already rules out None, so the previous explicit
        # "is not None" checks here and below were redundant.
        sil = self.metrics.get("silhouette_score")
        if isinstance(sil, (int, float)):
            if sil < 0.25:
                insights.append("Clusters overlap significantly (low silhouette). Consider revisiting features, scaling, or k.")
            elif sil > 0.5:
                insights.append("Clusters appear well separated (high silhouette).")

        # Davies-Bouldin: lower is better
        dbi = self.metrics.get("davies_bouldin_index")
        if isinstance(dbi, (int, float)) and dbi > 1.0:
            insights.append("Higher Davies–Bouldin suggests clusters may be less distinct; inspect overlaps.")

        self.insights = insights
        # Human-readable explanations
        self.metric_descriptions.update(
            {
                "silhouette_score": "How well points fit their own cluster vs other clusters (higher is better).",
                "davies_bouldin_index": "Average similarity between clusters (lower is better).",
                "calinski_harabasz_score": "Variance ratio criterion (higher suggests clearer separation).",
                "num_clusters": "Number of unique clusters in the provided assignments.",
                "cluster_sizes": "Counts per cluster; helps detect cluster imbalance.",
            }
        )
174
+