validmind 2.5.25__py3-none-any.whl → 2.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.8.dist-info/METADATA +137 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.25.dist-info/METADATA +0 -118
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/LICENSE +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/WHEEL +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/entry_points.txt +0 -0
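The most visible change in this release is the test API itself: the class-based `ThresholdTest`/`Metric` machinery under `validmind/vm_models/test/` is removed, and tests such as `MinimumF1Score`, `MinimumROCAUCScore`, `ModelsPerformanceComparison`, and `OverfitDiagnosis` are rewritten as plain functions decorated with `@tags`/`@tasks` that take `VMDataset`/`VMModel` inputs and return their tables, figures, and pass/fail result directly (see the hunks below). The sketch that follows only illustrates that shape; the test name, metric, and threshold are made up for illustration and are not the package's actual implementation of any test.

```python
from sklearn.metrics import accuracy_score

from validmind.tests import tags, tasks
from validmind.vm_models import VMDataset, VMModel


@tags("sklearn", "binary_classification", "model_performance")
@tasks("classification")
def MinimumAccuracyExample(dataset: VMDataset, model: VMModel, min_threshold: float = 0.7):
    """Hypothetical threshold test written in the new functional style."""
    # Inputs are resolved by the test runner; the function just computes and returns.
    score = accuracy_score(dataset.y, dataset.y_pred(model))

    # The return value replaces the old ResultSummary/cache_results plumbing:
    # a table (list of dicts) plus a pass/fail boolean.
    return [
        {
            "Score": score,
            "Threshold": min_threshold,
            "Pass/Fail": "Pass" if score > min_threshold else "Fail",
        }
    ], score > min_threshold
```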
validmind/tests/model_validation/sklearn/MinimumF1Score.py

```diff
@@ -2,24 +2,18 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
-from typing import List
-
-import pandas as pd
-from numpy import unique
-from sklearn import metrics
-
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
+import numpy as np
+from sklearn.metrics import f1_score
+
+from validmind.tests import tags, tasks
+from validmind.vm_models import VMDataset, VMModel


-@dataclass
-class MinimumF1Score(ThresholdTest):
+@tags(
+    "sklearn", "binary_classification", "multiclass_classification", "model_performance"
+)
+@tasks("classification", "text_classification")
+def MinimumF1Score(dataset: VMDataset, model: VMModel, min_threshold: float = 0.5):
     """
     Assesses if the model's F1 score on the validation set meets a predefined minimum threshold, ensuring balanced
     performance between precision and recall.
@@ -59,59 +53,15 @@ class MinimumF1Score(ThresholdTest):
     closely with specific requirements.
     """

-    ...
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        """
-        The f1 score test returns results like these:
-        [{"values": {"score": 0.734375, "threshold": 0.7}, "passed": true}]
-        """
-        result = results[0]
-        results_table = [
-            {
-                "Score": result.values["score"],
-                "Threshold": result.values["threshold"],
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-        ]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=pd.DataFrame(results_table),
-                    metadata=ResultTableMetadata(title="Minimum F1 Score Test"),
-                )
-            ]
-        )
-
-    def run(self):
-        y_true = self.inputs.dataset.y
-        class_pred = self.inputs.dataset.y_pred(self.inputs.model)
-        y_true = y_true.astype(class_pred.dtype)
-
-        if len(unique(y_true)) > 2:
-            f1_score = metrics.f1_score(y_true, class_pred, average="macro")
-        else:
-            f1_score = metrics.f1_score(y_true, class_pred)
-
-        passed = f1_score > self.params["min_threshold"]
-        results = [
-            ThresholdTestResult(
-                passed=passed,
-                values={
-                    "score": f1_score,
-                    "threshold": self.params["min_threshold"],
-                },
-            )
-        ]
-
-        return self.cache_results(results, passed=all([r.passed for r in results]))
+    if len(np.unique(dataset.y)) > 2:
+        score = f1_score(dataset.y, dataset.y_pred(model), average="macro")
+    else:
+        score = f1_score(dataset.y, dataset.y_pred(model))
+
+    return [
+        {
+            "Score": score,
+            "Threshold": min_threshold,
+            "Pass/Fail": "Pass" if score > min_threshold else "Fail",
+        }
+    ], score > min_threshold
```
validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py

```diff
@@ -2,24 +2,19 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
-from typing import List
-
 import numpy as np
-import pandas as pd
-from sklearn import metrics
-
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
+from sklearn.metrics import roc_auc_score
+from sklearn.preprocessing import LabelBinarizer
+
+from validmind.tests import tags, tasks
+from validmind.vm_models import VMDataset, VMModel


-@dataclass
-class MinimumROCAUCScore(ThresholdTest):
+@tags(
+    "sklearn", "binary_classification", "multiclass_classification", "model_performance"
+)
+@tasks("classification", "text_classification")
+def MinimumROCAUCScore(dataset: VMDataset, model: VMModel, min_threshold: float = 0.5):
     """
     Validates model by checking if the ROC AUC score meets or surpasses a specified threshold.

@@ -61,69 +56,25 @@ class MinimumROCAUCScore(ThresholdTest):
     - The use of macro average for multiclass ROC AUC score implies equal weightage to each class, which might not be
     appropriate if the classes are imbalanced.
     """
+    y_true = dataset.y
+
+    if len(np.unique(y_true)) > 2:
+        lb = LabelBinarizer()
+        lb.fit(y_true)

-    ...
-    tags = [
-        "sklearn",
-        "binary_classification",
-        "multiclass_classification",
-        "model_performance",
-    ]
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        """
-        The roc auc score test returns results like these:
-        [{"values": {"score": 0.734375, "threshold": 0.7}, "passed": true}]
-        """
-        result = results[0]
-        results_table = [
-            {
-                "Score": result.values["score"],
-                "Threshold": result.values["threshold"],
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-        ]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=pd.DataFrame(results_table),
-                    metadata=ResultTableMetadata(title="Minimum ROC AUC Score Test"),
-                )
-            ]
+        roc_auc = roc_auc_score(
+            y_true=lb.transform(y_true),
+            y_score=lb.transform(dataset.y_pred(model)),
+            average="macro",
         )

-        ...
-        if len(np.unique(y_true)) > 2:
-            class_pred = self.inputs.dataset.y_pred(self.inputs.model)
-            y_true = y_true.astype(class_pred.dtype)
-            roc_auc = self.multiclass_roc_auc_score(y_true, class_pred)
-        else:
-            y_prob = self.inputs.dataset.y_prob(self.inputs.model)
-            y_true = y_true.astype(y_prob.dtype).flatten()
-            roc_auc = metrics.roc_auc_score(y_true, y_prob)
-
-        passed = roc_auc > self.params["min_threshold"]
-        results = [
-            ThresholdTestResult(
-                passed=passed,
-                values={
-                    "score": roc_auc,
-                    "threshold": self.params["min_threshold"],
-                },
-            )
-        ]
-
-        return self.cache_results(results, passed=all([r.passed for r in results]))
+    else:
+        roc_auc = roc_auc_score(y_true=y_true, y_score=dataset.y_prob(model))
+
+    return [
+        {
+            "Score": roc_auc,
+            "Threshold": min_threshold,
+            "Pass/Fail": "Pass" if roc_auc > min_threshold else "Fail",
+        }
+    ], roc_auc > min_threshold
```
validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py

```diff
@@ -2,19 +2,24 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
-
-from numpy import unique
+import numpy as np
 from sklearn.metrics import classification_report

-from validmind.errors import SkipTestError
-from validmind.vm_models import ResultSummary, ResultTable, ResultTableMetadata
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel

-from .ClassifierPerformance import ClassifierPerformance
+from .ClassifierPerformance import multiclass_roc_auc_score


-@dataclass
-class ModelsPerformanceComparison(ClassifierPerformance):
+@tags(
+    "sklearn",
+    "binary_classification",
+    "multiclass_classification",
+    "model_performance",
+    "model_comparison",
+)
+@tasks("classification", "text_classification")
+def ModelsPerformanceComparison(dataset: VMDataset, models: list[VMModel]):
     """
     Evaluates and compares the performance of multiple Machine Learning models using various metrics like accuracy,
     precision, recall, and F1 score.
@@ -57,84 +62,49 @@ class ModelsPerformanceComparison(ClassifierPerformance):
     with unseen data or changes in the data distribution.
     - The ROC AUC score might not be as meaningful or easily interpretable for multilabel/multiclass tasks.
     """
+    y_true = dataset.y
+    classes = {str(i) for i in np.unique(y_true)}
+
+    prf_table = []
+    acc_roc_auc_table = []
+
+    for model in models:
+        y_pred = dataset.y_pred(model)

-    ...
-    tasks = ["classification", "text_classification"]
-    tags = [
-        "sklearn",
-        "binary_classification",
-        "multiclass_classification",
-        "model_performance",
-        "model_comparison",
-    ]
-
-    def summary(self, metric_value: dict):
-        """
-        This summary varies depending if we're evaluating a binary or multi-class model
-        """
-        results = []
-        prf_table = []
-        classes = {str(i) for i in unique(self.inputs.dataset.y)}
+        report = classification_report(y_true, y_pred, output_dict=True)
+        report["roc_auc"] = multiclass_roc_auc_score(y_true, y_pred)

         for class_name in classes:
-            ...
-        avg_metrics = ["weighted avg", "macro avg"]
-        for class_name in avg_metrics:
-            avg_dict = {}
-            avg_dict["Class"] = class_name
-            for m, _ in metric_value.items():
-                avg_dict[f"Precision- {m}"] = metric_value[m][class_name]["precision"]
-                avg_dict[f"Recall- {m}"] = metric_value[m][class_name]["recall"]
-                avg_dict[f"F1- {m}"] = metric_value[m][class_name]["f1-score"]
-            prf_table.append(avg_dict)
-        results.append(
-            ResultTable(
-                data=prf_table,
-                metadata=ResultTableMetadata(
-                    title="Precision, Recall, and F1 Comparison"
-                ),
+            prf_table.append(
+                {
+                    "Model": model.input_id,
+                    "Class": class_name,
+                    "Precision": report[class_name]["precision"],
+                    "Recall": report[class_name]["recall"],
+                    "F1-Score": report[class_name]["f1-score"],
+                }
             )
-            ...
-            acc_roc_auc_table.append(acc_roc_auc_dict)
-        results.append(
-            ResultTable(
-                data=acc_roc_auc_table,
-                metadata=ResultTableMetadata(title="Accuracy and ROC AUC Comparison"),
+        for avg_metric in ["weighted avg", "macro avg"]:
+            prf_table.append(
+                {
+                    "Model": model.input_id,
+                    "Class": avg_metric,
+                    "Precision": report[avg_metric]["precision"],
+                    "Recall": report[avg_metric]["recall"],
+                    "F1-Score": report[avg_metric]["f1-score"],
+                }
             )
-        )
-        return ResultSummary(results=results)
-
-    def run(self):
-        # Check models list is not empty
-        if not self.inputs.models:
-            raise SkipTestError(
-                "List of models must be provided as a `models` parameter to compare performance"
-            )
-
-        all_models = self.inputs.models

-        ...
+        for metric in ["accuracy", "roc_auc"]:
+            acc_roc_auc_table.append(
+                {
+                    "Model": model.input_id,
+                    "Metric": metric,
+                    "Value": report[metric],
+                }
+            )

-    ...
+    return {
+        "Precision, Recall, and F1 Comparison": prf_table,
+        "Accuracy and ROC AUC Comparison": acc_roc_auc_table,
+    }
```
validmind/tests/model_validation/sklearn/OverfitDiagnosis.py

```diff
@@ -2,7 +2,6 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
 from typing import List

 import matplotlib.pyplot as plt
@@ -11,17 +10,9 @@ import pandas as pd
 import seaborn as sns
 from sklearn import metrics

+from validmind import tags, tasks
 from validmind.logging import get_logger
-from validmind.vm_models import (
-    Figure,
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-    VMDataset,
-    VMModel,
-)
+from validmind.vm_models import VMDataset, VMModel

 logger = get_logger(__name__)

@@ -173,56 +164,69 @@ def _plot_overfit_regions(
     return fig


-...
-def overfit_diagnosis(  # noqa: C901
+@tags(
+    "sklearn",
+    "binary_classification",
+    "multiclass_classification",
+    "linear_regression",
+    "model_diagnosis",
+)
+@tasks("classification", "regression")
+def OverfitDiagnosis(
     model: VMModel,
     datasets: List[VMDataset],
     metric: str = None,
     cut_off_threshold: float = DEFAULT_THRESHOLD,
 ):
-    """Identify overfit regions in a model's predictions.
-
-    This test compares the model's performance on training versus test data, grouped by
-    feature columns. It calculates the difference between the training and test performance
-    for each group and identifies regions where the difference exceeds a specified threshold.
-
-    ## Test Methodology
-
-    This test works for both classification and regression models and with a variety of
-    performance metrics. By default, it uses the AUC metric for classification models and
-    the MSE metric for regression models. The threshold for identifying overfit regions
-    defaults to 0.04 but should be adjusted based on the specific use case.
-
-    ## Inputs
-    - `model` (VMModel): The ValidMind model object to evaluate.
-    - `datasets` (List[VMDataset]): A list of two VMDataset objects where the first dataset
-    is the training data and the second dataset is the test data.
-
-    ## Parameters
-    - `metric` (str, optional): The performance metric to use for evaluation. Choose from:
-    'accuracy', 'auc', 'f1', 'precision', 'recall', 'mse', 'mae', 'r2', 'mape'.
-    Defaults to 'auc' for classification models and 'mse' for regression models.
-    - `cut_off_threshold` (float, optional): The threshold for identifying overfit regions.
-    Defaults to 0.04.
     """
+    Assesses potential overfitting in a model's predictions, identifying regions where performance between training and
+    testing sets deviates significantly.
+
+    ### Purpose
+
+    The Overfit Diagnosis test aims to identify areas in a model's predictions where there is a significant difference
+    in performance between the training and testing sets. This test helps to pinpoint specific regions or feature
+    segments where the model may be overfitting.
+
+    ### Test Mechanism
+
+    This test compares the model's performance on training versus test data, grouped by feature columns. It calculates
+    the difference between the training and test performance for each group and identifies regions where this
+    difference exceeds a specified threshold:
+
+    - The test works for both classification and regression models.
+    - It defaults to using the AUC metric for classification models and the MSE metric for regression models.
+    - The threshold for identifying overfitting regions is set to 0.04 by default.
+    - The test calculates the performance metrics for each feature segment and plots regions where the performance gap
+    exceeds the threshold.
+
+    ### Signs of High Risk

+    - Significant gaps between training and test performance metrics for specific feature segments.
+    - Multiple regions with performance gaps exceeding the defined threshold.
+    - Higher than expected differences in predicted versus actual values in the test set compared to the training set.
+
+    ### Strengths
+
+    - Identifies specific areas where overfitting occurs.
+    - Supports multiple performance metrics, providing flexibility.
+    - Applicable to both classification and regression models.
+    - Visualization of overfitting segments aids in better understanding and debugging.
+
+    ### Limitations
+
+    - The default threshold may not be suitable for all use cases and requires tuning.
+    - May not capture more subtle forms of overfitting that do not exceed the threshold.
+    - Assumes that the binning of features adequately represents the data segments.
+    """
     is_classification = bool(datasets[0].probability_column(model))

-    # Set default metric if not provided
     if not metric:
         metric = (
             DEFAULT_CLASSIFICATION_METRIC
             if is_classification
             else DEFAULT_REGRESSION_METRIC
         )
-        logger.info(
-            f"Using default {'classification' if is_classification else 'regression'} metric: {metric}"
-        )
-
-    if id(cut_off_threshold) == id(DEFAULT_THRESHOLD):
-        logger.info("Using default cut-off threshold of 0.04")

     train_df = datasets[0].df
     test_df = datasets[1].df
@@ -279,18 +283,8 @@ def overfit_diagnosis( # noqa: C901
         )

         results = _prepare_results(results_train, results_test, metric)
-
-        fig = _plot_overfit_regions(results, feature_column, cut_off_threshold, metric)
         test_figures.append(
-            Figure(
-                key=f"overfit_diagnosis:{metric}:{feature_column}",
-                figure=fig,
-                metadata={
-                    "metric": metric,
-                    "cut_off_threshold": cut_off_threshold,
-                    "feature": feature_column,
-                },
-            )
+            _plot_overfit_regions(results, feature_column, cut_off_threshold, metric)
         )

         for _, row in results[results["gap"] > cut_off_threshold].iterrows():
@@ -306,91 +300,3 @@ def overfit_diagnosis( # noqa: C901
         )

     return {"Overfit Diagnosis": test_results}, *test_figures
-
-
-@dataclass
-class OverfitDiagnosis(ThresholdTest):
-    """
-    Assesses potential overfitting in a model's predictions, identifying regions where performance between training and
-    testing sets deviates significantly.
-    ...
-    """
-
-    required_inputs = ["model", "datasets"]
-    default_params = {"metric": None, "cut_off_threshold": DEFAULT_THRESHOLD}
-    tasks = ["classification", "regression"]
-    tags = [
-        "sklearn",
-        "binary_classification",
-        "multiclass_classification",
-        "linear_regression",
-        "model_diagnosis",
-    ]
-
-    def run(self):
-        func_result = overfit_diagnosis(
-            self.inputs.model,
-            self.inputs.datasets,
-            metric=self.params["metric"],
-            cut_off_threshold=self.params["cut_off_threshold"],
-        )
-
-        return self.cache_results(
-            test_results_list=[
-                ThresholdTestResult(
-                    test_name=self.params["metric"],
-                    column=row["Feature"],
-                    passed=False,
-                    values={k: v for k, v in row.items()},
-                )
-                for row in func_result[0]["Overfit Diagnosis"]
-            ],
-            passed=(not func_result[0]["Overfit Diagnosis"]),
-            figures=func_result[1:],
-        )
-
-    def summary(self, results, _):
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=[result.values for result in results],
-                    metadata=ResultTableMetadata(title="Overfit Diagnosis"),
-                )
-            ],
-        )
```