validmind 2.5.25__py3-none-any.whl → 2.6.7__py3-none-any.whl

This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.7.dist-info/METADATA +137 -0
  179. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.25.dist-info/METADATA +0 -118
  196. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0

validmind/tests/model_validation/sklearn/MinimumF1Score.py

@@ -2,24 +2,18 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- from dataclasses import dataclass
- from typing import List
-
- import pandas as pd
- from numpy import unique
- from sklearn import metrics
-
- from validmind.vm_models import (
-     ResultSummary,
-     ResultTable,
-     ResultTableMetadata,
-     ThresholdTest,
-     ThresholdTestResult,
- )
+ import numpy as np
+ from sklearn.metrics import f1_score
+
+ from validmind.tests import tags, tasks
+ from validmind.vm_models import VMDataset, VMModel


- @dataclass
- class MinimumF1Score(ThresholdTest):
+ @tags(
+     "sklearn", "binary_classification", "multiclass_classification", "model_performance"
+ )
+ @tasks("classification", "text_classification")
+ def MinimumF1Score(dataset: VMDataset, model: VMModel, min_threshold: float = 0.5):
      """
      Assesses if the model's F1 score on the validation set meets a predefined minimum threshold, ensuring balanced
      performance between precision and recall.
@@ -59,59 +53,15 @@ class MinimumF1Score(ThresholdTest):
      closely with specific requirements.
      """

-     name = "f1_score"
-     required_inputs = ["model", "dataset"]
-     default_params = {"min_threshold": 0.5}
-     tasks = ["classification", "text_classification"]
-     tags = [
-         "sklearn",
-         "binary_classification",
-         "multiclass_classification",
-         "model_performance",
-     ]
-
-     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-         """
-         The f1 score test returns results like these:
-         [{"values": {"score": 0.734375, "threshold": 0.7}, "passed": true}]
-         """
-         result = results[0]
-         results_table = [
-             {
-                 "Score": result.values["score"],
-                 "Threshold": result.values["threshold"],
-                 "Pass/Fail": "Pass" if result.passed else "Fail",
-             }
-         ]
-
-         return ResultSummary(
-             results=[
-                 ResultTable(
-                     data=pd.DataFrame(results_table),
-                     metadata=ResultTableMetadata(title="Minimum F1 Score Test"),
-                 )
-             ]
-         )
-
-     def run(self):
-         y_true = self.inputs.dataset.y
-         class_pred = self.inputs.dataset.y_pred(self.inputs.model)
-         y_true = y_true.astype(class_pred.dtype)
-
-         if len(unique(y_true)) > 2:
-             f1_score = metrics.f1_score(y_true, class_pred, average="macro")
-         else:
-             f1_score = metrics.f1_score(y_true, class_pred)
-
-         passed = f1_score > self.params["min_threshold"]
-         results = [
-             ThresholdTestResult(
-                 passed=passed,
-                 values={
-                     "score": f1_score,
-                     "threshold": self.params["min_threshold"],
-                 },
-             )
-         ]
-
-         return self.cache_results(results, passed=all([r.passed for r in results]))
+     if len(np.unique(dataset.y)) > 2:
+         score = f1_score(dataset.y, dataset.y_pred(model), average="macro")
+     else:
+         score = f1_score(dataset.y, dataset.y_pred(model))
+
+     return [
+         {
+             "Score": score,
+             "Threshold": min_threshold,
+             "Pass/Fail": "Pass" if score > min_threshold else "Fail",
+         }
+     ], score > min_threshold
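
The migration pattern above, where a class-based ThresholdTest becomes a decorated function that returns a results table plus a pass/fail boolean, is the one most hunks in this release follow. A minimal usage sketch of the refactored test, assuming the documented `run_test` helper in `validmind.tests` keeps its `inputs`/`params` keyword arguments; `vm_test_ds` and `vm_model` are placeholder names for previously initialized VMDataset/VMModel objects:

from validmind.tests import run_test

# vm_test_ds and vm_model are assumed to have been created earlier via
# vm.init_dataset(...) and vm.init_model(...) in an initialized ValidMind session.
result = run_test(
    "validmind.model_validation.sklearn.MinimumF1Score",
    inputs={"dataset": vm_test_ds, "model": vm_model},
    params={"min_threshold": 0.6},
)
result.log()  # send the result table and pass/fail status to the ValidMind platform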

validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py

@@ -2,24 +2,19 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- from dataclasses import dataclass
- from typing import List
-
  import numpy as np
- import pandas as pd
- from sklearn import metrics, preprocessing
-
- from validmind.vm_models import (
-     ResultSummary,
-     ResultTable,
-     ResultTableMetadata,
-     ThresholdTest,
-     ThresholdTestResult,
- )
+ from sklearn.metrics import roc_auc_score
+ from sklearn.preprocessing import LabelBinarizer
+
+ from validmind.tests import tags, tasks
+ from validmind.vm_models import VMDataset, VMModel


- @dataclass
- class MinimumROCAUCScore(ThresholdTest):
+ @tags(
+     "sklearn", "binary_classification", "multiclass_classification", "model_performance"
+ )
+ @tasks("classification", "text_classification")
+ def MinimumROCAUCScore(dataset: VMDataset, model: VMModel, min_threshold: float = 0.5):
      """
      Validates model by checking if the ROC AUC score meets or surpasses a specified threshold.

@@ -61,69 +56,25 @@ class MinimumROCAUCScore(ThresholdTest):
      - The use of macro average for multiclass ROC AUC score implies equal weightage to each class, which might not be
      appropriate if the classes are imbalanced.
      """
+     y_true = dataset.y
+
+     if len(np.unique(y_true)) > 2:
+         lb = LabelBinarizer()
+         lb.fit(y_true)

-     name = "roc_auc_score"
-     required_inputs = ["model", "dataset"]
-     default_params = {"min_threshold": 0.5}
-     tasks = ["classification", "text_classification"]
-     tags = [
-         "sklearn",
-         "binary_classification",
-         "multiclass_classification",
-         "model_performance",
-     ]
-
-     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-         """
-         The roc auc score test returns results like these:
-         [{"values": {"score": 0.734375, "threshold": 0.7}, "passed": true}]
-         """
-         result = results[0]
-         results_table = [
-             {
-                 "Score": result.values["score"],
-                 "Threshold": result.values["threshold"],
-                 "Pass/Fail": "Pass" if result.passed else "Fail",
-             }
-         ]
-
-         return ResultSummary(
-             results=[
-                 ResultTable(
-                     data=pd.DataFrame(results_table),
-                     metadata=ResultTableMetadata(title="Minimum ROC AUC Score Test"),
-                 )
-             ]
+         roc_auc = roc_auc_score(
+             y_true=lb.transform(y_true),
+             y_score=lb.transform(dataset.y_pred(model)),
+             average="macro",
          )

-     def multiclass_roc_auc_score(self, y_test, y_pred, average="macro"):
-         lb = preprocessing.LabelBinarizer()
-         lb.fit(y_test)
-         y_test = lb.transform(y_test)
-         y_pred = lb.transform(y_pred)
-         return metrics.roc_auc_score(y_test, y_pred, average=average)
-
-     def run(self):
-         y_true = self.inputs.dataset.y
-
-         if len(np.unique(y_true)) > 2:
-             class_pred = self.inputs.dataset.y_pred(self.inputs.model)
-             y_true = y_true.astype(class_pred.dtype)
-             roc_auc = self.multiclass_roc_auc_score(y_true, class_pred)
-         else:
-             y_prob = self.inputs.dataset.y_prob(self.inputs.model)
-             y_true = y_true.astype(y_prob.dtype).flatten()
-             roc_auc = metrics.roc_auc_score(y_true, y_prob)
-
-         passed = roc_auc > self.params["min_threshold"]
-         results = [
-             ThresholdTestResult(
-                 passed=passed,
-                 values={
-                     "score": roc_auc,
-                     "threshold": self.params["min_threshold"],
-                 },
-             )
-         ]
-
-         return self.cache_results(results, passed=all([r.passed for r in results]))
+     else:
+         roc_auc = roc_auc_score(y_true=y_true, y_score=dataset.y_prob(model))
+
+     return [
+         {
+             "Score": roc_auc,
+             "Threshold": min_threshold,
+             "Pass/Fail": "Pass" if roc_auc > min_threshold else "Fail",
+         }
+     ], roc_auc > min_threshold
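
The refactor keeps the original scoring behaviour: binary problems are scored on predicted probabilities, while multiclass problems binarize the hard class predictions and macro-average the one-vs-rest AUCs. A standalone sketch of that branching with plain scikit-learn and synthetic arrays (no ValidMind objects involved):

import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer


def minimum_roc_auc(y_true, y_pred, y_prob, min_threshold=0.5):
    # Multiclass: macro-averaged one-vs-rest AUC on binarized hard predictions;
    # binary: AUC on the positive-class probabilities.
    if len(np.unique(y_true)) > 2:
        lb = LabelBinarizer().fit(y_true)
        score = roc_auc_score(
            lb.transform(y_true), lb.transform(y_pred), average="macro"
        )
    else:
        score = roc_auc_score(y_true, y_prob)
    return score, score > min_threshold


# toy binary example
y_true = np.array([0, 1, 1, 0, 1])
y_prob = np.array([0.2, 0.8, 0.7, 0.4, 0.9])
print(minimum_roc_auc(y_true, (y_prob > 0.5).astype(int), y_prob))  # AUC 1.0 here -> passes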

validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py

@@ -2,19 +2,24 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- from dataclasses import dataclass
-
- from numpy import unique
+ import numpy as np
  from sklearn.metrics import classification_report

- from validmind.errors import SkipTestError
- from validmind.vm_models import ResultSummary, ResultTable, ResultTableMetadata
+ from validmind import tags, tasks
+ from validmind.vm_models import VMDataset, VMModel

- from .ClassifierPerformance import ClassifierPerformance, multiclass_roc_auc_score
+ from .ClassifierPerformance import multiclass_roc_auc_score


- @dataclass
- class ModelsPerformanceComparison(ClassifierPerformance):
+ @tags(
+     "sklearn",
+     "binary_classification",
+     "multiclass_classification",
+     "model_performance",
+     "model_comparison",
+ )
+ @tasks("classification", "text_classification")
+ def ModelsPerformanceComparison(dataset: VMDataset, models: list[VMModel]):
      """
      Evaluates and compares the performance of multiple Machine Learning models using various metrics like accuracy,
      precision, recall, and F1 score.
@@ -57,84 +62,49 @@ class ModelsPerformanceComparison(ClassifierPerformance):
      with unseen data or changes in the data distribution.
      - The ROC AUC score might not be as meaningful or easily interpretable for multilabel/multiclass tasks.
      """
+     y_true = dataset.y
+     classes = {str(i) for i in np.unique(y_true)}
+
+     prf_table = []
+     acc_roc_auc_table = []
+
+     for model in models:
+         y_pred = dataset.y_pred(model)

-     name = "models_performance_comparison"
-     required_inputs = ["dataset", "models"]
-     tasks = ["classification", "text_classification"]
-     tags = [
-         "sklearn",
-         "binary_classification",
-         "multiclass_classification",
-         "model_performance",
-         "model_comparison",
-     ]
-
-     def summary(self, metric_value: dict):
-         """
-         This summary varies depending if we're evaluating a binary or multi-class model
-         """
-         results = []
-         prf_table = []
-         classes = {str(i) for i in unique(self.inputs.dataset.y)}
+         report = classification_report(y_true, y_pred, output_dict=True)
+         report["roc_auc"] = multiclass_roc_auc_score(y_true, y_pred)

          for class_name in classes:
-             prf_dict = {}
-             prf_dict["Class"] = class_name
-             for m, _ in metric_value.items():
-                 prf_dict[f"Precision- {m}"] = metric_value[m][class_name]["precision"]
-                 prf_dict[f"Recall- {m}"] = metric_value[m][class_name]["recall"]
-                 prf_dict[f"F1- {m}"] = metric_value[m][class_name]["f1-score"]
-             prf_table.append(prf_dict)
-
-         avg_metrics = ["weighted avg", "macro avg"]
-         for class_name in avg_metrics:
-             avg_dict = {}
-             avg_dict["Class"] = class_name
-             for m, _ in metric_value.items():
-                 avg_dict[f"Precision- {m}"] = metric_value[m][class_name]["precision"]
-                 avg_dict[f"Recall- {m}"] = metric_value[m][class_name]["recall"]
-                 avg_dict[f"F1- {m}"] = metric_value[m][class_name]["f1-score"]
-             prf_table.append(avg_dict)
-         results.append(
-             ResultTable(
-                 data=prf_table,
-                 metadata=ResultTableMetadata(
-                     title="Precision, Recall, and F1 Comparison"
-                 ),
+             prf_table.append(
+                 {
+                     "Model": model.input_id,
+                     "Class": class_name,
+                     "Precision": report[class_name]["precision"],
+                     "Recall": report[class_name]["recall"],
+                     "F1-Score": report[class_name]["f1-score"],
+                 }
              )
-         )
-
-         acc_roc_auc_table = []
-         for metric_name in ["accuracy", "roc_auc"]:
-             acc_roc_auc_dict = {}
-             acc_roc_auc_dict["Metric"] = metric_name
-             for m, _ in metric_value.items():
-                 acc_roc_auc_dict[f"accuracy- {m}"] = metric_value[m]["accuracy"]
-                 acc_roc_auc_dict[f"roc_auc- {m}"] = metric_value[m]["roc_auc"]
-             acc_roc_auc_table.append(acc_roc_auc_dict)
-         results.append(
-             ResultTable(
-                 data=acc_roc_auc_table,
-                 metadata=ResultTableMetadata(title="Accuracy and ROC AUC Comparison"),
+         for avg_metric in ["weighted avg", "macro avg"]:
+             prf_table.append(
+                 {
+                     "Model": model.input_id,
+                     "Class": avg_metric,
+                     "Precision": report[avg_metric]["precision"],
+                     "Recall": report[avg_metric]["recall"],
+                     "F1-Score": report[avg_metric]["f1-score"],
+                 }
              )
-         )
-         return ResultSummary(results=results)
-
-     def run(self):
-         # Check models list is not empty
-         if not self.inputs.models:
-             raise SkipTestError(
-                 "List of models must be provided as a `models` parameter to compare performance"
-             )
-
-         all_models = self.inputs.models

-         results = {}
-         for idx, model in enumerate(all_models):
-             y_true = self.inputs.dataset.y
-             y_pred = self.inputs.dataset.y_pred(model)
-             report = classification_report(y_true, y_pred, output_dict=True)
-             report["roc_auc"] = multiclass_roc_auc_score(y_true, y_pred)
-             results["model_" + str(idx)] = report
+         for metric in ["accuracy", "roc_auc"]:
+             acc_roc_auc_table.append(
+                 {
+                     "Model": model.input_id,
+                     "Metric": metric,
+                     "Value": report[metric],
+                 }
+             )

-         return self.cache_results(results)
+     return {
+         "Precision, Recall, and F1 Comparison": prf_table,
+         "Accuracy and ROC AUC Comparison": acc_roc_auc_table,
+     }
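
The rewritten test builds its two comparison tables directly from `sklearn.metrics.classification_report`: one row per model and class (plus the weighted and macro averages), and one row per model and overall metric. A rough standalone sketch of that table-building loop with two generic fitted estimators on synthetic data ("model_a"/"model_b" are illustrative names, not part of the diff):

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=500, random_state=0)
models = {
    "model_a": LogisticRegression(max_iter=1000).fit(X, y),
    "model_b": DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, y),
}

prf_table = []
for name, clf in models.items():
    report = classification_report(y, clf.predict(X), output_dict=True)
    # one row per class plus the two averaging schemes, mirroring the diffed loop
    for label in [str(c) for c in sorted(set(y))] + ["weighted avg", "macro avg"]:
        prf_table.append(
            {
                "Model": name,
                "Class": label,
                "Precision": report[label]["precision"],
                "Recall": report[label]["recall"],
                "F1-Score": report[label]["f1-score"],
            }
        )

print(pd.DataFrame(prf_table))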

validmind/tests/model_validation/sklearn/OverfitDiagnosis.py

@@ -2,7 +2,6 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- from dataclasses import dataclass
  from typing import List

  import matplotlib.pyplot as plt
@@ -11,17 +10,9 @@ import pandas as pd
  import seaborn as sns
  from sklearn import metrics

+ from validmind import tags, tasks
  from validmind.logging import get_logger
- from validmind.vm_models import (
-     Figure,
-     ResultSummary,
-     ResultTable,
-     ResultTableMetadata,
-     ThresholdTest,
-     ThresholdTestResult,
-     VMDataset,
-     VMModel,
- )
+ from validmind.vm_models import VMDataset, VMModel

  logger = get_logger(__name__)

@@ -173,56 +164,69 @@ def _plot_overfit_regions(
      return fig


- # TODO: make this a functional test instead of class-based when appropriate
- # simply have to remove the class and rename this func to OverfitDiagnosis
- def overfit_diagnosis(  # noqa: C901
+ @tags(
+     "sklearn",
+     "binary_classification",
+     "multiclass_classification",
+     "linear_regression",
+     "model_diagnosis",
+ )
+ @tasks("classification", "regression")
+ def OverfitDiagnosis(
      model: VMModel,
      datasets: List[VMDataset],
      metric: str = None,
      cut_off_threshold: float = DEFAULT_THRESHOLD,
  ):
-     """Identify overfit regions in a model's predictions.
-
-     This test compares the model's performance on training versus test data, grouped by
-     feature columns. It calculates the difference between the training and test performance
-     for each group and identifies regions where the difference exceeds a specified threshold.
-
-     ## Test Methodology
-
-     This test works for both classification and regression models and with a variety of
-     performance metrics. By default, it uses the AUC metric for classification models and
-     the MSE metric for regression models. The threshold for identifying overfit regions
-     defaults to 0.04 but should be adjusted based on the specific use case.
-
-     ## Inputs
-     - `model` (VMModel): The ValidMind model object to evaluate.
-     - `datasets` (List[VMDataset]): A list of two VMDataset objects where the first dataset
-         is the training data and the second dataset is the test data.
-
-     ## Parameters
-     - `metric` (str, optional): The performance metric to use for evaluation. Choose from:
-         'accuracy', 'auc', 'f1', 'precision', 'recall', 'mse', 'mae', 'r2', 'mape'.
-         Defaults to 'auc' for classification models and 'mse' for regression models.
-     - `cut_off_threshold` (float, optional): The threshold for identifying overfit regions.
-         Defaults to 0.04.
      """
+     Assesses potential overfitting in a model's predictions, identifying regions where performance between training and
+     testing sets deviates significantly.
+
+     ### Purpose
+
+     The Overfit Diagnosis test aims to identify areas in a model's predictions where there is a significant difference
+     in performance between the training and testing sets. This test helps to pinpoint specific regions or feature
+     segments where the model may be overfitting.
+
+     ### Test Mechanism
+
+     This test compares the model's performance on training versus test data, grouped by feature columns. It calculates
+     the difference between the training and test performance for each group and identifies regions where this
+     difference exceeds a specified threshold:
+
+     - The test works for both classification and regression models.
+     - It defaults to using the AUC metric for classification models and the MSE metric for regression models.
+     - The threshold for identifying overfitting regions is set to 0.04 by default.
+     - The test calculates the performance metrics for each feature segment and plots regions where the performance gap
+     exceeds the threshold.
+
+     ### Signs of High Risk

-     # Determine if it's a classification or regression model
+     - Significant gaps between training and test performance metrics for specific feature segments.
+     - Multiple regions with performance gaps exceeding the defined threshold.
+     - Higher than expected differences in predicted versus actual values in the test set compared to the training set.
+
+     ### Strengths
+
+     - Identifies specific areas where overfitting occurs.
+     - Supports multiple performance metrics, providing flexibility.
+     - Applicable to both classification and regression models.
+     - Visualization of overfitting segments aids in better understanding and debugging.
+
+     ### Limitations
+
+     - The default threshold may not be suitable for all use cases and requires tuning.
+     - May not capture more subtle forms of overfitting that do not exceed the threshold.
+     - Assumes that the binning of features adequately represents the data segments.
+     """
      is_classification = bool(datasets[0].probability_column(model))

-     # Set default metric if not provided
      if not metric:
          metric = (
              DEFAULT_CLASSIFICATION_METRIC
              if is_classification
              else DEFAULT_REGRESSION_METRIC
          )
-         logger.info(
-             f"Using default {'classification' if is_classification else 'regression'} metric: {metric}"
-         )
-
-     if id(cut_off_threshold) == id(DEFAULT_THRESHOLD):
-         logger.info("Using default cut-off threshold of 0.04")

      train_df = datasets[0].df
      test_df = datasets[1].df
@@ -279,18 +283,8 @@ def overfit_diagnosis( # noqa: C901
          )

          results = _prepare_results(results_train, results_test, metric)
-
-         fig = _plot_overfit_regions(results, feature_column, cut_off_threshold, metric)
          test_figures.append(
-             Figure(
-                 key=f"overfit_diagnosis:{metric}:{feature_column}",
-                 figure=fig,
-                 metadata={
-                     "metric": metric,
-                     "cut_off_threshold": cut_off_threshold,
-                     "feature": feature_column,
-                 },
-             )
+             _plot_overfit_regions(results, feature_column, cut_off_threshold, metric)
          )

          for _, row in results[results["gap"] > cut_off_threshold].iterrows():
@@ -306,91 +300,3 @@ def overfit_diagnosis( # noqa: C901
          )

      return {"Overfit Diagnosis": test_results}, *test_figures
-
-
- @dataclass
- class OverfitDiagnosis(ThresholdTest):
-     """
-     Assesses potential overfitting in a model's predictions, identifying regions where performance between training and
-     testing sets deviates significantly.
-
-     ### Purpose
-
-     The Overfit Diagnosis test aims to identify areas in a model's predictions where there is a significant difference
-     in performance between the training and testing sets. This test helps to pinpoint specific regions or feature
-     segments where the model may be overfitting.
-
-     ### Test Mechanism
-
-     This test compares the model's performance on training versus test data, grouped by feature columns. It calculates
-     the difference between the training and test performance for each group and identifies regions where this
-     difference exceeds a specified threshold:
-
-     - The test works for both classification and regression models.
-     - It defaults to using the AUC metric for classification models and the MSE metric for regression models.
-     - The threshold for identifying overfitting regions is set to 0.04 by default.
-     - The test calculates the performance metrics for each feature segment and plots regions where the performance gap
-     exceeds the threshold.
-
-     ### Signs of High Risk
-
-     - Significant gaps between training and test performance metrics for specific feature segments.
-     - Multiple regions with performance gaps exceeding the defined threshold.
-     - Higher than expected differences in predicted versus actual values in the test set compared to the training set.
-
-     ### Strengths
-
-     - Identifies specific areas where overfitting occurs.
-     - Supports multiple performance metrics, providing flexibility.
-     - Applicable to both classification and regression models.
-     - Visualization of overfitting segments aids in better understanding and debugging.
-
-     ### Limitations
-
-     - The default threshold may not be suitable for all use cases and requires tuning.
-     - May not capture more subtle forms of overfitting that do not exceed the threshold.
-     - Assumes that the binning of features adequately represents the data segments.
-     """
-
-     required_inputs = ["model", "datasets"]
-     default_params = {"metric": None, "cut_off_threshold": DEFAULT_THRESHOLD}
-     tasks = ["classification", "regression"]
-     tags = [
-         "sklearn",
-         "binary_classification",
-         "multiclass_classification",
-         "linear_regression",
-         "model_diagnosis",
-     ]
-
-     def run(self):
-         func_result = overfit_diagnosis(
-             self.inputs.model,
-             self.inputs.datasets,
-             metric=self.params["metric"],
-             cut_off_threshold=self.params["cut_off_threshold"],
-         )
-
-         return self.cache_results(
-             test_results_list=[
-                 ThresholdTestResult(
-                     test_name=self.params["metric"],
-                     column=row["Feature"],
-                     passed=False,
-                     values={k: v for k, v in row.items()},
-                 )
-                 for row in func_result[0]["Overfit Diagnosis"]
-             ],
-             passed=(not func_result[0]["Overfit Diagnosis"]),
-             figures=func_result[1:],
-         )
-
-     def summary(self, results, _):
-         return ResultSummary(
-             results=[
-                 ResultTable(
-                     data=[result.values for result in results],
-                     metadata=ResultTableMetadata(title="Overfit Diagnosis"),
-                 )
-             ],
-         )
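
The core idea carried over from the removed class-based version: bin each feature, score the model separately on the train and test rows that fall in each bin, and flag bins where the train-test gap exceeds the cut-off. A simplified sketch of that per-segment gap calculation for one numeric feature, using accuracy in place of the configurable metric (the dataset, estimator, and feature index are illustrative, not from the package):

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, n_features=5, n_informative=3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

cut_off_threshold = 0.04  # same default gap threshold as the diffed test
feature = 0               # inspect one feature; the real test loops over every column
bins = np.quantile(X_train[:, feature], np.linspace(0, 1, 11))  # decile edges

rows = []
for lo, hi in zip(bins[:-1], bins[1:]):
    in_train = (X_train[:, feature] >= lo) & (X_train[:, feature] < hi)
    in_test = (X_test[:, feature] >= lo) & (X_test[:, feature] < hi)
    if in_train.sum() == 0 or in_test.sum() == 0:
        continue  # skip empty segments
    acc_train = clf.score(X_train[in_train], y_train[in_train])
    acc_test = clf.score(X_test[in_test], y_test[in_test])
    rows.append({"bin": f"[{lo:.2f}, {hi:.2f})", "training accuracy": acc_train,
                 "test accuracy": acc_test, "gap": acc_train - acc_test})

report = pd.DataFrame(rows)
print(report[report["gap"] > cut_off_threshold])  # segments flagged as potential overfit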