validmind 2.5.25__py3-none-any.whl → 2.6.8__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.8.dist-info/METADATA +137 -0
  179. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.25.dist-info/METADATA +0 -118
  196. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/entry_points.txt +0 -0
--- a/validmind/tests/model_validation/sklearn/ROCCurve.py
+++ b/validmind/tests/model_validation/sklearn/ROCCurve.py
@@ -2,19 +2,24 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import numpy as np
 import plotly.graph_objects as go
 from sklearn.metrics import roc_auc_score, roc_curve
 
+from validmind import tags, tasks
 from validmind.errors import SkipTestError
-from validmind.models import FoundationModel
-from validmind.vm_models import Figure, Metric
-
-
-@dataclass
-class ROCCurve(Metric):
+from validmind.vm_models import VMDataset, VMModel
+
+
+@tags(
+    "sklearn",
+    "binary_classification",
+    "multiclass_classification",
+    "model_performance",
+    "visualization",
+)
+@tasks("classification", "text_classification")
+def ROCCurve(model: VMModel, dataset: VMDataset):
     """
     Evaluates binary classification model performance by generating and plotting the Receiver Operating Characteristic
     (ROC) curve and calculating the Area Under Curve (AUC) score.
@@ -61,78 +66,39 @@ class ROCCurve(Metric):
     incorrect, provided that the model's ranking format is retained. This phenomenon is commonly termed the "Class
     Imbalance Problem".
     """
-
-    name = "roc_curve"
-    required_inputs = ["model", "dataset"]
-    tasks = ["classification", "text_classification"]
-    tags = [
-        "sklearn",
-        "binary_classification",
-        "multiclass_classification",
-        "model_performance",
-        "visualization",
-    ]
-
-    def run(self):
-        if isinstance(self.inputs.model, FoundationModel):
-            raise SkipTestError("Skipping ROCCurve for Foundation models")
-
-        y_true = self.inputs.dataset.y
-        y_prob = self.inputs.dataset.y_prob(self.inputs.model)
-
-        # ROC curve is only supported for binary classification
-        if len(np.unique(y_true)) > 2:
-            raise SkipTestError(
-                "ROC Curve is only supported for binary classification models"
-            )
-
-        y_true = y_true.astype(y_prob.dtype).flatten()
-        assert np.all((y_prob >= 0) & (y_prob <= 1)), "Invalid probabilities in y_prob."
-
-        fpr, tpr, roc_thresholds = roc_curve(y_true, y_prob, drop_intermediate=False)
-
-        # Remove Inf values from roc_thresholds
-        valid_thresholds_mask = np.isfinite(roc_thresholds)
-        roc_thresholds = roc_thresholds[valid_thresholds_mask]
-        auc = roc_auc_score(y_true, y_prob)
-
-        trace0 = go.Scatter(
-            x=fpr,
-            y=tpr,
-            mode="lines",
-            name=f"ROC curve (AUC = {auc:.2f})",
-            line=dict(color="#DE257E"),
-        )
-        trace1 = go.Scatter(
-            x=[0, 1],
-            y=[0, 1],
-            mode="lines",
-            name="Random (AUC = 0.5)",
-            line=dict(color="grey", dash="dash"),
+    if len(np.unique(dataset.y)) > 2:
+        raise SkipTestError(
+            "ROC Curve is only supported for binary classification models"
         )
 
-        layout = go.Layout(
-            title=f"ROC Curve for {self.inputs.model.input_id} on {self.inputs.dataset.input_id}",
+    y_prob = dataset.y_prob(model)
+    y_true = dataset.y.astype(y_prob.dtype).flatten()
+
+    fpr, tpr, _ = roc_curve(y_true, y_prob, drop_intermediate=False)
+    auc = roc_auc_score(y_true, y_prob)
+
+    return go.Figure(
+        data=[
+            go.Scatter(
+                x=fpr,
+                y=tpr,
+                mode="lines",
+                name=f"ROC curve (AUC = {auc:.2f})",
+                line=dict(color="#DE257E"),
+            ),
+            go.Scatter(
+                x=[0, 1],
+                y=[0, 1],
+                mode="lines",
+                name="Random (AUC = 0.5)",
+                line=dict(color="grey", dash="dash"),
+            ),
+        ],
+        layout=go.Layout(
+            title=f"ROC Curve for {model.input_id} on {dataset.input_id}",
             xaxis=dict(title="False Positive Rate"),
            yaxis=dict(title="True Positive Rate"),
             width=700,
             height=500,
-        )
-
-        fig = go.Figure(data=[trace0, trace1], layout=layout)
-
-        return self.cache_results(
-            metric_value={
-                "auc": auc,
-                "fpr": fpr,
-                "tpr": tpr,
-                "thresholds": roc_thresholds,
-            },
-            figures=[
-                Figure(
-                    for_object=self,
-                    key="roc_auc_curve",
-                    figure=fig,
-                )
-            ],
-        )
+        ),
+    )
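
A minimal standalone sketch of what the functional ROCCurve above computes, using illustrative numpy arrays in place of the VMDataset/VMModel wrappers (sample_y_true and sample_y_prob are stand-ins, not part of the library):

import numpy as np
import plotly.graph_objects as go
from sklearn.metrics import roc_auc_score, roc_curve

# Stand-ins for dataset.y and dataset.y_prob(model) -- illustrative values only
sample_y_true = np.array([0, 0, 1, 1, 0, 1, 1, 0])
sample_y_prob = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.9, 0.65, 0.3])

# Same core computation as the functional test: ROC points plus AUC
fpr, tpr, _ = roc_curve(sample_y_true, sample_y_prob, drop_intermediate=False)
auc = roc_auc_score(sample_y_true, sample_y_prob)

fig = go.Figure(
    data=[
        go.Scatter(x=fpr, y=tpr, mode="lines", name=f"ROC curve (AUC = {auc:.2f})"),
        go.Scatter(x=[0, 1], y=[0, 1], mode="lines", name="Random (AUC = 0.5)",
                   line=dict(dash="dash")),
    ],
    layout=go.Layout(
        xaxis=dict(title="False Positive Rate"),
        yaxis=dict(title="True Positive Rate"),
    ),
)
fig.show()

The decorated function simply returns the figure; in the class-based version the same figure had to be wrapped via cache_results.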
--- a/validmind/tests/model_validation/sklearn/RegressionPerformance.py
+++ b/validmind/tests/model_validation/sklearn/RegressionPerformance.py
@@ -2,52 +2,43 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-import re
-from dataclasses import dataclass
-
 import numpy as np
 from sklearn.metrics import mean_absolute_error, mean_squared_error
 
-from validmind.errors import SkipTestError
+from validmind import tags, tasks
 from validmind.logging import get_logger
-from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+from validmind.vm_models import VMDataset, VMModel
 
 logger = get_logger(__name__)
 
 
-@dataclass
-class RegressionPerformance(Metric):
+@tags("sklearn", "model_performance")
+@tasks("regression")
+def RegressionPerformance(model: VMModel, dataset: VMDataset):
     """
-    Compares and evaluates the performance of multiple regression models using five different metrics: MAE, MSE, RMSE,
-    MAPE, and MBD.
+    Evaluates the performance of a regression model using five different metrics: MAE, MSE, RMSE, MAPE, and MBD.
 
     ### Purpose
 
-    The Regression Models Performance Comparison metric is used to measure and compare the performance of regression
-    models. It calculates multiple evaluation metrics, including Mean Absolute Error (MAE), Mean Squared Error (MSE),
+    The Regression Models Performance Comparison metric is used to measure the performance of regression models. It
+    calculates multiple evaluation metrics, including Mean Absolute Error (MAE), Mean Squared Error (MSE),
     Root Mean Squared Error (RMSE), Mean Absolute Percentage Error (MAPE), and Mean Bias Deviation (MBD), thereby
     enabling a comprehensive view of model performance.
 
     ### Test Mechanism
 
-    The test starts by sourcing the true and predicted values from the models. It then computes the MAE, MSE, RMSE,
-    MAPE, and MBD. These calculations encapsulate both the direction and the magnitude of error in predictions, thereby
-    providing a multi-faceted view of model accuracy. It captures these results in a dictionary and compares the
-    performance of all models using these metrics. The results are then appended to a table for presenting a
-    comparative summary.
+    The test uses the sklearn library to calculate the MAE, MSE, RMSE, MAPE, and MBD. These calculations encapsulate both
+    the direction and the magnitude of error in predictions, thereby providing a multi-faceted view of model accuracy.
 
     ### Signs of High Risk
 
     - High values of MAE, MSE, RMSE, and MAPE, which indicate a high error rate and imply a larger departure of the
      model's predictions from the true values.
     - A large value of MBD, which shows a consistent bias in the model’s predictions.
-    - If the test returns an error citing that no models were provided for comparison, it implies a risk in the
-      evaluation process itself.
 
     ### Strengths
 
     - The metric evaluates models on five different metrics offering a comprehensive analysis of model performance.
-    - It compares multiple models simultaneously, aiding in the selection of the best-performing models.
     - It is designed to handle regression tasks and can be seamlessly integrated with libraries like sklearn.
 
     ### Limitations
@@ -55,82 +46,38 @@ class RegressionPerformance(Metric):
     - The metric only evaluates regression models and does not evaluate classification models.
     - The test assumes that the models have been trained and tested appropriately prior to evaluation. It does not
      handle pre-processing, feature selection, or other stages in the model lifecycle.
-    - It may fail to run if it doesn't receive valid models as inputs. The models are passed externally and the test
-      doesn't have an internal mechanism to verify their validity.
-    - The test could exhibit performance limitations if a large number of models is input for comparison.
     """
-
-    name = "regression_performance"
-    required_inputs = ["dataset", "model"]
-
-    tasks = ["regression"]
-    tags = [
-        "sklearn",
-        "model_performance",
-    ]
-
-    def regression_errors(self, y_true_test, y_pred_test):
-        mae_test = mean_absolute_error(y_true_test, y_pred_test)
-
-        results = {}
-        results["Mean Absolute Error (MAE)"] = mae_test
-
-        mse_test = mean_squared_error(y_true_test, y_pred_test)
-        results["Mean Squared Error (MSE)"] = mse_test
-        results["Root Mean Squared Error (RMSE)"] = np.sqrt(mse_test)
-
-        if np.any(y_true_test == 0):
-            logger.warning(
-                "y_true_test contains zero values. Skipping MAPE calculation to avoid division by zero."
-            )
-            results["Mean Absolute Percentage Error (MAPE)"] = None
-        else:
-            mape_test = np.mean(np.abs((y_true_test - y_pred_test) / y_true_test)) * 100
-            results["Mean Absolute Percentage Error (MAPE)"] = mape_test
-
-        mbd_test = np.mean(y_pred_test - y_true_test)
-        results["Mean Bias Deviation (MBD)"] = mbd_test
-
-        return results
-
-    def summary(self, metric_value: dict):
-        """
-        This summary varies depending if we're evaluating a binary or multi-class model
-        """
-        results = []
-        metrics = metric_value[self.inputs.model.input_id].keys()
-        error_table = []
-        for metric_name in metrics:
-            errors_dict = {}
-            errors_dict["Errors"] = metric_name
-            for m, _ in metric_value.items():
-                for metric in metrics:
-                    res = re.findall(r"\(.*?\)", metric)
-                    res[0][1:-1]
-                    errors_dict[f"{res[0][1:-1]}-{m}"] = metric_value[m][metric]
-            error_table.append(errors_dict)
-
-        results.append(
-            ResultTable(
-                data=error_table,
-                metadata=ResultTableMetadata(title="Regression Errors Comparison"),
-            )
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    # MAE calculation
+    metrics = {
+        "Mean Absolute Error (MAE)": mean_absolute_error(y_true, y_pred),
+    }
+
+    # MSE and RMSE calculations
+    mse = mean_squared_error(y_true, y_pred)
+    metrics["Mean Squared Error (MSE)"] = mse
+    metrics["Root Mean Squared Error (RMSE)"] = np.sqrt(mse)
+
+    # MAPE calculation
+    if np.any(y_true == 0):
+        logger.warning(
+            "y_true contains zero values. Skipping MAPE calculation to avoid division by zero."
        )
-
-        return ResultSummary(results=results)
-
-    def run(self):
-        # Check models list is not empty
-        if not self.inputs.model:
-            raise SkipTestError(
-                "Model must be provided as a `models` parameter to compare performance"
-            )
-        results = {}
-
-        result = self.regression_errors(
-            y_true_test=self.inputs.dataset.y,
-            y_pred_test=self.inputs.dataset.y_pred(self.inputs.model),
+        metrics["Mean Absolute Percentage Error (MAPE)"] = None
+    else:
+        metrics["Mean Absolute Percentage Error (MAPE)"] = (
+            np.mean(np.abs((y_true - y_pred) / y_true)) * 100
        )
-        results[self.inputs.model.input_id] = result
 
-        return self.cache_results(results)
+    # MBD calculation
+    metrics["Mean Bias Deviation (MBD)"] = np.mean(y_pred - y_true)
+
+    return [
+        {
+            "Metric": metric,
+            "Value": value,
+        }
+        for metric, value in metrics.items()
+    ]
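
A standalone sketch of the five-metric calculation the rewritten RegressionPerformance performs, operating on illustrative numpy arrays in place of dataset.y and dataset.y_pred(model):

import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Illustrative stand-ins for dataset.y and dataset.y_pred(model)
y_true = np.array([3.0, 5.0, 2.5, 7.0])
y_pred = np.array([2.8, 5.4, 2.9, 6.5])

mse = mean_squared_error(y_true, y_pred)
metrics = {
    "Mean Absolute Error (MAE)": mean_absolute_error(y_true, y_pred),
    "Mean Squared Error (MSE)": mse,
    "Root Mean Squared Error (RMSE)": np.sqrt(mse),
    # MAPE is reported as None when y_true contains zeros, matching the test above
    "Mean Absolute Percentage Error (MAPE)": (
        None
        if np.any(y_true == 0)
        else np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    ),
    "Mean Bias Deviation (MBD)": np.mean(y_pred - y_true),
}

# The test returns one row dictionary per metric; a summary table is built from these
rows = [{"Metric": name, "Value": value} for name, value in metrics.items()]
print(rows)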
--- a/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py
+++ b/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py
@@ -3,7 +3,6 @@
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
 from collections import defaultdict
-from dataclasses import dataclass
 from operator import add
 from typing import List, Tuple
 
@@ -15,16 +14,8 @@ from sklearn import metrics
 
 from validmind.errors import MissingOrInvalidModelPredictFnError
 from validmind.logging import get_logger
-from validmind.vm_models import (
-    Figure,
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-    VMDataset,
-    VMModel,
-)
+from validmind.tests import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
 
 logger = get_logger(__name__)
 
@@ -222,32 +213,59 @@ def _plot_robustness(
     return fig
 
 
-# TODO: make this a functional test instead of class-based when appropriate
-# simply have to remove the class and rename this func to OverfitDiagnosis
-def robustness_diagnosis(
-    model: VMModel,
+@tags("sklearn", "model_diagnosis", "visualization")
+@tasks("classification", "regression")
+def RobustnessDiagnosis(
     datasets: List[VMDataset],
+    model: VMModel,
     metric: str = None,
     scaling_factor_std_dev_list: List[float] = DEFAULT_STD_DEV_LIST,
     performance_decay_threshold: float = DEFAULT_DECAY_THRESHOLD,
 ):
+    """
+    Assesses the robustness of a machine learning model by evaluating performance decay under noisy conditions.
+
+    ### Purpose
+
+    The Robustness Diagnosis test aims to evaluate the resilience of a machine learning model when subjected to
+    perturbations or noise in its input data. This is essential for understanding the model's ability to handle
+    real-world scenarios where data may be imperfect or corrupted.
+
+    ### Test Mechanism
+
+    This test introduces Gaussian noise to the numeric input features of the datasets at varying scales of standard
+    deviation. The performance of the model is then measured using a specified metric. The process includes:
+
+    - Adding Gaussian noise to numerical input features based on scaling factors.
+    - Evaluating the model's performance on the perturbed data using metrics like AUC for classification tasks and MSE
+      for regression tasks.
+    - Aggregating and plotting the results to visualize performance decay relative to perturbation size.
+
+    ### Signs of High Risk
+
+    - A significant drop in performance metrics with minimal noise.
+    - Performance decay values exceeding the specified threshold.
+    - Consistent failure to meet performance standards across multiple perturbation scales.
+
+    ### Strengths
+
+    - Provides insights into the model's robustness against noisy or corrupted data.
+    - Utilizes a variety of performance metrics suitable for both classification and regression tasks.
+    - Visualization helps in understanding the extent of performance degradation.
+
+    ### Limitations
+
+    - Gaussian noise might not adequately represent all types of real-world data perturbations.
+    - Performance thresholds are somewhat arbitrary and might need tuning.
+    - The test may not account for more complex or unstructured noise patterns that could affect model robustness.
+    """
+    # TODO: use single dataset
     if not metric:
         metric = (
             DEFAULT_CLASSIFICATION_METRIC
             if datasets[0].probability_column(model)
             else DEFAULT_REGRESSION_METRIC
         )
-        logger.info(f"Using default metric ({metric.upper()}) for robustness diagnosis")
-
-    if id(scaling_factor_std_dev_list) == id(DEFAULT_STD_DEV_LIST):
-        logger.info(
-            f"Using default scaling factors for the standard deviation of the noise: {DEFAULT_STD_DEV_LIST}"
-        )
-
-    if id(performance_decay_threshold) == id(DEFAULT_DECAY_THRESHOLD):
-        logger.info(
-            f"Using default performance decay threshold of {DEFAULT_DECAY_THRESHOLD}"
-        )
 
     results = [{} for _ in range(len(datasets))]
 
@@ -304,116 +322,9 @@ def robustness_diagnosis(
         columns=datasets[0].feature_columns_numeric,
         model=model.input_id,
     )
-
     # rename perturbation size for baseline
-    results_df["Perturbation Size"][
-        results_df["Perturbation Size"] == 0.0
+    results_df.loc[
+        results_df["Perturbation Size"] == 0.0, "Perturbation Size"
     ] = "Baseline (0.0)"
 
-    return results_df, fig
-
-
-@dataclass
-class RobustnessDiagnosis(ThresholdTest):
-    """
-    Assesses the robustness of a machine learning model by evaluating performance decay under noisy conditions.
-
-    ### Purpose
-
-    The Robustness Diagnosis test aims to evaluate the resilience of a machine learning model when subjected to
-    perturbations or noise in its input data. This is essential for understanding the model's ability to handle
-    real-world scenarios where data may be imperfect or corrupted.
-
-    ### Test Mechanism
-
-    This test introduces Gaussian noise to the numeric input features of the datasets at varying scales of standard
-    deviation. The performance of the model is then measured using a specified metric. The process includes:
-
-    - Adding Gaussian noise to numerical input features based on scaling factors.
-    - Evaluating the model's performance on the perturbed data using metrics like AUC for classification tasks and MSE
-      for regression tasks.
-    - Aggregating and plotting the results to visualize performance decay relative to perturbation size.
-
-    ### Signs of High Risk
-
-    - A significant drop in performance metrics with minimal noise.
-    - Performance decay values exceeding the specified threshold.
-    - Consistent failure to meet performance standards across multiple perturbation scales.
-
-    ### Strengths
-
-    - Provides insights into the model's robustness against noisy or corrupted data.
-    - Utilizes a variety of performance metrics suitable for both classification and regression tasks.
-    - Visualization helps in understanding the extent of performance degradation.
-
-    ### Limitations
-
-    - Gaussian noise might not adequately represent all types of real-world data perturbations.
-    - Performance thresholds are somewhat arbitrary and might need tuning.
-    - The test may not account for more complex or unstructured noise patterns that could affect model robustness.
-    """
-
-    name = "robustness"
-    required_inputs = ["model", "datasets"]
-    default_params = {
-        "metric": None,
-        "scaling_factor_std_dev_list": DEFAULT_STD_DEV_LIST,
-        "performance_decay_threshold": DEFAULT_DECAY_THRESHOLD,
-    }
-    tasks = ["classification", "regression"]
-    tags = [
-        "sklearn",
-        "model_diagnosis",
-        "visualization",
-    ]
-
-    def run(self):
-        results, fig = robustness_diagnosis(
-            model=self.inputs.model,
-            datasets=self.inputs.datasets,
-            metric=self.params["metric"],
-            scaling_factor_std_dev_list=self.params["scaling_factor_std_dev_list"],
-            performance_decay_threshold=self.params["performance_decay_threshold"],
-        )
-
-        return self.cache_results(
-            passed=results["Passed"].all(),
-            test_results_list=[
-                ThresholdTestResult(
-                    test_name=self.params["metric"],
-                    passed=results["Passed"].all(),
-                    values=results.to_dict(orient="records"),
-                )
-            ],
-            figures=[
-                Figure(
-                    for_object=self,
-                    key=f"{self.name}:{self.params['metric']}",
-                    figure=fig,
-                )
-            ],
-        )
-
-    def summary(self, results: List[ThresholdTestResult], _):
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=results[0].values,
-                    metadata=ResultTableMetadata(title="Robustness Diagnosis Results"),
-                )
-            ]
-        )
-
-    def test(self):
-        """Unit Test for Robustness Diagnosis Threshold Test"""
-        # Verify the result object is present
-        assert self.result is not None
-
-        # Verify test results and their type
-        assert isinstance(self.result.test_results.results, list)
-
-        # Check for presence and validity of 'values' and 'passed' flag in each result
-        for test_result in self.result.test_results.results:
-            assert "values" in test_result.__dict__
-            assert "passed" in test_result.__dict__
-            assert isinstance(test_result.values, list)
+    return results_df, fig, all(results_df["Passed"])
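
The baseline relabelling above also moves from chained indexing to a single DataFrame.loc assignment; chained indexing is ambiguous about whether it writes to a view or a copy (pandas emits SettingWithCopyWarning), while .loc assigns unambiguously on the original frame. A self-contained sketch with an illustrative toy frame:

import pandas as pd

# Illustrative toy results frame, not the library's actual output
results_df = pd.DataFrame(
    {"Perturbation Size": [0.0, 0.1, 0.2], "Passed": [True, True, False]}
)

# Chained form (old code):
#   results_df["Perturbation Size"][results_df["Perturbation Size"] == 0.0] = ...
# may write to a temporary copy and triggers SettingWithCopyWarning.

# .loc form (new code): one indexing call, assigns on the original frame.
results_df.loc[
    results_df["Perturbation Size"] == 0.0, "Perturbation Size"
] = "Baseline (0.0)"

print(results_df)
print(all(results_df["Passed"]))  # the new third element returned by RobustnessDiagnosis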