validmind 2.5.6__py3-none-any.whl → 2.5.15__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (212)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +26 -7
  3. validmind/api_client.py +89 -43
  4. validmind/client.py +2 -2
  5. validmind/client_config.py +11 -14
  6. validmind/datasets/regression/fred_timeseries.py +67 -138
  7. validmind/template.py +1 -0
  8. validmind/test_suites/__init__.py +0 -2
  9. validmind/test_suites/statsmodels_timeseries.py +1 -1
  10. validmind/test_suites/summarization.py +0 -1
  11. validmind/test_suites/time_series.py +0 -43
  12. validmind/tests/__types__.py +3 -13
  13. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  14. validmind/tests/data_validation/ADF.py +31 -24
  15. validmind/tests/data_validation/AutoAR.py +9 -9
  16. validmind/tests/data_validation/AutoMA.py +23 -16
  17. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  18. validmind/tests/data_validation/AutoStationarity.py +21 -16
  19. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  20. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +82 -124
  21. validmind/tests/data_validation/ClassImbalance.py +15 -12
  22. validmind/tests/data_validation/DFGLSArch.py +19 -13
  23. validmind/tests/data_validation/DatasetDescription.py +17 -11
  24. validmind/tests/data_validation/DatasetSplit.py +7 -5
  25. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  26. validmind/tests/data_validation/Duplicates.py +33 -25
  27. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  28. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  29. validmind/tests/data_validation/HighCardinality.py +19 -12
  30. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  31. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  32. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  33. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  34. validmind/tests/data_validation/KPSS.py +34 -29
  35. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  36. validmind/tests/data_validation/MissingValues.py +32 -27
  37. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  38. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  39. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  40. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  41. validmind/tests/data_validation/ScatterPlot.py +63 -78
  42. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  43. validmind/tests/data_validation/Skewness.py +35 -37
  44. validmind/tests/data_validation/SpreadPlot.py +35 -35
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  47. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  48. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  49. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  50. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  51. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  52. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  53. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  54. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  55. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  56. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  57. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  58. validmind/tests/data_validation/UniqueRows.py +11 -6
  59. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  60. validmind/tests/data_validation/WOEBinTable.py +35 -30
  61. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  62. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  63. validmind/tests/data_validation/nlp/Hashtags.py +27 -20
  64. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  65. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  66. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  67. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  68. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  69. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  70. validmind/tests/data_validation/nlp/TextDescription.py +36 -35
  71. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  72. validmind/tests/decorator.py +81 -42
  73. validmind/tests/model_validation/BertScore.py +36 -27
  74. validmind/tests/model_validation/BleuScore.py +25 -19
  75. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  76. validmind/tests/model_validation/ContextualRecall.py +35 -13
  77. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  78. validmind/tests/model_validation/MeteorScore.py +46 -33
  79. validmind/tests/model_validation/ModelMetadata.py +32 -64
  80. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  81. validmind/tests/model_validation/RegardScore.py +30 -14
  82. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  83. validmind/tests/model_validation/RougeScore.py +36 -30
  84. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  85. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  86. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  87. validmind/tests/model_validation/TokenDisparity.py +31 -23
  88. validmind/tests/model_validation/ToxicityScore.py +26 -17
  89. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  90. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  91. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  92. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  93. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  94. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  95. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  96. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  97. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  98. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  99. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  100. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  101. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  102. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  103. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  104. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  105. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  106. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  107. validmind/tests/model_validation/ragas/AspectCritique.py +7 -0
  108. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  109. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  110. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  111. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  112. validmind/tests/model_validation/ragas/utils.py +6 -0
  113. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  114. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  115. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  116. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  117. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  118. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  119. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  120. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  121. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  122. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  123. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  124. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  125. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  126. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  127. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  128. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  129. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  130. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  131. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +31 -25
  132. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  133. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  134. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  135. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  136. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  137. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  138. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -93
  139. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  140. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +113 -73
  141. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -5
  142. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  143. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  144. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  145. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  146. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  147. validmind/tests/model_validation/statsmodels/BoxPierce.py +14 -10
  148. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  149. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +19 -12
  150. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  151. validmind/tests/model_validation/statsmodels/JarqueBera.py +27 -22
  152. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  153. validmind/tests/model_validation/statsmodels/LJungBox.py +32 -28
  154. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  155. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +87 -119
  156. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  157. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  158. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  159. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  160. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  161. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  162. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  163. validmind/tests/model_validation/statsmodels/RunsTest.py +32 -28
  164. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  165. validmind/tests/model_validation/statsmodels/ShapiroWilk.py +15 -8
  166. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  167. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  168. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  169. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  170. validmind/tests/prompt_validation/Bias.py +14 -11
  171. validmind/tests/prompt_validation/Clarity.py +16 -14
  172. validmind/tests/prompt_validation/Conciseness.py +7 -5
  173. validmind/tests/prompt_validation/Delimitation.py +23 -22
  174. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  175. validmind/tests/prompt_validation/Robustness.py +12 -10
  176. validmind/tests/prompt_validation/Specificity.py +13 -11
  177. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  178. validmind/tests/run.py +68 -23
  179. validmind/unit_metrics/__init__.py +81 -144
  180. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  181. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  182. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  183. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  184. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  185. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  186. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  187. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  188. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  189. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  190. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  191. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  192. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  193. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  194. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  195. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  196. validmind/vm_models/dataset/dataset.py +2 -0
  197. validmind/vm_models/figure.py +5 -0
  198. validmind/vm_models/test/result_wrapper.py +93 -132
  199. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/METADATA +1 -1
  200. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/RECORD +203 -210
  201. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  202. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  203. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  204. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  205. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  206. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  207. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  208. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  209. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  210. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/LICENSE +0 -0
  211. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/WHEEL +0 -0
  212. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/entry_points.txt +0 -0

validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py}

@@ -16,25 +16,27 @@ logger = get_logger(__name__)
 
 
 @dataclass
-class RegressionModelsPerformanceComparison(Metric):
+class RegressionPerformance(Metric):
     """
     Compares and evaluates the performance of multiple regression models using five different metrics: MAE, MSE, RMSE,
     MAPE, and MBD.
 
-    **1. Purpose:**
+    ### Purpose
+
     The Regression Models Performance Comparison metric is used to measure and compare the performance of regression
     models. It calculates multiple evaluation metrics, including Mean Absolute Error (MAE), Mean Squared Error (MSE),
     Root Mean Squared Error (RMSE), Mean Absolute Percentage Error (MAPE), and Mean Bias Deviation (MBD), thereby
     enabling a comprehensive view of model performance.
 
-    **2. Test Mechanism:**
+    ### Test Mechanism
+
     The test starts by sourcing the true and predicted values from the models. It then computes the MAE, MSE, RMSE,
     MAPE, and MBD. These calculations encapsulate both the direction and the magnitude of error in predictions, thereby
     providing a multi-faceted view of model accuracy. It captures these results in a dictionary and compares the
     performance of all models using these metrics. The results are then appended to a table for presenting a
     comparative summary.
 
-    **3. Signs of High Risk:**
+    ### Signs of High Risk
 
     - High values of MAE, MSE, RMSE, and MAPE, which indicate a high error rate and imply a larger departure of the
     model's predictions from the true values.
@@ -42,13 +44,13 @@ class RegressionModelsPerformanceComparison(Metric):
     - If the test returns an error citing that no models were provided for comparison, it implies a risk in the
     evaluation process itself.
 
-    **4. Strengths:**
+    ### Strengths
 
     - The metric evaluates models on five different metrics offering a comprehensive analysis of model performance.
     - It compares multiple models simultaneously, aiding in the selection of the best-performing models.
     - It is designed to handle regression tasks and can be seamlessly integrated with libraries like sklearn.
 
-    **5. Limitations:**
+    ### Limitations
 
     - The metric only evaluates regression models and does not evaluate classification models.
     - The test assumes that the models have been trained and tested appropriately prior to evaluation. It does not
@@ -58,8 +60,8 @@ class RegressionModelsPerformanceComparison(Metric):
     - The test could exhibit performance limitations if a large number of models is input for comparison.
     """
 
-    name = "models_performance_comparison"
-    required_inputs = ["dataset", "models"]
+    name = "regression_performance"
+    required_inputs = ["dataset", "model"]
 
     tasks = ["regression"]
     tags = [
@@ -96,7 +98,7 @@ class RegressionModelsPerformanceComparison(Metric):
         This summary varies depending if we're evaluating a binary or multi-class model
         """
         results = []
-        metrics = metric_value["model_0"].keys()
+        metrics = metric_value[self.inputs.model.input_id].keys()
         error_table = []
         for metric_name in metrics:
             errors_dict = {}
@@ -119,20 +121,16 @@ class RegressionModelsPerformanceComparison(Metric):
 
     def run(self):
        # Check models list is not empty
-        if not self.inputs.models:
+        if not self.inputs.model:
            raise SkipTestError(
-                "List of models must be provided as a `models` parameter to compare performance"
+                "Model must be provided as a `models` parameter to compare performance"
            )
-
-        all_models = self.inputs.models
-
        results = {}
 
-        for idx, model in enumerate(all_models):
-            result = self.regression_errors(
-                y_true_test=self.inputs.dataset.y,
-                y_pred_test=self.inputs.dataset.y_pred(model),
-            )
-            results["model_" + str(idx)] = result
+        result = self.regression_errors(
+            y_true_test=self.inputs.dataset.y,
+            y_pred_test=self.inputs.dataset.y_pred(self.inputs.model),
+        )
+        results[self.inputs.model.input_id] = result
 
        return self.cache_results(results)
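
For context, a minimal usage sketch of the renamed test under its new single-model signature (not part of the diff; the test ID is inferred from the file path above, and the `run_test`/`log` calls follow ValidMind's documented test-running workflow, so treat the exact names as assumptions):

    # Hypothetical call under the 2.5.15-style single-model input; assumes `vm_dataset`
    # and `vm_model` were previously registered via vm.init_dataset() / vm.init_model().
    import validmind as vm

    result = vm.tests.run_test(
        "validmind.model_validation.sklearn.RegressionPerformance",
        inputs={"dataset": vm_dataset, "model": vm_model},
    )
    result.log()  # send the result table to the ValidMind platform, if configured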

validmind/tests/model_validation/sklearn/RegressionR2Square.py

@@ -2,105 +2,67 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
+import pandas as pd
 
 from sklearn import metrics
 
 from validmind.tests.model_validation.statsmodels.statsutils import adj_r2_score
-from validmind.vm_models import Metric, ResultSummary, ResultTable
+from validmind import tags, tasks
 
 
-@dataclass
-class RegressionR2Square(Metric):
+@tags("sklearn", "model_performance")
+@tasks("regression")
+def RegressionR2Square(dataset, model):
     """
-    **Purpose**: The purpose of the RegressionR2Square Metric test is to measure the overall goodness-of-fit of a
-    regression model. Specifically, this Python-based test evaluates the R-squared (R2) and Adjusted R-squared (Adj R2)
-    scores: two statistical measures within regression analysis used to evaluate the strength of the relationship
-    between the model's predictors and the response variable.
-
-    **Test Mechanism**: The test deploys the 'r2_score' method from the Scikit-learn metrics module, measuring the R2
-    score on both training and test sets. This score reflects the proportion of the variance in the dependent variable
-    that is predictable from independent variables. The test also considers the Adjusted R2 score, accounting for the
-    number of predictors in the model, to penalize model complexity and thus reduce overfitting. The Adjusted R2 score
-    will be smaller if unnecessary predictors are included in the model.
-
-    **Signs of High Risk**: Indicators of high risk in this test may include a low R2 or Adjusted R2 score, which would
-    suggest that the model does not explain much variation in the dependent variable. The occurrence of overfitting is
-    also a high-risk sign, evident when the R2 score on the training set is significantly higher than on the test set,
-    indicating that the model is not generalizing well to unseen data.
-
-    **Strengths**: The R2 score is a widely-used measure in regression analysis, providing a sound general indication
-    of model performance. It is easy to interpret and understand, as it is essentially representing the proportion of
-    the dependent variable's variance explained by the independent variables. The Adjusted R2 score complements the R2
-    score well by taking into account the number of predictors in the model, which helps control overfitting.
-
-    **Limitations**: R2 and Adjusted R2 scores can be sensitive to the inclusion of unnecessary predictors in the model
-    (even though Adjusted R2 is intended to penalize complexity). Their reliability might also lessen in cases of
-    non-linear relationships or when the underlying assumptions of linear regression are violated. Additionally, while
-    they summarize how well the model fits the data, they do not provide insight on whether the correct regression was
-    used, or whether certain key assumptions have been fulfilled.
+    Assesses the overall goodness-of-fit of a regression model by evaluating R-squared (R2) and Adjusted R-squared (Adj
+    R2) scores to determine the model's explanatory power over the dependent variable.
+
+    ### Purpose
+
+    The purpose of the RegressionR2Square Metric test is to measure the overall goodness-of-fit of a regression model.
+    Specifically, this Python-based test evaluates the R-squared (R2) and Adjusted R-squared (Adj R2) scores, which are
+    statistical measures used to assess the strength of the relationship between the model's predictors and the
+    response variable.
+
+    ### Test Mechanism
+
+    The test deploys the `r2_score` method from the Scikit-learn metrics module to measure the R2 score on both
+    training and test sets. This score reflects the proportion of the variance in the dependent variable that is
+    predictable from the independent variables. The test also calculates the Adjusted R2 score, which accounts for the
+    number of predictors in the model to penalize model complexity and reduce overfitting. The Adjusted R2 score will
+    be smaller if unnecessary predictors are included in the model.
+
+    ### Signs of High Risk
+
+    - Low R2 or Adjusted R2 scores, suggesting that the model does not explain much variation in the dependent variable.
+    - Significant discrepancy between R2 scores on the training set and test set, indicating overfitting and poor
+    generalization to unseen data.
+
+    ### Strengths
+
+    - Widely-used measure in regression analysis, providing a sound general indication of model performance.
+    - Easy to interpret and understand, as it represents the proportion of the dependent variable's variance explained
+    by the independent variables.
+    - Adjusted R2 score helps control overfitting by penalizing unnecessary predictors.
+
+    ### Limitations
+
+    - Sensitive to the inclusion of unnecessary predictors even though Adjusted R2 penalizes complexity.
+    - Less reliable in cases of non-linear relationships or when the underlying assumptions of linear regression are
+    violated.
+    - Does not provide insight on whether the correct regression model was used or if key assumptions have been met.
     """
 
-    name = "regression_errors_r2_square"
-    required_inputs = ["model", "datasets"]
-    tasks = ["regression"]
-    tags = [
-        "sklearn",
-        "model_performance",
-    ]
-
-    def summary(self, raw_results):
-        """
-        Returns a summarized representation of the dataset split information
-        """
-        table_records = []
-        for result in raw_results:
-            for key, _ in result.items():
-                table_records.append(
-                    {
-                        "Metric": key,
-                        "TRAIN": result[key]["train"],
-                        "TEST": result[key]["test"],
-                    }
-                )
-
-        return ResultSummary(results=[ResultTable(data=table_records)])
-
-    def run(self):
-        y_train_true = self.inputs.datasets[0].y
-        y_train_pred = self.inputs.datasets[0].y_pred(self.inputs.model)
-        y_train_true = y_train_true.astype(y_train_pred.dtype)
-
-        y_test_true = self.inputs.datasets[1].y
-        y_test_pred = self.inputs.datasets[1].y_pred(self.inputs.model)
-        y_test_true = y_test_true.astype(y_test_pred.dtype)
-
-        r2s_train = metrics.r2_score(y_train_true, y_train_pred)
-        r2s_test = metrics.r2_score(y_test_true, y_test_pred)
-
-        results = []
-        results.append(
-            {
-                "R-squared (R2) Score": {
-                    "train": r2s_train,
-                    "test": r2s_test,
-                }
-            }
-        )
-
-        X_columns = self.inputs.datasets[0].feature_columns
-        adj_r2_train = adj_r2_score(
-            y_train_true, y_train_pred, len(y_train_true), len(X_columns)
-        )
-        adj_r2_test = adj_r2_score(
-            y_test_true, y_test_pred, len(y_test_true), len(X_columns)
-        )
-        results.append(
-            {
-                "Adjusted R-squared (R2) Score": {
-                    "train": adj_r2_train,
-                    "test": adj_r2_test,
-                }
-            }
-        )
-        return self.cache_results(metric_value=results)
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+    y_true = y_true.astype(y_pred.dtype)
+
+    r2s = metrics.r2_score(y_true, y_pred)
+    adj_r2 = adj_r2_score(y_true, y_pred, len(y_true), len(dataset.feature_columns))
+
+    # Create dataframe with R2 and Adjusted R2 in one row
+    results_df = pd.DataFrame(
+        {"R-squared (R2) Score": [r2s], "Adjusted R-squared (R2) Score": [adj_r2]}
+    )
+
+    return results_df
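
For reference, the two quantities the refactored test returns can be reproduced with scikit-learn and the standard adjusted-R2 formula; the sketch below uses made-up numbers and a hand-rolled adjustment in place of the package's `adj_r2_score` helper, whose exact implementation may differ:

    # Standalone sketch: R2 via sklearn, adjusted R2 via 1 - (1 - R2) * (n - 1) / (n - p - 1)
    import numpy as np
    from sklearn.metrics import r2_score

    y_true = np.array([3.1, 4.8, 6.2, 7.9, 9.1])
    y_pred = np.array([3.0, 5.0, 6.0, 8.2, 8.9])
    n, p = len(y_true), 2  # n observations, p predictors (feature columns)

    r2 = r2_score(y_true, y_pred)
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
    print(f"R-squared: {r2:.4f}, Adjusted R-squared: {adj_r2:.4f}")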

validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py

@@ -13,26 +13,45 @@ from validmind.tests.model_validation.statsmodels.statsutils import adj_r2_score
 @tasks("regression", "time_series_forecasting")
 def RegressionR2SquareComparison(datasets, models):
     """
-    Compare R-Squared and Adjusted R-Squared values for each model and generate a summary table
-    with the results.
+    Compares R-Squared and Adjusted R-Squared values for different regression models across multiple datasets to assess
+    model performance and relevance of features.
 
-    **Purpose**: The purpose of this function is to compare the R-Squared and Adjusted R-Squared values for different models applied to various datasets.
+    ### Purpose
 
-    **Test Mechanism**: The function iterates through each dataset-model pair, calculates the R-Squared and Adjusted R-Squared values, and generates a summary table with these results.
+    The Regression R2 Square Comparison test aims to compare the R-Squared and Adjusted R-Squared values for different
+    regression models across various datasets. It helps in assessing how well each model explains the variability in
+    the dataset, and whether the models include irrelevant features.
 
-    **Signs of High Risk**:
-    - If the R-Squared values are significantly low, it could indicate that the model is not explaining much of the variability in the dataset.
-    - A significant difference between R-Squared and Adjusted R-Squared values might indicate that the model includes irrelevant features.
+    ### Test Mechanism
+
+    This test operates by:
+
+    - Iterating through each dataset-model pair.
+    - Calculating the R-Squared values to measure how much of the variability in the dataset is explained by the model.
+    - Calculating the Adjusted R-Squared values, which adjust the R-Squared based on the number of predictors in the
+    model, making it more reliable when comparing models with different numbers of features.
+    - Generating a summary table containing these values for each combination of dataset and model.
+
+    ### Signs of High Risk
+
+    - If the R-Squared values are significantly low, it indicates the model isn't explaining much of the variability in
+    the dataset.
+    - A significant difference between R-Squared and Adjusted R-Squared values might indicate that the model includes
+    irrelevant features.
+
+    ### Strengths
 
-    **Strengths**:
     - Provides a quantitative measure of model performance in terms of variance explained.
-    - Adjusted R-Squared accounts for the number of predictors, making it a more reliable measure when comparing models with different numbers of features.
+    - Adjusted R-Squared accounts for the number of predictors, making it a more reliable measure when comparing models
+    with different numbers of features.
+    - Useful for time-series forecasting and regression tasks.
 
-    **Limitations**:
-    - Assumes that the dataset is provided as a DataFrameDataset object with `y`, `y_pred`, and `feature_columns` attributes.
-    - The function relies on `adj_r2_score` from the `statsmodels.statsutils` module, which should be correctly implemented and imported.
-    - Requires that `dataset.y_pred(model)` returns the predicted values for the model.
+    ### Limitations
 
+    - Assumes the dataset is provided as a DataFrameDataset object with `y`, `y_pred`, and `feature_columns` attributes.
+    - Relies on `adj_r2_score` from the `statsmodels.statsutils` module, which needs to be correctly implemented and
+    imported.
+    - Requires that `dataset.y_pred(model)` returns the predicted values for the model.
     """
     results_list = []
 
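
The comparison loop the new docstring describes amounts to scoring every dataset-model pair and collecting one row per pair into a summary table; a rough, self-contained sketch with illustrative names only (not the package's internals):

    # Each `pairs` entry stands in for one (dataset, model) combination: a label, the true
    # values, the predictions, and the number of feature columns the model used.
    import pandas as pd
    from sklearn.metrics import r2_score

    def summarize(pairs):
        rows = []
        for label, y_true, y_pred, n_features in pairs:
            r2 = r2_score(y_true, y_pred)
            n = len(y_true)
            adj_r2 = 1 - (1 - r2) * (n - 1) / (n - n_features - 1)
            rows.append({"Dataset-Model": label, "R-Squared": r2, "Adjusted R-Squared": adj_r2})
        return pd.DataFrame(rows)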

validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py

@@ -7,9 +7,9 @@ from dataclasses import dataclass
 from operator import add
 from typing import List, Tuple
 
-import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import plotly.graph_objects as go
 import seaborn as sns
 from sklearn import metrics
 
@@ -132,24 +132,28 @@ def _combine_results(results: List[dict]):
 
 
 def _plot_robustness(
-    results: pd.DataFrame, metric: str, threshold: float, columns: List[str]
+    results: pd.DataFrame, metric: str, threshold: float, columns: List[str], model: str
 ):
-    fig, ax = plt.subplots()
-
-    pallete = sns.color_palette("muted", len(results["Dataset"].unique()))
-    sns.lineplot(
-        data=results,
-        x="Perturbation Size",
-        y=metric.upper(),
-        hue="Dataset",
-        style="Dataset",
-        linewidth=3,
-        markers=True,
-        markersize=10,
-        dashes=False,
-        palette=pallete,
-        ax=ax,
-    )
+    fig = go.Figure()
+
+    datasets = results["Dataset"].unique()
+    pallete = [
+        f"#{int(r*255):02x}{int(g*255):02x}{int(b*255):02x}"
+        for r, g, b in sns.color_palette("husl", len(datasets))
+    ]
+
+    for i, dataset in enumerate(datasets):
+        dataset_results = results[results["Dataset"] == dataset]
+        fig.add_trace(
+            go.Scatter(
+                x=dataset_results["Perturbation Size"],
+                y=dataset_results[metric.upper()],
+                mode="lines+markers",
+                name=dataset,
+                line=dict(width=3, color=pallete[i]),
+                marker=dict(size=10),
+            )
+        )
 
     if PERFORMANCE_METRICS[metric]["is_lower_better"]:
         y_label = f"{metric.upper()} (lower is better)"
@@ -157,33 +161,64 @@ def _plot_robustness(
         threshold = -threshold
         y_label = f"{metric.upper()} (higher is better)"
 
-    # add dotted threshold line
-    for i in range(len(results["Dataset"].unique())):
-        baseline = results[results["Dataset"] == results["Dataset"].unique()[i]][
-            metric.upper()
-        ].iloc[0]
-        ax.axhline(
-            y=baseline + threshold,
-            color=pallete[i],
-            linestyle="dotted",
+    # add threshold lines
+    for i, dataset in enumerate(datasets):
+        baseline = results[results["Dataset"] == dataset][metric.upper()].iloc[0]
+        fig.add_trace(
+            go.Scatter(
+                x=results["Perturbation Size"].unique(),
+                y=[baseline + threshold] * len(results["Perturbation Size"].unique()),
+                mode="lines",
+                name=f"threshold_{dataset}",
+                line=dict(dash="dash", width=2, color=pallete[i]),
+                showlegend=True,
+            )
         )
 
-    ax.tick_params(axis="x")
-    ax.set_ylabel(y_label, weight="bold", fontsize=18)
-    ax.legend(fontsize=18)
-    ax.set_xlabel(
-        "Perturbation Size (X * Standard Deviation)", weight="bold", fontsize=18
-    )
-    ax.set_title(
-        f"Perturbed Features: {', '.join(columns)}",
-        weight="bold",
-        fontsize=20,
-        wrap=True,
+    columns_lines = [""]
+    for column in columns:
+        # keep adding to the last line in list until character limit (40)
+        if len(columns_lines[-1]) + len(column) < 40:
+            columns_lines[-1] += f"{column}, "
+        else:
+            columns_lines.append(f"{column}, ")
+
+    fig.update_layout(
+        title=dict(
+            text=(
+                f"Model Robustness for '{model}'<br><sup>As determined by calculating "
+                f"{metric.upper()} decay in the presence of random gaussian noise</sup>"
+            ),
+            font=dict(size=20),
+            x=0.5,
+            xanchor="center",
+        ),
+        xaxis_title=dict(
+            text="Perturbation Size (X * Standard Deviation)",
+        ),
+        yaxis_title=dict(text=y_label),
+        plot_bgcolor="white",
+        margin=dict(t=60, b=80, r=20, l=60),
+        xaxis=dict(showgrid=True, gridcolor="lightgrey"),
+        yaxis=dict(showgrid=True, gridcolor="lightgrey"),
+        annotations=[
+            go.layout.Annotation(
+                text=f"Perturbed Features:<br><sup>{'<br>'.join(columns_lines)}</sup>",
+                align="left",
+                font=dict(size=14),
+                bordercolor="lightgrey",
+                borderwidth=1,
+                borderpad=4,
+                showarrow=False,
+                x=1.025,
+                xref="paper",
+                xanchor="left",
+                y=-0.15,
+                yref="paper",
+            )
+        ],
     )
 
-    # prevent the figure from being displayed
-    plt.close("all")
-
     return fig
 
 
@@ -267,6 +302,7 @@ def robustness_diagnosis(
         metric=metric,
         threshold=performance_decay_threshold,
         columns=datasets[0].feature_columns_numeric,
+        model=model.input_id,
     )
 
     # rename perturbation size for baseline
@@ -279,38 +315,42 @@
 
 @dataclass
 class RobustnessDiagnosis(ThresholdTest):
-    """Evaluate the robustness of a machine learning model to noise
-
-    Robustness refers to a model's ability to maintain a high level of performance in
-    the face of perturbations or changes (particularly noise) added to its input data.
-    This test is designed to help gauge how well the model can handle potential real-
-    world scenarios where the input data might be incomplete or corrupted.
-
-    ## Test Methodology
-    This test is conducted by adding Gaussian noise, proportional to a particular standard
-    deviation scale, to numeric input features of the input datasets. The model's
-    performance on the perturbed data is then evaluated using a user-defined metric or the
-    default metric of AUC for classification tasks and MSE for regression tasks. The results
-    are then plotted to visualize the model's performance decay as the perturbation size
-    increases.
-
-    When using this test, it is highly recommended to tailor the performance metric, list
-    of scaling factors for the standard deviation of the noise, and the performance decay
-    threshold to the specific use case of the model being evaluated.
-
-    **Inputs**:
-    - model (VMModel): The trained model to be evaluated.
-    - datasets (List[VMDataset]): A list of datasets to evaluate the model against.
-
-    ## Parameters
-    - metric (str, optional): The performance metric to be used for evaluation. If not
-    provided, the default metric is used based on the task of the model. Default values
-    are "auc" for classification tasks and "mse" for regression tasks.
-    - scaling_factor_std_dev_list (List[float], optional): A list of scaling factors for
-    the standard deviation of the noise to be added to the input features. The default
-    values are [0.1, 0.2, 0.3, 0.4, 0.5].
-    - performance_decay_threshold (float, optional): The threshold for the performance
-    decay of the model. The default value is 0.05.
+    """
+    Assesses the robustness of a machine learning model by evaluating performance decay under noisy conditions.
+
+    ### Purpose
+
+    The Robustness Diagnosis test aims to evaluate the resilience of a machine learning model when subjected to
+    perturbations or noise in its input data. This is essential for understanding the model's ability to handle
+    real-world scenarios where data may be imperfect or corrupted.
+
+    ### Test Mechanism
+
+    This test introduces Gaussian noise to the numeric input features of the datasets at varying scales of standard
+    deviation. The performance of the model is then measured using a specified metric. The process includes:
+
+    - Adding Gaussian noise to numerical input features based on scaling factors.
+    - Evaluating the model's performance on the perturbed data using metrics like AUC for classification tasks and MSE
+    for regression tasks.
+    - Aggregating and plotting the results to visualize performance decay relative to perturbation size.
+
+    ### Signs of High Risk
+
+    - A significant drop in performance metrics with minimal noise.
+    - Performance decay values exceeding the specified threshold.
+    - Consistent failure to meet performance standards across multiple perturbation scales.
+
+    ### Strengths
+
+    - Provides insights into the model's robustness against noisy or corrupted data.
+    - Utilizes a variety of performance metrics suitable for both classification and regression tasks.
+    - Visualization helps in understanding the extent of performance degradation.
+
+    ### Limitations
+
+    - Gaussian noise might not adequately represent all types of real-world data perturbations.
+    - Performance thresholds are somewhat arbitrary and might need tuning.
+    - The test may not account for more complex or unstructured noise patterns that could affect model robustness.
     """
 
     name = "robustness"
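
The perturbation scheme described in the rewritten docstring — zero-mean Gaussian noise scaled by a multiple of each numeric feature's standard deviation, with performance re-measured at every scale — can be illustrated with a self-contained scikit-learn sketch (generic MSE here; the test's own metric defaults, threshold handling, and plotting are not reproduced):

    # Fit a toy regression model, then re-score it on progressively noisier copies of X.
    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error

    X, y = make_regression(n_samples=500, n_features=5, noise=10.0, random_state=0)
    model = LinearRegression().fit(X, y)
    baseline = mean_squared_error(y, model.predict(X))

    rng = np.random.default_rng(0)
    for scale in [0.1, 0.2, 0.3, 0.4, 0.5]:  # multiples of each column's std deviation
        X_noisy = X + rng.normal(0.0, scale * X.std(axis=0), size=X.shape)
        decay = mean_squared_error(y, model.predict(X_noisy)) - baseline
        print(f"perturbation {scale:.1f} * std -> MSE increase over baseline: {decay:.2f}")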

validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py

@@ -22,13 +22,15 @@ class SHAPGlobalImportance(Metric):
     """
     Evaluates and visualizes global feature importance using SHAP values for model explanation and risk identification.
 
-    **Purpose:**
+    ### Purpose
+
     The SHAP (SHapley Additive exPlanations) Global Importance metric aims to elucidate model outcomes by attributing
     them to the contributing features. It assigns a quantifiable global importance to each feature via their respective
     absolute Shapley values, thereby making it suitable for tasks like classification (both binary and multiclass).
     This metric forms an essential part of model risk management.
 
-    **Test Mechanism:**
+    ### Test Mechanism
+
     The exam begins with the selection of a suitable explainer which aligns with the model's type. For tree-based
     models like XGBClassifier, RandomForestClassifier, CatBoostClassifier, TreeExplainer is used whereas for linear
     models like LogisticRegression, XGBRegressor, LinearRegression, it is the LinearExplainer. Once the explainer
@@ -44,20 +46,20 @@ class SHAPGlobalImportance(Metric):
     gradually changing from low to high. Features are systematically organized in accordance with their importance.
     These plots are generated by the function `_generate_shap_plot()`.
 
-    **Signs of High Risk:**
+    ### Signs of High Risk
 
     - Overemphasis on certain features in SHAP importance plots, thus hinting at the possibility of model overfitting
     - Anomalies such as unexpected or illogical features showing high importance, which might suggest that the model's
     decisions are rooted in incorrect or undesirable reasoning
     - A SHAP summary plot filled with high variability or scattered data points, indicating a cause for concern
 
-    **Strengths:**
+    ### Strengths
 
     - SHAP does more than just illustrating global feature significance, it offers a detailed perspective on how
     different features shape the model's decision-making logic for each instance.
     - It provides clear insights into model behavior.
 
-    **Limitations:**
+    ### Limitations
 
     - High-dimensional data can convolute interpretations.
     - Associating importance with tangible real-world impact still involves a certain degree of subjectivity.
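
The explainer selection the docstring describes (TreeExplainer for tree ensembles, LinearExplainer for linear models, followed by a summary plot of mean absolute SHAP values) looks roughly like the sketch below; it assumes the `shap` package is installed and uses a toy scikit-learn dataset in place of a registered model:

    # Pick the explainer that matches the model family, compute SHAP values, and plot
    # global importance; a LinearExplainer would be used for e.g. LogisticRegression.
    import shap
    from sklearn.datasets import load_breast_cancer
    from sklearn.ensemble import RandomForestClassifier

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    model = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)
    shap.summary_plot(shap_values, X, plot_type="bar", show=False)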