validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +80 -119
  3. validmind/ai/test_result_description/config.yaml +29 -0
  4. validmind/ai/test_result_description/context.py +73 -0
  5. validmind/ai/test_result_description/image_processing.py +124 -0
  6. validmind/ai/test_result_description/system.jinja +39 -0
  7. validmind/ai/test_result_description/user.jinja +25 -0
  8. validmind/api_client.py +89 -43
  9. validmind/client.py +2 -2
  10. validmind/client_config.py +11 -14
  11. validmind/datasets/credit_risk/__init__.py +1 -0
  12. validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
  13. validmind/datasets/credit_risk/lending_club_bias.py +142 -0
  14. validmind/datasets/regression/fred_timeseries.py +67 -138
  15. validmind/template.py +1 -0
  16. validmind/test_suites/__init__.py +0 -2
  17. validmind/test_suites/statsmodels_timeseries.py +1 -1
  18. validmind/test_suites/summarization.py +0 -1
  19. validmind/test_suites/time_series.py +0 -43
  20. validmind/tests/__types__.py +14 -15
  21. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  22. validmind/tests/data_validation/ADF.py +31 -24
  23. validmind/tests/data_validation/AutoAR.py +9 -9
  24. validmind/tests/data_validation/AutoMA.py +23 -16
  25. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  26. validmind/tests/data_validation/AutoStationarity.py +21 -16
  27. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  28. validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +34 -34
  29. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +85 -124
  30. validmind/tests/data_validation/ClassImbalance.py +15 -12
  31. validmind/tests/data_validation/DFGLSArch.py +19 -13
  32. validmind/tests/data_validation/DatasetDescription.py +17 -11
  33. validmind/tests/data_validation/DatasetSplit.py +7 -5
  34. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  35. validmind/tests/data_validation/Duplicates.py +33 -25
  36. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  37. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  38. validmind/tests/data_validation/HighCardinality.py +19 -12
  39. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  40. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  41. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  42. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  43. validmind/tests/data_validation/JarqueBera.py +70 -0
  44. validmind/tests/data_validation/KPSS.py +34 -29
  45. validmind/tests/data_validation/LJungBox.py +66 -0
  46. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  47. validmind/tests/data_validation/MissingValues.py +32 -27
  48. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  49. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  50. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  51. validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
  52. validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
  53. validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
  54. validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
  55. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  56. validmind/tests/data_validation/RunsTest.py +72 -0
  57. validmind/tests/data_validation/ScatterPlot.py +63 -78
  58. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  59. validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +35 -30
  60. validmind/tests/data_validation/Skewness.py +35 -37
  61. validmind/tests/data_validation/SpreadPlot.py +35 -35
  62. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  63. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  64. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  65. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  66. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  67. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  68. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  69. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  70. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  71. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  72. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  73. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  74. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  75. validmind/tests/data_validation/UniqueRows.py +11 -6
  76. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  77. validmind/tests/data_validation/WOEBinTable.py +35 -30
  78. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  79. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  80. validmind/tests/data_validation/nlp/Hashtags.py +42 -40
  81. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  82. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  83. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  84. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  85. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  86. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  87. validmind/tests/data_validation/nlp/TextDescription.py +39 -36
  88. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  89. validmind/tests/decorator.py +81 -42
  90. validmind/tests/model_validation/BertScore.py +36 -27
  91. validmind/tests/model_validation/BleuScore.py +25 -19
  92. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  93. validmind/tests/model_validation/ContextualRecall.py +38 -13
  94. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  95. validmind/tests/model_validation/MeteorScore.py +46 -33
  96. validmind/tests/model_validation/ModelMetadata.py +32 -64
  97. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  98. validmind/tests/model_validation/RegardScore.py +30 -14
  99. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  100. validmind/tests/model_validation/RougeScore.py +36 -30
  101. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  102. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  103. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  104. validmind/tests/model_validation/TokenDisparity.py +31 -23
  105. validmind/tests/model_validation/ToxicityScore.py +26 -17
  106. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  107. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  108. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  109. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  110. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  111. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  112. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  113. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  114. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  115. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  116. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  117. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  118. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  119. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  120. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  121. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  122. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  123. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  124. validmind/tests/model_validation/ragas/AspectCritique.py +12 -6
  125. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  126. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  127. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  128. validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
  129. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  130. validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
  131. validmind/tests/model_validation/ragas/utils.py +6 -0
  132. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  133. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  134. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  135. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  136. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  137. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  138. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  139. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  140. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  141. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  142. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  143. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  144. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  145. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  146. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  147. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  148. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  149. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  150. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +32 -26
  151. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  152. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  153. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  154. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  155. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  156. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  157. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -94
  158. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  159. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
  160. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +66 -5
  161. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  162. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  163. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  164. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  165. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  166. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  167. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +59 -32
  168. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  169. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  170. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  171. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +86 -119
  172. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  173. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  174. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  175. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  176. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  177. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  178. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  179. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  180. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  181. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  182. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  183. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  184. validmind/tests/prompt_validation/Bias.py +14 -11
  185. validmind/tests/prompt_validation/Clarity.py +16 -14
  186. validmind/tests/prompt_validation/Conciseness.py +7 -5
  187. validmind/tests/prompt_validation/Delimitation.py +23 -22
  188. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  189. validmind/tests/prompt_validation/Robustness.py +12 -10
  190. validmind/tests/prompt_validation/Specificity.py +13 -11
  191. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  192. validmind/tests/run.py +68 -23
  193. validmind/unit_metrics/__init__.py +81 -144
  194. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  195. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  196. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  197. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  198. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  199. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  200. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  201. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  202. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  203. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  204. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  205. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  206. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  207. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  208. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  209. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  210. validmind/utils.py +4 -0
  211. validmind/vm_models/dataset/dataset.py +2 -0
  212. validmind/vm_models/figure.py +5 -0
  213. validmind/vm_models/test/metric.py +1 -0
  214. validmind/vm_models/test/result_wrapper.py +143 -158
  215. validmind/vm_models/test/threshold_test.py +1 -0
  216. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
  217. validmind-2.5.18.dist-info/RECORD +324 -0
  218. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  219. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  220. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  221. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  222. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  223. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  224. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  225. validmind/tests/model_validation/statsmodels/JarqueBera.py +0 -73
  226. validmind/tests/model_validation/statsmodels/LJungBox.py +0 -66
  227. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  228. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  229. validmind/tests/model_validation/statsmodels/RunsTest.py +0 -71
  230. validmind-2.5.8.dist-info/RECORD +0 -318
  231. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
  232. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
  233. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0
@@ -23,17 +23,19 @@ logger = get_logger(__name__)
 @dataclass
 class PopulationStabilityIndex(Metric):
     """
-    Evaluates the Population Stability Index (PSI) to quantify the stability of an ML model's predictions across
+    Assesses the Population Stability Index (PSI) to quantify the stability of an ML model's predictions across
     different datasets.
 
-    **Purpose:**
+    ### Purpose
+
     The Population Stability Index (PSI) serves as a quantitative assessment for evaluating the stability of a machine
     learning model's output distributions when comparing two different datasets. Typically, these would be a
     development and a validation dataset or two datasets collected at different periods. The PSI provides a measurable
     indication of any significant shift in the model's performance over time or noticeable changes in the
     characteristics of the population the model is making predictions for.
 
-    **Test Mechanism:**
+    ### Test Mechanism
+
     The implementation of the PSI in this script involves calculating the PSI for each feature between the training and
     test datasets. Data from both datasets is sorted and placed into either a predetermined number of bins or
     quantiles. The boundaries for these bins are initially determined based on the distribution of the training data.
@@ -42,14 +44,14 @@ class PopulationStabilityIndex(Metric):
     in the training and test datasets. The PSI, along with the proportions of data in each bin for both datasets, are
     displayed in a summary table, a grouped bar chart, and a scatter plot.
 
-    **Signs of High Risk:**
+    ### Signs of High Risk
 
     - A high PSI value is a clear indicator of high risk. Such a value suggests a significant shift in the model
     predictions or severe changes in the characteristics of the underlying population.
     - This ultimately suggests that the model may not be performing as well as expected and that it may be less
     reliable for making future predictions.
 
-    **Strengths:**
+    ### Strengths
 
     - The PSI provides a quantitative measure of the stability of a model over time or across different samples, making
     it an invaluable tool for evaluating changes in a model's performance.
@@ -58,7 +60,7 @@ class PopulationStabilityIndex(Metric):
     - The use of visual aids such as tables and charts further simplifies the comprehension and interpretation of the
     PSI.
 
-    **Limitations:**
+    ### Limitations
 
     - The PSI test does not account for the interdependence between features: features that are dependent on one
     another may show similar shifts in their distributions, which in turn may result in similar PSI values.
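For reference, the PSI mechanism described in the docstring above (bin boundaries derived from the training data, then a per-bin comparison of proportions) can be illustrated with a minimal standalone numpy sketch. This is not the package's implementation; the quantile binning and the bin count of 10 are arbitrary choices for illustration.

```python
import numpy as np


def psi(train_scores, test_scores, num_bins=10, eps=1e-6):
    """Population Stability Index between two 1-D samples (illustrative only)."""
    # Bin edges come from the training distribution, as the docstring describes
    edges = np.quantile(train_scores, np.linspace(0, 1, num_bins + 1))
    # Clip both samples into the training range so every value falls in a bin
    train_clipped = np.clip(train_scores, edges[0], edges[-1])
    test_clipped = np.clip(test_scores, edges[0], edges[-1])

    # Per-bin proportions; eps avoids log(0) and division by zero
    p_train = np.histogram(train_clipped, bins=edges)[0] / len(train_scores) + eps
    p_test = np.histogram(test_clipped, bins=edges)[0] / len(test_scores) + eps

    # Standard PSI formula summed over the bins
    return float(np.sum((p_test - p_train) * np.log(p_test / p_train)))


rng = np.random.default_rng(0)
print(psi(rng.normal(0, 1, 5000), rng.normal(0.3, 1.2, 5000)))  # shifted population -> larger PSI
```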
@@ -18,34 +18,41 @@ class PrecisionRecallCurve(Metric):
     """
     Evaluates the precision-recall trade-off for binary classification models and visualizes the Precision-Recall curve.
 
-    **Purpose**: The Precision Recall Curve metric is intended to evaluate the trade-off between precision and recall
-    in classification models, particularly binary classification models. It assesses the model's capacity to produce
+    ### Purpose
+
+    The Precision Recall Curve metric is intended to evaluate the trade-off between precision and recall in
+    classification models, particularly binary classification models. It assesses the model's capacity to produce
     accurate results (high precision), as well as its ability to capture a majority of all positive instances (high
     recall).
 
-    **Test Mechanism**: The test extracts ground truth labels and prediction probabilities from the model's test
-    dataset. It applies the precision_recall_curve method from the sklearn metrics module to these extracted labels and
-    predictions, which computes a precision-recall pair for each possible threshold. This calculation results in an
-    array of precision and recall scores that can be plotted against each other to form the Precision-Recall Curve.
-    This curve is then visually represented by using Plotly's scatter plot.
+    ### Test Mechanism
+
+    The test extracts ground truth labels and prediction probabilities from the model's test dataset. It applies the
+    `precision_recall_curve` method from the sklearn metrics module to these extracted labels and predictions, which
+    computes a precision-recall pair for each possible threshold. This calculation results in an array of precision and
+    recall scores that can be plotted against each other to form the Precision-Recall Curve. This curve is then
+    visually represented by using Plotly's scatter plot.
+
+    ### Signs of High Risk
 
-    **Signs of High Risk**:
-    * A lower area under the Precision-Recall Curve signifies high risk.
-    * This corresponds to a model yielding a high amount of false positives (low precision) and/or false negatives (low
+    - A lower area under the Precision-Recall Curve signifies high risk.
+    - This corresponds to a model yielding a high amount of false positives (low precision) and/or false negatives (low
     recall).
-    * If the curve is closer to the bottom left of the plot, rather than being closer to the top right corner, it can
+    - If the curve is closer to the bottom left of the plot, rather than being closer to the top right corner, it can
     be a sign of high risk.
 
-    **Strengths**:
-    * This metric aptly represents the balance between precision (minimizing false positives) and recall (minimizing
+    ### Strengths
+
+    - This metric aptly represents the balance between precision (minimizing false positives) and recall (minimizing
     false negatives), which is especially critical in scenarios where both values are significant.
-    * Through the graphic representation, it enables an intuitive understanding of the model's performance across
+    - Through the graphic representation, it enables an intuitive understanding of the model's performance across
     different threshold levels.
 
-    **Limitations**:
-    * This metric is only applicable to binary classification models - it raises errors for multiclass classification
+    ### Limitations
+
+    - This metric is only applicable to binary classification models - it raises errors for multiclass classification
     models or Foundation models.
-    * It may not fully represent the overall accuracy of the model if the cost of false positives and false negatives
+    - It may not fully represent the overall accuracy of the model if the cost of false positives and false negatives
     are extremely different, or if the dataset is heavily imbalanced.
     """
 
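A minimal, self-contained sketch of the scikit-learn call named in the Test Mechanism above, run on synthetic placeholder data (the Plotly visualization step of the actual test is omitted here):

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, precision_recall_curve

# Placeholder binary-classification data and model standing in for a ValidMind dataset/model pair
X, y = make_classification(n_samples=2000, random_state=0)
proba = LogisticRegression(max_iter=1000).fit(X, y).predict_proba(X)[:, 1]

# One precision/recall pair per candidate threshold, as the docstring describes
precision, recall, thresholds = precision_recall_curve(y, proba)
print(len(thresholds), "thresholds; average precision =", round(average_precision_score(y, proba), 3))
```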
@@ -19,7 +19,8 @@ class ROCCurve(Metric):
     Evaluates binary classification model performance by generating and plotting the Receiver Operating Characteristic
     (ROC) curve and calculating the Area Under Curve (AUC) score.
 
-    **Purpose**:
+    ### Purpose
+
     The Receiver Operating Characteristic (ROC) curve is designed to evaluate the performance of binary classification
     models. This curve illustrates the balance between the True Positive Rate (TPR) and False Positive Rate (FPR)
     across various threshold levels. In combination with the Area Under the Curve (AUC), the ROC curve aims to measure
@@ -27,28 +28,32 @@ class ROCCurve(Metric):
     default vs non-default). Ideally, a higher AUC score signifies superior model performance in accurately
     distinguishing between the positive and negative classes.
 
-    **Test Mechanism**:
+    ### Test Mechanism
+
     First, this script selects the target model and datasets that require binary classification. It then calculates the
     predicted probabilities for the test set, and uses this data, along with the true outcomes, to generate and plot
-    the ROC curve. Additionally, it concludes a line signifying randomness (AUC of 0.5). The AUC score for the model's
+    the ROC curve. Additionally, it includes a line signifying randomness (AUC of 0.5). The AUC score for the model's
     ROC curve is also computed, presenting a numerical estimation of the model's performance. If any Infinite values
     are detected in the ROC threshold, these are effectively eliminated. The resulting ROC curve, AUC score, and
     thresholds are consequently saved for future reference.
 
-    **Signs of High Risk**:
+    ### Signs of High Risk
+
     - A high risk is potentially linked to the model's performance if the AUC score drops below or nears 0.5.
     - Another warning sign would be the ROC curve lying closer to the line of randomness, indicating no discriminative
     ability.
     - For the model to be deemed competent at its classification tasks, it is crucial that the AUC score is
     significantly above 0.5.
 
-    **Strengths**:
-    - This ROC Curve offers an inclusive visual depiction of a model's discriminative power throughout all conceivable
+    ### Strengths
+
+    - The ROC Curve offers an inclusive visual depiction of a model's discriminative power throughout all conceivable
     classification thresholds, unlike other metrics that solely disclose model performance at one fixed threshold.
     - Despite the proportions of the dataset, the AUC Score, which represents the entire ROC curve as a single data
     point, continues to be consistent, proving to be the ideal choice for such situations.
 
-    **Limitations**:
+    ### Limitations
+
     - The primary limitation is that this test is exclusively structured for binary classification tasks, thus limiting
     its application towards other model types.
     - Furthermore, its performance might be subpar with models that output probabilities highly skewed towards 0 or 1.
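The equivalent standalone sketch for the ROC test, using scikit-learn's `roc_curve` and `auc` on placeholder data; the filtering of non-finite thresholds mirrors the behaviour described in the docstring (newer scikit-learn versions prepend an infinite sentinel threshold):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve

# Placeholder binary-classification data and model
X, y = make_classification(n_samples=2000, random_state=0)
proba = LogisticRegression(max_iter=1000).fit(X, y).predict_proba(X)[:, 1]

fpr, tpr, thresholds = roc_curve(y, proba)
thresholds = thresholds[np.isfinite(thresholds)]  # drop the infinite sentinel threshold, if present
print("AUC =", round(auc(fpr, tpr), 3), "| finite thresholds:", len(thresholds))
```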
@@ -2,141 +2,85 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import numpy as np
+import pandas as pd
 from sklearn import metrics
 
-from validmind.vm_models import Metric, ResultSummary, ResultTable
+from validmind import tags, tasks
 
 
-@dataclass
-class RegressionErrors(Metric):
+@tags("sklearn", "model_performance")
+@tasks("regression", "classification")
+def RegressionErrors(model, dataset):
     """
-    **Purpose**: This metric is used to measure the performance of a regression model. It gauges the model's accuracy
-    by computing several error metrics such as Mean Absolute Error (MAE), Mean Squared Error (MSE), Root Mean Squared
-    Error (RMSE), Mean Absolute Percentage Error (MAPE), and Mean Bias Deviation (MBD) on both the training and testing
-    dataset.
-
-    **Test Mechanism**: The test computes each of the aforementioned metrics. MAE calculates the average of the
-    absolute difference between the true value and the predicted value. MSE squares the difference before averaging it.
-    RMSE then takes the square root of the MSE. MAPE evaluates the average of the absolute difference between true and
-    predicted values divided by the true value, expressed as a percentage. Lastly, MBD is a measure of average bias in
-    the prediction. The results are compared between the training dataset and the testing dataset.
-
-    **Signs of High Risk**: High values for any of the metrics, or particularly different metric outcomes for the
-    training set versus the test set, are signs of high risk. Specifically, high MAE, MSE, RMSE, or MAPE values could
-    indicate poor model performance and overfitting. If MBD is significantly different from zero, it could signify that
-    the model's predictions are systematically biased.
-
-    **Strengths**: These metrics collectively provide a comprehensive view of model performance and error distribution.
-    Individually, MAE provides a linear score that could be more interpretable, while MSE gives more weight to larger
-    errors. RMSE is useful because it is in the same unit as the target variable. MAPE expresses error as a percentage,
-    making it a good measure of prediction accuracy. MBD helps to detect systematic bias in predictions.
-
-    **Limitations**: Each of these metrics has its own limitations. MAE and MSE are sensitive to outliers. While RMSE
-    is good for giving high weight to larger errors, it might too heavily penalize these errors. MAPE might be biased
-    if actual values are near zero, and MBD would not work well if the difference between predictions and actual values
-    changes with the magnitude of the actual values. Overall, these metrics will not capture all model performance
-    nuances, and they should be used with contextual understanding of the problem at hand.
+    Assesses the performance and error distribution of a regression model using various error metrics.
+
+    ### Purpose
+
+    The purpose of the Regression Errors test is to measure the performance of a regression model by calculating
+    several error metrics. This evaluation helps determine the model's accuracy and potential issues like overfitting
+    or bias by analyzing differences in error metrics between the training and testing datasets.
+
+    ### Test Mechanism
+
+    The test computes the following error metrics:
+    - **Mean Absolute Error (MAE)**: Average of the absolute differences between true values and predicted values.
+    - **Mean Squared Error (MSE)**: Average of the squared differences between true values and predicted values.
+    - **Root Mean Squared Error (RMSE)**: Square root of the mean squared error.
+    - **Mean Absolute Percentage Error (MAPE)**: Average of the absolute differences between true values and predicted
+    values, divided by the true values, and expressed as a percentage.
+    - **Mean Bias Deviation (MBD)**: Average bias between true values and predicted values.
+
+    These metrics are calculated separately for the training and testing datasets and compared to identify
+    discrepancies.
+
+    ### Signs of High Risk
+
+    - High values for MAE, MSE, RMSE, or MAPE indicating poor model performance.
+    - Large differences in error metrics between the training and testing datasets, suggesting overfitting.
+    - Significant deviation of MBD from zero, indicating systematic bias in model predictions.
+
+    ### Strengths
+
+    - Provides a comprehensive overview of model performance through multiple error metrics.
+    - Individual metrics offer specific insights, e.g., MAE for interpretability, MSE for emphasizing larger errors.
+    - RMSE is useful for being in the same unit as the target variable.
+    - MAPE allows the error to be expressed as a percentage.
+    - MBD detects systematic bias in model predictions.
+
+    ### Limitations
+
+    - MAE and MSE are sensitive to outliers.
+    - RMSE heavily penalizes larger errors, which might not always be desirable.
+    - MAPE can be misleading when actual values are near zero.
+    - MBD may not be suitable if bias varies with the magnitude of actual values.
+    - These metrics may not capture all nuances of model performance and should be interpreted with domain-specific
+    context.
     """
 
-    name = "regression_errors"
-    required_inputs = ["model", "datasets"]
-    tasks = ["regression"]
-    tags = [
-        "sklearn",
-        "model_performance",
-    ]
-
-    def summary(self, raw_results):
-        """
-        Returns a summarized representation of the dataset split information
-        """
-        table_records = []
-        for result in raw_results:
-            for key, _ in result.items():
-                table_records.append(
-                    {
-                        "Metric": key,
-                        "TRAIN": result[key]["train"],
-                        "TEST": result[key]["test"],
-                    }
-                )
-
-        return ResultSummary(results=[ResultTable(data=table_records)])
-
-    def regression_errors(
-        self, y_true_train, class_pred_train, y_true_test, class_pred_test
-    ):
-        mae_train = metrics.mean_absolute_error(y_true_train, class_pred_train)
-        mae_test = metrics.mean_absolute_error(y_true_test, class_pred_test)
-
-        results = []
-        results.append(
-            {
-                "Mean Absolute Error (MAE)": {
-                    "train": mae_train,
-                    "test": mae_test,
-                }
-            }
-        )
-
-        mse_train = metrics.mean_squared_error(y_true_train, class_pred_train)
-        mse_test = metrics.mean_squared_error(y_true_test, class_pred_test)
-        results.append(
-            {
-                "Mean Squared Error (MSE)": {
-                    "train": mse_train,
-                    "test": mse_test,
-                }
-            }
-        )
-        results.append(
-            {
-                "Root Mean Squared Error (RMSE)": {
-                    "train": np.sqrt(mse_train),
-                    "test": np.sqrt(mse_test),
-                }
-            }
-        )
-
-        mape_train = (
-            np.mean(np.abs((y_true_train - class_pred_train) / y_true_train)) * 100
-        )
-        mape_test = np.mean(np.abs((y_true_test - class_pred_test) / y_true_test)) * 100
-        results.append(
-            {
-                "Mean Absolute Percentage Error (MAPE)": {
-                    "train": mape_train,
-                    "test": mape_test,
-                }
-            }
-        )
-
-        mbd_train = np.mean(class_pred_train - y_true_train)
-        mbd_test = np.mean(class_pred_test - y_true_test)
-        results.append(
-            {
-                "Mean Bias Deviation (MBD)": {
-                    "train": mbd_train,
-                    "test": mbd_test,
-                }
-            }
-        )
-        return results
-
-    def run(self):
-        y_train_true = self.inputs.datasets[0].y
-        y_train_pred = self.inputs.datasets[0].y_pred(self.inputs.model)
-        y_train_true = y_train_true.astype(y_train_pred.dtype)
-
-        y_test_true = self.inputs.datasets[1].y
-        y_test_pred = self.inputs.datasets[1].y_pred(self.inputs.model)
-        y_test_true = y_test_true.astype(y_test_pred.dtype)
-
-        results = self.regression_errors(
-            y_train_true, y_train_pred, y_test_true, y_test_pred
-        )
-
-        return self.cache_results(metric_value=results)
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+    y_true = y_true.astype(y_pred.dtype)
+
+    return _regression_errors(y_true, y_pred)
+
+
+def _regression_errors(y_true, y_pred):
+    mae_train = metrics.mean_absolute_error(y_true, y_pred)
+    mse_train = metrics.mean_squared_error(y_true, y_pred)
+    rmse_train = np.sqrt(mse_train)
+    mape_train = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
+    mbd_train = np.mean(y_pred - y_true)
+
+    # Create dataframe with one row and each error metric as a column
+    results_df = pd.DataFrame(
+        {
+            "Mean Absolute Error (MAE)": [mae_train],
+            "Mean Squared Error (MSE)": [mse_train],
+            "Root Mean Squared Error (RMSE)": [rmse_train],
+            "Mean Absolute Percentage Error (MAPE)": [mape_train],
+            "Mean Bias Deviation (MBD)": [mbd_train],
+        }
+    )
+
+    return results_df
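With this refactor, RegressionErrors becomes a functional test that takes a single `model` and `dataset` rather than a `datasets` pair, and it returns a one-row DataFrame. The sketch below is a hedged usage example, assuming the standard validmind 2.x client workflow (`init_model`, `init_dataset`, `assign_predictions`, `tests.run_test`) and that `vm.init(...)` has already been called; the synthetic data and the `regressor`/`test_ds` input IDs are placeholders. To reproduce the old train-vs-test comparison, run the test once per dataset.

```python
import pandas as pd
import validmind as vm
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

# Synthetic stand-in data and model (assumes vm.init(...) was called beforehand)
X, y = make_regression(n_samples=500, n_features=4, noise=10, random_state=0)
df = pd.DataFrame(X, columns=[f"x{i}" for i in range(4)]).assign(target=y)
sk_model = LinearRegression().fit(X, y)

vm_model = vm.init_model(sk_model, input_id="regressor")
vm_test_ds = vm.init_dataset(df, input_id="test_ds", target_column="target")
vm_test_ds.assign_predictions(vm_model)

# Run once per dataset (e.g. train and test) to compare error metrics across splits
vm.tests.run_test(
    "validmind.model_validation.sklearn.RegressionErrors",
    inputs={"model": vm_model, "dataset": vm_test_ds},
)
```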
@@ -16,25 +16,40 @@ logger = get_logger(__name__)
 @tasks("regression", "time_series_forecasting")
 def RegressionErrorsComparison(datasets, models):
     """
-    Compare regression error metrics for each model and generate a summary table
-    with the results.
+    Assesses multiple regression error metrics to compare model performance across different datasets, emphasizing
+    systematic overestimation or underestimation and large percentage errors.
 
-    **Purpose**: The purpose of this function is to compare the regression errors for different models applied to various datasets.
+    ### Purpose
 
-    **Test Mechanism**: The function iterates through each dataset-model pair, calculates various error metrics (MAE, MSE, MAPE, MBD), and generates a summary table with these results.
+    The purpose of this test is to compare regression errors for different models applied to various datasets. It aims
+    to examine model performance using multiple error metrics, thereby identifying areas where models may be
+    underperforming or exhibiting bias.
 
-    **Signs of High Risk**:
-    - High Mean Absolute Error (MAE) or Mean Squared Error (MSE) indicates poor model performance.
-    - High Mean Absolute Percentage Error (MAPE) suggests large percentage errors, especially problematic if the true values are small.
-    - Mean Bias Deviation (MBD) significantly different from zero indicates systematic overestimation or underestimation by the model.
+    ### Test Mechanism
+
+    The function iterates through each dataset-model pair and calculates various error metrics, including Mean Absolute
+    Error (MAE), Mean Squared Error (MSE), Mean Absolute Percentage Error (MAPE), and Mean Bias Deviation (MBD). The
+    results are summarized in a table, which provides a comprehensive view of each model's performance on the datasets.
+
+    ### Signs of High Risk
+
+    - High Mean Absolute Error (MAE) or Mean Squared Error (MSE), indicating poor model performance.
+    - High Mean Absolute Percentage Error (MAPE), suggesting large percentage errors, especially problematic if the
+    true values are small.
+    - Mean Bias Deviation (MBD) significantly different from zero, indicating systematic overestimation or
+    underestimation by the model.
+
+    ### Strengths
 
-    **Strengths**:
     - Provides multiple error metrics to assess model performance from different perspectives.
     - Includes a check to avoid division by zero when calculating MAPE.
 
-    **Limitations**:
-    - Assumes that the dataset is provided as a DataFrameDataset object with `y`, `y_pred`, and `feature_columns` attributes.
-    - The function relies on the `logger` from `validmind.logging` to warn about zero values in `y_true`, which should be correctly implemented and imported.
+    ### Limitations
+
+    - Assumes that the dataset is provided as a DataFrameDataset object with `y`, `y_pred`, and `feature_columns`
+    attributes.
+    - Relies on the `logger` from `validmind.logging` to warn about zero values in `y_true`, which should be correctly
+    implemented and imported.
     - Requires that `dataset.y_pred(model)` returns the predicted values for the model.
     """
     results_list = []
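The dataset-model iteration described in this docstring can be sketched without any validmind internals; the arrays below are hypothetical stand-ins for each pair's `y` and `y_pred`, and the zero-value guard mirrors the MAPE check mentioned above.

```python
import numpy as np
import pandas as pd
from sklearn import metrics


def error_row(name, y_true, y_pred):
    """One summary-table row of the error metrics named in the docstring."""
    nonzero = y_true != 0  # avoid division by zero when computing MAPE
    return {
        "Model/Dataset": name,
        "MAE": metrics.mean_absolute_error(y_true, y_pred),
        "MSE": metrics.mean_squared_error(y_true, y_pred),
        "MAPE (%)": np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100,
        "MBD": np.mean(y_pred - y_true),
    }


rng = np.random.default_rng(0)
y_true = rng.normal(loc=5.0, size=200)
rows = [
    error_row(name, y_true, y_true + rng.normal(scale=s, size=200))
    for name, s in [("model_a / test_ds", 0.5), ("model_b / test_ds", 1.5)]
]
print(pd.DataFrame(rows))
```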
@@ -16,25 +16,27 @@ logger = get_logger(__name__)
 
 
 @dataclass
-class RegressionModelsPerformanceComparison(Metric):
+class RegressionPerformance(Metric):
     """
     Compares and evaluates the performance of multiple regression models using five different metrics: MAE, MSE, RMSE,
     MAPE, and MBD.
 
-    **1. Purpose:**
+    ### Purpose
+
     The Regression Models Performance Comparison metric is used to measure and compare the performance of regression
     models. It calculates multiple evaluation metrics, including Mean Absolute Error (MAE), Mean Squared Error (MSE),
     Root Mean Squared Error (RMSE), Mean Absolute Percentage Error (MAPE), and Mean Bias Deviation (MBD), thereby
     enabling a comprehensive view of model performance.
 
-    **2. Test Mechanism:**
+    ### Test Mechanism
+
     The test starts by sourcing the true and predicted values from the models. It then computes the MAE, MSE, RMSE,
     MAPE, and MBD. These calculations encapsulate both the direction and the magnitude of error in predictions, thereby
     providing a multi-faceted view of model accuracy. It captures these results in a dictionary and compares the
     performance of all models using these metrics. The results are then appended to a table for presenting a
     comparative summary.
 
-    **3. Signs of High Risk:**
+    ### Signs of High Risk
 
     - High values of MAE, MSE, RMSE, and MAPE, which indicate a high error rate and imply a larger departure of the
     model's predictions from the true values.
@@ -42,13 +44,13 @@ class RegressionModelsPerformanceComparison(Metric):
     - If the test returns an error citing that no models were provided for comparison, it implies a risk in the
     evaluation process itself.
 
-    **4. Strengths:**
+    ### Strengths
 
     - The metric evaluates models on five different metrics offering a comprehensive analysis of model performance.
     - It compares multiple models simultaneously, aiding in the selection of the best-performing models.
     - It is designed to handle regression tasks and can be seamlessly integrated with libraries like sklearn.
 
-    **5. Limitations:**
+    ### Limitations
 
     - The metric only evaluates regression models and does not evaluate classification models.
     - The test assumes that the models have been trained and tested appropriately prior to evaluation. It does not
@@ -58,8 +60,8 @@ class RegressionModelsPerformanceComparison(Metric):
     - The test could exhibit performance limitations if a large number of models is input for comparison.
     """
 
-    name = "models_performance_comparison"
-    required_inputs = ["dataset", "models"]
+    name = "regression_performance"
+    required_inputs = ["dataset", "model"]
 
     tasks = ["regression"]
     tags = [
@@ -96,7 +98,7 @@ class RegressionModelsPerformanceComparison(Metric):
         This summary varies depending if we're evaluating a binary or multi-class model
         """
         results = []
-        metrics = metric_value["model_0"].keys()
+        metrics = metric_value[self.inputs.model.input_id].keys()
         error_table = []
         for metric_name in metrics:
             errors_dict = {}
@@ -119,20 +121,16 @@ class RegressionModelsPerformanceComparison(Metric):
 
     def run(self):
         # Check models list is not empty
-        if not self.inputs.models:
+        if not self.inputs.model:
            raise SkipTestError(
-                "List of models must be provided as a `models` parameter to compare performance"
+                "Model must be provided as a `models` parameter to compare performance"
            )
-
-        all_models = self.inputs.models
-
        results = {}
 
-        for idx, model in enumerate(all_models):
-            result = self.regression_errors(
-                y_true_test=self.inputs.dataset.y,
-                y_pred_test=self.inputs.dataset.y_pred(model),
-            )
-            results["model_" + str(idx)] = result
+        result = self.regression_errors(
+            y_true_test=self.inputs.dataset.y,
+            y_pred_test=self.inputs.dataset.y_pred(self.inputs.model),
+        )
+        results[self.inputs.model.input_id] = result
 
        return self.cache_results(results)
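This rename also changes the calling convention: the test now takes a single `model` input, and its results are keyed by `model.input_id` rather than `model_0`. A hedged migration sketch, assuming the `run_test` API and previously initialized ValidMind objects (`vm_model_a`, `vm_model_b`, and `vm_test_ds` are hypothetical names, as in the RegressionErrors sketch above):

```python
import validmind as vm

# One run per model replaces the old multi-model comparison over a `models` list
for vm_model in (vm_model_a, vm_model_b):  # assumed, previously initialized VM models
    vm.tests.run_test(
        "validmind.model_validation.sklearn.RegressionPerformance",
        inputs={"model": vm_model, "dataset": vm_test_ds},
    )
```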