validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +80 -119
  3. validmind/ai/test_result_description/config.yaml +29 -0
  4. validmind/ai/test_result_description/context.py +73 -0
  5. validmind/ai/test_result_description/image_processing.py +124 -0
  6. validmind/ai/test_result_description/system.jinja +39 -0
  7. validmind/ai/test_result_description/user.jinja +25 -0
  8. validmind/api_client.py +89 -43
  9. validmind/client.py +2 -2
  10. validmind/client_config.py +11 -14
  11. validmind/datasets/credit_risk/__init__.py +1 -0
  12. validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
  13. validmind/datasets/credit_risk/lending_club_bias.py +142 -0
  14. validmind/datasets/regression/fred_timeseries.py +67 -138
  15. validmind/template.py +1 -0
  16. validmind/test_suites/__init__.py +0 -2
  17. validmind/test_suites/statsmodels_timeseries.py +1 -1
  18. validmind/test_suites/summarization.py +0 -1
  19. validmind/test_suites/time_series.py +0 -43
  20. validmind/tests/__types__.py +14 -15
  21. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  22. validmind/tests/data_validation/ADF.py +31 -24
  23. validmind/tests/data_validation/AutoAR.py +9 -9
  24. validmind/tests/data_validation/AutoMA.py +23 -16
  25. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  26. validmind/tests/data_validation/AutoStationarity.py +21 -16
  27. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  28. validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +34 -34
  29. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +85 -124
  30. validmind/tests/data_validation/ClassImbalance.py +15 -12
  31. validmind/tests/data_validation/DFGLSArch.py +19 -13
  32. validmind/tests/data_validation/DatasetDescription.py +17 -11
  33. validmind/tests/data_validation/DatasetSplit.py +7 -5
  34. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  35. validmind/tests/data_validation/Duplicates.py +33 -25
  36. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  37. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  38. validmind/tests/data_validation/HighCardinality.py +19 -12
  39. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  40. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  41. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  42. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  43. validmind/tests/data_validation/JarqueBera.py +70 -0
  44. validmind/tests/data_validation/KPSS.py +34 -29
  45. validmind/tests/data_validation/LJungBox.py +66 -0
  46. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  47. validmind/tests/data_validation/MissingValues.py +32 -27
  48. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  49. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  50. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  51. validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
  52. validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
  53. validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
  54. validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
  55. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  56. validmind/tests/data_validation/RunsTest.py +72 -0
  57. validmind/tests/data_validation/ScatterPlot.py +63 -78
  58. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  59. validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +35 -30
  60. validmind/tests/data_validation/Skewness.py +35 -37
  61. validmind/tests/data_validation/SpreadPlot.py +35 -35
  62. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  63. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  64. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  65. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  66. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  67. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  68. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  69. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  70. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  71. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  72. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  73. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  74. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  75. validmind/tests/data_validation/UniqueRows.py +11 -6
  76. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  77. validmind/tests/data_validation/WOEBinTable.py +35 -30
  78. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  79. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  80. validmind/tests/data_validation/nlp/Hashtags.py +42 -40
  81. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  82. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  83. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  84. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  85. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  86. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  87. validmind/tests/data_validation/nlp/TextDescription.py +39 -36
  88. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  89. validmind/tests/decorator.py +81 -42
  90. validmind/tests/model_validation/BertScore.py +36 -27
  91. validmind/tests/model_validation/BleuScore.py +25 -19
  92. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  93. validmind/tests/model_validation/ContextualRecall.py +38 -13
  94. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  95. validmind/tests/model_validation/MeteorScore.py +46 -33
  96. validmind/tests/model_validation/ModelMetadata.py +32 -64
  97. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  98. validmind/tests/model_validation/RegardScore.py +30 -14
  99. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  100. validmind/tests/model_validation/RougeScore.py +36 -30
  101. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  102. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  103. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  104. validmind/tests/model_validation/TokenDisparity.py +31 -23
  105. validmind/tests/model_validation/ToxicityScore.py +26 -17
  106. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  107. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  108. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  109. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  110. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  111. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  112. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  113. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  114. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  115. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  116. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  117. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  118. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  119. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  120. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  121. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  122. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  123. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  124. validmind/tests/model_validation/ragas/AspectCritique.py +12 -6
  125. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  126. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  127. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  128. validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
  129. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  130. validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
  131. validmind/tests/model_validation/ragas/utils.py +6 -0
  132. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  133. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  134. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  135. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  136. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  137. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  138. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  139. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  140. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  141. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  142. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  143. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  144. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  145. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  146. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  147. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  148. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  149. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  150. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +32 -26
  151. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  152. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  153. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  154. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  155. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  156. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  157. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -94
  158. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  159. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
  160. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +66 -5
  161. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  162. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  163. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  164. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  165. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  166. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  167. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +59 -32
  168. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  169. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  170. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  171. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +86 -119
  172. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  173. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  174. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  175. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  176. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  177. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  178. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  179. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  180. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  181. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  182. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  183. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  184. validmind/tests/prompt_validation/Bias.py +14 -11
  185. validmind/tests/prompt_validation/Clarity.py +16 -14
  186. validmind/tests/prompt_validation/Conciseness.py +7 -5
  187. validmind/tests/prompt_validation/Delimitation.py +23 -22
  188. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  189. validmind/tests/prompt_validation/Robustness.py +12 -10
  190. validmind/tests/prompt_validation/Specificity.py +13 -11
  191. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  192. validmind/tests/run.py +68 -23
  193. validmind/unit_metrics/__init__.py +81 -144
  194. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  195. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  196. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  197. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  198. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  199. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  200. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  201. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  202. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  203. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  204. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  205. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  206. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  207. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  208. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  209. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  210. validmind/utils.py +4 -0
  211. validmind/vm_models/dataset/dataset.py +2 -0
  212. validmind/vm_models/figure.py +5 -0
  213. validmind/vm_models/test/metric.py +1 -0
  214. validmind/vm_models/test/result_wrapper.py +143 -158
  215. validmind/vm_models/test/threshold_test.py +1 -0
  216. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
  217. validmind-2.5.18.dist-info/RECORD +324 -0
  218. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  219. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  220. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  221. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  222. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  223. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  224. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  225. validmind/tests/model_validation/statsmodels/JarqueBera.py +0 -73
  226. validmind/tests/model_validation/statsmodels/LJungBox.py +0 -66
  227. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  228. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  229. validmind/tests/model_validation/statsmodels/RunsTest.py +0 -71
  230. validmind-2.5.8.dist-info/RECORD +0 -318
  231. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
  232. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
  233. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0

validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py

@@ -27,21 +27,23 @@ class WeakspotsDiagnosis(ThresholdTest):
     Identifies and visualizes weak spots in a machine learning model's performance across various sections of the
     feature space.

-    **Purpose:**
+    ### Purpose
+
     The weak spots test is applied to evaluate the performance of a machine learning model within specific regions of
     its feature space. This test slices the feature space into various sections, evaluating the model's outputs within
     each section against specific performance metrics (e.g., accuracy, precision, recall, and F1 scores). The ultimate
     aim is to identify areas where the model's performance falls below the set thresholds, thereby exposing its
     possible weaknesses and limitations.

-    **Test Mechanism:**
+    ### Test Mechanism
+
     The test mechanism adopts an approach of dividing the feature space of the training dataset into numerous bins. The
     model's performance metrics (accuracy, precision, recall, F1 scores) are then computed for each bin on both the
     training and test datasets. A "weak spot" is identified if any of the performance metrics fall below a
     predetermined threshold for a particular bin on the test dataset. The test results are visually plotted as bar
     charts for each performance metric, indicating the bins which fail to meet the established threshold.

-    **Signs of High Risk:**
+    ### Signs of High Risk

     - Any performance metric of the model dropping below the set thresholds.
     - Significant disparity in performance between the training and test datasets within a bin could be an indication
@@ -49,7 +51,7 @@ class WeakspotsDiagnosis(ThresholdTest):
     - Regions or slices with consistently low performance metrics. Such instances could mean that the model struggles
     to handle specific types of input data adequately, resulting in potentially inaccurate predictions.

-    **Strengths:**
+    ### Strengths

     - The test helps pinpoint precise regions of the feature space where the model's performance is below par, allowing
     for more targeted improvements to the model.
@@ -58,7 +60,7 @@ class WeakspotsDiagnosis(ThresholdTest):
     - The test exhibits flexibility, letting users set different thresholds for various performance metrics according
     to the specific requirements of the application.

-    **Limitations:**
+    ### Limitations

     - The binning system utilized for the feature space in the test could over-simplify the model's behavior within
     each bin. The granularity of this slicing depends on the chosen 'bins' parameter and can sometimes be arbitrary.
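
The mechanism described in the WeakspotsDiagnosis docstring - bin a feature, score each bin on the test set, and flag bins whose metric falls below the threshold - can be sketched roughly as follows. This is an illustrative standalone snippet with hypothetical column names, not the package's implementation:

```python
import pandas as pd
from sklearn.metrics import accuracy_score


def find_weak_bins(df_test, feature, y_true_col, y_pred_col, bins=10, threshold=0.75):
    """Flag feature-space bins whose test-set accuracy falls below a threshold."""
    binned = pd.cut(df_test[feature], bins=bins)
    rows = []
    for interval, group in df_test.groupby(binned, observed=False):
        if group.empty:
            continue
        acc = accuracy_score(group[y_true_col], group[y_pred_col])
        rows.append({"bin": str(interval), "accuracy": acc, "weak_spot": acc < threshold})
    return pd.DataFrame(rows)


# Hypothetical usage: df_test holds one feature plus true and predicted label columns
# weak_bins = find_weak_bins(df_test, "age", "target", "prediction", bins=10, threshold=0.75)
```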

validmind/tests/model_validation/statsmodels/AutoARIMA.py

@@ -15,13 +15,16 @@ class AutoARIMA(Metric):
     """
     Evaluates ARIMA models for time-series forecasting, ranking them using Bayesian and Akaike Information Criteria.

-    **Purpose**: The AutoARIMA validation test is designed to evaluate and rank AutoRegressive Integrated Moving
-    Average (ARIMA) models. These models are primarily used for forecasting time-series data. The validation test
-    automatically fits multiple ARIMA models, with varying parameters, to every variable within the given dataset. The
-    models are then ranked based on their Bayesian Information Criterion (BIC) and Akaike Information Criterion (AIC)
-    values, which provide a basis for the efficient model selection process.
+    ### Purpose
+
+    The AutoARIMA validation test is designed to evaluate and rank AutoRegressive Integrated Moving Average (ARIMA)
+    models. These models are primarily used for forecasting time-series data. The validation test automatically fits
+    multiple ARIMA models, with varying parameters, to every variable within the given dataset. The models are then
+    ranked based on their Bayesian Information Criterion (BIC) and Akaike Information Criterion (AIC) values, which
+    provide a basis for the efficient model selection process.
+
+    ### Test Mechanism

-    **Test Mechanism**:
     This metric proceeds by generating an array of feasible combinations of ARIMA model parameters which are within a
     prescribed limit. These limits include `max_p`, `max_d`, `max_q`; they represent the autoregressive, differencing,
     and moving average components respectively. Upon applying these sets of parameters, the validation test fits each
@@ -31,28 +34,31 @@ class AutoARIMA(Metric):
     found to be non-stationary, a warning message is sent out, given that ARIMA models necessitate input series to be
     stationary.

-    **Signs of High Risk**:
-    * If the p-value of the Augmented Dickey-Fuller test for a variable exceeds 0.05, a warning is logged. This warning
+    ### Signs of High Risk
+
+    - If the p-value of the Augmented Dickey-Fuller test for a variable exceeds 0.05, a warning is logged. This warning
     indicates that the series might not be stationary, leading to potentially inaccurate results.
-    * Consistent failure in fitting ARIMA models (as made evident through logged errors) might disclose issues with
+    - Consistent failure in fitting ARIMA models (as made evident through logged errors) might disclose issues with
     either the data or model stability.

-    **Strengths**:
-    * The AutoARIMA validation test simplifies the often complex task of selecting the most suitable ARIMA model based
+    ### Strengths
+
+    - The AutoARIMA validation test simplifies the often complex task of selecting the most suitable ARIMA model based
     on BIC and AIC criteria.
-    * The mechanism incorporates a check for non-stationarity within the data, which is a critical prerequisite for
+    - The mechanism incorporates a check for non-stationarity within the data, which is a critical prerequisite for
     ARIMA models.
-    * The exhaustive search through all possible combinations of model parameters enhances the likelihood of
+    - The exhaustive search through all possible combinations of model parameters enhances the likelihood of
     identifying the best-fit model.

-    **Limitations**:
-    * This validation test can be computationally costly as it involves creating and fitting multiple ARIMA models for
+    ### Limitations
+
+    - This validation test can be computationally costly as it involves creating and fitting multiple ARIMA models for
     every variable.
-    * Although the test checks for non-stationarity and logs warnings where present, it does not apply any
+    - Although the test checks for non-stationarity and logs warnings where present, it does not apply any
     transformations to the data to establish stationarity.
-    * The selection of models leans solely on BIC and AIC criteria, which may not yield the best predictive model in
+    - The selection of models leans solely on BIC and AIC criteria, which may not yield the best predictive model in
     all scenarios.
-    * The test is only applicable to regression tasks involving time-series data, and may not work effectively for
+    - The test is only applicable to regression tasks involving time-series data, and may not work effectively for
     other types of machine learning tasks.
     """

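
A rough standalone sketch of the search the AutoARIMA docstring describes - an exhaustive (p, d, q) grid ranked by BIC and AIC, with an ADF-based stationarity warning - might look like the following. The parameter names mirror the docstring's `max_p`/`max_d`/`max_q`, but the code is illustrative only and not the package's implementation:

```python
import itertools
import warnings

import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller


def rank_arima_models(series, max_p=3, max_d=2, max_q=3):
    """Fit ARIMA(p, d, q) over a small grid and rank the fits by BIC, then AIC."""
    # Warn if the Augmented Dickey-Fuller test suggests the series is non-stationary
    if adfuller(series.dropna())[1] > 0.05:
        warnings.warn("Series may be non-stationary (ADF p-value > 0.05).")

    rows = []
    for p, d, q in itertools.product(range(max_p + 1), range(max_d + 1), range(max_q + 1)):
        try:
            fit = ARIMA(series, order=(p, d, q)).fit()
            rows.append({"order": (p, d, q), "BIC": fit.bic, "AIC": fit.aic})
        except Exception:
            continue  # some parameter combinations fail to converge
    return pd.DataFrame(rows).sort_values(["BIC", "AIC"]).reset_index(drop=True)
```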

validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py

@@ -2,138 +2,107 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
-
 import numpy as np
 import plotly.graph_objects as go
 from matplotlib import cm

-from validmind.vm_models import Figure, Metric
+from validmind import tags, tasks


-@dataclass
-class CumulativePredictionProbabilities(Metric):
+@tags("visualization", "credit_risk", "logistic_regression")
+@tasks("classification")
+def CumulativePredictionProbabilities(dataset, model, title="Cumulative Probabilities"):
     """
     Visualizes cumulative probabilities of positive and negative classes for both training and testing in logistic
     regression models.

-    **Purpose**: This metric is utilized to evaluate the distribution of predicted probabilities for positive and
-    negative classes in a logistic regression model. It's not solely intended to measure the model's performance but
-    also provides a visual assessment of the model's behavior by plotting the cumulative probabilities for positive and
-    negative classes across both the training and test datasets.
+    ### Purpose
+
+    This metric is utilized to evaluate the distribution of predicted probabilities for positive and negative classes
+    in a logistic regression model. It provides a visual assessment of the model's behavior by plotting the cumulative
+    probabilities for positive and negative classes across both the training and test datasets.
+
+    ### Test Mechanism
+
+    The logistic regression model is evaluated by first computing the predicted probabilities for each instance in both
+    the training and test datasets, which are then added as a new column in these sets. The cumulative probabilities
+    for positive and negative classes are subsequently calculated and sorted in ascending order. Cumulative
+    distributions of these probabilities are created for both positive and negative classes across both training and
+    test datasets. These cumulative probabilities are represented visually in a plot, containing two subplots - one for
+    the training data and the other for the test data, with lines representing cumulative distributions of positive and
+    negative classes.

-    **Test Mechanism**: The logistic regression model is evaluated by first computing the predicted probabilities for
-    each instance in both the training and test datasets, which are then added as a new column in these sets. The
-    cumulative probabilities for positive and negative classes are subsequently calculated and sorted in ascending
-    order. Cumulative distributions of these probabilities are created for both positive and negative classes across
-    both training and test datasets. These cumulative probabilities are represented visually in a plot, containing two
-    subplots - one for the training data and the other for the test data, with lines representing cumulative
-    distributions of positive and negative classes.
+    ### Signs of High Risk

-    **Signs of High Risk**:
     - Imbalanced distribution of probabilities for either positive or negative classes.
     - Notable discrepancies or significant differences between the cumulative probability distributions for the
     training data versus the test data.
     - Marked discrepancies or large differences between the cumulative probability distributions for positive and
     negative classes.

-    **Strengths**:
-    - It offers not only numerical probabilities but also provides a visual illustration of data, which enhances the
-    ease of understanding and interpreting the model's behavior.
+    ### Strengths
+
+    - Provides a visual illustration of data, which enhances the ease of understanding and interpreting the model's
+    behavior.
     - Allows for the comparison of model's behavior across training and testing datasets, providing insights about how
     well the model is generalized.
-    - It differentiates between positive and negative classes and their respective distribution patterns, which can aid
-    in problem diagnosis.
+    - Differentiates between positive and negative classes and their respective distribution patterns, aiding in
+    problem diagnosis.
+
+    ### Limitations

-    **Limitations**:
     - Exclusive to classification tasks and specifically to logistic regression models.
     - Graphical results necessitate human interpretation and may not be directly applicable for automated risk
     detection.
-    - The method does not give a solitary quantifiable measure of model risk, rather it offers a visual representation
-    and broad distributional information.
+    - The method does not give a solitary quantifiable measure of model risk, instead, it offers a visual
+    representation and broad distributional information.
     - If the training and test datasets are not representative of the overall data distribution, the metric could
     provide misleading results.
     """

-    name = "cumulative_prediction_probabilities"
-    required_inputs = ["model", "datasets"]
-    tasks = ["classification"]
-    tags = ["logistic_regression", "visualization"]
-
-    default_params = {"title": "Cumulative Probabilities"}
-
-    @staticmethod
-    def plot_cumulative_prob(dataframes, dataset_titles, target_col, title):
-        figures = []
-
-        # Generate a colormap and convert to Plotly-accepted color format
-        # Adjust 'viridis' to any other matplotlib colormap if desired
-        colormap = cm.get_cmap("viridis")
-
-        for _, (df, dataset_title) in enumerate(zip(dataframes, dataset_titles)):
-            fig = go.Figure()
-
-            # Get unique classes and assign colors
-            classes = sorted(df[target_col].unique())
-            colors = [
-                colormap(i / len(classes))[:3] for i in range(len(classes))
-            ]  # RGB
-            color_dict = {
-                cls: f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})"
-                for cls, rgb in zip(classes, colors)
-            }
-            for class_value in sorted(df[target_col].unique()):
-                # Calculate cumulative distribution for the current class
-                sorted_probs = np.sort(
-                    df[df[target_col] == class_value]["probabilities"]
-                )
-                cumulative_probs = np.cumsum(sorted_probs) / np.sum(sorted_probs)
-
-                fig.add_trace(
-                    go.Scatter(
-                        x=sorted_probs,
-                        y=cumulative_probs,
-                        mode="lines",
-                        name=f"{dataset_title} {target_col} = {class_value}",
-                        line=dict(
-                            color=color_dict[class_value],
-                        ),
-                    )
-                )
-            fig.update_layout(
-                title_text=f"{title} - {dataset_title}",
-                xaxis_title="Probability",
-                yaxis_title="Cumulative Distribution",
-                legend_title=target_col,
-            )
-            figures.append(fig)
-        return figures
-
-    def run(self):
-        dataset_titles = [dataset.input_id for dataset in self.inputs.datasets]
-        target_column = self.inputs.datasets[0].target_column
-        title = self.params.get("title", self.default_params["title"])
-
-        dataframes = []
-        metric_value = {"cum_prob": {}}
-        for dataset in self.inputs.datasets:
-            df = dataset.df.copy()
-            y_prob = dataset.y_prob(self.inputs.model)
-            df["probabilities"] = y_prob
-            dataframes.append(df)
-            metric_value["cum_prob"][dataset.input_id] = list(df["probabilities"])
-
-        figures = self.plot_cumulative_prob(
-            dataframes, dataset_titles, target_column, title
-        )
+    df = dataset.df
+    df["probabilities"] = dataset.y_prob(model)

-        figures_list = [
-            Figure(
-                for_object=self,
-                key=f"cumulative_prob_{title.replace(' ', '_')}_{i+1}",
-                figure=fig,
+    fig = _plot_cumulative_prob(df, dataset.target_column, title)
+
+    return fig
+
+
+def _plot_cumulative_prob(df, target_col, title):
+
+    # Generate a colormap and convert to Plotly-accepted color format
+    # Adjust 'viridis' to any other matplotlib colormap if desired
+    colormap = cm.get_cmap("viridis")
+
+    fig = go.Figure()
+
+    # Get unique classes and assign colors
+    classes = sorted(df[target_col].unique())
+    colors = [colormap(i / len(classes))[:3] for i in range(len(classes))]  # RGB
+    color_dict = {
+        cls: f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})"
+        for cls, rgb in zip(classes, colors)
+    }
+    for class_value in sorted(df[target_col].unique()):
+        # Calculate cumulative distribution for the current class
+        sorted_probs = np.sort(df[df[target_col] == class_value]["probabilities"])
+        cumulative_probs = np.cumsum(sorted_probs) / np.sum(sorted_probs)
+
+        fig.add_trace(
+            go.Scatter(
+                x=sorted_probs,
+                y=cumulative_probs,
+                mode="lines",
+                name=f"{target_col} = {class_value}",
+                line=dict(
+                    color=color_dict[class_value],
+                ),
             )
-            for i, fig in enumerate(figures)
-        ]
+        )
+    fig.update_layout(
+        title_text=f"{title}",
+        xaxis_title="Probability",
+        yaxis_title="Cumulative Distribution",
+    )

-        return self.cache_results(metric_value=metric_value, figures=figures_list)
+    return fig
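
The cumulative curves drawn by the refactored function come down to sorting each class's predicted probabilities and normalizing their running sum. A minimal numpy sketch of that calculation on synthetic data (not the package's code):

```python
import numpy as np
import pandas as pd

# Synthetic predicted probabilities for a binary target
rng = np.random.default_rng(0)
df = pd.DataFrame({
    "target": rng.integers(0, 2, size=1000),
    "probabilities": rng.uniform(0, 1, size=1000),
})

for class_value in sorted(df["target"].unique()):
    sorted_probs = np.sort(df.loc[df["target"] == class_value, "probabilities"])
    cumulative = np.cumsum(sorted_probs) / np.sum(sorted_probs)
    # (sorted_probs, cumulative) is one line of the cumulative-probability plot
    print(class_value, cumulative[-1])  # the normalized running sum always ends at 1.0
```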

validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py

@@ -2,58 +2,85 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
-
+import pandas as pd
 from statsmodels.stats.stattools import durbin_watson

-from validmind.vm_models import Metric
+from validmind import tags, tasks


-@dataclass
-class DurbinWatsonTest(Metric):
+@tasks("regression")
+@tags("time_series_data", "forecasting", "statistical_test", "statsmodels")
+def DurbinWatsonTest(dataset, model, threshold=[1.5, 2.5]):
     """
     Assesses autocorrelation in time series data features using the Durbin-Watson statistic.

-    **Purpose**: The Durbin-Watson Test metric detects autocorrelation in time series data (where a set of data values
-    influences their predecessors). Autocorrelation is a crucial factor for regression tasks as these often assume the
+    ### Purpose
+
+    The Durbin-Watson Test metric detects autocorrelation in time series data (where a set of data values influences
+    their predecessors). Autocorrelation is a crucial factor for regression tasks as these often assume the
     independence of residuals. A model with significant autocorrelation may give unreliable predictions.

-    **Test Mechanism**: Utilizing the `durbin_watson` function in the `statsmodels` Python library, the Durbin-Watson
-    (DW) Test metric generates a statistical value for each feature of the training dataset. The function is looped
-    over all columns of the dataset, calculating and caching the DW value for each column for further analysis. A DW
-    metric value nearing 2 indicates no autocorrelation. Conversely, values approaching 0 suggest positive
-    autocorrelation, and those leaning towards 4 imply negative autocorrelation.
+    ### Test Mechanism
+
+    Utilizing the `durbin_watson` function in the `statsmodels` Python library, the Durbin-Watson (DW) Test metric
+    generates a statistical value for each feature of the training dataset. The function is looped over all columns of
+    the dataset, calculating and caching the DW value for each column for further analysis. A DW metric value nearing 2
+    indicates no autocorrelation. Conversely, values approaching 0 suggest positive autocorrelation, and those leaning
+    towards 4 imply negative autocorrelation.
+
+    ### Signs of High Risk

-    **Signs of High Risk**:
     - If a feature's DW value significantly deviates from 2, it could signal a high risk due to potential
     autocorrelation issues in the dataset.
-    - A value closer to '0' could imply positive autocorrelation, while a value nearer to '4' could point to negative
+    - A value closer to 0 could imply positive autocorrelation, while a value nearer to 4 could point to negative
     autocorrelation, both leading to potentially unreliable prediction models.

-    **Strengths**:
+    ### Strengths
+
     - The metric specializes in identifying autocorrelation in prediction model residuals.
     - Autocorrelation detection assists in diagnosing violation of various modeling technique assumptions, particularly
     in regression analysis and time-series data modeling.

-    **Limitations**:
+    ### Limitations
+
     - The Durbin-Watson Test mainly detects linear autocorrelation and could overlook other types of relationships.
     - The metric is highly sensitive to data points order. Shuffling the order could lead to notably different results.
     - The test only checks for first-order autocorrelation (between a variable and its immediate predecessor) and fails
-    to detect higher order autocorrelation.
+    to detect higher-order autocorrelation.
     """

-    name = "durbin_watson"
-    required_inputs = ["dataset"]
-    tasks = ["regression"]
-    tags = ["time_series_data", "forecasting", "statistical_test", "statsmodels"]
-
-    def run(self):
-        """
-        Calculates DB for each of the dataset features
-        """
-        x_train = self.inputs.dataset.df
-        dw_values = {}
-        for col in x_train.columns:
-            dw_values[col] = durbin_watson(x_train[col].values)
-
-        return self.cache_results(dw_values)
+    # Validate threshold values
+    if not (0 < threshold[0] < threshold[1] < 4):
+        raise ValueError(
+            "Invalid threshold. It should be in the form [a, b] where 0 < a < b < 4."
+        )
+
+    # Check if threshold values are around 2
+    if abs(2 - threshold[0]) > 1 or abs(2 - threshold[1]) > 1:
+        raise ValueError(
+            "Threshold values should be around 2 for meaningful Durbin-Watson test results."
+        )
+
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+    residuals = y_true - y_pred
+
+    dw_statistic = durbin_watson(residuals)
+
+    def get_autocorrelation(dw_value, threshold):
+        if dw_value < threshold[0]:
+            return "Positive autocorrelation"
+        elif dw_value > threshold[1]:
+            return "Negative autocorrelation"
+        else:
+            return "No autocorrelation"
+
+    results = pd.DataFrame(
+        {
+            "dw_statistic": [dw_statistic],
+            "threshold": [str(threshold)],
+            "autocorrelation": [get_autocorrelation(dw_statistic, threshold)],
+        }
+    )
+
+    return results
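
The refactored test boils down to computing the Durbin-Watson statistic on the model residuals and mapping it onto the default [1.5, 2.5] band. A minimal self-contained example of that interpretation, using synthetic residuals rather than the package's dataset and model inputs:

```python
import numpy as np
from statsmodels.stats.stattools import durbin_watson

rng = np.random.default_rng(42)
residuals = rng.normal(size=200)  # independent residuals, so the DW statistic lands near 2

dw = durbin_watson(residuals)
threshold = [1.5, 2.5]
if dw < threshold[0]:
    verdict = "Positive autocorrelation"
elif dw > threshold[1]:
    verdict = "Negative autocorrelation"
else:
    verdict = "No autocorrelation"
print(f"DW = {dw:.2f} -> {verdict}")
```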

validmind/tests/model_validation/statsmodels/GINITable.py

@@ -2,34 +2,37 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
-
 import numpy as np
 import pandas as pd
 from sklearn.metrics import roc_auc_score, roc_curve

-from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
+from validmind import tags, tasks


-@dataclass
-class GINITable(Metric):
+@tags("model_performance")
+@tasks("classification")
+def GINITable(dataset, model):
     """
     Evaluates classification model performance using AUC, GINI, and KS metrics for training and test datasets.

-    **Purpose**: The 'GINITable' metric is designed to evaluate the performance of a classification model by
-    emphasizing its discriminatory power. Specifically, it calculates and presents three important metrics - the Area
-    under the ROC Curve (AUC), the GINI coefficient, and the Kolmogov-Smirnov (KS) statistic - for both training and
-    test datasets.
+    ### Purpose
+
+    The 'GINITable' metric is designed to evaluate the performance of a classification model by emphasizing its
+    discriminatory power. Specifically, it calculates and presents three important metrics - the Area under the ROC
+    Curve (AUC), the GINI coefficient, and the Kolmogorov-Smirnov (KS) statistic - for both training and test datasets.
+
+    ### Test Mechanism

-    **Test Mechanism**: Using a dictionary for storing performance metrics for both the training and test datasets, the
-    'GINITable' metric calculates each of these metrics sequentially. The Area under the ROC Curve (AUC) is calculated
-    via the `roc_auc_score` function from the Scikit-Learn library. The GINI coefficient, a measure of statistical
-    dispersion, is then computed by doubling the AUC and subtracting 1. Finally, the Kolmogov-Smirnov (KS) statistic is
+    Using a dictionary for storing performance metrics for both the training and test datasets, the 'GINITable' metric
+    calculates each of these metrics sequentially. The Area under the ROC Curve (AUC) is calculated via the
+    `roc_auc_score` function from the Scikit-Learn library. The GINI coefficient, a measure of statistical dispersion,
+    is then computed by doubling the AUC and subtracting 1. Finally, the Kolmogorov-Smirnov (KS) statistic is
     calculated via the `roc_curve` function from Scikit-Learn, with the False Positive Rate (FPR) subtracted from the
     True Positive Rate (TPR) and the maximum value taken from the resulting data. These metrics are then stored in a
     pandas DataFrame for convenient visualization.

-    **Signs of High Risk**:
+    ### Signs of High Risk
+
     - Low values for performance metrics may suggest a reduction in model performance, particularly a low AUC which
     indicates poor classification performance, or a low GINI coefficient, which could suggest a decreased ability to
     discriminate different classes.
@@ -38,7 +41,8 @@ class GINITable(Metric):
     - Significant discrepancies between the performance on the training dataset and the test dataset may present
     another signal of high risk.

-    **Strengths**:
+    ### Strengths
+
     - Offers three key performance metrics (AUC, GINI, and KS) in one test, providing a more comprehensive evaluation
     of the model.
     - Provides a direct comparison between the model's performance on training and testing datasets, which aids in
@@ -47,7 +51,8 @@ class GINITable(Metric):
     performance even when dealing with imbalanced datasets.
     - Presents the metrics in a user-friendly table format for easy comprehension and analysis.

-    **Limitations**:
+    ### Limitations
+
     - The GINI coefficient and KS statistic are both dependent on the AUC value. Therefore, any errors in the
     calculation of the latter will adversely impact the former metrics too.
     - Mainly suited for binary classification models and may require modifications for effective application in
@@ -57,64 +62,26 @@ class GINITable(Metric):
     lead to inaccuracies in the metrics if the data is not appropriately preprocessed.
     """

-    name = "gini_table"
-    required_inputs = ["model", "datasets"]
-    tasks = ["classification"]
-    tags = ["visualization", "model_performance"]
-
-    def run(self):
-
-        summary_metrics = self.compute_metrics()
-
-        return self.cache_results(
-            {
-                "metrics_summary": summary_metrics.to_dict(orient="records"),
-            }
-        )
-
-    def compute_metrics(self):
-        """Computes AUC, GINI, and KS for an arbitrary number of datasets."""
-        # Initialize the dictionary to store results
-        metrics_dict = {"Dataset": [], "AUC": [], "GINI": [], "KS": []}
-
-        # Iterate over each dataset in the inputs
-        for _, dataset in enumerate(self.inputs.datasets):
-            dataset_label = (
-                dataset.input_id
-            )  # Use input_id as the label for each dataset
-            metrics_dict["Dataset"].append(dataset_label)
-
-            # Retrieve y_true and y_pred for the current dataset
-            y_true = np.ravel(dataset.y)  # Flatten y_true to make it one-dimensional
-            y_prob = dataset.y_prob(self.inputs.model)
-
-            # Compute metrics
-            y_true = np.array(y_true, dtype=float)
-            y_prob = np.array(y_prob, dtype=float)
-
-            fpr, tpr, _ = roc_curve(y_true, y_prob)
-            ks = max(tpr - fpr)
-            auc = roc_auc_score(y_true, y_prob)
-            gini = 2 * auc - 1
-
-            # Add the metrics to the dictionary
-            metrics_dict["AUC"].append(auc)
-            metrics_dict["GINI"].append(gini)
-            metrics_dict["KS"].append(ks)
-
-        # Create a DataFrame to store and return the results
-        metrics_df = pd.DataFrame(metrics_dict)
-        return metrics_df
-
-    def summary(self, metric_value):
-        summary_metrics_table = metric_value["metrics_summary"]
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=summary_metrics_table,
-                    metadata=ResultTableMetadata(
-                        title="AUC, GINI and KS for train and test datasets"
-                    ),
-                )
-            ]
-        )
+    metrics_dict = {"AUC": [], "GINI": [], "KS": []}
+
+    # Retrieve y_true and y_pred for the current dataset
+    y_true = np.ravel(dataset.y)  # Flatten y_true to make it one-dimensional
+    y_prob = dataset.y_prob(model)
+
+    # Compute metrics
+    y_true = np.array(y_true, dtype=float)
+    y_prob = np.array(y_prob, dtype=float)
+
+    fpr, tpr, _ = roc_curve(y_true, y_prob)
+    ks = max(tpr - fpr)
+    auc = roc_auc_score(y_true, y_prob)
+    gini = 2 * auc - 1
+
+    # Add the metrics to the dictionary
+    metrics_dict["AUC"].append(auc)
+    metrics_dict["GINI"].append(gini)
+    metrics_dict["KS"].append(ks)
+
+    # Create a DataFrame to store and return the results
+    metrics_df = pd.DataFrame(metrics_dict)
+    return metrics_df
+ return metrics_df