validmind 2.5.6__py3-none-any.whl → 2.5.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (212)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +26 -7
  3. validmind/api_client.py +89 -43
  4. validmind/client.py +2 -2
  5. validmind/client_config.py +11 -14
  6. validmind/datasets/regression/fred_timeseries.py +67 -138
  7. validmind/template.py +1 -0
  8. validmind/test_suites/__init__.py +0 -2
  9. validmind/test_suites/statsmodels_timeseries.py +1 -1
  10. validmind/test_suites/summarization.py +0 -1
  11. validmind/test_suites/time_series.py +0 -43
  12. validmind/tests/__types__.py +3 -13
  13. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  14. validmind/tests/data_validation/ADF.py +31 -24
  15. validmind/tests/data_validation/AutoAR.py +9 -9
  16. validmind/tests/data_validation/AutoMA.py +23 -16
  17. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  18. validmind/tests/data_validation/AutoStationarity.py +21 -16
  19. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  20. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +82 -124
  21. validmind/tests/data_validation/ClassImbalance.py +15 -12
  22. validmind/tests/data_validation/DFGLSArch.py +19 -13
  23. validmind/tests/data_validation/DatasetDescription.py +17 -11
  24. validmind/tests/data_validation/DatasetSplit.py +7 -5
  25. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  26. validmind/tests/data_validation/Duplicates.py +33 -25
  27. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  28. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  29. validmind/tests/data_validation/HighCardinality.py +19 -12
  30. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  31. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  32. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  33. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  34. validmind/tests/data_validation/KPSS.py +34 -29
  35. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  36. validmind/tests/data_validation/MissingValues.py +32 -27
  37. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  38. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  39. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  40. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  41. validmind/tests/data_validation/ScatterPlot.py +63 -78
  42. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  43. validmind/tests/data_validation/Skewness.py +35 -37
  44. validmind/tests/data_validation/SpreadPlot.py +35 -35
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  47. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  48. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  49. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  50. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  51. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  52. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  53. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  54. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  55. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  56. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  57. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  58. validmind/tests/data_validation/UniqueRows.py +11 -6
  59. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  60. validmind/tests/data_validation/WOEBinTable.py +35 -30
  61. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  62. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  63. validmind/tests/data_validation/nlp/Hashtags.py +27 -20
  64. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  65. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  66. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  67. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  68. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  69. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  70. validmind/tests/data_validation/nlp/TextDescription.py +36 -35
  71. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  72. validmind/tests/decorator.py +81 -42
  73. validmind/tests/model_validation/BertScore.py +36 -27
  74. validmind/tests/model_validation/BleuScore.py +25 -19
  75. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  76. validmind/tests/model_validation/ContextualRecall.py +35 -13
  77. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  78. validmind/tests/model_validation/MeteorScore.py +46 -33
  79. validmind/tests/model_validation/ModelMetadata.py +32 -64
  80. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  81. validmind/tests/model_validation/RegardScore.py +30 -14
  82. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  83. validmind/tests/model_validation/RougeScore.py +36 -30
  84. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  85. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  86. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  87. validmind/tests/model_validation/TokenDisparity.py +31 -23
  88. validmind/tests/model_validation/ToxicityScore.py +26 -17
  89. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  90. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  91. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  92. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  93. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  94. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  95. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  96. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  97. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  98. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  99. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  100. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  101. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  102. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  103. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  104. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  105. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  106. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  107. validmind/tests/model_validation/ragas/AspectCritique.py +7 -0
  108. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  109. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  110. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  111. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  112. validmind/tests/model_validation/ragas/utils.py +6 -0
  113. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  114. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  115. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  116. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  117. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  118. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  119. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  120. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  121. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  122. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  123. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  124. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  125. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  126. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  127. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  128. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  129. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  130. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  131. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +31 -25
  132. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  133. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  134. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  135. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  136. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  137. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  138. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -93
  139. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  140. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +113 -73
  141. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -5
  142. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  143. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  144. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  145. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  146. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  147. validmind/tests/model_validation/statsmodels/BoxPierce.py +14 -10
  148. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  149. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +19 -12
  150. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  151. validmind/tests/model_validation/statsmodels/JarqueBera.py +27 -22
  152. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  153. validmind/tests/model_validation/statsmodels/LJungBox.py +32 -28
  154. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  155. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +87 -119
  156. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  157. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  158. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  159. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  160. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  161. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  162. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  163. validmind/tests/model_validation/statsmodels/RunsTest.py +32 -28
  164. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  165. validmind/tests/model_validation/statsmodels/ShapiroWilk.py +15 -8
  166. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  167. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  168. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  169. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  170. validmind/tests/prompt_validation/Bias.py +14 -11
  171. validmind/tests/prompt_validation/Clarity.py +16 -14
  172. validmind/tests/prompt_validation/Conciseness.py +7 -5
  173. validmind/tests/prompt_validation/Delimitation.py +23 -22
  174. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  175. validmind/tests/prompt_validation/Robustness.py +12 -10
  176. validmind/tests/prompt_validation/Specificity.py +13 -11
  177. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  178. validmind/tests/run.py +68 -23
  179. validmind/unit_metrics/__init__.py +81 -144
  180. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  181. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  182. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  183. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  184. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  185. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  186. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  187. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  188. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  189. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  190. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  191. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  192. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  193. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  194. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  195. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  196. validmind/vm_models/dataset/dataset.py +2 -0
  197. validmind/vm_models/figure.py +5 -0
  198. validmind/vm_models/test/result_wrapper.py +93 -132
  199. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/METADATA +1 -1
  200. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/RECORD +203 -210
  201. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  202. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  203. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  204. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  205. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  206. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  207. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  208. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  209. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  210. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/LICENSE +0 -0
  211. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/WHEEL +0 -0
  212. {validmind-2.5.6.dist-info → validmind-2.5.15.dist-info}/entry_points.txt +0 -0
@@ -17,36 +17,38 @@ class RegressionModelSummary(Metric):
  """
  Evaluates regression model performance using metrics including R-Squared, Adjusted R-Squared, MSE, and RMSE.
 
- **Purpose**: This metric test evaluates the performance of regression models by measuring their predictive ability
- with regards to dependent variables given changes in the independent variables. Its measurement tools include
- conventional regression metrics such as R-Squared, Adjusted R-Squared, Mean Squared Error (MSE), and Root Mean
- Squared Error (RMSE).
-
- **Test Mechanism**: This test employs the 'train_ds' attribute of the model to gather and analyze the training
- data. Initially, it fetches the independent variables and uses the model to make predictions on these given
- features. Subsequently, it calculates several standard regression performance metrics including R-Square, Adjusted
- R-Squared, Mean Squared Error (MSE), and Root Mean Squared Error (RMSE), which quantify the approximation of the
- predicted responses to the actual responses.
-
- **Signs of High Risk**:
- - Low R-Squared and Adjusted R-Squared values. A poor fit between the model predictions and the true responses is
- indicated by low values of these metrics, suggesting the model explains a small fraction of the variance in the
- target variable.
- - High MSE and RMSE values represent a high prediction error and point to poor model performance.
-
- **Strengths**:
+ ### Purpose
+
+ The Regression Model Summary test evaluates the performance of regression models by measuring their predictive
+ ability regarding dependent variables given changes in the independent variables. It uses conventional regression
+ metrics such as R-Squared, Adjusted R-Squared, Mean Squared Error (MSE), and Root Mean Squared Error (RMSE) to
+ assess the model's accuracy and fit.
+
+ ### Test Mechanism
+
+ This test employs the 'train_ds' attribute of the model to gather and analyze the training data. Initially, it
+ fetches the independent variables and uses the model to make predictions on these given features. Subsequently, it
+ calculates several standard regression performance metrics including R-Squared, Adjusted R-Squared, Mean Squared
+ Error (MSE), and Root Mean Squared Error (RMSE), which quantify the approximation of the predicted responses to the
+ actual responses.
+
+ ### Signs of High Risk
+
+ - Low R-Squared and Adjusted R-Squared values.
+ - High MSE and RMSE values.
+
+ ### Strengths
+
  - Offers an extensive evaluation of regression models by combining four key measures of model accuracy and fit.
  - Provides a comprehensive view of the model's performance.
- - Both the R-Squared and Adjusted R-Squared measures are readily interpretable. They represent the proportion of
- total variation in the dependent variable that can be explained by the independent variables.
-
- **Limitations**:
- - Applicable exclusively to regression models. It is not suited for evaluating binary classification models or time
- series models, thus limiting its scope.
- - Although RMSE and MSE are sound measures of prediction error, they might be sensitive to outliers, potentially
- leading to an overestimation of the model's prediction error.
- - A high R-squared or adjusted R-squared may not necessarily indicate a good model, especially in cases where the
- model is possibly overfitting the data.
+ - Both the R-Squared and Adjusted R-Squared measures are readily interpretable.
+
+ ### Limitations
+
+ - Applicable exclusively to regression models.
+ - RMSE and MSE might be sensitive to outliers.
+ - A high R-Squared or Adjusted R-Squared may not necessarily indicate a good model, especially in cases of
+ overfitting.
  """
 
  name = "regression_model_summary"
@@ -21,28 +21,35 @@ logger = get_logger(__name__)
  class RegressionPermutationFeatureImportance(Metric):
  """
  Assesses the significance of each feature in a model by evaluating the impact on model performance when feature
- values are randomly rearranged. Specifically designed for use with statsmodels, this metric offers insight into the
- importance of features based on the decrease in model's predictive accuracy, typically R².
+ values are randomly rearranged.
 
- **Purpose**: The primary purpose of this metric is to determine which features significantly impact the performance
- of a regression model developed using statsmodels. The metric measures how much the prediction accuracy deteriorates
+ ### Purpose
+
+ The primary purpose of this metric is to determine which features significantly impact the performance of a
+ regression model developed using statsmodels. The metric measures how much the prediction accuracy deteriorates
  when each feature's values are permuted.
 
- **Test Mechanism**: This metric shuffles the values of each feature one at a time in the dataset, computes the model's
- performance after each permutation, and compares it to the baseline performance. A significant decrease in performance
+ ### Test Mechanism
+
+ This metric shuffles the values of each feature one at a time in the dataset, computes the model's performance
+ after each permutation, and compares it to the baseline performance. A significant decrease in performance
  indicates the importance of the feature.
 
- **Signs of High Risk**:
- - Significant reliance on a feature that when permuted leads to a substantial decrease in performance, suggesting
+ ### Signs of High Risk
+
+ - Significant reliance on a feature that, when permuted, leads to a substantial decrease in performance, suggesting
  overfitting or high model dependency on that feature.
  - Features identified as unimportant despite known impacts from domain knowledge, suggesting potential issues in
  model training or data preprocessing.
 
- **Strengths**:
- - Directly assesses the impact of each feature on model performance, providing clear insights into model dependencies.
+ ### Strengths
+
+ - Directly assesses the impact of each feature on model performance, providing clear insights into model
+ dependencies.
  - Model-agnostic within the scope of statsmodels, applicable to any regression model that outputs predictions.
 
- **Limitations**:
+ ### Limitations
+
  - The metric is specific to statsmodels and cannot be used with other types of models without adaptation.
  - It does not capture interactions between features, which can lead to underestimating the importance of correlated
  features.
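The shuffle-and-rescore mechanism described in this docstring can be sketched generically. This is an illustrative version that assumes a model exposing `predict` on a pandas DataFrame `X` and uses R-Squared as the score; the names `permutation_importance_r2`, `n_repeats`, and `seed` are assumptions, and the package's own implementation is statsmodels-specific:

```python
import numpy as np
from sklearn.metrics import r2_score


def permutation_importance_r2(model, X, y, n_repeats=5, seed=0):
    """Average drop in R-Squared when each column of X is shuffled; larger drop = more important."""
    rng = np.random.default_rng(seed)
    baseline = r2_score(y, model.predict(X))
    importances = {}
    for col in X.columns:
        drops = []
        for _ in range(n_repeats):
            X_perm = X.copy()
            X_perm[col] = rng.permutation(X_perm[col].values)  # break the feature-target link
            drops.append(baseline - r2_score(y, model.predict(X_perm)))
        importances[col] = float(np.mean(drops))
    return importances
```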
@@ -11,41 +11,45 @@ class RunsTest(Metric):
  """
  Executes Runs Test on ML model to detect non-random patterns in output data sequence.
 
- **Purpose**: The Runs Test is a statistical procedure used to determine whether the sequence of data extracted from
- the ML model behaves randomly or not. Specifically, it analyzes runs, sequences of consecutive positives or
- negatives, in the data to check if there are more or fewer runs than expected under the assumption of randomness.
- This can be an indication of some pattern, trend, or cycle in the model's output which may need attention.
-
- **Test Mechanism**: The testing mechanism applies the Runs Test from the statsmodels module on each column of the
- training dataset. For every feature in the dataset, a Runs Test is executed, whose output includes a Runs Statistic
- and P-value. A low P-value suggests that data arrangement in the feature is not likely to be random. The results
- are stored in a dictionary where the keys are the feature names, and the values are another dictionary storing the
- test statistic and the P-value for each feature.
-
- **Signs of High Risk**:
+ ### Purpose
+
+ The Runs Test is a statistical procedure used to determine whether the sequence of data extracted from the ML model
+ behaves randomly or not. Specifically, it analyzes runs, sequences of consecutive positives or negatives, in the
+ data to check if there are more or fewer runs than expected under the assumption of randomness. This can be an
+ indication of some pattern, trend, or cycle in the model's output which may need attention.
+
+ ### Test Mechanism
+
+ The testing mechanism applies the Runs Test from the statsmodels module on each column of the training dataset. For
+ every feature in the dataset, a Runs Test is executed, whose output includes a Runs Statistic and P-value. A low
+ P-value suggests that data arrangement in the feature is not likely to be random. The results are stored in a
+ dictionary where the keys are the feature names, and the values are another dictionary storing the test statistic
+ and the P-value for each feature.
+
+ ### Signs of High Risk
+
  - High risk is indicated when the P-value is close to zero.
- - If the p-value is less than a predefined significance level (like 0.05), it suggests that the runs (series of
+ - If the P-value is less than a predefined significance level (like 0.05), it suggests that the runs (series of
  positive or negative values) in the model's output are not random and are longer or shorter than what is expected
  under a random scenario.
  - This would mean there's a high risk of non-random distribution of errors or model outcomes, suggesting potential
  issues with the model.
 
- **Strengths**:
- - The strength of the Runs Test is that it's straightforward and fast for detecting non-random patterns in data
- sequence.
- - It can validate assumptions of randomness, which is particularly valuable for checking error distributions in
- regression models, trendless time series data, and making sure a classifier doesn't favour one class over another.
- - Moreover, it can be applied to both classification and regression tasks, making it versatile.
-
- **Limitations**:
- - The test assumes that the data is independently and identically distributed (i.i.d.), which might not be the case
- for many real-world datasets.
- - The conclusion drawn from the low p-value indicating non-randomness does not provide information about the type
+ ### Strengths
+
+ - Straightforward and fast for detecting non-random patterns in data sequence.
+ - Validates assumptions of randomness, which is valuable for checking error distributions in regression models,
+ trendless time series data, and ensuring a classifier doesn't favor one class over another.
+ - Can be applied to both classification and regression tasks, making it versatile.
+
+ ### Limitations
+
+ - Assumes that the data is independently and identically distributed (i.i.d.), which might not be the case for many
+ real-world datasets.
+ - The conclusion drawn from the low P-value indicating non-randomness does not provide information about the type
  or the source of the detected pattern.
- - Also, it is sensitive to extreme values (outliers), and overly large or small run sequences can influence the
- results.
- - Furthermore, this test does not provide model performance evaluation; it is used to detect patterns in the
- sequence of outputs only.
+ - Sensitive to extreme values (outliers), and overly large or small run sequences can influence the results.
+ - Does not provide model performance evaluation; it is used to detect patterns in the sequence of outputs only.
  """
 
  name = "runs_test"
@@ -2,136 +2,104 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
- from dataclasses import dataclass
-
  import plotly.graph_objects as go
  from matplotlib import cm
 
- from validmind.vm_models import Figure, Metric
+ from validmind import tags, tasks
 
 
- @dataclass
- class ScorecardHistogram(Metric):
+ @tags("visualization", "credit_risk", "logistic_regression")
+ @tasks("classification")
+ def ScorecardHistogram(dataset, title="Histogram of Scores", score_column="score"):
      """
-     Creates histograms of credit scores, from both default and non-default instances, generated by a credit-risk model.
-
-     **Purpose**: The Scorecard Histogram test metric provides a visual interpretation of the credit scores generated by
-     a machine learning model for credit-risk classification tasks. It aims to compare the alignment of the model's
-     scoring decisions with the actual outcomes of credit loan applications. It helps in identifying potential
-     discrepancies between the model's predictions and real-world risk levels.
-
-     **Test Mechanism**: This metric uses logistic regression to generate a histogram of credit scores for both default
-     (negative class) and non-default (positive class) instances. Using both training and test datasets, the metric
-     calculates the credit score of each instance with a scorecard method, considering the impact of different features
-     on the likelihood of default. İncludes the default point to odds (PDO) scaling factor and predefined target score
-     and odds settings. Histograms for training and test sets are computed and plotted separately to offer insights into
-     the model's generalizability to unseen data.
-
-     **Signs of High Risk**:
-     - Discrepancies between the distributions of training and testing data, indicating a model's poor generalisation
+     The Scorecard Histogram test evaluates the distribution of credit scores between default and non-default instances,
+     providing critical insights into the performance and generalizability of credit-risk models.
+
+     ### Purpose
+
+     The Scorecard Histogram test metric provides a visual interpretation of the credit scores generated by a machine
+     learning model for credit-risk classification tasks. It aims to compare the alignment of the model's scoring
+     decisions with the actual outcomes of credit loan applications. It helps in identifying potential discrepancies
+     between the model's predictions and real-world risk levels.
+
+     ### Test Mechanism
+
+     This metric uses logistic regression to generate a histogram of credit scores for both default (negative class) and
+     non-default (positive class) instances. Using both training and test datasets, the metric calculates the credit
+     score of each instance with a scorecard method, considering the impact of different features on the likelihood of
+     default. It includes the default point to odds (PDO) scaling factor and predefined target score and odds settings.
+     Histograms for training and test sets are computed and plotted separately to offer insights into the model's
+     generalizability to unseen data.
+
+     ### Signs of High Risk
+
+     - Discrepancies between the distributions of training and testing data, indicating a model's poor generalization
      ability
-     - Skewed distributions favouring specific scores or classes, representing potential bias
+     - Skewed distributions favoring specific scores or classes, representing potential bias
+
+     ### Strengths
 
-     **Strengths**:
      - Provides a visual interpretation of the model's credit scoring system, enhancing comprehension of model behavior
      - Enables a direct comparison between actual and predicted scores for both training and testing data
      - Its intuitive visualization helps understand the model's ability to differentiate between positive and negative
      classes
      - Can unveil patterns or anomalies not easily discerned through numerical metrics alone
 
-     **Limitations**:
-     - Despite its value for visual interpretation, it doesn't quantify the performance of the model, and therefore may
+     ### Limitations
+
+     - Despite its value for visual interpretation, it doesn't quantify the performance of the model and therefore may
      lack precision for thorough model evaluation
      - The quality of input data can strongly influence the metric, as bias or noise in the data will affect both the
      score calculation and resultant histogram
      - Its specificity to credit scoring models limits its applicability across a wider variety of machine learning
      tasks and models
-     - The metric's effectiveness is somewhat tied to the subjective interpretation of the analyst, since it relies on
-     the analyst's judgment of the characteristics and implications of the plot.
+     - The metric's effectiveness is somewhat tied to the subjective interpretation of the analyst, relying on their
+     judgment of the characteristics and implications of the plot.
      """
 
-     name = "scorecard_histogram"
-     required_inputs = ["datasets"]
-     tasks = ["classification"]
-     tags = ["tabular_data", "visualization", "credit_risk"]
+     if score_column not in dataset.df.columns:
+         raise ValueError(
+             f"The required column '{score_column}' is not present in the dataset with input_id {dataset.input_id}"
+         )
 
-     default_params = {
-         "title": "Histogram of Scores",
-         "score_column": "score",
-     }
+     df = dataset.df
 
-     @staticmethod
-     def plot_score_histogram(dataframes, dataset_titles, score_col, target_col, title):
-         figures = []
-         # Generate a colormap and convert to Plotly-accepted color format
-         # Adjust 'viridis' to any other matplotlib colormap if desired
-         colormap = cm.get_cmap("viridis")
-
-         for _, (df, dataset_title) in enumerate(zip(dataframes, dataset_titles)):
-             fig = go.Figure()
-
-             # Get unique classes and assign colors
-             classes = sorted(df[target_col].unique())
-             colors = [
-                 colormap(i / len(classes))[:3] for i in range(len(classes))
-             ]  # RGB
-             color_dict = {
-                 cls: f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})"
-                 for cls, rgb in zip(classes, colors)
-             }
-
-             for class_value in sorted(df[target_col].unique()):
-                 scores_class = df[df[target_col] == class_value][score_col]
-                 fig.add_trace(
-                     go.Histogram(
-                         x=scores_class,
-                         opacity=0.75,
-                         name=f"{dataset_title} {target_col} = {class_value}",
-                         marker=dict(
-                             color=color_dict[class_value],
-                         ),
-                     )
-                 )
-             fig.update_layout(
-                 barmode="overlay",
-                 title_text=f"{title} - {dataset_title}",
-                 xaxis_title="Score",
-                 yaxis_title="Frequency",
-                 legend_title=target_col,
-             )
-             figures.append(fig)
-         return figures
-
-     def run(self):
-         title = self.params["title"]
-         score_column = self.params["score_column"]
-         dataset_titles = [dataset.input_id for dataset in self.inputs.datasets]
-         target_column = self.inputs.datasets[0].target_column
-
-         dataframes = []
-         metric_value = {"score_histogram": {}}
-         for dataset in self.inputs.datasets:
-             if score_column not in dataset.df.columns:
-                 raise ValueError(
-                     f"The required column '{score_column}' is not present in the dataset with input_id {dataset.input_id}"
-                 )
-
-             dataframes.append(dataset.df.copy())
-             metric_value["score_histogram"][dataset.input_id] = list(
-                 dataset.df[score_column]
-             )
+     fig = _plot_score_histogram(df, score_column, dataset.target_column, title)
 
-         figures = self.plot_score_histogram(
-             dataframes, dataset_titles, score_column, target_column, title
-         )
+     return fig
 
-         figures_list = [
-             Figure(
-                 for_object=self,
-                 key=f"score_histogram_{title.replace(' ', '_')}_{i+1}",
-                 figure=fig,
-             )
-             for i, fig in enumerate(figures)
-         ]
 
-         return self.cache_results(metric_value=metric_value, figures=figures_list)
+ def _plot_score_histogram(df, score_col, target_col, title):
+     # Generate a colormap and convert to Plotly-accepted color format
+     # Adjust 'viridis' to any other matplotlib colormap if desired
+     colormap = cm.get_cmap("viridis")
+
+     fig = go.Figure()
+
+     # Get unique classes and assign colors
+     classes = sorted(df[target_col].unique())
+     colors = [colormap(i / len(classes))[:3] for i in range(len(classes))]  # RGB
+     color_dict = {
+         cls: f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})"
+         for cls, rgb in zip(classes, colors)
+     }
+
+     for class_value in sorted(df[target_col].unique()):
+         scores_class = df[df[target_col] == class_value][score_col]
+         fig.add_trace(
+             go.Histogram(
+                 x=scores_class,
+                 opacity=0.75,
+                 name=f"{target_col} = {class_value}",
+                 marker=dict(
+                     color=color_dict[class_value],
+                 ),
+             )
+         )
+     fig.update_layout(
+         barmode="overlay",
+         title_text=f"{title}",
+         xaxis_title="Score",
+         yaxis_title="Frequency",
+     )
+     return fig
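Because the test is now a plain decorated function that takes a single `dataset` rather than a `datasets` list, training and test histograms are produced by running it once per dataset. A hypothetical invocation, assuming the test ID mirrors the module path, that `run_test` is imported from `validmind.tests`, and that `vm_train_ds`/`vm_test_ds` are datasets previously registered with the library and already carrying a `score` column:

```python
from validmind.tests import run_test

# Illustrative only: run the refactored functional test separately for each dataset
for ds in (vm_train_ds, vm_test_ds):
    run_test(
        "validmind.model_validation.statsmodels.ScorecardHistogram",
        inputs={"dataset": ds},
        params={"score_column": "score", "title": "Histogram of Scores"},
    )
```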
@@ -11,29 +11,36 @@ class ShapiroWilk(Metric):
  """
  Evaluates feature-wise normality of training data using the Shapiro-Wilk test.
 
- **Purpose**: The Shapiro-Wilk test is utilized to investigate whether a particular dataset conforms to the standard
- normal distribution. This analysis is crucial in machine learning modeling because the normality of the data can
+ ### Purpose
+
+ The Shapiro-Wilk test is utilized to investigate whether a particular dataset conforms to the standard normal
+ distribution. This analysis is crucial in machine learning modeling because the normality of the data can
  profoundly impact the performance of the model. This metric is especially useful in evaluating various features of
  the dataset in both classification and regression tasks.
 
- **Test Mechanism**: The Shapiro-Wilk test is conducted on each feature column of the training dataset to determine
- if the data contained fall within the normal distribution. The test presents a statistic and a p-value, with the
- p-value serving to validate or repudiate the null hypothesis, which is that the tested data is normally distributed.
+ ### Test Mechanism
+
+ The Shapiro-Wilk test is conducted on each feature column of the training dataset to determine if the data
+ contained fall within the normal distribution. The test presents a statistic and a p-value, with the p-value
+ serving to validate or repudiate the null hypothesis, which is that the tested data is normally distributed.
+
+ ### Signs of High Risk
 
- **Signs of High Risk**:
  - A p-value that falls below 0.05 signifies a high risk as it discards the null hypothesis, indicating that the
  data does not adhere to the normal distribution.
  - For machine learning models built on the presumption of data normality, such an outcome could result in subpar
  performance or incorrect predictions.
 
- **Strengths**:
+ ### Strengths
+
  - The Shapiro-Wilk test is esteemed for its level of accuracy, thereby making it particularly well-suited to
  datasets of small to moderate sizes.
  - It proves its versatility through its efficient functioning in both classification and regression tasks.
  - By separately testing each feature column, the Shapiro-Wilk test can raise an alarm if a specific feature does
  not comply with the normality.
 
- **Limitations**:
+ ### Limitations
+
  - The Shapiro-Wilk test's sensitivity can be a disadvantage as it often rejects the null hypothesis (i.e., data is
  normally distributed), even for minor deviations, especially in large datasets. This may lead to unwarranted 'false
  alarms' of high risk by deeming the data as not normally distributed even if it approximates normal distribution.
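The per-column mechanism described here maps directly onto SciPy's implementation. A minimal sketch, assuming the training data is in a pandas DataFrame; the helper name `shapiro_by_column` and the 0.05 decision column are illustrative:

```python
import pandas as pd
from scipy.stats import shapiro


def shapiro_by_column(df: pd.DataFrame, alpha: float = 0.05) -> pd.DataFrame:
    """Shapiro-Wilk statistic and p-value for each numeric feature column."""
    rows = []
    for col in df.select_dtypes("number").columns:
        stat, p_value = shapiro(df[col].dropna())
        rows.append(
            {"column": col, "statistic": stat, "pvalue": p_value, "normal_at_alpha": p_value >= alpha}
        )
    return pd.DataFrame(rows)
```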
@@ -16,37 +16,41 @@ def FeatureDrift(
  datasets, bins=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], feature_columns=None
  ):
  """
- **Purpose**:
+ Evaluates changes in feature distribution over time to identify potential model drift.
+
+ ### Purpose
 
  The Feature Drift test aims to evaluate how much the distribution of features has shifted over time between two
  datasets, typically training and monitoring datasets. It uses the Population Stability Index (PSI) to quantify this
- change, providing insights into the model's robustness and the necessity for retraining or feature engineering.
+ change, providing insights into the model’s robustness and the necessity for retraining or feature engineering.
 
- **Test Mechanism**:
+ ### Test Mechanism
 
  This test calculates the PSI by:
+
  - Bucketing the distributions of each feature in both datasets.
  - Comparing the percentage of observations in each bucket between the two datasets.
  - Aggregating the differences across all buckets for each feature to produce the PSI score for that feature.
 
  The PSI score is interpreted as:
+
  - PSI < 0.1: No significant population change.
  - PSI < 0.2: Moderate population change.
  - PSI >= 0.2: Significant population change.
 
- **Signs of High Risk**:
+ ### Signs of High Risk
 
  - PSI >= 0.2 for any feature, indicating a significant distribution shift.
  - Consistently high PSI scores across multiple features.
  - Sudden spikes in PSI in recent monitoring data compared to historical data.
 
- **Strengths**:
+ ### Strengths
 
  - Provides a quantitative measure of feature distribution changes.
  - Easily interpretable thresholds for decision-making.
  - Helps in early detection of data drift, prompting timely interventions.
 
- **Limitations**:
+ ### Limitations
 
  - May not capture more intricate changes in data distribution nuances.
  - Assumes that bucket thresholds (quantiles) adequately represent distribution shifts.
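The PSI calculation described in this docstring (bucket on the reference distribution, compare bucket shares, aggregate) can be sketched for a single feature as follows. This is an illustrative stand-alone version, not the package's code: the function name `psi_score` and the epsilon guard for empty buckets are assumptions, and the default quantile cut points mirror the `bins` parameter shown above.

```python
import numpy as np


def psi_score(reference, monitoring, quantiles=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)):
    """Population Stability Index of one feature between reference and monitoring samples."""
    reference = np.asarray(reference, dtype=float)
    monitoring = np.asarray(monitoring, dtype=float)
    cuts = np.quantile(reference, quantiles)    # bucket edges taken from the reference data
    ref_idx = np.digitize(reference, cuts)      # bucket index for each observation
    mon_idx = np.digitize(monitoring, cuts)
    n_buckets = len(cuts) + 1
    ref_pct = np.bincount(ref_idx, minlength=n_buckets) / len(reference)
    mon_pct = np.bincount(mon_idx, minlength=n_buckets) / len(monitoring)
    eps = 1e-6                                  # avoid log(0) for empty buckets
    ref_pct = np.clip(ref_pct, eps, None)
    mon_pct = np.clip(mon_pct, eps, None)
    return float(np.sum((mon_pct - ref_pct) * np.log(mon_pct / ref_pct)))
```

The resulting per-feature scores would then be read against the 0.1 and 0.2 thresholds listed above.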
@@ -12,31 +12,37 @@ from validmind import tags, tasks
  @tasks("monitoring")
  def PredictionAcrossEachFeature(datasets, model):
  """
- **Purpose:**
- This test shows visually the prediction using reference data and monitoring data across each individual feature. If
- there are significant differences in predictions across feature values from reference to monitoring dataset, then
- further investigation is needed as the model is producing predictions that are different than what was observed
- during the training of the model.
-
- **Test Mechanism:**
- The test creates scatter plots for each feature, comparing the reference dataset (used for training) with the
- monitoring dataset (used in production). Each plot has two subplots: one for the reference data and one for the
- monitoring data, visualizing the prediction probabilities. This allows for a visual comparison of the model's
- behavior across different datasets.
-
- **Signs of High Risk:**
- - Significant discrepancies between the reference and monitoring subplots for the same feature
- - Unexpected patterns or trends in monitoring data that weren't present in reference data
-
- **Strengths:**
- - Provides a clear visual representation of model performance across different features
- - Allows for easy identification of features where the model's predictions have changed
- - Facilitates quick detection of potential issues with the model when deployed in production
-
- **Limitations:**
- - Interpretation of scatter plots can be subjective and may require expertise
- - Visualizations do not provide quantitative metrics for objective evaluation
- - May not capture all types of distribution changes or issues with the model's predictions
+ Assesses differences in model predictions across individual features between reference and monitoring datasets
+ through visual analysis.
+
+ ### Purpose
+
+ The Prediction Across Each Feature test aims to visually compare model predictions for each feature between
+ reference (training) and monitoring (production) datasets. It helps identify significant differences in prediction
+ patterns for further investigation and ensures the model's consistency and stability over time.
+
+ ### Test Mechanism
+
+ The test generates scatter plots for each feature, comparing prediction probabilities between the reference and
+ monitoring datasets. Each plot consists of two subplots: one for reference data and one for monitoring data,
+ enabling visual comparison of the model's predictive behavior.
+
+ ### Signs of High Risk
+
+ - Significant discrepancies between the reference and monitoring subplots for the same feature.
+ - Unexpected patterns or trends in monitoring data that were absent in reference data.
+
+ ### Strengths
+
+ - Provides a clear visual representation of model performance across different features.
+ - Facilitates easy identification of features where the model's predictions have diverged.
+ - Enables quick detection of potential model performance issues in production.
+
+ ### Limitations
+
+ - Interpretation of scatter plots can be subjective and may require expertise.
+ - Visualizations do not provide quantitative metrics for objective evaluation.
+ - May not capture all types of distribution changes or issues with the model's predictions.
  """
 
  """
@@ -13,30 +13,38 @@ from validmind import tags, tasks
  @tasks("monitoring")
  def PredictionCorrelation(datasets, model):
  """
- **Purpose:**
- The test is used to assess the correlation pairs for each feature between model predictions from reference and
- monitoring datasets. The primary goal is to detect significant changes in these pairs, which may signal target
- drift, leading to lower model performance.
+ Assesses correlation changes between model predictions from reference and monitoring datasets to detect potential
+ target drift.
 
- **Test Mechanism:**
- The test calculates the correlation of each feature with model predictions for both reference and monitoring
- datasets. The test then compares these correlations side-by-side via a bar plot and a correlation table. Features
- with significant changes in correlation pairs highlight potential risks of model drift.
+ ### Purpose
+
+ To evaluate the changes in correlation pairs between model predictions and features from reference and monitoring
+ datasets. This helps in identifying significant shifts that may indicate target drift, potentially affecting model
+ performance.
+
+ ### Test Mechanism
+
+ This test calculates the correlation of each feature with model predictions for both reference and monitoring
+ datasets. It then compares these correlations side-by-side using a bar plot and a correlation table. Significant
+ changes in correlation pairs are highlighted to signal possible model drift.
+
+ ### Signs of High Risk
 
- **Signs of High Risk:**
  - Significant changes in correlation pairs between the reference and monitoring predictions.
- - Notable correlation differences indicating a potential shift in the relationship between features and the target
- variable.
-
- **Strengths:**
- - Allows for visual identification of drift in feature relationships with model predictions.
- - Comparison via a clear bar plot assists in understanding model stability over time.
- - Helps in early detection of target drift, enabling timely interventions.
-
- **Limitations:**
- - May require substantial reference and monitoring data for accurate comparison.
- - Correlation does not imply causation, and other factors might influence changes.
- - The method solely focuses on linear relationships, potentially missing non-linear interactions.
+ - Notable differences in correlation values, indicating a possible shift in the relationship between features and
+ the target variable.
+
+ ### Strengths
+
+ - Provides visual identification of drift in feature relationships with model predictions.
+ - Clear bar plot comparison aids in understanding model stability over time.
+ - Enables early detection of target drift, facilitating timely interventions.
+
+ ### Limitations
+
+ - Requires substantial reference and monitoring data for accurate comparison.
+ - Correlation does not imply causation; other factors may influence changes.
+ - Focuses solely on linear relationships, potentially missing non-linear interactions.
  """
 
  prediction_prob_column = f"{model.input_id}_probabilities"
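The side-by-side correlation comparison can be sketched as below, assuming pandas DataFrames for the reference and monitoring sets and a prediction column named as in the snippet above (`f"{model.input_id}_probabilities"`); the helper name `prediction_correlation_shift` and the absolute-change column are illustrative additions, not the package's implementation:

```python
import pandas as pd


def prediction_correlation_shift(ref_df, mon_df, feature_columns, prob_col):
    """Per-feature correlation with predictions in each dataset, plus the change between them."""
    rows = []
    for col in feature_columns:
        ref_corr = ref_df[col].corr(ref_df[prob_col])
        mon_corr = mon_df[col].corr(mon_df[prob_col])
        rows.append(
            {
                "feature": col,
                "reference_corr": ref_corr,
                "monitoring_corr": mon_corr,
                "abs_change": abs(mon_corr - ref_corr),
            }
        )
    return pd.DataFrame(rows).sort_values("abs_change", ascending=False)
```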