validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +80 -119
  3. validmind/ai/test_result_description/config.yaml +29 -0
  4. validmind/ai/test_result_description/context.py +73 -0
  5. validmind/ai/test_result_description/image_processing.py +124 -0
  6. validmind/ai/test_result_description/system.jinja +39 -0
  7. validmind/ai/test_result_description/user.jinja +25 -0
  8. validmind/api_client.py +89 -43
  9. validmind/client.py +2 -2
  10. validmind/client_config.py +11 -14
  11. validmind/datasets/credit_risk/__init__.py +1 -0
  12. validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
  13. validmind/datasets/credit_risk/lending_club_bias.py +142 -0
  14. validmind/datasets/regression/fred_timeseries.py +67 -138
  15. validmind/template.py +1 -0
  16. validmind/test_suites/__init__.py +0 -2
  17. validmind/test_suites/statsmodels_timeseries.py +1 -1
  18. validmind/test_suites/summarization.py +0 -1
  19. validmind/test_suites/time_series.py +0 -43
  20. validmind/tests/__types__.py +14 -15
  21. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  22. validmind/tests/data_validation/ADF.py +31 -24
  23. validmind/tests/data_validation/AutoAR.py +9 -9
  24. validmind/tests/data_validation/AutoMA.py +23 -16
  25. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  26. validmind/tests/data_validation/AutoStationarity.py +21 -16
  27. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  28. validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +34 -34
  29. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +85 -124
  30. validmind/tests/data_validation/ClassImbalance.py +15 -12
  31. validmind/tests/data_validation/DFGLSArch.py +19 -13
  32. validmind/tests/data_validation/DatasetDescription.py +17 -11
  33. validmind/tests/data_validation/DatasetSplit.py +7 -5
  34. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  35. validmind/tests/data_validation/Duplicates.py +33 -25
  36. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  37. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  38. validmind/tests/data_validation/HighCardinality.py +19 -12
  39. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  40. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  41. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  42. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  43. validmind/tests/data_validation/JarqueBera.py +70 -0
  44. validmind/tests/data_validation/KPSS.py +34 -29
  45. validmind/tests/data_validation/LJungBox.py +66 -0
  46. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  47. validmind/tests/data_validation/MissingValues.py +32 -27
  48. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  49. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  50. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  51. validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
  52. validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
  53. validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
  54. validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
  55. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  56. validmind/tests/data_validation/RunsTest.py +72 -0
  57. validmind/tests/data_validation/ScatterPlot.py +63 -78
  58. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  59. validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +35 -30
  60. validmind/tests/data_validation/Skewness.py +35 -37
  61. validmind/tests/data_validation/SpreadPlot.py +35 -35
  62. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  63. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  64. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  65. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  66. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  67. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  68. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  69. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  70. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  71. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  72. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  73. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  74. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  75. validmind/tests/data_validation/UniqueRows.py +11 -6
  76. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  77. validmind/tests/data_validation/WOEBinTable.py +35 -30
  78. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  79. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  80. validmind/tests/data_validation/nlp/Hashtags.py +42 -40
  81. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  82. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  83. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  84. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  85. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  86. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  87. validmind/tests/data_validation/nlp/TextDescription.py +39 -36
  88. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  89. validmind/tests/decorator.py +81 -42
  90. validmind/tests/model_validation/BertScore.py +36 -27
  91. validmind/tests/model_validation/BleuScore.py +25 -19
  92. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  93. validmind/tests/model_validation/ContextualRecall.py +38 -13
  94. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  95. validmind/tests/model_validation/MeteorScore.py +46 -33
  96. validmind/tests/model_validation/ModelMetadata.py +32 -64
  97. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  98. validmind/tests/model_validation/RegardScore.py +30 -14
  99. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  100. validmind/tests/model_validation/RougeScore.py +36 -30
  101. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  102. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  103. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  104. validmind/tests/model_validation/TokenDisparity.py +31 -23
  105. validmind/tests/model_validation/ToxicityScore.py +26 -17
  106. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  107. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  108. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  109. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  110. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  111. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  112. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  113. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  114. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  115. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  116. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  117. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  118. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  119. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  120. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  121. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  122. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  123. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  124. validmind/tests/model_validation/ragas/AspectCritique.py +12 -6
  125. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  126. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  127. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  128. validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
  129. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  130. validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
  131. validmind/tests/model_validation/ragas/utils.py +6 -0
  132. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  133. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  134. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  135. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  136. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  137. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  138. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  139. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  140. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  141. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  142. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  143. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  144. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  145. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  146. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  147. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  148. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  149. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  150. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +32 -26
  151. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  152. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  153. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  154. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  155. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  156. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  157. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -94
  158. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  159. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
  160. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +66 -5
  161. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  162. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  163. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  164. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  165. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  166. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  167. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +59 -32
  168. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  169. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  170. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  171. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +86 -119
  172. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  173. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  174. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  175. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  176. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  177. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  178. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  179. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  180. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  181. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  182. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  183. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  184. validmind/tests/prompt_validation/Bias.py +14 -11
  185. validmind/tests/prompt_validation/Clarity.py +16 -14
  186. validmind/tests/prompt_validation/Conciseness.py +7 -5
  187. validmind/tests/prompt_validation/Delimitation.py +23 -22
  188. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  189. validmind/tests/prompt_validation/Robustness.py +12 -10
  190. validmind/tests/prompt_validation/Specificity.py +13 -11
  191. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  192. validmind/tests/run.py +68 -23
  193. validmind/unit_metrics/__init__.py +81 -144
  194. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  195. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  196. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  197. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  198. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  199. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  200. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  201. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  202. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  203. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  204. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  205. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  206. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  207. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  208. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  209. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  210. validmind/utils.py +4 -0
  211. validmind/vm_models/dataset/dataset.py +2 -0
  212. validmind/vm_models/figure.py +5 -0
  213. validmind/vm_models/test/metric.py +1 -0
  214. validmind/vm_models/test/result_wrapper.py +143 -158
  215. validmind/vm_models/test/threshold_test.py +1 -0
  216. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
  217. validmind-2.5.18.dist-info/RECORD +324 -0
  218. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  219. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  220. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  221. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  222. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  223. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  224. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  225. validmind/tests/model_validation/statsmodels/JarqueBera.py +0 -73
  226. validmind/tests/model_validation/statsmodels/LJungBox.py +0 -66
  227. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  228. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  229. validmind/tests/model_validation/statsmodels/RunsTest.py +0 -71
  230. validmind-2.5.8.dist-info/RECORD +0 -318
  231. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
  232. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
  233. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0
@@ -14,49 +14,52 @@ from validmind.vm_models import Figure, Metric
 @dataclass
 class RegressionModelForecastPlotLevels(Metric):
     """
-    Compares and visualizes forecasted and actual values of regression models on both raw and transformed datasets.
-
-    **Purpose:**
-    The `RegressionModelForecastPlotLevels` metric is designed to visually assess a series of regression models'
-    performance. It achieves this by contrasting the models' forecasts with the observed data from the respective
-    training and test datasets. The gauge of accuracy here involves determining the extent of closeness between
-    forecasted and actual values. Accordingly, if any transformations are specified, the metric will handle
-    transforming the data before making this comparison.
-
-    **Test Mechanism:**
-    The `RegressionModelForecastPlotLevels` class in Python initiates with a `transformation` parameter, which default
-    aggregates to None. Initially, the class checks for the presence of model objects and raises a `ValueError` if none
-    are found. Each model is then processed, creating predictive forecasts for both its training and testing datasets.
-    These forecasts are then contrasted with the actual values and plotted. In situations where a specified
-    transformation, like "integrate," is specified, the class navigates the transformation steps (performing cumulative
-    sums to generate a novel series, for instance). Finally, plots are produced that compare observed and forecasted
-    values for both the raw and transformed datasets.
-
-    **Signs of High Risk:**
-    Indications of high risk or failure in the model's performance can be derived from checking the generated plots.
-    When the forecasted values dramatically deviate from the observed values in either the training or testing
-    datasets, it suggests a high risk situation. A significant deviation could be a symptom of either overfitting or
-    underfitting, both scenarios are worrying. Such discrepancies could inhibit the model's ability to create precise,
-    generalized results.
-
-    **Strengths:**
-
-    - Visual Evaluations: The metric provides a visual and comparative way of assessing multiple regression models at
-    once. This allows easier interpretation and evaluation of their forecasting accuracy.
-    - Transformation Handling: This metric can handle transformations like "integrate," enhancing its breadth and
-    flexibility in evaluating different models.
-    - Detailed Perspective: By looking at the performance on both datasets (training and testing), the metric may give
-    a detailed overview of the model.
-
-    **Limitations:**
-
-    - Subjectivity: Relying heavily on visual interpretations; assessments may differ from person to person.
-    - Limited Transformation Capability: Currently, only the "integrate" transformation is supported, implying complex
-    transformations might go unchecked or unhandled.
-    - Overhead: The plotting mechanism may become computationally costly when applying to extensive datasets,
-    increasing runtime.
-    - Numerical Measurement: Although visualization is instrumental, a corresponding numerical measure would further
-    reinforce the observations. However, this metric does not provide numerical measures.
+    Assesses the alignment between forecasted and observed values in regression models through visual plots, including
+    handling data transformations.
+
+    ### Purpose
+
+    The `RegressionModelForecastPlotLevels` test aims to visually assess the performance of a series of regression
+    models by comparing their forecasted values against the actual observed values in both training and test datasets.
+    This test helps determine the accuracy of the models and can handle specific data transformations before making the
+    comparison, providing a comprehensive evaluation of model performance.
+
+    ### Test Mechanism
+
+    The test mechanism involves initializing the `RegressionModelForecastPlotLevels` class with an optional
+    `transformation` parameter. The class then:
+
+    - Checks for the presence of model objects and raises a `ValueError` if none are found.
+    - Processes each model to generate predictive forecasts for both training and testing datasets.
+    - Contrasts these forecasts with the actual observed values.
+    - Produces plots to visually compare forecasted and observed values for both raw and transformed datasets.
+    - Handles specified transformations (e.g., "integrate") by performing cumulative sums to create a new series before
+    plotting.
+
+    ### Signs of High Risk
+
+    - Significant deviation between forecasted and observed values in training or testing datasets.
+    - Patterns suggesting overfitting or underfitting.
+    - Large discrepancies in the plotted forecasts, indicating potential issues with model generalizability and
+    precision.
+
+    ### Strengths
+
+    - **Visual Evaluations**: Provides an intuitive, visual way to assess multiple regression models, aiding in easier
+    interpretation and evaluation of forecast accuracy.
+    - **Transformation Handling**: Can process specified data transformations such as "integrate," enhancing
+    flexibility.
+    - **Detailed Perspective**: Assesses performance on both training and testing datasets, offering a comprehensive
+    view of model behavior.
+
+    ### Limitations
+
+    - **Subjectivity**: Relies heavily on visual interpretation, which may vary between individuals.
+    - **Limited Transformation Capability**: Supports only the "integrate" transformation; other complex
+    transformations might not be handled.
+    - **Overhead**: Plotting can be computationally intensive for large datasets, increasing runtime.
+    - **Numerical Measurement**: Does not provide a numerical metric to quantify forecast accuracy, relying solely on
+    visual assessment.
     """
 
     name = "regression_forecast_plot_levels"
@@ -16,44 +16,46 @@ logger = get_logger(__name__)
 @dataclass
 class RegressionModelSensitivityPlot(Metric):
     """
-    Tests the sensitivity of a regression model to variations in independent variables by applying shocks and
-    visualizing the effects.
-
-    **Purpose**: The Regression Sensitivity Plot metric is designed to perform sensitivity analysis on regression
-    models. This metric aims to measure the impact of slight changes (shocks) applied to individual variables on the
-    system's outcome while keeping all other variables constant. By doing so, it analyzes the effects of each
-    independent variable on the dependent variable within the regression model and helps identify significant risk
-    factors that could substantially influence the model's output.
-
-    **Test Mechanism**: This metric operates by initially applying shocks of varying magnitudes, defined by specific
-    parameters, to each of the model's features, one at a time. With all other variables held constant, a new
-    prediction is made for each dataset subjected to shocks. Any changes in the model's predictions are directly
-    attributed to the shocks applied. In the event that the transformation parameter is set to "integrate", initial
-    predictions and target values undergo transformation via an integration function before being plotted. Lastly, a
-    plot demonstrating observed values against predicted values for each model is generated, showcasing a distinct line
-    graph illustrating predictions for each shock.
-
-    **Signs of High Risk**:
-    - If the plot exhibits drastic alterations in model predictions consequent to minor shocks to an individual
-    variable, it may indicate high risk. This underscores potentially high model sensitivity to changes in that
-    variable, suggesting over-dependence on that variable for predictions.
-    - Unusually high or unpredictable shifts in response to shocks may also denote potential model instability or
+    Assesses the sensitivity of a regression model to changes in independent variables by applying shocks and
+    visualizing the impact.
+
+    ### Purpose
+
+    The Regression Sensitivity Plot test is designed to perform sensitivity analysis on regression models. This test
+    aims to measure the impact of slight changes (shocks) applied to individual variables on the system's outcome while
+    keeping all other variables constant. By doing so, it analyzes the effects of each independent variable on the
+    dependent variable within the regression model, helping identify significant risk factors that could substantially
+    influence the model's output.
+
+    ### Test Mechanism
+
+    This test operates by initially applying shocks of varying magnitudes, defined by specific parameters, to each of
+    the model's features, one at a time. With all other variables held constant, a new prediction is made for each
+    dataset subjected to shocks. Any changes in the model's predictions are directly attributed to the shocks applied.
+    If the transformation parameter is set to "integrate," initial predictions and target values undergo transformation
+    via an integration function before being plotted. Finally, a plot demonstrating observed values against predicted
+    values for each model is generated, showcasing a distinct line graph illustrating predictions for each shock.
+
+    ### Signs of High Risk
+
+    - Drastic alterations in model predictions due to minor shocks to an individual variable, indicating high
+    sensitivity and potential over-dependence on that variable.
+    - Unusually high or unpredictable shifts in response to shocks, suggesting potential model instability or
     overfitting.
 
-    **Strengths**:
-    - The metric allows identification of variables strongly influencing the model outcomes, paving the way for
-    understanding feature importance.
-    - It generates visual plots which make the results easily interpretable even to non-technical stakeholders.
-    - Beneficial in identifying overfitting and detecting unstable models that over-react to minor changes in variables.
-
-    **Limitations**:
-    - The metric operates on the assumption that all other variables remain unchanged during the application of a
-    shock. However, real-world situations where variables may possess intricate interdependencies may not always
-    reflect this.
-    - It is best compatible with linear models and may not effectively evaluate the sensitivity of non-linear model
-    configurations.
-    - The metric does not provide a numerical risk measure. It offers only a visual representation, which may invite
-    subjectivity in interpretation.
+    ### Strengths
+
+    - Helps identify variables that strongly influence model outcomes, aiding in understanding feature importance.
+    - Generates visual plots, making results easily interpretable even to non-technical stakeholders.
+    - Useful in identifying overfitting and detecting unstable models that react excessively to minor variable changes.
+
+    ### Limitations
+
+    - Operates on the assumption that all other variables remain unchanged during the application of a shock, which may
+    not reflect real-world interdependencies.
+    - Best compatible with linear models and may not effectively evaluate the sensitivity of non-linear models.
+    - Provides a visual representation without a numerical risk measure, potentially introducing subjectivity in
+    interpretation.
    """
 
     name = "regression_sensitivity_plot"
@@ -17,36 +17,38 @@ class RegressionModelSummary(Metric):
     """
     Evaluates regression model performance using metrics including R-Squared, Adjusted R-Squared, MSE, and RMSE.
 
-    **Purpose**: This metric test evaluates the performance of regression models by measuring their predictive ability
-    with regards to dependent variables given changes in the independent variables. Its measurement tools include
-    conventional regression metrics such as R-Squared, Adjusted R-Squared, Mean Squared Error (MSE), and Root Mean
-    Squared Error (RMSE).
-
-    **Test Mechanism**: This test employs the 'train_ds' attribute of the model to gather and analyze the training
-    data. Initially, it fetches the independent variables and uses the model to make predictions on these given
-    features. Subsequently, it calculates several standard regression performance metrics including R-Square, Adjusted
-    R-Squared, Mean Squared Error (MSE), and Root Mean Squared Error (RMSE), which quantify the approximation of the
-    predicted responses to the actual responses.
-
-    **Signs of High Risk**:
-    - Low R-Squared and Adjusted R-Squared values. A poor fit between the model predictions and the true responses is
-    indicated by low values of these metrics, suggesting the model explains a small fraction of the variance in the
-    target variable.
-    - High MSE and RMSE values represent a high prediction error and point to poor model performance.
-
-    **Strengths**:
+    ### Purpose
+
+    The Regression Model Summary test evaluates the performance of regression models by measuring their predictive
+    ability regarding dependent variables given changes in the independent variables. It uses conventional regression
+    metrics such as R-Squared, Adjusted R-Squared, Mean Squared Error (MSE), and Root Mean Squared Error (RMSE) to
+    assess the model's accuracy and fit.
+
+    ### Test Mechanism
+
+    This test employs the 'train_ds' attribute of the model to gather and analyze the training data. Initially, it
+    fetches the independent variables and uses the model to make predictions on these given features. Subsequently, it
+    calculates several standard regression performance metrics including R-Squared, Adjusted R-Squared, Mean Squared
+    Error (MSE), and Root Mean Squared Error (RMSE), which quantify the approximation of the predicted responses to the
+    actual responses.
+
+    ### Signs of High Risk
+
+    - Low R-Squared and Adjusted R-Squared values.
+    - High MSE and RMSE values.
+
+    ### Strengths
+
     - Offers an extensive evaluation of regression models by combining four key measures of model accuracy and fit.
     - Provides a comprehensive view of the model's performance.
-    - Both the R-Squared and Adjusted R-Squared measures are readily interpretable. They represent the proportion of
-    total variation in the dependent variable that can be explained by the independent variables.
-
-    **Limitations**:
-    - Applicable exclusively to regression models. It is not suited for evaluating binary classification models or time
-    series models, thus limiting its scope.
-    - Although RMSE and MSE are sound measures of prediction error, they might be sensitive to outliers, potentially
-    leading to an overestimation of the model's prediction error.
-    - A high R-squared or adjusted R-squared may not necessarily indicate a good model, especially in cases where the
-    model is possibly overfitting the data.
+    - Both the R-Squared and Adjusted R-Squared measures are readily interpretable.
+
+    ### Limitations
+
+    - Applicable exclusively to regression models.
+    - RMSE and MSE might be sensitive to outliers.
+    - A high R-Squared or Adjusted R-Squared may not necessarily indicate a good model, especially in cases of
+    overfitting.
     """
 
     name = "regression_model_summary"
@@ -21,28 +21,35 @@ logger = get_logger(__name__)
 class RegressionPermutationFeatureImportance(Metric):
     """
     Assesses the significance of each feature in a model by evaluating the impact on model performance when feature
-    values are randomly rearranged. Specifically designed for use with statsmodels, this metric offers insight into the
-    importance of features based on the decrease in model's predictive accuracy, typically R².
+    values are randomly rearranged.
 
-    **Purpose**: The primary purpose of this metric is to determine which features significantly impact the performance
-    of a regression model developed using statsmodels. The metric measures how much the prediction accuracy deteriorates
+    ### Purpose
+
+    The primary purpose of this metric is to determine which features significantly impact the performance of a
+    regression model developed using statsmodels. The metric measures how much the prediction accuracy deteriorates
     when each feature's values are permuted.
 
-    **Test Mechanism**: This metric shuffles the values of each feature one at a time in the dataset, computes the model's
-    performance after each permutation, and compares it to the baseline performance. A significant decrease in performance
+    ### Test Mechanism
+
+    This metric shuffles the values of each feature one at a time in the dataset, computes the model's performance
+    after each permutation, and compares it to the baseline performance. A significant decrease in performance
     indicates the importance of the feature.
 
-    **Signs of High Risk**:
-    - Significant reliance on a feature that when permuted leads to a substantial decrease in performance, suggesting
+    ### Signs of High Risk
+
+    - Significant reliance on a feature that, when permuted, leads to a substantial decrease in performance, suggesting
     overfitting or high model dependency on that feature.
     - Features identified as unimportant despite known impacts from domain knowledge, suggesting potential issues in
     model training or data preprocessing.
 
-    **Strengths**:
-    - Directly assesses the impact of each feature on model performance, providing clear insights into model dependencies.
+    ### Strengths
+
+    - Directly assesses the impact of each feature on model performance, providing clear insights into model
+    dependencies.
     - Model-agnostic within the scope of statsmodels, applicable to any regression model that outputs predictions.
 
-    **Limitations**:
+    ### Limitations
+
     - The metric is specific to statsmodels and cannot be used with other types of models without adaptation.
     - It does not capture interactions between features, which can lead to underestimating the importance of correlated
     features.
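Editor's note: the permutation procedure described above reduces to a short loop. A sketch, assuming a fitted model with a `predict` method and a pandas feature frame (`permutation_importance_r2` is a hypothetical name, not the package's internal helper):

import numpy as np
from sklearn.metrics import r2_score

def permutation_importance_r2(model, X, y, seed=0):
    rng = np.random.default_rng(seed)
    baseline = r2_score(y, model.predict(X))
    importances = {}
    for feature in X.columns:
        X_permuted = X.copy()
        X_permuted[feature] = rng.permutation(X_permuted[feature].values)
        # Importance = drop in R-Squared after shuffling this feature alone.
        importances[feature] = baseline - r2_score(y, model.predict(X_permuted))
    return importances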
@@ -2,136 +2,104 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import plotly.graph_objects as go
 from matplotlib import cm
 
-from validmind.vm_models import Figure, Metric
+from validmind import tags, tasks
 
 
-@dataclass
-class ScorecardHistogram(Metric):
+@tags("visualization", "credit_risk", "logistic_regression")
+@tasks("classification")
+def ScorecardHistogram(dataset, title="Histogram of Scores", score_column="score"):
     """
-    Creates histograms of credit scores, from both default and non-default instances, generated by a credit-risk model.
-
-    **Purpose**: The Scorecard Histogram test metric provides a visual interpretation of the credit scores generated by
-    a machine learning model for credit-risk classification tasks. It aims to compare the alignment of the model's
-    scoring decisions with the actual outcomes of credit loan applications. It helps in identifying potential
-    discrepancies between the model's predictions and real-world risk levels.
-
-    **Test Mechanism**: This metric uses logistic regression to generate a histogram of credit scores for both default
-    (negative class) and non-default (positive class) instances. Using both training and test datasets, the metric
-    calculates the credit score of each instance with a scorecard method, considering the impact of different features
-    on the likelihood of default. İncludes the default point to odds (PDO) scaling factor and predefined target score
-    and odds settings. Histograms for training and test sets are computed and plotted separately to offer insights into
-    the model's generalizability to unseen data.
-
-    **Signs of High Risk**:
-    - Discrepancies between the distributions of training and testing data, indicating a model's poor generalisation
+    The Scorecard Histogram test evaluates the distribution of credit scores between default and non-default instances,
+    providing critical insights into the performance and generalizability of credit-risk models.
+
+    ### Purpose
+
+    The Scorecard Histogram test metric provides a visual interpretation of the credit scores generated by a machine
+    learning model for credit-risk classification tasks. It aims to compare the alignment of the model's scoring
+    decisions with the actual outcomes of credit loan applications. It helps in identifying potential discrepancies
+    between the model's predictions and real-world risk levels.
+
+    ### Test Mechanism
+
+    This metric uses logistic regression to generate a histogram of credit scores for both default (negative class) and
+    non-default (positive class) instances. Using both training and test datasets, the metric calculates the credit
+    score of each instance with a scorecard method, considering the impact of different features on the likelihood of
+    default. It includes the default point to odds (PDO) scaling factor and predefined target score and odds settings.
+    Histograms for training and test sets are computed and plotted separately to offer insights into the model's
+    generalizability to unseen data.
+
+    ### Signs of High Risk
+
+    - Discrepancies between the distributions of training and testing data, indicating a model's poor generalization
     ability
-    - Skewed distributions favouring specific scores or classes, representing potential bias
+    - Skewed distributions favoring specific scores or classes, representing potential bias
+
+    ### Strengths
 
-    **Strengths**:
     - Provides a visual interpretation of the model's credit scoring system, enhancing comprehension of model behavior
     - Enables a direct comparison between actual and predicted scores for both training and testing data
     - Its intuitive visualization helps understand the model's ability to differentiate between positive and negative
     classes
     - Can unveil patterns or anomalies not easily discerned through numerical metrics alone
 
-    **Limitations**:
-    - Despite its value for visual interpretation, it doesn't quantify the performance of the model, and therefore may
+    ### Limitations
+
+    - Despite its value for visual interpretation, it doesn't quantify the performance of the model and therefore may
     lack precision for thorough model evaluation
     - The quality of input data can strongly influence the metric, as bias or noise in the data will affect both the
     score calculation and resultant histogram
     - Its specificity to credit scoring models limits its applicability across a wider variety of machine learning
     tasks and models
-    - The metric's effectiveness is somewhat tied to the subjective interpretation of the analyst, since it relies on
-    the analyst's judgment of the characteristics and implications of the plot.
+    - The metric's effectiveness is somewhat tied to the subjective interpretation of the analyst, relying on their
+    judgment of the characteristics and implications of the plot.
     """
 
-    name = "scorecard_histogram"
-    required_inputs = ["datasets"]
-    tasks = ["classification"]
-    tags = ["tabular_data", "visualization", "credit_risk"]
+    if score_column not in dataset.df.columns:
+        raise ValueError(
+            f"The required column '{score_column}' is not present in the dataset with input_id {dataset.input_id}"
+        )
 
-    default_params = {
-        "title": "Histogram of Scores",
-        "score_column": "score",
-    }
+    df = dataset.df
 
-    @staticmethod
-    def plot_score_histogram(dataframes, dataset_titles, score_col, target_col, title):
-        figures = []
-        # Generate a colormap and convert to Plotly-accepted color format
-        # Adjust 'viridis' to any other matplotlib colormap if desired
-        colormap = cm.get_cmap("viridis")
-
-        for _, (df, dataset_title) in enumerate(zip(dataframes, dataset_titles)):
-            fig = go.Figure()
-
-            # Get unique classes and assign colors
-            classes = sorted(df[target_col].unique())
-            colors = [
-                colormap(i / len(classes))[:3] for i in range(len(classes))
-            ]  # RGB
-            color_dict = {
-                cls: f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})"
-                for cls, rgb in zip(classes, colors)
-            }
-
-            for class_value in sorted(df[target_col].unique()):
-                scores_class = df[df[target_col] == class_value][score_col]
-                fig.add_trace(
-                    go.Histogram(
-                        x=scores_class,
-                        opacity=0.75,
-                        name=f"{dataset_title} {target_col} = {class_value}",
-                        marker=dict(
-                            color=color_dict[class_value],
-                        ),
-                    )
-                )
-            fig.update_layout(
-                barmode="overlay",
-                title_text=f"{title} - {dataset_title}",
-                xaxis_title="Score",
-                yaxis_title="Frequency",
-                legend_title=target_col,
-            )
-            figures.append(fig)
-        return figures
-
-    def run(self):
-        title = self.params["title"]
-        score_column = self.params["score_column"]
-        dataset_titles = [dataset.input_id for dataset in self.inputs.datasets]
-        target_column = self.inputs.datasets[0].target_column
-
-        dataframes = []
-        metric_value = {"score_histogram": {}}
-        for dataset in self.inputs.datasets:
-            if score_column not in dataset.df.columns:
-                raise ValueError(
-                    f"The required column '{score_column}' is not present in the dataset with input_id {dataset.input_id}"
-                )
-
-            dataframes.append(dataset.df.copy())
-            metric_value["score_histogram"][dataset.input_id] = list(
-                dataset.df[score_column]
-            )
+    fig = _plot_score_histogram(df, score_column, dataset.target_column, title)
 
-        figures = self.plot_score_histogram(
-            dataframes, dataset_titles, score_column, target_column, title
-        )
+    return fig
 
-        figures_list = [
-            Figure(
-                for_object=self,
-                key=f"score_histogram_{title.replace(' ', '_')}_{i+1}",
-                figure=fig,
-            )
-            for i, fig in enumerate(figures)
-        ]
 
-        return self.cache_results(metric_value=metric_value, figures=figures_list)
+def _plot_score_histogram(df, score_col, target_col, title):
+    # Generate a colormap and convert to Plotly-accepted color format
+    # Adjust 'viridis' to any other matplotlib colormap if desired
+    colormap = cm.get_cmap("viridis")
+
+    fig = go.Figure()
+
+    # Get unique classes and assign colors
+    classes = sorted(df[target_col].unique())
+    colors = [colormap(i / len(classes))[:3] for i in range(len(classes))]  # RGB
+    color_dict = {
+        cls: f"rgb({int(rgb[0]*255)}, {int(rgb[1]*255)}, {int(rgb[2]*255)})"
+        for cls, rgb in zip(classes, colors)
+    }
+
+    for class_value in sorted(df[target_col].unique()):
+        scores_class = df[df[target_col] == class_value][score_col]
+        fig.add_trace(
+            go.Histogram(
+                x=scores_class,
+                opacity=0.75,
+                name=f"{target_col} = {class_value}",
+                marker=dict(
+                    color=color_dict[class_value],
+                ),
+            )
+        )
+    fig.update_layout(
+        barmode="overlay",
+        title_text=f"{title}",
+        xaxis_title="Score",
+        yaxis_title="Frequency",
+    )
+    return fig
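Editor's note: the "points to double the odds" (PDO) scaling this docstring mentions maps log-odds to score points. A self-contained sketch of the standard scorecard formula (the target score/odds/PDO values here are illustrative, not the package's defaults):

import numpy as np

def score_from_odds(odds, target_score=600, target_odds=50, pdo=20):
    # factor converts log-odds into score points; PDO points double the odds.
    factor = pdo / np.log(2)
    offset = target_score - factor * np.log(target_odds)
    return offset + factor * np.log(odds)

# Doubling the odds adds exactly PDO points:
print(score_from_odds(50))   # 600.0 at the target odds
print(score_from_odds(100))  # 620.0 after the odds double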
@@ -16,37 +16,41 @@ def FeatureDrift(
     datasets, bins=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], feature_columns=None
 ):
     """
-    **Purpose**:
+    Evaluates changes in feature distribution over time to identify potential model drift.
+
+    ### Purpose
 
     The Feature Drift test aims to evaluate how much the distribution of features has shifted over time between two
     datasets, typically training and monitoring datasets. It uses the Population Stability Index (PSI) to quantify this
-    change, providing insights into the model's robustness and the necessity for retraining or feature engineering.
+    change, providing insights into the model’s robustness and the necessity for retraining or feature engineering.
 
-    **Test Mechanism**:
+    ### Test Mechanism
 
     This test calculates the PSI by:
+
     - Bucketing the distributions of each feature in both datasets.
     - Comparing the percentage of observations in each bucket between the two datasets.
     - Aggregating the differences across all buckets for each feature to produce the PSI score for that feature.
 
     The PSI score is interpreted as:
+
    - PSI < 0.1: No significant population change.
    - PSI < 0.2: Moderate population change.
    - PSI >= 0.2: Significant population change.
 
-    **Signs of High Risk**:
+    ### Signs of High Risk
 
    - PSI >= 0.2 for any feature, indicating a significant distribution shift.
    - Consistently high PSI scores across multiple features.
    - Sudden spikes in PSI in recent monitoring data compared to historical data.
 
-    **Strengths**:
+    ### Strengths
 
    - Provides a quantitative measure of feature distribution changes.
    - Easily interpretable thresholds for decision-making.
    - Helps in early detection of data drift, prompting timely interventions.
 
-    **Limitations**:
+    ### Limitations
 
    - May not capture more intricate changes in data distribution nuances.
    - Assumes that bucket thresholds (quantiles) adequately represent distribution shifts.
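Editor's note: the PSI computation this docstring describes can be sketched in a few lines. A minimal illustration, assuming buckets are defined by the reference (training) quantiles; `psi` is a hypothetical helper, not the package's exact implementation:

import numpy as np

def psi(expected, actual, quantiles=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)):
    # Bucket both samples using the reference (expected) quantile edges.
    edges = np.quantile(expected, quantiles)
    e_pct = np.bincount(np.searchsorted(edges, expected), minlength=len(edges) + 1) / len(expected)
    a_pct = np.bincount(np.searchsorted(edges, actual), minlength=len(edges) + 1) / len(actual)
    # Clip to avoid log(0) on empty buckets.
    e_pct, a_pct = np.clip(e_pct, 1e-6, None), np.clip(a_pct, 1e-6, None)
    # Aggregate per-bucket differences into a single drift score.
    return float(np.sum((a_pct - e_pct) * np.log(a_pct / e_pct)))

# Identical distributions give PSI near 0; a 0.5-sigma mean shift pushes it past 0.2.
rng = np.random.default_rng(0)
print(psi(rng.normal(0, 1, 5000), rng.normal(0.5, 1, 5000)))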