validmind 2.2.6__py3-none-any.whl → 2.3.3__py3-none-any.whl

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (105)
  1. validmind/__init__.py +2 -1
  2. validmind/__version__.py +1 -1
  3. validmind/{ai.py → ai/test_descriptions.py} +74 -82
  4. validmind/ai/utils.py +104 -0
  5. validmind/api_client.py +58 -19
  6. validmind/client.py +5 -5
  7. validmind/models/foundation.py +10 -6
  8. validmind/models/function.py +3 -1
  9. validmind/models/metadata.py +1 -1
  10. validmind/test_suites/__init__.py +1 -9
  11. validmind/test_suites/regression.py +0 -16
  12. validmind/test_suites/statsmodels_timeseries.py +1 -1
  13. validmind/tests/__init__.py +7 -7
  14. validmind/tests/__types__.py +170 -0
  15. validmind/tests/data_validation/ACFandPACFPlot.py +36 -27
  16. validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py +42 -13
  17. validmind/tests/data_validation/BivariateScatterPlots.py +38 -41
  18. validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py +67 -11
  19. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +1 -1
  20. validmind/tests/data_validation/HighPearsonCorrelation.py +12 -3
  21. validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
  22. validmind/tests/{model_validation/statsmodels → data_validation}/KPSS.py +64 -11
  23. validmind/tests/{model_validation/statsmodels → data_validation}/PhillipsPerronArch.py +65 -11
  24. validmind/tests/data_validation/ScatterPlot.py +1 -1
  25. validmind/tests/data_validation/SeasonalDecompose.py +12 -7
  26. validmind/tests/data_validation/TabularDateTimeHistograms.py +29 -33
  27. validmind/tests/data_validation/WOEBinPlots.py +1 -1
  28. validmind/tests/data_validation/WOEBinTable.py +1 -1
  29. validmind/tests/{model_validation/statsmodels → data_validation}/ZivotAndrewsArch.py +65 -11
  30. validmind/tests/data_validation/nlp/CommonWords.py +1 -1
  31. validmind/tests/data_validation/nlp/Hashtags.py +1 -1
  32. validmind/tests/data_validation/nlp/Mentions.py +1 -1
  33. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -1
  34. validmind/tests/data_validation/nlp/Punctuations.py +1 -1
  35. validmind/tests/data_validation/nlp/Sentiment.py +1 -1
  36. validmind/tests/data_validation/nlp/TextDescription.py +5 -1
  37. validmind/tests/data_validation/nlp/Toxicity.py +1 -1
  38. validmind/tests/decorator.py +13 -1
  39. validmind/tests/model_validation/FeaturesAUC.py +5 -3
  40. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +4 -0
  41. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +4 -0
  42. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +4 -0
  43. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +4 -0
  44. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -0
  45. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +4 -0
  46. validmind/tests/model_validation/ragas/AnswerCorrectness.py +3 -3
  47. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  48. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  49. validmind/tests/model_validation/ragas/AspectCritique.py +14 -8
  50. validmind/tests/model_validation/ragas/ContextEntityRecall.py +3 -4
  51. validmind/tests/model_validation/ragas/ContextPrecision.py +4 -5
  52. validmind/tests/model_validation/ragas/ContextRecall.py +3 -4
  53. validmind/tests/model_validation/ragas/ContextRelevancy.py +5 -4
  54. validmind/tests/model_validation/ragas/Faithfulness.py +6 -5
  55. validmind/tests/model_validation/ragas/utils.py +35 -9
  56. validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
  57. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +1 -1
  58. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +6 -8
  59. validmind/tests/model_validation/sklearn/RegressionErrors.py +1 -1
  60. validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +14 -8
  61. validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
  62. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -1
  63. validmind/tests/model_validation/statsmodels/GINITable.py +1 -1
  64. validmind/tests/model_validation/statsmodels/JarqueBera.py +1 -1
  65. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +1 -1
  66. validmind/tests/model_validation/statsmodels/LJungBox.py +1 -1
  67. validmind/tests/model_validation/statsmodels/Lilliefors.py +1 -1
  68. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +4 -0
  69. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +9 -4
  70. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -2
  71. validmind/tests/model_validation/statsmodels/RunsTest.py +1 -1
  72. validmind/tests/model_validation/statsmodels/ShapiroWilk.py +1 -1
  73. validmind/tests/prompt_validation/Bias.py +14 -11
  74. validmind/tests/prompt_validation/Clarity.py +14 -11
  75. validmind/tests/prompt_validation/Conciseness.py +14 -11
  76. validmind/tests/prompt_validation/Delimitation.py +14 -11
  77. validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
  78. validmind/tests/prompt_validation/Robustness.py +11 -11
  79. validmind/tests/prompt_validation/Specificity.py +14 -11
  80. validmind/tests/prompt_validation/ai_powered_test.py +53 -75
  81. validmind/unit_metrics/composite.py +2 -1
  82. validmind/utils.py +34 -59
  83. validmind/vm_models/dataset/dataset.py +17 -3
  84. validmind/vm_models/dataset/utils.py +2 -2
  85. validmind/vm_models/model.py +1 -1
  86. validmind/vm_models/test/metric.py +1 -8
  87. validmind/vm_models/test/result_wrapper.py +2 -2
  88. validmind/vm_models/test/test.py +3 -0
  89. validmind/vm_models/test/threshold_test.py +1 -1
  90. validmind/vm_models/test_suite/runner.py +7 -4
  91. {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/METADATA +1 -1
  92. {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/RECORD +95 -103
  93. validmind/tests/data_validation/DefaultRatesbyRiskBandPlot.py +0 -114
  94. validmind/tests/data_validation/PiTCreditScoresHistogram.py +0 -150
  95. validmind/tests/data_validation/PiTPDHistogram.py +0 -152
  96. validmind/tests/model_validation/statsmodels/ADFTest.py +0 -88
  97. validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py +0 -198
  98. validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py +0 -151
  99. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +0 -146
  100. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +0 -144
  101. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +0 -127
  102. validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +0 -130
  103. {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/LICENSE +0 -0
  104. {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/WHEEL +0 -0
  105. {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/entry_points.txt +0 -0
--- a/validmind/tests/data_validation/PiTPDHistogram.py
+++ /dev/null
@@ -1,152 +0,0 @@
- # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
- # See the LICENSE file in the root of this repository for details.
- # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
- from dataclasses import dataclass
-
- import pandas as pd
- import plotly.graph_objects as go
- from plotly.subplots import make_subplots
-
- from validmind.vm_models import Figure, Metric
-
-
- @dataclass
- class PiTPDHistogram(Metric):
-     """
-     Assesses credit risk prediction accuracy of a model by comparing actual and predicted defaults at a chosen point in
-     time.
-
-     **Purpose**: The PiTPDHistogram metric uses Probability of Default (PD) calculations for individual instances
-     within both training and test data sets in order to assess a model's proficiency in predicting credit risk. A
-     distinctive point in time (PiT) is chosen for these PD calculations, and the results for both actual and predicted
-     defaults are presented in histogram form. This visualization is aimed at simplifying the understanding of model
-     prediction accuracy.
-
-     **Test Mechanism**: Instances are categorized into two groups - those for actual defaults and those for predicted
-     defaults, with '1' indicating a default and '0' indicating non-default. PD is calculated for each instance, and
-     based on these calculations, two histograms are created, one for actual defaults and one for predicted defaults. If
-     the predicted default frequency matches that of the actual defaults, the model's performance is deemed effective.
-
-     **Signs of High Risk**:
-     - Discrepancies between the actual and predicted default histograms may suggest model inefficiency.
-     - Variations in histogram shapes or divergences in default probability distributions could be concerning.
-     - Significant mismatches in peak default probabilities could also be red flags.
-
-     **Strengths**:
-     - Provides a visual comparison between actual and predicted defaults, aiding in the understanding of model
-     performance.
-     - Helps reveal model bias and areas where the model's performance could be improved.
-     - Easier to understand than purely numerical evaluations or other complicated visualization measures.
-
-     **Limitations**:
-     - The metric remains largely interpretive and subjective, as the extent and relevance of visual discrepancies often
-     need to be evaluated manually, leading to potentially inconsistent results across different analyses.
-     - This metric alone may not capture all the complexities and nuances of model performance.
-     - The information provided is limited to a specific point in time, potentially neglecting the model's performance
-     under various circumstances or different time periods.
-     """
-
-     name = "pit_pd_histogram"
-     required_context = ["dataset"]
-     default_params = {"title": "Histogram of PiT Probability of Default"}
-     metadata = {
-         "task_types": ["classification"],
-         "tags": ["tabular_data", "visualization", "credit_risk"],
-     }
-
-     @staticmethod
-     def plot_pit_pd_histogram(
-         df,
-         default_column,
-         predicted_default_column,
-         default_probabilities_column,
-         title,
-         point_in_time_date,
-     ):
-         fig = make_subplots(
-             rows=1, cols=2, subplot_titles=("Observed Default", "Predicted Default")
-         )
-
-         observed_data_0 = df[df[default_column] == 0][default_probabilities_column]
-         observed_data_1 = df[df[default_column] == 1][default_probabilities_column]
-
-         predicted_data_0 = df[df[predicted_default_column] == 0][
-             default_probabilities_column
-         ]
-         predicted_data_1 = df[df[predicted_default_column] == 1][
-             default_probabilities_column
-         ]
-
-         fig.add_trace(
-             go.Histogram(x=observed_data_0, opacity=0.75, name="Observed Default = 0"),
-             row=1,
-             col=1,
-         )
-         fig.add_trace(
-             go.Histogram(x=observed_data_1, opacity=0.75, name="Observed Default = 1"),
-             row=1,
-             col=1,
-         )
-
-         fig.add_trace(
-             go.Histogram(
-                 x=predicted_data_0, opacity=0.75, name="Predicted Default = 0"
-             ),
-             row=1,
-             col=2,
-         )
-         fig.add_trace(
-             go.Histogram(
-                 x=predicted_data_1, opacity=0.75, name="Predicted Default = 1"
-             ),
-             row=1,
-             col=2,
-         )
-
-         title += f" (PiT: {point_in_time_date.strftime('%d %b %Y')})"
-         fig.update_layout(barmode="overlay", title_text=title)
-
-         return fig
-
-     def run(self):
-         df = self.inputs.dataset.df
-         default_column = self.params["default_column"]
-         predicted_default_column = self.params["predicted_default_column"]
-         default_probabilities_column = self.params["default_probabilities_column"]
-         point_in_time_column = self.params["point_in_time_column"]
-
-         title = self.params["title"]
-
-         point_in_time_date = pd.to_datetime(df[point_in_time_column].iloc[0])
-
-         fig = self.plot_pit_pd_histogram(
-             df,
-             default_column,
-             predicted_default_column,
-             default_probabilities_column,
-             title,
-             point_in_time_date,
-         )
-
-         return self.cache_results(
-             metric_value={
-                 "prob_histogram": {
-                     "observed_probs": list(
-                         df[df[default_column] == 1][default_probabilities_column]
-                     ),
-                     "predicted_probs": list(
-                         df[df[predicted_default_column] == 1][
-                             default_probabilities_column
-                         ]
-                     ),
-                 },
-             },
-             figures=[
-                 Figure(
-                     for_object=self,
-                     key="prob_histogram",
-                     figure=fig,
-                 )
-             ],
-         )
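PiTPDHistogram is deleted outright in 2.3.3, with no renamed counterpart in the file list above. For reference, a minimal standalone sketch of the overlaid observed-vs-predicted PD histograms it produced, assuming a pandas DataFrame with hypothetical column names "default", "predicted_default", and "pd"; this helper is not part of either package version:

# Sketch only: overlaid PD histograms for observed vs. predicted defaults,
# mirroring what the removed PiTPDHistogram metric plotted.
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots


def pit_pd_histogram(df: pd.DataFrame, default_col: str = "default",
                     predicted_col: str = "predicted_default", pd_col: str = "pd"):
    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=("Observed Default", "Predicted Default"))
    for value in (0, 1):
        # Left panel: PD distribution split by the observed default flag
        fig.add_trace(
            go.Histogram(x=df.loc[df[default_col] == value, pd_col],
                         opacity=0.75, name=f"Observed Default = {value}"),
            row=1, col=1,
        )
        # Right panel: PD distribution split by the predicted default flag
        fig.add_trace(
            go.Histogram(x=df.loc[df[predicted_col] == value, pd_col],
                         opacity=0.75, name=f"Predicted Default = {value}"),
            row=1, col=2,
        )
    fig.update_layout(barmode="overlay",
                      title_text="Histogram of PiT Probability of Default")
    return fig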
--- a/validmind/tests/model_validation/statsmodels/ADFTest.py
+++ /dev/null
@@ -1,88 +0,0 @@
- # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
- # See the LICENSE file in the root of this repository for details.
- # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
- from dataclasses import dataclass
-
- from statsmodels.tsa.stattools import adfuller
-
- from validmind.vm_models import ThresholdTest, ThresholdTestResult
-
-
- @dataclass
- class ADFTest(ThresholdTest):
-     """
-     Assesses the stationarity of time series data using the Augmented Dickey-Fuller (ADF) test.
-
-     **Purpose**: The Augmented Dickey-Fuller (ADF) metric test is designed to evaluate the presence of a unit root in a
-     time series. This essentially translates to assessing the stationarity of a time series dataset. This is vital in
-     time series analysis, regression tasks, and forecasting, as these often need the data to be stationary.
-
-     **Test Mechanism**: This test application utilizes the "adfuller" function from Python's “statsmodels” library. It
-     applies this function to each column of the training dataset, subsequently calculating the ADF statistic, p-value,
-     the number of lags used, and the number of observations in the sample for each column. If a column's p-value is
-     lower than the predetermined threshold (usually 0.05), the series is considered stationary, and the test is deemed
-     passed for that column.
-
-     **Signs of High Risk**:
-     - A p-value that surpasses the threshold value indicates a high risk or potential model performance issue.
-     - A high p-value suggests that the null hypothesis (of a unit root being present) cannot be rejected. This in turn
-     suggests that the series is non-stationary which could potentially yield unreliable and falsified results for the
-     model's performance and forecast.
-
-     **Strengths**:
-     - Archetypal Test for Stationarity: The ADF test is a comprehensive approach towards testing the stationarity of
-     time series data. Such testing is vital for many machine learning and statistical models.
-     - Detailed Output: The function generates detailed output, including the number of lags used and the number of
-     observations, which adds to understanding a series’ behaviour.
-
-     **Limitations**:
-     - Dependence on Threshold: The result of this test freights heavily on the threshold chosen. Hence, an imprudent
-     threshold value might lead to false acceptance or rejection of the null hypothesis.
-     - Not Effective for Trending Data: The test suffers when it operates under the assumption that the data does not
-     encapsulate any deterministic trend. In the presence of such a trend, it might falsely identify a series as
-     non-stationary.
-     - Potential for False Positives: The ADF test especially in the case of larger datasets, tends to reject the null
-     hypothesis, escalating the chances of false positives.
-     """
-
-     name = "adf_test"
-     required_inputs = ["dataset"]
-     default_params = {"threshold": 0.05}
-     metadata = {
-         "task_types": ["regression"],
-         "tags": [
-             "time_series_data",
-             "statsmodels",
-             "forecasting",
-             "statistical_test",
-             "stationarity",
-         ],
-     }
-
-     def run(self):
-         x_train = self.inputs.dataset.df
-
-         results = []
-         for col in x_train.columns:
-             # adf_values[col] = adfuller(x_train[col].values)
-             adf, pvalue, usedlag, nobs, critical_values, icbest = adfuller(
-                 x_train[col].values
-             )
-
-             col_passed = pvalue < self.params["threshold"]
-             results.append(
-                 ThresholdTestResult(
-                     column=col,
-                     passed=col_passed,
-                     values={
-                         "adf": adf,
-                         "pvalue": pvalue,
-                         "usedlag": usedlag,
-                         "nobs": nobs,
-                         "icbest": icbest,
-                     },
-                 )
-             )
-
-         return self.cache_results(results, passed=all([r.passed for r in results]))
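ADFTest is removed here; per the file list above, a reworked ADF test now lives under validmind/tests/data_validation/ADF.py. As a point of reference, a minimal sketch of the per-column threshold check the removed test performed, using statsmodels directly; the helper name and its 0.05 default (taken from the removed test's default_params) are the only assumptions:

# Sketch only: per-column ADF stationarity check as performed by the removed ADFTest.
import pandas as pd
from statsmodels.tsa.stattools import adfuller


def adf_stationarity_check(df: pd.DataFrame, threshold: float = 0.05) -> pd.DataFrame:
    rows = []
    for col in df.columns:
        # With the default autolag, adfuller returns (stat, pvalue, usedlag, nobs,
        # critical values, icbest)
        adf_stat, pvalue, usedlag, nobs, _critical_values, icbest = adfuller(df[col].values)
        rows.append({
            "column": col,
            "adf": adf_stat,
            "pvalue": pvalue,
            "usedlag": usedlag,
            "nobs": nobs,
            "icbest": icbest,
            # Rejecting the unit-root null (p < threshold) is treated as "stationary"
            "passed": pvalue < threshold,
        })
    return pd.DataFrame(rows)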
--- a/validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py
+++ /dev/null
@@ -1,198 +0,0 @@
- # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
- # See the LICENSE file in the root of this repository for details.
- # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
- from dataclasses import dataclass
-
- import pandas as pd
- import plotly.graph_objects as go
- from sklearn.inspection import permutation_importance
-
- from validmind.logging import get_logger
- from validmind.vm_models import Figure, Metric
-
- logger = get_logger(__name__)
-
-
- @dataclass
- class FeatureImportanceAndSignificance(Metric):
-     """
-     Evaluates and visualizes the statistical significance and feature importance using regression and decision tree
-     models.
-
-     **Purpose**: The 'FeatureImportanceAndSignificance' test evaluates the statistical significance and the importance
-     of features in the context of the machine learning model. By comparing the p-values from a regression model and the
-     feature importances from a decision tree model, this test aids in determining the most significant variables from a
-     statistical and a machine learning perspective, assisting in feature selection during the model development process.
-
-     **Test Mechanism**: The test first compares the p-values from a regression model and the feature importances from a
-     decision tree model. These values are normalized to ensure a uniform comparison. The 'p_threshold' parameter is
-     used to determine what p-value is considered statistically significant and if the 'significant_only' parameter is
-     true, only features with p-values below this threshold are included in the final output. The output from this test
-     includes an interactive visualization displaying normalized p-values and the associated feature importances. The
-     test throws an error if it does not receive both a regression model and a decision tree model.
-
-     **Signs of High Risk**:
-     - Exceptionally high or low p-values, which suggest that a feature may not be significant or meaningful in the
-     context of the model.
-     - If many variables with small feature importance values have significant p-values, this could indicate that the
-     model might be overfitting.
-
-     **Strengths**:
-     - Combines two perspectives statistical significance (p-values) and feature importance (decision tree model),
-     making it a robust feature selection test.
-     - Provides an interactive visualization making it easy to interpret and understand the results.
-
-     **Limitations**:
-     - The test only works with a regression model and a decision tree model which may limit its applicability.
-     - The test does not take into account potential correlations or causative relationships between features which may
-     lead to misinterpretations of significance and importance.
-     - Over-reliance on the p-value as a cut-off for feature significance can be seen as arbitrary and may not truly
-     reflect the real-world importance of the feature.
-     """
-
-     name = "feature_importance_and_significance"
-     required_inputs = ["models"]
-     default_params = {
-         "fontsize": 10,
-         "p_threshold": 0.05,
-         "significant_only": False,
-         "figure_height": 800,
-         "bar_width": 0.3,
-     }
-     metadata = {
-         "task_types": ["regression"],
-         "tags": [
-             "statsmodels",
-             "feature_importance",
-             "statistical_test",
-             "visualization",
-         ],
-     }
-
-     def compute_p_values_and_feature_importances(
-         self, regression_model, decision_tree_model
-     ):
-         p_values = regression_model.model.pvalues
-         feature_importances = permutation_importance(
-             decision_tree_model.model,
-             decision_tree_model.train_ds.x,
-             decision_tree_model.train_ds.y,
-             random_state=0,
-             n_jobs=-2,
-         ).importances_mean
-
-         p_values = p_values / max(p_values)
-         feature_importances = feature_importances / max(feature_importances)
-
-         return p_values, feature_importances
-
-     def create_dataframe(
-         self,
-         p_values,
-         feature_importances,
-         regression_model,
-         significant_only,
-         p_threshold,
-     ):
-         df = pd.DataFrame(
-             {
-                 "Normalized p-value": p_values,
-                 "Normalized Feature Importance": feature_importances,
-             },
-             index=regression_model.train_ds.x_df().columns,
-         )
-
-         if significant_only:
-             df = df[df["Normalized p-value"] <= p_threshold]
-
-         df = df.sort_values(by="Normalized Feature Importance", ascending=True)
-
-         return df
-
-     def create_figure(self, df, fontsize, figure_height, bar_width):
-         fig = go.Figure()
-
-         title_text = (
-             "Significant Features (p-value <= {0})".format(self.params["p_threshold"])
-             if self.params["significant_only"]
-             else "All Features"
-         )
-
-         fig.update_layout(
-             title=title_text,
-             barmode="group",
-             height=figure_height,
-             yaxis=dict(tickfont=dict(size=fontsize)),
-             xaxis=dict(title="Normalized Value", titlefont=dict(size=fontsize)),
-         )
-
-         fig.add_trace(
-             go.Bar(
-                 y=df.index,
-                 x=df["Normalized p-value"],
-                 name="Normalized p-value",
-                 orientation="h",
-                 marker=dict(color="skyblue"),
-                 width=bar_width,
-             )
-         )
-
-         fig.add_trace(
-             go.Bar(
-                 y=df.index,
-                 x=df["Normalized Feature Importance"],
-                 name="Normalized Feature Importance",
-                 orientation="h",
-                 marker=dict(color="orange"),
-                 width=bar_width,
-             )
-         )
-
-         return fig
-
-     def run(self):
-         fontsize = self.params["fontsize"]
-         significant_only = self.params["significant_only"]
-         p_threshold = self.params["p_threshold"]
-         figure_height = self.params["figure_height"]
-         bar_width = self.params["bar_width"]
-
-         all_models = []
-
-         if self.inputs.models is not None:
-             all_models.extend(self.inputs.models)
-
-         if len(self.inputs.models) != 2:
-             raise ValueError("Two models must be provided")
-
-         regression_model = self.inputs.models[0]
-         decision_tree_model = self.inputs.models[1]
-
-         p_values, feature_importances = self.compute_p_values_and_feature_importances(
-             regression_model, decision_tree_model
-         )
-
-         df = self.create_dataframe(
-             p_values,
-             feature_importances,
-             regression_model,
-             significant_only,
-             p_threshold,
-         )
-
-         fig = self.create_figure(df, fontsize, figure_height, bar_width)
-
-         return self.cache_results(
-             figures=[
-                 Figure(
-                     for_object=self,
-                     key=self.key,
-                     figure=fig,
-                     metadata={
-                         "model_regression": str(regression_model.model),
-                         "model_decision_tree": str(decision_tree_model.model),
-                     },
-                 )
-             ]
-         )
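FeatureImportanceAndSignificance is deleted without a listed replacement. A minimal sketch of its core computation, normalized regression p-values set against normalized permutation importances, where fitted_ols, tree_model, X, and y are hypothetical inputs (a fitted statsmodels results object, a fitted scikit-learn estimator, a feature DataFrame, and the target) and nothing here is part of the package:

# Sketch only: compare normalized OLS p-values with normalized permutation importances,
# as the removed FeatureImportanceAndSignificance metric did before plotting.
import pandas as pd
from sklearn.inspection import permutation_importance


def significance_vs_importance(fitted_ols, tree_model, X: pd.DataFrame, y) -> pd.DataFrame:
    # Normalize p-values and importances to [0, 1] so they share one axis
    p_values = fitted_ols.pvalues / fitted_ols.pvalues.max()
    importances = permutation_importance(tree_model, X, y, random_state=0).importances_mean
    importances = importances / importances.max()
    return pd.DataFrame(
        {
            # Align on the feature columns (drops an intercept term, if present)
            "Normalized p-value": p_values.reindex(X.columns),
            "Normalized Feature Importance": pd.Series(importances, index=X.columns),
        }
    ).sort_values("Normalized Feature Importance")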
--- a/validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py
+++ /dev/null
@@ -1,151 +0,0 @@
- # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
- # See the LICENSE file in the root of this repository for details.
- # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
- from dataclasses import dataclass
-
- import pandas as pd
- import plotly.graph_objects as go
-
- from validmind.vm_models import Figure, Metric
-
-
- @dataclass
- class PDRatingClassPlot(Metric):
-     """
-     Assesses and visualizes credit risk distribution across different rating classes within a dataset via default
-     probabilities.
-
-     **Purpose**: The purpose of the Probability of Default (PD) Rating Class Plot test is to measure and evaluate the
-     distribution of calculated default probabilities across different rating classes. This is critical for
-     understanding and inferring credit risk and can provide insights into how effectively the model is differentiating
-     between different risk levels in a credit dataset.
-
-     **Test Mechanism**: This metric is implemented via a visualization mechanism. It sorts the predicted probabilities
-     of defaults into user-defined rating classes defined in "rating_classes" in default parameters. When it has
-     classified the probabilities, it then calculates the average default rates within each rating class. Subsequently,
-     it produces bar plots for each of these rating classes, illustrating the average likelihood of a default within
-     each class. This process is executed separately for both the training and testing data sets. The classification of
-     predicted probabilities utilizes the pandas "cut" function, sorting and sectioning the data values into bins.
-
-     **Signs of High Risk**:
-
-     - If lower rating classes present higher average likelihoods of default than higher rating classes
-     - If there is poor differentiation between the averages across the different rating classes
-     - If the model generates a significant contrast between the likelihoods for the training set and the testing set,
-     suggestive of model overfitting
-
-     **Strengths**:
-
-     - Presents a clear visual representation of how efficient the model is at predicting credit risk across different
-     risk levels
-     - Allows for rapid identification and understanding of model performance per rating class
-     - Highlights potential overfitting issues by including both training and testing datasets in the analysis
-
-     **Limitations**:
-
-     - Making an incorrect choice for the number of rating classes, either oversimplifying or overcomplicating the
-     distribution of default rates
-     - Relying on the assumption that the rating classes are effective at differentiating risk levels and that the
-     boundaries between classes truly represent the risk distribution
-     - Not accounting for data set class imbalance, which could cause skewed average probabilities
-     - Inability to gauge the overall performance of the model only based on this metric, emphasizing the requirement of
-     combining it with other evaluation metrics
-     """
-
-     name = "pd_rating_class_plot"
-     required_inputs = ["model", "datasets"]
-
-     metadata = {
-         "task_types": ["classification"],
-         "tags": ["visualization", "credit_risk"],
-     }
-
-     default_params = {
-         "title": "PD by Rating Class",
-         "rating_classes": ["A", "B", "C", "D"],
-     }
-
-     @staticmethod
-     def plot_bucket_analysis(df, prob_col, target_col, title, rating_classes):
-         df["bucket"] = pd.cut(
-             df[prob_col], bins=len(rating_classes), labels=rating_classes, right=False
-         )
-         default_rate = df.groupby("bucket")[target_col].mean()
-
-         # Sort the data based on the order of rating_classes
-         sorted_data = sorted(
-             zip(rating_classes, default_rate),
-             key=lambda x: rating_classes.index(x[0]),
-         )
-         rating_classes_sorted, default_rate_sorted = zip(*sorted_data)
-
-         fig = go.Figure()
-
-         # Iterate through the sorted data and create a bar for each score bucket
-         for i, (bucket, rate) in enumerate(
-             zip(rating_classes_sorted, default_rate_sorted)
-         ):
-             fig.add_trace(go.Bar(x=[bucket], y=[rate], name=bucket))
-
-         fig.update_layout(
-             title_text=title,
-             xaxis_title="Rating Class",
-             yaxis_title="Probability of Default",
-             barmode="group",
-         )
-
-         return fig
-
-     def run(self):
-         target_column = self.inputs.model.train_ds.target_column
-         title = self.params["title"]
-         rating_classes = self.params["rating_classes"]
-
-         X_train = self.inputs.datasets[0].x.copy()
-         y_train = self.inputs.datasets[0].y.copy()
-         X_test = self.inputs.datasets[1].x.copy()
-         y_test = self.inputs.datasets[1].y.copy()
-
-         # Compute probabilities
-         X_train["probability"] = self.inputs.model.predict_proba(X_train)
-         X_test["probability"] = self.inputs.model.predict_proba(X_test)
-
-         df_train = pd.concat([X_train, y_train], axis=1)
-         df_test = pd.concat([X_test, y_test], axis=1)
-
-         fig_train = self.plot_bucket_analysis(
-             df_train,
-             "probability",
-             target_column,
-             title + " - Train Data",
-             rating_classes,
-         )
-         fig_test = self.plot_bucket_analysis(
-             df_test,
-             "probability",
-             target_column,
-             title + " - Test Data",
-             rating_classes,
-         )
-
-         return self.cache_results(
-             metric_value={
-                 "bucket_analysis": {
-                     "train_probs": list(X_train["probability"]),
-                     "test_probs": list(X_test["probability"]),
-                 },
-             },
-             figures=[
-                 Figure(
-                     for_object=self,
-                     key="bucket_analysis_train",
-                     figure=fig_train,
-                 ),
-                 Figure(
-                     for_object=self,
-                     key="bucket_analysis_test",
-                     figure=fig_test,
-                 ),
-             ],
-         )
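PDRatingClassPlot is likewise deleted. A minimal sketch of the bucketing step it performed before plotting, binning predicted PDs into rating classes with pandas.cut and taking the mean default flag per class; the column names "pd_score" and "default" are hypothetical and the helper is not part of the package:

# Sketch only: observed default rate per rating class, as computed by the removed
# PDRatingClassPlot metric before plotting.
import pandas as pd


def default_rate_by_rating_class(df: pd.DataFrame, prob_col: str = "pd_score",
                                 target_col: str = "default",
                                 rating_classes=("A", "B", "C", "D")) -> pd.Series:
    # Equal-width bins over the PD range, one per rating class
    buckets = pd.cut(df[prob_col], bins=len(rating_classes),
                     labels=list(rating_classes), right=False)
    # Mean of the 0/1 default flag within each bucket is the observed default rate
    return df.groupby(buckets, observed=False)[target_col].mean()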