validmind 2.2.6__py3-none-any.whl → 2.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- validmind/__init__.py +2 -1
- validmind/__version__.py +1 -1
- validmind/{ai.py → ai/test_descriptions.py} +74 -82
- validmind/ai/utils.py +104 -0
- validmind/api_client.py +58 -19
- validmind/client.py +5 -5
- validmind/models/foundation.py +10 -6
- validmind/models/function.py +3 -1
- validmind/models/metadata.py +1 -1
- validmind/test_suites/__init__.py +1 -9
- validmind/test_suites/regression.py +0 -16
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/tests/__init__.py +7 -7
- validmind/tests/__types__.py +170 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +36 -27
- validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py +42 -13
- validmind/tests/data_validation/BivariateScatterPlots.py +38 -41
- validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py +67 -11
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +1 -1
- validmind/tests/data_validation/HighPearsonCorrelation.py +12 -3
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/{model_validation/statsmodels → data_validation}/KPSS.py +64 -11
- validmind/tests/{model_validation/statsmodels → data_validation}/PhillipsPerronArch.py +65 -11
- validmind/tests/data_validation/ScatterPlot.py +1 -1
- validmind/tests/data_validation/SeasonalDecompose.py +12 -7
- validmind/tests/data_validation/TabularDateTimeHistograms.py +29 -33
- validmind/tests/data_validation/WOEBinPlots.py +1 -1
- validmind/tests/data_validation/WOEBinTable.py +1 -1
- validmind/tests/{model_validation/statsmodels → data_validation}/ZivotAndrewsArch.py +65 -11
- validmind/tests/data_validation/nlp/CommonWords.py +1 -1
- validmind/tests/data_validation/nlp/Hashtags.py +1 -1
- validmind/tests/data_validation/nlp/Mentions.py +1 -1
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -1
- validmind/tests/data_validation/nlp/Punctuations.py +1 -1
- validmind/tests/data_validation/nlp/Sentiment.py +1 -1
- validmind/tests/data_validation/nlp/TextDescription.py +5 -1
- validmind/tests/data_validation/nlp/Toxicity.py +1 -1
- validmind/tests/decorator.py +13 -1
- validmind/tests/model_validation/FeaturesAUC.py +5 -3
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +4 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +3 -3
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +14 -8
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextPrecision.py +4 -5
- validmind/tests/model_validation/ragas/ContextRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextRelevancy.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +6 -5
- validmind/tests/model_validation/ragas/utils.py +35 -9
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +1 -1
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +6 -8
- validmind/tests/model_validation/sklearn/RegressionErrors.py +1 -1
- validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +14 -8
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -1
- validmind/tests/model_validation/statsmodels/GINITable.py +1 -1
- validmind/tests/model_validation/statsmodels/JarqueBera.py +1 -1
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +1 -1
- validmind/tests/model_validation/statsmodels/LJungBox.py +1 -1
- validmind/tests/model_validation/statsmodels/Lilliefors.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +4 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +9 -4
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -2
- validmind/tests/model_validation/statsmodels/RunsTest.py +1 -1
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +1 -1
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +14 -11
- validmind/tests/prompt_validation/Conciseness.py +14 -11
- validmind/tests/prompt_validation/Delimitation.py +14 -11
- validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
- validmind/tests/prompt_validation/Robustness.py +11 -11
- validmind/tests/prompt_validation/Specificity.py +14 -11
- validmind/tests/prompt_validation/ai_powered_test.py +53 -75
- validmind/unit_metrics/composite.py +2 -1
- validmind/utils.py +34 -59
- validmind/vm_models/dataset/dataset.py +17 -3
- validmind/vm_models/dataset/utils.py +2 -2
- validmind/vm_models/model.py +1 -1
- validmind/vm_models/test/metric.py +1 -8
- validmind/vm_models/test/result_wrapper.py +2 -2
- validmind/vm_models/test/test.py +3 -0
- validmind/vm_models/test/threshold_test.py +1 -1
- validmind/vm_models/test_suite/runner.py +7 -4
- {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/METADATA +1 -1
- {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/RECORD +95 -103
- validmind/tests/data_validation/DefaultRatesbyRiskBandPlot.py +0 -114
- validmind/tests/data_validation/PiTCreditScoresHistogram.py +0 -150
- validmind/tests/data_validation/PiTPDHistogram.py +0 -152
- validmind/tests/model_validation/statsmodels/ADFTest.py +0 -88
- validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py +0 -198
- validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py +0 -151
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +0 -146
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +0 -144
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +0 -127
- validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +0 -130
- {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/LICENSE +0 -0
- {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/WHEEL +0 -0
- {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/entry_points.txt +0 -0
Full diffs follow for four of the removed test files.

validmind/tests/data_validation/PiTPDHistogram.py (deleted)

@@ -1,152 +0,0 @@
-# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
-# See the LICENSE file in the root of this repository for details.
-# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
-from dataclasses import dataclass
-
-import pandas as pd
-import plotly.graph_objects as go
-from plotly.subplots import make_subplots
-
-from validmind.vm_models import Figure, Metric
-
-
-@dataclass
-class PiTPDHistogram(Metric):
-    """
-    Assesses credit risk prediction accuracy of a model by comparing actual and predicted defaults at a chosen point in
-    time.
-
-    **Purpose**: The PiTPDHistogram metric uses Probability of Default (PD) calculations for individual instances
-    within both training and test data sets in order to assess a model's proficiency in predicting credit risk. A
-    distinctive point in time (PiT) is chosen for these PD calculations, and the results for both actual and predicted
-    defaults are presented in histogram form. This visualization is aimed at simplifying the understanding of model
-    prediction accuracy.
-
-    **Test Mechanism**: Instances are categorized into two groups - those for actual defaults and those for predicted
-    defaults, with '1' indicating a default and '0' indicating non-default. PD is calculated for each instance, and
-    based on these calculations, two histograms are created, one for actual defaults and one for predicted defaults. If
-    the predicted default frequency matches that of the actual defaults, the model's performance is deemed effective.
-
-    **Signs of High Risk**:
-    - Discrepancies between the actual and predicted default histograms may suggest model inefficiency.
-    - Variations in histogram shapes or divergences in default probability distributions could be concerning.
-    - Significant mismatches in peak default probabilities could also be red flags.
-
-    **Strengths**:
-    - Provides a visual comparison between actual and predicted defaults, aiding in the understanding of model
-    performance.
-    - Helps reveal model bias and areas where the model's performance could be improved.
-    - Easier to understand than purely numerical evaluations or other complicated visualization measures.
-
-    **Limitations**:
-    - The metric remains largely interpretive and subjective, as the extent and relevance of visual discrepancies often
-    need to be evaluated manually, leading to potentially inconsistent results across different analyses.
-    - This metric alone may not capture all the complexities and nuances of model performance.
-    - The information provided is limited to a specific point in time, potentially neglecting the model's performance
-    under various circumstances or different time periods.
-    """
-
-    name = "pit_pd_histogram"
-    required_context = ["dataset"]
-    default_params = {"title": "Histogram of PiT Probability of Default"}
-    metadata = {
-        "task_types": ["classification"],
-        "tags": ["tabular_data", "visualization", "credit_risk"],
-    }
-
-    @staticmethod
-    def plot_pit_pd_histogram(
-        df,
-        default_column,
-        predicted_default_column,
-        default_probabilities_column,
-        title,
-        point_in_time_date,
-    ):
-        fig = make_subplots(
-            rows=1, cols=2, subplot_titles=("Observed Default", "Predicted Default")
-        )
-
-        observed_data_0 = df[df[default_column] == 0][default_probabilities_column]
-        observed_data_1 = df[df[default_column] == 1][default_probabilities_column]
-
-        predicted_data_0 = df[df[predicted_default_column] == 0][
-            default_probabilities_column
-        ]
-        predicted_data_1 = df[df[predicted_default_column] == 1][
-            default_probabilities_column
-        ]
-
-        fig.add_trace(
-            go.Histogram(x=observed_data_0, opacity=0.75, name="Observed Default = 0"),
-            row=1,
-            col=1,
-        )
-        fig.add_trace(
-            go.Histogram(x=observed_data_1, opacity=0.75, name="Observed Default = 1"),
-            row=1,
-            col=1,
-        )
-
-        fig.add_trace(
-            go.Histogram(
-                x=predicted_data_0, opacity=0.75, name="Predicted Default = 0"
-            ),
-            row=1,
-            col=2,
-        )
-        fig.add_trace(
-            go.Histogram(
-                x=predicted_data_1, opacity=0.75, name="Predicted Default = 1"
-            ),
-            row=1,
-            col=2,
-        )
-
-        title += f" (PiT: {point_in_time_date.strftime('%d %b %Y')})"
-        fig.update_layout(barmode="overlay", title_text=title)
-
-        return fig
-
-    def run(self):
-        df = self.inputs.dataset.df
-        default_column = self.params["default_column"]
-        predicted_default_column = self.params["predicted_default_column"]
-        default_probabilities_column = self.params["default_probabilities_column"]
-        point_in_time_column = self.params["point_in_time_column"]
-
-        title = self.params["title"]
-
-        point_in_time_date = pd.to_datetime(df[point_in_time_column].iloc[0])
-
-        fig = self.plot_pit_pd_histogram(
-            df,
-            default_column,
-            predicted_default_column,
-            default_probabilities_column,
-            title,
-            point_in_time_date,
-        )
-
-        return self.cache_results(
-            metric_value={
-                "prob_histogram": {
-                    "observed_probs": list(
-                        df[df[default_column] == 1][default_probabilities_column]
-                    ),
-                    "predicted_probs": list(
-                        df[df[predicted_default_column] == 1][
-                            default_probabilities_column
-                        ]
-                    ),
-                },
-            },
-            figures=[
-                Figure(
-                    for_object=self,
-                    key="prob_histogram",
-                    figure=fig,
-                )
-            ],
-        )
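This test was removed without a direct replacement in 2.3.3, but its plotting logic is self-contained and easy to reproduce outside the library. Below is a minimal standalone sketch, not ValidMind API; the column names `default`, `predicted_default`, and `pd_prob` are hypothetical placeholders for your own data.

```python
import plotly.graph_objects as go
from plotly.subplots import make_subplots


def pit_pd_histogram(df, default_col="default", pred_col="predicted_default",
                     prob_col="pd_prob", title="Histogram of PiT Probability of Default"):
    """Overlaid PD histograms: observed defaults (left) vs. predicted defaults (right)."""
    fig = make_subplots(
        rows=1, cols=2, subplot_titles=("Observed Default", "Predicted Default")
    )
    for flag in (0, 1):
        # Split the PD column by the observed and by the predicted default flag
        fig.add_trace(
            go.Histogram(x=df.loc[df[default_col] == flag, prob_col],
                         opacity=0.75, name=f"Observed Default = {flag}"),
            row=1, col=1,
        )
        fig.add_trace(
            go.Histogram(x=df.loc[df[pred_col] == flag, prob_col],
                         opacity=0.75, name=f"Predicted Default = {flag}"),
            row=1, col=2,
        )
    fig.update_layout(barmode="overlay", title_text=title)
    return fig
```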
validmind/tests/model_validation/statsmodels/ADFTest.py (deleted)

@@ -1,88 +0,0 @@
-# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
-# See the LICENSE file in the root of this repository for details.
-# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
-from dataclasses import dataclass
-
-from statsmodels.tsa.stattools import adfuller
-
-from validmind.vm_models import ThresholdTest, ThresholdTestResult
-
-
-@dataclass
-class ADFTest(ThresholdTest):
-    """
-    Assesses the stationarity of time series data using the Augmented Dickey-Fuller (ADF) test.
-
-    **Purpose**: The Augmented Dickey-Fuller (ADF) metric test is designed to evaluate the presence of a unit root in a
-    time series. This essentially translates to assessing the stationarity of a time series dataset. This is vital in
-    time series analysis, regression tasks, and forecasting, as these often need the data to be stationary.
-
-    **Test Mechanism**: This test application utilizes the "adfuller" function from Python's “statsmodels” library. It
-    applies this function to each column of the training dataset, subsequently calculating the ADF statistic, p-value,
-    the number of lags used, and the number of observations in the sample for each column. If a column's p-value is
-    lower than the predetermined threshold (usually 0.05), the series is considered stationary, and the test is deemed
-    passed for that column.
-
-    **Signs of High Risk**:
-    - A p-value that surpasses the threshold value indicates a high risk or potential model performance issue.
-    - A high p-value suggests that the null hypothesis (of a unit root being present) cannot be rejected. This in turn
-    suggests that the series is non-stationary which could potentially yield unreliable and falsified results for the
-    model's performance and forecast.
-
-    **Strengths**:
-    - Archetypal Test for Stationarity: The ADF test is a comprehensive approach towards testing the stationarity of
-    time series data. Such testing is vital for many machine learning and statistical models.
-    - Detailed Output: The function generates detailed output, including the number of lags used and the number of
-    observations, which adds to understanding a series’ behaviour.
-
-    **Limitations**:
-    - Dependence on Threshold: The result of this test freights heavily on the threshold chosen. Hence, an imprudent
-    threshold value might lead to false acceptance or rejection of the null hypothesis.
-    - Not Effective for Trending Data: The test suffers when it operates under the assumption that the data does not
-    encapsulate any deterministic trend. In the presence of such a trend, it might falsely identify a series as
-    non-stationary.
-    - Potential for False Positives: The ADF test especially in the case of larger datasets, tends to reject the null
-    hypothesis, escalating the chances of false positives.
-    """
-
-    name = "adf_test"
-    required_inputs = ["dataset"]
-    default_params = {"threshold": 0.05}
-    metadata = {
-        "task_types": ["regression"],
-        "tags": [
-            "time_series_data",
-            "statsmodels",
-            "forecasting",
-            "statistical_test",
-            "stationarity",
-        ],
-    }
-
-    def run(self):
-        x_train = self.inputs.dataset.df
-
-        results = []
-        for col in x_train.columns:
-            # adf_values[col] = adfuller(x_train[col].values)
-            adf, pvalue, usedlag, nobs, critical_values, icbest = adfuller(
-                x_train[col].values
-            )
-
-            col_passed = pvalue < self.params["threshold"]
-            results.append(
-                ThresholdTestResult(
-                    column=col,
-                    passed=col_passed,
-                    values={
-                        "adf": adf,
-                        "pvalue": pvalue,
-                        "usedlag": usedlag,
-                        "nobs": nobs,
-                        "icbest": icbest,
-                    },
-                )
-            )
-
-        return self.cache_results(results, passed=all([r.passed for r in results]))
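Note from the file listing above that ADF.py moved from model_validation/statsmodels to data_validation in this release, so an ADF-based stationarity test remains available even though this ADFTest wrapper is gone. The wrapper itself is a thin loop over statsmodels' adfuller; here is a minimal sketch of the same per-column check, assuming a numeric DataFrame with no missing values.

```python
import pandas as pd
from statsmodels.tsa.stattools import adfuller


def adf_stationarity_check(df: pd.DataFrame, threshold: float = 0.05) -> pd.DataFrame:
    """ADF test per column; 'passed' means the unit-root null is rejected (stationary)."""
    rows = []
    for col in df.columns:
        # With the default autolag="AIC", adfuller returns
        # (statistic, pvalue, usedlag, nobs, critical values, icbest)
        adf_stat, pvalue, usedlag, nobs, _crit, icbest = adfuller(df[col].values)
        rows.append({
            "column": col, "adf": adf_stat, "pvalue": pvalue,
            "usedlag": usedlag, "nobs": nobs, "icbest": icbest,
            "passed": pvalue < threshold,
        })
    return pd.DataFrame(rows)
```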
validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py (deleted)

@@ -1,198 +0,0 @@
-# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
-# See the LICENSE file in the root of this repository for details.
-# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
-from dataclasses import dataclass
-
-import pandas as pd
-import plotly.graph_objects as go
-from sklearn.inspection import permutation_importance
-
-from validmind.logging import get_logger
-from validmind.vm_models import Figure, Metric
-
-logger = get_logger(__name__)
-
-
-@dataclass
-class FeatureImportanceAndSignificance(Metric):
-    """
-    Evaluates and visualizes the statistical significance and feature importance using regression and decision tree
-    models.
-
-    **Purpose**: The 'FeatureImportanceAndSignificance' test evaluates the statistical significance and the importance
-    of features in the context of the machine learning model. By comparing the p-values from a regression model and the
-    feature importances from a decision tree model, this test aids in determining the most significant variables from a
-    statistical and a machine learning perspective, assisting in feature selection during the model development process.
-
-    **Test Mechanism**: The test first compares the p-values from a regression model and the feature importances from a
-    decision tree model. These values are normalized to ensure a uniform comparison. The 'p_threshold' parameter is
-    used to determine what p-value is considered statistically significant and if the 'significant_only' parameter is
-    true, only features with p-values below this threshold are included in the final output. The output from this test
-    includes an interactive visualization displaying normalized p-values and the associated feature importances. The
-    test throws an error if it does not receive both a regression model and a decision tree model.
-
-    **Signs of High Risk**:
-    - Exceptionally high or low p-values, which suggest that a feature may not be significant or meaningful in the
-    context of the model.
-    - If many variables with small feature importance values have significant p-values, this could indicate that the
-    model might be overfitting.
-
-    **Strengths**:
-    - Combines two perspectives statistical significance (p-values) and feature importance (decision tree model),
-    making it a robust feature selection test.
-    - Provides an interactive visualization making it easy to interpret and understand the results.
-
-    **Limitations**:
-    - The test only works with a regression model and a decision tree model which may limit its applicability.
-    - The test does not take into account potential correlations or causative relationships between features which may
-    lead to misinterpretations of significance and importance.
-    - Over-reliance on the p-value as a cut-off for feature significance can be seen as arbitrary and may not truly
-    reflect the real-world importance of the feature.
-    """
-
-    name = "feature_importance_and_significance"
-    required_inputs = ["models"]
-    default_params = {
-        "fontsize": 10,
-        "p_threshold": 0.05,
-        "significant_only": False,
-        "figure_height": 800,
-        "bar_width": 0.3,
-    }
-    metadata = {
-        "task_types": ["regression"],
-        "tags": [
-            "statsmodels",
-            "feature_importance",
-            "statistical_test",
-            "visualization",
-        ],
-    }
-
-    def compute_p_values_and_feature_importances(
-        self, regression_model, decision_tree_model
-    ):
-        p_values = regression_model.model.pvalues
-        feature_importances = permutation_importance(
-            decision_tree_model.model,
-            decision_tree_model.train_ds.x,
-            decision_tree_model.train_ds.y,
-            random_state=0,
-            n_jobs=-2,
-        ).importances_mean
-
-        p_values = p_values / max(p_values)
-        feature_importances = feature_importances / max(feature_importances)
-
-        return p_values, feature_importances
-
-    def create_dataframe(
-        self,
-        p_values,
-        feature_importances,
-        regression_model,
-        significant_only,
-        p_threshold,
-    ):
-        df = pd.DataFrame(
-            {
-                "Normalized p-value": p_values,
-                "Normalized Feature Importance": feature_importances,
-            },
-            index=regression_model.train_ds.x_df().columns,
-        )
-
-        if significant_only:
-            df = df[df["Normalized p-value"] <= p_threshold]
-
-        df = df.sort_values(by="Normalized Feature Importance", ascending=True)
-
-        return df
-
-    def create_figure(self, df, fontsize, figure_height, bar_width):
-        fig = go.Figure()
-
-        title_text = (
-            "Significant Features (p-value <= {0})".format(self.params["p_threshold"])
-            if self.params["significant_only"]
-            else "All Features"
-        )
-
-        fig.update_layout(
-            title=title_text,
-            barmode="group",
-            height=figure_height,
-            yaxis=dict(tickfont=dict(size=fontsize)),
-            xaxis=dict(title="Normalized Value", titlefont=dict(size=fontsize)),
-        )
-
-        fig.add_trace(
-            go.Bar(
-                y=df.index,
-                x=df["Normalized p-value"],
-                name="Normalized p-value",
-                orientation="h",
-                marker=dict(color="skyblue"),
-                width=bar_width,
-            )
-        )
-
-        fig.add_trace(
-            go.Bar(
-                y=df.index,
-                x=df["Normalized Feature Importance"],
-                name="Normalized Feature Importance",
-                orientation="h",
-                marker=dict(color="orange"),
-                width=bar_width,
-            )
-        )
-
-        return fig
-
-    def run(self):
-        fontsize = self.params["fontsize"]
-        significant_only = self.params["significant_only"]
-        p_threshold = self.params["p_threshold"]
-        figure_height = self.params["figure_height"]
-        bar_width = self.params["bar_width"]
-
-        all_models = []
-
-        if self.inputs.models is not None:
-            all_models.extend(self.inputs.models)
-
-        if len(self.inputs.models) != 2:
-            raise ValueError("Two models must be provided")
-
-        regression_model = self.inputs.models[0]
-        decision_tree_model = self.inputs.models[1]
-
-        p_values, feature_importances = self.compute_p_values_and_feature_importances(
-            regression_model, decision_tree_model
-        )
-
-        df = self.create_dataframe(
-            p_values,
-            feature_importances,
-            regression_model,
-            significant_only,
-            p_threshold,
-        )
-
-        fig = self.create_figure(df, fontsize, figure_height, bar_width)
-
-        return self.cache_results(
-            figures=[
-                Figure(
-                    for_object=self,
-                    key=self.key,
-                    figure=fig,
-                    metadata={
-                        "model_regression": str(regression_model.model),
-                        "model_decision_tree": str(decision_tree_model.model),
-                    },
-                )
-            ]
-        )
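The core comparison in this removed test is generic and can be reproduced with statsmodels and scikit-learn directly. A rough sketch under two assumptions that differ from the ValidMind wrapper: the regression results object exposes `.pvalues` indexed exactly by the columns of `X` (i.e., no added constant term), and the tree model is any fitted sklearn estimator.

```python
import pandas as pd
from sklearn.inspection import permutation_importance


def significance_vs_importance(ols_results, tree_model, X, y,
                               p_threshold=0.05, significant_only=False):
    """Normalize OLS p-values and permutation importances to [0, 1] and tabulate them."""
    importances = permutation_importance(
        tree_model, X, y, random_state=0
    ).importances_mean
    df = pd.DataFrame({
        # assumes ols_results.pvalues aligns one-to-one with X.columns
        "Normalized p-value": ols_results.pvalues / ols_results.pvalues.max(),
        "Normalized Feature Importance": importances / importances.max(),
    }, index=X.columns)
    if significant_only:
        df = df[df["Normalized p-value"] <= p_threshold]
    return df.sort_values("Normalized Feature Importance")
```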
validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py (deleted)

@@ -1,151 +0,0 @@
-# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
-# See the LICENSE file in the root of this repository for details.
-# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
-from dataclasses import dataclass
-
-import pandas as pd
-import plotly.graph_objects as go
-
-from validmind.vm_models import Figure, Metric
-
-
-@dataclass
-class PDRatingClassPlot(Metric):
-    """
-    Assesses and visualizes credit risk distribution across different rating classes within a dataset via default
-    probabilities.
-
-    **Purpose**: The purpose of the Probability of Default (PD) Rating Class Plot test is to measure and evaluate the
-    distribution of calculated default probabilities across different rating classes. This is critical for
-    understanding and inferring credit risk and can provide insights into how effectively the model is differentiating
-    between different risk levels in a credit dataset.
-
-    **Test Mechanism**: This metric is implemented via a visualization mechanism. It sorts the predicted probabilities
-    of defaults into user-defined rating classes defined in "rating_classes" in default parameters. When it has
-    classified the probabilities, it then calculates the average default rates within each rating class. Subsequently,
-    it produces bar plots for each of these rating classes, illustrating the average likelihood of a default within
-    each class. This process is executed separately for both the training and testing data sets. The classification of
-    predicted probabilities utilizes the pandas "cut" function, sorting and sectioning the data values into bins.
-
-    **Signs of High Risk**:
-
-    - If lower rating classes present higher average likelihoods of default than higher rating classes
-    - If there is poor differentiation between the averages across the different rating classes
-    - If the model generates a significant contrast between the likelihoods for the training set and the testing set,
-    suggestive of model overfitting
-
-    **Strengths**:
-
-    - Presents a clear visual representation of how efficient the model is at predicting credit risk across different
-    risk levels
-    - Allows for rapid identification and understanding of model performance per rating class
-    - Highlights potential overfitting issues by including both training and testing datasets in the analysis
-
-    **Limitations**:
-
-    - Making an incorrect choice for the number of rating classes, either oversimplifying or overcomplicating the
-    distribution of default rates
-    - Relying on the assumption that the rating classes are effective at differentiating risk levels and that the
-    boundaries between classes truly represent the risk distribution
-    - Not accounting for data set class imbalance, which could cause skewed average probabilities
-    - Inability to gauge the overall performance of the model only based on this metric, emphasizing the requirement of
-    combining it with other evaluation metrics
-    """
-
-    name = "pd_rating_class_plot"
-    required_inputs = ["model", "datasets"]
-
-    metadata = {
-        "task_types": ["classification"],
-        "tags": ["visualization", "credit_risk"],
-    }
-
-    default_params = {
-        "title": "PD by Rating Class",
-        "rating_classes": ["A", "B", "C", "D"],
-    }
-
-    @staticmethod
-    def plot_bucket_analysis(df, prob_col, target_col, title, rating_classes):
-        df["bucket"] = pd.cut(
-            df[prob_col], bins=len(rating_classes), labels=rating_classes, right=False
-        )
-        default_rate = df.groupby("bucket")[target_col].mean()
-
-        # Sort the data based on the order of rating_classes
-        sorted_data = sorted(
-            zip(rating_classes, default_rate),
-            key=lambda x: rating_classes.index(x[0]),
-        )
-        rating_classes_sorted, default_rate_sorted = zip(*sorted_data)
-
-        fig = go.Figure()
-
-        # Iterate through the sorted data and create a bar for each score bucket
-        for i, (bucket, rate) in enumerate(
-            zip(rating_classes_sorted, default_rate_sorted)
-        ):
-            fig.add_trace(go.Bar(x=[bucket], y=[rate], name=bucket))
-
-        fig.update_layout(
-            title_text=title,
-            xaxis_title="Rating Class",
-            yaxis_title="Probability of Default",
-            barmode="group",
-        )
-
-        return fig
-
-    def run(self):
-        target_column = self.inputs.model.train_ds.target_column
-        title = self.params["title"]
-        rating_classes = self.params["rating_classes"]
-
-        X_train = self.inputs.datasets[0].x.copy()
-        y_train = self.inputs.datasets[0].y.copy()
-        X_test = self.inputs.datasets[1].x.copy()
-        y_test = self.inputs.datasets[1].y.copy()
-
-        # Compute probabilities
-        X_train["probability"] = self.inputs.model.predict_proba(X_train)
-        X_test["probability"] = self.inputs.model.predict_proba(X_test)
-
-        df_train = pd.concat([X_train, y_train], axis=1)
-        df_test = pd.concat([X_test, y_test], axis=1)
-
-        fig_train = self.plot_bucket_analysis(
-            df_train,
-            "probability",
-            target_column,
-            title + " - Train Data",
-            rating_classes,
-        )
-        fig_test = self.plot_bucket_analysis(
-            df_test,
-            "probability",
-            target_column,
-            title + " - Test Data",
-            rating_classes,
-        )
-
-        return self.cache_results(
-            metric_value={
-                "bucket_analysis": {
-                    "train_probs": list(X_train["probability"]),
-                    "test_probs": list(X_test["probability"]),
-                },
-            },
-            figures=[
-                Figure(
-                    for_object=self,
-                    key="bucket_analysis_train",
-                    figure=fig_train,
-                ),
-                Figure(
-                    for_object=self,
-                    key="bucket_analysis_test",
-                    figure=fig_test,
-                ),
-            ],
-        )
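The bucketing step in this removed test is just pandas' cut with equal-width bins over the predicted probabilities. A minimal standalone sketch, assuming `probs` and `defaults` are index-aligned pandas Series (predicted PDs and 0/1 observed defaults); run it once per split to get the train/test comparison the original test produced.

```python
import pandas as pd
import plotly.graph_objects as go


def pd_by_rating_class(probs, defaults, rating_classes=("A", "B", "C", "D"),
                       title="PD by Rating Class"):
    """Cut PDs into equal-width rating classes and plot the mean default rate per class."""
    buckets = pd.cut(probs, bins=len(rating_classes),
                     labels=list(rating_classes), right=False)
    # Mean observed default rate per bucket; keep empty classes with observed=False
    default_rate = defaults.groupby(buckets, observed=False).mean()
    fig = go.Figure(
        [go.Bar(x=[cls], y=[default_rate[cls]], name=cls) for cls in rating_classes]
    )
    fig.update_layout(title_text=title, xaxis_title="Rating Class",
                      yaxis_title="Probability of Default", barmode="group")
    return fig
```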