validmind 2.2.5__py3-none-any.whl → 2.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/{ai.py → ai/test_descriptions.py} +127 -69
- validmind/ai/utils.py +104 -0
- validmind/api_client.py +70 -31
- validmind/client.py +5 -5
- validmind/logging.py +38 -32
- validmind/models/foundation.py +10 -6
- validmind/models/function.py +3 -1
- validmind/models/metadata.py +1 -1
- validmind/test_suites/__init__.py +1 -7
- validmind/test_suites/regression.py +0 -16
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/tests/data_validation/ACFandPACFPlot.py +36 -27
- validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py +42 -13
- validmind/tests/data_validation/BivariateScatterPlots.py +38 -41
- validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py +67 -11
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +1 -1
- validmind/tests/data_validation/HighPearsonCorrelation.py +12 -3
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/{model_validation/statsmodels → data_validation}/KPSS.py +64 -11
- validmind/tests/{model_validation/statsmodels → data_validation}/PhillipsPerronArch.py +65 -11
- validmind/tests/data_validation/ScatterPlot.py +1 -1
- validmind/tests/data_validation/SeasonalDecompose.py +12 -7
- validmind/tests/data_validation/TabularDateTimeHistograms.py +29 -33
- validmind/tests/data_validation/WOEBinPlots.py +1 -1
- validmind/tests/data_validation/WOEBinTable.py +1 -1
- validmind/tests/{model_validation/statsmodels → data_validation}/ZivotAndrewsArch.py +65 -11
- validmind/tests/data_validation/nlp/CommonWords.py +1 -1
- validmind/tests/data_validation/nlp/Hashtags.py +1 -1
- validmind/tests/data_validation/nlp/Mentions.py +1 -1
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -1
- validmind/tests/data_validation/nlp/Punctuations.py +1 -1
- validmind/tests/data_validation/nlp/Sentiment.py +1 -1
- validmind/tests/data_validation/nlp/TextDescription.py +5 -1
- validmind/tests/data_validation/nlp/Toxicity.py +1 -1
- validmind/tests/decorator.py +1 -1
- validmind/tests/model_validation/FeaturesAUC.py +5 -3
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +4 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +3 -3
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +14 -8
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextPrecision.py +4 -5
- validmind/tests/model_validation/ragas/ContextRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextRelevancy.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +6 -5
- validmind/tests/model_validation/ragas/utils.py +35 -9
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +1 -1
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +6 -8
- validmind/tests/model_validation/sklearn/RegressionErrors.py +1 -1
- validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +14 -8
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -1
- validmind/tests/model_validation/statsmodels/GINITable.py +1 -1
- validmind/tests/model_validation/statsmodels/JarqueBera.py +1 -1
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +1 -1
- validmind/tests/model_validation/statsmodels/LJungBox.py +1 -1
- validmind/tests/model_validation/statsmodels/Lilliefors.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +4 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +9 -4
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -2
- validmind/tests/model_validation/statsmodels/RunsTest.py +1 -1
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +1 -1
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +14 -11
- validmind/tests/prompt_validation/Conciseness.py +14 -11
- validmind/tests/prompt_validation/Delimitation.py +14 -11
- validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
- validmind/tests/prompt_validation/Robustness.py +11 -11
- validmind/tests/prompt_validation/Specificity.py +14 -11
- validmind/tests/prompt_validation/ai_powered_test.py +53 -75
- validmind/unit_metrics/composite.py +2 -1
- validmind/utils.py +4 -49
- validmind/vm_models/dataset/dataset.py +17 -3
- validmind/vm_models/dataset/utils.py +2 -2
- validmind/vm_models/model.py +1 -1
- validmind/vm_models/test/metric.py +1 -8
- validmind/vm_models/test/result_wrapper.py +27 -34
- validmind/vm_models/test/test.py +3 -0
- validmind/vm_models/test/threshold_test.py +1 -1
- validmind/vm_models/test_suite/runner.py +12 -6
- validmind/vm_models/test_suite/summary.py +18 -7
- validmind/vm_models/test_suite/test.py +13 -20
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/METADATA +1 -1
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/RECORD +95 -104
- validmind/tests/data_validation/DefaultRatesbyRiskBandPlot.py +0 -114
- validmind/tests/data_validation/PiTCreditScoresHistogram.py +0 -150
- validmind/tests/data_validation/PiTPDHistogram.py +0 -152
- validmind/tests/model_validation/statsmodels/ADFTest.py +0 -88
- validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py +0 -198
- validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py +0 -151
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +0 -146
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +0 -144
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +0 -127
- validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +0 -130
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/LICENSE +0 -0
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/WHEEL +0 -0
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/entry_points.txt +0 -0
--- a/validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py
+++ /dev/null
@@ -1,151 +0,0 @@
-# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
-# See the LICENSE file in the root of this repository for details.
-# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
-from dataclasses import dataclass
-
-import pandas as pd
-import plotly.graph_objects as go
-
-from validmind.vm_models import Figure, Metric
-
-
-@dataclass
-class PDRatingClassPlot(Metric):
-    """
-    Assesses and visualizes credit risk distribution across different rating classes within a dataset via default
-    probabilities.
-
-    **Purpose**: The purpose of the Probability of Default (PD) Rating Class Plot test is to measure and evaluate the
-    distribution of calculated default probabilities across different rating classes. This is critical for
-    understanding and inferring credit risk and can provide insights into how effectively the model is differentiating
-    between different risk levels in a credit dataset.
-
-    **Test Mechanism**: This metric is implemented via a visualization mechanism. It sorts the predicted probabilities
-    of defaults into user-defined rating classes defined in "rating_classes" in default parameters. When it has
-    classified the probabilities, it then calculates the average default rates within each rating class. Subsequently,
-    it produces bar plots for each of these rating classes, illustrating the average likelihood of a default within
-    each class. This process is executed separately for both the training and testing data sets. The classification of
-    predicted probabilities utilizes the pandas "cut" function, sorting and sectioning the data values into bins.
-
-    **Signs of High Risk**:
-
-    - If lower rating classes present higher average likelihoods of default than higher rating classes
-    - If there is poor differentiation between the averages across the different rating classes
-    - If the model generates a significant contrast between the likelihoods for the training set and the testing set,
-    suggestive of model overfitting
-
-    **Strengths**:
-
-    - Presents a clear visual representation of how efficient the model is at predicting credit risk across different
-    risk levels
-    - Allows for rapid identification and understanding of model performance per rating class
-    - Highlights potential overfitting issues by including both training and testing datasets in the analysis
-
-    **Limitations**:
-
-    - Making an incorrect choice for the number of rating classes, either oversimplifying or overcomplicating the
-    distribution of default rates
-    - Relying on the assumption that the rating classes are effective at differentiating risk levels and that the
-    boundaries between classes truly represent the risk distribution
-    - Not accounting for data set class imbalance, which could cause skewed average probabilities
-    - Inability to gauge the overall performance of the model only based on this metric, emphasizing the requirement of
-    combining it with other evaluation metrics
-    """
-
-    name = "pd_rating_class_plot"
-    required_inputs = ["model", "datasets"]
-
-    metadata = {
-        "task_types": ["classification"],
-        "tags": ["visualization", "credit_risk"],
-    }
-
-    default_params = {
-        "title": "PD by Rating Class",
-        "rating_classes": ["A", "B", "C", "D"],
-    }
-
-    @staticmethod
-    def plot_bucket_analysis(df, prob_col, target_col, title, rating_classes):
-        df["bucket"] = pd.cut(
-            df[prob_col], bins=len(rating_classes), labels=rating_classes, right=False
-        )
-        default_rate = df.groupby("bucket")[target_col].mean()
-
-        # Sort the data based on the order of rating_classes
-        sorted_data = sorted(
-            zip(rating_classes, default_rate),
-            key=lambda x: rating_classes.index(x[0]),
-        )
-        rating_classes_sorted, default_rate_sorted = zip(*sorted_data)
-
-        fig = go.Figure()
-
-        # Iterate through the sorted data and create a bar for each score bucket
-        for i, (bucket, rate) in enumerate(
-            zip(rating_classes_sorted, default_rate_sorted)
-        ):
-            fig.add_trace(go.Bar(x=[bucket], y=[rate], name=bucket))
-
-        fig.update_layout(
-            title_text=title,
-            xaxis_title="Rating Class",
-            yaxis_title="Probability of Default",
-            barmode="group",
-        )
-
-        return fig
-
-    def run(self):
-        target_column = self.inputs.model.train_ds.target_column
-        title = self.params["title"]
-        rating_classes = self.params["rating_classes"]
-
-        X_train = self.inputs.datasets[0].x.copy()
-        y_train = self.inputs.datasets[0].y.copy()
-        X_test = self.inputs.datasets[1].x.copy()
-        y_test = self.inputs.datasets[1].y.copy()
-
-        # Compute probabilities
-        X_train["probability"] = self.inputs.model.predict_proba(X_train)
-        X_test["probability"] = self.inputs.model.predict_proba(X_test)
-
-        df_train = pd.concat([X_train, y_train], axis=1)
-        df_test = pd.concat([X_test, y_test], axis=1)
-
-        fig_train = self.plot_bucket_analysis(
-            df_train,
-            "probability",
-            target_column,
-            title + " - Train Data",
-            rating_classes,
-        )
-        fig_test = self.plot_bucket_analysis(
-            df_test,
-            "probability",
-            target_column,
-            title + " - Test Data",
-            rating_classes,
-        )
-
-        return self.cache_results(
-            metric_value={
-                "bucket_analysis": {
-                    "train_probs": list(X_train["probability"]),
-                    "test_probs": list(X_test["probability"]),
-                },
-            },
-            figures=[
-                Figure(
-                    for_object=self,
-                    key="bucket_analysis_train",
-                    figure=fig_train,
-                ),
-                Figure(
-                    for_object=self,
-                    key="bucket_analysis_test",
-                    figure=fig_test,
-                ),
-            ],
-        )
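The bucketing mechanism described in the removed PDRatingClassPlot docstring — equal-width `pd.cut` bins over predicted default probabilities, then the mean observed default rate per bin — can be sketched independently of the ValidMind `Metric` class. The data below is synthetic and the rating labels are illustrative:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

# Synthetic predicted default probabilities and observed default outcomes
df = pd.DataFrame({"probability": rng.uniform(0, 1, 1000)})
df["default"] = (rng.uniform(0, 1, 1000) < df["probability"]).astype(int)

# Same bucketing logic as the removed test: equal-width bins labeled A-D
rating_classes = ["A", "B", "C", "D"]
df["bucket"] = pd.cut(
    df["probability"], bins=len(rating_classes), labels=rating_classes, right=False
)

# Average observed default rate per rating class
default_rate = df.groupby("bucket")["default"].mean()
print(default_rate)
```

Because an integer `bins` argument gives equal-width intervals over the observed probability range, the bucket boundaries are not calibrated rating-class cutoffs — the same caveat the removed docstring lists under its limitations.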
--- a/validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py
+++ /dev/null
@@ -1,146 +0,0 @@
-# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
-# See the LICENSE file in the root of this repository for details.
-# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
-from dataclasses import dataclass
-
-import pandas as pd
-from sklearn.metrics import mean_squared_error, r2_score
-
-from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
-
-from .statsutils import adj_r2_score
-
-
-@dataclass
-class RegressionModelInsampleComparison(Metric):
-    """
-    Evaluates and compares in-sample performance of multiple regression models using R-Squared, Adjusted R-Squared,
-    MSE, and RMSE.
-
-    **Purpose**: The RegressionModelInsampleComparison test metric is utilized to evaluate and compare the performance
-    of multiple regression models trained on the same dataset. Key performance indicators for this comparison include
-    statistics related to the goodness of fit - R-Squared, Adjusted R-Squared, Mean Squared Error (MSE), and Root Mean
-    Squared Error (RMSE).
-
-    **Test Mechanism**: The methodology behind this test is as follows -
-    - Firstly, a verification that the list of models to be tested is indeed not empty occurs.
-    - Once confirmed, the In-Sample performance of the models is calculated by a private function,
-    `_in_sample_performance_ols`, that executes the following steps:
-    - Iterates through each model in the supplied list.
-    - For each model, the function extracts the features (`X`) and the target (`y_true`) from the training dataset
-    and computes the predicted target values (`y_pred`).
-    - The performance metrics for the model are calculated using formulas for R-Squared, Adjusted R-Squared, MSE, and
-    RMSE.
-    - The results, including the computed metrics, variables of the model, and the model's identifier, are stored in
-    a dictionary that is appended to a list.
-    - The collected results are finally returned as a pandas dataframe.
-
-    **Signs of High Risk**:
-    - Significantly low values for R-Squared or Adjusted R-Squared.
-    - Significantly high values for MSE and RMSE.
-    Please note that what constitutes as "low" or "high" will vary based on the specific context or domain in which the
-    model is being utilized.
-
-    **Strengths**:
-    - Enables comparison of in-sample performance across different models on the same dataset, providing insights into
-    which model fits the data the best.
-    - Utilizes multiple evaluation methods (R-Squared, Adjusted R-Squared, MSE, RMSE), offering a comprehensive review
-    of a model's performance.
-
-    **Limitations**:
-    - The test measures only in-sample performance, i.e., how well a model fits the data it was trained on. However, it
-    does not give any information on the performance of the model on new, unseen, or out-of-sample data.
-    - Higher in-sample performance might be a result of overfitting, where the model is just memorizing the training
-    data. This test is sensitive to such cases.
-    - The test does not consider additional key factors such as the temporal dynamics of the data, that is, the pattern
-    of changes in data over time.
-    - The test does not provide an automated mechanism to determine if the reported metrics are within acceptable
-    ranges, necessitating human judgment.
-    """
-
-    name = "regression_insample_performance"
-    required_inputs = ["model", "dataset"]
-    metadata = {
-        "task_types": ["regression"],
-        "tags": ["model_comparison"],
-    }
-
-    def run(self):
-        # Check models list is not empty
-        if not self.inputs.models:
-            raise ValueError("List of models must be provided in the models parameter")
-        all_models = []
-
-        if self.inputs.models is not None:
-            all_models.extend(self.inputs.models)
-
-        in_sample_performance = self._in_sample_performance_ols(
-            all_models, self.inputs.dataset
-        )
-        in_sample_performance_df = pd.DataFrame(in_sample_performance)
-
-        return self.cache_results(
-            {
-                "in_sample_performance": in_sample_performance_df.to_dict(
-                    orient="records"
-                ),
-            }
-        )
-
-    def _in_sample_performance_ols(self, models, dataset):
-        """
-        Computes the in-sample performance evaluation metrics for a list of OLS models.
-        Args:
-            models (list): A list of statsmodels OLS models.
-        Returns:
-            list: A list of dictionaries containing the evaluation results for each model.
-            Each dictionary contains the following keys:
-            - 'Model': A string identifying the model.
-            - 'Independent Variables': A list of strings identifying the independent variables used in the model.
-            - 'R-Squared': The R-squared value of the model.
-            - 'Adjusted R-Squared': The adjusted R-squared value of the model.
-            - 'MSE': The mean squared error of the model.
-            - 'RMSE': The root mean squared error of the model.
-        """
-        evaluation_results = []
-
-        for i, model in enumerate(models):
-            X_columns = dataset.feature_columns
-            y_true = dataset.y
-            y_pred = dataset.y_pred(model)
-
-            # Extract R-squared and Adjusted R-squared
-            r2 = r2_score(y_true, y_pred)
-            adj_r2 = adj_r2_score(y_true, y_pred, len(y_true), len(X_columns))
-            mse = mean_squared_error(y_true=y_true, y_pred=y_pred, squared=True)
-            rmse = mean_squared_error(y_true=y_true, y_pred=y_pred, squared=False)
-
-            # Append the results to the evaluation_results list
-            evaluation_results.append(
-                {
-                    "Model": f"Model {i + 1}",
-                    "Independent Variables": X_columns,
-                    "R-Squared": r2,
-                    "Adjusted R-Squared": adj_r2,
-                    "MSE": mse,
-                    "RMSE": rmse,
-                }
-            )
-
-        return evaluation_results
-
-    def summary(self, metric_value):
-        """
-        Build one table for summarizing the in-sample performance results
-        """
-        summary_in_sample_performance = metric_value["in_sample_performance"]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=summary_in_sample_performance,
-                    metadata=ResultTableMetadata(title="In-Sample Performance Results"),
-                ),
-            ]
-        )
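The removed in-sample comparison pulls `adj_r2_score` from the package's local `statsutils` helper, which is not part of this diff. A minimal sketch of the same four metrics using only scikit-learn and the conventional adjusted R-squared formula (the data and model below are synthetic placeholders):

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

rng = np.random.default_rng(1)

# Toy in-sample data: two features, linear signal plus noise
X = rng.normal(size=(200, 2))
y = 3 * X[:, 0] - 2 * X[:, 1] + rng.normal(scale=0.5, size=200)

model = LinearRegression().fit(X, y)
y_pred = model.predict(X)

n, p = X.shape
r2 = r2_score(y, y_pred)
# Adjusted R-squared penalizes the number of predictors p
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)  # root of MSE, in the units of y

print({"R-Squared": r2, "Adjusted R-Squared": adj_r2, "MSE": mse, "RMSE": rmse})
```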
--- a/validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
-# See the LICENSE file in the root of this repository for details.
-# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
-from dataclasses import dataclass
-
-import numpy as np
-import pandas as pd
-
-from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
-
-
-@dataclass
-class RegressionModelOutsampleComparison(Metric):
-    """
-    Computes MSE and RMSE for multiple regression models using out-of-sample test to assess model's prediction accuracy
-    on unseen data.
-
-    **Purpose**: The RegressionModelOutsampleComparison test is designed to evaluate the predictive performance of
-    multiple regression models by means of an out-of-sample test. The primary aim of this test is to validate the
-    model's ability to generalize to unseen data, a common challenge in the context of overfitting. It does this by
-    computing two critical metrics — Mean Squared Error (MSE) and Root Mean Squared Error (RMSE), which provide a
-    quantifiable measure of the model's prediction accuracy on the testing dataset.
-
-    **Test Mechanism**: This test requires multiple models (specifically Ordinary Least Squares - OLS regression
-    models) and a test dataset as inputs. Each model generates predictions using the test dataset. The residuals are
-    then calculated and used to compute the MSE and RMSE for each model. The test outcomes, which include the model's
-    name, its MSE, and RMSE, are recorded and returned in a structured dataframe format.
-
-    **Signs of High Risk**:
-    - High values of MSE or RMSE indicate significant risk, signifying that the model's predictions considerably
-    deviate from the actual values in the test dataset.
-    - Consistently large discrepancies between training and testing performance across various models may indicate an
-    issue with the input data itself or the model selection strategies employed.
-
-    **Strengths**:
-    - This test offers a comparative evaluation of multiple models' out-of-sample performance, enabling the selection
-    of the best performing model.
-    - The use of both MSE and RMSE provides insights into the model's prediction error. While MSE is sensitive to
-    outliers, emphasizing larger errors, RMSE provides a more interpretable measure of average prediction error given
-    that it's in the same unit as the dependent variable.
-
-    **Limitations**:
-    - The applicability of this test is limited to regression tasks, specifically OLS models.
-    - The test operates under the assumption that the test dataset is a representative sample of the population. This
-    might not always hold true and can result in less accurate insights.
-    - The interpretability and the objectivity of the output (MSE and RMSE) can be influenced when the scale of the
-    dependent variable varies significantly, or the distribution of residuals is heavily skewed or contains outliers.
-    """
-
-    name = "regression_outsample_performance"
-    required_inputs = ["model", "dataset"]
-    metadata = {
-        "task_types": ["regression"],
-        "tags": ["model_comparison"],
-    }
-
-    def run(self):
-        # Check models list is not empty
-        if not self.inputs.models:
-            raise ValueError("List of models must be provided in the models parameter")
-        all_models = []
-        if self.inputs.model is not None:
-            all_models.append(self.inputs.model)
-
-        if self.inputs.models is not None:
-            all_models.extend(self.inputs.models)
-
-        for model in all_models:
-            if model.test_ds is None:
-                raise ValueError(
-                    "Test dataset is missing in the ValidMind Model object"
-                )
-
-        results = self._out_sample_performance_ols(all_models, self.inputs.dataset)
-        return self.cache_results(
-            {
-                "out_sample_performance": results.to_dict(orient="records"),
-            }
-        )
-
-    def _out_sample_performance_ols(self, model_list, dataset):
-        """
-        Returns the out-of-sample performance evaluation metrics of a list of OLS regression models.
-        Args:
-            model_list (list): A list of OLS models to evaluate.
-            test_data (pandas.DataFrame): The test dataset containing the independent and dependent variables.
-            target_col (str): The name of the target variable column in the test dataset.
-        Returns:
-            pandas.DataFrame: A DataFrame containing the evaluation results of the OLS models. The columns are 'Model',
-            'MSE' (Mean Squared Error), and 'RMSE' (Root Mean Squared Error).
-        """
-
-        # Initialize a list to store results
-        results = []
-
-        for fitted_model in model_list:
-            # Extract the column names of the independent variables from the model
-            independent_vars = dataset.feature_columns
-
-            # Separate the target variable and features in the test dataset
-            y_test = dataset.y
-
-            # Predict the test data
-            y_pred = dataset.y_pred(fitted_model)
-
-            # Calculate the residuals
-            residuals = y_test - y_pred
-
-            # Calculate the mean squared error and root mean squared error
-            mse = np.mean(residuals**2)
-            rmse_val = np.sqrt(mse)
-
-            # Store the results
-            model_name_with_vars = f"({', '.join(independent_vars)})"
-            results.append(
-                {
-                    "Model": model_name_with_vars,
-                    "MSE": mse,
-                    "RMSE": rmse_val,
-                }
-            )
-
-        # Create a DataFrame to display the results
-        results_df = pd.DataFrame(results)
-
-        return results_df
-
-    def summary(self, metric_value):
-        """
-        Build one table for summarizing the out-of-sample performance results
-        """
-        summary_out_sample_performance = metric_value["out_sample_performance"]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=summary_out_sample_performance,
-                    metadata=ResultTableMetadata(
-                        title="Out-of-Sample Performance Results"
-                    ),
-                ),
-            ]
-        )
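The out-of-sample calculation in the removed test reduces to scoring residuals on a held-out set. A standalone sketch with statsmodels OLS, assuming a synthetic dataset and an arbitrary 200/100 train/test split:

```python
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(2)

# Synthetic data with a simple train/test split
X = rng.normal(size=(300, 2))
y = 1.5 * X[:, 0] + 0.5 * X[:, 1] + rng.normal(scale=1.0, size=300)
X_train, X_test = X[:200], X[200:]
y_train, y_test = y[:200], y[200:]

# Fit an OLS model on the training portion only
ols = sm.OLS(y_train, sm.add_constant(X_train)).fit()

# Out-of-sample residuals on the held-out portion
y_pred = ols.predict(sm.add_constant(X_test))
residuals = y_test - y_pred

mse = np.mean(residuals**2)  # mean squared error
rmse = np.sqrt(mse)          # root mean squared error, same units as y
print({"MSE": mse, "RMSE": rmse})
```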
--- a/validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
-# See the LICENSE file in the root of this repository for details.
-# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
-from dataclasses import dataclass
-
-from sklearn.metrics import mean_squared_error, r2_score
-
-from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
-
-
-@dataclass
-class RegressionModelsPerformance(Metric):
-    """
-    Evaluates and compares regression models' performance using R-squared, Adjusted R-squared, and MSE metrics.
-
-    **Purpose**: This metric is used to evaluate and compare the performance of various regression models. Through the
-    use of key statistical measures such as R-squared, Adjusted R-squared, and Mean Squared Error (MSE), the
-    performance of different models in predicting dependent variables can be assessed both on the data used for
-    training (in-sample) and new, unseen data (out-of-sample).
-
-    **Test Mechanism**: The test evaluates a list of provided regression models. For each model, it calculates their
-    in-sample and out-of-sample performance by deriving the model predictions for the training and testing datasets
-    respectively, and then comparing these predictions to the actual values. In doing so, it calculates R-squared,
-    Adjusted R-squared, and MSE for each model, stores the results, and returns them for comparison.
-
-    **Signs of High Risk**:
-    - High Mean Squared Error (MSE) values.
-    - Strikingly low values of R-squared and Adjusted R-squared.
-    - A significant drop in performance when transitioning from in-sample to out-of-sample evaluations, signaling a
-    potential overfitting issue.
-
-    **Strengths**:
-    - The test permits comparisons of multiple models simultaneously, providing an objective base for identifying the
-    top-performing model.
-    - It delivers both in-sample and out-of-sample evaluations, presenting performance data on unseen data.
-    - The utilization of R-squared and Adjusted R-squared in conjunction with MSE allows for a detailed view of the
-    model's explainability and error rate.
-
-    **Limitations**:
-    - This test is built around the assumption that the residuals of the regression model are normally distributed,
-    which is a fundamental requirement for Ordinary Least Squares (OLS) regression; thus, it could be not suitable for
-    models where this assumption is broken.
-    - The test does not consider cases where higher R-squared or lower MSE values do not necessarily correlate with
-    better predictive performance, particularly in instances of excessively complex models.
-    """
-
-    name = "regression_models_performance"
-    required_inputs = ["models", "in_sample_datasets", "out_of_sample_datasets"]
-    metadata = {
-        "task_types": ["regression"],
-        "tags": ["model_performance", "model_comparison"],
-    }
-
-    def run(self):
-        # Check models list is not empty
-        if not self.inputs.models:
-            raise ValueError("List of models must be provided in the models parameter")
-
-        all_models = []
-
-        if self.inputs.models is not None:
-            all_models.extend(self.inputs.models)
-
-        in_sample_results = self.sample_performance_ols(
-            self.inputs.models, self.inputs.in_sample_datasets
-        )
-        out_of_sample_results = self.sample_performance_ols(
-            self.inputs.models, self.inputs.out_of_sample_datasets
-        )
-
-        return self.cache_results(
-            {
-                "in_sample_performance": in_sample_results,
-                "out_of_sample_performance": out_of_sample_results,
-            }
-        )
-
-    def sample_performance_ols(self, models, datasets):
-        evaluation_results = []
-
-        for model, dataset in zip(models, datasets):
-            X_columns = dataset.feature_columns
-            y_true = dataset.y
-            y_pred = dataset.y_pred(model)
-
-            # Extract R-squared and Adjusted R-squared
-            r2 = r2_score(y_true, y_pred)
-            mse = mean_squared_error(y_true, y_pred)
-            adj_r2 = 1 - ((1 - r2) * (len(y_true) - 1)) / (
-                len(y_true) - len(X_columns) - 1
-            )
-
-            # Append the results to the evaluation_results list
-            evaluation_results.append(
-                {
-                    "Model": model.input_id,
-                    "Independent Variables": X_columns,
-                    "R-Squared": r2,
-                    "Adjusted R-Squared": adj_r2,
-                    "MSE": mse,
-                }
-            )
-
-        return evaluation_results
-
-    def summary(self, metric_value):
-        """
-        Build a table for summarizing the in-sample and out-of-sample performance results
-        """
-        summary_in_sample_performance = metric_value["in_sample_performance"]
-        summary_out_of_sample_performance = metric_value["out_of_sample_performance"]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=summary_in_sample_performance,
-                    metadata=ResultTableMetadata(title="In-Sample Performance Results"),
-                ),
-                ResultTable(
-                    data=summary_out_of_sample_performance,
-                    metadata=ResultTableMetadata(
-                        title="Out-of-Sample Performance Results"
-                    ),
-                ),
-            ]
-        )
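The in-sample versus out-of-sample gap that the removed RegressionModelsPerformance docstring flags as a sign of overfitting is easy to reproduce with a deliberately over-parameterized model; the polynomial degree and data below are arbitrary choices for illustration:

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

rng = np.random.default_rng(3)

# A small, noisy dataset encourages overfitting
X = rng.uniform(-1, 1, size=(40, 1))
y = X[:, 0] + rng.normal(scale=0.3, size=40)
X_train, X_test, y_train, y_test = X[:30], X[30:], y[:30], y[30:]

# A degree-15 polynomial fit will chase the noise in the training set
overfit = make_pipeline(PolynomialFeatures(degree=15), LinearRegression())
overfit.fit(X_train, y_train)

r2_in = r2_score(y_train, overfit.predict(X_train))
r2_out = r2_score(y_test, overfit.predict(X_test))

# Expect a high in-sample R2 and a much lower (possibly negative) out-of-sample R2
print({"in_sample_r2": r2_in, "out_of_sample_r2": r2_out})
```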