validmind 2.2.6__py3-none-any.whl → 2.3.3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- validmind/__init__.py +2 -1
- validmind/__version__.py +1 -1
- validmind/{ai.py → ai/test_descriptions.py} +74 -82
- validmind/ai/utils.py +104 -0
- validmind/api_client.py +58 -19
- validmind/client.py +5 -5
- validmind/models/foundation.py +10 -6
- validmind/models/function.py +3 -1
- validmind/models/metadata.py +1 -1
- validmind/test_suites/__init__.py +1 -9
- validmind/test_suites/regression.py +0 -16
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/tests/__init__.py +7 -7
- validmind/tests/__types__.py +170 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +36 -27
- validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py +42 -13
- validmind/tests/data_validation/BivariateScatterPlots.py +38 -41
- validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py +67 -11
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +1 -1
- validmind/tests/data_validation/HighPearsonCorrelation.py +12 -3
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/{model_validation/statsmodels → data_validation}/KPSS.py +64 -11
- validmind/tests/{model_validation/statsmodels → data_validation}/PhillipsPerronArch.py +65 -11
- validmind/tests/data_validation/ScatterPlot.py +1 -1
- validmind/tests/data_validation/SeasonalDecompose.py +12 -7
- validmind/tests/data_validation/TabularDateTimeHistograms.py +29 -33
- validmind/tests/data_validation/WOEBinPlots.py +1 -1
- validmind/tests/data_validation/WOEBinTable.py +1 -1
- validmind/tests/{model_validation/statsmodels → data_validation}/ZivotAndrewsArch.py +65 -11
- validmind/tests/data_validation/nlp/CommonWords.py +1 -1
- validmind/tests/data_validation/nlp/Hashtags.py +1 -1
- validmind/tests/data_validation/nlp/Mentions.py +1 -1
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -1
- validmind/tests/data_validation/nlp/Punctuations.py +1 -1
- validmind/tests/data_validation/nlp/Sentiment.py +1 -1
- validmind/tests/data_validation/nlp/TextDescription.py +5 -1
- validmind/tests/data_validation/nlp/Toxicity.py +1 -1
- validmind/tests/decorator.py +13 -1
- validmind/tests/model_validation/FeaturesAUC.py +5 -3
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +4 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +3 -3
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +14 -8
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextPrecision.py +4 -5
- validmind/tests/model_validation/ragas/ContextRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextRelevancy.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +6 -5
- validmind/tests/model_validation/ragas/utils.py +35 -9
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +1 -1
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +6 -8
- validmind/tests/model_validation/sklearn/RegressionErrors.py +1 -1
- validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +14 -8
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -1
- validmind/tests/model_validation/statsmodels/GINITable.py +1 -1
- validmind/tests/model_validation/statsmodels/JarqueBera.py +1 -1
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +1 -1
- validmind/tests/model_validation/statsmodels/LJungBox.py +1 -1
- validmind/tests/model_validation/statsmodels/Lilliefors.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +4 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +9 -4
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -2
- validmind/tests/model_validation/statsmodels/RunsTest.py +1 -1
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +1 -1
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +14 -11
- validmind/tests/prompt_validation/Conciseness.py +14 -11
- validmind/tests/prompt_validation/Delimitation.py +14 -11
- validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
- validmind/tests/prompt_validation/Robustness.py +11 -11
- validmind/tests/prompt_validation/Specificity.py +14 -11
- validmind/tests/prompt_validation/ai_powered_test.py +53 -75
- validmind/unit_metrics/composite.py +2 -1
- validmind/utils.py +34 -59
- validmind/vm_models/dataset/dataset.py +17 -3
- validmind/vm_models/dataset/utils.py +2 -2
- validmind/vm_models/model.py +1 -1
- validmind/vm_models/test/metric.py +1 -8
- validmind/vm_models/test/result_wrapper.py +2 -2
- validmind/vm_models/test/test.py +3 -0
- validmind/vm_models/test/threshold_test.py +1 -1
- validmind/vm_models/test_suite/runner.py +7 -4
- {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/METADATA +1 -1
- {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/RECORD +95 -103
- validmind/tests/data_validation/DefaultRatesbyRiskBandPlot.py +0 -114
- validmind/tests/data_validation/PiTCreditScoresHistogram.py +0 -150
- validmind/tests/data_validation/PiTPDHistogram.py +0 -152
- validmind/tests/model_validation/statsmodels/ADFTest.py +0 -88
- validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py +0 -198
- validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py +0 -151
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +0 -146
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +0 -144
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +0 -127
- validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +0 -130
- {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/LICENSE +0 -0
- {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/WHEEL +0 -0
- {validmind-2.2.6.dist-info → validmind-2.3.3.dist-info}/entry_points.txt +0 -0
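
Several stationarity tests (ADF, DFGLSArch, KPSS, PhillipsPerronArch, ZivotAndrewsArch) moved from `model_validation/statsmodels` to `data_validation` in this release, which changes their test IDs. Below is a minimal sketch of updating a call site; it assumes the `vm.init_dataset` and `vm.tests.run_test` entry points behave as in the 2.x documentation, that `vm.init(...)` has already been called, and it uses a hypothetical single-column time series purely for illustration:

```python
import pandas as pd
import validmind as vm

# Assumes vm.init(...) has already been called to connect to the ValidMind Platform.
# Hypothetical monthly series used only to illustrate the test-ID change.
df = pd.DataFrame(
    {"value": [1.2, 1.4, 1.1, 1.5, 1.3, 1.6, 1.2, 1.7, 1.4, 1.8, 1.5, 1.9]},
    index=pd.date_range("2023-01-01", periods=12, freq="MS"),
)
vm_dataset = vm.init_dataset(dataset=df, input_id="ts_dataset", target_column="value")

# 2.2.6 test ID: "validmind.model_validation.statsmodels.ADF"
# 2.3.3 test ID: "validmind.data_validation.ADF"  (file moved to data_validation)
vm.tests.run_test(
    "validmind.data_validation.ADF",
    inputs={"dataset": vm_dataset},
)
```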
validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py (removed)
@@ -1,146 +0,0 @@

```python
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from dataclasses import dataclass

import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata

from .statsutils import adj_r2_score


@dataclass
class RegressionModelInsampleComparison(Metric):
    """
    Evaluates and compares in-sample performance of multiple regression models using R-Squared, Adjusted R-Squared,
    MSE, and RMSE.

    **Purpose**: The RegressionModelInsampleComparison test metric is utilized to evaluate and compare the performance
    of multiple regression models trained on the same dataset. Key performance indicators for this comparison include
    statistics related to the goodness of fit - R-Squared, Adjusted R-Squared, Mean Squared Error (MSE), and Root Mean
    Squared Error (RMSE).

    **Test Mechanism**: The methodology behind this test is as follows -
    - Firstly, a verification that the list of models to be tested is indeed not empty occurs.
    - Once confirmed, the In-Sample performance of the models is calculated by a private function,
    `_in_sample_performance_ols`, that executes the following steps:
        - Iterates through each model in the supplied list.
        - For each model, the function extracts the features (`X`) and the target (`y_true`) from the training dataset
        and computes the predicted target values (`y_pred`).
        - The performance metrics for the model are calculated using formulas for R-Squared, Adjusted R-Squared, MSE,
        and RMSE.
        - The results, including the computed metrics, variables of the model, and the model's identifier, are stored
        in a dictionary that is appended to a list.
    - The collected results are finally returned as a pandas dataframe.

    **Signs of High Risk**:
    - Significantly low values for R-Squared or Adjusted R-Squared.
    - Significantly high values for MSE and RMSE.
    Please note that what constitutes as "low" or "high" will vary based on the specific context or domain in which the
    model is being utilized.

    **Strengths**:
    - Enables comparison of in-sample performance across different models on the same dataset, providing insights into
    which model fits the data the best.
    - Utilizes multiple evaluation methods (R-Squared, Adjusted R-Squared, MSE, RMSE), offering a comprehensive review
    of a model's performance.

    **Limitations**:
    - The test measures only in-sample performance, i.e., how well a model fits the data it was trained on. However, it
    does not give any information on the performance of the model on new, unseen, or out-of-sample data.
    - Higher in-sample performance might be a result of overfitting, where the model is just memorizing the training
    data. This test is sensitive to such cases.
    - The test does not consider additional key factors such as the temporal dynamics of the data, that is, the pattern
    of changes in data over time.
    - The test does not provide an automated mechanism to determine if the reported metrics are within acceptable
    ranges, necessitating human judgment.
    """

    name = "regression_insample_performance"
    required_inputs = ["model", "dataset"]
    metadata = {
        "task_types": ["regression"],
        "tags": ["model_comparison"],
    }

    def run(self):
        # Check models list is not empty
        if not self.inputs.models:
            raise ValueError("List of models must be provided in the models parameter")
        all_models = []

        if self.inputs.models is not None:
            all_models.extend(self.inputs.models)

        in_sample_performance = self._in_sample_performance_ols(
            all_models, self.inputs.dataset
        )
        in_sample_performance_df = pd.DataFrame(in_sample_performance)

        return self.cache_results(
            {
                "in_sample_performance": in_sample_performance_df.to_dict(
                    orient="records"
                ),
            }
        )

    def _in_sample_performance_ols(self, models, dataset):
        """
        Computes the in-sample performance evaluation metrics for a list of OLS models.
        Args:
            models (list): A list of statsmodels OLS models.
        Returns:
            list: A list of dictionaries containing the evaluation results for each model.
            Each dictionary contains the following keys:
            - 'Model': A string identifying the model.
            - 'Independent Variables': A list of strings identifying the independent variables used in the model.
            - 'R-Squared': The R-squared value of the model.
            - 'Adjusted R-Squared': The adjusted R-squared value of the model.
            - 'MSE': The mean squared error of the model.
            - 'RMSE': The root mean squared error of the model.
        """
        evaluation_results = []

        for i, model in enumerate(models):
            X_columns = dataset.feature_columns
            y_true = dataset.y
            y_pred = dataset.y_pred(model)

            # Extract R-squared and Adjusted R-squared
            r2 = r2_score(y_true, y_pred)
            adj_r2 = adj_r2_score(y_true, y_pred, len(y_true), len(X_columns))
            mse = mean_squared_error(y_true=y_true, y_pred=y_pred, squared=True)
            rmse = mean_squared_error(y_true=y_true, y_pred=y_pred, squared=False)

            # Append the results to the evaluation_results list
            evaluation_results.append(
                {
                    "Model": f"Model {i + 1}",
                    "Independent Variables": X_columns,
                    "R-Squared": r2,
                    "Adjusted R-Squared": adj_r2,
                    "MSE": mse,
                    "RMSE": rmse,
                }
            )

        return evaluation_results

    def summary(self, metric_value):
        """
        Build one table for summarizing the in-sample performance results
        """
        summary_in_sample_performance = metric_value["in_sample_performance"]

        return ResultSummary(
            results=[
                ResultTable(
                    data=summary_in_sample_performance,
                    metadata=ResultTableMetadata(title="In-Sample Performance Results"),
                ),
            ]
        )
```
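
For reference, the Adjusted R-Squared reported by this test is the standard definition for $n$ observations and $p$ independent variables, which is presumably what the `adj_r2_score(y_true, y_pred, n, p)` helper computes (the helper itself is not part of this diff) and which matches the inline computation in `RegressionModelsPerformance` further down:

$$
\bar{R}^2 = 1 - (1 - R^2)\,\frac{n - 1}{n - p - 1}
$$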
validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py (removed)
@@ -1,144 +0,0 @@

```python
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from dataclasses import dataclass

import numpy as np
import pandas as pd

from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata


@dataclass
class RegressionModelOutsampleComparison(Metric):
    """
    Computes MSE and RMSE for multiple regression models using out-of-sample test to assess model's prediction accuracy
    on unseen data.

    **Purpose**: The RegressionModelOutsampleComparison test is designed to evaluate the predictive performance of
    multiple regression models by means of an out-of-sample test. The primary aim of this test is to validate the
    model's ability to generalize to unseen data, a common challenge in the context of overfitting. It does this by
    computing two critical metrics — Mean Squared Error (MSE) and Root Mean Squared Error (RMSE), which provide a
    quantifiable measure of the model's prediction accuracy on the testing dataset.

    **Test Mechanism**: This test requires multiple models (specifically Ordinary Least Squares - OLS regression
    models) and a test dataset as inputs. Each model generates predictions using the test dataset. The residuals are
    then calculated and used to compute the MSE and RMSE for each model. The test outcomes, which include the model's
    name, its MSE, and RMSE, are recorded and returned in a structured dataframe format.

    **Signs of High Risk**:
    - High values of MSE or RMSE indicate significant risk, signifying that the model's predictions considerably
    deviate from the actual values in the test dataset.
    - Consistently large discrepancies between training and testing performance across various models may indicate an
    issue with the input data itself or the model selection strategies employed.

    **Strengths**:
    - This test offers a comparative evaluation of multiple models' out-of-sample performance, enabling the selection
    of the best performing model.
    - The use of both MSE and RMSE provides insights into the model's prediction error. While MSE is sensitive to
    outliers, emphasizing larger errors, RMSE provides a more interpretable measure of average prediction error given
    that it's in the same unit as the dependent variable.

    **Limitations**:
    - The applicability of this test is limited to regression tasks, specifically OLS models.
    - The test operates under the assumption that the test dataset is a representative sample of the population. This
    might not always hold true and can result in less accurate insights.
    - The interpretability and the objectivity of the output (MSE and RMSE) can be influenced when the scale of the
    dependent variable varies significantly, or the distribution of residuals is heavily skewed or contains outliers.
    """

    name = "regression_outsample_performance"
    required_inputs = ["model", "dataset"]
    metadata = {
        "task_types": ["regression"],
        "tags": ["model_comparison"],
    }

    def run(self):
        # Check models list is not empty
        if not self.inputs.models:
            raise ValueError("List of models must be provided in the models parameter")
        all_models = []
        if self.inputs.model is not None:
            all_models.append(self.inputs.model)

        if self.inputs.models is not None:
            all_models.extend(self.inputs.models)

        for model in all_models:
            if model.test_ds is None:
                raise ValueError(
                    "Test dataset is missing in the ValidMind Model object"
                )

        results = self._out_sample_performance_ols(all_models, self.inputs.dataset)
        return self.cache_results(
            {
                "out_sample_performance": results.to_dict(orient="records"),
            }
        )

    def _out_sample_performance_ols(self, model_list, dataset):
        """
        Returns the out-of-sample performance evaluation metrics of a list of OLS regression models.
        Args:
            model_list (list): A list of OLS models to evaluate.
            test_data (pandas.DataFrame): The test dataset containing the independent and dependent variables.
            target_col (str): The name of the target variable column in the test dataset.
        Returns:
            pandas.DataFrame: A DataFrame containing the evaluation results of the OLS models. The columns are 'Model',
            'MSE' (Mean Squared Error), and 'RMSE' (Root Mean Squared Error).
        """

        # Initialize a list to store results
        results = []

        for fitted_model in model_list:
            # Extract the column names of the independent variables from the model
            independent_vars = dataset.feature_columns

            # Separate the target variable and features in the test dataset
            y_test = dataset.y

            # Predict the test data
            y_pred = dataset.y_pred(fitted_model)

            # Calculate the residuals
            residuals = y_test - y_pred

            # Calculate the mean squared error and root mean squared error
            mse = np.mean(residuals**2)
            rmse_val = np.sqrt(mse)

            # Store the results
            model_name_with_vars = f"({', '.join(independent_vars)})"
            results.append(
                {
                    "Model": model_name_with_vars,
                    "MSE": mse,
                    "RMSE": rmse_val,
                }
            )

        # Create a DataFrame to display the results
        results_df = pd.DataFrame(results)

        return results_df

    def summary(self, metric_value):
        """
        Build one table for summarizing the out-of-sample performance results
        """
        summary_out_sample_performance = metric_value["out_sample_performance"]

        return ResultSummary(
            results=[
                ResultTable(
                    data=summary_out_sample_performance,
                    metadata=ResultTableMetadata(
                        title="Out-of-Sample Performance Results"
                    ),
                ),
            ]
        )
```
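
The out-of-sample metrics above follow directly from the residuals $e_i = y_i - \hat{y}_i$ computed in `_out_sample_performance_ols` (via `np.mean(residuals**2)` and `np.sqrt(mse)`):

$$
\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n} e_i^2, \qquad \mathrm{RMSE} = \sqrt{\mathrm{MSE}}
$$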
validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py (removed)
@@ -1,127 +0,0 @@

```python
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from dataclasses import dataclass

from sklearn.metrics import mean_squared_error, r2_score

from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata


@dataclass
class RegressionModelsPerformance(Metric):
    """
    Evaluates and compares regression models' performance using R-squared, Adjusted R-squared, and MSE metrics.

    **Purpose**: This metric is used to evaluate and compare the performance of various regression models. Through the
    use of key statistical measures such as R-squared, Adjusted R-squared, and Mean Squared Error (MSE), the
    performance of different models in predicting dependent variables can be assessed both on the data used for
    training (in-sample) and new, unseen data (out-of-sample).

    **Test Mechanism**: The test evaluates a list of provided regression models. For each model, it calculates their
    in-sample and out-of-sample performance by deriving the model predictions for the training and testing datasets
    respectively, and then comparing these predictions to the actual values. In doing so, it calculates R-squared,
    Adjusted R-squared, and MSE for each model, stores the results, and returns them for comparison.

    **Signs of High Risk**:
    - High Mean Squared Error (MSE) values.
    - Strikingly low values of R-squared and Adjusted R-squared.
    - A significant drop in performance when transitioning from in-sample to out-of-sample evaluations, signaling a
    potential overfitting issue.

    **Strengths**:
    - The test permits comparisons of multiple models simultaneously, providing an objective base for identifying the
    top-performing model.
    - It delivers both in-sample and out-of-sample evaluations, presenting performance data on unseen data.
    - The utilization of R-squared and Adjusted R-squared in conjunction with MSE allows for a detailed view of the
    model's explainability and error rate.

    **Limitations**:
    - This test is built around the assumption that the residuals of the regression model are normally distributed,
    which is a fundamental requirement for Ordinary Least Squares (OLS) regression; thus, it could be not suitable for
    models where this assumption is broken.
    - The test does not consider cases where higher R-squared or lower MSE values do not necessarily correlate with
    better predictive performance, particularly in instances of excessively complex models.
    """

    name = "regression_models_performance"
    required_inputs = ["models", "in_sample_datasets", "out_of_sample_datasets"]
    metadata = {
        "task_types": ["regression"],
        "tags": ["model_performance", "model_comparison"],
    }

    def run(self):
        # Check models list is not empty
        if not self.inputs.models:
            raise ValueError("List of models must be provided in the models parameter")

        all_models = []

        if self.inputs.models is not None:
            all_models.extend(self.inputs.models)

        in_sample_results = self.sample_performance_ols(
            self.inputs.models, self.inputs.in_sample_datasets
        )
        out_of_sample_results = self.sample_performance_ols(
            self.inputs.models, self.inputs.out_of_sample_datasets
        )

        return self.cache_results(
            {
                "in_sample_performance": in_sample_results,
                "out_of_sample_performance": out_of_sample_results,
            }
        )

    def sample_performance_ols(self, models, datasets):
        evaluation_results = []

        for model, dataset in zip(models, datasets):
            X_columns = dataset.feature_columns
            y_true = dataset.y
            y_pred = dataset.y_pred(model)

            # Extract R-squared and Adjusted R-squared
            r2 = r2_score(y_true, y_pred)
            mse = mean_squared_error(y_true, y_pred)
            adj_r2 = 1 - ((1 - r2) * (len(y_true) - 1)) / (
                len(y_true) - len(X_columns) - 1
            )

            # Append the results to the evaluation_results list
            evaluation_results.append(
                {
                    "Model": model.input_id,
                    "Independent Variables": X_columns,
                    "R-Squared": r2,
                    "Adjusted R-Squared": adj_r2,
                    "MSE": mse,
                }
            )

        return evaluation_results

    def summary(self, metric_value):
        """
        Build a table for summarizing the in-sample and out-of-sample performance results
        """
        summary_in_sample_performance = metric_value["in_sample_performance"]
        summary_out_of_sample_performance = metric_value["out_of_sample_performance"]

        return ResultSummary(
            results=[
                ResultTable(
                    data=summary_in_sample_performance,
                    metadata=ResultTableMetadata(title="In-Sample Performance Results"),
                ),
                ResultTable(
                    data=summary_out_of_sample_performance,
                    metadata=ResultTableMetadata(
                        title="Out-of-Sample Performance Results"
                    ),
                ),
            ]
        )
```
validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py (removed)
@@ -1,130 +0,0 @@

```python
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from dataclasses import dataclass

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.seasonal import seasonal_decompose

from validmind.vm_models import Figure, Metric


@dataclass
class ResidualsVisualInspection(Metric):
    """
    Provides a comprehensive visual analysis of residuals for regression models utilizing various plot types.

    **Purpose**: The main purpose of this metric is to visualize and analyze the residuals (the differences between the
    observed and predicted values) of a regression problem. It allows for a graphical exploration of the model's
    errors, helping to identify statistical patterns or anomalies that may indicate a systematic bias in the model's
    predictions. By inspecting the residuals, we can check how well the model fits the data and meets the assumptions
    of the model.

    **Test Mechanism**: The metric generates four common types of residual plots which are: a histogram with kernel
    density estimation, a quantile-quantile (Q-Q) plot, a residuals series dot plot, and an autocorrelation function
    (ACF) plot.

    - The residuals histogram with kernel density estimation visualizes the distribution of residuals and allows to
    check if they are normally distributed.
    - Q-Q plot compares the observed quantiles of the data to the quantiles of a standard normal distribution, helping
    to assess the normality of residuals.
    - A residuals dot plot indicates the variation in residuals over time, which helps in identifying any time-related
    pattern in residuals.
    - ACF plot visualizes the correlation of an observation with its previous observations, helping to pinpoint any
    seasonality effect within residuals.

    **Signs of High Risk**:

    - Skewness or asymmetry in the histogram or a significant deviation from the straight line in the Q-Q plot, which
    indicates that the residuals aren't normally distributed.
    - Large spikes in the ACF plot, indicating that the residuals are correlated, in violation of the assumption that
    they are independent.
    - Non-random patterns in the dot plot of residuals, indicating potential model misspecification.

    **Strengths**:

    - Visual analysis of residuals is a powerful yet simple way to understand a model's behavior across the data set
    and to identify problems with the model's assumptions or its fit to the data.
    - The test is applicable to any regression model, irrespective of complexity.
    - By exploring residuals, we might uncover relationships that were not captured by the model, revealing
    opportunities for model improvement.

    **Limitations**:

    - Visual tests are largely subjective and can be open to interpretation. Clear-cut decisions about the model based
    solely on these plots may not be possible.
    - The metrics from the test do not directly infer the action based on the results; domain-specific knowledge and
    expert judgement is often required to interpret the results.
    - These plots can indicate a problem with the model but they do not necessarily reveal the nature or cause of the
    problem.
    - The test assumes that the error terms are identically distributed, which might not always be the case in
    real-world scenarios.
    """

    name = "residuals_visual_inspection"
    required_inputs = ["dataset"]
    metadata = {
        "task_types": ["regression"],
        "tags": ["statsmodels", "visualization"],
    }

    @staticmethod
    def residual_analysis(residuals, variable_name, axes):
        residuals = residuals.dropna().reset_index(
            drop=True
        )  # drop NaN values and reset index

        # QQ plot
        stats.probplot(residuals, dist="norm", plot=axes[0, 1])
        axes[0, 1].set_title(f"Residuals Q-Q Plot ({variable_name})")

        # Histogram with KDE
        sns.histplot(residuals, kde=True, ax=axes[0, 0])
        axes[0, 0].set_xlabel("Residuals")
        axes[0, 0].set_title(f"Residuals Histogram ({variable_name})")

        # Residual series dot plot
        sns.lineplot(data=residuals, linewidth=0.5, color="red", ax=axes[1, 0])
        axes[1, 0].set_title(f"Residual Series Dot Plot ({variable_name})")

        # ACF plot
        n_lags = min(100, len(residuals) - 1)  # Adjust the number of lags
        plot_acf(residuals, ax=axes[1, 1], lags=n_lags, zero=False)  # Added zero=False
        axes[1, 1].set_title(f"ACF Plot of Residuals ({variable_name})")

    def run(self):
        x_train = self.inputs.dataset.df
        figures = []

        # TODO: specify which columns to plot via params
        for col in x_train.columns:
            sd = seasonal_decompose(x_train[col], model="additive")

            # Remove NaN values from the residuals and reset the index
            residuals = pd.Series(sd.resid).dropna().reset_index(drop=True)

            # Create subplots
            fig, axes = plt.subplots(nrows=2, ncols=2)
            fig.suptitle(f"Residuals Inspection for {col}", fontsize=24)

            self.residual_analysis(residuals, col, axes)

            # Adjust the layout
            plt.tight_layout()

            # Do this if you want to prevent the figure from being displayed
            plt.close("all")

            figures.append(
                Figure(
                    for_object=self,
                    key=self.key,
                    figure=fig,
                )
            )
        return self.cache_results(figures=figures)
```
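
Since `ResidualsVisualInspection` is gone in 2.3.3, the same four residual plots can still be produced standalone with the libraries the removed test relied on. The sketch below uses a synthetic series purely for illustration; the data and variable names are not part of the package.

```python
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.seasonal import seasonal_decompose

# Synthetic monthly series with trend, seasonality, and noise (illustration only).
rng = np.random.default_rng(0)
idx = pd.date_range("2015-01-01", periods=120, freq="MS")
series = pd.Series(
    np.linspace(0, 10, 120)
    + 2 * np.sin(np.arange(120) * 2 * np.pi / 12)
    + rng.normal(0, 0.5, 120),
    index=idx,
    name="value",
)

# Decompose the series and keep the residual component, dropping edge NaNs.
residuals = pd.Series(seasonal_decompose(series, model="additive").resid)
residuals = residuals.dropna().reset_index(drop=True)

# The four plots the removed test produced: histogram+KDE, Q-Q, series, ACF.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 8))
sns.histplot(residuals, kde=True, ax=axes[0, 0])            # distribution of residuals
stats.probplot(residuals, dist="norm", plot=axes[0, 1])      # Q-Q plot against a normal
sns.lineplot(data=residuals, linewidth=0.5, ax=axes[1, 0])   # residual series over time
plot_acf(residuals, ax=axes[1, 1], lags=min(100, len(residuals) - 1), zero=False)
plt.tight_layout()
plt.show()
```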
The LICENSE, WHEEL, and entry_points.txt files are unchanged between the two versions.