validmind 2.2.5__py3-none-any.whl → 2.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. validmind/__version__.py +1 -1
  2. validmind/{ai.py → ai/test_descriptions.py} +127 -69
  3. validmind/ai/utils.py +104 -0
  4. validmind/api_client.py +70 -31
  5. validmind/client.py +5 -5
  6. validmind/logging.py +38 -32
  7. validmind/models/foundation.py +10 -6
  8. validmind/models/function.py +3 -1
  9. validmind/models/metadata.py +1 -1
  10. validmind/test_suites/__init__.py +1 -7
  11. validmind/test_suites/regression.py +0 -16
  12. validmind/test_suites/statsmodels_timeseries.py +1 -1
  13. validmind/tests/data_validation/ACFandPACFPlot.py +36 -27
  14. validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py +42 -13
  15. validmind/tests/data_validation/BivariateScatterPlots.py +38 -41
  16. validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py +67 -11
  17. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +1 -1
  18. validmind/tests/data_validation/HighPearsonCorrelation.py +12 -3
  19. validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
  20. validmind/tests/{model_validation/statsmodels → data_validation}/KPSS.py +64 -11
  21. validmind/tests/{model_validation/statsmodels → data_validation}/PhillipsPerronArch.py +65 -11
  22. validmind/tests/data_validation/ScatterPlot.py +1 -1
  23. validmind/tests/data_validation/SeasonalDecompose.py +12 -7
  24. validmind/tests/data_validation/TabularDateTimeHistograms.py +29 -33
  25. validmind/tests/data_validation/WOEBinPlots.py +1 -1
  26. validmind/tests/data_validation/WOEBinTable.py +1 -1
  27. validmind/tests/{model_validation/statsmodels → data_validation}/ZivotAndrewsArch.py +65 -11
  28. validmind/tests/data_validation/nlp/CommonWords.py +1 -1
  29. validmind/tests/data_validation/nlp/Hashtags.py +1 -1
  30. validmind/tests/data_validation/nlp/Mentions.py +1 -1
  31. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -1
  32. validmind/tests/data_validation/nlp/Punctuations.py +1 -1
  33. validmind/tests/data_validation/nlp/Sentiment.py +1 -1
  34. validmind/tests/data_validation/nlp/TextDescription.py +5 -1
  35. validmind/tests/data_validation/nlp/Toxicity.py +1 -1
  36. validmind/tests/decorator.py +1 -1
  37. validmind/tests/model_validation/FeaturesAUC.py +5 -3
  38. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +4 -0
  39. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +4 -0
  40. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +4 -0
  41. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +4 -0
  42. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -0
  43. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +4 -0
  44. validmind/tests/model_validation/ragas/AnswerCorrectness.py +3 -3
  45. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  46. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  47. validmind/tests/model_validation/ragas/AspectCritique.py +14 -8
  48. validmind/tests/model_validation/ragas/ContextEntityRecall.py +3 -4
  49. validmind/tests/model_validation/ragas/ContextPrecision.py +4 -5
  50. validmind/tests/model_validation/ragas/ContextRecall.py +3 -4
  51. validmind/tests/model_validation/ragas/ContextRelevancy.py +5 -4
  52. validmind/tests/model_validation/ragas/Faithfulness.py +6 -5
  53. validmind/tests/model_validation/ragas/utils.py +35 -9
  54. validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
  55. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +1 -1
  56. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +6 -8
  57. validmind/tests/model_validation/sklearn/RegressionErrors.py +1 -1
  58. validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +14 -8
  59. validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
  60. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -1
  61. validmind/tests/model_validation/statsmodels/GINITable.py +1 -1
  62. validmind/tests/model_validation/statsmodels/JarqueBera.py +1 -1
  63. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +1 -1
  64. validmind/tests/model_validation/statsmodels/LJungBox.py +1 -1
  65. validmind/tests/model_validation/statsmodels/Lilliefors.py +1 -1
  66. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +4 -0
  67. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +9 -4
  68. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -2
  69. validmind/tests/model_validation/statsmodels/RunsTest.py +1 -1
  70. validmind/tests/model_validation/statsmodels/ShapiroWilk.py +1 -1
  71. validmind/tests/prompt_validation/Bias.py +14 -11
  72. validmind/tests/prompt_validation/Clarity.py +14 -11
  73. validmind/tests/prompt_validation/Conciseness.py +14 -11
  74. validmind/tests/prompt_validation/Delimitation.py +14 -11
  75. validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
  76. validmind/tests/prompt_validation/Robustness.py +11 -11
  77. validmind/tests/prompt_validation/Specificity.py +14 -11
  78. validmind/tests/prompt_validation/ai_powered_test.py +53 -75
  79. validmind/unit_metrics/composite.py +2 -1
  80. validmind/utils.py +4 -49
  81. validmind/vm_models/dataset/dataset.py +17 -3
  82. validmind/vm_models/dataset/utils.py +2 -2
  83. validmind/vm_models/model.py +1 -1
  84. validmind/vm_models/test/metric.py +1 -8
  85. validmind/vm_models/test/result_wrapper.py +27 -34
  86. validmind/vm_models/test/test.py +3 -0
  87. validmind/vm_models/test/threshold_test.py +1 -1
  88. validmind/vm_models/test_suite/runner.py +12 -6
  89. validmind/vm_models/test_suite/summary.py +18 -7
  90. validmind/vm_models/test_suite/test.py +13 -20
  91. {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/METADATA +1 -1
  92. {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/RECORD +95 -104
  93. validmind/tests/data_validation/DefaultRatesbyRiskBandPlot.py +0 -114
  94. validmind/tests/data_validation/PiTCreditScoresHistogram.py +0 -150
  95. validmind/tests/data_validation/PiTPDHistogram.py +0 -152
  96. validmind/tests/model_validation/statsmodels/ADFTest.py +0 -88
  97. validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py +0 -198
  98. validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py +0 -151
  99. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +0 -146
  100. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +0 -144
  101. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +0 -127
  102. validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +0 -130
  103. {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/LICENSE +0 -0
  104. {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/WHEEL +0 -0
  105. {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py (deleted)
@@ -1,151 +0,0 @@
- # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
- # See the LICENSE file in the root of this repository for details.
- # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
- from dataclasses import dataclass
-
- import pandas as pd
- import plotly.graph_objects as go
-
- from validmind.vm_models import Figure, Metric
-
-
- @dataclass
- class PDRatingClassPlot(Metric):
-     """
-     Assesses and visualizes credit risk distribution across different rating classes within a dataset via default
-     probabilities.
-
-     **Purpose**: The purpose of the Probability of Default (PD) Rating Class Plot test is to measure and evaluate the
-     distribution of calculated default probabilities across different rating classes. This is critical for
-     understanding and inferring credit risk and can provide insights into how effectively the model is differentiating
-     between different risk levels in a credit dataset.
-
-     **Test Mechanism**: This metric is implemented via a visualization mechanism. It sorts the predicted probabilities
-     of defaults into user-defined rating classes defined in "rating_classes" in default parameters. When it has
-     classified the probabilities, it then calculates the average default rates within each rating class. Subsequently,
-     it produces bar plots for each of these rating classes, illustrating the average likelihood of a default within
-     each class. This process is executed separately for both the training and testing data sets. The classification of
-     predicted probabilities utilizes the pandas "cut" function, sorting and sectioning the data values into bins.
-
-     **Signs of High Risk**:
-
-     - If lower rating classes present higher average likelihoods of default than higher rating classes
-     - If there is poor differentiation between the averages across the different rating classes
-     - If the model generates a significant contrast between the likelihoods for the training set and the testing set,
-     suggestive of model overfitting
-
-     **Strengths**:
-
-     - Presents a clear visual representation of how efficient the model is at predicting credit risk across different
-     risk levels
-     - Allows for rapid identification and understanding of model performance per rating class
-     - Highlights potential overfitting issues by including both training and testing datasets in the analysis
-
-     **Limitations**:
-
-     - Making an incorrect choice for the number of rating classes, either oversimplifying or overcomplicating the
-     distribution of default rates
-     - Relying on the assumption that the rating classes are effective at differentiating risk levels and that the
-     boundaries between classes truly represent the risk distribution
-     - Not accounting for data set class imbalance, which could cause skewed average probabilities
-     - Inability to gauge the overall performance of the model only based on this metric, emphasizing the requirement of
-     combining it with other evaluation metrics
-     """
-
-     name = "pd_rating_class_plot"
-     required_inputs = ["model", "datasets"]
-
-     metadata = {
-         "task_types": ["classification"],
-         "tags": ["visualization", "credit_risk"],
-     }
-
-     default_params = {
-         "title": "PD by Rating Class",
-         "rating_classes": ["A", "B", "C", "D"],
-     }
-
-     @staticmethod
-     def plot_bucket_analysis(df, prob_col, target_col, title, rating_classes):
-         df["bucket"] = pd.cut(
-             df[prob_col], bins=len(rating_classes), labels=rating_classes, right=False
-         )
-         default_rate = df.groupby("bucket")[target_col].mean()
-
-         # Sort the data based on the order of rating_classes
-         sorted_data = sorted(
-             zip(rating_classes, default_rate),
-             key=lambda x: rating_classes.index(x[0]),
-         )
-         rating_classes_sorted, default_rate_sorted = zip(*sorted_data)
-
-         fig = go.Figure()
-
-         # Iterate through the sorted data and create a bar for each score bucket
-         for i, (bucket, rate) in enumerate(
-             zip(rating_classes_sorted, default_rate_sorted)
-         ):
-             fig.add_trace(go.Bar(x=[bucket], y=[rate], name=bucket))
-
-         fig.update_layout(
-             title_text=title,
-             xaxis_title="Rating Class",
-             yaxis_title="Probability of Default",
-             barmode="group",
-         )
-
-         return fig
-
-     def run(self):
-         target_column = self.inputs.model.train_ds.target_column
-         title = self.params["title"]
-         rating_classes = self.params["rating_classes"]
-
-         X_train = self.inputs.datasets[0].x.copy()
-         y_train = self.inputs.datasets[0].y.copy()
-         X_test = self.inputs.datasets[1].x.copy()
-         y_test = self.inputs.datasets[1].y.copy()
-
-         # Compute probabilities
-         X_train["probability"] = self.inputs.model.predict_proba(X_train)
-         X_test["probability"] = self.inputs.model.predict_proba(X_test)
-
-         df_train = pd.concat([X_train, y_train], axis=1)
-         df_test = pd.concat([X_test, y_test], axis=1)
-
-         fig_train = self.plot_bucket_analysis(
-             df_train,
-             "probability",
-             target_column,
-             title + " - Train Data",
-             rating_classes,
-         )
-         fig_test = self.plot_bucket_analysis(
-             df_test,
-             "probability",
-             target_column,
-             title + " - Test Data",
-             rating_classes,
-         )
-
-         return self.cache_results(
-             metric_value={
-                 "bucket_analysis": {
-                     "train_probs": list(X_train["probability"]),
-                     "test_probs": list(X_test["probability"]),
-                 },
-             },
-             figures=[
-                 Figure(
-                     for_object=self,
-                     key="bucket_analysis_train",
-                     figure=fig_train,
-                 ),
-                 Figure(
-                     for_object=self,
-                     key="bucket_analysis_test",
-                     figure=fig_test,
-                 ),
-             ],
-         )
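For context, the bucketing logic at the heart of the removed PDRatingClassPlot metric reduces to roughly the standalone sketch below; the function name and the probability/default column arguments are illustrative, not part of the ValidMind API.

import pandas as pd
import plotly.graph_objects as go

def pd_by_rating_class(df, prob_col, target_col, rating_classes):
    # Bin predicted default probabilities into equal-width buckets, one per rating class
    df = df.copy()
    df["bucket"] = pd.cut(
        df[prob_col], bins=len(rating_classes), labels=rating_classes, right=False
    )
    # Observed default rate within each bucket
    default_rate = df.groupby("bucket", observed=False)[target_col].mean()

    fig = go.Figure(go.Bar(x=default_rate.index.astype(str), y=default_rate.values))
    fig.update_layout(
        title_text="PD by Rating Class",
        xaxis_title="Rating Class",
        yaxis_title="Probability of Default",
    )
    return fig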
validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py (deleted)
@@ -1,146 +0,0 @@
- # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
- # See the LICENSE file in the root of this repository for details.
- # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
- from dataclasses import dataclass
-
- import pandas as pd
- from sklearn.metrics import mean_squared_error, r2_score
-
- from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
-
- from .statsutils import adj_r2_score
-
-
- @dataclass
- class RegressionModelInsampleComparison(Metric):
-     """
-     Evaluates and compares in-sample performance of multiple regression models using R-Squared, Adjusted R-Squared,
-     MSE, and RMSE.
-
-     **Purpose**: The RegressionModelInsampleComparison test metric is utilized to evaluate and compare the performance
-     of multiple regression models trained on the same dataset. Key performance indicators for this comparison include
-     statistics related to the goodness of fit - R-Squared, Adjusted R-Squared, Mean Squared Error (MSE), and Root Mean
-     Squared Error (RMSE).
-
-     **Test Mechanism**: The methodology behind this test is as follows -
-     - Firstly, a verification that the list of models to be tested is indeed not empty occurs.
-     - Once confirmed, the In-Sample performance of the models is calculated by a private function,
-     `_in_sample_performance_ols`, that executes the following steps:
-     - Iterates through each model in the supplied list.
-     - For each model, the function extracts the features (`X`) and the target (`y_true`) from the training dataset
-     and computes the predicted target values (`y_pred`).
-     - The performance metrics for the model are calculated using formulas for R-Squared, Adjusted R-Squared, MSE, and
-     RMSE.
-     - The results, including the computed metrics, variables of the model, and the model's identifier, are stored in
-     a dictionary that is appended to a list.
-     - The collected results are finally returned as a pandas dataframe.
-
-     **Signs of High Risk**:
-     - Significantly low values for R-Squared or Adjusted R-Squared.
-     - Significantly high values for MSE and RMSE.
-     Please note that what constitutes as "low" or "high" will vary based on the specific context or domain in which the
-     model is being utilized.
-
-     **Strengths**:
-     - Enables comparison of in-sample performance across different models on the same dataset, providing insights into
-     which model fits the data the best.
-     - Utilizes multiple evaluation methods (R-Squared, Adjusted R-Squared, MSE, RMSE), offering a comprehensive review
-     of a model's performance.
-
-     **Limitations**:
-     - The test measures only in-sample performance, i.e., how well a model fits the data it was trained on. However, it
-     does not give any information on the performance of the model on new, unseen, or out-of-sample data.
-     - Higher in-sample performance might be a result of overfitting, where the model is just memorizing the training
-     data. This test is sensitive to such cases.
-     - The test does not consider additional key factors such as the temporal dynamics of the data, that is, the pattern
-     of changes in data over time.
-     - The test does not provide an automated mechanism to determine if the reported metrics are within acceptable
-     ranges, necessitating human judgment.
-     """
-
-     name = "regression_insample_performance"
-     required_inputs = ["model", "dataset"]
-     metadata = {
-         "task_types": ["regression"],
-         "tags": ["model_comparison"],
-     }
-
-     def run(self):
-         # Check models list is not empty
-         if not self.inputs.models:
-             raise ValueError("List of models must be provided in the models parameter")
-         all_models = []
-
-         if self.inputs.models is not None:
-             all_models.extend(self.inputs.models)
-
-         in_sample_performance = self._in_sample_performance_ols(
-             all_models, self.inputs.dataset
-         )
-         in_sample_performance_df = pd.DataFrame(in_sample_performance)
-
-         return self.cache_results(
-             {
-                 "in_sample_performance": in_sample_performance_df.to_dict(
-                     orient="records"
-                 ),
-             }
-         )
-
-     def _in_sample_performance_ols(self, models, dataset):
-         """
-         Computes the in-sample performance evaluation metrics for a list of OLS models.
-         Args:
-             models (list): A list of statsmodels OLS models.
-         Returns:
-             list: A list of dictionaries containing the evaluation results for each model.
-             Each dictionary contains the following keys:
-             - 'Model': A string identifying the model.
-             - 'Independent Variables': A list of strings identifying the independent variables used in the model.
-             - 'R-Squared': The R-squared value of the model.
-             - 'Adjusted R-Squared': The adjusted R-squared value of the model.
-             - 'MSE': The mean squared error of the model.
-             - 'RMSE': The root mean squared error of the model.
-         """
-         evaluation_results = []
-
-         for i, model in enumerate(models):
-             X_columns = dataset.feature_columns
-             y_true = dataset.y
-             y_pred = dataset.y_pred(model)
-
-             # Extract R-squared and Adjusted R-squared
-             r2 = r2_score(y_true, y_pred)
-             adj_r2 = adj_r2_score(y_true, y_pred, len(y_true), len(X_columns))
-             mse = mean_squared_error(y_true=y_true, y_pred=y_pred, squared=True)
-             rmse = mean_squared_error(y_true=y_true, y_pred=y_pred, squared=False)
-
-             # Append the results to the evaluation_results list
-             evaluation_results.append(
-                 {
-                     "Model": f"Model {i + 1}",
-                     "Independent Variables": X_columns,
-                     "R-Squared": r2,
-                     "Adjusted R-Squared": adj_r2,
-                     "MSE": mse,
-                     "RMSE": rmse,
-                 }
-             )
-
-         return evaluation_results
-
-     def summary(self, metric_value):
-         """
-         Build one table for summarizing the in-sample performance results
-         """
-         summary_in_sample_performance = metric_value["in_sample_performance"]
-
-         return ResultSummary(
-             results=[
-                 ResultTable(
-                     data=summary_in_sample_performance,
-                     metadata=ResultTableMetadata(title="In-Sample Performance Results"),
-                 ),
-             ]
-         )
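The per-model computation in the removed in-sample comparison amounts to the minimal sketch below; the `adj_r2_score` helper imported from `.statsutils` is replaced here by the textbook adjusted R-squared formula, which is an assumption about its behavior.

import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

def in_sample_metrics(y_true, y_pred, n_features):
    # R-squared, adjusted R-squared, MSE and RMSE for one fitted regression model
    n = len(y_true)
    r2 = r2_score(y_true, y_pred)
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - n_features - 1)  # assumed equivalent to adj_r2_score
    mse = mean_squared_error(y_true, y_pred)
    return {"R-Squared": r2, "Adjusted R-Squared": adj_r2, "MSE": mse, "RMSE": np.sqrt(mse)}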
validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py (deleted)
@@ -1,144 +0,0 @@
- # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
- # See the LICENSE file in the root of this repository for details.
- # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
- from dataclasses import dataclass
-
- import numpy as np
- import pandas as pd
-
- from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
-
-
- @dataclass
- class RegressionModelOutsampleComparison(Metric):
-     """
-     Computes MSE and RMSE for multiple regression models using out-of-sample test to assess model's prediction accuracy
-     on unseen data.
-
-     **Purpose**: The RegressionModelOutsampleComparison test is designed to evaluate the predictive performance of
-     multiple regression models by means of an out-of-sample test. The primary aim of this test is to validate the
-     model's ability to generalize to unseen data, a common challenge in the context of overfitting. It does this by
-     computing two critical metrics — Mean Squared Error (MSE) and Root Mean Squared Error (RMSE), which provide a
-     quantifiable measure of the model's prediction accuracy on the testing dataset.
-
-     **Test Mechanism**: This test requires multiple models (specifically Ordinary Least Squares - OLS regression
-     models) and a test dataset as inputs. Each model generates predictions using the test dataset. The residuals are
-     then calculated and used to compute the MSE and RMSE for each model. The test outcomes, which include the model's
-     name, its MSE, and RMSE, are recorded and returned in a structured dataframe format.
-
-     **Signs of High Risk**:
-     - High values of MSE or RMSE indicate significant risk, signifying that the model's predictions considerably
-     deviate from the actual values in the test dataset.
-     - Consistently large discrepancies between training and testing performance across various models may indicate an
-     issue with the input data itself or the model selection strategies employed.
-
-     **Strengths**:
-     - This test offers a comparative evaluation of multiple models' out-of-sample performance, enabling the selection
-     of the best performing model.
-     - The use of both MSE and RMSE provides insights into the model's prediction error. While MSE is sensitive to
-     outliers, emphasizing larger errors, RMSE provides a more interpretable measure of average prediction error given
-     that it's in the same unit as the dependent variable.
-
-     **Limitations**:
-     - The applicability of this test is limited to regression tasks, specifically OLS models.
-     - The test operates under the assumption that the test dataset is a representative sample of the population. This
-     might not always hold true and can result in less accurate insights.
-     - The interpretability and the objectivity of the output (MSE and RMSE) can be influenced when the scale of the
-     dependent variable varies significantly, or the distribution of residuals is heavily skewed or contains outliers.
-     """
-
-     name = "regression_outsample_performance"
-     required_inputs = ["model", "dataset"]
-     metadata = {
-         "task_types": ["regression"],
-         "tags": ["model_comparison"],
-     }
-
-     def run(self):
-         # Check models list is not empty
-         if not self.inputs.models:
-             raise ValueError("List of models must be provided in the models parameter")
-         all_models = []
-         if self.inputs.model is not None:
-             all_models.append(self.inputs.model)
-
-         if self.inputs.models is not None:
-             all_models.extend(self.inputs.models)
-
-         for model in all_models:
-             if model.test_ds is None:
-                 raise ValueError(
-                     "Test dataset is missing in the ValidMind Model object"
-                 )
-
-         results = self._out_sample_performance_ols(all_models, self.inputs.dataset)
-         return self.cache_results(
-             {
-                 "out_sample_performance": results.to_dict(orient="records"),
-             }
-         )
-
-     def _out_sample_performance_ols(self, model_list, dataset):
-         """
-         Returns the out-of-sample performance evaluation metrics of a list of OLS regression models.
-         Args:
-             model_list (list): A list of OLS models to evaluate.
-             test_data (pandas.DataFrame): The test dataset containing the independent and dependent variables.
-             target_col (str): The name of the target variable column in the test dataset.
-         Returns:
-             pandas.DataFrame: A DataFrame containing the evaluation results of the OLS models. The columns are 'Model',
-             'MSE' (Mean Squared Error), and 'RMSE' (Root Mean Squared Error).
-         """
-
-         # Initialize a list to store results
-         results = []
-
-         for fitted_model in model_list:
-             # Extract the column names of the independent variables from the model
-             independent_vars = dataset.feature_columns
-
-             # Separate the target variable and features in the test dataset
-             y_test = dataset.y
-
-             # Predict the test data
-             y_pred = dataset.y_pred(fitted_model)
-
-             # Calculate the residuals
-             residuals = y_test - y_pred
-
-             # Calculate the mean squared error and root mean squared error
-             mse = np.mean(residuals**2)
-             rmse_val = np.sqrt(mse)
-
-             # Store the results
-             model_name_with_vars = f"({', '.join(independent_vars)})"
-             results.append(
-                 {
-                     "Model": model_name_with_vars,
-                     "MSE": mse,
-                     "RMSE": rmse_val,
-                 }
-             )
-
-         # Create a DataFrame to display the results
-         results_df = pd.DataFrame(results)
-
-         return results_df
-
-     def summary(self, metric_value):
-         """
-         Build one table for summarizing the out-of-sample performance results
-         """
-         summary_out_sample_performance = metric_value["out_sample_performance"]
-
-         return ResultSummary(
-             results=[
-                 ResultTable(
-                     data=summary_out_sample_performance,
-                     metadata=ResultTableMetadata(
-                         title="Out-of-Sample Performance Results"
-                     ),
-                 ),
-             ]
-         )
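The out-of-sample comparison boils down to residual-based error metrics per model. A minimal sketch, assuming each dataset object exposes the `y`, `y_pred(model)`, and `feature_columns` interface used in the removed code:

import numpy as np
import pandas as pd

def out_of_sample_comparison(models, dataset):
    # One row of MSE/RMSE per fitted model, all evaluated on the same test dataset
    rows = []
    for model in models:
        residuals = np.asarray(dataset.y) - np.asarray(dataset.y_pred(model))
        mse = np.mean(residuals**2)
        rows.append(
            {
                "Model": f"({', '.join(dataset.feature_columns)})",
                "MSE": mse,
                "RMSE": np.sqrt(mse),
            }
        )
    return pd.DataFrame(rows)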
validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py (deleted)
@@ -1,127 +0,0 @@
- # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
- # See the LICENSE file in the root of this repository for details.
- # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
- from dataclasses import dataclass
-
- from sklearn.metrics import mean_squared_error, r2_score
-
- from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
-
-
- @dataclass
- class RegressionModelsPerformance(Metric):
-     """
-     Evaluates and compares regression models' performance using R-squared, Adjusted R-squared, and MSE metrics.
-
-     **Purpose**: This metric is used to evaluate and compare the performance of various regression models. Through the
-     use of key statistical measures such as R-squared, Adjusted R-squared, and Mean Squared Error (MSE), the
-     performance of different models in predicting dependent variables can be assessed both on the data used for
-     training (in-sample) and new, unseen data (out-of-sample).
-
-     **Test Mechanism**: The test evaluates a list of provided regression models. For each model, it calculates their
-     in-sample and out-of-sample performance by deriving the model predictions for the training and testing datasets
-     respectively, and then comparing these predictions to the actual values. In doing so, it calculates R-squared,
-     Adjusted R-squared, and MSE for each model, stores the results, and returns them for comparison.
-
-     **Signs of High Risk**:
-     - High Mean Squared Error (MSE) values.
-     - Strikingly low values of R-squared and Adjusted R-squared.
-     - A significant drop in performance when transitioning from in-sample to out-of-sample evaluations, signaling a
-     potential overfitting issue.
-
-     **Strengths**:
-     - The test permits comparisons of multiple models simultaneously, providing an objective base for identifying the
-     top-performing model.
-     - It delivers both in-sample and out-of-sample evaluations, presenting performance data on unseen data.
-     - The utilization of R-squared and Adjusted R-squared in conjunction with MSE allows for a detailed view of the
-     model's explainability and error rate.
-
-     **Limitations**:
-     - This test is built around the assumption that the residuals of the regression model are normally distributed,
-     which is a fundamental requirement for Ordinary Least Squares (OLS) regression; thus, it could be not suitable for
-     models where this assumption is broken.
-     - The test does not consider cases where higher R-squared or lower MSE values do not necessarily correlate with
-     better predictive performance, particularly in instances of excessively complex models.
-     """
-
-     name = "regression_models_performance"
-     required_inputs = ["models", "in_sample_datasets", "out_of_sample_datasets"]
-     metadata = {
-         "task_types": ["regression"],
-         "tags": ["model_performance", "model_comparison"],
-     }
-
-     def run(self):
-         # Check models list is not empty
-         if not self.inputs.models:
-             raise ValueError("List of models must be provided in the models parameter")
-
-         all_models = []
-
-         if self.inputs.models is not None:
-             all_models.extend(self.inputs.models)
-
-         in_sample_results = self.sample_performance_ols(
-             self.inputs.models, self.inputs.in_sample_datasets
-         )
-         out_of_sample_results = self.sample_performance_ols(
-             self.inputs.models, self.inputs.out_of_sample_datasets
-         )
-
-         return self.cache_results(
-             {
-                 "in_sample_performance": in_sample_results,
-                 "out_of_sample_performance": out_of_sample_results,
-             }
-         )
-
-     def sample_performance_ols(self, models, datasets):
-         evaluation_results = []
-
-         for model, dataset in zip(models, datasets):
-             X_columns = dataset.feature_columns
-             y_true = dataset.y
-             y_pred = dataset.y_pred(model)
-
-             # Extract R-squared and Adjusted R-squared
-             r2 = r2_score(y_true, y_pred)
-             mse = mean_squared_error(y_true, y_pred)
-             adj_r2 = 1 - ((1 - r2) * (len(y_true) - 1)) / (
-                 len(y_true) - len(X_columns) - 1
-             )
-
-             # Append the results to the evaluation_results list
-             evaluation_results.append(
-                 {
-                     "Model": model.input_id,
-                     "Independent Variables": X_columns,
-                     "R-Squared": r2,
-                     "Adjusted R-Squared": adj_r2,
-                     "MSE": mse,
-                 }
-             )
-
-         return evaluation_results
-
-     def summary(self, metric_value):
-         """
-         Build a table for summarizing the in-sample and out-of-sample performance results
-         """
-         summary_in_sample_performance = metric_value["in_sample_performance"]
-         summary_out_of_sample_performance = metric_value["out_of_sample_performance"]
-
-         return ResultSummary(
-             results=[
-                 ResultTable(
-                     data=summary_in_sample_performance,
-                     metadata=ResultTableMetadata(title="In-Sample Performance Results"),
-                 ),
-                 ResultTable(
-                     data=summary_out_of_sample_performance,
-                     metadata=ResultTableMetadata(
-                         title="Out-of-Sample Performance Results"
-                     ),
-                 ),
-             ]
-         )
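For reference, the in-sample versus out-of-sample table produced by the removed RegressionModelsPerformance metric can be approximated with the sketch below, again assuming the `y`, `y_pred(model)`, `feature_columns`, and `input_id` attributes shown in the diff; this is not the replacement ValidMind API.

import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

def performance_table(models, datasets):
    # Pair each model with its dataset and collect R-squared, adjusted R-squared, and MSE
    rows = []
    for model, dataset in zip(models, datasets):
        y_true, y_pred = dataset.y, dataset.y_pred(model)
        n, k = len(y_true), len(dataset.feature_columns)
        r2 = r2_score(y_true, y_pred)
        rows.append(
            {
                "Model": model.input_id,
                "R-Squared": r2,
                "Adjusted R-Squared": 1 - (1 - r2) * (n - 1) / (n - k - 1),
                "MSE": mean_squared_error(y_true, y_pred),
            }
        )
    return pd.DataFrame(rows)

Calling this twice, once with the in-sample and once with the out-of-sample datasets, yields the two summary tables the removed metric reported.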