validmind 2.0.7__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. validmind/__init__.py +3 -3
  2. validmind/__version__.py +1 -1
  3. validmind/ai.py +7 -11
  4. validmind/api_client.py +29 -27
  5. validmind/client.py +10 -3
  6. validmind/datasets/credit_risk/__init__.py +11 -0
  7. validmind/datasets/credit_risk/datasets/lending_club_loan_data_2007_2014_clean.csv.gz +0 -0
  8. validmind/datasets/credit_risk/lending_club.py +394 -0
  9. validmind/logging.py +9 -2
  10. validmind/template.py +2 -2
  11. validmind/test_suites/__init__.py +4 -2
  12. validmind/tests/__init__.py +97 -50
  13. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +3 -1
  14. validmind/tests/data_validation/PiTCreditScoresHistogram.py +1 -1
  15. validmind/tests/data_validation/ScatterPlot.py +8 -2
  16. validmind/tests/decorator.py +138 -14
  17. validmind/tests/model_validation/BertScore.py +1 -1
  18. validmind/tests/model_validation/BertScoreAggregate.py +1 -1
  19. validmind/tests/model_validation/BleuScore.py +1 -1
  20. validmind/tests/model_validation/ClusterSizeDistribution.py +1 -1
  21. validmind/tests/model_validation/ContextualRecall.py +1 -1
  22. validmind/tests/model_validation/FeaturesAUC.py +110 -0
  23. validmind/tests/model_validation/MeteorScore.py +1 -1
  24. validmind/tests/model_validation/RegardHistogram.py +1 -1
  25. validmind/tests/model_validation/RegardScore.py +1 -1
  26. validmind/tests/model_validation/RegressionResidualsPlot.py +127 -0
  27. validmind/tests/model_validation/RougeMetrics.py +1 -1
  28. validmind/tests/model_validation/RougeMetricsAggregate.py +1 -1
  29. validmind/tests/model_validation/SelfCheckNLIScore.py +1 -1
  30. validmind/tests/model_validation/TokenDisparity.py +1 -1
  31. validmind/tests/model_validation/ToxicityHistogram.py +1 -1
  32. validmind/tests/model_validation/ToxicityScore.py +1 -1
  33. validmind/tests/model_validation/embeddings/ClusterDistribution.py +1 -1
  34. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +1 -3
  35. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +1 -1
  36. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +1 -1
  37. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +15 -18
  38. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +1 -1
  39. validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
  40. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +21 -3
  41. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +1 -1
  42. validmind/tests/model_validation/sklearn/MinimumF1Score.py +1 -1
  43. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +1 -1
  44. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +5 -4
  45. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +2 -2
  46. validmind/tests/model_validation/sklearn/ROCCurve.py +6 -12
  47. validmind/tests/model_validation/sklearn/RegressionErrors.py +2 -2
  48. validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +6 -4
  49. validmind/tests/model_validation/sklearn/RegressionR2Square.py +2 -2
  50. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +27 -3
  51. validmind/tests/model_validation/sklearn/SilhouettePlot.py +1 -1
  52. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +2 -2
  53. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +2 -2
  54. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +140 -0
  55. validmind/tests/model_validation/statsmodels/GINITable.py +22 -45
  56. validmind/tests/model_validation/statsmodels/{LogisticRegPredictionHistogram.py → PredictionProbabilitiesHistogram.py} +67 -92
  57. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +2 -2
  58. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +2 -2
  59. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
  60. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
  61. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
  62. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
  63. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +128 -0
  64. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +70 -103
  65. validmind/tests/test_providers.py +14 -124
  66. validmind/unit_metrics/__init__.py +76 -69
  67. validmind/unit_metrics/classification/sklearn/Accuracy.py +14 -0
  68. validmind/unit_metrics/classification/sklearn/F1.py +13 -0
  69. validmind/unit_metrics/classification/sklearn/Precision.py +13 -0
  70. validmind/unit_metrics/classification/sklearn/ROC_AUC.py +13 -0
  71. validmind/unit_metrics/classification/sklearn/Recall.py +13 -0
  72. validmind/unit_metrics/composite.py +24 -71
  73. validmind/unit_metrics/regression/GiniCoefficient.py +20 -26
  74. validmind/unit_metrics/regression/HuberLoss.py +12 -16
  75. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +18 -24
  76. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +7 -13
  77. validmind/unit_metrics/regression/MeanBiasDeviation.py +5 -14
  78. validmind/unit_metrics/regression/QuantileLoss.py +6 -16
  79. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +12 -18
  80. validmind/unit_metrics/regression/sklearn/MeanAbsoluteError.py +6 -15
  81. validmind/unit_metrics/regression/sklearn/MeanSquaredError.py +5 -14
  82. validmind/unit_metrics/regression/sklearn/RSquaredScore.py +6 -15
  83. validmind/unit_metrics/regression/sklearn/RootMeanSquaredError.py +11 -14
  84. validmind/utils.py +18 -45
  85. validmind/vm_models/__init__.py +0 -2
  86. validmind/vm_models/dataset.py +255 -16
  87. validmind/vm_models/test/metric.py +1 -2
  88. validmind/vm_models/test/result_wrapper.py +12 -13
  89. validmind/vm_models/test/test.py +2 -1
  90. validmind/vm_models/test/threshold_test.py +1 -2
  91. validmind/vm_models/test_suite/summary.py +3 -3
  92. validmind/vm_models/test_suite/test_suite.py +2 -1
  93. {validmind-2.0.7.dist-info → validmind-2.1.0.dist-info}/METADATA +10 -6
  94. {validmind-2.0.7.dist-info → validmind-2.1.0.dist-info}/RECORD +97 -96
  95. validmind/tests/__types__.py +0 -62
  96. validmind/tests/model_validation/statsmodels/LogRegressionConfusionMatrix.py +0 -128
  97. validmind/tests/model_validation/statsmodels/LogisticRegCumulativeProb.py +0 -172
  98. validmind/tests/model_validation/statsmodels/ScorecardBucketHistogram.py +0 -181
  99. validmind/tests/model_validation/statsmodels/ScorecardProbabilitiesHistogram.py +0 -175
  100. validmind/unit_metrics/sklearn/classification/Accuracy.py +0 -22
  101. validmind/unit_metrics/sklearn/classification/F1.py +0 -24
  102. validmind/unit_metrics/sklearn/classification/Precision.py +0 -24
  103. validmind/unit_metrics/sklearn/classification/ROC_AUC.py +0 -22
  104. validmind/unit_metrics/sklearn/classification/Recall.py +0 -22
  105. validmind/vm_models/test/unit_metric.py +0 -88
  106. {validmind-2.0.7.dist-info → validmind-2.1.0.dist-info}/LICENSE +0 -0
  107. {validmind-2.0.7.dist-info → validmind-2.1.0.dist-info}/WHEEL +0 -0
  108. {validmind-2.0.7.dist-info → validmind-2.1.0.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/FeaturesAUC.py
@@ -0,0 +1,110 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from dataclasses import dataclass
+
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+from sklearn.metrics import roc_auc_score
+
+from validmind.errors import SkipTestError
+from validmind.logging import get_logger
+from validmind.vm_models import Figure, Metric
+
+logger = get_logger(__name__)
+
+
+@dataclass
+class FeaturesAUC(Metric):
+    """
+    Evaluates the discriminatory power of each individual feature within a binary classification model by calculating the Area Under the Curve (AUC) for each feature separately.
+
+    **Purpose**: The central objective of this metric is to quantify how well each feature on its own can differentiate between the two classes in a binary classification problem. It serves as a univariate analysis tool that can help in pre-modeling feature selection or post-modeling interpretation.
+
+    **Test Mechanism**: For each feature, the metric treats the feature values as raw scores to compute the AUC against the actual binary outcomes. It provides an AUC value for each feature, offering a simple yet powerful indication of each feature's univariate classification strength.
+
+    **Signs of High Risk**:
+    - A feature with a low AUC score may not be contributing significantly to the differentiation between the two classes, which could be a concern if it is expected to be predictive.
+    - Conversely, a surprisingly high AUC for a feature not believed to be informative may suggest data leakage or other issues with the data.
+
+    **Strengths**:
+    - By isolating each feature, it highlights the individual contribution of features to the classification task without the influence of other variables.
+    - Useful for both initial feature evaluation and for providing insights into the model's reliance on individual features after model training.
+
+    **Limitations**:
+    - Does not reflect the combined effects of features or any interaction between them, which can be critical in certain models.
+    - The AUC values are calculated without considering the model's use of the features, which could lead to different interpretations of feature importance when considering the model holistically.
+    - This metric is applicable only to binary classification tasks and cannot be directly extended to multiclass classification or regression without modifications.
+    """
+
+    name = "features_auc"
+    required_inputs = ["model", "dataset"]
+    default_params = {
+        "fontsize": 12,
+        "figure_height": 500,
+    }
+    metadata = {
+        "task_types": ["classification"],
+        "tags": [
+            "feature_importance",
+            "AUC",
+            "visualization",
+        ],
+    }
+
+    def run(self):
+        x = self.inputs.dataset.x_df()
+        y = self.inputs.dataset.y_df()
+
+        if y.nunique() != 2:
+            raise SkipTestError("FeaturesAUC metric requires a binary target variable.")
+
+        aucs = pd.DataFrame(index=x.columns, columns=["AUC"])
+
+        for column in x.columns:
+            feature_values = x[column]
+            if feature_values.nunique() > 1:
+                auc_score = roc_auc_score(y, feature_values)
+                aucs.loc[column, "AUC"] = auc_score
+            else:
+                aucs.loc[
+                    column, "AUC"
+                ] = np.nan  # Not enough unique values to calculate AUC
+
+        # Sorting the AUC scores in descending order
+        sorted_indices = aucs["AUC"].dropna().sort_values(ascending=False).index
+
+        # Plotting the results
+        fig = go.Figure()
+        fig.add_trace(
+            go.Bar(
+                y=[column for column in sorted_indices],
+                x=[aucs.loc[column, "AUC"] for column in sorted_indices],
+                orientation="h",
+            )
+        )
+        fig.update_layout(
+            title_text="Feature AUC Scores",
+            yaxis=dict(
+                tickmode="linear",
+                dtick=1,
+                tickfont=dict(size=self.params["fontsize"]),
+                title="Features",
+                autorange="reversed",  # Ensure that the highest AUC is at the top
+            ),
+            xaxis=dict(title="AUC"),
+            height=self.params["figure_height"],
+        )
+
+        return self.cache_results(
+            metric_value=aucs.to_dict(),
+            figures=[
+                Figure(
+                    for_object=self,
+                    key="features_auc",
+                    figure=fig,
+                ),
+            ],
+        )
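Note: the new FeaturesAUC metric above scores each raw feature column against the binary target with roc_auc_score. The following is a minimal standalone sketch of that core loop using only pandas and scikit-learn; the synthetic dataset and variable names are illustrative and are not part of the validmind API.

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score

# Illustrative data: any DataFrame with a binary target works the same way
X, y = make_classification(n_samples=500, n_features=5, random_state=0)
x_df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(5)])

# Mirror of the metric's core loop: treat each feature's raw values as scores
feature_aucs = {
    col: roc_auc_score(y, x_df[col])
    for col in x_df.columns
    if x_df[col].nunique() > 1  # skip constant columns, as the metric does
}
print(sorted(feature_aucs.items(), key=lambda kv: kv[1], reverse=True))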
validmind/tests/model_validation/MeteorScore.py
@@ -54,7 +54,7 @@ class MeteorScore(Metric):
         meteor_scores = []
 
         for prediction, reference in zip(
-            self.inputs.dataset.y_pred(self.inputs.model.input_id),
+            self.inputs.dataset.y_pred(self.inputs.model),
             self.inputs.dataset.y,
         ):
             # Compute the METEOR score for the current prediction-reference pair
validmind/tests/model_validation/RegardHistogram.py
@@ -57,7 +57,7 @@ class RegardHistogram(Metric):
            raise AttributeError("The 'model' attribute is missing.")
 
        y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-       y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+       y_pred = self.inputs.dataset.y_pred(self.inputs.model)
 
        if not len(y_true) == len(y_pred):
            raise ValueError(
validmind/tests/model_validation/RegardScore.py
@@ -58,7 +58,7 @@ class RegardScore(Metric):
            raise AttributeError("The 'model' attribute is missing.")
 
        y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-       y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+       y_pred = self.inputs.dataset.y_pred(self.inputs.model)
 
        if not len(y_true) == len(y_pred):
            raise ValueError(
validmind/tests/model_validation/RegressionResidualsPlot.py
@@ -0,0 +1,127 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from dataclasses import dataclass
+
+import numpy as np
+import plotly.figure_factory as ff
+import plotly.graph_objects as go
+
+from validmind.vm_models import Figure, Metric
+
+
+@dataclass
+class RegressionResidualsPlot(Metric):
+    """
+    Evaluates regression model performance using residual distribution and actual vs. predicted plots.
+
+    **Purpose:**
+    The `RegressionResidualsPlot` metric aims to evaluate the performance of regression models. By generating and
+    analyzing two plots – a distribution of residuals and a scatter plot of actual versus predicted values – this tool
+    helps to visually appraise how well the model predicts and the nature of errors it makes.
+
+    **Test Mechanism:**
+    The process begins by extracting the true output values (`y_true`) and the model's predicted values (`y_pred`).
+    Residuals are computed by subtracting predicted from true values. These residuals are then visualized using a
+    histogram to display their distribution. Additionally, a scatter plot is derived to compare true values against
+    predicted values, together with a "Perfect Fit" line, which represents an ideal match (predicted values equal
+    actual values), facilitating the assessment of the model's predictive accuracy.
+
+    **Signs of High Risk:**
+    - Residuals showing a non-normal distribution, especially those with frequent extreme values.
+    - Significant deviations of predicted values from actual values in the scatter plot.
+    - Sparse density of data points near the "Perfect Fit" line in the scatter plot, indicating poor prediction
+      accuracy.
+    - Visible patterns or trends in the residuals plot, suggesting the model's failure to capture the underlying data
+      structure adequately.
+
+    **Strengths:**
+    - Provides a direct, visually intuitive assessment of a regression model’s accuracy and handling of data.
+    - Visual plots can highlight issues of underfitting or overfitting.
+    - Can reveal systematic deviations or trends that purely numerical metrics might miss.
+    - Applicable across various regression model types.
+
+    **Limitations:**
+    - Relies on visual interpretation, which can be subjective and less precise than numerical evaluations.
+    - May be difficult to interpret in cases with multi-dimensional outputs due to the plots’ two-dimensional nature.
+    - Overlapping data points in the residuals plot can complicate interpretation efforts.
+    - Does not summarize model performance into a single quantifiable metric, which might be needed for comparative or
+      summary analyses.
+    """
+
+    name = "regression_residuals_plot"
+    required_inputs = ["model", "dataset"]
+    metadata = {
+        "task_types": ["regression"],
+        "tags": [
+            "model_performance",
+        ],
+    }
+    default_params = {"bin_size": 0.1}
+
+    def run(self):
+        y_true = self.inputs.dataset.y
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
+        # Calculate residuals
+        residuals = y_true.flatten() - y_pred.flatten()
+        # Create residuals plot
+        hist_data = [residuals]
+        group_labels = ["Residuals"]  # Names of the dataset
+        bin_size = self.params["bin_size"]
+        fig = ff.create_distplot(
+            hist_data, group_labels, bin_size=[bin_size], show_hist=True, show_rug=False
+        )
+        fig.update_layout(
+            title="Distribution of Residuals",
+            xaxis_title="Residuals",
+            yaxis_title="Density",
+        )
+        figures = [
+            Figure(
+                for_object=self,
+                key=self.key,
+                figure=fig,
+            )
+        ]
+        # Create a scatter plot of actual vs predicted values
+        scatter = go.Scatter(
+            x=y_true.flatten(),
+            y=y_pred.flatten(),
+            mode="markers",
+            name="True vs Predicted",
+            marker=dict(color="blue", opacity=0.5),
+        )
+
+        # Line of perfect prediction
+        max_val = np.nanmax([np.nanmax(y_true), np.nanmax(y_pred)])
+        min_val = np.nanmin([np.nanmin(y_true), np.nanmin(y_pred)])
+        line = go.Scatter(
+            x=[min_val, max_val],
+            y=[min_val, max_val],
+            mode="lines",
+            name="Perfect Fit",
+            line=dict(color="red", dash="dash"),
+        )
+
+        # Layout settings
+        layout = go.Layout(
+            title="True vs. Predicted Values",
+            xaxis_title="True Values",
+            yaxis_title="Predicted Values",
+            showlegend=True,
+        )
+
+        fig = go.Figure(data=[scatter, line], layout=layout)
+
+        figures.append(
+            Figure(
+                for_object=self,
+                key=self.key,
+                figure=fig,
+            )
+        )
+
+        return self.cache_results(
+            figures=figures,
+        )
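Note: RegressionResidualsPlot builds two figures from the same arrays — a residuals histogram and a true-vs-predicted scatter with a "Perfect Fit" reference line. A rough NumPy-only sketch of the underlying arithmetic follows; the synthetic targets and predictions are placeholders for real model output.

import numpy as np

rng = np.random.default_rng(0)
y_true = rng.normal(size=200)                       # placeholder targets
y_pred = y_true + rng.normal(scale=0.3, size=200)   # placeholder predictions

residuals = y_true - y_pred                         # same subtraction as the metric
min_val = min(y_true.min(), y_pred.min())           # endpoints of the "Perfect Fit" line
max_val = max(y_true.max(), y_pred.max())
print(residuals.mean(), residuals.std(), (min_val, max_val))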
validmind/tests/model_validation/RougeMetrics.py
@@ -85,7 +85,7 @@ class RougeMetrics(Metric):
            )
 
        y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-       y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+       y_pred = self.inputs.dataset.y_pred(self.inputs.model)
 
        rouge = Rouge(metrics=r_metrics)
 
validmind/tests/model_validation/RougeMetricsAggregate.py
@@ -81,7 +81,7 @@ class RougeMetricsAggregate(Metric):
            )
 
        y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-       y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+       y_pred = self.inputs.dataset.y_pred(self.inputs.model)
 
        rouge = Rouge(metrics=r_metrics)
 
validmind/tests/model_validation/SelfCheckNLIScore.py
@@ -45,7 +45,7 @@ class SelfCheckNLIScore(Metric):
    def run(self):
        # Assuming the dataset is structured with generated sentences and reference samples
        y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-       y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+       y_pred = self.inputs.dataset.y_pred(self.inputs.model)
 
        hallucination_scores = self.compute_hallucination_scores(y_pred, y_true)
 
validmind/tests/model_validation/TokenDisparity.py
@@ -62,7 +62,7 @@ class TokenDisparity(Metric):
 
    def run(self):
        y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-       y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+       y_pred = self.inputs.dataset.y_pred(self.inputs.model)
 
        df = pd.DataFrame({"reference_column": y_true, "generated_column": y_pred})
 
validmind/tests/model_validation/ToxicityHistogram.py
@@ -57,7 +57,7 @@ class ToxicityHistogram(Metric):
            raise AttributeError("The 'model' attribute is missing.")
 
        y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-       y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+       y_pred = self.inputs.dataset.y_pred(self.inputs.model)
        input_text = self.inputs.dataset.df[self.inputs.dataset.text_column]
 
        # Ensure consistency in lengths
validmind/tests/model_validation/ToxicityScore.py
@@ -60,7 +60,7 @@ class ToxicityScore(Metric):
            raise AttributeError("The 'model' attribute is missing.")
 
        y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-       y_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+       y_pred = self.inputs.dataset.y_pred(self.inputs.model)
        input_text = self.inputs.dataset.df[self.inputs.dataset.text_column]
 
        # Ensure consistency in lengths
validmind/tests/model_validation/embeddings/ClusterDistribution.py
@@ -59,7 +59,7 @@ class ClusterDistribution(Metric):
    def run(self):
        # run kmeans clustering on embeddings
        kmeans = KMeans(n_clusters=self.params["num_clusters"]).fit(
-           self.inputs.dataset.y_pred(self.inputs.model.input_id)
+           self.inputs.dataset.y_pred(self.inputs.model)
        )
 
        # plot the distribution
validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py
@@ -57,9 +57,7 @@ class CosineSimilarityDistribution(Metric):
 
    def run(self):
        # Compute cosine similarity
-       similarities = cosine_similarity(
-           self.inputs.dataset.y_pred(self.inputs.model.input_id)
-       )
+       similarities = cosine_similarity(self.inputs.dataset.y_pred(self.inputs.model))
 
        # plot the distribution
        fig = px.histogram(
validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py
@@ -60,7 +60,7 @@ class DescriptiveAnalytics(Metric):
 
    def run(self):
        # Assuming y_pred returns a 2D array of embeddings [samples, features]
-       preds = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+       preds = self.inputs.dataset.y_pred(self.inputs.model)
 
        # Calculate statistics across the embedding dimensions, not across all embeddings
        means = np.mean(preds, axis=0)  # Mean of each feature across all samples
validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py
@@ -67,7 +67,7 @@ class EmbeddingsVisualization2D(Metric):
        )
 
        # use TSNE to reduce dimensionality of embeddings
-       num_samples = len(self.inputs.dataset.y_pred(self.inputs.model.input_id))
+       num_samples = len(self.inputs.dataset.y_pred(self.inputs.model))
 
        if self.params["perplexity"] >= num_samples:
            perplexity = num_samples - 1
validmind/tests/model_validation/sklearn/ClassifierPerformance.py
@@ -5,17 +5,17 @@
 from dataclasses import dataclass
 
 from numpy import unique
-from sklearn import metrics, preprocessing
+from sklearn.metrics import classification_report, roc_auc_score
+from sklearn.preprocessing import LabelBinarizer
 
 from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
 
 
 def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
-    lb = preprocessing.LabelBinarizer()
+    lb = LabelBinarizer()
     lb.fit(y_test)
-    y_test = lb.transform(y_test)
-    y_pred = lb.transform(y_pred)
-    return metrics.roc_auc_score(y_test, y_pred, average=average)
+
+    return roc_auc_score(lb.transform(y_test), lb.transform(y_pred), average=average)
 
 
 @dataclass
@@ -73,7 +73,7 @@ class ClassifierPerformance(Metric):
        When building a multi-class summary we need to calculate weighted average,
        macro average and per class metrics.
        """
-       classes = {str(i) for i in unique(self.y_true())}
+       classes = {str(i) for i in unique(self.inputs.dataset.y)}
        pr_f1_table = [
            {
                "Class": class_name,
@@ -121,19 +121,16 @@ class ClassifierPerformance(Metric):
            ]
        )
 
-    def y_true(self):
-        return self.inputs.dataset.y
-
-    def y_pred(self):
-        return self.inputs.dataset.y_pred(model_id=self.inputs.model.input_id)
-
    def run(self):
-        y_true = self.y_true()
-        class_pred = self.y_pred()
-
-        report = metrics.classification_report(
-            y_true, class_pred, output_dict=True, zero_division=0
+        report = classification_report(
+            self.inputs.dataset.y,
+            self.inputs.dataset.y_pred(self.inputs.model),
+            output_dict=True,
+            zero_division=0,
+        )
+        report["roc_auc"] = multiclass_roc_auc_score(
+            self.inputs.dataset.y,
+            self.inputs.dataset.y_pred(self.inputs.model),
        )
-        report["roc_auc"] = multiclass_roc_auc_score(y_true, class_pred)
 
        return self.cache_results(report)
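Note: the refactored multiclass_roc_auc_score helper above binarizes both label vectors before scoring. A self-contained scikit-learn sketch of the same idea, with toy labels standing in for validmind inputs:

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer

y_test = ["cat", "dog", "bird", "dog", "cat", "bird"]
y_pred = ["cat", "dog", "bird", "cat", "cat", "bird"]

lb = LabelBinarizer()
lb.fit(y_test)

# One-vs-rest AUC over the binarized label matrices, macro-averaged
print(roc_auc_score(lb.transform(y_test), lb.transform(y_pred), average="macro"))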
validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py
@@ -67,7 +67,7 @@ class ClusterCosineSimilarity(Metric):
 
    def run(self):
        y_true_train = self.inputs.dataset.y
-       y_pred_train = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+       y_pred_train = self.inputs.dataset.y_pred(self.inputs.model)
        y_true_train = y_true_train.astype(y_pred_train.dtype).flatten()
        num_clusters = len(np.unique(y_pred_train))
        # Calculate cosine similarity for each cluster
validmind/tests/model_validation/sklearn/ClusterPerformance.py
@@ -101,11 +101,11 @@ class ClusterPerformance(Metric):
 
    def run(self):
        y_true_train = self.inputs.datasets[0].y
-       class_pred_train = self.inputs.datasets[0].y_pred(self.inputs.model.input_id)
+       class_pred_train = self.inputs.datasets[0].y_pred(self.inputs.model)
        y_true_train = y_true_train.astype(class_pred_train.dtype)
 
        y_true_test = self.inputs.datasets[1].y
-       class_pred_test = self.inputs.datasets[1].y_pred(self.inputs.model.input_id)
+       class_pred_test = self.inputs.datasets[1].y_pred(self.inputs.model)
        y_true_test = y_true_test.astype(class_pred_test.dtype)
 
        samples = ["train", "test"]
validmind/tests/model_validation/sklearn/ConfusionMatrix.py
@@ -72,15 +72,33 @@ class ConfusionMatrix(Metric):
        labels.sort()
        labels = np.array(labels).T.tolist()
 
-       class_pred = self.inputs.dataset.y_pred(model_id=self.inputs.model.input_id)
-       y_true = y_true.astype(class_pred.dtype)
-       cm = metrics.confusion_matrix(y_true, class_pred, labels=labels)
+       y_pred = self.inputs.dataset.y_pred(self.inputs.model)
+       y_true = y_true.astype(y_pred.dtype)
+
+       cm = metrics.confusion_matrix(y_true, y_pred, labels=labels)
+
+       text = None
+       if len(labels) == 2:
+           tn, fp, fn, tp = cm.ravel()
+
+           # Custom text to display on the heatmap cells
+           text = [
+               [
+                   f"<b>True Negatives (TN)</b><br />{tn}",
+                   f"<b>False Positives (FP)</b><br />{fp}",
+               ],
+               [
+                   f"<b>False Negatives (FN)</b><br />{fn}",
+                   f"<b>True Positives (TP)</b><br />{tp}",
+               ],
+           ]
 
        fig = ff.create_annotated_heatmap(
            z=cm,
            colorscale="Blues",
            x=labels,
            y=labels,
+           annotation_text=text,
        )
 
        fig["data"][0][
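Note: the ConfusionMatrix change adds TN/FP/FN/TP cell annotations for binary problems via the annotation_text argument. A minimal Plotly sketch of that call, using a hard-coded 2x2 matrix rather than real predictions:

import plotly.figure_factory as ff

cm = [[50, 5], [8, 37]]  # [[TN, FP], [FN, TP]] for an illustrative binary model
(tn, fp), (fn, tp) = cm

text = [
    [f"<b>True Negatives (TN)</b><br />{tn}", f"<b>False Positives (FP)</b><br />{fp}"],
    [f"<b>False Negatives (FN)</b><br />{fn}", f"<b>True Positives (TP)</b><br />{tp}"],
]

fig = ff.create_annotated_heatmap(
    z=cm, x=["0", "1"], y=["0", "1"], colorscale="Blues", annotation_text=text
)
fig.show()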
validmind/tests/model_validation/sklearn/MinimumAccuracy.py
@@ -96,7 +96,7 @@ class MinimumAccuracy(ThresholdTest):
 
    def run(self):
        y_true = self.inputs.dataset.y
-       class_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+       class_pred = self.inputs.dataset.y_pred(self.inputs.model)
        y_true = y_true.astype(class_pred.dtype)
 
        accuracy_score = metrics.accuracy_score(y_true, class_pred)
validmind/tests/model_validation/sklearn/MinimumF1Score.py
@@ -97,7 +97,7 @@ class MinimumF1Score(ThresholdTest):
 
    def run(self):
        y_true = self.inputs.dataset.y
-       class_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+       class_pred = self.inputs.dataset.y_pred(self.inputs.model)
        y_true = y_true.astype(class_pred.dtype)
 
        if len(unique(y_true)) > 2:
validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py
@@ -101,7 +101,7 @@ class MinimumROCAUCScore(ThresholdTest):
 
    def run(self):
        y_true = self.inputs.dataset.y
-       class_pred = self.inputs.dataset.y_pred(self.inputs.model.input_id)
+       class_pred = self.inputs.dataset.y_pred(self.inputs.model)
        y_true = y_true.astype(class_pred.dtype)
        roc_auc = self.multiclass_roc_auc_score(y_true, class_pred)
 
validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py
@@ -5,7 +5,7 @@
 from dataclasses import dataclass
 
 from numpy import unique
-from sklearn import metrics
+from sklearn.metrics import classification_report
 
 from validmind.errors import SkipTestError
 from validmind.vm_models import ResultSummary, ResultTable, ResultTableMetadata
@@ -129,8 +129,9 @@ class ModelsPerformanceComparison(ClassifierPerformance):
        results = {}
        for idx, model in enumerate(all_models):
            y_true = self.inputs.dataset.y
-           class_pred = self.inputs.dataset.y_pred(model.input_id)
-           report = metrics.classification_report(y_true, class_pred, output_dict=True)
-           report["roc_auc"] = multiclass_roc_auc_score(y_true, class_pred)
+           y_pred = self.inputs.dataset.y_pred(model)
+           report = classification_report(y_true, y_pred, output_dict=True)
+           report["roc_auc"] = multiclass_roc_auc_score(y_true, y_pred)
            results["model_" + str(idx)] = report
+
        return self.cache_results(results)
validmind/tests/model_validation/sklearn/OverfitDiagnosis.py
@@ -119,12 +119,12 @@ class OverfitDiagnosis(ThresholdTest):
 
        # Add prediction column in the training dataset
        train_df = self.inputs.datasets[0].df.copy()
-       train_class_pred = self.inputs.datasets[0].y_pred(self.inputs.model.input_id)
+       train_class_pred = self.inputs.datasets[0].y_pred(self.inputs.model)
        train_df[prediction_column] = train_class_pred
 
        # Add prediction column in the test dataset
        test_df = self.inputs.datasets[1].df.copy()
-       test_class_pred = self.inputs.datasets[1].y_pred(self.inputs.model.input_id)
+       test_class_pred = self.inputs.datasets[1].y_pred(self.inputs.model)
        test_df[prediction_column] = test_class_pred
 
        test_results = []
validmind/tests/model_validation/sklearn/ROCCurve.py
@@ -73,15 +73,8 @@ class ROCCurve(Metric):
        if self.inputs.model.model_library() == "FoundationModel":
            raise SkipTestError("Skipping ROCCurve for Foundation models")
 
-       # Extract the actual model
-       model = (
-           self.inputs.model[0]
-           if isinstance(self.inputs.model, list)
-           else self.inputs.model
-       )
-
        y_true = self.inputs.dataset.y
-       y_pred = model.predict_proba(self.inputs.dataset.x)
+       y_prob = self.inputs.dataset.y_prob(self.inputs.model)
 
        # ROC curve is only supported for binary classification
        if len(np.unique(y_true)) > 2:
@@ -89,14 +82,15 @@ class ROCCurve(Metric):
                "ROC Curve is only supported for binary classification models"
            )
 
-       y_true = y_true.astype(y_pred.dtype).flatten()
-       assert np.all((y_pred >= 0) & (y_pred <= 1)), "Invalid probabilities in y_pred."
+       y_true = y_true.astype(y_prob.dtype).flatten()
+       assert np.all((y_prob >= 0) & (y_prob <= 1)), "Invalid probabilities in y_prob."
+
+       fpr, tpr, roc_thresholds = roc_curve(y_true, y_prob, drop_intermediate=False)
 
-       fpr, tpr, roc_thresholds = roc_curve(y_true, y_pred, drop_intermediate=False)
        # Remove Inf values from roc_thresholds
        valid_thresholds_mask = np.isfinite(roc_thresholds)
        roc_thresholds = roc_thresholds[valid_thresholds_mask]
-       auc = roc_auc_score(y_true, y_pred)
+       auc = roc_auc_score(y_true, y_prob)
 
        trace0 = go.Scatter(
            x=fpr,
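Note: ROCCurve now reads pre-computed probability scores from the dataset (y_prob) instead of calling predict_proba on the model inside the test; the curve and AUC math are unchanged. A bare scikit-learn sketch of that downstream computation, using synthetic labels and scores rather than validmind objects:

import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve

rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, size=300)                               # synthetic binary labels
y_prob = np.clip(0.6 * y_true + 0.5 * rng.uniform(size=300), 0, 1)  # synthetic scores in [0, 1]

fpr, tpr, thresholds = roc_curve(y_true, y_prob, drop_intermediate=False)
thresholds = thresholds[np.isfinite(thresholds)]  # same Inf filtering as the test
print("AUC:", roc_auc_score(y_true, y_prob))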
validmind/tests/model_validation/sklearn/RegressionErrors.py
@@ -130,11 +130,11 @@ class RegressionErrors(Metric):
 
    def run(self):
        y_train_true = self.inputs.datasets[0].y
-       y_train_pred = self.inputs.datasets[0].y_pred(self.inputs.model.input_id)
+       y_train_pred = self.inputs.datasets[0].y_pred(self.inputs.model)
        y_train_true = y_train_true.astype(y_train_pred.dtype)
 
        y_test_true = self.inputs.datasets[1].y
-       y_test_pred = self.inputs.datasets[1].y_pred(self.inputs.model.input_id)
+       y_test_pred = self.inputs.datasets[1].y_pred(self.inputs.model)
        y_test_true = y_test_true.astype(y_test_pred.dtype)
 
        results = self.regression_errors(
validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py
@@ -6,7 +6,7 @@ import re
 from dataclasses import dataclass
 
 import numpy as np
-from sklearn import metrics
+from sklearn.metrics import mean_absolute_error, mean_squared_error
 
 from validmind.errors import SkipTestError
 from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
@@ -67,12 +67,12 @@ class RegressionModelsPerformanceComparison(Metric):
    }
 
    def regression_errors(self, y_true_test, y_pred_test):
-       mae_test = metrics.mean_absolute_error(y_true_test, y_pred_test)
+       mae_test = mean_absolute_error(y_true_test, y_pred_test)
 
        results = {}
        results["Mean Absolute Error (MAE)"] = mae_test
 
-       mse_test = metrics.mean_squared_error(y_true_test, y_pred_test)
+       mse_test = mean_squared_error(y_true_test, y_pred_test)
        results["Mean Squared Error (MSE)"] = mse_test
        results["Root Mean Squared Error (RMSE)"] = np.sqrt(mse_test)
 
@@ -121,12 +121,14 @@ class RegressionModelsPerformanceComparison(Metric):
 
        if self.inputs.models is not None:
            all_models.extend(self.inputs.models)
+
        results = {}
 
        for idx, model in enumerate(all_models):
            result = self.regression_errors(
                y_true_test=self.inputs.dataset.y,
-               y_pred_test=self.inputs.dataset.y_pred(model.input_id),
+               y_pred_test=self.inputs.dataset.y_pred(model),
            )
            results["model_" + str(idx)] = result
+
        return self.cache_results(results)
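Note: after the import cleanup above, the per-model error report is just MAE, MSE, and RMSE (the square root of MSE) from sklearn.metrics. A tiny standalone check with toy arrays:

import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

y_true = np.array([3.0, -0.5, 2.0, 7.0])
y_pred = np.array([2.5, 0.0, 2.0, 8.0])

mae = mean_absolute_error(y_true, y_pred)  # 0.5
mse = mean_squared_error(y_true, y_pred)   # 0.375
rmse = np.sqrt(mse)                        # ~0.612
print(mae, mse, rmse)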