validmind 2.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- validmind/__init__.py +6 -3
- validmind/__version__.py +1 -1
- validmind/ai.py +193 -0
- validmind/api_client.py +45 -31
- validmind/client.py +33 -6
- validmind/datasets/classification/customer_churn.py +2 -2
- validmind/datasets/credit_risk/__init__.py +11 -0
- validmind/datasets/credit_risk/datasets/lending_club_loan_data_2007_2014_clean.csv.gz +0 -0
- validmind/datasets/credit_risk/lending_club.py +394 -0
- validmind/datasets/nlp/__init__.py +5 -0
- validmind/datasets/nlp/cnn_dailymail.py +98 -0
- validmind/datasets/nlp/datasets/cnn_dailymail_100_with_predictions.csv +255 -0
- validmind/datasets/nlp/datasets/cnn_dailymail_500_with_predictions.csv +1277 -0
- validmind/datasets/nlp/datasets/sentiments_with_predictions.csv +4847 -0
- validmind/errors.py +11 -1
- validmind/logging.py +9 -2
- validmind/models/huggingface.py +2 -2
- validmind/models/pytorch.py +3 -3
- validmind/models/sklearn.py +4 -4
- validmind/template.py +2 -2
- validmind/test_suites/__init__.py +4 -2
- validmind/tests/__init__.py +130 -45
- validmind/tests/data_validation/DatasetDescription.py +0 -1
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +3 -1
- validmind/tests/data_validation/PiTCreditScoresHistogram.py +1 -1
- validmind/tests/data_validation/ScatterPlot.py +8 -2
- validmind/tests/data_validation/nlp/StopWords.py +1 -6
- validmind/tests/data_validation/nlp/TextDescription.py +20 -9
- validmind/tests/decorator.py +313 -0
- validmind/tests/model_validation/BertScore.py +1 -1
- validmind/tests/model_validation/BertScoreAggregate.py +1 -1
- validmind/tests/model_validation/BleuScore.py +1 -1
- validmind/tests/model_validation/ClusterSizeDistribution.py +1 -1
- validmind/tests/model_validation/ContextualRecall.py +1 -1
- validmind/tests/model_validation/FeaturesAUC.py +110 -0
- validmind/tests/model_validation/MeteorScore.py +92 -0
- validmind/tests/model_validation/RegardHistogram.py +6 -7
- validmind/tests/model_validation/RegardScore.py +4 -6
- validmind/tests/model_validation/RegressionResidualsPlot.py +127 -0
- validmind/tests/model_validation/RougeMetrics.py +7 -5
- validmind/tests/model_validation/RougeMetricsAggregate.py +1 -1
- validmind/tests/model_validation/SelfCheckNLIScore.py +112 -0
- validmind/tests/model_validation/TokenDisparity.py +1 -1
- validmind/tests/model_validation/ToxicityHistogram.py +1 -1
- validmind/tests/model_validation/ToxicityScore.py +1 -1
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +1 -1
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +1 -3
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +17 -22
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +1 -1
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +16 -17
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +1 -1
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +21 -3
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +1 -1
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +1 -1
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +1 -1
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +5 -4
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +2 -2
- validmind/tests/model_validation/sklearn/ROCCurve.py +6 -12
- validmind/tests/model_validation/sklearn/RegressionErrors.py +2 -2
- validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +6 -4
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +2 -2
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +55 -5
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +1 -1
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +11 -5
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +2 -2
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +140 -0
- validmind/tests/model_validation/statsmodels/GINITable.py +22 -45
- validmind/tests/model_validation/statsmodels/{LogisticRegPredictionHistogram.py → PredictionProbabilitiesHistogram.py} +67 -92
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +2 -2
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +2 -2
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +2 -2
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +128 -0
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +70 -103
- validmind/tests/prompt_validation/ai_powered_test.py +2 -0
- validmind/tests/test_providers.py +14 -124
- validmind/unit_metrics/__init__.py +75 -70
- validmind/unit_metrics/classification/sklearn/Accuracy.py +14 -0
- validmind/unit_metrics/classification/sklearn/F1.py +13 -0
- validmind/unit_metrics/classification/sklearn/Precision.py +13 -0
- validmind/unit_metrics/classification/sklearn/ROC_AUC.py +13 -0
- validmind/unit_metrics/classification/sklearn/Recall.py +13 -0
- validmind/unit_metrics/composite.py +228 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +33 -0
- validmind/unit_metrics/regression/HuberLoss.py +23 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +30 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +16 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +13 -0
- validmind/unit_metrics/regression/QuantileLoss.py +15 -0
- validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +21 -0
- validmind/unit_metrics/regression/sklearn/MeanAbsoluteError.py +13 -0
- validmind/unit_metrics/regression/sklearn/MeanSquaredError.py +13 -0
- validmind/unit_metrics/regression/sklearn/RSquaredScore.py +13 -0
- validmind/unit_metrics/regression/sklearn/RootMeanSquaredError.py +20 -0
- validmind/utils.py +20 -31
- validmind/vm_models/__init__.py +0 -2
- validmind/vm_models/dataset.py +623 -29
- validmind/vm_models/figure.py +52 -17
- validmind/vm_models/test/metric.py +33 -31
- validmind/vm_models/test/output_template.py +0 -27
- validmind/vm_models/test/result_wrapper.py +68 -36
- validmind/vm_models/test/test.py +4 -2
- validmind/vm_models/test/threshold_test.py +24 -14
- validmind/vm_models/test_context.py +7 -0
- validmind/vm_models/test_suite/runner.py +1 -1
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +1 -1
- validmind/vm_models/test_suite/test_suite.py +2 -1
- {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/METADATA +18 -18
- {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/RECORD +116 -94
- validmind-2.1.0.dist-info/entry_points.txt +3 -0
- validmind/tests/__types__.py +0 -62
- validmind/tests/model_validation/statsmodels/LogRegressionConfusionMatrix.py +0 -128
- validmind/tests/model_validation/statsmodels/LogisticRegCumulativeProb.py +0 -172
- validmind/tests/model_validation/statsmodels/ScorecardBucketHistogram.py +0 -181
- validmind/tests/model_validation/statsmodels/ScorecardProbabilitiesHistogram.py +0 -175
- validmind/unit_metrics/sklearn/classification/Accuracy.py +0 -20
- validmind/unit_metrics/sklearn/classification/F1.py +0 -22
- validmind/unit_metrics/sklearn/classification/Precision.py +0 -22
- validmind/unit_metrics/sklearn/classification/ROC_AUC.py +0 -20
- validmind/unit_metrics/sklearn/classification/Recall.py +0 -20
- validmind/vm_models/test/unit_metric.py +0 -88
- {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/LICENSE +0 -0
- {validmind-2.0.1.dist-info → validmind-2.1.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,127 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from dataclasses import dataclass
+
+import numpy as np
+import plotly.figure_factory as ff
+import plotly.graph_objects as go
+
+from validmind.vm_models import Figure, Metric
+
+
+@dataclass
+class RegressionResidualsPlot(Metric):
+    """
+    Evaluates regression model performance using residual distribution and actual vs. predicted plots.
+
+    **Purpose:**
+    The `RegressionResidualsPlot` metric aims to evaluate the performance of regression models. By generating and
+    analyzing two plots – a distribution of residuals and a scatter plot of actual versus predicted values – this tool
+    helps to visually appraise how well the model predicts and the nature of errors it makes.
+
+    **Test Mechanism:**
+    The process begins by extracting the true output values (`y_true`) and the model's predicted values (`y_pred`).
+    Residuals are computed by subtracting predicted from true values. These residuals are then visualized using a
+    histogram to display their distribution. Additionally, a scatter plot is derived to compare true values against
+    predicted values, together with a "Perfect Fit" line, which represents an ideal match (predicted values equal
+    actual values), facilitating the assessment of the model's predictive accuracy.
+
+    **Signs of High Risk:**
+    - Residuals showing a non-normal distribution, especially those with frequent extreme values.
+    - Significant deviations of predicted values from actual values in the scatter plot.
+    - Sparse density of data points near the "Perfect Fit" line in the scatter plot, indicating poor prediction
+    accuracy.
+    - Visible patterns or trends in the residuals plot, suggesting the model's failure to capture the underlying data
+    structure adequately.
+
+    **Strengths:**
+    - Provides a direct, visually intuitive assessment of a regression model’s accuracy and handling of data.
+    - Visual plots can highlight issues of underfitting or overfitting.
+    - Can reveal systematic deviations or trends that purely numerical metrics might miss.
+    - Applicable across various regression model types.
+
+    **Limitations:**
+    - Relies on visual interpretation, which can be subjective and less precise than numerical evaluations.
+    - May be difficult to interpret in cases with multi-dimensional outputs due to the plots’ two-dimensional nature.
+    - Overlapping data points in the residuals plot can complicate interpretation efforts.
+    - Does not summarize model performance into a single quantifiable metric, which might be needed for comparative or
+    summary analyses.
+    """
+
+    name = "regression_residuals_plot"
+    required_inputs = ["model", "dataset"]
+    metadata = {
+        "task_types": ["regression"],
+        "tags": [
+            "model_performance",
+        ],
+    }
+    default_params = {"bin_size": 0.1}
+
+    def run(self):
+        y_true = self.inputs.dataset.y
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
+        # Calculate residuals
+        residuals = y_true.flatten() - y_pred.flatten()
+        # Create residuals plot
+        hist_data = [residuals]
+        group_labels = ["Residuals"]  # Names of the dataset
+        bin_size = self.params["bin_size"]
+        fig = ff.create_distplot(
+            hist_data, group_labels, bin_size=[bin_size], show_hist=True, show_rug=False
+        )
+        fig.update_layout(
+            title="Distribution of Residuals",
+            xaxis_title="Residuals",
+            yaxis_title="Density",
+        )
+        figures = [
+            Figure(
+                for_object=self,
+                key=self.key,
+                figure=fig,
+            )
+        ]
+        # Create a scatter plot of actual vs predicted values
+        scatter = go.Scatter(
+            x=y_true.flatten(),
+            y=y_pred.flatten(),
+            mode="markers",
+            name="True vs Predicted",
+            marker=dict(color="blue", opacity=0.5),
+        )
+
+        # Line of perfect prediction
+        max_val = np.nanmax([np.nanmax(y_true), np.nanmax(y_pred)])
+        min_val = np.nanmin([np.nanmin(y_true), np.nanmin(y_pred)])
+        line = go.Scatter(
+            x=[min_val, max_val],
+            y=[min_val, max_val],
+            mode="lines",
+            name="Perfect Fit",
+            line=dict(color="red", dash="dash"),
+        )
+
+        # Layout settings
+        layout = go.Layout(
+            title="True vs. Predicted Values",
+            xaxis_title="True Values",
+            yaxis_title="Predicted Values",
+            showlegend=True,
+        )
+
+        fig = go.Figure(data=[scatter, line], layout=layout)
+
+        figures.append(
+            Figure(
+                for_object=self,
+                key=self.key,
+                figure=fig,
+            )
+        )
+
+        return self.cache_results(
+            figures=figures,
+        )
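The docstring above describes the two plots this new metric produces. As a point of reference, here is a minimal standalone sketch of the same idea (residual distribution plus a true-vs-predicted scatter with a dashed perfect-fit line) outside the ValidMind `Metric` framework; the synthetic data and the `bin_size` of 0.1 are illustrative assumptions.

```python
import numpy as np
import plotly.figure_factory as ff
import plotly.graph_objects as go

# Synthetic regression data standing in for dataset.y and dataset.y_pred(model)
rng = np.random.default_rng(0)
y_true = rng.normal(loc=10.0, scale=3.0, size=200)
y_pred = y_true + rng.normal(scale=1.0, size=200)  # predictions with some error

# Residual distribution, mirroring the ff.create_distplot usage in the new metric
residuals = y_true - y_pred
dist_fig = ff.create_distplot(
    [residuals], ["Residuals"], bin_size=[0.1], show_hist=True, show_rug=False
)
dist_fig.update_layout(
    title="Distribution of Residuals", xaxis_title="Residuals", yaxis_title="Density"
)

# True vs. predicted scatter with a dashed "Perfect Fit" diagonal
lo, hi = min(y_true.min(), y_pred.min()), max(y_true.max(), y_pred.max())
scatter_fig = go.Figure(
    data=[
        go.Scatter(x=y_true, y=y_pred, mode="markers", name="True vs Predicted"),
        go.Scatter(x=[lo, hi], y=[lo, hi], mode="lines", name="Perfect Fit",
                   line=dict(color="red", dash="dash")),
    ],
    layout=go.Layout(title="True vs. Predicted Values",
                     xaxis_title="True Values", yaxis_title="Predicted Values"),
)

dist_fig.show()
scatter_fig.show()
```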
@@ -76,7 +76,6 @@ class RougeMetrics(Metric):
         if r_metrics is None:
             raise ValueError("rouge_metrics must be provided in params")
 
-        # With all
         if not (
             set(self.default_params.get("rouge_metrics")).intersection(r_metrics)
             == set(r_metrics)
@@ -86,7 +85,7 @@ class RougeMetrics(Metric):
         )
 
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
 
         rouge = Rouge(metrics=r_metrics)
 
@@ -97,12 +96,13 @@ class RougeMetrics(Metric):
 
         metrics_df = pd.DataFrame(score_list)
         figures = []
+
         for m in metrics_df.columns:
             df_scores = pd.DataFrame(metrics_df[m].tolist())
             # Visualization part
             fig = go.Figure()
 
-            # Adding the line plots
+            # Adding the line plots for precision, recall, and F1-score with lines and markers
             fig.add_trace(
                 go.Scatter(
                     x=df_scores.index,
@@ -129,11 +129,13 @@ class RougeMetrics(Metric):
             )
 
             fig.update_layout(
-                title="ROUGE Scores for
+                title=f"ROUGE Scores for {m}",
                 xaxis_title="Row Index",
                 yaxis_title="Score",
             )
-
+
+            # Ensure a unique key for each metric
+            k = f"{m.replace('-', '')}_{len(figures)}"
             figures.append(
                 Figure(
                     for_object=self,
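The RougeMetrics changes above close a truncated `y_pred(...)` call, interpolate the metric name into each figure title, and derive a unique key per metric figure. A rough standalone sketch of the per-row scoring loop that feeds those figures, assuming the `rouge` package's `Rouge(metrics=...).get_scores(...)` interface and two toy sentence pairs:

```python
import pandas as pd
from rouge import Rouge  # assumed: the `rouge` PyPI package used by the metric

references = ["the cat sat on the mat", "a quick brown fox"]
predictions = ["the cat is on the mat", "a fast brown fox"]

rouge = Rouge(metrics=["rouge-1", "rouge-l"])
# One dict per row, keyed by metric name, each holding r/p/f scores
score_list = [rouge.get_scores(p, r)[0] for p, r in zip(predictions, references)]

metrics_df = pd.DataFrame(score_list)
figures = {}
for m in metrics_df.columns:
    df_scores = pd.DataFrame(metrics_df[m].tolist())  # per-row precision/recall/F1
    # Unique key per metric, mirroring the new f"{m.replace('-', '')}_{len(figures)}" scheme
    key = f"{m.replace('-', '')}_{len(figures)}"
    figures[key] = df_scores

for key, df_scores in figures.items():
    print(key)
    print(df_scores)
```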
@@ -81,7 +81,7 @@ class RougeMetricsAggregate(Metric):
         )
 
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
 
         rouge = Rouge(metrics=r_metrics)
 
@@ -0,0 +1,112 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import itertools
+from dataclasses import dataclass
+
+import pandas as pd
+import plotly.graph_objects as go
+import torch
+from selfcheckgpt.modeling_selfcheck import SelfCheckNLI
+from tqdm import tqdm
+
+from validmind.vm_models import Figure, Metric
+
+
+@dataclass
+class SelfCheckNLIScore(Metric):
+    """
+    Evaluates text generation models' performance by quantifying the level of hallucination in generated texts compared to reference texts.
+
+    **Purpose**: The HallucinationScore metric is designed to assess the factual accuracy and reliability of text generated by models, focusing on the detection and quantification of hallucinations—instances where generated content deviates from factual or expected outputs. By comparing generated texts against reference texts, this metric highlights discrepancies indicative of hallucinations, offering insights into the model's ability to produce contextually and factually coherent content.
+
+    **Test Mechanism**: To compute the HallucinationScore, the metric employs a comparison between the generated texts (model predictions) and the provided reference texts (true values). Using the SelfCheckNLI model, it evaluates each generated text's level of factual congruence with the reference, assigning a hallucination score based on the semantic coherence and factual accuracy. The scores for each text instance are then visualized in a line plot, allowing for the examination of hallucination trends across the dataset.
+
+    **Signs of High Risk**:
+    - High hallucination scores across a significant portion of the dataset, indicating a prevalence of factually inaccurate or irrelevant content generation.
+    - Patterns of consistent hallucination in specific contexts or subjects, suggesting gaps in the model's understanding or knowledge.
+    - Sharp fluctuations in hallucination scores, which may reveal inconsistencies in the model's performance or sensitivity to certain types of input.
+
+    **Strengths**:
+    - Directly addresses the critical aspect of factual accuracy in generated text, beyond mere linguistic or stylistic coherence.
+    - Provides a granular, instance-by-instance analysis of model performance, allowing for targeted improvements and diagnostics.
+    - Facilitates a deeper understanding of a model's capabilities and limitations in producing reliable and accurate content.
+
+    **Limitations**:
+    - Reliance on the SelfCheckNLI model means the accuracy and effectiveness of the HallucinationScore are contingent upon the performance and suitability of the underlying NLI model.
+    - May not fully capture the subtleties of certain factual inaccuracies or the contextual relevance of reference texts, especially in complex or nuanced domains.
+    - Potentially resource-intensive, given the computational demands of running advanced NLI models for large datasets.
+    """
+
+    name = "self_check_nli_score"
+    required_inputs = ["model", "dataset"]
+
+    def run(self):
+        # Assuming the dataset is structured with generated sentences and reference samples
+        y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
+
+        hallucination_scores = self.compute_hallucination_scores(y_pred, y_true)
+
+        # Visualization of scores
+        figures = self.visualize_scores(hallucination_scores)
+
+        return self.cache_results(figures=figures)
+
+    def compute_hallucination_scores(self, predictions, references):
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        selfcheck_nli = SelfCheckNLI(device=device)
+        hallucination_scores = []
+
+        print("Starting hallucination score computation...")
+
+        for index, (sentences, samples) in enumerate(
+            tqdm(zip(predictions, references), total=len(predictions))
+        ):
+            sent_scores_nli = selfcheck_nli.predict(
+                sentences=sentences, sampled_passages=samples
+            )
+
+            # Compute the mean of the hallucination scores for this row
+            average_score = sent_scores_nli.mean()
+            hallucination_scores.append(average_score)
+
+            # Print a progress update for each row
+            print(
+                f"Row {index + 1}/{len(predictions)}: Average hallucination score: {average_score}"
+            )
+
+        print("Completed hallucination score computation.")
+
+        return hallucination_scores
+
+    def visualize_scores(self, scores):
+        scores_df = pd.DataFrame(scores, columns=["Hallucination Score"])
+
+        fig = go.Figure()
+        fig.add_trace(
+            go.Scatter(
+                x=scores_df.index,
+                y=scores_df["Hallucination Score"],
+                mode="lines+markers",
+                name="Hallucination Score",
+            )
+        )
+
+        fig.update_layout(
+            title="Hallucination Scores Across Text Instances",
+            xaxis_title="Text Instance Index",
+            yaxis_title="Hallucination Score",
+        )
+
+        # Wrapping the plotly figure for compatibility with your framework might be needed
+        figures = [
+            Figure(
+                for_object=self,
+                key=self.key,
+                figure=fig,
+            )
+        ]
+
+        return figures
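The new metric's scoring loop boils down to calling `SelfCheckNLI.predict` on each row's sentences and sampled passages, then averaging the per-sentence scores. A minimal sketch of that loop outside the `Metric` class, using the same import and call signature as above; the example sentences and passages are placeholders, and the NLI model weights are downloaded on first use:

```python
import torch
from selfcheckgpt.modeling_selfcheck import SelfCheckNLI  # same import as the new metric

# Placeholder data: per row, the generated sentences and the sampled passages to check against
predictions = [["The Eiffel Tower is in Paris."]]
references = [["The Eiffel Tower is located in Paris, France."]]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
selfcheck_nli = SelfCheckNLI(device=device)

scores = []
for sentences, samples in zip(predictions, references):
    sent_scores = selfcheck_nli.predict(sentences=sentences, sampled_passages=samples)
    scores.append(sent_scores.mean())  # one averaged hallucination score per row

print(scores)
```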
@@ -62,7 +62,7 @@ class TokenDisparity(Metric):
 
     def run(self):
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
 
         df = pd.DataFrame({"reference_column": y_true, "generated_column": y_pred})
 
@@ -57,7 +57,7 @@ class ToxicityHistogram(Metric):
             raise AttributeError("The 'model' attribute is missing.")
 
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
         input_text = self.inputs.dataset.df[self.inputs.dataset.text_column]
 
         # Ensure consistency in lengths
@@ -60,7 +60,7 @@ class ToxicityScore(Metric):
             raise AttributeError("The 'model' attribute is missing.")
 
         y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
         input_text = self.inputs.dataset.df[self.inputs.dataset.text_column]
 
         # Ensure consistency in lengths
@@ -59,7 +59,7 @@ class ClusterDistribution(Metric):
     def run(self):
         # run kmeans clustering on embeddings
         kmeans = KMeans(n_clusters=self.params["num_clusters"]).fit(
-            self.inputs.dataset.y_pred(self.inputs.model
+            self.inputs.dataset.y_pred(self.inputs.model)
         )
 
         # plot the distribution
@@ -57,9 +57,7 @@ class CosineSimilarityDistribution(Metric):
 
     def run(self):
         # Compute cosine similarity
-        similarities = cosine_similarity(
-            self.inputs.dataset.y_pred(self.inputs.model.input_id)
-        )
+        similarities = cosine_similarity(self.inputs.dataset.y_pred(self.inputs.model))
 
         # plot the distribution
         fig = px.histogram(
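The fix above collapses the broken call into a single `cosine_similarity` over the embedding predictions. A small standalone sketch of the same distribution plot, using random embeddings as a stand-in for `dataset.y_pred(model)`:

```python
import numpy as np
import plotly.express as px
from sklearn.metrics.pairwise import cosine_similarity

# Random embeddings standing in for dataset.y_pred(model): [samples, features]
embeddings = np.random.default_rng(0).normal(size=(50, 16))

similarities = cosine_similarity(embeddings)  # pairwise [50, 50] similarity matrix
fig = px.histogram(
    x=similarities.flatten(), nbins=100, title="Cosine Similarity Distribution"
)
fig.show()
```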
@@ -59,30 +59,25 @@ class DescriptiveAnalytics(Metric):
     }
 
     def run(self):
-
-
-
+        # Assuming y_pred returns a 2D array of embeddings [samples, features]
+        preds = self.inputs.dataset.y_pred(self.inputs.model)
+
+        # Calculate statistics across the embedding dimensions, not across all embeddings
+        means = np.mean(preds, axis=0)  # Mean of each feature across all samples
+        medians = np.median(preds, axis=0)  # Median of each feature across all samples
+        stds = np.std(preds, axis=0)  # Std. dev. of each feature across all samples
+
+        # Plot histograms of the calculated statistics
+        mean_fig = px.histogram(x=means, title="Distribution of Embedding Means")
+        median_fig = px.histogram(x=medians, title="Distribution of Embedding Medians")
+        std_fig = px.histogram(
+            x=stds, title="Distribution of Embedding Standard Deviations"
+        )
 
         return self.cache_results(
             figures=[
-                Figure(
-
-
-                    figure=px.histogram(mean, title="Distribution of Embedding Means"),
-                ),
-                Figure(
-                    for_object=self,
-                    key=self.key,
-                    figure=px.histogram(
-                        median, title="Distribution of Embedding Medians"
-                    ),
-                ),
-                Figure(
-                    for_object=self,
-                    key=self.key,
-                    figure=px.histogram(
-                        std, title="Distribution of Embedding Standard Deviations"
-                    ),
-                ),
+                Figure(for_object=self, key=f"{self.key}_mean", figure=mean_fig),
+                Figure(for_object=self, key=f"{self.key}_median", figure=median_fig),
+                Figure(for_object=self, key=f"{self.key}_std", figure=std_fig),
             ],
         )
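The rewritten `run()` computes mean, median, and standard deviation per embedding dimension (`axis=0`) rather than over the flattened matrix, and plots one histogram per statistic. A quick sketch of that calculation with dummy embeddings standing in for the model output:

```python
import numpy as np
import plotly.express as px

# Dummy embedding matrix standing in for dataset.y_pred(model): [samples, features]
preds = np.random.default_rng(0).normal(size=(200, 32))

# Statistics per embedding dimension (axis=0), as in the updated run() method
means = np.mean(preds, axis=0)
medians = np.median(preds, axis=0)
stds = np.std(preds, axis=0)

px.histogram(x=means, title="Distribution of Embedding Means").show()
px.histogram(x=medians, title="Distribution of Embedding Medians").show()
px.histogram(x=stds, title="Distribution of Embedding Standard Deviations").show()
```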
@@ -67,7 +67,7 @@ class EmbeddingsVisualization2D(Metric):
         )
 
         # use TSNE to reduce dimensionality of embeddings
-        num_samples = len(self.inputs.dataset.y_pred(self.inputs.model
+        num_samples = len(self.inputs.dataset.y_pred(self.inputs.model))
 
         if self.params["perplexity"] >= num_samples:
             perplexity = num_samples - 1
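The perplexity clamp shown in context here matters because t-SNE requires `perplexity < n_samples`. A short sketch of the same guard with scikit-learn's `TSNE`; the embedding matrix and the starting perplexity of 30 are illustrative assumptions:

```python
import numpy as np
from sklearn.manifold import TSNE

# Stand-in for the model's embeddings: [samples, features]
embeddings = np.random.default_rng(0).normal(size=(20, 64))

num_samples = len(embeddings)
perplexity = 30  # a typical default that may exceed the sample count
if perplexity >= num_samples:
    perplexity = num_samples - 1  # TSNE requires perplexity < n_samples

reduced = TSNE(n_components=2, perplexity=perplexity).fit_transform(embeddings)
print(reduced.shape)  # (20, 2)
```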
@@ -5,17 +5,17 @@
 from dataclasses import dataclass
 
 from numpy import unique
-from sklearn import
+from sklearn.metrics import classification_report, roc_auc_score
+from sklearn.preprocessing import LabelBinarizer
 
 from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
 
 
 def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
-    lb =
+    lb = LabelBinarizer()
     lb.fit(y_test)
-
-
-    return metrics.roc_auc_score(y_test, y_pred, average=average)
+
+    return roc_auc_score(lb.transform(y_test), lb.transform(y_pred), average=average)
 
 
 @dataclass
@@ -73,7 +73,7 @@ class ClassifierPerformance(Metric):
         When building a multi-class summary we need to calculate weighted average,
         macro average and per class metrics.
         """
-        classes = {str(i) for i in unique(self.
+        classes = {str(i) for i in unique(self.inputs.dataset.y)}
         pr_f1_table = [
             {
                 "Class": class_name,
@@ -121,17 +121,16 @@ class ClassifierPerformance(Metric):
             ]
         )
 
-    def y_true(self):
-        return self.inputs.dataset.y
-
-    def y_pred(self):
-        return self.inputs.dataset.y_pred(model_id=self.inputs.model.input_id)
-
     def run(self):
-
-
-
-
-
+        report = classification_report(
+            self.inputs.dataset.y,
+            self.inputs.dataset.y_pred(self.inputs.model),
+            output_dict=True,
+            zero_division=0,
+        )
+        report["roc_auc"] = multiclass_roc_auc_score(
+            self.inputs.dataset.y,
+            self.inputs.dataset.y_pred(self.inputs.model),
+        )
 
         return self.cache_results(report)
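The rewritten helper binarizes both the labels and the hard predictions before handing them to `roc_auc_score`, and `run()` now builds the report directly from `classification_report`. A self-contained sketch of that pattern with toy multi-class labels:

```python
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelBinarizer

# Toy multi-class labels and hard predictions
y_test = [0, 1, 2, 2, 1, 0, 2, 1]
y_pred = [0, 2, 2, 2, 1, 0, 1, 1]


def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    # Binarize labels and predictions so roc_auc_score accepts the multi-class case
    lb = LabelBinarizer()
    lb.fit(y_test)
    return roc_auc_score(lb.transform(y_test), lb.transform(y_pred), average=average)


report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
report["roc_auc"] = multiclass_roc_auc_score(y_test, y_pred)
print(report["accuracy"], report["roc_auc"])
```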
@@ -67,7 +67,7 @@ class ClusterCosineSimilarity(Metric):
 
     def run(self):
         y_true_train = self.inputs.dataset.y
-        y_pred_train = self.inputs.dataset.y_pred(self.inputs.model
+        y_pred_train = self.inputs.dataset.y_pred(self.inputs.model)
         y_true_train = y_true_train.astype(y_pred_train.dtype).flatten()
         num_clusters = len(np.unique(y_pred_train))
         # Calculate cosine similarity for each cluster
@@ -101,11 +101,11 @@ class ClusterPerformance(Metric):
 
     def run(self):
         y_true_train = self.inputs.datasets[0].y
-        class_pred_train = self.inputs.datasets[0].y_pred(self.inputs.model
+        class_pred_train = self.inputs.datasets[0].y_pred(self.inputs.model)
         y_true_train = y_true_train.astype(class_pred_train.dtype)
 
         y_true_test = self.inputs.datasets[1].y
-        class_pred_test = self.inputs.datasets[1].y_pred(self.inputs.model
+        class_pred_test = self.inputs.datasets[1].y_pred(self.inputs.model)
         y_true_test = y_true_test.astype(class_pred_test.dtype)
 
         samples = ["train", "test"]
@@ -72,15 +72,33 @@ class ConfusionMatrix(Metric):
         labels.sort()
         labels = np.array(labels).T.tolist()
 
-
-        y_true = y_true.astype(
-
+        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
+        y_true = y_true.astype(y_pred.dtype)
+
+        cm = metrics.confusion_matrix(y_true, y_pred, labels=labels)
+
+        text = None
+        if len(labels) == 2:
+            tn, fp, fn, tp = cm.ravel()
+
+            # Custom text to display on the heatmap cells
+            text = [
+                [
+                    f"<b>True Negatives (TN)</b><br />{tn}",
+                    f"<b>False Positives (FP)</b><br />{fp}",
+                ],
+                [
+                    f"<b>False Negatives (FN)</b><br />{fn}",
+                    f"<b>True Positives (TP)</b><br />{tp}",
+                ],
+            ]
 
         fig = ff.create_annotated_heatmap(
             z=cm,
             colorscale="Blues",
             x=labels,
             y=labels,
+            annotation_text=text,
         )
 
         fig["data"][0][
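For binary problems, the updated ConfusionMatrix unpacks `cm.ravel()` into TN/FP/FN/TP and passes HTML-formatted labels as `annotation_text` to the heatmap. A standalone sketch of the same annotated heatmap with toy labels:

```python
import numpy as np
import plotly.figure_factory as ff
from sklearn import metrics

# Toy binary labels and predictions
y_true = np.array([0, 0, 1, 1, 1, 0, 1, 0])
y_pred = np.array([0, 1, 1, 1, 0, 0, 1, 0])
labels = [0, 1]

cm = metrics.confusion_matrix(y_true, y_pred, labels=labels)
tn, fp, fn, tp = cm.ravel()

# Cell annotations mirroring the TN/FP/FN/TP labels added in the diff
text = [
    [f"<b>True Negatives (TN)</b><br />{tn}", f"<b>False Positives (FP)</b><br />{fp}"],
    [f"<b>False Negatives (FN)</b><br />{fn}", f"<b>True Positives (TP)</b><br />{tp}"],
]

fig = ff.create_annotated_heatmap(
    z=cm, x=labels, y=labels, colorscale="Blues", annotation_text=text
)
fig.show()
```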
@@ -96,7 +96,7 @@ class MinimumAccuracy(ThresholdTest):
 
     def run(self):
         y_true = self.inputs.dataset.y
-        class_pred = self.inputs.dataset.y_pred(self.inputs.model
+        class_pred = self.inputs.dataset.y_pred(self.inputs.model)
         y_true = y_true.astype(class_pred.dtype)
 
         accuracy_score = metrics.accuracy_score(y_true, class_pred)
@@ -97,7 +97,7 @@ class MinimumF1Score(ThresholdTest):
 
     def run(self):
         y_true = self.inputs.dataset.y
-        class_pred = self.inputs.dataset.y_pred(self.inputs.model
+        class_pred = self.inputs.dataset.y_pred(self.inputs.model)
         y_true = y_true.astype(class_pred.dtype)
 
         if len(unique(y_true)) > 2:
@@ -101,7 +101,7 @@ class MinimumROCAUCScore(ThresholdTest):
 
     def run(self):
         y_true = self.inputs.dataset.y
-        class_pred = self.inputs.dataset.y_pred(self.inputs.model
+        class_pred = self.inputs.dataset.y_pred(self.inputs.model)
         y_true = y_true.astype(class_pred.dtype)
         roc_auc = self.multiclass_roc_auc_score(y_true, class_pred)
 
@@ -5,7 +5,7 @@
 from dataclasses import dataclass
 
 from numpy import unique
-from sklearn import
+from sklearn.metrics import classification_report
 
 from validmind.errors import SkipTestError
 from validmind.vm_models import ResultSummary, ResultTable, ResultTableMetadata
@@ -129,8 +129,9 @@ class ModelsPerformanceComparison(ClassifierPerformance):
         results = {}
         for idx, model in enumerate(all_models):
             y_true = self.inputs.dataset.y
-
-            report =
-            report["roc_auc"] = multiclass_roc_auc_score(y_true,
+            y_pred = self.inputs.dataset.y_pred(model)
+            report = classification_report(y_true, y_pred, output_dict=True)
+            report["roc_auc"] = multiclass_roc_auc_score(y_true, y_pred)
             results["model_" + str(idx)] = report
+
         return self.cache_results(results)
@@ -119,12 +119,12 @@ class OverfitDiagnosis(ThresholdTest):
 
         # Add prediction column in the training dataset
         train_df = self.inputs.datasets[0].df.copy()
-        train_class_pred = self.inputs.datasets[0].y_pred(self.inputs.model
+        train_class_pred = self.inputs.datasets[0].y_pred(self.inputs.model)
         train_df[prediction_column] = train_class_pred
 
         # Add prediction column in the test dataset
        test_df = self.inputs.datasets[1].df.copy()
-        test_class_pred = self.inputs.datasets[1].y_pred(self.inputs.model
+        test_class_pred = self.inputs.datasets[1].y_pred(self.inputs.model)
         test_df[prediction_column] = test_class_pred
 
         test_results = []
@@ -73,15 +73,8 @@ class ROCCurve(Metric):
         if self.inputs.model.model_library() == "FoundationModel":
             raise SkipTestError("Skipping ROCCurve for Foundation models")
 
-        # Extract the actual model
-        model = (
-            self.inputs.model[0]
-            if isinstance(self.inputs.model, list)
-            else self.inputs.model
-        )
-
         y_true = self.inputs.dataset.y
-
+        y_prob = self.inputs.dataset.y_prob(self.inputs.model)
 
         # ROC curve is only supported for binary classification
         if len(np.unique(y_true)) > 2:
@@ -89,14 +82,15 @@ class ROCCurve(Metric):
                 "ROC Curve is only supported for binary classification models"
             )
 
-        y_true = y_true.astype(
-        assert np.all((
+        y_true = y_true.astype(y_prob.dtype).flatten()
+        assert np.all((y_prob >= 0) & (y_prob <= 1)), "Invalid probabilities in y_prob."
+
+        fpr, tpr, roc_thresholds = roc_curve(y_true, y_prob, drop_intermediate=False)
 
-        fpr, tpr, roc_thresholds = roc_curve(y_true, y_pred, drop_intermediate=False)
         # Remove Inf values from roc_thresholds
         valid_thresholds_mask = np.isfinite(roc_thresholds)
         roc_thresholds = roc_thresholds[valid_thresholds_mask]
-        auc = roc_auc_score(y_true,
+        auc = roc_auc_score(y_true, y_prob)
 
         trace0 = go.Scatter(
             x=fpr,
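The ROCCurve rewrite switches from class predictions to probabilities (`dataset.y_prob(model)`), validates that they lie in [0, 1], and drops non-finite thresholds. A standalone sketch of that flow with toy labels and probabilities:

```python
import numpy as np
import plotly.graph_objects as go
from sklearn.metrics import roc_auc_score, roc_curve

# Toy binary labels and predicted probabilities standing in for dataset.y / dataset.y_prob(model)
y_true = np.array([0, 0, 1, 1, 0, 1, 1, 0])
y_prob = np.array([0.1, 0.4, 0.8, 0.65, 0.3, 0.9, 0.7, 0.2])

assert np.all((y_prob >= 0) & (y_prob <= 1)), "Invalid probabilities in y_prob."

fpr, tpr, thresholds = roc_curve(y_true, y_prob, drop_intermediate=False)
finite = np.isfinite(thresholds)          # newer sklearn prepends an infinite threshold
fpr, tpr = fpr[finite], tpr[finite]
auc = roc_auc_score(y_true, y_prob)

fig = go.Figure(
    data=[
        go.Scatter(x=fpr, y=tpr, mode="lines", name=f"ROC (AUC = {auc:.2f})"),
        go.Scatter(x=[0, 1], y=[0, 1], mode="lines", name="Random",
                   line=dict(dash="dash")),
    ],
    layout=go.Layout(title="ROC Curve", xaxis_title="False Positive Rate",
                     yaxis_title="True Positive Rate"),
)
fig.show()
```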
@@ -130,11 +130,11 @@ class RegressionErrors(Metric):
 
     def run(self):
         y_train_true = self.inputs.datasets[0].y
-        y_train_pred = self.inputs.datasets[0].y_pred(self.inputs.model
+        y_train_pred = self.inputs.datasets[0].y_pred(self.inputs.model)
         y_train_true = y_train_true.astype(y_train_pred.dtype)
 
         y_test_true = self.inputs.datasets[1].y
-        y_test_pred = self.inputs.datasets[1].y_pred(self.inputs.model
+        y_test_pred = self.inputs.datasets[1].y_pred(self.inputs.model)
         y_test_true = y_test_true.astype(y_test_pred.dtype)
 
         results = self.regression_errors(
|