validmind-2.1.0-py3-none-any.whl → validmind-2.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
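For readers who want to regenerate such a comparison locally, a minimal sketch follows. It is not part of the package and rests on a few assumptions: both wheels have already been downloaded (for example with `pip download validmind==2.1.0 --no-deps -d old/` and `pip download validmind==2.2.2 --no-deps -d new/`), the paths below are placeholders, and this is not necessarily how the registry produced the diff shown here.

```python
import difflib
import zipfile


def wheel_files(path):
    """Map each text-like member of a wheel (a zip archive) to its decoded contents."""
    with zipfile.ZipFile(path) as zf:
        return {
            name: zf.read(name).decode("utf-8", errors="replace")
            for name in zf.namelist()
            if name.endswith((".py", ".txt", ".csv"))
        }


old = wheel_files("old/validmind-2.1.0-py3-none-any.whl")
new = wheel_files("new/validmind-2.2.2-py3-none-any.whl")

# Print a unified diff for every member that changed between the two wheels.
for name in sorted(set(old) | set(new)):
    before = old.get(name, "").splitlines(keepends=True)
    after = new.get(name, "").splitlines(keepends=True)
    if before != after:
        print("".join(difflib.unified_diff(before, after, fromfile=name, tofile=name)))
```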
- validmind/__version__.py +1 -1
- validmind/ai.py +3 -3
- validmind/api_client.py +2 -3
- validmind/client.py +68 -25
- validmind/datasets/llm/rag/__init__.py +11 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
- validmind/datasets/llm/rag/rfp.py +41 -0
- validmind/html_templates/__init__.py +0 -0
- validmind/html_templates/content_blocks.py +89 -14
- validmind/models/__init__.py +7 -4
- validmind/models/foundation.py +8 -34
- validmind/models/function.py +51 -0
- validmind/models/huggingface.py +16 -46
- validmind/models/metadata.py +42 -0
- validmind/models/pipeline.py +66 -0
- validmind/models/pytorch.py +8 -42
- validmind/models/r_model.py +33 -82
- validmind/models/sklearn.py +39 -38
- validmind/template.py +8 -26
- validmind/tests/__init__.py +43 -20
- validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
- validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
- validmind/tests/data_validation/Duplicates.py +1 -1
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
- validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
- validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
- validmind/tests/data_validation/nlp/Punctuations.py +11 -12
- validmind/tests/data_validation/nlp/Sentiment.py +57 -0
- validmind/tests/data_validation/nlp/Toxicity.py +45 -0
- validmind/tests/decorator.py +2 -2
- validmind/tests/model_validation/BertScore.py +100 -98
- validmind/tests/model_validation/BleuScore.py +93 -64
- validmind/tests/model_validation/ContextualRecall.py +74 -91
- validmind/tests/model_validation/MeteorScore.py +86 -74
- validmind/tests/model_validation/RegardScore.py +103 -121
- validmind/tests/model_validation/RougeScore.py +118 -0
- validmind/tests/model_validation/TokenDisparity.py +84 -121
- validmind/tests/model_validation/ToxicityScore.py +109 -123
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
- validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
- validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
- validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
- validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
- validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
- validmind/tests/model_validation/ragas/utils.py +66 -0
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
- validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +14 -12
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
- validmind/unit_metrics/__init__.py +26 -49
- validmind/unit_metrics/composite.py +5 -1
- validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
- validmind/utils.py +56 -6
- validmind/vm_models/__init__.py +1 -1
- validmind/vm_models/dataset/__init__.py +7 -0
- validmind/vm_models/dataset/dataset.py +558 -0
- validmind/vm_models/dataset/utils.py +146 -0
- validmind/vm_models/model.py +97 -72
- validmind/vm_models/test/result_wrapper.py +61 -24
- validmind/vm_models/test_context.py +1 -1
- validmind/vm_models/test_suite/summary.py +3 -4
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/METADATA +5 -3
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/RECORD +100 -75
- validmind/models/catboost.py +0 -33
- validmind/models/statsmodels.py +0 -50
- validmind/models/xgboost.py +0 -30
- validmind/tests/model_validation/BertScoreAggregate.py +0 -90
- validmind/tests/model_validation/RegardHistogram.py +0 -148
- validmind/tests/model_validation/RougeMetrics.py +0 -147
- validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
- validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
- validmind/tests/model_validation/ToxicityHistogram.py +0 -136
- validmind/vm_models/dataset.py +0 -1303
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/LICENSE +0 -0
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/WHEEL +0 -0
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/RegardHistogram.py (removed)
@@ -1,148 +0,0 @@
-# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
-# See the LICENSE file in the root of this repository for details.
-# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
-import itertools
-from dataclasses import dataclass
-
-import evaluate
-import plotly.graph_objects as go
-from plotly.subplots import make_subplots
-
-from validmind.vm_models import Figure, Metric
-
-
-@dataclass
-class RegardHistogram(Metric):
-    """
-    **Purpose:**
-    The `RegardHistogram` metric offers a histogram-based visualization of regard scores across different text
-    samples. As an evolution from the line plot representation, the histogram provides a distributional perspective
-    on how often certain regard scores (positive, negative, neutral, or other) are perceived or generated by the model.
-
-    **Test Mechanism:**
-    This metric extracts the necessary data from the model's test dataset: the input text, the true regard (target text),
-    and the model's predicted regard. After ensuring data consistency, the `evaluate.load("regard")` tool computes the
-    regard scores. Histograms are then created for each category of regard against input, target, and predicted texts.
-    These histograms illustrate the frequency distribution of scores within each category, shedding light on commonalities
-    or outliers in the model's performance.
-
-    **Signs of High Risk:**
-    Any noticeable skewness in the histogram, especially when comparing the predicted regard scores with the target regard
-    scores, could indicate biases or inconsistencies in the model. For instance, a lack of neutral scores in the model's
-    predictions, despite a balanced distribution in the target data, might signal an issue.
-
-    **Strengths:**
-    Histogram representations give a quick, intuitive snapshot of score distributions. The immediate visual contrast
-    between the model's predictions and target data is useful for stakeholders and researchers aiming to gauge
-    model reliability in regard to sentiment assessments.
-
-    **Limitations:**
-    The efficacy of `RegardHistogram` hinges upon the precision of underlying tools like `evaluate.load("regard")`, which
-    might have their biases or inaccuracies. The metric portrays regard in set categories, but sentiments in real-world scenarios
-    can be more nuanced. Assumptions made, like the expectation of consistent regard across texts, may not always hold, given that
-    sentiments can be subjective. While histograms showcase distributions, they may not capture the intricate contexts behind
-    texts, possibly leading to oversimplifications or misinterpretations.
-    """
-
-    name = "regard_histogram"
-    required_inputs = ["model", "dataset"]
-    metadata = {
-        "task_types": ["text_classification", "text_summarization"],
-        "tags": ["regard_histogram"],
-    }
-
-    def _get_datasets(self):
-        if not hasattr(self, "model"):
-            raise AttributeError("The 'model' attribute is missing.")
-
-        y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
-
-        if not len(y_true) == len(y_pred):
-            raise ValueError(
-                "Inconsistent lengths among true summaries and predicted summaries."
-            )
-
-        return y_true, y_pred
-
-    def regard_histogram(self):
-        regard_tool = evaluate.load("regard")
-        y_true, y_pred = self._get_datasets()
-
-        dataframes = {
-            "Target Text": y_true,
-            "Predicted Summaries": y_pred,
-        }
-
-        total_text_columns = len(dataframes)
-        total_rows = total_text_columns * 2
-
-        categories_order = ["positive", "negative", "neutral", "other"]
-        category_colors = {
-            "negative": "#d9534f",
-            "neutral": "#5bc0de",
-            "other": "#f0ad4e",
-            "positive": "#5cb85c",
-        }
-
-        fig = make_subplots(
-            rows=total_rows,
-            cols=2,
-            subplot_titles=[
-                f"{col_name} {cat}"
-                for col_name in dataframes
-                for cat in categories_order
-            ],
-            shared_xaxes=True,
-            vertical_spacing=0.1,
-        )
-
-        row_offset = 0
-
-        for column_name, column_data in dataframes.items():
-            results = regard_tool.compute(data=column_data)["regard"]
-            regard_dicts = [
-                dict((x["label"], x["score"]) for x in sublist) for sublist in results
-            ]
-
-            for idx, category in enumerate(categories_order, start=1):
-                row, col = ((idx - 1) // 2 + 1 + row_offset, (idx - 1) % 2 + 1)
-                fig.add_trace(
-                    go.Histogram(
-                        name=f"{category} ({column_name})",
-                        x=[res_dict[category] for res_dict in regard_dicts],
-                        marker_color=category_colors[category],
-                        showlegend=False,  # Disable the legend
-                    ),
-                    row=row,
-                    col=col,
-                )
-            row_offset += 2  # Move to the next pair of rows for the next text column
-
-        subplot_height = 350
-        total_height = (
-            total_rows * subplot_height + 200
-        )  # 200 for padding, titles, etc.
-
-        fig.update_layout(
-            title_text="Regard Score Histogram Distribution", height=total_height
-        )
-
-        # Specify x and y titles only for the first subplot
-        fig.update_xaxes(title_text="Index", showticklabels=True, row=1, col=1)
-        fig.update_yaxes(title_text="Score", showticklabels=True, row=1, col=1)
-
-        # Show tick labels on all subplots
-        for row in range(total_rows):
-            for col in range(2):  # since you have 2 columns
-                fig.update_xaxes(showticklabels=True, row=row + 1, col=col + 1)
-                fig.update_yaxes(showticklabels=True, row=row + 1, col=col + 1)
-
-        return fig
-
-    def run(self):
-        fig = self.regard_histogram()
-        return self.cache_results(
-            figures=[Figure(for_object=self, key=self.key, figure=fig)]
-        )
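For reference, the scoring call at the heart of the removed RegardHistogram test can be exercised on its own. The sketch below is illustrative only (the example texts are hypothetical); it uses the same `evaluate` "regard" measurement and the same result structure the class above relied on.

```python
import evaluate

# Load the same "regard" measurement the removed test used.
regard_tool = evaluate.load("regard")

# Hypothetical example texts standing in for the target/predicted columns.
texts = [
    "The engineers were praised for their careful, thorough work.",
    "The applicants were dismissed without any explanation.",
]

# compute() returns, per text, a list of {"label": ..., "score": ...} entries,
# one for each regard category (positive, negative, neutral, other).
results = regard_tool.compute(data=texts)["regard"]
per_text_scores = [{item["label"]: item["score"] for item in sublist} for sublist in results]

for text, scores in zip(texts, per_text_scores):
    print(text, "->", scores)
```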
validmind/tests/model_validation/RougeMetrics.py (removed)
@@ -1,147 +0,0 @@
-# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
-# See the LICENSE file in the root of this repository for details.
-# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
-import itertools
-from dataclasses import dataclass
-
-import pandas as pd
-import plotly.graph_objects as go
-from rouge import Rouge
-
-from validmind.vm_models import Figure, Metric
-
-
-@dataclass
-class RougeMetrics(Metric):
-    """
-    Evaluates the quality of machine-generated text using various ROUGE metrics, and visualizes the results.
-
-    **Purpose**: The ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, is a metric employed to assess the
-    quality of machine-generated text. This evaluation technique is mainly used in natural language processing tasks,
-    such as text summarization, machine translation, and text generation. Its goal is to measure how well the
-    machine-generated text reflects the key information and concepts in the human-crafted reference text.
-
-    **Test Mechanism**:
-
-    1. **Comparison Procedure**: The testing mechanism involves comparing machine-generated content with a reference
-    human-constructed text.
-
-    2. **Integral Metrics**:
-    - **ROUGE-N (N-gram Overlap)**: This evaluates the overlap of n-grams (sequences of n words) between the
-    generated and reference texts. The common n-values are 1 (unigrams), 2 (bigrams), and 3 (trigrams). Each metric
-    calculates precision, recall, and F1-score.
-
-    - **ROUGE-L (Longest Common Subsequence)**: This identifies the longest shared word sequence in both the machine
-    and reference texts, thus evaluating the capability of the generated text to mirror key phrases.
-
-    - **ROUGE-S (Skip-bigram)**: This measures the concurrence of skip-bigrams — word pairings that appear within a
-    predefined word window in the text. This metric maintains sensitivity to word order while allowing for sporadic
-    word omissions.
-
-    3. **Visual Representation**: Precision, recall, and F1-score for all the metrics are visually charted, which makes
-    the results easier to comprehend.
-
-    **Signs of High Risk**:
-
-    - Low scores across the suite of ROUGE metrics
-    - Low precision might indicate redundant information in machine-produced text
-    - Low recall may suggest the omission of important information from the reference text
-    - Low F1 score could indicate an imbalanced performance between precision and recall
-    - Persistent low scores could signal inherent flaws in the model
-
-    **Strengths**:
-
-    - Offers a multifaceted perspective on text quality using various evaluation metrics
-    - Adapts to synonyms and rewording, thanks to n-gram-based evaluation
-    - Encourages the retention of key word sequences using the longest common subsequence method
-    - Visual representation of precision, recall, and F1-scores enhances understandability of model performance
-
-    **Limitations**:
-
-    - May fail to fully address the semantic coherence, fluency, or grammatical quality of the generated text
-    - Tends to evaluate isolated phrases or n-grams rather than comprehensive sentences
-    - May prove challenging when reference texts are difficult or impractical to obtain due to its reliance on
-    comparisons with human-made references.
-    """
-
-    name = "rouge_metric"
-    required_inputs = ["model", "dataset"]
-    default_params = {
-        "rouge_metrics": ["rouge-1", "rouge-2", "rouge-l"],
-    }
-
-    def run(self):
-        r_metrics = self.params["rouge_metrics"]
-        if r_metrics is None:
-            raise ValueError("rouge_metrics must be provided in params")
-
-        if not (
-            set(self.default_params.get("rouge_metrics")).intersection(r_metrics)
-            == set(r_metrics)
-        ):
-            raise ValueError(
-                f"Invalid metrics from {self.default_params.get('rouge_metrics')}"
-            )
-
-        y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
-
-        rouge = Rouge(metrics=r_metrics)
-
-        score_list = []
-        for y_t, y_p in zip(y_true, y_pred):
-            scores = rouge.get_scores(y_p, y_t, avg=True)
-            score_list.append(scores)
-
-        metrics_df = pd.DataFrame(score_list)
-        figures = []
-
-        for m in metrics_df.columns:
-            df_scores = pd.DataFrame(metrics_df[m].tolist())
-            # Visualization part
-            fig = go.Figure()
-
-            # Adding the line plots for precision, recall, and F1-score with lines and markers
-            fig.add_trace(
-                go.Scatter(
-                    x=df_scores.index,
-                    y=df_scores["p"],
-                    mode="lines+markers",
-                    name="Precision",
-                )
-            )
-            fig.add_trace(
-                go.Scatter(
-                    x=df_scores.index,
-                    y=df_scores["r"],
-                    mode="lines+markers",
-                    name="Recall",
-                )
-            )
-            fig.add_trace(
-                go.Scatter(
-                    x=df_scores.index,
-                    y=df_scores["f"],
-                    mode="lines+markers",
-                    name="F1 Score",
-                )
-            )
-
-            fig.update_layout(
-                title=f"ROUGE Scores for {m}",
-                xaxis_title="Row Index",
-                yaxis_title="Score",
-            )
-
-            # Ensure a unique key for each metric
-            k = f"{m.replace('-', '')}_{len(figures)}"
-            figures.append(
-                Figure(
-                    for_object=self,
-                    key=k,
-                    figure=fig,
-                )
-            )
-
-        return self.cache_results(figures=figures)
validmind/tests/model_validation/RougeMetricsAggregate.py (removed)
@@ -1,133 +0,0 @@
-# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
-# See the LICENSE file in the root of this repository for details.
-# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
-import itertools
-from dataclasses import dataclass
-
-import pandas as pd
-import plotly.graph_objects as go
-from rouge import Rouge
-
-from validmind.vm_models import Figure, Metric
-
-
-@dataclass
-class RougeMetricsAggregate(Metric):
-    """
-    Evaluates the average quality of machine-generated text using various ROUGE metrics and visualizes the aggregated results.
-
-    **Purpose**: The ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, remains a cornerstone for assessing
-    machine-generated text quality. Predominantly used in tasks such as text summarization, machine translation,
-    and text generation, the emphasis of ROUGE is to gauge the reflection of pivotal information and core concepts
-    from human references in machine-produced content.
-
-    **Test Mechanism**:
-
-    1. **Comparison Procedure**: The evaluation requires contrasting machine-rendered text against a human-made reference.
-
-    2. **Integral Metrics**:
-    - **ROUGE-N (N-gram Overlap)**: Assesses the commonality of n-grams between both sets of texts. Regularly,
-    metrics consider 1 (unigrams), 2 (bigrams), and 3 (trigrams), rendering precision, recall, and F1-score.
-
-    - **ROUGE-L (Longest Common Subsequence)**: Discerns the lengthiest mutually inclusive word chain in both
-    texts, ascertaining the machine text's efficacy in capturing essential phrases.
-
-    - **ROUGE-S (Skip-bigram)**: Quantifies the concurrence of skip-bigrams. This metric cherishes word order
-    but tolerates occasional omissions.
-
-    3. **Visual Representation**: The aggregate approach underscores the visualization of average scores across
-    precision, recall, and F1-score, enhancing result interpretation.
-
-    **Signs of High Risk**:
-
-    - Diminished average scores across ROUGE metrics
-    - Depressed precision may highlight verbosity in machine text
-    - Lacking recall might hint at missed critical details from the reference
-    - A dwindling F1 score might spotlight a disjointed precision-recall performance
-    - Consistently low averages could reveal deep-rooted model inadequacies
-
-    **Strengths**:
-
-    - Provides a holistic view of text quality via diverse metrics
-    - Gracefully handles paraphrasing owing to n-gram evaluations
-    - Promotes the capture of crucial word chains through the longest common subsequence
-    - Aggregate visual insights bolster comprehension of overall model behavior
-
-    **Limitations**:
-
-    - Might overlook nuances like semantic integrity, fluency, or syntactic correctness
-    - Focuses more on discrete phrases or n-grams over holistic sentences
-    - Reliance on human references can be limiting when they're hard to source or infeasible.
-    """
-
-    name = "rouge_metrics_aggregate"
-    required_inputs = ["model", "dataset"]
-    default_params = {
-        "rouge_metrics": ["rouge-1", "rouge-2", "rouge-l"],
-    }
-
-    def run(self):
-        r_metrics = self.params["rouge_metrics"]
-        if r_metrics is None:
-            raise ValueError("rouge_metrics must be provided in params")
-
-        if not (
-            set(self.default_params.get("rouge_metrics")).intersection(r_metrics)
-            == set(r_metrics)
-        ):
-            raise ValueError(
-                f"Invalid metrics from {self.default_params.get('rouge_metrics')}"
-            )
-
-        y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
-
-        rouge = Rouge(metrics=r_metrics)
-
-        score_list = []
-        for y_t, y_p in zip(y_true, y_pred):
-            scores = rouge.get_scores(y_p, y_t, avg=True)
-            score_list.append(scores)
-
-        metrics_df = pd.DataFrame(score_list)
-        figures = []
-
-        colors = {"Precision": "blue", "Recall": "green", "F1 Score": "red"}
-        mapping = {"p": "Precision", "r": "Recall", "f": "F1 Score"}
-
-        for m in metrics_df.columns:
-            df_scores = pd.DataFrame(metrics_df[m].tolist())
-            avg_scores = df_scores.mean()
-
-            # Visualization part
-            fig = go.Figure()
-
-            # Adding the bar plots for average scores with specified colors
-            for metric_short, metric_full in mapping.items():
-                fig.add_trace(
-                    go.Bar(
-                        x=[metric_full],
-                        y=[avg_scores[metric_short]],
-                        name=metric_full,
-                        marker_color=colors[metric_full],
-                        showlegend=False,
-                    )
-                )
-
-            fig.update_layout(
-                title=f"Average ROUGE Scores for {m}",
-                xaxis_title="Metric",
-                yaxis_title="Average Score",
-            )
-
-            k = m.replace("-", "")
-            figures.append(
-                Figure(
-                    for_object=self,
-                    key=k,
-                    figure=fig,
-                )
-            )
-
-        return self.cache_results(figures=figures)
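Both removed ROUGE tests above (per-row RougeMetrics and the averaged RougeMetricsAggregate) reduce to the same call from the `rouge` package. A minimal standalone sketch, using hypothetical reference/candidate pairs rather than a ValidMind dataset:

```python
import pandas as pd
from rouge import Rouge

# Hypothetical reference and candidate texts.
references = ["the cat sat on the mat", "a quick brown fox jumps over the lazy dog"]
candidates = ["the cat is on the mat", "a fast brown fox leaps over the lazy dog"]

# Same metric configuration the removed tests defaulted to.
rouge = Rouge(metrics=["rouge-1", "rouge-2", "rouge-l"])

# One row of scores per candidate/reference pair, keyed by ROUGE variant.
rows = [rouge.get_scores(cand, ref, avg=True) for cand, ref in zip(candidates, references)]
metrics_df = pd.DataFrame(rows)

# Per-row precision/recall/F1 for one variant (columns "p", "r", "f") ...
rouge_1 = pd.DataFrame(metrics_df["rouge-1"].tolist())
print(rouge_1)

# ... and the averaged view that RougeMetricsAggregate plotted.
print(rouge_1.mean())
```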
validmind/tests/model_validation/SelfCheckNLIScore.py (removed)
@@ -1,112 +0,0 @@
-# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
-# See the LICENSE file in the root of this repository for details.
-# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
-import itertools
-from dataclasses import dataclass
-
-import pandas as pd
-import plotly.graph_objects as go
-import torch
-from selfcheckgpt.modeling_selfcheck import SelfCheckNLI
-from tqdm import tqdm
-
-from validmind.vm_models import Figure, Metric
-
-
-@dataclass
-class SelfCheckNLIScore(Metric):
-    """
-    Evaluates text generation models' performance by quantifying the level of hallucination in generated texts compared to reference texts.
-
-    **Purpose**: The HallucinationScore metric is designed to assess the factual accuracy and reliability of text generated by models, focusing on the detection and quantification of hallucinations—instances where generated content deviates from factual or expected outputs. By comparing generated texts against reference texts, this metric highlights discrepancies indicative of hallucinations, offering insights into the model's ability to produce contextually and factually coherent content.
-
-    **Test Mechanism**: To compute the HallucinationScore, the metric employs a comparison between the generated texts (model predictions) and the provided reference texts (true values). Using the SelfCheckNLI model, it evaluates each generated text's level of factual congruence with the reference, assigning a hallucination score based on the semantic coherence and factual accuracy. The scores for each text instance are then visualized in a line plot, allowing for the examination of hallucination trends across the dataset.
-
-    **Signs of High Risk**:
-    - High hallucination scores across a significant portion of the dataset, indicating a prevalence of factually inaccurate or irrelevant content generation.
-    - Patterns of consistent hallucination in specific contexts or subjects, suggesting gaps in the model's understanding or knowledge.
-    - Sharp fluctuations in hallucination scores, which may reveal inconsistencies in the model's performance or sensitivity to certain types of input.
-
-    **Strengths**:
-    - Directly addresses the critical aspect of factual accuracy in generated text, beyond mere linguistic or stylistic coherence.
-    - Provides a granular, instance-by-instance analysis of model performance, allowing for targeted improvements and diagnostics.
-    - Facilitates a deeper understanding of a model's capabilities and limitations in producing reliable and accurate content.
-
-    **Limitations**:
-    - Reliance on the SelfCheckNLI model means the accuracy and effectiveness of the HallucinationScore are contingent upon the performance and suitability of the underlying NLI model.
-    - May not fully capture the subtleties of certain factual inaccuracies or the contextual relevance of reference texts, especially in complex or nuanced domains.
-    - Potentially resource-intensive, given the computational demands of running advanced NLI models for large datasets.
-    """
-
-    name = "self_check_nli_score"
-    required_inputs = ["model", "dataset"]
-
-    def run(self):
-        # Assuming the dataset is structured with generated sentences and reference samples
-        y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
-
-        hallucination_scores = self.compute_hallucination_scores(y_pred, y_true)
-
-        # Visualization of scores
-        figures = self.visualize_scores(hallucination_scores)
-
-        return self.cache_results(figures=figures)
-
-    def compute_hallucination_scores(self, predictions, references):
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        selfcheck_nli = SelfCheckNLI(device=device)
-        hallucination_scores = []
-
-        print("Starting hallucination score computation...")
-
-        for index, (sentences, samples) in enumerate(
-            tqdm(zip(predictions, references), total=len(predictions))
-        ):
-            sent_scores_nli = selfcheck_nli.predict(
-                sentences=sentences, sampled_passages=samples
-            )
-
-            # Compute the mean of the hallucination scores for this row
-            average_score = sent_scores_nli.mean()
-            hallucination_scores.append(average_score)
-
-            # Print a progress update for each row
-            print(
-                f"Row {index + 1}/{len(predictions)}: Average hallucination score: {average_score}"
-            )
-
-        print("Completed hallucination score computation.")
-
-        return hallucination_scores
-
-    def visualize_scores(self, scores):
-        scores_df = pd.DataFrame(scores, columns=["Hallucination Score"])
-
-        fig = go.Figure()
-        fig.add_trace(
-            go.Scatter(
-                x=scores_df.index,
-                y=scores_df["Hallucination Score"],
-                mode="lines+markers",
-                name="Hallucination Score",
-            )
-        )
-
-        fig.update_layout(
-            title="Hallucination Scores Across Text Instances",
-            xaxis_title="Text Instance Index",
-            yaxis_title="Hallucination Score",
-        )
-
-        # Wrapping the plotly figure for compatibility with your framework might be needed
-        figures = [
-            Figure(
-                for_object=self,
-                key=self.key,
-                figure=fig,
-            )
-        ]
-
-        return figures
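The core of the removed SelfCheckNLIScore test is a single call into the `selfcheckgpt` package. A minimal sketch follows; the sentences and sampled passages are hypothetical stand-ins for the dataset rows the removed test iterated over.

```python
import torch
from selfcheckgpt.modeling_selfcheck import SelfCheckNLI

# Same device selection and model setup as the removed test.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
selfcheck_nli = SelfCheckNLI(device=device)

# Hypothetical generated sentences to check, and sampled passages to check them against.
sentences = [
    "The Eiffel Tower is located in Paris.",
    "It was completed in 1999.",
]
sampled_passages = [
    "The Eiffel Tower, completed in 1889, stands in Paris, France.",
    "Construction of the tower in Paris finished in 1889.",
]

# predict() returns one score per sentence; the removed test averaged these into a
# per-row hallucination score.
sent_scores = selfcheck_nli.predict(sentences=sentences, sampled_passages=sampled_passages)
print(sent_scores, sent_scores.mean())
```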
validmind/tests/model_validation/ToxicityHistogram.py (removed)
@@ -1,136 +0,0 @@
-# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
-# See the LICENSE file in the root of this repository for details.
-# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
-import itertools
-from dataclasses import dataclass
-
-import evaluate
-import pandas as pd
-import plotly.graph_objects as go
-import plotly.subplots as sp
-
-from validmind.vm_models import Figure, Metric
-
-
-@dataclass
-class ToxicityHistogram(Metric):
-    """
-    **Purpose:**
-    The ToxicityHistogram metric visualizes and analyzes the toxicity scores of various texts. Through histograms, it
-    provides insights into the distribution and nature of toxicity present in the evaluated text segments.
-
-    **Test Mechanism:**
-    Texts are fetched from specified columns and their toxicity scores are computed using a preloaded `toxicity`
-    evaluation tool. Each text data column is visualized with its own histogram, culminating in a multi-panel
-    visualization.
-
-    **Signs of High Risk:**
-    High toxicity concentrations in the histogram, especially on the upper scale, signify a higher presence of toxic
-    content in the respective text segment. If predicted summaries show significantly differing patterns from input or
-    target texts, it could indicate issues with the model's output.
-
-    **Strengths:**
-    The metric offers a lucid representation of toxicity distributions, facilitating the swift identification of
-    concerning patterns. It's instrumental for gauging potential pitfalls of generated content, particularly in the
-    realm of predicted summaries.
-
-    **Limitations:**
-    The ToxicityHistogram's efficacy hinges on the accuracy of the `toxicity` tool it employs. While histograms depict
-    distribution patterns, they omit details about which specific text portions or tokens result in high toxicity
-    scores. Therefore, for a comprehensive understanding, more in-depth analysis might be requisite.
-    """
-
-    name = "toxicity_histogram"
-    required_inputs = ["model"]
-    metadata = {
-        "task_types": [
-            "text_classification",
-            "text_summarization",
-        ],
-        "tags": ["toxicity_histogram"],
-    }
-
-    def _get_datasets(self):
-        # Check model attributes
-        if not hasattr(self, "model"):
-            raise AttributeError("The 'model' attribute is missing.")
-
-        y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
-        input_text = self.inputs.dataset.df[self.inputs.dataset.text_column]
-
-        # Ensure consistency in lengths
-        if not len(y_true) == len(y_pred) == len(input_text):
-            raise ValueError(
-                "Inconsistent lengths among input text, true summaries, and predicted summaries."
-            )
-
-        return input_text, y_true, y_pred
-
-    def toxicity_histograms(self, df):
-        """
-        Compute toxicity scores for texts and then plot histograms for all columns of df.
-
-        Parameters:
-        - df (pd.DataFrame): The dataframe containing texts.
-        """
-
-        # Extract necessary parameters
-        toxicity = evaluate.load("toxicity")
-
-        # Get all columns of df
-        text_columns = df.columns.tolist()
-
-        # Determine the number of rows required based on the number of text columns
-        num_rows = (len(text_columns) + 1) // 2  # +1 to handle odd number of columns
-
-        # Create a subplot layout
-        fig = sp.make_subplots(rows=num_rows, cols=2, subplot_titles=text_columns)
-
-        subplot_height = 350  # Height of each subplot
-        total_height = num_rows * subplot_height + 200  # 200 for padding, titles, etc.
-
-        for idx, col in enumerate(text_columns, start=1):
-            row = (idx - 1) // 2 + 1
-            col_idx = (idx - 1) % 2 + 1  # to place subplots in two columns
-
-            # Get list of texts from dataframe
-            texts = df[col].tolist()
-
-            # Compute toxicity for texts
-            toxicity_scores = toxicity.compute(predictions=texts)["toxicity"]
-
-            # Add traces to the corresponding subplot without legend
-            fig.add_trace(
-                go.Histogram(x=toxicity_scores, showlegend=False), row=row, col=col_idx
-            )
-
-            # Update xaxes and yaxes titles only for the first subplot
-            if idx == 1:
-                fig.update_xaxes(title_text="Toxicity Score", row=row, col=col_idx)
-                fig.update_yaxes(title_text="Frequency", row=row, col=col_idx)
-
-        # Update layout
-        fig.update_layout(
-            title_text="Histograms of Toxicity Scores", height=total_height
-        )
-
-        return fig
-
-    def run(self):
-        input_text, y_true, y_pred = self._get_datasets()
-
-        df = pd.DataFrame(
-            {
-                "Input Text": input_text,
-                "Target Text": y_true,
-                "Predicted Summaries": y_pred,
-            }
-        )
-
-        fig = self.toxicity_histograms(df)
-
-        return self.cache_results(
-            figures=[Figure(for_object=self, key=self.key, figure=fig)]
-        )
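Finally, the scoring step inside the removed ToxicityHistogram test can also be run on its own. A minimal sketch with hypothetical texts, using the same `evaluate` "toxicity" measurement the class loaded:

```python
import evaluate

toxicity = evaluate.load("toxicity")

# Hypothetical texts standing in for the input/target/predicted columns.
texts = [
    "Have a wonderful day!",
    "You are a terrible person and everyone knows it.",
]

# compute() returns one toxicity score per text; the removed test binned these into histograms.
scores = toxicity.compute(predictions=texts)["toxicity"]
for text, score in zip(texts, scores):
    print(f"{score:.3f}  {text}")
```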