validmind 2.1.1__py3-none-any.whl → 2.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai.py +72 -49
- validmind/api_client.py +42 -16
- validmind/client.py +68 -25
- validmind/datasets/llm/rag/__init__.py +11 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
- validmind/datasets/llm/rag/rfp.py +41 -0
- validmind/errors.py +1 -1
- validmind/html_templates/__init__.py +0 -0
- validmind/html_templates/content_blocks.py +89 -14
- validmind/models/__init__.py +7 -4
- validmind/models/foundation.py +8 -34
- validmind/models/function.py +51 -0
- validmind/models/huggingface.py +16 -46
- validmind/models/metadata.py +42 -0
- validmind/models/pipeline.py +66 -0
- validmind/models/pytorch.py +8 -42
- validmind/models/r_model.py +33 -82
- validmind/models/sklearn.py +39 -38
- validmind/template.py +8 -26
- validmind/tests/__init__.py +43 -20
- validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
- validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
- validmind/tests/data_validation/Duplicates.py +1 -1
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
- validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
- validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
- validmind/tests/data_validation/nlp/Punctuations.py +11 -12
- validmind/tests/data_validation/nlp/Sentiment.py +57 -0
- validmind/tests/data_validation/nlp/Toxicity.py +45 -0
- validmind/tests/decorator.py +12 -7
- validmind/tests/model_validation/BertScore.py +100 -98
- validmind/tests/model_validation/BleuScore.py +93 -64
- validmind/tests/model_validation/ContextualRecall.py +74 -91
- validmind/tests/model_validation/MeteorScore.py +86 -74
- validmind/tests/model_validation/RegardScore.py +103 -121
- validmind/tests/model_validation/RougeScore.py +118 -0
- validmind/tests/model_validation/TokenDisparity.py +84 -121
- validmind/tests/model_validation/ToxicityScore.py +109 -123
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
- validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
- validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
- validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
- validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
- validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
- validmind/tests/model_validation/ragas/utils.py +66 -0
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
- validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -11
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
- validmind/unit_metrics/__init__.py +26 -49
- validmind/unit_metrics/composite.py +13 -7
- validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
- validmind/utils.py +99 -6
- validmind/vm_models/__init__.py +1 -1
- validmind/vm_models/dataset/__init__.py +7 -0
- validmind/vm_models/dataset/dataset.py +560 -0
- validmind/vm_models/dataset/utils.py +146 -0
- validmind/vm_models/model.py +97 -72
- validmind/vm_models/test/metric.py +9 -24
- validmind/vm_models/test/result_wrapper.py +124 -28
- validmind/vm_models/test/threshold_test.py +10 -28
- validmind/vm_models/test_context.py +1 -1
- validmind/vm_models/test_suite/summary.py +3 -4
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/METADATA +5 -3
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/RECORD +103 -78
- validmind/models/catboost.py +0 -33
- validmind/models/statsmodels.py +0 -50
- validmind/models/xgboost.py +0 -30
- validmind/tests/model_validation/BertScoreAggregate.py +0 -90
- validmind/tests/model_validation/RegardHistogram.py +0 -148
- validmind/tests/model_validation/RougeMetrics.py +0 -147
- validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
- validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
- validmind/tests/model_validation/ToxicityHistogram.py +0 -136
- validmind/vm_models/dataset.py +0 -1303
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/LICENSE +0 -0
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/WHEEL +0 -0
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/MeteorScore.py
@@ -2,91 +2,103 @@ (updated implementation in 2.2.4)

# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

import evaluate
import pandas as pd
import plotly.graph_objects as go

from validmind import tags, tasks


@tags("nlp", "text_data", "visualization")
@tasks("text_classification", "text_summarization")
def MeteorScore(dataset, model):
    """
    Computes and visualizes the METEOR score for each text generation instance, assessing translation quality.

    **Purpose:**
    METEOR (Metric for Evaluation of Translation with Explicit ORdering) is designed to evaluate the quality of machine translations
    by comparing them against reference translations. It emphasizes both the accuracy and fluency of translations, incorporating
    precision, recall, and word order into its assessment.

    **Test Mechanism:**
    The function starts by extracting the true and predicted values from the provided dataset and model. The METEOR score is computed
    for each pair of machine-generated translation (prediction) and its corresponding human-produced reference. This is done by
    considering unigram matches between the translations, including matches based on surface forms, stemmed forms, and synonyms.
    The score is a combination of unigram precision and recall, adjusted for word order through a fragmentation penalty. Scores are
    compiled into a dataframe, and histograms and bar charts are generated to visualize the distribution of METEOR scores. Additionally,
    a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is compiled for the METEOR scores,
    providing a comprehensive summary of the model's performance.

    **Signs of High Risk:**
    - Lower METEOR scores can indicate a lack of alignment between the machine-generated translations and their human-produced
    references, highlighting potential deficiencies in both the accuracy and fluency of translations.
    - Significant discrepancies in word order or an excessive fragmentation penalty could signal issues with how the translation model
    processes and reconstructs sentence structures, potentially compromising the natural flow of translated text.
    - Persistent underperformance across a variety of text types or linguistic contexts might suggest a broader inability of the model
    to adapt to the nuances of different languages or dialects, pointing towards gaps in its training or inherent limitations.

    **Strengths:**
    - Incorporates a balanced consideration of precision and recall, weighted towards recall to reflect the importance of content coverage in translations.
    - Directly accounts for word order, offering a nuanced evaluation of translation fluency beyond simple lexical matching.
    - Adapts to various forms of lexical similarity, including synonyms and stemmed forms, allowing for flexible matching.

    **Limitations:**
    - While comprehensive, the complexity of METEOR's calculation can make it computationally intensive, especially for large datasets.
    - The use of external resources for synonym and stemming matching may introduce variability based on the resources' quality and
    relevance to the specific translation task.
    """

    # Extract true and predicted values
    y_true = dataset.y
    y_pred = dataset.y_pred(model)

    # Load the METEOR evaluation metric
    meteor = evaluate.load("meteor")

    # Calculate METEOR scores
    score_list = []
    for y_t, y_p in zip(y_true, y_pred):
        # Compute the METEOR score
        score = meteor.compute(predictions=[y_p], references=[y_t])
        score_list.append(score["meteor"])

    # Convert scores to a dataframe
    metrics_df = pd.DataFrame(score_list, columns=["METEOR Score"])

    figures = []

    # Histogram for METEOR Score
    hist_fig = go.Figure(data=[go.Histogram(x=metrics_df["METEOR Score"])])
    hist_fig.update_layout(
        title="METEOR Score Histogram",
        xaxis_title="METEOR Score",
        yaxis_title="Count",
    )
    figures.append(hist_fig)

    # Bar Chart for METEOR Score
    bar_fig = go.Figure(data=[go.Bar(x=metrics_df.index, y=metrics_df["METEOR Score"])])
    bar_fig.update_layout(
        title="METEOR Score Bar Chart",
        xaxis_title="Row Index",
        yaxis_title="METEOR Score",
    )
    figures.append(bar_fig)

    # Calculate statistics for METEOR Score
    stats_df = metrics_df.describe().loc[["mean", "50%", "max", "min", "std"]]
    stats_df = stats_df.rename(
        index={
            "mean": "Mean Score",
            "50%": "Median Score",
            "max": "Max Score",
            "min": "Min Score",
            "std": "Standard Deviation",
        }
    ).T
    stats_df["Count"] = len(metrics_df)

    # Create a DataFrame from all collected statistics
    result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})

    return (result_df, *tuple(figures))
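For context on what the per-row loop in `MeteorScore` computes, the short sketch below calls the same Hugging Face `evaluate` METEOR metric on a single prediction/reference pair. It is a minimal illustration, assuming the `evaluate` package (and the NLTK data METEOR pulls in) is installed; the example strings are hypothetical.

```python
import evaluate

# Load the METEOR metric from the Hugging Face evaluate hub,
# the same loader used inside MeteorScore.
meteor = evaluate.load("meteor")

# One hypothetical prediction/reference pair, mirroring the per-row call
# meteor.compute(predictions=[y_p], references=[y_t]) in the loop above.
result = meteor.compute(
    predictions=["the cat sat on the mat"],
    references=["a cat was sitting on the mat"],
)

# The score is returned under the "meteor" key as a float between 0 and 1.
print(result["meteor"])
```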
validmind/tests/model_validation/RegardScore.py
@@ -2,142 +2,124 @@ (updated implementation in 2.2.4)

# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

import evaluate
import pandas as pd
import plotly.graph_objects as go

from validmind import tags, tasks


@tags("nlp", "text_data", "visualization")
@tasks("text_classification", "text_summarization")
def RegardScore(dataset, model):
    """
    Computes and visualizes the regard score for each text instance, assessing sentiment and potential biases.

    **Purpose:**
    The `RegardScore` metric is designed to evaluate the regard levels (positive, negative, neutral, or other) of texts generated by
    models. This helps in understanding the sentiment and biases in the generated content.

    **Test Mechanism:**
    The function starts by extracting the true and predicted values from the provided dataset and model. The regard scores are
    computed for each text using a preloaded `regard` evaluation tool. The scores are compiled into dataframes, and histograms and
    bar charts are generated to visualize the distribution of regard scores. Additionally, a table of descriptive statistics (mean,
    median, standard deviation, minimum, and maximum) is compiled for the regard scores, providing a comprehensive summary of the
    model's performance.

    **Signs of High Risk:**
    - Noticeable skewness in the histogram, especially when comparing the predicted regard scores with the target regard scores,
    could indicate biases or inconsistencies in the model.
    - Lack of neutral scores in the model's predictions, despite a balanced distribution in the target data, might signal an issue.

    **Strengths:**
    - Provides a clear evaluation of regard levels in generated texts, helping to ensure content appropriateness.
    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of regard scores.
    - Descriptive statistics offer a concise summary of the model's performance in generating texts with balanced sentiments.

    **Limitations:**
    - The accuracy of the regard scores is contingent upon the underlying `regard` tool.
    - The scores provide a broad overview but do not specify which portions or tokens of the text are responsible for high regard.
    - Supplementary, in-depth analysis might be needed for granular insights.
    """

    # Extract true and predicted values
    y_true = dataset.y
    y_pred = dataset.y_pred(model)

    # Load the regard evaluation metric
    regard_tool = evaluate.load("regard")

    # Function to calculate regard scores
    def compute_regard_scores(texts):
        scores = regard_tool.compute(data=texts)["regard"]
        regard_dicts = [
            dict((x["label"], x["score"]) for x in sublist) for sublist in scores
        ]
        return regard_dicts

    # Calculate regard scores for true and predicted texts
    true_regard = compute_regard_scores(y_true)
    pred_regard = compute_regard_scores(y_pred)

    # Convert scores to dataframes
    true_df = pd.DataFrame(true_regard)
    pred_df = pd.DataFrame(pred_regard)

    figures = []

    # Function to create histogram and bar chart for regard scores
    def create_figures(df, title):
        for category in df.columns:
            # Histogram
            hist_fig = go.Figure(data=[go.Histogram(x=df[category])])
            hist_fig.update_layout(
                title=f"{title} - {category.capitalize()} Histogram",
                xaxis_title=category.capitalize(),
                yaxis_title="Count",
            )
            figures.append(hist_fig)

            # Bar Chart
            bar_fig = go.Figure(data=[go.Bar(x=df.index, y=df[category])])
            bar_fig.update_layout(
                title=f"{title} - {category.capitalize()} Bar Chart",
                xaxis_title="Text Instance Index",
                yaxis_title=category.capitalize(),
            )
            figures.append(bar_fig)

    # Create figures for each regard score dataframe
    create_figures(true_df, "True Text Regard")
    create_figures(pred_df, "Predicted Text Regard")

    # Calculate statistics for each regard score dataframe
    def calculate_stats(df, metric_name):
        stats = df.describe().loc[["mean", "50%", "max", "min", "std"]].T
        stats.columns = [
            "Mean Score",
            "Median Score",
            "Max Score",
            "Min Score",
            "Standard Deviation",
        ]
        stats["Metric"] = metric_name
        stats["Count"] = len(df)
        return stats

    true_stats = calculate_stats(true_df, "True Text Regard")
    pred_stats = calculate_stats(pred_df, "Predicted Text Regard")

    # Combine statistics into a single dataframe
    result_df = (
        pd.concat([true_stats, pred_stats])
        .reset_index()
        .rename(columns={"index": "Category"})
    )
    result_df = result_df[
        [
            "Metric",
            "Category",
            "Mean Score",
            "Median Score",
            "Max Score",
            "Min Score",
            "Standard Deviation",
            "Count",
        ]
    ]

    return (result_df, *tuple(figures))
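To make the structure that `compute_regard_scores` flattens a bit more concrete, here is a minimal sketch of the underlying `regard` measurement from the `evaluate` package, assuming its model weights can be downloaded. The sample sentences are hypothetical, and the exact label set comes from the underlying regard classifier.

```python
import evaluate

# Load the regard measurement used by RegardScore.
regard_tool = evaluate.load("regard")

texts = [
    "This product exceeded my expectations.",
    "The service was disappointing.",
]

# compute() returns, for each input text, a list of {"label", "score"} dicts,
# one per regard category (e.g. positive, negative, neutral, other).
scores = regard_tool.compute(data=texts)["regard"]

# Flatten each per-text list into a {label: score} dict, as RegardScore does
# before building its per-category dataframes.
regard_dicts = [dict((x["label"], x["score"]) for x in sublist) for sublist in scores]
print(regard_dicts)
```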
validmind/tests/model_validation/RougeScore.py
@@ -0,0 +1,118 @@ (new file in 2.2.4)

# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

import pandas as pd
import plotly.graph_objects as go
from rouge import Rouge

from validmind import tags, tasks


@tags("nlp", "text_data", "visualization")
@tasks("text_classification", "text_summarization")
def RougeScore(dataset, model, metric="rouge-1"):
    """
    Evaluates the quality of machine-generated text using ROUGE metrics and visualizes the results through histograms
    and bar charts, alongside compiling a comprehensive table of descriptive statistics for each ROUGE metric.

    **Purpose:**
    This function is designed to assess the quality of text generated by machine learning models using various ROUGE metrics.
    ROUGE, which stands for Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics used to evaluate the
    overlap of n-grams, word sequences, and word pairs between the machine-generated text and reference texts. This evaluation
    is crucial for tasks such as text summarization, machine translation, and text generation, where the goal is to produce text
    that accurately reflects the content and meaning of human-crafted references.

    **Test Mechanism:**
    The function starts by extracting the true and predicted values from the provided dataset and model. It then initializes the ROUGE
    evaluator with the specified metric (e.g., ROUGE-1). For each pair of true and predicted texts, the function calculates the ROUGE
    scores and compiles them into a dataframe. Histograms and bar charts are generated for each ROUGE metric (Precision, Recall, and
    F1 Score) to visualize their distribution. Additionally, a table of descriptive statistics (mean, median, standard deviation,
    minimum, and maximum) is compiled for each metric, providing a comprehensive summary of the model's performance.

    **Signs of High Risk:**

    - Consistently low scores across ROUGE metrics could indicate poor quality in the generated text, suggesting that the model fails
    to capture the essential content of the reference texts.
    - Low precision scores might suggest that the generated text contains a lot of redundant or irrelevant information.
    - Low recall scores may indicate that important information from the reference text is being omitted.
    - An imbalanced performance between precision and recall, reflected by a low F1 Score, could signal issues in the model's ability
    to balance informativeness and conciseness.

    **Strengths:**

    - Provides a multifaceted evaluation of text quality through different ROUGE metrics, offering a detailed view of model performance.
    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the scores.
    - Descriptive statistics offer a concise summary of the model's strengths and weaknesses in generating text.

    **Limitations:**

    - ROUGE metrics primarily focus on n-gram overlap and may not fully capture semantic coherence, fluency, or grammatical quality of the text.
    - The evaluation relies on the availability of high-quality reference texts, which may not always be obtainable.
    - While useful for comparison, ROUGE scores alone do not provide a complete assessment of a model's performance and should be
    supplemented with other metrics and qualitative analysis.
    """

    # Extract true and predicted values
    y_true = dataset.y
    y_pred = dataset.y_pred(model)

    # Initialize Rouge with the specified metric
    rouge = Rouge(metrics=[metric])

    # Calculate ROUGE scores
    score_list = []
    for y_t, y_p in zip(y_true, y_pred):
        scores = rouge.get_scores(y_p, y_t, avg=True)
        score_list.append(scores)

    # Convert scores to a dataframe
    metrics_df = pd.DataFrame(score_list)
    df_scores = pd.DataFrame(metrics_df[metric].tolist())

    # Generate histograms and bar charts for each score type
    score_types = ["p", "r", "f"]
    score_names = ["Precision", "Recall", "F1 Score"]
    figures = []

    for score_type, score_name in zip(score_types, score_names):
        # Histogram
        hist_fig = go.Figure(data=[go.Histogram(x=df_scores[score_type])])
        hist_fig.update_layout(
            title=f"{score_name} Histogram for {metric.upper()}",
            xaxis_title=score_name,
            yaxis_title="Count",
        )
        figures.append(hist_fig)

        # Bar Chart
        bar_fig = go.Figure(data=[go.Bar(x=df_scores.index, y=df_scores[score_type])])
        bar_fig.update_layout(
            title=f"{score_name} Bar Chart for {metric.upper()}",
            xaxis_title="Row Index",
            yaxis_title=score_name,
        )
        figures.append(bar_fig)

    # Calculate statistics for each score type
    stats_df = df_scores.describe().loc[["mean", "50%", "max", "min", "std"]]
    stats_df = stats_df.rename(
        index={
            "mean": "Mean Score",
            "50%": "Median Score",
            "max": "Max Score",
            "min": "Min Score",
            "std": "Standard Deviation",
        }
    ).T
    stats_df["Count"] = len(df_scores)

    # Rename metrics for clarity
    stats_df.index = stats_df.index.map(
        {"p": "Precision", "r": "Recall", "f": "F1 Score"}
    )

    # Create a DataFrame from all collected statistics
    result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})

    return (result_df, *tuple(figures))
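The nested dictionary that `RougeScore` unpacks into `df_scores` comes from the `rouge` package imported above; the sketch below shows its shape for a single hypothetical hypothesis/reference pair.

```python
from rouge import Rouge

# Restrict the scorer to one metric, as RougeScore does with metric="rouge-1".
rouge = Rouge(metrics=["rouge-1"])

hypothesis = "the cat sat on the mat"
reference = "a cat was sitting on the mat"

# With avg=True, get_scores returns one averaged entry per metric, keyed by
# "r" (recall), "p" (precision) and "f" (F1), which RougeScore later renames.
scores = rouge.get_scores(hypothesis, reference, avg=True)
print(scores)  # e.g. {"rouge-1": {"r": ..., "p": ..., "f": ...}}
```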