validmind 2.1.0__py3-none-any.whl → 2.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai.py +3 -3
- validmind/api_client.py +2 -3
- validmind/client.py +68 -25
- validmind/datasets/llm/rag/__init__.py +11 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
- validmind/datasets/llm/rag/rfp.py +41 -0
- validmind/html_templates/__init__.py +0 -0
- validmind/html_templates/content_blocks.py +89 -14
- validmind/models/__init__.py +7 -4
- validmind/models/foundation.py +8 -34
- validmind/models/function.py +51 -0
- validmind/models/huggingface.py +16 -46
- validmind/models/metadata.py +42 -0
- validmind/models/pipeline.py +66 -0
- validmind/models/pytorch.py +8 -42
- validmind/models/r_model.py +33 -82
- validmind/models/sklearn.py +39 -38
- validmind/template.py +8 -26
- validmind/tests/__init__.py +43 -20
- validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
- validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
- validmind/tests/data_validation/Duplicates.py +1 -1
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
- validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
- validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
- validmind/tests/data_validation/nlp/Punctuations.py +11 -12
- validmind/tests/data_validation/nlp/Sentiment.py +57 -0
- validmind/tests/data_validation/nlp/Toxicity.py +45 -0
- validmind/tests/decorator.py +2 -2
- validmind/tests/model_validation/BertScore.py +100 -98
- validmind/tests/model_validation/BleuScore.py +93 -64
- validmind/tests/model_validation/ContextualRecall.py +74 -91
- validmind/tests/model_validation/MeteorScore.py +86 -74
- validmind/tests/model_validation/RegardScore.py +103 -121
- validmind/tests/model_validation/RougeScore.py +118 -0
- validmind/tests/model_validation/TokenDisparity.py +84 -121
- validmind/tests/model_validation/ToxicityScore.py +109 -123
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
- validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
- validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
- validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
- validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
- validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
- validmind/tests/model_validation/ragas/utils.py +66 -0
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
- validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +14 -12
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
- validmind/unit_metrics/__init__.py +26 -49
- validmind/unit_metrics/composite.py +5 -1
- validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
- validmind/utils.py +56 -6
- validmind/vm_models/__init__.py +1 -1
- validmind/vm_models/dataset/__init__.py +7 -0
- validmind/vm_models/dataset/dataset.py +558 -0
- validmind/vm_models/dataset/utils.py +146 -0
- validmind/vm_models/model.py +97 -72
- validmind/vm_models/test/result_wrapper.py +61 -24
- validmind/vm_models/test_context.py +1 -1
- validmind/vm_models/test_suite/summary.py +3 -4
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/METADATA +5 -3
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/RECORD +100 -75
- validmind/models/catboost.py +0 -33
- validmind/models/statsmodels.py +0 -50
- validmind/models/xgboost.py +0 -30
- validmind/tests/model_validation/BertScoreAggregate.py +0 -90
- validmind/tests/model_validation/RegardHistogram.py +0 -148
- validmind/tests/model_validation/RougeMetrics.py +0 -147
- validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
- validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
- validmind/tests/model_validation/ToxicityHistogram.py +0 -136
- validmind/vm_models/dataset.py +0 -1303
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/LICENSE +0 -0
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/WHEEL +0 -0
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/entry_points.txt +0 -0
validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py ADDED
@@ -0,0 +1,48 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+
+import pandas as pd
+import plotly.express as px
+from textblob import TextBlob
+
+from validmind import tags, tasks
+
+
+@tags("data_validation")
+@tasks("nlp")
+def PolarityAndSubjectivity(dataset):
+    """
+    Analyzes the polarity and subjectivity of text data within a dataset.
+
+    This method processes a dataset containing textual data to compute the polarity and
+    subjectivity scores using TextBlob, and returns a Plotly scatter plot visualizing
+    these scores.
+
+    Args:
+        dataset (Dataset): A dataset object which must have a `df` attribute (a pandas DataFrame)
+            and a `text_column` attribute indicating the name of the column containing text.
+
+    Returns:
+        plotly.graph_objs._figure.Figure: A Plotly scatter plot of polarity vs subjectivity.
+    """
+    # Function to calculate sentiment and subjectivity
+    def analyze_sentiment(text):
+        analysis = TextBlob(text)
+        return analysis.sentiment.polarity, analysis.sentiment.subjectivity
+
+    data = pd.DataFrame()
+    # Apply the function to each row
+    data[["polarity", "subjectivity"]] = dataset.df[dataset.text_column].apply(
+        lambda x: pd.Series(analyze_sentiment(x))
+    )
+
+    # Create a Plotly scatter plot
+    fig = px.scatter(
+        data, x="polarity", y="subjectivity", title="Polarity vs Subjectivity"
+    )
+    fig.update_traces(textposition="top center")
+    fig.update_layout(xaxis_title="Polarity", yaxis_title="Subjectivity")
+
+    return fig
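The functional-style NLP test above (like the other new `nlp` data-validation tests further down) only needs a dataset object that exposes a `df` DataFrame and a `text_column` name. A minimal usage sketch, using a hypothetical stand-in object rather than a fully initialized ValidMind dataset:

import pandas as pd


class TextDataset:
    # Hypothetical stand-in exposing the two attributes the test reads
    def __init__(self, df, text_column):
        self.df = df
        self.text_column = text_column


dataset = TextDataset(
    df=pd.DataFrame(
        {"review": ["Great product, very happy!", "Terrible support, never again."]}
    ),
    text_column="review",
)

fig = PolarityAndSubjectivity(dataset)  # Plotly scatter of polarity vs subjectivity
fig.show()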
validmind/tests/data_validation/nlp/Punctuations.py CHANGED
@@ -72,25 +72,24 @@ class Punctuations(Metric):
         text_column = self.inputs.dataset.text_column
         corpus = create_corpus(self.inputs.dataset.df, text_column=text_column)
 
-        dic = defaultdict(int)
         special = string.punctuation
+        dic = defaultdict(int, {key: 0 for key in special})
         for i in corpus:
             if i in special:
                 dic[i] += 1
-
+        figures = []
+        # if dic:
         fig = plt.figure()
         x, y = zip(*dic.items())
         plt.bar(x, y, color="#17C37B")
-
+        figures.append(
+            Figure(
+                for_object=self,
+                key=self.key,
+                figure=fig,
+            )
+        )
         # Do this if you want to prevent the figure from being displayed
         plt.close("all")
 
-        return self.cache_results(
-            figures=[
-                Figure(
-                    for_object=self,
-                    key=self.key,
-                    figure=fig,
-                )
-            ]
-        )
+        return self.cache_results(figures=figures)
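The substantive change above is that the punctuation counter is now pre-seeded with every character in `string.punctuation`, so marks that never appear in the corpus still get a (zero-height) bar, and the figure is collected into a `figures` list before being cached. A small sketch of the counting difference:

import string
from collections import defaultdict

special = string.punctuation

old_counts = defaultdict(int)                               # only seen marks become keys
new_counts = defaultdict(int, {key: 0 for key in special})  # every mark starts at zero

for char in "Hello, world!":
    if char in special:
        old_counts[char] += 1
        new_counts[char] += 1

print(len(old_counts), len(new_counts))  # 2 vs 32: the new chart has a stable x-axis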
validmind/tests/data_validation/nlp/Sentiment.py ADDED
@@ -0,0 +1,57 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+
+import matplotlib.pyplot as plt
+import nltk
+import seaborn as sns
+from nltk.sentiment import SentimentIntensityAnalyzer
+
+from validmind import tags, tasks
+
+
+@tags("data_validation")
+@tasks("nlp")
+def Sentiment(dataset):
+    """
+    Analyzes the sentiment of text data within a dataset using the VADER sentiment analysis tool.
+
+    This method initializes the VADER SentimentIntensityAnalyzer and applies it to each text entry
+    in the specified column of the dataset's dataframe. It returns a KDE plot visualizing the distribution
+    of sentiment scores across the dataset.
+
+    Args:
+        dataset (Dataset): A dataset object which must have a `df` attribute (a pandas DataFrame)
+            and a `text_column` attribute indicating the name of the column containing text.
+
+    Returns:
+        matplotlib.figure.Figure: A KDE plot visualizing the distribution of sentiment scores.
+    """
+    nltk.download("vader_lexicon", quiet=True)
+    # Initialize VADER
+    sia = SentimentIntensityAnalyzer()
+
+    # Function to get VADER sentiment scores
+    def get_vader_sentiment(text):
+        sentiment_score = sia.polarity_scores(text)
+        return sentiment_score["compound"]
+
+    # Apply the function to each row
+    vader_sentiment = dataset.df[dataset.text_column].apply(get_vader_sentiment)
+
+    fig = plt.figure()
+    ax = sns.kdeplot(
+        x=vader_sentiment,
+        fill=True,
+        common_norm=False,
+        palette="crest",
+        alpha=0.5,
+        linewidth=0,
+    )
+    ax.set_title(f"Sentiment score of {dataset.text_column} ")
+    ax.set_xlabel("Sentiment score")
+
+    plt.close("all")
+
+    return fig
validmind/tests/data_validation/nlp/Toxicity.py ADDED
@@ -0,0 +1,45 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import evaluate
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+from validmind import tags, tasks
+
+
+@tags("data_validation")
+@tasks("nlp")
+def Toxicity(dataset):
+    """
+    Analyzes the toxicity of text data within a dataset using a pre-trained toxicity model.
+
+    This method loads a toxicity evaluation model and applies it to each text entry
+    in the specified column of the dataset's dataframe. It returns a KDE plot visualizing the distribution
+    of toxicity scores across the dataset.
+
+    Args:
+        dataset (Dataset): A dataset object which must have a `df` attribute (a pandas DataFrame)
+            and a `text_column` attribute indicating the name of the column containing text.
+
+    Returns:
+        matplotlib.figure.Figure: A KDE plot visualizing the distribution of toxicity scores.
+    """
+    toxicity = evaluate.load("toxicity")
+    input_text = dataset.df[dataset.text_column]
+    toxicity_scores = toxicity.compute(predictions=list(input_text.values))["toxicity"]
+
+    fig = plt.figure()
+    ax = sns.kdeplot(
+        x=toxicity_scores,
+        fill=True,
+        common_norm=False,
+        palette="crest",
+        alpha=0.5,
+        linewidth=0,
+    )
+    ax.set_title(f"Toxicity score of {dataset.text_column} ")
+    ax.set_xlabel("Toxicity score")
+    plt.close("all")
+    return fig
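Like PolarityAndSubjectivity, the new Sentiment and Toxicity tests only read `df` and `text_column` from the dataset object, so the same kind of stand-in shown earlier works here too. Note that Sentiment fetches the VADER lexicon through NLTK on first use and Toxicity downloads a pre-trained model through the `evaluate` library, so both need network access the first time they run:

# `dataset` is the hypothetical TextDataset stand-in from the earlier sketch
sentiment_fig = Sentiment(dataset)  # matplotlib KDE of VADER compound scores
toxicity_fig = Toxicity(dataset)    # matplotlib KDE of toxicity scores

sentiment_fig.savefig("sentiment_kde.png")
toxicity_fig.savefig("toxicity_kde.png")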
validmind/tests/decorator.py CHANGED
@@ -153,7 +153,7 @@ def _get_run_method(func, inputs, params):
             test_id=self.test_id,
             description=inspect.getdoc(self),
             output_template=self.output_template,
-            inputs=
+            inputs=self.get_accessed_inputs(),
         )
 
         return self.result
@@ -264,7 +264,7 @@ def metric(func_or_id):
         {
             "run": _get_run_method(func, inputs, params),
             "required_inputs": list(inputs.keys()),
-            "
+            "default_params": {k: v["default"] for k, v in params.items()},
            "__doc__": description,
            "metadata": {
                "task_types": tasks,
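The two decorator changes above mean a function-style metric now records the inputs it actually accessed during the run (`get_accessed_inputs()`) and the keyword defaults harvested from its signature (`default_params`). A hypothetical sketch of a metric whose default would be captured this way; the import path is an assumption based on the file name shown above:

from validmind.tests.decorator import metric  # assumed import path


@metric("custom.MeanTextLength")
def MeanTextLength(dataset, min_words=1):
    # `min_words=1` would surface as {"min_words": 1} in default_params
    lengths = dataset.df[dataset.text_column].str.split().str.len()
    return float(lengths[lengths >= min_words].mean())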
validmind/tests/model_validation/BertScore.py CHANGED
@@ -2,116 +2,118 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-import itertools
-from dataclasses import dataclass
-
 import evaluate
 import pandas as pd
 import plotly.graph_objects as go
 
-from validmind
+from validmind import tags, tasks
 
 
-@
+@tags("nlp", "text_data", "visualization")
+@tasks("text_classification", "text_summarization")
+def BertScore(dataset, model):
     """
-    Evaluates
+    Evaluates the quality of machine-generated text using BERTScore metrics and visualizes the results through histograms
+    and bar charts, alongside compiling a comprehensive table of descriptive statistics for each BERTScore metric.
+
+    **Purpose:**
+    This function is designed to assess the quality of text generated by machine learning models using BERTScore metrics.
+    BERTScore evaluates text generation models' performance by calculating precision, recall, and F1 score based on BERT
     contextual embeddings.
 
-    **
-    **Limitations**:
-    - Dependence on BERT model embeddings for BERTScore implies that if the base BERT model is not suitable for a
-    specific task, it might impair the accuracy of BERTScore.
-    - Despite being good at understanding semantics, it might be incapable of capturing certain nuances in text
-    similarity that other metrics like BLEU or ROUGE could detect.
-    - Can be computationally expensive due to the utilization of BERT embeddings.
+    **Test Mechanism:**
+    The function starts by extracting the true and predicted values from the provided dataset and model. It then initializes
+    the BERTScore evaluator. For each pair of true and predicted texts, the function calculates the BERTScore metrics and
+    compiles them into a dataframe. Histograms and bar charts are generated for each BERTScore metric (Precision, Recall,
+    and F1 Score) to visualize their distribution. Additionally, a table of descriptive statistics (mean, median, standard
+    deviation, minimum, and maximum) is compiled for each metric, providing a comprehensive summary of the model's performance.
+
+    **Signs of High Risk:**
+    - Consistently low scores across BERTScore metrics could indicate poor quality in the generated text, suggesting that the model
+    fails to capture the essential content of the reference texts.
+    - Low precision scores might suggest that the generated text contains a lot of redundant or irrelevant information.
+    - Low recall scores may indicate that important information from the reference text is being omitted.
+    - An imbalanced performance between precision and recall, reflected by a low F1 Score, could signal issues in the model's ability
+    to balance informativeness and conciseness.
+
+    **Strengths:**
+    - Provides a multifaceted evaluation of text quality through different BERTScore metrics, offering a detailed view of model performance.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the scores.
+    - Descriptive statistics offer a concise summary of the model's strengths and weaknesses in generating text.
+
+    **Limitations:**
+    - BERTScore relies on the contextual embeddings from BERT models, which may not fully capture all nuances of text similarity.
+    - The evaluation relies on the availability of high-quality reference texts, which may not always be obtainable.
+    - While useful for comparison, BERTScore metrics alone do not provide a complete assessment of a model's performance and should be
+    supplemented with other metrics and qualitative analysis.
     """
 
+    # Extract true and predicted values
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    # Ensure y_true and y_pred have the same length
+    if len(y_true) != len(y_pred):
+        min_length = min(len(y_true), len(y_pred))
+        y_true = y_true[:min_length]
+        y_pred = y_pred[:min_length]
+
+    # Load the BERT evaluation metric
+    bert = evaluate.load("bertscore")
+
+    # Compute the BERT score
+    bert_s = bert.compute(
+        predictions=y_pred,
+        references=y_true,
+        lang="en",
+    )
+
+    # Convert scores to a dataframe
+    metrics_df = pd.DataFrame(bert_s)
+    figures = []
+
+    # Generate histograms and bar charts for each score type
+    score_types = ["precision", "recall", "f1"]
+    score_names = ["Precision", "Recall", "F1 Score"]
+
+    for score_type, score_name in zip(score_types, score_names):
+        # Histogram
+        hist_fig = go.Figure(data=[go.Histogram(x=metrics_df[score_type])])
+        hist_fig.update_layout(
+            title=f"{score_name} Histogram",
+            xaxis_title=score_name,
+            yaxis_title="Count",
         )
+        figures.append(hist_fig)
 
-    fig = go.Figure()
-
-    # Adding the line plots
-    fig.add_trace(
-        go.Scatter(
-            x=metrics_df.index,
-            y=metrics_df["precision"],
-            mode="lines+markers",
-            name="Precision",
-        )
-    )
-    fig.add_trace(
-        go.Scatter(
-            x=metrics_df.index,
-            y=metrics_df["recall"],
-            mode="lines+markers",
-            name="Recall",
-        )
-    )
-    fig.add_trace(
-        go.Scatter(
-            x=metrics_df.index,
-            y=metrics_df["f1"],
-            mode="lines+markers",
-            name="F1 Score",
-        )
-    )
-
-    fig.update_layout(
-        title="Bert Scores for Each Row",
+        # Bar Chart
+        bar_fig = go.Figure(data=[go.Bar(x=metrics_df.index, y=metrics_df[score_type])])
+        bar_fig.update_layout(
+            title=f"{score_name} Bar Chart",
            xaxis_title="Row Index",
-            yaxis_title=
-        )
-        figures.append(
-            Figure(
-                for_object=self,
-                key=self.key,
-                figure=fig,
-            )
+            yaxis_title=score_name,
         )
+        figures.append(bar_fig)
+
+    # Calculate statistics for each score type
+    stats_df = metrics_df.describe().loc[["mean", "50%", "max", "min", "std"]]
+    stats_df = stats_df.rename(
+        index={
+            "mean": "Mean Score",
+            "50%": "Median Score",
+            "max": "Max Score",
+            "min": "Min Score",
+            "std": "Standard Deviation",
+        }
+    ).T
+    stats_df["Count"] = len(metrics_df)
+
+    # Rename metrics for clarity
+    stats_df.index = stats_df.index.map(
+        {"precision": "Precision", "recall": "Recall", "f1": "F1 Score"}
+    )
+
+    # Create a DataFrame from all collected statistics
+    result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
+
+    return (result_df, *tuple(figures))
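BertScore (and BleuScore below) additionally needs reference texts from `dataset.y` and predictions from `dataset.y_pred(model)`. A minimal sketch with a hypothetical stand-in for that interface; in practice these attributes come from the library's dataset and model initialization:

class EvalDataset:
    # Hypothetical stand-in exposing `y` and `y_pred(model)` as the test expects
    def __init__(self, references, predictions):
        self.y = references
        self._predictions = predictions

    def y_pred(self, model):
        return self._predictions


references = ["The cat sat on the mat.", "It is raining today."]
predictions = ["A cat sat on the mat.", "It rains today."]

result_df, *figures = BertScore(EvalDataset(references, predictions), model=None)
print(result_df)  # mean/median/max/min/std and count for precision, recall, F1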
validmind/tests/model_validation/BleuScore.py CHANGED
@@ -2,77 +2,106 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import evaluate
+import pandas as pd
+import plotly.graph_objects as go
 
-from validmind
+from validmind import tags, tasks
 
 
-@
+@tags("nlp", "text_data", "visualization")
+@tasks("text_classification", "text_summarization")
+def BleuScore(dataset, model):
     """
+    Evaluates the quality of machine-generated text using BLEU metrics and visualizes the results through histograms
+    and bar charts, alongside compiling a comprehensive table of descriptive statistics for BLEU scores.
+
+    **Purpose:**
+    This function is designed to assess the quality of text generated by machine learning models using the BLEU metric.
+    BLEU, which stands for Bilingual Evaluation Understudy, is a metric used to evaluate the overlap of n-grams between
+    the machine-generated text and reference texts. This evaluation is crucial for tasks such as text summarization,
+    machine translation, and text generation, where the goal is to produce text that accurately reflects the content
+    and meaning of human-crafted references.
+
+    **Test Mechanism:**
+    The function starts by extracting the true and predicted values from the provided dataset and model. It then initializes
+    the BLEU evaluator. For each pair of true and predicted texts, the function calculates the BLEU scores and compiles them
+    into a dataframe. Histograms and bar charts are generated for the BLEU scores to visualize their distribution. Additionally,
+    a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is compiled for the BLEU scores,
+    providing a comprehensive summary of the model's performance.
+
+    **Signs of High Risk:**
+    - Consistently low BLEU scores could indicate poor quality in the generated text, suggesting that the model fails to capture
+    the essential content of the reference texts.
+    - Low precision scores might suggest that the generated text contains a lot of redundant or irrelevant information.
+    - Low recall scores may indicate that important information from the reference text is being omitted.
+    - An imbalanced performance between precision and recall, reflected by a low BLEU score, could signal issues in the model's
+    ability to balance informativeness and conciseness.
+
+    **Strengths:**
+    - Provides a straightforward and widely-used evaluation of text quality through BLEU scores.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of the scores.
+    - Descriptive statistics offer a concise summary of the model's strengths and weaknesses in generating text.
+
+    **Limitations:**
+    - BLEU metrics primarily focus on n-gram overlap and may not fully capture semantic coherence, fluency, or grammatical quality
+    of the text.
+    - The evaluation relies on the availability of high-quality reference texts, which may not always be obtainable.
+    - While useful for comparison, BLEU scores alone do not provide a complete assessment of a model's performance and should be
+    supplemented with other metrics and qualitative analysis.
     """
 
+    # Extract true and predicted values
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
 
-    bleu = evaluate.load("bleu")
+    # Load the BLEU evaluation metric
+    bleu = evaluate.load("bleu")
 
+    # Calculate BLEU scores
+    score_list = []
+    for y_t, y_p in zip(y_true, y_pred):
         # Compute the BLEU score
+        score = bleu.compute(predictions=[y_p], references=[[y_t]])
+        score_list.append(score["bleu"])
+
+    # Convert scores to a dataframe
+    metrics_df = pd.DataFrame(score_list, columns=["BLEU Score"])
+
+    figures = []
+
+    # Histogram for BLEU Score
+    hist_fig = go.Figure(data=[go.Histogram(x=metrics_df["BLEU Score"])])
+    hist_fig.update_layout(
+        title="BLEU Score Histogram",
+        xaxis_title="BLEU Score",
+        yaxis_title="Count",
+    )
+    figures.append(hist_fig)
+
+    # Bar Chart for BLEU Score
+    bar_fig = go.Figure(data=[go.Bar(x=metrics_df.index, y=metrics_df["BLEU Score"])])
+    bar_fig.update_layout(
+        title="BLEU Score Bar Chart",
+        xaxis_title="Row Index",
+        yaxis_title="BLEU Score",
+    )
+    figures.append(bar_fig)
+
+    # Calculate statistics for BLEU Score
+    stats_df = metrics_df.describe().loc[["mean", "50%", "max", "min", "std"]]
+    stats_df = stats_df.rename(
+        index={
+            "mean": "Mean Score",
+            "50%": "Median Score",
+            "max": "Max Score",
+            "min": "Min Score",
+            "std": "Standard Deviation",
+        }
+    ).T
+    stats_df["Count"] = len(metrics_df)
+
+    # Create a DataFrame from all collected statistics
+    result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
+
+    return (result_df, *tuple(figures))
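Each iteration of the scoring loop above computes a sentence-level BLEU score through the Hugging Face `evaluate` package, exactly as in the diff; a single call looks like this:

import evaluate

bleu = evaluate.load("bleu")

# One prediction scored against its list of references, as done per row above
score = bleu.compute(
    predictions=["the cat sat on the mat"],
    references=[["the cat is sitting on the mat"]],
)
print(score["bleu"])  # a value between 0 and 1; the test collects one per row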