validmind 2.1.0__py3-none-any.whl → 2.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai.py +3 -3
- validmind/api_client.py +2 -3
- validmind/client.py +68 -25
- validmind/datasets/llm/rag/__init__.py +11 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
- validmind/datasets/llm/rag/rfp.py +41 -0
- validmind/html_templates/__init__.py +0 -0
- validmind/html_templates/content_blocks.py +89 -14
- validmind/models/__init__.py +7 -4
- validmind/models/foundation.py +8 -34
- validmind/models/function.py +51 -0
- validmind/models/huggingface.py +16 -46
- validmind/models/metadata.py +42 -0
- validmind/models/pipeline.py +66 -0
- validmind/models/pytorch.py +8 -42
- validmind/models/r_model.py +33 -82
- validmind/models/sklearn.py +39 -38
- validmind/template.py +8 -26
- validmind/tests/__init__.py +43 -20
- validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
- validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
- validmind/tests/data_validation/Duplicates.py +1 -1
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
- validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
- validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
- validmind/tests/data_validation/nlp/Punctuations.py +11 -12
- validmind/tests/data_validation/nlp/Sentiment.py +57 -0
- validmind/tests/data_validation/nlp/Toxicity.py +45 -0
- validmind/tests/decorator.py +2 -2
- validmind/tests/model_validation/BertScore.py +100 -98
- validmind/tests/model_validation/BleuScore.py +93 -64
- validmind/tests/model_validation/ContextualRecall.py +74 -91
- validmind/tests/model_validation/MeteorScore.py +86 -74
- validmind/tests/model_validation/RegardScore.py +103 -121
- validmind/tests/model_validation/RougeScore.py +118 -0
- validmind/tests/model_validation/TokenDisparity.py +84 -121
- validmind/tests/model_validation/ToxicityScore.py +109 -123
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
- validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
- validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
- validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
- validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
- validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
- validmind/tests/model_validation/ragas/utils.py +66 -0
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
- validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +14 -12
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
- validmind/unit_metrics/__init__.py +26 -49
- validmind/unit_metrics/composite.py +5 -1
- validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
- validmind/utils.py +56 -6
- validmind/vm_models/__init__.py +1 -1
- validmind/vm_models/dataset/__init__.py +7 -0
- validmind/vm_models/dataset/dataset.py +558 -0
- validmind/vm_models/dataset/utils.py +146 -0
- validmind/vm_models/model.py +97 -72
- validmind/vm_models/test/result_wrapper.py +61 -24
- validmind/vm_models/test_context.py +1 -1
- validmind/vm_models/test_suite/summary.py +3 -4
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/METADATA +5 -3
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/RECORD +100 -75
- validmind/models/catboost.py +0 -33
- validmind/models/statsmodels.py +0 -50
- validmind/models/xgboost.py +0 -30
- validmind/tests/model_validation/BertScoreAggregate.py +0 -90
- validmind/tests/model_validation/RegardHistogram.py +0 -148
- validmind/tests/model_validation/RougeMetrics.py +0 -147
- validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
- validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
- validmind/tests/model_validation/ToxicityHistogram.py +0 -136
- validmind/vm_models/dataset.py +0 -1303
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/LICENSE +0 -0
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/WHEEL +0 -0
- {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/ContextualRecall.py

@@ -2,109 +2,92 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-import itertools
-from dataclasses import dataclass
-
import nltk
import pandas as pd
import plotly.graph_objects as go

-from validmind …
+from validmind import tags, tasks


-@…
[… 1 removed line not captured in the source …]
+@tags("nlp", "text_data", "visualization")
+@tasks("text_classification", "text_summarization")
+def ContextualRecall(dataset, model):
    """
-    Evaluates a Natural Language Generation model's ability to generate contextually relevant and factually correct
-    text.
-
-    **Purpose**:
-    The Contextual Recall metric is used to evaluate the ability of a natural language generation (NLG) model to
-    generate text that appropriately reflects the given context or prompt. It measures the model's capability to
-    remember and reproduce the main context in its resulting output. This metric is critical in natural language
-    processing tasks, as the coherency and contextuality of the generated text are essential.
-
-    **Test Mechanism**:
-
-    1. **Preparation of Reference and Candidate Texts**:
-    - **Reference Texts**: Gather the reference text(s) which exemplify the expected or ideal output for a specific
-    context or prompt.
-    - **Candidate Texts**: Generate candidate text(s) from the NLG model under evaluation using the same context.
-    2. **Tokenization and Preprocessing**:
-    - Tokenize the reference and candidate texts into discernible words or tokens using libraries such as NLTK.
-    3. **Computation of Contextual Recall**:
-    - Identify the token overlap between the reference and candidate texts.
-    - The Contextual Recall score is computed by dividing the number of overlapping tokens by the total number of
-    tokens in the reference text. Scores are calculated for each test dataset instance, resulting in an array of
-    scores. These scores are then visualized using a line plot to show score variations across different rows.
-
-    **Signs of High Risk**:
-
-    - Low contextual recall scores could indicate that the model is not effectively reflecting the original context in
-    its output, leading to incoherent or contextually misaligned text.
-    - A consistent trend of low recall scores could suggest underperformance of the model.
+    Evaluates a Natural Language Generation model's ability to generate contextually relevant and factually correct text, visualizing the results through histograms and bar charts, alongside compiling a comprehensive table of descriptive statistics for contextual recall scores.

-    **…
+    **Purpose:**
+    The Contextual Recall metric is used to evaluate the ability of a natural language generation (NLG) model to generate text that appropriately reflects the given context or prompt. It measures the model's capability to remember and reproduce the main context in its resulting output. This metric is critical in natural language processing tasks, as the coherency and contextuality of the generated text are essential.

[… 5 removed lines not captured in the source …]
+    **Test Mechanism:**
+    The function starts by extracting the true and predicted values from the provided dataset and model. It then tokenizes the reference and candidate texts into discernible words or tokens using NLTK. The token overlap between the reference and candidate texts is identified, and the Contextual Recall score is computed by dividing the number of overlapping tokens by the total number of tokens in the reference text. Scores are calculated for each test dataset instance, resulting in an array of scores. These scores are visualized using a histogram and a bar chart to show score variations across different rows. Additionally, a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is compiled for the contextual recall scores, providing a comprehensive summary of the model's performance.
+
+    **Signs of High Risk:**
+    - Low contextual recall scores could indicate that the model is not effectively reflecting the original context in its output, leading to incoherent or contextually misaligned text.
+    - A consistent trend of low recall scores could suggest underperformance of the model.

-    **…
+    **Strengths:**
+    - Provides a quantifiable measure of a model's adherence to the context and factual elements of the generated narrative.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of contextual recall scores.
+    - Descriptive statistics offer a concise summary of the model's performance in generating contextually relevant texts.

[… 2 removed lines not captured in the source …]
-    texts lack coherence or meaningful context.
+    **Limitations:**
+    - The focus on word overlap could result in high scores for texts that use many common words, even when these texts lack coherence or meaningful context.
    - This metric does not consider the order of words, which could lead to overestimated scores for scrambled outputs.
    - Models that effectively use infrequent words might be undervalued, as these words might not overlap as often.
    """

[… 47 removed lines of the previous implementation not captured in the source …]
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    score_list = []
+    for y_t, y_p in zip(y_true, y_pred):
+        # Tokenize the reference and candidate texts
+        reference_tokens = nltk.word_tokenize(y_t.lower())
+        candidate_tokens = nltk.word_tokenize(y_p.lower())
+
+        # Calculate overlapping tokens
+        overlapping_tokens = set(reference_tokens) & set(candidate_tokens)
+
+        # Compute contextual recall
+        score_list.append(len(overlapping_tokens) / len(reference_tokens))
+
+    metrics_df = pd.DataFrame(score_list, columns=["Contextual Recall"])
+    figures = []
+
+    # Histogram for Contextual Recall
+    hist_fig = go.Figure(data=[go.Histogram(x=metrics_df["Contextual Recall"])])
+    hist_fig.update_layout(
+        title="Contextual Recall Histogram",
+        xaxis_title="Contextual Recall",
+        yaxis_title="Count",
+    )
+    figures.append(hist_fig)
+
+    # Bar Chart for Contextual Recall
+    bar_fig = go.Figure(
+        data=[go.Bar(x=metrics_df.index, y=metrics_df["Contextual Recall"])]
+    )
+    bar_fig.update_layout(
+        title="Contextual Recall Bar Chart",
+        xaxis_title="Row Index",
+        yaxis_title="Contextual Recall",
+    )
+    figures.append(bar_fig)
+
+    # Calculate statistics for Contextual Recall
+    stats_df = metrics_df.describe().loc[["mean", "50%", "max", "min", "std"]]
+    stats_df = stats_df.rename(
+        index={
+            "mean": "Mean Score",
+            "50%": "Median Score",
+            "max": "Max Score",
+            "min": "Min Score",
+            "std": "Standard Deviation",
+        }
+    ).T
+    stats_df["Count"] = len(metrics_df)
+
+    # Create a DataFrame from all collected statistics
+    result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
+
+    return (result_df, *tuple(figures))
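The new `ContextualRecall` implementation reduces each row's score to the ratio of unique overlapping tokens to total reference tokens. A minimal standalone sketch of that calculation on toy strings (outside the ValidMind test harness; assumes NLTK and its `punkt` tokenizer data are available):

```python
import nltk

nltk.download("punkt", quiet=True)  # tokenizer data used by nltk.word_tokenize

reference = "The quick brown fox jumps over the lazy dog"
candidate = "A quick brown fox jumped over a sleeping dog"

# Same steps as the test: lowercase, tokenize, intersect, divide by reference length
reference_tokens = nltk.word_tokenize(reference.lower())
candidate_tokens = nltk.word_tokenize(candidate.lower())
overlapping_tokens = set(reference_tokens) & set(candidate_tokens)

score = len(overlapping_tokens) / len(reference_tokens)
print(f"Contextual Recall: {score:.2f}")
```

As the docstring's Limitations note, the set intersection ignores token order and duplicates, so scrambled or repetitive candidates are not penalized.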
validmind/tests/model_validation/MeteorScore.py

@@ -2,91 +2,103 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
-
import evaluate
import pandas as pd
import plotly.graph_objects as go

-from validmind …
+from validmind import tags, tasks


-@…
[… 1 removed line not captured in the source …]
+@tags("nlp", "text_data", "visualization")
+@tasks("text_classification", "text_summarization")
+def MeteorScore(dataset, model):
    """
    Computes and visualizes the METEOR score for each text generation instance, assessing translation quality.

-    **Purpose…
[… 13 removed lines not captured in the source …]
-    **…
[… 2 removed lines not captured in the source …]
+    **Purpose:**
+    METEOR (Metric for Evaluation of Translation with Explicit ORdering) is designed to evaluate the quality of machine translations
+    by comparing them against reference translations. It emphasizes both the accuracy and fluency of translations, incorporating
+    precision, recall, and word order into its assessment.
+
+    **Test Mechanism:**
+    The function starts by extracting the true and predicted values from the provided dataset and model. The METEOR score is computed
+    for each pair of machine-generated translation (prediction) and its corresponding human-produced reference. This is done by
+    considering unigram matches between the translations, including matches based on surface forms, stemmed forms, and synonyms.
+    The score is a combination of unigram precision and recall, adjusted for word order through a fragmentation penalty. Scores are
+    compiled into a dataframe, and histograms and bar charts are generated to visualize the distribution of METEOR scores. Additionally,
+    a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is compiled for the METEOR scores,
+    providing a comprehensive summary of the model's performance.
+
+    **Signs of High Risk:**
+    - Lower METEOR scores can indicate a lack of alignment between the machine-generated translations and their human-produced references,
+    highlighting potential deficiencies in both the accuracy and fluency of translations.
+    - Significant discrepancies in word order or an excessive fragmentation penalty could signal issues with how the translation model processes
+    and reconstructs sentence structures, potentially compromising the natural flow of translated text.
+    - Persistent underperformance across a variety of text types or linguistic contexts might suggest a broader inability of the model to adapt to the
+    nuances of different languages or dialects, pointing towards gaps in its training or inherent limitations.
+
+    **Strengths:**
+    - Incorporates a balanced consideration of precision and recall, weighted towards recall to reflect the importance of content coverage in translations.
    - Directly accounts for word order, offering a nuanced evaluation of translation fluency beyond simple lexical matching.
    - Adapts to various forms of lexical similarity, including synonyms and stemmed forms, allowing for flexible matching.

-    **Limitations…
-    - While comprehensive, the complexity of METEOR's calculation can make it computationally intensive, especially for
[… 2 removed lines not captured in the source …]
-    quality and relevance to the specific translation task.
+    **Limitations:**
+    - While comprehensive, the complexity of METEOR's calculation can make it computationally intensive, especially for large datasets.
+    - The use of external resources for synonym and stemming matching may introduce variability based on the resources' quality and relevance to the specific
+    translation task.
    """

[… 47 removed lines of the previous implementation not captured in the source …]
+    # Extract true and predicted values
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    # Load the METEOR evaluation metric
+    meteor = evaluate.load("meteor")
+
+    # Calculate METEOR scores
+    score_list = []
+    for y_t, y_p in zip(y_true, y_pred):
+        # Compute the METEOR score
+        score = meteor.compute(predictions=[y_p], references=[y_t])
+        score_list.append(score["meteor"])
+
+    # Convert scores to a dataframe
+    metrics_df = pd.DataFrame(score_list, columns=["METEOR Score"])
+
+    figures = []
+
+    # Histogram for METEOR Score
+    hist_fig = go.Figure(data=[go.Histogram(x=metrics_df["METEOR Score"])])
+    hist_fig.update_layout(
+        title="METEOR Score Histogram",
+        xaxis_title="METEOR Score",
+        yaxis_title="Count",
+    )
+    figures.append(hist_fig)
+
+    # Bar Chart for METEOR Score
+    bar_fig = go.Figure(data=[go.Bar(x=metrics_df.index, y=metrics_df["METEOR Score"])])
+    bar_fig.update_layout(
+        title="METEOR Score Bar Chart",
+        xaxis_title="Row Index",
+        yaxis_title="METEOR Score",
+    )
+    figures.append(bar_fig)
+
+    # Calculate statistics for METEOR Score
+    stats_df = metrics_df.describe().loc[["mean", "50%", "max", "min", "std"]]
+    stats_df = stats_df.rename(
+        index={
+            "mean": "Mean Score",
+            "50%": "Median Score",
+            "max": "Max Score",
+            "min": "Min Score",
+            "std": "Standard Deviation",
+        }
+    ).T
+    stats_df["Count"] = len(metrics_df)
+
+    # Create a DataFrame from all collected statistics
+    result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
+
+    return (result_df, *tuple(figures))
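The METEOR computation above delegates to the Hugging Face `evaluate` package, scoring each prediction against a single reference. A minimal standalone sketch of that call on toy sentences (assumes the `evaluate` package and its NLTK dependency are installed):

```python
import evaluate

# Load the METEOR metric from the evaluate hub (downloads NLTK data on first use)
meteor = evaluate.load("meteor")

predictions = ["the cat sat on the mat"]
references = ["a cat was sitting on the mat"]

# compute() returns a dict with a single "meteor" key holding a score in [0, 1]
result = meteor.compute(predictions=predictions, references=references)
print(f"METEOR: {result['meteor']:.3f}")
```

Note that the test calls `compute()` once per row; passing the full lists in a single call would yield one aggregated score rather than the per-row values needed for the histogram and bar chart.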
validmind/tests/model_validation/RegardScore.py

@@ -2,142 +2,124 @@
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-import itertools
-from dataclasses import dataclass
-
import evaluate
+import pandas as pd
import plotly.graph_objects as go
-from plotly.subplots import make_subplots

-from validmind …
+from validmind import tags, tasks


-@…
[… 1 removed line not captured in the source …]
+@tags("nlp", "text_data", "visualization")
+@tasks("text_classification", "text_summarization")
+def RegardScore(dataset, model):
    """
+    Computes and visualizes the regard score for each text instance, assessing sentiment and potential biases.
+
    **Purpose:**
-    The `RegardScore` metric …
-    whether it's a classification or summarization result. Especially crucial for applications like sentiment analysis,
-    product reviews, or opinion mining, it provides a granular understanding of how the model perceives or generates content
-    in terms of favorability or sentiment.
+    The `RegardScore` metric is designed to evaluate the regard levels (positive, negative, neutral, or other) of texts generated by models. This helps in understanding the sentiment and biases in the generated content.

    **Test Mechanism:**
-    The …
-    and the model's predicted regard. These elements undergo a series of consistency checks before being processed. Using
-    the `evaluate.load("regard")` tool, regard scores are computed for each segment of text. The results are then visualized
-    in a multi-subplot line graph, where each subplot corresponds to a particular category of regard (e.g., positive, negative,
-    neutral, other) against the input, target, and predicted texts.
+    The function starts by extracting the true and predicted values from the provided dataset and model. The regard scores are computed for each text using a preloaded `regard` evaluation tool. The scores are compiled into dataframes, and histograms and bar charts are generated to visualize the distribution of regard scores. Additionally, a table of descriptive statistics (mean, median, standard deviation, minimum, and maximum) is compiled for the regard scores, providing a comprehensive summary of the model's performance.

    **Signs of High Risk:**
[… 1 removed line not captured in the source …]
-    the model …
-    indicate the model's inability to correctly identify or generate balanced sentiments.
+    - Noticeable skewness in the histogram, especially when comparing the predicted regard scores with the target regard scores, could indicate biases or inconsistencies in the model.
+    - Lack of neutral scores in the model's predictions, despite a balanced distribution in the target data, might signal an issue.

    **Strengths:**
[… 3 removed lines not captured in the source …]
+    - Provides a clear evaluation of regard levels in generated texts, helping to ensure content appropriateness.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of regard scores.
+    - Descriptive statistics offer a concise summary of the model's performance in generating texts with balanced sentiments.

    **Limitations:**
-    The …
[… 2 removed lines not captured in the source …]
-    real-world sentiments often exist on a more complex spectrum. The metric's efficacy is intertwined with the accuracy of
-    the model's predictions; any inherent model inaccuracies can impact the metric's reflection of true sentiments.
+    - The accuracy of the regard scores is contingent upon the underlying `regard` tool.
+    - The scores provide a broad overview but do not specify which portions or tokens of the text are responsible for high regard.
+    - Supplementary, in-depth analysis might be needed for granular insights.
    """

[… 17 removed lines not captured in the source …]
+    # Extract true and predicted values
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+
+    # Load the regard evaluation metric
+    regard_tool = evaluate.load("regard")
+
+    # Function to calculate regard scores
+    def compute_regard_scores(texts):
+        scores = regard_tool.compute(data=texts)["regard"]
+        regard_dicts = [
+            dict((x["label"], x["score"]) for x in sublist) for sublist in scores
+        ]
+        return regard_dicts
+
+    # Calculate regard scores for true and predicted texts
+    true_regard = compute_regard_scores(y_true)
+    pred_regard = compute_regard_scores(y_pred)
+
+    # Convert scores to dataframes
+    true_df = pd.DataFrame(true_regard)
+    pred_df = pd.DataFrame(pred_regard)
+
+    figures = []
+
+    # Function to create histogram and bar chart for regard scores
+    def create_figures(df, title):
+        for category in df.columns:
+            # Histogram
+            hist_fig = go.Figure(data=[go.Histogram(x=df[category])])
+            hist_fig.update_layout(
+                title=f"{title} - {category.capitalize()} Histogram",
+                xaxis_title=category.capitalize(),
+                yaxis_title="Count",
            )
[… 52 removed lines of the previous implementation not captured in the source …]
-                    hoverinfo="y+name",
-                    line=dict(color=category_colors[category], width=1.5),
-                    showlegend=False,
-                ),
-                row=row,
-                col=col,
-            )
-            row_offset += 2
-
-        subplot_height = 350
-        total_height = total_rows * subplot_height + 200
-
-        fig.update_layout(title_text="Regard Scores", height=total_height)
-        fig.update_yaxes(range=[0, 1])
-        fig.update_xaxes(showticklabels=False, row=1, col=1)
-        fig.update_xaxes(title_text="Index", showticklabels=True, row=1, col=1)
-        fig.update_yaxes(title_text="Score", showticklabels=True, row=1, col=1)
-
-        return fig
-
-    def run(self):
-        fig = self.regard_line_plot()
-        return self.cache_results(
-            figures=[Figure(for_object=self, key=self.key, figure=fig)]
-        )
+            figures.append(hist_fig)
+
+            # Bar Chart
+            bar_fig = go.Figure(data=[go.Bar(x=df.index, y=df[category])])
+            bar_fig.update_layout(
+                title=f"{title} - {category.capitalize()} Bar Chart",
+                xaxis_title="Text Instance Index",
+                yaxis_title=category.capitalize(),
+            )
+            figures.append(bar_fig)
+
+    # Create figures for each regard score dataframe
+    create_figures(true_df, "True Text Regard")
+    create_figures(pred_df, "Predicted Text Regard")
+
+    # Calculate statistics for each regard score dataframe
+    def calculate_stats(df, metric_name):
+        stats = df.describe().loc[["mean", "50%", "max", "min", "std"]].T
+        stats.columns = [
+            "Mean Score",
+            "Median Score",
+            "Max Score",
+            "Min Score",
+            "Standard Deviation",
+        ]
+        stats["Metric"] = metric_name
+        stats["Count"] = len(df)
+        return stats
+
+    true_stats = calculate_stats(true_df, "True Text Regard")
+    pred_stats = calculate_stats(pred_df, "Predicted Text Regard")
+
+    # Combine statistics into a single dataframe
+    result_df = (
+        pd.concat([true_stats, pred_stats])
+        .reset_index()
+        .rename(columns={"index": "Category"})
+    )
+    result_df = result_df[
+        [
+            "Metric",
+            "Category",
+            "Mean Score",
+            "Median Score",
+            "Max Score",
+            "Min Score",
+            "Standard Deviation",
+            "Count",
+        ]
+    ]
+
+    return (result_df, *tuple(figures))
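The `regard` measurement used above returns, for each input text, a list of label/score pairs that the test flattens into one dictionary per row before building its dataframes. A minimal standalone sketch of that reshaping on toy sentences (outside the ValidMind harness; the first call downloads the underlying regard classifier):

```python
import evaluate
import pandas as pd

# Load the regard measurement from the evaluate hub
regard_tool = evaluate.load("regard")

texts = [
    "The nurse was praised for her excellent care.",
    "The politician was widely criticized for the scandal.",
]

# Each text yields a list of {"label", "score"} entries (positive, negative,
# neutral, other); flatten them into one {label: score} dict per text
scores = regard_tool.compute(data=texts)["regard"]
rows = [{entry["label"]: entry["score"] for entry in sublist} for sublist in scores]

df = pd.DataFrame(rows)
print(df.round(3))  # one column per regard category, one row per text
```

The resulting dataframe has the same shape as `true_df` and `pred_df` in the test, one column per regard category, which is what the per-category histograms, bar charts, and descriptive statistics are built from.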