validmind 2.1.1__py3-none-any.whl → 2.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- validmind/__version__.py +1 -1
- validmind/ai.py +72 -49
- validmind/api_client.py +42 -16
- validmind/client.py +68 -25
- validmind/datasets/llm/rag/__init__.py +11 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
- validmind/datasets/llm/rag/rfp.py +41 -0
- validmind/errors.py +1 -1
- validmind/html_templates/__init__.py +0 -0
- validmind/html_templates/content_blocks.py +89 -14
- validmind/models/__init__.py +7 -4
- validmind/models/foundation.py +8 -34
- validmind/models/function.py +51 -0
- validmind/models/huggingface.py +16 -46
- validmind/models/metadata.py +42 -0
- validmind/models/pipeline.py +66 -0
- validmind/models/pytorch.py +8 -42
- validmind/models/r_model.py +33 -82
- validmind/models/sklearn.py +39 -38
- validmind/template.py +8 -26
- validmind/tests/__init__.py +43 -20
- validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
- validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
- validmind/tests/data_validation/Duplicates.py +1 -1
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
- validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
- validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
- validmind/tests/data_validation/nlp/Punctuations.py +11 -12
- validmind/tests/data_validation/nlp/Sentiment.py +57 -0
- validmind/tests/data_validation/nlp/Toxicity.py +45 -0
- validmind/tests/decorator.py +12 -7
- validmind/tests/model_validation/BertScore.py +100 -98
- validmind/tests/model_validation/BleuScore.py +93 -64
- validmind/tests/model_validation/ContextualRecall.py +74 -91
- validmind/tests/model_validation/MeteorScore.py +86 -74
- validmind/tests/model_validation/RegardScore.py +103 -121
- validmind/tests/model_validation/RougeScore.py +118 -0
- validmind/tests/model_validation/TokenDisparity.py +84 -121
- validmind/tests/model_validation/ToxicityScore.py +109 -123
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
- validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
- validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
- validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
- validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
- validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
- validmind/tests/model_validation/ragas/utils.py +66 -0
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
- validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -11
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
- validmind/unit_metrics/__init__.py +26 -49
- validmind/unit_metrics/composite.py +13 -7
- validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
- validmind/utils.py +99 -6
- validmind/vm_models/__init__.py +1 -1
- validmind/vm_models/dataset/__init__.py +7 -0
- validmind/vm_models/dataset/dataset.py +560 -0
- validmind/vm_models/dataset/utils.py +146 -0
- validmind/vm_models/model.py +97 -72
- validmind/vm_models/test/metric.py +9 -24
- validmind/vm_models/test/result_wrapper.py +124 -28
- validmind/vm_models/test/threshold_test.py +10 -28
- validmind/vm_models/test_context.py +1 -1
- validmind/vm_models/test_suite/summary.py +3 -4
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/METADATA +5 -3
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/RECORD +103 -78
- validmind/models/catboost.py +0 -33
- validmind/models/statsmodels.py +0 -50
- validmind/models/xgboost.py +0 -30
- validmind/tests/model_validation/BertScoreAggregate.py +0 -90
- validmind/tests/model_validation/RegardHistogram.py +0 -148
- validmind/tests/model_validation/RougeMetrics.py +0 -147
- validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
- validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
- validmind/tests/model_validation/ToxicityHistogram.py +0 -136
- validmind/vm_models/dataset.py +0 -1303
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/LICENSE +0 -0
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/WHEEL +0 -0
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/TokenDisparity.py

@@ -2,139 +2,102 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-import itertools
-from dataclasses import dataclass
-
 import pandas as pd
 import plotly.graph_objects as go
-from plotly.subplots import make_subplots
-from transformers import BertTokenizer

-from validmind
+from validmind import tags, tasks


+@tags("nlp", "text_data", "visualization")
+@tasks("text_classification", "text_summarization")
+def TokenDisparity(dataset, model):
     """
-    of the model's output consistency and verbosity.
-    - It is able to detect potential issues with the model's output generation capability, such as over-production or
-    under-production of tokens compared to the actual data set.
-
-    **Limitations**:
-    Limitations of the Token Disparity metric include:
-
-    - The metric focuses solely on token count, disregarding the semantics behind those tokens. Consequently, it may
-    miss out on issues related to relevance or meaningfulness of produced tokens.
-    - The assumption that similar token count between predicted and actual data suggests accurate output, which is not
-    always the case.
-    - Dependence on the BERT tokenizer, which may not always be the optimum choice for all types of text data.
+    Evaluates the token disparity between reference and generated texts, visualizing the results through histograms
+    and bar charts, alongside compiling a comprehensive table of descriptive statistics for token counts.
+
+    **Purpose:**
+    This function is designed to assess the token disparity between reference and generated texts. Token disparity is
+    important for understanding how closely the length and token usage of generated texts match the reference texts.
+
+    **Test Mechanism:**
+    The function starts by extracting the true and predicted values from the provided dataset and model. It then calculates
+    the number of tokens in each reference and generated text. Histograms and bar charts are generated for the token counts
+    of both reference and generated texts to visualize their distribution. Additionally, a table of descriptive statistics
+    (mean, median, standard deviation, minimum, and maximum) is compiled for the token counts, providing a comprehensive
+    summary of the model's performance.
+
+    **Signs of High Risk:**
+    - Significant disparity in token counts between reference and generated texts could indicate issues with text generation
+    quality, such as verbosity or lack of detail.
+    - Consistently low token counts in generated texts compared to references might suggest that the model is producing
+    incomplete or overly concise outputs.
+
+    **Strengths:**
+    - Provides a simple yet effective evaluation of text length and token usage.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of token counts.
+    - Descriptive statistics offer a concise summary of the model's performance in generating texts of appropriate length.
+
+    **Limitations:**
+    - Token counts alone do not provide a complete assessment of text quality and should be supplemented with other metrics and qualitative analysis.
     """

-    def run(self):
-        y_true = list(itertools.chain.from_iterable(self.inputs.dataset.y))
-        y_pred = self.inputs.dataset.y_pred(self.inputs.model)
-
-        df = pd.DataFrame({"reference_column": y_true, "generated_column": y_pred})
-
-        fig = self.token_disparity_histograms(df)
-        figures = []
-        figures.append(
-            Figure(
-                for_object=self,
-                key=self.key,
-                figure=fig,
-            )
-        )
-        return self.cache_results(figures=figures)
-
-    def token_disparity_histograms(self, df):
-        """
-        Visualize the token counts distribution of two given columns using histograms.
-
-        :param df: DataFrame containing the text columns.
-        :param params: Dictionary with the keys ["reference_column", "generated_column"].
-        """
+    # Extract true and predicted values
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)

+    # Calculate token counts
+    token_counts_true = [len(text.split()) for text in y_true]
+    token_counts_pred = [len(text.split()) for text in y_pred]

+    # Create a dataframe for reference and generated token counts
+    df = pd.DataFrame(
+        {"reference_tokens": token_counts_true, "generated_tokens": token_counts_pred}
+    )

-        df["tokens_1"] = df[reference_column].apply(
-            lambda x: len(tokenizer.tokenize(x))
-        )
-        df["tokens_2"] = df[generated_column].apply(
-            lambda x: len(tokenizer.tokenize(x))
-        )
+    figures = []

-            cols=2,
-            subplot_titles=(
-                f"Tokens in {reference_column}",
-                f"Tokens in {generated_column}",
-            ),
-        )
+    # Generate histograms and bar charts for reference and generated token counts
+    token_types = ["reference_tokens", "generated_tokens"]
+    token_names = ["Reference Tokens", "Generated Tokens"]

-            row=1,
-            col=1,
+    for token_type, token_name in zip(token_types, token_names):
+        # Histogram
+        hist_fig = go.Figure(data=[go.Histogram(x=df[token_type])])
+        hist_fig.update_layout(
+            title=f"{token_name} Histogram",
+            xaxis_title=token_name,
+            yaxis_title="Count",
         )
-            col=2,
+        figures.append(hist_fig)
+
+        # Bar Chart
+        bar_fig = go.Figure(data=[go.Bar(x=df.index, y=df[token_type])])
+        bar_fig.update_layout(
+            title=f"{token_name} Bar Chart",
+            xaxis_title="Row Index",
+            yaxis_title=token_name,
         )
+        figures.append(bar_fig)
+
+    # Calculate statistics for each token count type
+    stats_df = df.describe().loc[["mean", "50%", "max", "min", "std"]]
+    stats_df = stats_df.rename(
+        index={
+            "mean": "Mean Count",
+            "50%": "Median Count",
+            "max": "Max Count",
+            "min": "Min Count",
+            "std": "Standard Deviation",
+        }
+    ).T
+    stats_df["Count"] = len(df)
+
+    # Rename columns for clarity
+    stats_df.index = stats_df.index.map(
+        {"reference_tokens": "Reference Tokens", "generated_tokens": "Generated Tokens"}
+    )
+
+    # Create a DataFrame from all collected statistics
+    result_df = pd.DataFrame(stats_df).reset_index().rename(columns={"index": "Metric"})
+
+    return (result_df, *tuple(figures))
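The refactor above replaces the old class-based `Metric` (with its `run` method and `Figure`/`cache_results` plumbing) with a plain decorated function that takes `dataset` and `model` inputs and returns a table plus Plotly figures. A minimal sketch of how such a function-style test might be invoked through validmind's test runner follows; the dataset, model, and column names are hypothetical placeholders, and the `init_dataset`/`assign_predictions` wiring is an assumption rather than something shown in this diff.

    # Sketch only: df, my_model, and the column names are hypothetical placeholders,
    # and the init/assign_predictions wiring is assumed, not taken from this diff.
    import validmind as vm
    from validmind.tests import run_test

    vm_dataset = vm.init_dataset(
        dataset=df,                      # pandas DataFrame holding the reference texts
        target_column="reference_text",
        text_column="input_text",
    )
    vm_model = vm.init_model(my_model, input_id="summarizer")

    # Attach generated texts so that dataset.y_pred(model) resolves inside the test
    vm_dataset.assign_predictions(model=vm_model)

    run_test(
        "validmind.model_validation.TokenDisparity",
        inputs={"dataset": vm_dataset, "model": vm_model},
    )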
validmind/tests/model_validation/ToxicityScore.py

@@ -2,146 +2,132 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-import itertools
-from dataclasses import dataclass
-
 import evaluate
 import pandas as pd
 import plotly.graph_objects as go
-import plotly.subplots as sp

-from validmind
+from validmind import tags, tasks


+@tags("nlp", "text_data", "visualization")
+@tasks("text_classification", "text_summarization")
+def ToxicityScore(dataset, model):
     """
+    Computes and visualizes the toxicity score for input text, true text, and predicted text, assessing content quality and potential risk.
+
     **Purpose:**
-    The ToxicityScore metric is designed to
-    trends and patterns.
+    The ToxicityScore metric is designed to evaluate the toxicity levels of texts generated by models. This is crucial for
+    identifying and mitigating harmful or offensive content in machine-generated texts.

     **Test Mechanism:**
-    `toxicity` evaluation tool
+    The function starts by extracting the input, true, and predicted values from the provided dataset and model. The toxicity score is
+    computed for each text using a preloaded `toxicity` evaluation tool. The scores are compiled into dataframes, and histograms
+    and bar charts are generated to visualize the distribution of toxicity scores. Additionally, a table of descriptive statistics
+    (mean, median, standard deviation, minimum, and maximum) is compiled for the toxicity scores, providing a comprehensive
+    summary of the model's performance.

     **Signs of High Risk:**
-    Drastic spikes in
-    texts, it could be indicative of issues in the model's generated content.
+    - Drastic spikes in toxicity scores indicate potentially toxic content within the associated text segment.
+    - Persistent high toxicity scores across multiple texts may suggest systemic issues in the model's text generation process.

     **Strengths:**
+    - Provides a clear evaluation of toxicity levels in generated texts, helping to ensure content safety and appropriateness.
+    - Visual representations (histograms and bar charts) make it easier to interpret the distribution and trends of toxicity scores.
+    - Descriptive statistics offer a concise summary of the model's performance in generating non-toxic texts.

     **Limitations:**
+    - The accuracy of the toxicity scores is contingent upon the underlying `toxicity` tool.
+    - The scores provide a broad overview but do not specify which portions or tokens of the text are responsible for high toxicity.
+    - Supplementary, in-depth analysis might be needed for granular insights.
     """

-        toxicity = evaluate.load("toxicity")
-
-        # Get all columns of df
-        text_columns = df.columns.tolist()
-
-        # Determine the number of rows required based on the number of text columns
-        num_rows = (len(text_columns) + 1) // 2
-
-        # Create a subplot layout
-        fig = sp.make_subplots(rows=num_rows, cols=2, subplot_titles=text_columns)
-
-        subplot_height = 350
-        total_height = num_rows * subplot_height + 200
-
-        for idx, col in enumerate(text_columns, start=1):
-            row = (idx - 1) // 2 + 1
-            col_idx = (idx - 1) % 2 + 1
-
-            # Get list of texts from dataframe
-            texts = df[col].tolist()
-
-            # Compute toxicity for texts
-            toxicity_scores = toxicity.compute(predictions=texts)["toxicity"]
-
-            # Add traces to the corresponding subplot
-            fig.add_trace(
-                go.Scatter(
-                    y=toxicity_scores,
-                    mode="lines+markers",
-                    marker=dict(size=5),
-                    line=dict(width=1.5),
-                    showlegend=False,
-                ),
-                row=row,
-                col=col_idx,
-            )
-
-            # Update xaxes and yaxes titles only for the first subplot
-            if idx == 1:
-                fig.update_xaxes(title_text="Text Index", row=row, col=col_idx)
-                fig.update_yaxes(title_text="Toxicity Score", row=row, col=col_idx)
-
-        # Update layout
-        fig.update_layout(
-            title_text="Line Plots of Toxicity Scores", height=total_height
-        )
-
-        return fig
-
-    def run(self):
-        input_text, y_true, y_pred = self._get_datasets()
-
-        df = pd.DataFrame(
-            {
-                "Input Text": input_text,
-                "Target Text": y_true,
-                "Predicted Summaries": y_pred,
-            }
+    # Extract true, predicted, and input values
+    y_true = dataset.y
+    y_pred = dataset.y_pred(model)
+    input_text = dataset.df[dataset.text_column]
+
+    # Load the toxicity evaluation metric
+    toxicity = evaluate.load("toxicity")
+
+    # Function to calculate toxicity scores
+    def compute_toxicity_scores(texts):
+        scores = []
+        for text in texts:
+            score = toxicity.compute(predictions=[text])
+            scores.append(score["toxicity"])
+        return scores
+
+    # Calculate toxicity scores for input, true, and predicted texts
+    input_toxicity = compute_toxicity_scores(input_text)
+    true_toxicity = compute_toxicity_scores(y_true)
+    pred_toxicity = compute_toxicity_scores(y_pred)
+
+    # Convert scores to dataframes
+    input_df = pd.DataFrame(input_toxicity, columns=["Input Text Toxicity"])
+    true_df = pd.DataFrame(true_toxicity, columns=["True Text Toxicity"])
+    pred_df = pd.DataFrame(pred_toxicity, columns=["Predicted Text Toxicity"])
+
+    figures = []
+
+    # Function to create histogram and bar chart for toxicity scores
+    def create_figures(df, title):
+        # Histogram
+        hist_fig = go.Figure(data=[go.Histogram(x=df.iloc[:, 0])])
+        hist_fig.update_layout(
+            title=f"{title} Histogram",
+            xaxis_title=title,
+            yaxis_title="Count",
         )
+        figures.append(hist_fig)
+
+        # Bar Chart
+        bar_fig = go.Figure(data=[go.Bar(x=df.index, y=df.iloc[:, 0])])
+        bar_fig.update_layout(
+            title=f"{title} Bar Chart",
+            xaxis_title="Text Instance Index",
+            yaxis_title=title,
         )
+        figures.append(bar_fig)
+
+    # Create figures for each toxicity score dataframe
+    create_figures(input_df, "Input Text Toxicity")
+    create_figures(true_df, "True Text Toxicity")
+    create_figures(pred_df, "Predicted Text Toxicity")
+
+    # Calculate statistics for each toxicity score dataframe
+    def calculate_stats(df):
+        stats = df.describe().loc[["mean", "50%", "max", "min", "std"]].T
+        stats.columns = [
+            "Mean Score",
+            "Median Score",
+            "Max Score",
+            "Min Score",
+            "Standard Deviation",
+        ]
+        stats["Metric"] = df.columns[0]
+        stats["Count"] = len(df)
+        return stats
+
+    input_stats = calculate_stats(input_df)
+    true_stats = calculate_stats(true_df)
+    pred_stats = calculate_stats(pred_df)
+
+    # Combine statistics into a single dataframe
+    result_df = (
+        pd.concat([input_stats, true_stats, pred_stats])
+        .reset_index()
+        .rename(columns={"index": "Statistic"})
+    )
+    result_df = result_df[
+        [
+            "Metric",
+            "Mean Score",
+            "Median Score",
+            "Max Score",
+            "Min Score",
+            "Standard Deviation",
+            "Count",
+        ]
+    ]
+
+    return (result_df, *tuple(figures))
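The new ToxicityScore implementation scores every input, reference, and generated text individually with the Hugging Face `evaluate` toxicity measurement, then summarizes the three score distributions. The scoring step itself can be reproduced outside the validmind wrapper in a few lines; the example texts below are placeholders, and the first call downloads the underlying classifier.

    # Standalone sketch of the scoring step used above; the texts are placeholder examples.
    import evaluate

    toxicity = evaluate.load("toxicity")

    texts = [
        "The generated summary covers the main points of the article.",
        "This is another generated text to score.",
    ]
    scores = toxicity.compute(predictions=texts)["toxicity"]  # one score per text
    print(scores)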
validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py

@@ -0,0 +1,96 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from itertools import combinations
+
+import numpy as np
+import pandas as pd
+import plotly.express as px
+from sklearn.metrics.pairwise import cosine_similarity
+
+
+def CosineSimilarityComparison(dataset, models):
+    """
+    Computes pairwise cosine similarities between model embeddings and visualizes the results through bar charts,
+    alongside compiling a comprehensive table of descriptive statistics for each model pair.
+
+    **Purpose:**
+    This function is designed to analyze and compare the embeddings produced by different models using Cosine Similarity.
+    Cosine Similarity, a measure calculating the cosine of the angle between two vectors, is widely used to determine
+    the alignment or similarity between vectors in high-dimensional spaces, such as text embeddings. This analysis helps
+    to understand how similar or different the models' predictions are in terms of embedding generation.
+
+    **Test Mechanism:**
+    The function begins by computing the embeddings for each model using the provided dataset. It then calculates the
+    cosine similarity for every possible pair of models, generating a similarity matrix. Each element of this matrix
+    represents the cosine similarity between two model embeddings. The function flattens this matrix and uses it to
+    create a bar chart for each model pair, visualizing their similarity distribution. Additionally, it compiles a table
+    with descriptive statistics (mean, median, standard deviation, minimum, and maximum) for the similarities of each
+    pair, including a reference to the compared models.
+
+    **Signs of High Risk:**
+
+    - A high concentration of cosine similarity values close to 1 could suggest that the models are producing very
+    similar embeddings, which could be a sign of redundancy or lack of diversity in model training or design.
+    - Conversely, very low similarity values near -1 indicate strong dissimilarity, potentially highlighting models
+    that are too divergent, possibly focusing on very different features of the data.
+
+    **Strengths:**
+
+    - Enables detailed comparisons between multiple models' embedding strategies through visual and statistical means.
+    - Helps identify which models produce similar or dissimilar embeddings, useful for tasks requiring model diversity.
+    - Provides quantitative and visual feedback on the degree of similarity, enhancing interpretability of model
+    behavior in embedding spaces.
+
+    **Limitations:**
+
+    - The analysis is confined to the comparison of embeddings and does not assess the overall performance of the models
+    in terms of their primary tasks (e.g., classification, regression).
+    - Assumes that the models are suitable for generating comparable embeddings, which might not always be the case,
+    especially across different types of models.
+    - Interpretation of results is heavily dependent on the understanding of Cosine Similarity and the nature of high-dimensional
+    embedding spaces.
+    """
+
+    figures = []
+    # Initialize a list to store data for the DataFrame
+    all_stats = []
+
+    # Generate all pairs of models for comparison
+    for model_A, model_B in combinations(models, 2):
+        embeddings_A = np.stack(dataset.y_pred(model_A))
+        embeddings_B = np.stack(dataset.y_pred(model_B))
+
+        # Calculate pairwise cosine similarity
+        similarity_matrix = cosine_similarity(embeddings_A, embeddings_B)
+        similarities = similarity_matrix.flatten()
+
+        # Generate statistics and add model combination as a column
+        stats_data = {
+            "Combination": f"{model_A.input_id} vs {model_B.input_id}",
+            "Mean": np.mean(similarities),
+            "Median": np.median(similarities),
+            "Standard Deviation": np.std(similarities),
+            "Minimum": np.min(similarities),
+            "Maximum": np.max(similarities),
+        }
+        all_stats.append(stats_data)
+
+        # Generate an index for each similarity value
+        indices = range(len(similarities))
+
+        # Create the bar chart using Plotly
+        fig = px.bar(
+            x=indices,
+            y=similarities,
+            labels={"x": "Pair Index", "y": "Cosine Similarity"},
+            title=f"Cosine Similarity - {model_A.input_id} vs {model_B.input_id}",
+        )
+        fig.update_layout(xaxis_title="Pair Index", yaxis_title="Cosine Similarity")
+        figures.append(fig)
+
+    # Create a DataFrame from all collected statistics
+    stats_df = pd.DataFrame(all_stats)
+
+    return (stats_df, *tuple(figures))
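At its core, the comparison above takes each pair of models, computes the full cosine-similarity matrix between their embedding outputs, and summarizes the flattened values. That pairwise pattern can be illustrated standalone; the random vectors below merely stand in for real model embeddings.

    # Standalone sketch of the pairwise comparison; random vectors stand in for model embeddings.
    from itertools import combinations

    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    rng = np.random.default_rng(0)
    embeddings = {
        "model_a": rng.normal(size=(10, 384)),
        "model_b": rng.normal(size=(10, 384)),
        "model_c": rng.normal(size=(10, 384)),
    }

    for name_a, name_b in combinations(embeddings, 2):
        sims = cosine_similarity(embeddings[name_a], embeddings[name_b]).flatten()
        print(f"{name_a} vs {name_b}: mean={sims.mean():.3f}, std={sims.std():.3f}")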
validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py

@@ -0,0 +1,71 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import numpy as np
+import plotly.express as px
+from sklearn.metrics.pairwise import cosine_similarity
+
+
+def CosineSimilarityHeatmap(
+    dataset,
+    model,
+    title="Cosine Similarity Matrix",
+    color="Cosine Similarity",
+    xaxis_title="Index",
+    yaxis_title="Index",
+    color_scale="Blues",
+):
+    """
+    Generates an interactive heatmap to visualize the cosine similarities among embeddings derived from a given model.
+
+    **Purpose:**
+    This function is designed to visually analyze the cosine similarities of embeddings from a specific model.
+    Cosine similarity, a measure of the cosine of the angle between two vectors, aids in understanding the
+    orientation and similarity of vectors in multi-dimensional space. This is particularly valuable for exploring
+    text embeddings and their relative similarities among documents, words, or phrases.
+
+    **Test Mechanism:**
+    The function operates through a sequence of steps to visualize cosine similarities. Initially,
+    embeddings are extracted for each dataset entry using the designated model. Following this,
+    the function computes the pairwise cosine similarities among these embeddings. The computed similarities
+    are then displayed in an interactive heatmap.
+
+    **Signs of High Risk:**
+    - High similarity values (close to 1) across the heatmap might not always be indicative of a risk;
+    however, in contexts where diverse perspectives or features are desired, this could suggest a lack of
+    diversity in the model's learning process or potential redundancy.
+    - Similarly, low similarity values (close to -1) indicate strong dissimilarity, which could be beneficial in
+    scenarios demanding diverse outputs. However, in cases where consistency is needed, these low values might
+    highlight that the model is unable to capture a coherent set of features from the data, potentially leading to poor performance on related tasks.
+
+    **Strengths:**
+    - Provides an interactive and intuitive visual representation of embedding similarities, facilitating easy exploration and analysis.
+    - Allows customization of visual elements such as title, axis labels, and color scale to suit specific analytical needs and preferences.
+
+    **Limitations:**
+    - As the number of embeddings increases, the effectiveness of the heatmap might diminish due to overcrowding, making it hard to discern detailed similarities.
+    - The interpretation of the heatmap heavily relies on the appropriate setting of the color scale, as incorrect settings can lead to misleading visual interpretations.
+    """
+
+    embeddings = np.stack(dataset.y_pred(model))
+
+    # Calculate pairwise cosine similarity
+    similarity_matrix = cosine_similarity(embeddings)
+
+    # Create the heatmap using Plotly
+    fig = px.imshow(
+        similarity_matrix,
+        labels=dict(x=xaxis_title, y=yaxis_title, color=color),
+        text_auto=True,
+        aspect="auto",
+        color_continuous_scale=color_scale,
+    )
+
+    fig.update_layout(
+        title=f"{title} - {model.input_id}",
+        xaxis_title=xaxis_title,
+        yaxis_title=yaxis_title,
+    )
+
+    return fig
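Because the heatmap test exposes its title, axis labels, and color scale as keyword arguments, those can be overridden at run time. A hedged sketch follows, assuming the test is registered under an ID matching its file path and that the validmind test runner forwards `params` to these keyword arguments; `vm_dataset` and `vm_embedding_model` are placeholders for inputs initialized elsewhere (as in the earlier example).

    # Sketch only: the test ID is inferred from the file path, and the inputs are
    # assumed to be a validmind dataset/model pair prepared beforehand.
    from validmind.tests import run_test

    run_test(
        "validmind.model_validation.embeddings.CosineSimilarityHeatmap",
        inputs={"dataset": vm_dataset, "model": vm_embedding_model},
        params={"title": "Embedding Similarity", "color_scale": "Viridis"},
    )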