validmind 2.1.1__py3-none-any.whl → 2.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai.py +72 -49
- validmind/api_client.py +42 -16
- validmind/client.py +68 -25
- validmind/datasets/llm/rag/__init__.py +11 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
- validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
- validmind/datasets/llm/rag/rfp.py +41 -0
- validmind/errors.py +1 -1
- validmind/html_templates/__init__.py +0 -0
- validmind/html_templates/content_blocks.py +89 -14
- validmind/models/__init__.py +7 -4
- validmind/models/foundation.py +8 -34
- validmind/models/function.py +51 -0
- validmind/models/huggingface.py +16 -46
- validmind/models/metadata.py +42 -0
- validmind/models/pipeline.py +66 -0
- validmind/models/pytorch.py +8 -42
- validmind/models/r_model.py +33 -82
- validmind/models/sklearn.py +39 -38
- validmind/template.py +8 -26
- validmind/tests/__init__.py +43 -20
- validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
- validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
- validmind/tests/data_validation/Duplicates.py +1 -1
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
- validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
- validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
- validmind/tests/data_validation/nlp/Punctuations.py +11 -12
- validmind/tests/data_validation/nlp/Sentiment.py +57 -0
- validmind/tests/data_validation/nlp/Toxicity.py +45 -0
- validmind/tests/decorator.py +12 -7
- validmind/tests/model_validation/BertScore.py +100 -98
- validmind/tests/model_validation/BleuScore.py +93 -64
- validmind/tests/model_validation/ContextualRecall.py +74 -91
- validmind/tests/model_validation/MeteorScore.py +86 -74
- validmind/tests/model_validation/RegardScore.py +103 -121
- validmind/tests/model_validation/RougeScore.py +118 -0
- validmind/tests/model_validation/TokenDisparity.py +84 -121
- validmind/tests/model_validation/ToxicityScore.py +109 -123
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
- validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
- validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
- validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
- validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
- validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
- validmind/tests/model_validation/ragas/utils.py +66 -0
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
- validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -11
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
- validmind/unit_metrics/__init__.py +26 -49
- validmind/unit_metrics/composite.py +13 -7
- validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
- validmind/utils.py +99 -6
- validmind/vm_models/__init__.py +1 -1
- validmind/vm_models/dataset/__init__.py +7 -0
- validmind/vm_models/dataset/dataset.py +560 -0
- validmind/vm_models/dataset/utils.py +146 -0
- validmind/vm_models/model.py +97 -72
- validmind/vm_models/test/metric.py +9 -24
- validmind/vm_models/test/result_wrapper.py +124 -28
- validmind/vm_models/test/threshold_test.py +10 -28
- validmind/vm_models/test_context.py +1 -1
- validmind/vm_models/test_suite/summary.py +3 -4
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/METADATA +5 -3
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/RECORD +103 -78
- validmind/models/catboost.py +0 -33
- validmind/models/statsmodels.py +0 -50
- validmind/models/xgboost.py +0 -30
- validmind/tests/model_validation/BertScoreAggregate.py +0 -90
- validmind/tests/model_validation/RegardHistogram.py +0 -148
- validmind/tests/model_validation/RougeMetrics.py +0 -147
- validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
- validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
- validmind/tests/model_validation/ToxicityHistogram.py +0 -136
- validmind/vm_models/dataset.py +0 -1303
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/LICENSE +0 -0
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/WHEEL +0 -0
- {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py (added, +92)
@@ -0,0 +1,92 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from itertools import combinations
+
+import numpy as np
+import pandas as pd
+import plotly.express as px
+from sklearn.metrics.pairwise import euclidean_distances
+
+
+def EuclideanDistanceComparison(dataset, models):
+    """
+    Computes pairwise Euclidean distances between model embeddings and visualizes the results through bar charts,
+    alongside compiling a comprehensive table of descriptive statistics for each model pair.
+
+    **Purpose:**
+    This function is designed to analyze and compare the embeddings produced by different models using Euclidean Distance.
+    Euclidean Distance measures the "ordinary" straight-line distance between two points in Euclidean space, providing a
+    straightforward metric to assess the absolute differences between vectors. This analysis helps in understanding the
+    magnitude of dissimilarity between the embeddings generated by different models, which is crucial for tasks that require
+    distinctive model responses or feature separations.
+
+    **Test Mechanism:**
+    The function begins by computing the embeddings for each model using the provided dataset. It then calculates the
+    Euclidean distance for every possible pair of models, generating a distance matrix. Each element of this matrix
+    represents the Euclidean distance between two model embeddings. The function flattens this matrix and uses it to
+    create a bar chart for each model pair, visualizing their distance distribution. Additionally, it compiles a table
+    with descriptive statistics (mean, median, standard deviation, minimum, and maximum) for the distances of each
+    pair, including a reference to the compared models.
+
+    **Signs of High Risk:**
+
+    - Very high distance values could suggest that the models are focusing on completely different features or aspects
+    of the data, which might be undesirable for ensemble methods or similar applications where some degree of
+    consensus is expected.
+    - Extremely low distances across different models might indicate redundancy, suggesting that the models are not
+    providing diverse enough perspectives on the data.
+
+    **Strengths:**
+
+    - Provides a clear and quantifiable measure of how different the embeddings from various models are.
+    - Useful for identifying outlier models or those that behave significantly differently from others in a group.
+
+    **Limitations:**
+
+    - Euclidean distance can be sensitive to the scale of the data, meaning that preprocessing steps like normalization
+    might be necessary to ensure meaningful comparisons.
+    - Does not consider the orientation or angle between vectors, focusing purely on magnitude differences.
+    """
+
+    figures = []
+    all_stats = []
+
+    # Generate all pairs of models for comparison
+    for model_A, model_B in combinations(models, 2):
+        embeddings_A = np.stack(dataset.y_pred(model_A))
+        embeddings_B = np.stack(dataset.y_pred(model_B))
+
+        # Calculate pairwise Euclidean distances
+        distance_matrix = euclidean_distances(embeddings_A, embeddings_B)
+        distances = distance_matrix.flatten()
+
+        # Generate statistics and add model combination as a column
+        stats_data = {
+            "Combination": f"{model_A.input_id} vs {model_B.input_id}",
+            "Mean": np.mean(distances),
+            "Median": np.median(distances),
+            "Standard Deviation": np.std(distances),
+            "Minimum": np.min(distances),
+            "Maximum": np.max(distances),
+        }
+        all_stats.append(stats_data)
+
+        # Generate an index for each distance value
+        indices = range(len(distances))
+
+        # Create the bar chart using Plotly
+        fig = px.bar(
+            x=indices,
+            y=distances,
+            labels={"x": "Pair Index", "y": "Euclidean Distance"},
+            title=f"Euclidean Distance - {model_A.input_id} vs {model_B.input_id}",
+        )
+        fig.update_layout(xaxis_title="Pair Index", yaxis_title="Euclidean Distance")
+        figures.append(fig)
+
+    # Create a DataFrame from all collected statistics
+    stats_df = pd.DataFrame(all_stats)

+    return (stats_df, *tuple(figures))
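To see the core of this comparison outside the ValidMind harness, here is a minimal runnable sketch that substitutes two random embedding matrices for `dataset.y_pred(model_A)` / `dataset.y_pred(model_B)`; the shapes and variable names are illustrative, not part of the package.

```python
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

# Stand-ins for dataset.y_pred(model_A) / dataset.y_pred(model_B):
# 100 texts embedded into 384 dimensions by two hypothetical models.
rng = np.random.default_rng(0)
embeddings_a = rng.normal(size=(100, 384))
embeddings_b = rng.normal(size=(100, 384))

# Same computation as the test: a 100x100 cross-distance matrix, flattened.
distances = euclidean_distances(embeddings_a, embeddings_b).flatten()

print(f"Mean:   {distances.mean():.3f}")
print(f"Median: {np.median(distances):.3f}")
print(f"Std:    {distances.std():.3f}")
print(f"Range:  {distances.min():.3f} - {distances.max():.3f}")
```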
validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py (added, +69)
@@ -0,0 +1,69 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import numpy as np
+import plotly.express as px
+from sklearn.metrics.pairwise import euclidean_distances
+
+
+def EuclideanDistanceHeatmap(
+    dataset,
+    model,
+    title="Euclidean Distance Matrix",
+    color="Euclidean Distance",
+    xaxis_title="Index",
+    yaxis_title="Index",
+    color_scale="Blues",
+):
+    """
+    Generates an interactive heatmap to visualize the Euclidean distances among embeddings derived from a given model.
+
+    **Purpose:**
+    This function visualizes the Euclidean distances between embeddings generated by a model, offering insights into the
+    absolute differences between data points. Euclidean distance, a fundamental metric in data analysis, measures the
+    straight-line distance between two points in Euclidean space. It is particularly useful for understanding spatial
+    relationships and clustering tendencies in high-dimensional data.
+
+    **Test Mechanism:**
+    The function operates through a streamlined process: firstly, embeddings are extracted for each dataset entry using the specified model.
+    Subsequently, it computes the pairwise Euclidean distances among these embeddings. The results are then visualized in an interactive heatmap format,
+    where each cell's color intensity correlates with the distance magnitude between pairs of embeddings, providing a visual assessment of these distances.
+
+    **Signs of High Risk:**
+    - Uniform Distances: Uniformly low distances across the heatmap might suggest a lack of variability in the data or
+    model overfitting, where the model fails to distinguish between distinct data points effectively.
+    - High Variability: Conversely, excessive variability in distances could indicate inconsistent data representation,
+    potentially leading to unreliable model predictions.
+
+    **Strengths:**
+    - Provides a direct, intuitive visual representation of distances between embeddings, aiding in the detection of patterns or anomalies.
+    - Allows customization of visual aspects such as the heatmap's title, axis labels, and color scale, adapting to various analytical needs.
+
+    **Limitations:**
+    - The interpretation of distances can be sensitive to the scale of data; normalization might be necessary for meaningful analysis.
+    - Large datasets may lead to dense, cluttered heatmaps, making it difficult to discern individual distances, potentially requiring
+    techniques like data sampling or dimensionality reduction for clearer visualization.
+    """
+
+    embeddings = np.stack(dataset.y_pred(model))
+
+    # Calculate pairwise Euclidean distance
+    distance_matrix = euclidean_distances(embeddings)
+
+    # Create the heatmap using Plotly
+    fig = px.imshow(
+        distance_matrix,
+        labels=dict(x=xaxis_title, y=yaxis_title, color=color),
+        text_auto=True,
+        aspect="auto",
+        color_continuous_scale=color_scale,
+    )
+
+    fig.update_layout(
+        title=f"{title} - {model.input_id}",
+        xaxis_title=xaxis_title,
+        yaxis_title=yaxis_title,
+    )
+
+    return fig
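Like the other functional tests added in this release, the heatmap should be runnable through the library's `run_test` entry point. The sketch below is an assumption-laden example, not part of the diff: the input IDs are hypothetical placeholders for objects previously registered via `vm.init_dataset` / `vm.init_model`, and the exact `run_test` signature and `result.show()` call should be checked against your installed 2.x version.

```python
import validmind as vm

# Hypothetical input_ids assigned earlier via vm.init_dataset(...) / vm.init_model(...)
result = vm.tests.run_test(
    "validmind.model_validation.embeddings.EuclideanDistanceHeatmap",
    inputs={
        "dataset": "embeddings_dataset",
        "model": "embedding_model",
    },
    params={"color_scale": "Viridis"},  # optional override of the default "Blues"
)
result.show()
```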
validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py (added, +78)
@@ -0,0 +1,78 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import itertools
+
+import numpy as np
+import pandas as pd
+import plotly.express as px
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+
+
+def PCAComponentsPairwisePlots(dataset, model, n_components=3):
+    """
+    Generates scatter plots for pairwise combinations of principal component analysis (PCA) components of model embeddings.
+
+    **Purpose:**
+    This function visualizes the principal components of embeddings derived from a specified model. Principal Component Analysis (PCA)
+    is a statistical technique that emphasizes variation and uncovers strong patterns in a dataset.
+    It transforms the original variables into new, uncorrelated variables (principal components) that maximize variance.
+
+    **Test Mechanism:**
+    The function follows a sequential process to visualize PCA components effectively.
+    It starts by extracting embeddings from the dataset, utilizing the model specified by the user.
+    These embeddings are then standardized to ensure zero mean and unit variance, which is crucial to prevent
+    any single feature from dominating due to scale—this standardization is a critical preprocessing step for PCA.
+    Following this, the function calculates the specified number of principal components.
+    The core of the visualization process involves creating scatter plots for each pairwise combination of these principal components.
+
+    **Signs of High Risk:**
+    - If the principal components do not account for a significant portion of the variance, it may suggest that PCA is not capturing the essential structures of the data.
+    - Similarity in scatter plots across different pairs of components could indicate redundancy in the components, suggesting that fewer dimensions might be sufficient to represent the data.
+
+    **Strengths:**
+    - Enables a simplified visualization of multivariate data, helping to identify patterns across many variables effectively.
+    - Provides a clear depiction of the directions of maximum variance in the data, which is valuable for feature selection and dimensionality reduction.
+
+    **Limitations:**
+    - PCA's effectiveness hinges on the scaling of the variables; improper standardization can lead to misleading interpretations.
+    - The interpretation of principal components can be challenging, especially if they capture less significant variances or are difficult to relate back to the original features.
+    """
+
+    # Get embeddings from the dataset using the model
+    embeddings = np.stack(dataset.y_pred(model))
+
+    # Standardize the embeddings
+    scaler = StandardScaler()
+    embeddings_scaled = scaler.fit_transform(embeddings)
+
+    # Perform PCA
+    pca = PCA(n_components=n_components)
+    pca_results = pca.fit_transform(embeddings_scaled)
+
+    # Prepare DataFrame for Plotly
+    pca_df = pd.DataFrame(
+        pca_results, columns=[f"PC{i+1}" for i in range(n_components)]
+    )
+
+    # List to store each plot
+    plots = []
+
+    # Create plots for each pair of principal components
+    for pc1, pc2 in itertools.combinations(range(1, n_components + 1), 2):
+        fig = px.scatter(
+            pca_df,
+            x=f"PC{pc1}",
+            y=f"PC{pc2}",
+            title=f"{getattr(model, 'input_id', 'Unknown Model')} (PC{pc1} vs PC{pc2})",
+            labels={
+                f"PC{pc1}": f"Principal Component {pc1}",
+                f"PC{pc2}": f"Principal Component {pc2}",
+            },
+        )
+        plots.append(fig)
+
+    # Return the list of plots as a tuple
+    return tuple(plots)
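The "Signs of High Risk" note about components failing to capture variance can be checked directly with scikit-learn's `explained_variance_ratio_`. A small self-contained sketch, with random data standing in for real embeddings:

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(200, 64))  # stand-in for np.stack(dataset.y_pred(model))

pca = PCA(n_components=3)
pca.fit(StandardScaler().fit_transform(embeddings))

# Fraction of total variance captured by each of the 3 plotted components;
# a low total means the pairwise scatter plots show only a sliver of the structure.
print(pca.explained_variance_ratio_, pca.explained_variance_ratio_.sum())
```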
validmind/tests/model_validation/embeddings/StabilityAnalysis.py (+35 -23)
@@ -6,8 +6,10 @@ from abc import abstractmethod
 from typing import List
 
 import numpy as np
+import plotly.express as px
 from sklearn.metrics.pairwise import cosine_similarity
 
+from validmind.logging import get_logger
 from validmind.vm_models import (
     Figure,
     ResultSummary,
@@ -17,13 +19,14 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )
 
+logger = get_logger(__name__)
+
 
 class StabilityAnalysis(ThresholdTest):
     """Base class for embeddings stability analysis tests"""
 
     required_inputs = ["model", "dataset"]
     default_params = {
-        "text_column": None,
         "mean_similarity_threshold": 0.7,
     }
     metadata = {
@@ -61,25 +64,22 @@ class StabilityAnalysis(ThresholdTest):
 
     def run(self):
         # Perturb the test dataset
-
-
-
-
-
-        )
+        original = self.inputs.dataset.df
+        perturbed = original.copy()
+        perturbed.update(
+            perturbed.select_dtypes(include="object").applymap(self.perturb_data)
+        )
 
-
-
-        perturbed_data_df = perturbed_data_df.apply(self.perturb_data)
+        logger.debug(f"Original data: {original}")
+        logger.debug(f"Perturbed data: {perturbed}")
 
         # Compute embeddings for the original and perturbed dataset
-        original_embeddings = self.inputs.
-        perturbed_embeddings = self.inputs.model.predict(
+        original_embeddings = self.inputs.dataset.y_pred(self.inputs.model)
+        perturbed_embeddings = np.stack(self.inputs.model.predict(perturbed))
 
         # Compute cosine similarities between original and perturbed embeddings
         similarities = cosine_similarity(
-            original_embeddings,
-            perturbed_embeddings,
+            original_embeddings, perturbed_embeddings
         ).diagonal()
 
         mean = np.mean(similarities)
@@ -91,15 +91,26 @@ class StabilityAnalysis(ThresholdTest):
         # Determine if the test passed based on the mean similarity and threshold
         passed = mean > self.params["mean_similarity_threshold"]
 
-
-
-
-
-
-
-
-
-
+        figures = [
+            px.histogram(
+                x=similarities.flatten(),
+                nbins=100,
+                title="Cosine Similarity Distribution",
+                labels={"x": "Cosine Similarity"},
+            ),
+            px.density_contour(
+                x=similarities.flatten(),
+                nbinsx=100,
+                title="Cosine Similarity Density",
+                labels={"x": "Cosine Similarity"},
+                marginal_x="histogram",
+            ),
+            px.box(
+                x=similarities.flatten(),
+                labels={"x": "Cosine Similarity"},
+                title="Cosine Similarity Box Plot",
+            ),
+        ]
 
         # For this example, we are not caching the results as done in the reference `run` method
         return self.cache_results(
@@ -121,6 +132,7 @@ class StabilityAnalysis(ThresholdTest):
                     key=self.name,
                     figure=fig,
                 )
+                for fig in figures
             ],
             passed=passed,
         )
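The essence of the rewritten `run` method, with a trivial stand-in embedder so the sketch stays self-contained: perturb each text, embed both versions, and read the row-wise cosine similarity off the `diagonal()`, which pairs each original with its own perturbation (off-diagonal entries compare unrelated rows and are irrelevant to the test).

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def embed(texts):
    # Toy stand-in for model.predict: character-frequency vectors.
    return np.stack(
        [np.bincount([ord(c) % 64 for c in t], minlength=64) for t in texts]
    )

originals = ["the cat sat on the mat", "embeddings should be stable"]
perturbed = ["the cat sat on teh mat", "embedings should be stable"]  # small typos

# diagonal() pairs original[i] with perturbed[i].
similarities = cosine_similarity(embed(originals), embed(perturbed)).diagonal()
mean = similarities.mean()
print(similarities, "passed:", mean > 0.7)  # 0.7 = default mean_similarity_threshold
```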
validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py (+7 -1)
@@ -114,9 +114,15 @@ class StabilityAnalysisRandomNoise(StabilityAnalysis):
     name = "Text Embeddings Stability Analysis to Random Noise"
     default_params = {
         **StabilityAnalysis.default_params,
+        "probability": 0.02,
     }
 
-    def perturb_data(self, data
+    def perturb_data(self, data):
+        if not isinstance(data, str):
+            return data
+
+        probability = self.params["probability"]
+
         # Tokenize the string based on spaces
         words = data.split()
 
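The hunk shows only the new guard and the `probability` parameter; the perturbation body beyond tokenizing on spaces is outside the diff. Below is a plausible illustration of probability-driven word-level noise, not the package's exact implementation:

```python
import random

def perturb_data(data, probability=0.02):
    # Guard added in this release: pass non-string values through untouched.
    if not isinstance(data, str):
        return data

    # Tokenize on spaces, then swap two adjacent characters in roughly
    # `probability` of the words (illustrative noise, not the library's scheme).
    words = data.split()
    for i, word in enumerate(words):
        if len(word) > 1 and random.random() < probability:
            j = random.randrange(len(word) - 1)
            words[i] = word[:j] + word[j + 1] + word[j] + word[j + 2:]
    return " ".join(words)

random.seed(0)
print(perturb_data("text embeddings should be robust to small random noise " * 3))
```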
validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py (added, +99)
@@ -0,0 +1,99 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import itertools
+
+import numpy as np
+import pandas as pd
+import plotly.express as px
+from sklearn.manifold import TSNE
+from sklearn.preprocessing import StandardScaler
+
+
+def TSNEComponentsPairwisePlots(
+    dataset,
+    model,
+    n_components=2,
+    perplexity=30,
+    title="t-SNE",
+):
+    """
+    Plots individual scatter plots for pairwise combinations of t-SNE components of embeddings.
+
+    **Purpose:**
+    This function creates scatter plots for each pairwise combination of t-SNE components derived from model embeddings.
+    t-SNE (t-Distributed Stochastic Neighbor Embedding) is a machine learning algorithm for dimensionality reduction that
+    is particularly well-suited for the visualization of high-dimensional datasets.
+
+    **Test Mechanism:**
+    The function begins by extracting embeddings from the provided dataset using the specified model.
+    These embeddings are then standardized to ensure that each dimension contributes equally to the distance computation.
+    Following this, the t-SNE algorithm is applied to reduce the dimensionality of the data, with the number of components
+    specified by the user. The results are plotted using Plotly, creating scatter plots for each unique pair of components
+    if more than one component is specified.
+
+    **Signs of High Risk:**
+    - If the scatter plots show overlapping clusters or indistinct groupings, it might suggest that the
+    t-SNE parameters (such as perplexity) are not optimally set for the given data, or the data itself does not exhibit clear, separable clusters.
+    - Similar plots across different pairs of components could indicate redundancy in the components generated by t-SNE,
+    suggesting that fewer dimensions might be sufficient to represent the data's structure.
+
+    **Strengths:**
+    - Provides a visual exploration tool for high-dimensional data, simplifying the detection of patterns and clusters which are not apparent in higher dimensions.
+    - Interactive plots generated by Plotly enhance user engagement and allow for a deeper dive into specific areas of the plot, aiding in detailed data analysis.
+
+    **Limitations:**
+    - The effectiveness of t-SNE is highly dependent on the choice of parameters like perplexity and the number of components,
+    which might require tuning and experimentation for optimal results.
+    - t-SNE visualizations can be misleading if interpreted without considering the stochastic nature of the algorithm;
+    two runs with the same parameters might yield different visual outputs, necessitating multiple runs for a consistent interpretation.
+    """
+
+    # Get embeddings from the dataset using the model
+    embeddings = np.stack(dataset.y_pred(model))
+
+    # Standardize the embeddings
+    scaler = StandardScaler()
+    embeddings_scaled = scaler.fit_transform(embeddings)
+
+    # Perform t-SNE
+    tsne = TSNE(n_components=n_components, perplexity=perplexity)
+    tsne_results = tsne.fit_transform(embeddings_scaled)
+
+    # Prepare DataFrame for Plotly
+    tsne_df = pd.DataFrame(
+        tsne_results, columns=[f"Component {i+1}" for i in range(n_components)]
+    )
+
+    # List to store each plot
+    plots = []
+
+    # Create plots for each pair of t-SNE components (if n_components > 1)
+    if n_components > 1:
+        for comp1, comp2 in itertools.combinations(range(1, n_components + 1), 2):
+            fig = px.scatter(
+                tsne_df,
+                x=f"Component {comp1}",
+                y=f"Component {comp2}",
+                title=f"{title} - {getattr(model, 'input_id', 'Unknown Model')}",
+                labels={
+                    f"Component {comp1}": f"Component {comp1}",
+                    f"Component {comp2}": f"Component {comp2}",
+                },
+            )
+            plots.append(fig)
+    else:
+        fig = px.scatter(
+            tsne_df,
+            x="Component 1",
+            y="Component 1",
+            title=f"{title} - {getattr(model, 'input_id', 'Unknown Model')}",
+            labels={
+                "Component 1": "Component 1",
+            },
+        )
+        plots.append(fig)
+
+    # Return the list of plots as a tuple
+    return tuple(plots)
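Since the docstring warns that two t-SNE runs can disagree, pinning `random_state` (a standard scikit-learn `TSNE` parameter that this test's signature does not expose) makes a single run reproducible when experimenting outside the harness. A minimal sketch with random data standing in for real embeddings:

```python
import numpy as np
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(100, 32))  # stand-in for np.stack(dataset.y_pred(model))

# Pinning random_state makes the layout repeatable; perplexity must stay
# below the number of samples.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
coords = tsne.fit_transform(StandardScaler().fit_transform(embeddings))
print(coords.shape)  # (100, 2)
```

Note that the single-component fallback in the new test plots "Component 1" on both axes, producing a degenerate scatter along y = x.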
validmind/tests/model_validation/ragas/AnswerCorrectness.py (added, +131)
@@ -0,0 +1,131 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import warnings
+
+import plotly.express as px
+from datasets import Dataset
+from ragas import evaluate
+from ragas.metrics import answer_correctness
+
+from validmind import tags, tasks
+
+from .utils import get_renamed_columns
+
+
+@tags("ragas", "llm")
+@tasks("text_qa", "text_generation", "text_summarization")
+def AnswerCorrectness(
+    dataset,
+    question_column="question",
+    answer_column="answer",
+    ground_truth_column="ground_truth",
+):
+    """
+    Evaluates the correctness of answers in a dataset with respect to the provided ground
+    truths and visualizes the results in a histogram.
+
+    The assessment of Answer Correctness involves gauging the accuracy of the generated
+    answer when compared to the ground truth. This evaluation relies on the `ground truth`
+    and the `answer`, with scores ranging from 0 to 1. A higher score indicates a closer
+    alignment between the generated answer and the ground truth, signifying better
+    correctness.
+
+    Answer correctness encompasses two critical aspects: semantic similarity between the
+    generated answer and the ground truth, as well as factual similarity. These aspects
+    are combined using a weighted scheme to formulate the answer correctness score. Users
+    also have the option to employ a `threshold` value to round the resulting score to
+    a binary value (0 or 1) based on the threshold.
+
+    Factual correctness quantifies the factual overlap between the generated answer and
+    the ground truth answer. This is done using the concepts of:
+
+    - TP (True Positive): Facts or statements that are present in both the ground truth
+    and the generated answer.
+    - FP (False Positive): Facts or statements that are present in the generated answer
+    but not in the ground truth.
+    - FN (False Negative): Facts or statements that are present in the ground truth but
+    not in the generated answer.
+
+    ### Configuring Columns
+
+    This metric requires specific columns to be present in the dataset:
+    - `question` (str): The text prompt or query that was input into the model.
+    - `answer` (str): The text response generated by the model.
+    - `ground_truth` (str): The ground truth answer that the generated answer is compared
+    against.
+
+    If the above data is not in the appropriate column, you can specify different column
+    names for these fields using the parameters `question_column`, `answer_column`, and
+    `ground_truth_column`.
+
+    For example, if your dataset has this data stored in different columns, you can
+    pass the following parameters:
+    ```python
+    params = {
+        "question_column": "input_text",
+        "answer_column": "output_text",
+        "ground_truth_column": "human_answer",
+    }
+    ```
+
+    If answer and contexts are stored as a dictionary in another column, specify the
+    column and key like this:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "answer_column": f"{pred_col}.generated_answer",
+        "ground_truth_column": f"{pred_col}.contexts",
+    }
+    ```
+
+    For more complex data structures, you can use a function to extract the answers:
+    ```python
+    pred_col = dataset.prediction_column(model)
+    params = {
+        "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+        "ground_truth_column": lambda row: [row[pred_col]["context_message"]],
+    }
+    ```
+    """
+    warnings.filterwarnings(
+        "ignore",
+        category=FutureWarning,
+        message="promote has been superseded by promote_options='default'.",
+    )
+
+    required_columns = {
+        "question": question_column,
+        "answer": answer_column,
+        "ground_truth": ground_truth_column,
+    }
+
+    df = get_renamed_columns(dataset.df, required_columns)
+
+    result_df = evaluate(
+        Dataset.from_pandas(df), metrics=[answer_correctness]
+    ).to_pandas()
+
+    fig_histogram = px.histogram(x=result_df["answer_correctness"].to_list(), nbins=10)
+    fig_box = px.box(x=result_df["answer_correctness"].to_list())
+
+    return (
+        {
+            "Scores": result_df[
+                ["question", "answer", "ground_truth", "answer_correctness"]
+            ],
+            "Aggregate Scores": [
+                {
+                    "Mean Score": result_df["answer_correctness"].mean(),
+                    "Median Score": result_df["answer_correctness"].median(),
+                    "Max Score": result_df["answer_correctness"].max(),
+                    "Min Score": result_df["answer_correctness"].min(),
+                    "Standard Deviation": result_df["answer_correctness"].std(),
+                    "Count": len(result_df),
+                }
+            ],
+        },
+        fig_histogram,
+        fig_box,
+    )
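A minimal illustration of the simple-rename case that `get_renamed_columns` handles before the frame is passed to ragas (the dotted-key and callable forms shown in the docstring are resolved by the helper itself and are not reproduced here):

```python
import pandas as pd

df = pd.DataFrame({
    "input_text": ["What is PCA?"],
    "output_text": ["A linear dimensionality-reduction technique."],
    "human_answer": ["Principal component analysis, a linear projection method."],
})

# Equivalent of passing question_column="input_text", etc.: ragas expects the
# canonical names "question", "answer", and "ground_truth".
required_columns = {
    "question": "input_text",
    "answer": "output_text",
    "ground_truth": "human_answer",
}
renamed = df.rename(columns={v: k for k, v in required_columns.items()})
print(renamed.columns.tolist())  # ['question', 'answer', 'ground_truth']
```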