validmind-2.5.25-py3-none-any.whl → validmind-2.6.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.7.dist-info/METADATA +137 -0
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.25.dist-info/METADATA +0 -118
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py

@@ -3,11 +3,22 @@
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
 import re
+from typing import Dict
 
-from
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
 
+from .utils import create_stability_analysis_result
 
-
+
+@tags("llm", "text_data", "embeddings", "visualization")
+@tasks("feature_extraction")
+def StabilityAnalysisKeyword(
+    dataset: VMDataset,
+    model: VMModel,
+    keyword_dict: Dict[str, str],
+    mean_similarity_threshold: float = 0.7,
+):
     """
     Evaluates robustness of embedding models to keyword swaps in the test dataset.
 
@@ -49,13 +60,9 @@ class StabilityAnalysisKeyword(StabilityAnalysis):
     which might not always be the case.
     """
 
-
-    default_params = {
-        "keyword_dict": None,  # set to none by default... this must be overridden
-        **StabilityAnalysis.default_params,
-    }
+    keyword_dict = {k.lower(): v for k, v in keyword_dict.items()}
 
-    def perturb_data(
+    def perturb_data(data: str):
         if not isinstance(data, str):
             return data
 
@@ -63,22 +70,29 @@ class StabilityAnalysisKeyword(StabilityAnalysis):
         tokens = re.findall(r"[\w']+[.,!?;]?|[\w']+", data)
         modified_tokens = []
 
-        # lowercase all keys in the keword_dict
-        self.params["keyword_dict"] = {
-            k.lower(): v for k, v in self.params["keyword_dict"].items()
-        }
-
         for token in tokens:
             # Separate word and punctuation
             word_part = re.match(r"([\w']+)", token).group()
             punctuation_part = token[len(word_part) :]
 
             # Check if the token is a word and if it's in the dictionary
-            if token.lower() in
+            if token.lower() in keyword_dict:
                 modified_tokens.append(
-
+                    keyword_dict[word_part.lower()] + punctuation_part
                 )
             else:
                 modified_tokens.append(token)
 
         return " ".join(modified_tokens)
+
+    original_df = dataset.df[[dataset.text_column]]
+    perturbed_df = original_df.copy()
+    perturbed_df[dataset.text_column] = perturbed_df[dataset.text_column].map(
+        perturb_data
+    )
+
+    return create_stability_analysis_result(
+        dataset.y_pred(model),
+        model.predict(perturbed_df),
+        mean_similarity_threshold,
+    )
validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py

@@ -5,7 +5,10 @@
 import random
 import string
 
-from
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
+
+from .utils import create_stability_analysis_result
 
 
 def random_swap(word_list):
@@ -59,7 +62,14 @@ def random_insertion(word_list):
     return word_list[:index] + [random_word] + word_list[index:]
 
 
-
+@tags("llm", "text_data", "embeddings", "visualization")
+@tasks("feature_extraction")
+def StabilityAnalysisRandomNoise(
+    dataset: VMDataset,
+    model: VMModel,
+    probability: float = 0.02,
+    mean_similarity_threshold: float = 0.7,
+):
     """
     Assesses the robustness of text embeddings models to random noise introduced via text perturbations.
 
@@ -106,18 +116,10 @@ class StabilityAnalysisRandomNoise(StabilityAnalysis):
     - Does not guarantee model performance on new, unseen, real-world data beyond the generated noisy test data.
     """
 
-
-    default_params = {
-        **StabilityAnalysis.default_params,
-        "probability": 0.02,
-    }
-
-    def perturb_data(self, data):
+    def perturb_data(data):
         if not isinstance(data, str):
             return data
 
-        probability = self.params["probability"]
-
         # Tokenize the string based on spaces
         words = data.split()
 
@@ -136,3 +138,15 @@ class StabilityAnalysisRandomNoise(StabilityAnalysis):
             words = random_insertion(words)
 
         return " ".join(words)
+
+    original_df = dataset.df[[dataset.text_column]]
+    perturbed_df = original_df.copy()
+    perturbed_df[dataset.text_column] = perturbed_df[dataset.text_column].map(
+        perturb_data
+    )
+
+    return create_stability_analysis_result(
+        dataset.y_pred(model),
+        model.predict(perturbed_df),
+        mean_similarity_threshold,
+    )
validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py

@@ -7,10 +7,20 @@ import random
 import nltk
 from nltk.corpus import wordnet as wn
 
-from
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
 
+from .utils import create_stability_analysis_result
 
-
+
+@tags("llm", "text_data", "embeddings", "visualization")
+@tasks("feature_extraction")
+def StabilityAnalysisSynonyms(
+    dataset: VMDataset,
+    model: VMModel,
+    probability: float = 0.02,
+    mean_similarity_threshold: float = 0.7,
+):
     """
     Evaluates the stability of text embeddings models when words in test data are replaced by their synonyms randomly.
 
@@ -55,26 +65,19 @@ class StabilityAnalysisSynonyms(StabilityAnalysis):
     - Does not consider the semantic role of the words in the sentence, meaning the swapped synonym could potentially
     alter the overall meaning of the sentence, leading to a false perception of the model's stability.
     """
+    # download the nltk wordnet
+    nltk.download("wordnet", quiet=True)
 
-
-    default_params = {
-        "probability": 0.02,  # probability of swapping a word with a synonym
-        **StabilityAnalysis.default_params,
-    }
-
-    def perturb_data(self, data):
+    def perturb_data(data):
         if not isinstance(data, str):
             return data
 
-        # download the nltk wordnet
-        nltk.download("wordnet", quiet=True)
-
         words = nltk.word_tokenize(data)
         modified_words = []
 
         # For each word, check the probability and swap if needed
         for word in words:
-            if random.random() <=
+            if random.random() <= probability:
                 # get synonyms for the word
                 synonyms = [
                     lemma.name() for syn in wn.synsets(word) for lemma in syn.lemmas()
@@ -91,3 +94,15 @@ class StabilityAnalysisSynonyms(StabilityAnalysis):
                 modified_words.append(word)
 
         return " ".join(modified_words)
+
+    original_df = dataset.df[[dataset.text_column]]
+    perturbed_df = original_df.copy()
+    perturbed_df[dataset.text_column] = perturbed_df[dataset.text_column].map(
+        perturb_data
+    )
+
+    return create_stability_analysis_result(
+        dataset.y_pred(model),
+        model.predict(perturbed_df),
+        mean_similarity_threshold,
+    )
validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py

@@ -4,14 +4,24 @@
 
 from transformers import MarianMTModel, MarianTokenizer
 
+from validmind import tags, tasks
 from validmind.logging import get_logger
+from validmind.vm_models import VMDataset, VMModel
 
-from .
+from .utils import create_stability_analysis_result
 
 logger = get_logger(__name__)
 
 
-
+@tags("llm", "text_data", "embeddings", "visualization")
+@tasks("feature_extraction")
+def StabilityAnalysisTranslation(
+    dataset: VMDataset,
+    model: VMModel,
+    source_lang: str = "en",
+    target_lang: str = "fr",
+    mean_similarity_threshold: float = 0.7,
+):
     """
     Evaluates robustness of text embeddings models to noise introduced by translating the original text to another
     language and back.
@@ -45,10 +55,10 @@ class StabilityAnalysisTranslation(StabilityAnalysis):
 
     ### Strengths
 
-    - An effective way to assess the model
+    - An effective way to assess the model's sensitivity and robustness to language translation noise.
     - Provides a realistic scenario which the model might encounter in real-world applications by using translation to
     introduce noise.
-    - Tests the model
+    - Tests the model's capacity to maintain semantic meaning under translational perturbations, extending beyond
     simple lexical changes.
 
     ### Limitations
@@ -60,47 +70,66 @@ class StabilityAnalysisTranslation(StabilityAnalysis):
     - Predominantly language-dependent, thus might not fully capture robustness for languages with fewer resources or
     those highly dissimilar to the source language.
     """
+    # TODO: make the models and tokenizers configurable along with the max length
 
-
-    default_params = {
-        "source_lang": "en",
-        "target_lang": "fr",
-        **StabilityAnalysis.default_params,
-    }
-
-    def perturb_data(self, data: str):
-        if len(data) > 512:
-            logger.info(
-                "Data length exceeds 512 tokens. Truncating data to 512 tokens."
-            )
-            data = data[:512]
-
-        source_lang = self.params["source_lang"]
-        target_lang = self.params["target_lang"]
-
+    try:
         # Initialize the Marian tokenizer and model for the source language
-
-
-
+        translate_model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
+        translate_model = MarianMTModel.from_pretrained(translate_model_name)
+        translate_tokenizer = MarianTokenizer.from_pretrained(translate_model_name)
 
         # Initialize the Marian tokenizer and model for the target language
-
-
-
-
-
-
-
+        reverse_model_name = f"Helsinki-NLP/opus-mt-{target_lang}-{source_lang}"
+        reverse_model = MarianMTModel.from_pretrained(reverse_model_name)
+        reverse_tokenizer = MarianTokenizer.from_pretrained(reverse_model_name)
+    except Exception as e:
+        logger.error(f"Error initializing translation models: {str(e)}")
+        raise e
+
+    # Truncate input if too long (Marian models typically have max length of 512)
+    max_length = 512
+
+    def translate_data(data: str):
+        encoded = translate_tokenizer.encode(
+            data[:1024],  # Truncate input text to avoid extremely long sequences
+            return_tensors="pt",
+            max_length=max_length,
+            truncation=True,
+            padding=True,
+        )
+        translated = translate_model.generate(
+            encoded, max_length=max_length, num_beams=2, early_stopping=True
+        )
+        decoded = translate_tokenizer.decode(translated[0], skip_special_tokens=True)
 
-
-        reverse_encoded = tokenizer_reverse.encode(
+        reverse_encoded = reverse_tokenizer.encode(
             decoded,
             return_tensors="pt",
-
+            max_length=max_length,
+            truncation=True,
+            padding=True,
         )
-
-
-            skip_special_tokens=True,
+        reverse_translated = reverse_model.generate(
+            reverse_encoded, max_length=max_length, num_beams=2, early_stopping=True
         )
 
-        return
+        return reverse_tokenizer.decode(reverse_translated[0], skip_special_tokens=True)
+
+    def perturb_data(data):
+        try:
+            return translate_data(data)
+        except Exception as e:
+            logger.error(f"Error translating data: {str(e)}")
+            return data
+
+    original_df = dataset.df[[dataset.text_column]]
+    perturbed_df = original_df.copy()
+    perturbed_df[dataset.text_column] = perturbed_df[dataset.text_column].map(
+        perturb_data
+    )
+
+    return create_stability_analysis_result(
+        dataset.y_pred(model),
+        model.predict(perturbed_df),
+        mean_similarity_threshold,
+    )
validmind/tests/model_validation/embeddings/utils.py (new file)

@@ -0,0 +1,53 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import numpy as np
+import plotly.express as px
+from sklearn.metrics.pairwise import cosine_similarity
+
+
+def create_stability_analysis_result(
+    original_embeddings,
+    perturbed_embeddings,
+    mean_similarity_threshold=0.7,
+):
+    # Compute cosine similarities between original and perturbed embeddings
+    similarities = cosine_similarity(
+        original_embeddings, perturbed_embeddings
+    ).diagonal()
+
+    mean = np.mean(similarities)
+    passed = mean > mean_similarity_threshold
+
+    return (
+        [
+            {
+                "Mean Similarity": mean,
+                "Min Similarity": np.min(similarities),
+                "Max Similarity": np.max(similarities),
+                "Median Similarity": np.median(similarities),
+                "Std Similarity": np.std(similarities),
+                "Pass/Fail": "Pass" if passed else "Fail",
+            }
+        ],
+        px.histogram(
+            x=similarities.flatten(),
+            nbins=100,
+            title="Cosine Similarity Distribution",
+            labels={"x": "Cosine Similarity"},
+        ),
+        px.density_contour(
+            x=similarities.flatten(),
+            nbinsx=100,
+            title="Cosine Similarity Density",
+            labels={"x": "Cosine Similarity"},
+            marginal_x="histogram",
+        ),
+        px.box(
+            x=similarities.flatten(),
+            labels={"x": "Cosine Similarity"},
+            title="Cosine Similarity Box Plot",
+        ),
+        passed,
+    )
validmind/tests/model_validation/ragas/AnswerCorrectness.py

@@ -14,23 +14,26 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import answer_correctness
+    from ragas.metrics import AnswerCorrectness as answer_correctness
 except ImportError as e:
-
-
-
-
-
-
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for AnswerCorrectness. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
 
 
 @tags("ragas", "llm")
 @tasks("text_qa", "text_generation", "text_summarization")
 def AnswerCorrectness(
     dataset,
-
-
-
+    user_input_column="user_input",
+    response_column="response",
+    reference_column="reference",
 ):
     """
     Evaluates the correctness of answers in a dataset with respect to the provided ground
@@ -62,9 +65,9 @@ def AnswerCorrectness(
 
     This metric requires specific columns to be present in the dataset:
 
-    - `
-    - `
-    - `
+    - `user_input` (str): The text prompt or query that was input into the model.
+    - `response` (str): The text response generated by the model.
+    - `reference` (str): The ground truth answer that the generated answer is compared
     against.
 
     If the above data is not in the appropriate column, you can specify different column
@@ -75,9 +78,9 @@ def AnswerCorrectness(
     pass the following parameters:
     ```python
     params = {
-        "
-        "
-        "
+        "user_input_column": "input_text",
+        "response_column": "output_text",
+        "reference_column": "human_answer",
     }
     ```
 
@@ -86,8 +89,8 @@ def AnswerCorrectness(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "
-        "
+        "response_column": f"{pred_col}.generated_answer",
+        "reference_column": f"{pred_col}.contexts",
     }
     ```
 
@@ -95,8 +98,8 @@ def AnswerCorrectness(
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "
-        "
+        "response_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+        "reference_column": lambda row: [row[pred_col]["context_message"]],
     }
     ```
     """
@@ -107,32 +110,34 @@ def AnswerCorrectness(
     )
 
     required_columns = {
-        "
-        "
-        "
+        "user_input": user_input_column,
+        "response": response_column,
+        "reference": reference_column,
     }
 
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df), metrics=[answer_correctness], **get_ragas_config()
+        Dataset.from_pandas(df), metrics=[answer_correctness()], **get_ragas_config()
     ).to_pandas()
 
-
-
+    score_column = "answer_correctness"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
 
     return (
         {
-            # "Scores (will not be uploaded to
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
             #     ["question", "answer", "ground_truth", "answer_correctness"]
             # ],
             "Aggregate Scores": [
                 {
-                    "Mean Score": result_df[
-                    "Median Score": result_df[
-                    "Max Score": result_df[
-                    "Min Score": result_df[
-                    "Standard Deviation": result_df[
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
                     "Count": result_df.shape[0],
                 }
             ],
validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py}

@@ -14,7 +14,7 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import AspectCritic
+    from ragas.metrics import AspectCritic as aspect_critic
     from ragas.metrics._aspect_critic import (
         coherence,
         conciseness,
@@ -23,24 +23,27 @@ try:
         maliciousness,
     )
 except ImportError as e:
-
-
-
-
-
-
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for AspectCritic. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
 
 LOWER_IS_BETTER_ASPECTS = ["harmfulness", "maliciousness"]
 
 
 @tags("ragas", "llm", "qualitative")
 @tasks("text_summarization", "text_generation", "text_qa")
-def
+def AspectCritic(
     dataset,
-
-
-
-    aspects: list = [
+    user_input_column="user_input",
+    response_column="response",
+    retrieved_contexts_column=None,
+    aspects: list = [
         "coherence",
         "conciseness",
         "correctness",
@@ -62,13 +65,13 @@ def AspectCritique(
 
     ### Inputs and Outputs:
 
-    The input to this metric is a dataset containing the input `
-    and the `
+    The input to this metric is a dataset containing the input `user_input` (prompt to the LLM)
+    and the `response` (text generated by the LLM). Any retrieved `retrieved_contexts` can also be
     included to enhance the evaluation.
 
-    The `
+    The `user_input_column`, `response_column`, and `retrieved_contexts_column` parameters can be used to
     specify the names or sources for the data that this metric will evaluate if the dataset
-    does not contain the required columns `
+    does not contain the required columns `user_input`, `response`, and `retrieved_contexts`.
 
     By default, the aspects evaluated are harmfulness, maliciousness, coherence,
     correctness, and conciseness. To change the aspects evaluated, the `aspects` parameter
@@ -87,17 +90,17 @@ def AspectCritique(
     ### Examples:
 
     - **Mapping to Required Columns:** If the dataset does not contain the columns required
-    to run this metric (i.e., `
+    to run this metric (i.e., `user_input`, `response`, and `retrieved_contexts`), the
 
     ```python
     pred_col = my_vm_dataset.prediction_column(my_vm_model)
     run_test(
-        "validmind.model_validation.ragas.
+        "validmind.model_validation.ragas.AspectCritic",
         inputs={"dataset": my_vm_dataset},
         params={
-            "
-            "
-            "
+            "user_input_column": "input_prompt",
+            "response_column": f"{pred_col}.llm_output",
+            "retrieved_contexts_column": "retrieval_model_prediction",
         },
     )
     ```
@@ -110,7 +113,7 @@ def AspectCritique(
 
     ```python
     run_test(
-        "validmind.model_validation.ragas.
+        "validmind.model_validation.ragas.AspectCritic",
         inputs={"dataset": my_vm_dataset},
         params={
             "additional_aspects": [
@@ -135,16 +138,18 @@ def AspectCritique(
     )
 
     required_columns = {
-        "
-        "
-        "contexts": contexts_column,
+        "user_input": user_input_column,
+        "response": response_column,
     }
 
+    if retrieved_contexts_column:
+        required_columns["retrieved_contexts"] = retrieved_contexts_column
+
     df = get_renamed_columns(dataset._df, required_columns)
 
     custom_aspects = (
         [
-
+            aspect_critic(name=name, definition=description)
             for name, description in additional_aspects
         ]
         if additional_aspects
@@ -162,7 +167,8 @@ def AspectCritique(
             result_df[aspect] = 1 - result_df[aspect]
 
     df_melted = result_df.melt(
-        id_vars=["
+        id_vars=["user_input", "response"]
+        + (["retrieved_contexts"] if retrieved_contexts_column else []),
         value_vars=[aspect.name for aspect in all_aspects],
         var_name="Metric",
        value_name="Result",