validmind 2.5.25__py3-none-any.whl → 2.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.7.dist-info/METADATA +137 -0
  179. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.25.dist-info/METADATA +0 -118
  196. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
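Several tests are renamed in this release (DFGLSArch → DickeyFullerGLS, AspectCritique → AspectCritic, AnswerRelevance → ResponseRelevancy, AnswerSimilarity → SemanticSimilarity), which changes their test IDs. A minimal sketch of what a caller would update when upgrading, assuming `my_vm_dataset` is a placeholder for an already-initialized VMDataset:

```python
from validmind.tests import run_test

# Test IDs mirror the module paths listed above, so renamed modules mean renamed IDs.
# "my_vm_dataset" is a placeholder for a dataset initialized via vm.init_dataset().
run_test(
    "validmind.data_validation.DickeyFullerGLS",  # 2.5.x: validmind.data_validation.DFGLSArch
    inputs={"dataset": my_vm_dataset},
)
run_test(
    "validmind.model_validation.ragas.AspectCritic",  # 2.5.x: ...ragas.AspectCritique
    inputs={"dataset": my_vm_dataset},
)
```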
validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py
@@ -3,11 +3,22 @@
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
 import re
+from typing import Dict
 
-from .StabilityAnalysis import StabilityAnalysis
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
 
+from .utils import create_stability_analysis_result
 
-class StabilityAnalysisKeyword(StabilityAnalysis):
+
+@tags("llm", "text_data", "embeddings", "visualization")
+@tasks("feature_extraction")
+def StabilityAnalysisKeyword(
+    dataset: VMDataset,
+    model: VMModel,
+    keyword_dict: Dict[str, str],
+    mean_similarity_threshold: float = 0.7,
+):
     """
     Evaluates robustness of embedding models to keyword swaps in the test dataset.
 
@@ -49,13 +60,9 @@ class StabilityAnalysisKeyword(StabilityAnalysis):
     which might not always be the case.
     """
 
-    name = "Text Embeddings Stability Analysis to Keyword Swaps"
-    default_params = {
-        "keyword_dict": None,  # set to none by default... this must be overridden
-        **StabilityAnalysis.default_params,
-    }
+    keyword_dict = {k.lower(): v for k, v in keyword_dict.items()}
 
-    def perturb_data(self, data: str):
+    def perturb_data(data: str):
         if not isinstance(data, str):
             return data
 
@@ -63,22 +70,29 @@ class StabilityAnalysisKeyword(StabilityAnalysis):
         tokens = re.findall(r"[\w']+[.,!?;]?|[\w']+", data)
         modified_tokens = []
 
-        # lowercase all keys in the keword_dict
-        self.params["keyword_dict"] = {
-            k.lower(): v for k, v in self.params["keyword_dict"].items()
-        }
-
         for token in tokens:
             # Separate word and punctuation
             word_part = re.match(r"([\w']+)", token).group()
             punctuation_part = token[len(word_part) :]
 
             # Check if the token is a word and if it's in the dictionary
-            if token.lower() in self.params["keyword_dict"]:
+            if token.lower() in keyword_dict:
                 modified_tokens.append(
-                    self.params["keyword_dict"][word_part.lower()] + punctuation_part
+                    keyword_dict[word_part.lower()] + punctuation_part
                 )
             else:
                 modified_tokens.append(token)
 
         return " ".join(modified_tokens)
+
+    original_df = dataset.df[[dataset.text_column]]
+    perturbed_df = original_df.copy()
+    perturbed_df[dataset.text_column] = perturbed_df[dataset.text_column].map(
+        perturb_data
+    )
+
+    return create_stability_analysis_result(
+        dataset.y_pred(model),
+        model.predict(perturbed_df),
+        mean_similarity_threshold,
+    )
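The keyword-swap stability test is now a plain decorated function rather than a StabilityAnalysis subclass, so `keyword_dict` and `mean_similarity_threshold` are passed as test params. A minimal usage sketch, assuming `my_vm_dataset` (with a text column) and `my_vm_model` (an embedding model) are placeholders for already-initialized inputs; the keyword mapping is hypothetical:

```python
from validmind.tests import run_test

run_test(
    "validmind.model_validation.embeddings.StabilityAnalysisKeyword",
    inputs={"dataset": my_vm_dataset, "model": my_vm_model},
    params={
        "keyword_dict": {"good": "excellent", "bad": "poor"},  # hypothetical swaps
        "mean_similarity_threshold": 0.7,
    },
)
```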
validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py
@@ -5,7 +5,10 @@
 import random
 import string
 
-from .StabilityAnalysis import StabilityAnalysis
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
+
+from .utils import create_stability_analysis_result
 
 
 def random_swap(word_list):
@@ -59,7 +62,14 @@ def random_insertion(word_list):
     return word_list[:index] + [random_word] + word_list[index:]
 
 
-class StabilityAnalysisRandomNoise(StabilityAnalysis):
+@tags("llm", "text_data", "embeddings", "visualization")
+@tasks("feature_extraction")
+def StabilityAnalysisRandomNoise(
+    dataset: VMDataset,
+    model: VMModel,
+    probability: float = 0.02,
+    mean_similarity_threshold: float = 0.7,
+):
     """
     Assesses the robustness of text embeddings models to random noise introduced via text perturbations.
 
@@ -106,18 +116,10 @@ class StabilityAnalysisRandomNoise(StabilityAnalysis):
     - Does not guarantee model performance on new, unseen, real-world data beyond the generated noisy test data.
     """
 
-    name = "Text Embeddings Stability Analysis to Random Noise"
-    default_params = {
-        **StabilityAnalysis.default_params,
-        "probability": 0.02,
-    }
-
-    def perturb_data(self, data):
+    def perturb_data(data):
         if not isinstance(data, str):
             return data
 
-        probability = self.params["probability"]
-
         # Tokenize the string based on spaces
         words = data.split()
 
@@ -136,3 +138,15 @@ class StabilityAnalysisRandomNoise(StabilityAnalysis):
             words = random_insertion(words)
 
         return " ".join(words)
+
+    original_df = dataset.df[[dataset.text_column]]
+    perturbed_df = original_df.copy()
+    perturbed_df[dataset.text_column] = perturbed_df[dataset.text_column].map(
+        perturb_data
+    )
+
+    return create_stability_analysis_result(
+        dataset.y_pred(model),
+        model.predict(perturbed_df),
+        mean_similarity_threshold,
+    )
validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py
@@ -7,10 +7,20 @@ import random
 import nltk
 from nltk.corpus import wordnet as wn
 
-from .StabilityAnalysis import StabilityAnalysis
+from validmind import tags, tasks
+from validmind.vm_models import VMDataset, VMModel
 
+from .utils import create_stability_analysis_result
 
-class StabilityAnalysisSynonyms(StabilityAnalysis):
+
+@tags("llm", "text_data", "embeddings", "visualization")
+@tasks("feature_extraction")
+def StabilityAnalysisSynonyms(
+    dataset: VMDataset,
+    model: VMModel,
+    probability: float = 0.02,
+    mean_similarity_threshold: float = 0.7,
+):
     """
     Evaluates the stability of text embeddings models when words in test data are replaced by their synonyms randomly.
 
@@ -55,26 +65,19 @@ class StabilityAnalysisSynonyms(StabilityAnalysis):
     - Does not consider the semantic role of the words in the sentence, meaning the swapped synonym could potentially
     alter the overall meaning of the sentence, leading to a false perception of the model's stability.
     """
+    # download the nltk wordnet
+    nltk.download("wordnet", quiet=True)
 
-    name = "Text Embeddings Stability Analysis to Synonym Swaps"
-    default_params = {
-        "probability": 0.02,  # probability of swapping a word with a synonym
-        **StabilityAnalysis.default_params,
-    }
-
-    def perturb_data(self, data):
+    def perturb_data(data):
         if not isinstance(data, str):
             return data
 
-        # download the nltk wordnet
-        nltk.download("wordnet", quiet=True)
-
         words = nltk.word_tokenize(data)
         modified_words = []
 
         # For each word, check the probability and swap if needed
        for word in words:
-            if random.random() <= self.params["probability"]:
+            if random.random() <= probability:
                 # get synonyms for the word
                 synonyms = [
                     lemma.name() for syn in wn.synsets(word) for lemma in syn.lemmas()
@@ -91,3 +94,15 @@ class StabilityAnalysisSynonyms(StabilityAnalysis):
                 modified_words.append(word)
 
         return " ".join(modified_words)
+
+    original_df = dataset.df[[dataset.text_column]]
+    perturbed_df = original_df.copy()
+    perturbed_df[dataset.text_column] = perturbed_df[dataset.text_column].map(
+        perturb_data
+    )
+
+    return create_stability_analysis_result(
+        dataset.y_pred(model),
+        model.predict(perturbed_df),
+        mean_similarity_threshold,
+    )
validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py
@@ -4,14 +4,24 @@
 
 from transformers import MarianMTModel, MarianTokenizer
 
+from validmind import tags, tasks
 from validmind.logging import get_logger
+from validmind.vm_models import VMDataset, VMModel
 
-from .StabilityAnalysis import StabilityAnalysis
+from .utils import create_stability_analysis_result
 
 logger = get_logger(__name__)
 
 
-class StabilityAnalysisTranslation(StabilityAnalysis):
+@tags("llm", "text_data", "embeddings", "visualization")
+@tasks("feature_extraction")
+def StabilityAnalysisTranslation(
+    dataset: VMDataset,
+    model: VMModel,
+    source_lang: str = "en",
+    target_lang: str = "fr",
+    mean_similarity_threshold: float = 0.7,
+):
     """
     Evaluates robustness of text embeddings models to noise introduced by translating the original text to another
     language and back.
@@ -45,10 +55,10 @@ class StabilityAnalysisTranslation(StabilityAnalysis):
 
     ### Strengths
 
-    - An effective way to assess the models sensitivity and robustness to language translation noise.
+    - An effective way to assess the model's sensitivity and robustness to language translation noise.
    - Provides a realistic scenario which the model might encounter in real-world applications by using translation to
     introduce noise.
-    - Tests the models capacity to maintain semantic meaning under translational perturbations, extending beyond
+    - Tests the model's capacity to maintain semantic meaning under translational perturbations, extending beyond
     simple lexical changes.
 
     ### Limitations
@@ -60,47 +70,66 @@ class StabilityAnalysisTranslation(StabilityAnalysis):
    - Predominantly language-dependent, thus might not fully capture robustness for languages with fewer resources or
     those highly dissimilar to the source language.
     """
+    # TODO: make the models and tokenizers configurable along with the max length
 
-    name = "Text Embeddings Stability Analysis to Translation"
-    default_params = {
-        "source_lang": "en",
-        "target_lang": "fr",
-        **StabilityAnalysis.default_params,
-    }
-
-    def perturb_data(self, data: str):
-        if len(data) > 512:
-            logger.info(
-                "Data length exceeds 512 tokens. Truncating data to 512 tokens."
-            )
-            data = data[:512]
-
-        source_lang = self.params["source_lang"]
-        target_lang = self.params["target_lang"]
-
+    try:
         # Initialize the Marian tokenizer and model for the source language
-        model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
-        model = MarianMTModel.from_pretrained(model_name)
-        tokenizer = MarianTokenizer.from_pretrained(model_name)
+        translate_model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
+        translate_model = MarianMTModel.from_pretrained(translate_model_name)
+        translate_tokenizer = MarianTokenizer.from_pretrained(translate_model_name)
 
         # Initialize the Marian tokenizer and model for the target language
-        model_name_reverse = f"Helsinki-NLP/opus-mt-{target_lang}-{source_lang}"
-        model_reverse = MarianMTModel.from_pretrained(model_name_reverse)
-        tokenizer_reverse = MarianTokenizer.from_pretrained(model_name_reverse)
-
-        # Translate to the target language
-        encoded = tokenizer.encode(data, return_tensors="pt", add_special_tokens=True)
-        decoded = tokenizer.decode(model.generate(encoded)[0], skip_special_tokens=True)
+        reverse_model_name = f"Helsinki-NLP/opus-mt-{target_lang}-{source_lang}"
+        reverse_model = MarianMTModel.from_pretrained(reverse_model_name)
+        reverse_tokenizer = MarianTokenizer.from_pretrained(reverse_model_name)
+    except Exception as e:
+        logger.error(f"Error initializing translation models: {str(e)}")
+        raise e
+
+    # Truncate input if too long (Marian models typically have max length of 512)
+    max_length = 512
+
+    def translate_data(data: str):
+        encoded = translate_tokenizer.encode(
+            data[:1024],  # Truncate input text to avoid extremely long sequences
+            return_tensors="pt",
+            max_length=max_length,
+            truncation=True,
+            padding=True,
+        )
+        translated = translate_model.generate(
+            encoded, max_length=max_length, num_beams=2, early_stopping=True
+        )
+        decoded = translate_tokenizer.decode(translated[0], skip_special_tokens=True)
 
-        # Translate back to the source language
-        reverse_encoded = tokenizer_reverse.encode(
+        reverse_encoded = reverse_tokenizer.encode(
             decoded,
             return_tensors="pt",
-            add_special_tokens=True,
+            max_length=max_length,
+            truncation=True,
+            padding=True,
         )
-        reverse_decoded = tokenizer_reverse.decode(
-            model_reverse.generate(reverse_encoded)[0],
-            skip_special_tokens=True,
+        reverse_translated = reverse_model.generate(
+            reverse_encoded, max_length=max_length, num_beams=2, early_stopping=True
        )
 
-        return reverse_decoded
+        return reverse_tokenizer.decode(reverse_translated[0], skip_special_tokens=True)
+
+    def perturb_data(data):
+        try:
+            return translate_data(data)
+        except Exception as e:
+            logger.error(f"Error translating data: {str(e)}")
+            return data
+
+    original_df = dataset.df[[dataset.text_column]]
+    perturbed_df = original_df.copy()
+    perturbed_df[dataset.text_column] = perturbed_df[dataset.text_column].map(
+        perturb_data
+    )
+
+    return create_stability_analysis_result(
+        dataset.y_pred(model),
+        model.predict(perturbed_df),
+        mean_similarity_threshold,
+    )
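The translation round-trip test follows the same functional pattern, with the language pair exposed as params; the Helsinki-NLP MarianMT checkpoints for both directions are downloaded on first use. A usage sketch under the same placeholder assumptions as above:

```python
from validmind.tests import run_test

run_test(
    "validmind.model_validation.embeddings.StabilityAnalysisTranslation",
    inputs={"dataset": my_vm_dataset, "model": my_vm_model},
    params={
        "source_lang": "en",
        "target_lang": "fr",
        "mean_similarity_threshold": 0.7,
    },
)
```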
validmind/tests/model_validation/embeddings/utils.py (new file)
@@ -0,0 +1,53 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+import numpy as np
+import plotly.express as px
+from sklearn.metrics.pairwise import cosine_similarity
+
+
+def create_stability_analysis_result(
+    original_embeddings,
+    perturbed_embeddings,
+    mean_similarity_threshold=0.7,
+):
+    # Compute cosine similarities between original and perturbed embeddings
+    similarities = cosine_similarity(
+        original_embeddings, perturbed_embeddings
+    ).diagonal()
+
+    mean = np.mean(similarities)
+    passed = mean > mean_similarity_threshold
+
+    return (
+        [
+            {
+                "Mean Similarity": mean,
+                "Min Similarity": np.min(similarities),
+                "Max Similarity": np.max(similarities),
+                "Median Similarity": np.median(similarities),
+                "Std Similarity": np.std(similarities),
+                "Pass/Fail": "Pass" if passed else "Fail",
+            }
+        ],
+        px.histogram(
+            x=similarities.flatten(),
+            nbins=100,
+            title="Cosine Similarity Distribution",
+            labels={"x": "Cosine Similarity"},
+        ),
+        px.density_contour(
+            x=similarities.flatten(),
+            nbinsx=100,
+            title="Cosine Similarity Density",
+            labels={"x": "Cosine Similarity"},
+            marginal_x="histogram",
+        ),
+        px.box(
+            x=similarities.flatten(),
+            labels={"x": "Cosine Similarity"},
+            title="Cosine Similarity Box Plot",
+        ),
+        passed,
+    )
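The new shared helper returns a 5-tuple — one summary-table row, three Plotly figures, and a pass/fail boolean — which each refactored stability test returns directly. A standalone sketch with synthetic embeddings to illustrate the shape of the result (the import path follows the module location above; array sizes are arbitrary):

```python
import numpy as np

from validmind.tests.model_validation.embeddings.utils import (
    create_stability_analysis_result,
)

rng = np.random.default_rng(0)
original = rng.normal(size=(100, 384))  # synthetic "embeddings"
perturbed = original + rng.normal(scale=0.05, size=original.shape)

table_rows, hist_fig, density_fig, box_fig, passed = create_stability_analysis_result(
    original, perturbed, mean_similarity_threshold=0.7
)
print(table_rows[0]["Mean Similarity"], passed)
```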
validmind/tests/model_validation/ragas/AnswerCorrectness.py
@@ -14,23 +14,26 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import answer_correctness
+    from ragas.metrics import AnswerCorrectness as answer_correctness
 except ImportError as e:
-    raise MissingDependencyError(
-        "Missing required package `ragas` for AnswerCorrectness. "
-        "Please run `pip install validmind[llm]` to use LLM tests",
-        required_dependencies=["ragas"],
-        extra="llm",
-    ) from e
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for AnswerCorrectness. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
 
 
 @tags("ragas", "llm")
 @tasks("text_qa", "text_generation", "text_summarization")
 def AnswerCorrectness(
     dataset,
-    question_column="question",
-    answer_column="answer",
-    ground_truth_column="ground_truth",
+    user_input_column="user_input",
+    response_column="response",
+    reference_column="reference",
 ):
     """
     Evaluates the correctness of answers in a dataset with respect to the provided ground
@@ -62,9 +65,9 @@ def AnswerCorrectness(
 
     This metric requires specific columns to be present in the dataset:
 
-    - `question` (str): The text prompt or query that was input into the model.
-    - `answer` (str): The text response generated by the model.
-    - `ground_truth` (str): The ground truth answer that the generated answer is compared
+    - `user_input` (str): The text prompt or query that was input into the model.
+    - `response` (str): The text response generated by the model.
+    - `reference` (str): The ground truth answer that the generated answer is compared
     against.
 
     If the above data is not in the appropriate column, you can specify different column
@@ -75,9 +78,9 @@ def AnswerCorrectness(
     pass the following parameters:
     ```python
     params = {
-        "question_column": "input_text",
-        "answer_column": "output_text",
-        "ground_truth_column": "human_answer",
+        "user_input_column": "input_text",
+        "response_column": "output_text",
+        "reference_column": "human_answer",
     }
     ```
 
@@ -86,8 +89,8 @@
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "answer_column": f"{pred_col}.generated_answer",
-        "ground_truth_column": f"{pred_col}.contexts",
+        "response_column": f"{pred_col}.generated_answer",
+        "reference_column": f"{pred_col}.contexts",
     }
     ```
 
@@ -95,8 +98,8 @@
     ```python
     pred_col = dataset.prediction_column(model)
     params = {
-        "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
-        "ground_truth_column": lambda row: [row[pred_col]["context_message"]],
+        "response_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+        "reference_column": lambda row: [row[pred_col]["context_message"]],
     }
     ```
     """
@@ -107,32 +110,34 @@
     )
 
     required_columns = {
-        "question": question_column,
-        "answer": answer_column,
-        "ground_truth": ground_truth_column,
+        "user_input": user_input_column,
+        "response": response_column,
+        "reference": reference_column,
     }
 
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df), metrics=[answer_correctness], **get_ragas_config()
+        Dataset.from_pandas(df), metrics=[answer_correctness()], **get_ragas_config()
     ).to_pandas()
 
-    fig_histogram = px.histogram(x=result_df["answer_correctness"].to_list(), nbins=10)
-    fig_box = px.box(x=result_df["answer_correctness"].to_list())
+    score_column = "answer_correctness"
+
+    fig_histogram = px.histogram(x=result_df[score_column].to_list(), nbins=10)
+    fig_box = px.box(x=result_df[score_column].to_list())
 
     return (
         {
-            # "Scores (will not be uploaded to UI)": result_df[
+            # "Scores (will not be uploaded to ValidMind Platform)": result_df[
            #     ["question", "answer", "ground_truth", "answer_correctness"]
            # ],
            "Aggregate Scores": [
                {
-                    "Mean Score": result_df["answer_correctness"].mean(),
-                    "Median Score": result_df["answer_correctness"].median(),
-                    "Max Score": result_df["answer_correctness"].max(),
-                    "Min Score": result_df["answer_correctness"].min(),
-                    "Standard Deviation": result_df["answer_correctness"].std(),
+                    "Mean Score": result_df[score_column].mean(),
+                    "Median Score": result_df[score_column].median(),
+                    "Max Score": result_df[score_column].max(),
+                    "Min Score": result_df[score_column].min(),
+                    "Standard Deviation": result_df[score_column].std(),
                    "Count": result_df.shape[0],
                }
            ],
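The ragas tests now follow the newer ragas column naming (`user_input`/`response`/`reference` instead of `question`/`answer`/`ground_truth`), so explicit column mappings need to be renamed when upgrading. A sketch mirroring the docstring example above, with hypothetical column names and `my_vm_dataset` as a placeholder for an initialized VMDataset:

```python
from validmind.tests import run_test

run_test(
    "validmind.model_validation.ragas.AnswerCorrectness",
    inputs={"dataset": my_vm_dataset},
    params={
        # 2.5.x param names: question_column / answer_column / ground_truth_column
        "user_input_column": "input_text",
        "response_column": "output_text",
        "reference_column": "human_answer",
    },
)
```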
validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py}
@@ -14,7 +14,7 @@ from .utils import get_ragas_config, get_renamed_columns
 
 try:
     from ragas import evaluate
-    from ragas.metrics import AspectCritic
+    from ragas.metrics import AspectCritic as aspect_critic
     from ragas.metrics._aspect_critic import (
         coherence,
         conciseness,
@@ -23,24 +23,27 @@ try:
         maliciousness,
     )
 except ImportError as e:
-    raise MissingDependencyError(
-        "Missing required package `ragas` for AspectCritique. "
-        "Please run `pip install validmind[llm]` to use LLM tests",
-        required_dependencies=["ragas"],
-        extra="llm",
-    ) from e
+    if "ragas" in str(e):
+        raise MissingDependencyError(
+            "Missing required package `ragas` for AspectCritic. "
+            "Please run `pip install validmind[llm]` to use LLM tests",
+            required_dependencies=["ragas"],
+            extra="llm",
+        ) from e
+
+    raise e
 
 LOWER_IS_BETTER_ASPECTS = ["harmfulness", "maliciousness"]
 
 
 @tags("ragas", "llm", "qualitative")
 @tasks("text_summarization", "text_generation", "text_qa")
-def AspectCritique(
+def AspectCritic(
     dataset,
-    question_column="question",
-    answer_column="answer",
-    contexts_column="contexts",
-    aspects: list = [  # noqa: B006 this is fine as immutable default since it never gets modified
+    user_input_column="user_input",
+    response_column="response",
+    retrieved_contexts_column=None,
+    aspects: list = [
         "coherence",
         "conciseness",
         "correctness",
@@ -62,13 +65,13 @@ def AspectCritique(
 
     ### Inputs and Outputs:
 
-    The input to this metric is a dataset containing the input `question` (prompt to the LLM)
-    and the `answer` (text generated by the LLM). Any retrieved `contexts` can also be
+    The input to this metric is a dataset containing the input `user_input` (prompt to the LLM)
+    and the `response` (text generated by the LLM). Any retrieved `retrieved_contexts` can also be
     included to enhance the evaluation.
 
-    The `question_column`, `answer_column`, and `contexts_column` parameters can be used to
+    The `user_input_column`, `response_column`, and `retrieved_contexts_column` parameters can be used to
     specify the names or sources for the data that this metric will evaluate if the dataset
-    does not contain the required columns `question`, `answer`, and `contexts`.
+    does not contain the required columns `user_input`, `response`, and `retrieved_contexts`.
 
     By default, the aspects evaluated are harmfulness, maliciousness, coherence,
     correctness, and conciseness. To change the aspects evaluated, the `aspects` parameter
@@ -87,17 +90,17 @@ def AspectCritique(
     ### Examples:
 
    - **Mapping to Required Columns:** If the dataset does not contain the columns required
-    to run this metric (i.e., `question`, `answer`, and `contexts`), the
+    to run this metric (i.e., `user_input`, `response`, and `retrieved_contexts`), the
 
     ```python
     pred_col = my_vm_dataset.prediction_column(my_vm_model)
     run_test(
-        "validmind.model_validation.ragas.AspectCritique",
+        "validmind.model_validation.ragas.AspectCritic",
         inputs={"dataset": my_vm_dataset},
         params={
-            "question_column": "input_prompt",
-            "answer_column": f"{pred_col}.llm_output",
-            "contexts_column": lambda row: [row[pred_col]["context_message"]],
+            "user_input_column": "input_prompt",
+            "response_column": f"{pred_col}.llm_output",
+            "retrieved_contexts_column": "retrieval_model_prediction",
        },
    )
    ```
@@ -110,7 +113,7 @@ def AspectCritique(
 
     ```python
     run_test(
-        "validmind.model_validation.ragas.AspectCritique",
+        "validmind.model_validation.ragas.AspectCritic",
         inputs={"dataset": my_vm_dataset},
         params={
            "additional_aspects": [
@@ -135,16 +138,18 @@ def AspectCritique(
     )
 
     required_columns = {
-        "question": question_column,
-        "answer": answer_column,
-        "contexts": contexts_column,
+        "user_input": user_input_column,
+        "response": response_column,
     }
 
+    if retrieved_contexts_column:
+        required_columns["retrieved_contexts"] = retrieved_contexts_column
+
     df = get_renamed_columns(dataset._df, required_columns)
 
     custom_aspects = (
         [
-            AspectCritic(name=name, definition=description)
+            aspect_critic(name=name, definition=description)
            for name, description in additional_aspects
        ]
        if additional_aspects
@@ -162,7 +167,8 @@ def AspectCritique(
            result_df[aspect] = 1 - result_df[aspect]
 
     df_melted = result_df.melt(
-        id_vars=["question", "answer", "contexts"],
+        id_vars=["user_input", "response"]
+        + (["retrieved_contexts"] if retrieved_contexts_column else []),
        value_vars=[aspect.name for aspect in all_aspects],
        var_name="Metric",
        value_name="Result",