validmind 2.5.8__py3-none-any.whl → 2.5.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233)
  1. validmind/__version__.py +1 -1
  2. validmind/ai/test_descriptions.py +80 -119
  3. validmind/ai/test_result_description/config.yaml +29 -0
  4. validmind/ai/test_result_description/context.py +73 -0
  5. validmind/ai/test_result_description/image_processing.py +124 -0
  6. validmind/ai/test_result_description/system.jinja +39 -0
  7. validmind/ai/test_result_description/user.jinja +25 -0
  8. validmind/api_client.py +89 -43
  9. validmind/client.py +2 -2
  10. validmind/client_config.py +11 -14
  11. validmind/datasets/credit_risk/__init__.py +1 -0
  12. validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
  13. validmind/datasets/credit_risk/lending_club_bias.py +142 -0
  14. validmind/datasets/regression/fred_timeseries.py +67 -138
  15. validmind/template.py +1 -0
  16. validmind/test_suites/__init__.py +0 -2
  17. validmind/test_suites/statsmodels_timeseries.py +1 -1
  18. validmind/test_suites/summarization.py +0 -1
  19. validmind/test_suites/time_series.py +0 -43
  20. validmind/tests/__types__.py +14 -15
  21. validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
  22. validmind/tests/data_validation/ADF.py +31 -24
  23. validmind/tests/data_validation/AutoAR.py +9 -9
  24. validmind/tests/data_validation/AutoMA.py +23 -16
  25. validmind/tests/data_validation/AutoSeasonality.py +18 -16
  26. validmind/tests/data_validation/AutoStationarity.py +21 -16
  27. validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
  28. validmind/tests/{model_validation/statsmodels → data_validation}/BoxPierce.py +34 -34
  29. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +85 -124
  30. validmind/tests/data_validation/ClassImbalance.py +15 -12
  31. validmind/tests/data_validation/DFGLSArch.py +19 -13
  32. validmind/tests/data_validation/DatasetDescription.py +17 -11
  33. validmind/tests/data_validation/DatasetSplit.py +7 -5
  34. validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
  35. validmind/tests/data_validation/Duplicates.py +33 -25
  36. validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
  37. validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
  38. validmind/tests/data_validation/HighCardinality.py +19 -12
  39. validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
  40. validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
  41. validmind/tests/data_validation/IQROutliersTable.py +40 -36
  42. validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
  43. validmind/tests/data_validation/JarqueBera.py +70 -0
  44. validmind/tests/data_validation/KPSS.py +34 -29
  45. validmind/tests/data_validation/LJungBox.py +66 -0
  46. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
  47. validmind/tests/data_validation/MissingValues.py +32 -27
  48. validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
  49. validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
  50. validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
  51. validmind/tests/data_validation/ProtectedClassesCombination.py +197 -0
  52. validmind/tests/data_validation/ProtectedClassesDescription.py +130 -0
  53. validmind/tests/data_validation/ProtectedClassesDisparity.py +133 -0
  54. validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +172 -0
  55. validmind/tests/data_validation/RollingStatsPlot.py +31 -23
  56. validmind/tests/data_validation/RunsTest.py +72 -0
  57. validmind/tests/data_validation/ScatterPlot.py +63 -78
  58. validmind/tests/data_validation/SeasonalDecompose.py +38 -34
  59. validmind/tests/{model_validation/statsmodels → data_validation}/ShapiroWilk.py +35 -30
  60. validmind/tests/data_validation/Skewness.py +35 -37
  61. validmind/tests/data_validation/SpreadPlot.py +35 -35
  62. validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
  63. validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
  64. validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
  65. validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
  66. validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
  67. validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
  68. validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
  69. validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
  70. validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
  71. validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
  72. validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
  73. validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
  74. validmind/tests/data_validation/TooManyZeroValues.py +16 -11
  75. validmind/tests/data_validation/UniqueRows.py +11 -6
  76. validmind/tests/data_validation/WOEBinPlots.py +23 -16
  77. validmind/tests/data_validation/WOEBinTable.py +35 -30
  78. validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
  79. validmind/tests/data_validation/nlp/CommonWords.py +21 -14
  80. validmind/tests/data_validation/nlp/Hashtags.py +42 -40
  81. validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
  82. validmind/tests/data_validation/nlp/Mentions.py +21 -15
  83. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
  84. validmind/tests/data_validation/nlp/Punctuations.py +24 -20
  85. validmind/tests/data_validation/nlp/Sentiment.py +27 -8
  86. validmind/tests/data_validation/nlp/StopWords.py +26 -19
  87. validmind/tests/data_validation/nlp/TextDescription.py +39 -36
  88. validmind/tests/data_validation/nlp/Toxicity.py +32 -9
  89. validmind/tests/decorator.py +81 -42
  90. validmind/tests/model_validation/BertScore.py +36 -27
  91. validmind/tests/model_validation/BleuScore.py +25 -19
  92. validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
  93. validmind/tests/model_validation/ContextualRecall.py +38 -13
  94. validmind/tests/model_validation/FeaturesAUC.py +32 -13
  95. validmind/tests/model_validation/MeteorScore.py +46 -33
  96. validmind/tests/model_validation/ModelMetadata.py +32 -64
  97. validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
  98. validmind/tests/model_validation/RegardScore.py +30 -14
  99. validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
  100. validmind/tests/model_validation/RougeScore.py +36 -30
  101. validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
  102. validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
  103. validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
  104. validmind/tests/model_validation/TokenDisparity.py +31 -23
  105. validmind/tests/model_validation/ToxicityScore.py +26 -17
  106. validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
  107. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
  108. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
  109. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
  110. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
  111. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
  112. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
  113. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
  114. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
  115. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
  116. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
  117. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
  118. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
  119. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
  120. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
  121. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
  122. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  123. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  124. validmind/tests/model_validation/ragas/AspectCritique.py +12 -6
  125. validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
  126. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
  127. validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
  128. validmind/tests/model_validation/ragas/ContextUtilization.py +155 -0
  129. validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
  130. validmind/tests/model_validation/ragas/NoiseSensitivity.py +152 -0
  131. validmind/tests/model_validation/ragas/utils.py +6 -0
  132. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
  133. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
  134. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
  135. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
  136. validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
  137. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
  138. validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
  139. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
  140. validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
  141. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
  142. validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
  143. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
  144. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
  145. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
  146. validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
  147. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
  148. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
  149. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
  150. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +32 -26
  151. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
  152. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
  153. validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
  154. validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
  155. validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
  156. validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
  157. validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -94
  158. validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
  159. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
  160. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +66 -5
  161. validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
  162. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
  163. validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
  164. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
  165. validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
  166. validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
  167. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +59 -32
  168. validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
  169. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
  170. validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
  171. validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +86 -119
  172. validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
  173. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
  174. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
  175. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
  176. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
  177. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
  178. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
  179. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
  180. validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
  181. validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
  182. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
  183. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
  184. validmind/tests/prompt_validation/Bias.py +14 -11
  185. validmind/tests/prompt_validation/Clarity.py +16 -14
  186. validmind/tests/prompt_validation/Conciseness.py +7 -5
  187. validmind/tests/prompt_validation/Delimitation.py +23 -22
  188. validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
  189. validmind/tests/prompt_validation/Robustness.py +12 -10
  190. validmind/tests/prompt_validation/Specificity.py +13 -11
  191. validmind/tests/prompt_validation/ai_powered_test.py +6 -0
  192. validmind/tests/run.py +68 -23
  193. validmind/unit_metrics/__init__.py +81 -144
  194. validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
  195. validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
  196. validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
  197. validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
  198. validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
  199. validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
  200. validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
  201. validmind/unit_metrics/regression/HuberLoss.py +1 -1
  202. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
  203. validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
  204. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
  205. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
  206. validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
  207. validmind/unit_metrics/regression/QuantileLoss.py +1 -1
  208. validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
  209. validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
  210. validmind/utils.py +4 -0
  211. validmind/vm_models/dataset/dataset.py +2 -0
  212. validmind/vm_models/figure.py +5 -0
  213. validmind/vm_models/test/metric.py +1 -0
  214. validmind/vm_models/test/result_wrapper.py +143 -158
  215. validmind/vm_models/test/threshold_test.py +1 -0
  216. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/METADATA +4 -3
  217. validmind-2.5.18.dist-info/RECORD +324 -0
  218. validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
  219. validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
  220. validmind/tests/data_validation/BivariateHistograms.py +0 -117
  221. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
  222. validmind/tests/data_validation/MissingValuesRisk.py +0 -88
  223. validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
  224. validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
  225. validmind/tests/model_validation/statsmodels/JarqueBera.py +0 -73
  226. validmind/tests/model_validation/statsmodels/LJungBox.py +0 -66
  227. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
  228. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
  229. validmind/tests/model_validation/statsmodels/RunsTest.py +0 -71
  230. validmind-2.5.8.dist-info/RECORD +0 -318
  231. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/LICENSE +0 -0
  232. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/WHEEL +0 -0
  233. {validmind-2.5.8.dist-info → validmind-2.5.18.dist-info}/entry_points.txt +0 -0
@@ -17,32 +17,44 @@ from validmind import tags, tasks
 @tasks("text_qa", "text_generation", "text_summarization")
 def PCAComponentsPairwisePlots(dataset, model, n_components=3):
     """
-    Generates scatter plots for pairwise combinations of principal component analysis (PCA) components of model embeddings.
-
-    **Purpose:**
-    This function visualizes the principal components of embeddings derived from a specified model. Principal Component Analysis (PCA)
-    is a statistical technique that emphasizes variation and uncovers strong patterns in a dataset.
-    It transforms the original variables into new, uncorrelated variables (principal components) that maximize variance.
-
-    **Test Mechanism:**
-    The function follows a sequential process to visualize PCA components effectively.
-    It starts by extracting embeddings from the dataset, utilizing the model specified by the user.
-    These embeddings are then standardized to ensure zero mean and unit variance, which is crucial to prevent
-    any single feature from dominating due to scale—this standardization is a critical preprocessing step for PCA.
-    Following this, the function calculates the specified number of principal components.
-    The core of the visualization process involves creating scatter plots for each pairwise combination of these principal components.
-
-    **Signs of High Risk:**
-    - If the principal components do not account for a significant portion of the variance, it may suggest that PCA is not capturing the essential structures of the data.
-    - Similarity in scatter plots across different pairs of components could indicate redundancy in the components, suggesting that fewer dimensions might be sufficient to represent the data.
-
-    **Strengths:**
-    - Enables a simplified visualization of multivariate data, helping to identify patterns across many variables effectively.
-    - Provides a clear depiction of the directions of maximum variance in the data, which is valuable for feature selection and dimensionality reduction.
-
-    **Limitations:**
-    - PCA's effectiveness hinges on the scaling of the variables; improper standardization can lead to misleading interpretations.
-    - The interpretation of principal components can be challenging, especially if they capture less significant variances or are difficult to relate back to the original features.
+    Generates scatter plots for pairwise combinations of principal component analysis (PCA) components of model
+    embeddings.
+
+    ### Purpose
+
+    This function visualizes the principal components of embeddings derived from a specified model. Principal Component
+    Analysis (PCA) is a statistical technique that emphasizes variation and uncovers strong patterns in a dataset. It
+    transforms the original variables into new, uncorrelated variables (principal components) that maximize variance.
+
+    ### Test Mechanism
+
+    The function follows a sequential process to visualize PCA components effectively. It starts by extracting
+    embeddings from the dataset, utilizing the model specified by the user. These embeddings are then standardized to
+    ensure zero mean and unit variance, which is crucial to prevent any single feature from dominating due to
+    scale—this standardization is a critical preprocessing step for PCA. Following this, the function calculates the
+    specified number of principal components. The core of the visualization process involves creating scatter plots for
+    each pairwise combination of these principal components.
+
+    ### Signs of High Risk
+
+    - If the principal components do not account for a significant portion of the variance, it may suggest that PCA is
+    not capturing the essential structures of the data.
+    - Similarity in scatter plots across different pairs of components could indicate redundancy in the components,
+    suggesting that fewer dimensions might be sufficient to represent the data.
+
+    ### Strengths
+
+    - Enables a simplified visualization of multivariate data, helping to identify patterns across many variables
+    effectively.
+    - Provides a clear depiction of the directions of maximum variance in the data, which is valuable for feature
+    selection and dimensionality reduction.
+
+    ### Limitations
+
+    - PCA's effectiveness hinges on the scaling of the variables; improper standardization can lead to misleading
+    interpretations.
+    - The interpretation of principal components can be challenging, especially if they capture less significant
+    variances or are difficult to relate back to the original features.
     """

     # Get embeddings from the dataset using the model
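Note: the reworked docstring above describes the test mechanism in prose. As a rough illustration of that flow (not the package's actual implementation; the random `embeddings` array stands in for the `dataset.y_pred(model)` call shown in the hunk):

```python
import itertools

import numpy as np
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Placeholder for the embeddings extracted via dataset.y_pred(model)
embeddings = np.random.rand(100, 64)

# Standardize to zero mean and unit variance so no feature dominates by scale
scaled = StandardScaler().fit_transform(embeddings)

# Compute the requested number of principal components
n_components = 3
components = PCA(n_components=n_components).fit_transform(scaled)

# One scatter plot per unique pair of components
figures = [
    px.scatter(
        x=components[:, i],
        y=components[:, j],
        labels={"x": f"PC{i + 1}", "y": f"PC{j + 1}"},
        title=f"PC{i + 1} vs PC{j + 1}",
    )
    for i, j in itertools.combinations(range(n_components), 2)
]
```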
@@ -23,7 +23,46 @@ logger = get_logger(__name__)


 class StabilityAnalysis(ThresholdTest):
-    """Base class for embeddings stability analysis tests"""
+    """
+    Assesses the stability of embeddings generated by a model when faced with perturbed input data to ensure robustness
+    and consistency.
+
+    ### Purpose
+
+    The Embedding Stability test evaluates the robustness of the embeddings generated by a model when the input text is
+    perturbed. By comparing the cosine similarities between the original and perturbed embeddings, it gauges the
+    model's ability to maintain consistent semantic representations under slight variations in the input data.
+
+    ### Test Mechanism
+
+    This test works by:
+
+    - Perturbing the original text data.
+    - Generating embeddings for both the original and perturbed datasets using the model.
+    - Calculating the cosine similarities between the original and perturbed embeddings.
+    - Analyzing the distribution of these similarities (mean, min, max, median, and standard deviation).
+    - Determining the test result based on whether the mean similarity exceeds a predefined threshold (default is 0.7).
+
+    ### Signs of High Risk
+
+    - Mean cosine similarity below the threshold (default is 0.7).
+    - Large standard deviation of cosine similarities, indicating inconsistency.
+    - Minimum similarity score significantly lower than expected.
+    - Failure to pass the threshold test based on the mean similarity.
+
+    ### Strengths
+
+    - Provides a quantitative measure of embedding stability.
+    - Helps in identifying weaknesses in the model's ability to handle minor input variations.
+    - Visualization of similarity distributions aids in comprehensive analysis.
+    - Easy to interpret results with clear pass/fail criteria.
+
+    ### Limitations
+
+    - Relies on the chosen perturbation method, which may not cover all possible variations in real-world data.
+    - Thresholds for similarity might need adjustment based on specific application requirements.
+    - Cosine similarity, while useful, may not capture all aspects of semantic stability.
+    """

     required_inputs = ["model", "dataset"]
     default_params = {
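Note: the thresholded similarity check described in this new docstring can be sketched as follows. This is an illustrative stand-in for the base class logic, with the 0.7 default taken from the docstring above:

```python
import numpy as np

def stability_summary(original, perturbed, mean_similarity_threshold=0.7):
    """Row-wise cosine similarity between original and perturbed embeddings."""
    original = np.asarray(original, dtype=float)
    perturbed = np.asarray(perturbed, dtype=float)
    sims = np.sum(original * perturbed, axis=1) / (
        np.linalg.norm(original, axis=1) * np.linalg.norm(perturbed, axis=1)
    )
    stats = {
        "mean": sims.mean(),
        "min": sims.min(),
        "max": sims.max(),
        "median": np.median(sims),
        "std": sims.std(),
    }
    # The test passes when the mean similarity exceeds the threshold
    return stats, stats["mean"] > mean_similarity_threshold
```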
@@ -9,37 +9,38 @@ from .StabilityAnalysis import StabilityAnalysis

 class StabilityAnalysisKeyword(StabilityAnalysis):
     """
-    Evaluate robustness of embeddings models to keyword swaps on the test dataset
+    Evaluates robustness of embedding models to keyword swaps in the test dataset.

-    This tests expects a parameter `keyword_dict` that maps words to other words
-    so that any instances of the key words in the test dataset will be replaced
-    with the corresponding value.
+    ### Purpose

-    **Purpose:**
-    This test metric is used to evaluate the robustness of text embedding machine learning models to
-    keyword swaps. A keyword swap is a scenario where instances of certain specified keywords in the dataset are
-    replaced with other specified words (usually synonyms). The purpose of this metric is to ensure that these models
-    maintain performance stability even when the input data slightly deviates, imitating real-world variability.
+    This test metric is used to evaluate the robustness of text embedding machine learning models to keyword swaps. A
+    keyword swap is a scenario where instances of certain specified keywords in the dataset are replaced with other
+    specified words (usually synonyms). The purpose of this metric is to ensure that these models maintain performance
+    stability even when the input data slightly deviates, imitating real-world variability.

-    **Test Mechanism:**
-    The test mechanism involves perturbation of the dataset used in testing the model. Each
-    instance of a specific word found in the dataset is replaced with the corresponding word as specified in a
-    'keyword_dict' mapping. The model is then re-run with the perturbed dataset and the results are compared with the
-    non-perturbed dataset. This comparison quantifies the extent to which keyword swaps impact the model's performance.
+    ### Test Mechanism
+
+    The test mechanism involves a perturbation of the dataset used in testing the model. Each instance of a specific
+    word found in the dataset is replaced with the corresponding word as specified in a 'keyword_dict' mapping. The
+    model is then re-run with the perturbed dataset and the results are compared with the non-perturbed dataset. This
+    comparison quantifies the extent to which keyword swaps impact the model's performance.
+
+    ### Signs of High Risk

-    **Signs of High Risk:**
     - A significant drop in model performance after keyword swaps indicates a high risk of model failure in real-world
     scenarios.
     - The model results being heavily reliant on specific word choices instead of capturing the context properly.

-    **Strengths:**
+    ### Strengths
+
     - This test provides a way to measure model robustness to small changes in input data, which reinforces its
     applicability and reliability in real-world scenarios.
     - This test encourages a model to understand the context of a sentence rather than memorizing specific words.
     - It helps to detect overfitting - a situation where a model performs well on training data but poorly on new or
     slightly altered data.

-    **Limitations:**
+    ### Limitations
+
     - It may not fully address semantic differences that can be introduced through keyword swaps. That is, the
     replacement words might not preserve the exact semantic meaning of the original words.
     - It only tests for changes in keywords (word-level alterations) and might not expose model limitations related to
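Note: a `keyword_dict`-driven swap of the kind this docstring describes might look like the sketch below; whole-word regex replacement is an assumption, as the package may tokenize differently:

```python
import re

def keyword_swap(text, keyword_dict):
    """Replace whole-word occurrences of each key with its mapped value."""
    for word, replacement in keyword_dict.items():
        text = re.sub(rf"\b{re.escape(word)}\b", replacement, text)
    return text

# Example: swap two keywords for near-synonyms
keyword_swap("the loan application was denied", {"loan": "credit", "denied": "rejected"})
# -> "the credit application was rejected"
```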
@@ -61,54 +61,49 @@ def random_insertion(word_list):

 class StabilityAnalysisRandomNoise(StabilityAnalysis):
     """
-    Evaluate robustness of embeddings models to random noise introduced by using
-    a `probability` parameter to choose random locations in the text to apply
-    random perturbations. These perturbations include:
-
-    - Swapping two adjacent words
-    - Introducing a random typo in a word
-    - Deleting a random word
-    - Inserting a random word at a random position
-
-    **Purpose:**
-    The purpose of the stability analysis is to evaluate the robustness of a text embeddings model to random noise.
-    Random perturbations such as swapping adjacent words, introducing random typos, deleting random words, or inserting
-    random words at random positions in the text are introduced, to gauge the model's performance and stability.
-
-    **Test Mechanism:**
-    The test mechanism includes a series of function-defined random perturbations like swapping two words
-    `random_swap`, introducing a typo in a word `introduce_typo`, deleting a random word `random_deletion`, and
-    inserting a random word at a random position `random_insertion`. A probability parameter determines the likelihood
-    or frequency of these perturbations within the text data.
-
-    The `perturb_data` function initally tokenizes the string data based on spaces then applies selected random
-    perturbations based on the provided probability for each word in the text.
-
-    **Signs of High Risk:**
-    - High error rates in model predictions or classifications after the introduction of the random noise
-    - Greater sensitivity to certain types and degrees of noise such as typographical errors, insertion or deletion of
-    words
-    - Significant change in loss function or accuracy metric
-    - Inconsistency in model outputs for slightly perturbed inputs
-
-    **Strengths:**
-    - Measures model robustness against noise thereby reflecting real-world scenarios where data may contain errors or
+    Assesses the robustness of text embeddings models to random noise introduced via text perturbations.
+
+    ### Purpose
+
+    The purpose of this test is to evaluate the robustness of a text embeddings model to random noise. It introduces
+    perturbations such as swapping adjacent words, inserting typos, deleting words, or inserting random words within
+    the text to determine how well the model performs under such noisy conditions.
+
+    ### Test Mechanism
+
+    The test applies a series of pre-defined random perturbations to the text data. These perturbations include:
+
+    - Swapping two adjacent words using the `random_swap` function.
+    - Introducing a typo in a word using the `introduce_typo` function.
+    - Deleting a word using the `random_deletion` function.
+    - Inserting a random word at a random position using the `random_insertion` function.
+
+    A probability parameter dictates the likelihood of each perturbation being applied to the words in the text. The
+    text is initially tokenized into words, and selected perturbations are applied based on this probability.
+
+    ### Signs of High Risk
+
+    - High error rates in model predictions or classifications after the introduction of random noise.
+    - Greater sensitivity to specific types of noise, such as typographical errors or word deletions.
+    - Significant change in loss function or accuracy metrics.
+    - Inconsistent model outputs for slightly perturbed inputs.
+
+    ### Strengths
+
+    - Measures model robustness against noise, reflecting real-world scenarios where data may contain errors or
     inconsistencies.
-    - Easy to implement with adjustable perturbation severity through probability parameter.
-    - Enables identification of model sensitivity to certain noise types, providing grounding for model improvement.
+    - Easy to implement with adjustable perturbation severity through a probability parameter.
+    - Identifies model sensitivity to specific types of noise, offering insights for model improvement.
     - Useful for testing models designed to handle text data.

-    **Limitations:**
-    - The test might not be effective for models that have been designed with a high resistance to noise or models that
-    are inherently designed to handle such perturbations.
-    - Pseudo-randomness may not accurately represent real-world distribution of noise or typographical errors, which
-    could be biased towards certain types of errors or malware injections.
-    - Highly dependent on the probability parameter to introduce noise, with artificial adjusting required to achieve
-    an optimal balance.
-    - Only validates the model's performance against noise in input data, not its ability to capture complex language
-    structures or semantics.
-    - Does not guarantee the model's performance in new, unseen, real-world data beyond what is represented by the
-    noise-introduced test data.
+    ### Limitations
+
+    - May be ineffective for models that are inherently resistant to noise or designed to handle such perturbations.
+    - Pseudo-randomness may not accurately represent the real-world distribution of noise or typographical errors.
+    - Highly dependent on the probability parameter, requiring fine-tuning to achieve an optimal balance.
+    - Only assesses performance against noise in input data, not the ability to capture complex language structures or
+    semantics.
+    - Does not guarantee model performance on new, unseen, real-world data beyond the generated noisy test data.
     """

     name = "Text Embeddings Stability Analysis to Random Noise"
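Note: the docstring names four perturbation helpers (`random_swap`, `introduce_typo`, `random_deletion`, `random_insertion`) plus a `perturb_data` driver. A rough sketch of the idea, with bodies that are illustrative rather than copied from the package:

```python
import random
import string

def introduce_typo(word):
    """Replace one random character with a random lowercase letter."""
    if not word:
        return word
    i = random.randrange(len(word))
    return word[:i] + random.choice(string.ascii_lowercase) + word[i + 1 :]

def perturb_data(text, probability=0.05):
    """Tokenize on spaces; with the given probability, perturb each word."""
    words = text.split(" ")
    result = []
    for word in words:
        if random.random() >= probability:
            result.append(word)
            continue
        op = random.choice(["swap", "typo", "delete", "insert"])
        if op == "typo":
            result.append(introduce_typo(word))
        elif op == "delete":
            pass  # drop the word entirely
        elif op == "insert":
            result.extend([word, random.choice(words)])  # add a random word
        elif result:  # swap: move this word before the previous one
            result.insert(len(result) - 1, word)
        else:
            result.append(word)
    return " ".join(result)
```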
@@ -14,47 +14,45 @@ class StabilityAnalysisSynonyms(StabilityAnalysis):
     """
     Evaluates the stability of text embeddings models when words in test data are replaced by their synonyms randomly.

-    This test uses WordNet to find synonyms for words in the test dataset and
-    expects a parameter `probability` that determines the probability of swapping
-    a word with a synonym.
+    ### Purpose

-    **Purpose:**
     The Stability Analysis Synonyms test is designed to gauge the robustness and stability of an embeddings model on
     text-based data. The test does so by introducing random word changes through replacing words in the test dataset
     with their synonyms.

-    **Test Mechanism:**
+    ### Test Mechanism
+
     This test utilizes WordNet to find synonyms for a given word present in the test data, replacing the original word
     with this synonym based on a given probability. The probability is defined as a parameter and determines the
     likelihood of swapping a word with its synonym. By default, this is set at 0.02 but can be adjusted based on
     specific test requirements. This methodology enables an evaluation of how such replacements can affect the model's
     performance.

-    **Signs of High Risk:**
+    ### Signs of High Risk

     - The model's performance or predictions change significantly after swapping words with their synonyms.
     - The model shows high sensitivity to small perturbations, like modifying the data with synonyms.
-    - The embeddings model fails to identify similar meanings between the original words and their synonyms, which
-    means it lacks semantic understanding.
+    - The embeddings model fails to identify similar meanings between the original words and their synonyms, indicating
+    it lacks semantic understanding.

-    **Strengths:**
+    ### Strengths

     - The test is flexible in its application. The 'probability' parameter can be adjusted based on the degree of
     synonym swapping required.
     - Efficient in gauging a model's sensitivity or robustness with respect to small changes in input data.
-    - The test can provide insights into the semantic understanding of the model as it monitors the impact of swapping
-    words with synonyms.
+    - Provides insights into the semantic understanding of the model as it monitors the impact of swapping words with
+    synonyms.

-    **Limitations:**
+    ### Limitations

     - The ability to perturb data is reliant on the availability of synonyms, limiting its efficiency.
-    - The test assumes that the synonyms provided by WordNet are accurate and interchangeable in all contexts, which
-    may not always be the case given the intricacies of language and context-specific meanings.
-    - This test does not consider the influence of multi-word expressions or phrases, as synonyms are considered at the
-    word level only.
-    - Relies solely on the WordNet corpus for synonyms, which would limit its effectiveness for specialized or
-    domain-specific jargon not included in that corpus.
-    - It does not consider the semantic role of the words in the sentence, so the swapped synonym could potentially
+    - It assumes that the synonyms provided by WordNet are accurate and interchangeable in all contexts, which may not
+    always be the case given the intricacies of language and context-specific meanings.
+    - It does not consider the influence of multi-word expressions or phrases, as synonyms are considered at the word
+    level only.
+    - Relies solely on the WordNet corpus for synonyms, limiting its effectiveness for specialized or domain-specific
+    jargon not included in that corpus.
+    - Does not consider the semantic role of the words in the sentence, meaning the swapped synonym could potentially
     alter the overall meaning of the sentence, leading to a false perception of the model's stability.
     """

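Note: a WordNet-based synonym swap matching the described mechanism (default probability 0.02 per the docstring) could be sketched as follows; the lemma-selection strategy is an assumption:

```python
import random

from nltk.corpus import wordnet  # requires nltk.download("wordnet") beforehand

def synonym_swap(text, probability=0.02):
    """With the given probability, replace each word with a WordNet synonym."""
    out = []
    for word in text.split():
        synsets = wordnet.synsets(word)
        if synsets and random.random() < probability:
            # Crudely take the first lemma of a random synset as the synonym
            lemma = random.choice(synsets).lemmas()[0].name()
            out.append(lemma.replace("_", " "))
        else:
            out.append(word)
    return " ".join(out)
```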
@@ -13,15 +13,19 @@ logger = get_logger(__name__)

 class StabilityAnalysisTranslation(StabilityAnalysis):
     """
-    Evaluate robustness of embeddings models to noise introduced by translating
-    the original text to another language and back.
+    Evaluates robustness of text embeddings models to noise introduced by translating the original text to another
+    language and back.

-    **Purpose:** The purpose of this test is to assess the robustness of text embeddings models under the influence of
-    noise. The noise in this scenario is introduced by translating the original text into another language and then
-    translating it back to the original language. Any significant changes in the model's output between the original
-    and translated-then-retranslated texts can be indicators of the model's lack of robustness to noise.
+    ### Purpose

-    **Test Mechanism:** The test mechanism involves several steps:
+    The purpose of this test is to assess the robustness of text embeddings models under the influence of noise. The
+    noise in this scenario is introduced by translating the original text into another language and then translating it
+    back to the original language. Any significant changes in the model's output between the original and
+    translated-then-retranslated texts can be indicators of the model's lack of robustness to noise.
+
+    ### Test Mechanism
+
+    The test mechanism involves several steps:

     1. Initialize the Marian tokenizer and model for both source and target languages.
     2. Translate the data from the source language to the target language.
@@ -32,29 +36,29 @@ class StabilityAnalysisTranslation(StabilityAnalysis):
     The threshold of this test output would then be determined by the tolerance level of the model to these potentially
     noisy instances.

-    **Signs of High Risk:**
+    ### Signs of High Risk

-    - If there large discrepancies between the original and double-translated text, this could indicate a high level of
-    risk, signifying that the model is not robust to noise.
-    - If a translation between languages does not closely maintain the meaning and context of the original language, it
-    may suggest inadequate robustness against this type of noise.
+    - Large discrepancies between the original and double-translated text, indicating a high level of risk and a lack
+    of robustness to noise.
+    - Translations that do not closely maintain the meaning and context of the original language, suggesting inadequate
+    robustness against this type of noise.

-    **Strengths:**
+    ### Strengths

-    - This metric is an effective way to assess the model’s sensitivity and robustness to language translation noise.
-    - The use of translation as a means to introduce noise provides a realistic scenario which the model might
-    encounter in real-world situations.
-    - This metric extends beyond simple lexical changes, testing the model’s capacity to maintain semantic meaning
-    under translational perturbations.
+    - An effective way to assess the model’s sensitivity and robustness to language translation noise.
+    - Provides a realistic scenario which the model might encounter in real-world applications by using translation to
+    introduce noise.
+    - Tests the model’s capacity to maintain semantic meaning under translational perturbations, extending beyond
+    simple lexical changes.

-    **Limitations:**
+    ### Limitations

-    - Relying solely on translation-related noise for robustness testing can overlook other types of noise not
-    reflected in language translation, such as typographical errors, grammatical mistakes, or random word substitutions.
-    - Potential inaccuracies or discrepancies in the translation process itself might influence the resultant
-    robustness score, rather than reflect an inherent failing of the model being tested.
-    - The test is predominantly language-dependent, hence it might not fully capture the robustness of the model for
-    languages with fewer resources or languages that are highly dissimilar to the source language.
+    - Relies solely on translation-related noise, potentially overlooking other types of noise such as typographical
+    errors, grammatical mistakes, or random word substitutions.
+    - Inaccuracies or discrepancies in the translation process itself might influence the resultant robustness score
+    rather than reflect an inherent failing of the model.
+    - Predominantly language-dependent, thus might not fully capture robustness for languages with fewer resources or
+    those highly dissimilar to the source language.
     """

     name = "Text Embeddings Stability Analysis to Translation"
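Note: the round-trip translation step outlined in the docstring can be sketched with Hugging Face Transformers; the Helsinki-NLP model names and batching details below are assumptions, not the package's exact code:

```python
from transformers import MarianMTModel, MarianTokenizer

def round_trip_translate(texts, source_lang="en", target_lang="fr"):
    """Translate texts source -> target -> source to inject translation noise."""

    def translate(batch, model_name):
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        tokens = model.generate(**inputs)
        return [tokenizer.decode(t, skip_special_tokens=True) for t in tokens]

    forward = translate(texts, f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}")
    return translate(forward, f"Helsinki-NLP/opus-mt-{target_lang}-{source_lang}")
```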
@@ -23,35 +23,45 @@ def TSNEComponentsPairwisePlots(
     title="t-SNE",
 ):
     """
-    Plots individual scatter plots for pairwise combinations of t-SNE components of embeddings.
-
-    **Purpose:**
-    This function creates scatter plots for each pairwise combination of t-SNE components derived from model embeddings.
-    t-SNE (t-Distributed Stochastic Neighbor Embedding) is a machine learning algorithm for dimensionality reduction that
-    is particularly well-suited for the visualization of high-dimensional datasets.
-
-    **Test Mechanism:**
-    The function begins by extracting embeddings from the provided dataset using the specified model.
-    These embeddings are then standardized to ensure that each dimension contributes equally to the distance computation.
-    Following this, the t-SNE algorithm is applied to reduce the dimensionality of the data, with the number of components
-    specified by the user. The results are plotted using Plotly, creating scatter plots for each unique pair of components
-    if more than one component is specified.
-
-    **Signs of High Risk:**
-    - If the scatter plots show overlapping clusters or indistinct groupings, it might suggest that the
-    t-SNE parameters (such as perplexity) are not optimally set for the given data, or the data itself does not exhibit clear, separable clusters.
-    - Similar plots across different pairs of components could indicate redundancy in the components generated by t-SNE,
-    suggesting that fewer dimensions might be sufficient to represent the data's structure.
-
-    **Strengths:**
-    - Provides a visual exploration tool for high-dimensional data, simplifying the detection of patterns and clusters which are not apparent in higher dimensions.
-    - Interactive plots generated by Plotly enhance user engagement and allow for a deeper dive into specific areas of the plot, aiding in detailed data analysis.
-
-    **Limitations:**
-    - The effectiveness of t-SNE is highly dependent on the choice of parameters like perplexity and the number of components,
-    which might require tuning and experimentation for optimal results.
+    Creates scatter plots for pairwise combinations of t-SNE components to visualize embeddings and highlight potential
+    clustering structures.
+
+    ### Purpose
+
+    This function creates scatter plots for each pairwise combination of t-SNE components derived from model
+    embeddings. t-SNE (t-Distributed Stochastic Neighbor Embedding) is a machine learning algorithm for dimensionality
+    reduction that is particularly well-suited for the visualization of high-dimensional datasets.
+
+    ### Test Mechanism
+
+    The function begins by extracting embeddings from the provided dataset using the specified model. These embeddings
+    are then standardized to ensure that each dimension contributes equally to the distance computation. Following
+    this, the t-SNE algorithm is applied to reduce the dimensionality of the data, with the number of components
+    specified by the user. The results are plotted using Plotly, creating scatter plots for each unique pair of
+    components if more than one component is specified.
+
+    ### Signs of High Risk
+
+    - If the scatter plots show overlapping clusters or indistinct groupings, it might suggest that the t-SNE
+    parameters (such as perplexity) are not optimally set for the given data, or the data itself does not exhibit
+    clear, separable clusters.
+    - Similar plots across different pairs of components could indicate redundancy in the components generated by
+    t-SNE, suggesting that fewer dimensions might be sufficient to represent the data's structure.
+
+    ### Strengths
+
+    - Provides a visual exploration tool for high-dimensional data, simplifying the detection of patterns and clusters
+    which are not apparent in higher dimensions.
+    - Interactive plots generated by Plotly enhance user engagement and allow for a deeper dive into specific areas of
+    the plot, aiding in detailed data analysis.
+
+    ### Limitations
+
+    - The effectiveness of t-SNE is highly dependent on the choice of parameters like perplexity and the number of
+    components, which might require tuning and experimentation for optimal results.
     - t-SNE visualizations can be misleading if interpreted without considering the stochastic nature of the algorithm;
-    two runs with the same parameters might yield different visual outputs, necessitating multiple runs for a consistent interpretation.
+    two runs with the same parameters might yield different visual outputs, necessitating multiple runs for a
+    consistent interpretation.
     """
     # Get embeddings from the dataset using the model
     embeddings = np.stack(dataset.y_pred(model))
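Note: as with the PCA test above, the t-SNE mechanism reads roughly as follows in code; the perplexity value and `random_state` are illustrative defaults, not the package's:

```python
import itertools

import numpy as np
import plotly.express as px
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Placeholder for the embeddings extracted via dataset.y_pred(model)
embeddings = np.random.rand(200, 64)

# Standardize so every dimension contributes equally to distances
scaled = StandardScaler().fit_transform(embeddings)

n_components = 3
# Fix random_state for reproducibility, per the stochasticity noted in the
# limitations, and tune perplexity to the dataset size.
components = TSNE(
    n_components=n_components, perplexity=30, random_state=0
).fit_transform(scaled)

# One scatter plot per unique pair of t-SNE components
figures = [
    px.scatter(
        x=components[:, i],
        y=components[:, j],
        labels={"x": f"Component {i + 1}", "y": f"Component {j + 1}"},
        title=f"t-SNE: Component {i + 1} vs Component {j + 1}",
    )
    for i, j in itertools.combinations(range(n_components), 2)
]
```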
@@ -49,6 +49,7 @@ def AnswerCorrectness(
     ### Configuring Columns

     This metric requires specific columns to be present in the dataset:
+
     - `question` (str): The text prompt or query that was input into the model.
     - `answer` (str): The text response generated by the model.
     - `ground_truth` (str): The ground truth answer that the generated answer is compared
@@ -116,9 +117,9 @@ def AnswerCorrectness(

     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-                ["question", "answer", "ground_truth", "answer_correctness"]
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["question", "answer", "ground_truth", "answer_correctness"]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["answer_correctness"].mean(),
@@ -126,7 +127,7 @@
                     "Max Score": result_df["answer_correctness"].max(),
                     "Min Score": result_df["answer_correctness"].min(),
                     "Standard Deviation": result_df["answer_correctness"].std(),
-                    "Count": len(result_df),
+                    "Count": result_df.shape[0],
                 }
             ],
         },
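Note: the same two changes recur in AnswerRelevance and AnswerSimilarity below: the per-row scores table is commented out so it is no longer uploaded, and the row count switches from `len(result_df)` to the equivalent `result_df.shape[0]`. For illustration, the surviving aggregate block computes summary statistics like this (sample scores are made up):

```python
import pandas as pd

result_df = pd.DataFrame({"answer_correctness": [0.91, 0.78, 0.85]})

aggregate = {
    "Mean Score": result_df["answer_correctness"].mean(),
    "Max Score": result_df["answer_correctness"].max(),
    "Min Score": result_df["answer_correctness"].min(),
    "Standard Deviation": result_df["answer_correctness"].std(),
    "Count": result_df.shape[0],  # same value as len(result_df)
}
```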
@@ -53,6 +53,7 @@ def AnswerRelevance(
     ### Configuring Columns

     This metric requires the following columns in your dataset:
+
     - `question` (str): The text query that was input into the model.
     - `contexts` (List[str]): Any contextual information retrieved by the model before
     generating an answer.
@@ -120,9 +121,9 @@ def AnswerRelevance(

     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-                ["question", "contexts", "answer", "answer_relevancy"]
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["question", "contexts", "answer", "answer_relevancy"]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["answer_relevancy"].mean(),
@@ -130,7 +131,7 @@
                     "Max Score": result_df["answer_relevancy"].max(),
                     "Min Score": result_df["answer_relevancy"].min(),
                     "Standard Deviation": result_df["answer_relevancy"].std(),
-                    "Count": len(result_df),
+                    "Count": result_df.shape[0],
                 }
             ],
         },
@@ -42,6 +42,7 @@ def AnswerSimilarity(
     ### Configuring Columns

     This metric requires the following columns in your dataset:
+
     - `answer` (str): The text response generated by the model.
     - `ground_truth` (str): The ground truth answer that the generated answer is compared
     against.
@@ -105,9 +106,9 @@ def AnswerSimilarity(

     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-                ["answer", "ground_truth", "answer_similarity"]
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["answer", "ground_truth", "answer_similarity"]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["answer_similarity"].mean(),
@@ -115,7 +116,7 @@
                     "Max Score": result_df["answer_similarity"].max(),
                     "Min Score": result_df["answer_similarity"].min(),
                     "Standard Deviation": result_df["answer_similarity"].std(),
-                    "Count": len(result_df),
+                    "Count": result_df.shape[0],
                 }
             ],
         },