validmind 2.5.8__py3-none-any.whl → 2.5.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +26 -7
- validmind/api_client.py +89 -43
- validmind/client.py +2 -2
- validmind/client_config.py +11 -14
- validmind/datasets/regression/fred_timeseries.py +67 -138
- validmind/template.py +1 -0
- validmind/test_suites/__init__.py +0 -2
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/test_suites/summarization.py +0 -1
- validmind/test_suites/time_series.py +0 -43
- validmind/tests/__types__.py +3 -13
- validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
- validmind/tests/data_validation/ADF.py +31 -24
- validmind/tests/data_validation/AutoAR.py +9 -9
- validmind/tests/data_validation/AutoMA.py +23 -16
- validmind/tests/data_validation/AutoSeasonality.py +18 -16
- validmind/tests/data_validation/AutoStationarity.py +21 -16
- validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +82 -124
- validmind/tests/data_validation/ClassImbalance.py +15 -12
- validmind/tests/data_validation/DFGLSArch.py +19 -13
- validmind/tests/data_validation/DatasetDescription.py +17 -11
- validmind/tests/data_validation/DatasetSplit.py +7 -5
- validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
- validmind/tests/data_validation/Duplicates.py +33 -25
- validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
- validmind/tests/data_validation/HighCardinality.py +19 -12
- validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
- validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
- validmind/tests/data_validation/IQROutliersTable.py +40 -36
- validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
- validmind/tests/data_validation/KPSS.py +34 -29
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
- validmind/tests/data_validation/MissingValues.py +32 -27
- validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
- validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
- validmind/tests/data_validation/RollingStatsPlot.py +31 -23
- validmind/tests/data_validation/ScatterPlot.py +63 -78
- validmind/tests/data_validation/SeasonalDecompose.py +38 -34
- validmind/tests/data_validation/Skewness.py +35 -37
- validmind/tests/data_validation/SpreadPlot.py +35 -35
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
- validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
- validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
- validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
- validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
- validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
- validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
- validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
- validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
- validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
- validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
- validmind/tests/data_validation/TooManyZeroValues.py +16 -11
- validmind/tests/data_validation/UniqueRows.py +11 -6
- validmind/tests/data_validation/WOEBinPlots.py +23 -16
- validmind/tests/data_validation/WOEBinTable.py +35 -30
- validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
- validmind/tests/data_validation/nlp/CommonWords.py +21 -14
- validmind/tests/data_validation/nlp/Hashtags.py +27 -20
- validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
- validmind/tests/data_validation/nlp/Mentions.py +21 -15
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
- validmind/tests/data_validation/nlp/Punctuations.py +24 -20
- validmind/tests/data_validation/nlp/Sentiment.py +27 -8
- validmind/tests/data_validation/nlp/StopWords.py +26 -19
- validmind/tests/data_validation/nlp/TextDescription.py +36 -35
- validmind/tests/data_validation/nlp/Toxicity.py +32 -9
- validmind/tests/decorator.py +81 -42
- validmind/tests/model_validation/BertScore.py +36 -27
- validmind/tests/model_validation/BleuScore.py +25 -19
- validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
- validmind/tests/model_validation/ContextualRecall.py +35 -13
- validmind/tests/model_validation/FeaturesAUC.py +32 -13
- validmind/tests/model_validation/MeteorScore.py +46 -33
- validmind/tests/model_validation/ModelMetadata.py +32 -64
- validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
- validmind/tests/model_validation/RegardScore.py +30 -14
- validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
- validmind/tests/model_validation/RougeScore.py +36 -30
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
- validmind/tests/model_validation/TokenDisparity.py +31 -23
- validmind/tests/model_validation/ToxicityScore.py +26 -17
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +7 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
- validmind/tests/model_validation/ragas/utils.py +6 -0
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
- validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
- validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +31 -25
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
- validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
- validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
- validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
- validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -93
- validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -5
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
- validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
- validmind/tests/model_validation/statsmodels/BoxPierce.py +14 -10
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +19 -12
- validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
- validmind/tests/model_validation/statsmodels/JarqueBera.py +27 -22
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
- validmind/tests/model_validation/statsmodels/LJungBox.py +32 -28
- validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +87 -119
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
- validmind/tests/model_validation/statsmodels/RunsTest.py +32 -28
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +15 -8
- validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +16 -14
- validmind/tests/prompt_validation/Conciseness.py +7 -5
- validmind/tests/prompt_validation/Delimitation.py +23 -22
- validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
- validmind/tests/prompt_validation/Robustness.py +12 -10
- validmind/tests/prompt_validation/Specificity.py +13 -11
- validmind/tests/prompt_validation/ai_powered_test.py +6 -0
- validmind/tests/run.py +68 -23
- validmind/unit_metrics/__init__.py +81 -144
- validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
- validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
- validmind/unit_metrics/regression/HuberLoss.py +1 -1
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
- validmind/unit_metrics/regression/QuantileLoss.py +1 -1
- validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
- validmind/vm_models/dataset/dataset.py +2 -0
- validmind/vm_models/figure.py +5 -0
- validmind/vm_models/test/result_wrapper.py +93 -132
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/METADATA +1 -1
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/RECORD +203 -210
- validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
- validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
- validmind/tests/data_validation/BivariateHistograms.py +0 -117
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
- validmind/tests/data_validation/MissingValuesRisk.py +0 -88
- validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
- validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/LICENSE +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/WHEEL +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/entry_points.txt +0 -0
validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py

@@ -61,54 +61,49 @@ def random_insertion(word_list):
 
 class StabilityAnalysisRandomNoise(StabilityAnalysis):
     """
-    **Strengths:**
-    - Measures model robustness against noise thereby reflecting real-world scenarios where data may contain errors or
+    Assesses the robustness of text embeddings models to random noise introduced via text perturbations.
+
+    ### Purpose
+
+    The purpose of this test is to evaluate the robustness of a text embeddings model to random noise. It introduces
+    perturbations such as swapping adjacent words, inserting typos, deleting words, or inserting random words within
+    the text to determine how well the model performs under such noisy conditions.
+
+    ### Test Mechanism
+
+    The test applies a series of pre-defined random perturbations to the text data. These perturbations include:
+
+    - Swapping two adjacent words using the `random_swap` function.
+    - Introducing a typo in a word using the `introduce_typo` function.
+    - Deleting a word using the `random_deletion` function.
+    - Inserting a random word at a random position using the `random_insertion` function.
+
+    A probability parameter dictates the likelihood of each perturbation being applied to the words in the text. The
+    text is initially tokenized into words, and selected perturbations are applied based on this probability.
+
+    ### Signs of High Risk
+
+    - High error rates in model predictions or classifications after the introduction of random noise.
+    - Greater sensitivity to specific types of noise, such as typographical errors or word deletions.
+    - Significant change in loss function or accuracy metrics.
+    - Inconsistent model outputs for slightly perturbed inputs.
+
+    ### Strengths
+
+    - Measures model robustness against noise, reflecting real-world scenarios where data may contain errors or
     inconsistencies.
-    - Easy to implement with adjustable perturbation severity through probability parameter.
+    - Easy to implement with adjustable perturbation severity through a probability parameter.
+    - Identifies model sensitivity to specific types of noise, offering insights for model improvement.
     - Useful for testing models designed to handle text data.
 
-    are inherently designed to handle such perturbations.
-    - Pseudo-randomness may not accurately represent real-world distribution of noise or typographical errors
-    structures or semantics.
-    - Does not guarantee the model's performance in new, unseen, real-world data beyond what is represented by the
-    noise-introduced test data.
+    ### Limitations
+
+    - May be ineffective for models that are inherently resistant to noise or designed to handle such perturbations.
+    - Pseudo-randomness may not accurately represent the real-world distribution of noise or typographical errors.
+    - Highly dependent on the probability parameter, requiring fine-tuning to achieve an optimal balance.
+    - Only assesses performance against noise in input data, not the ability to capture complex language structures or
+    semantics.
+    - Does not guarantee model performance on new, unseen, real-world data beyond the generated noisy test data.
     """
 
     name = "Text Embeddings Stability Analysis to Random Noise"
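The StabilityAnalysisRandomNoise docstring above names four perturbation helpers (`random_swap`, `introduce_typo`, `random_deletion`, `random_insertion`) driven by a single probability parameter. As a rough, hedged sketch of that idea (this is not the package's implementation; the perturbation bodies below are illustrative assumptions):

```python
import random
import string


def perturb_text(text: str, probability: float = 0.02) -> str:
    """Apply swap/typo/delete/insert perturbations to each word with the given probability."""
    words = text.split()  # naive whitespace tokenization
    out, i = [], 0
    while i < len(words):
        word = words[i]
        if random.random() < probability:
            action = random.choice(["swap", "typo", "delete", "insert"])
            if action == "swap" and i + 1 < len(words):
                out.extend([words[i + 1], word])  # swap this word with its right neighbour
                i += 2
                continue
            if action == "delete":
                i += 1  # drop the word entirely
                continue
            if action == "typo" and word:
                pos = random.randrange(len(word))  # replace one character with a random letter
                word = word[:pos] + random.choice(string.ascii_lowercase) + word[pos + 1:]
            elif action == "insert":
                out.append(random.choice(words))  # insert a random word from the text
        out.append(word)
        i += 1
    return " ".join(out)


print(perturb_text("the quick brown fox jumps over the lazy dog", probability=0.3))
```

Raising the probability increases perturbation severity, which is the knob the limitations section above says needs tuning.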
validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py

@@ -14,47 +14,45 @@ class StabilityAnalysisSynonyms(StabilityAnalysis):
     """
     Evaluates the stability of text embeddings models when words in test data are replaced by their synonyms randomly.
 
-    expects a parameter `probability` that determines the probability of swapping
-    a word with a synonym.
+    ### Purpose
 
-    **Purpose:**
     The Stability Analysis Synonyms test is designed to gauge the robustness and stability of an embeddings model on
     text-based data. The test does so by introducing random word changes through replacing words in the test dataset
     with their synonyms.
 
+    ### Test Mechanism
+
     This test utilizes WordNet to find synonyms for a given word present in the test data, replacing the original word
     with this synonym based on a given probability. The probability is defined as a parameter and determines the
     likelihood of swapping a word with its synonym. By default, this is set at 0.02 but can be adjusted based on
     specific test requirements. This methodology enables an evaluation of how such replacements can affect the model's
     performance.
 
+    ### Signs of High Risk
 
     - The model's performance or predictions change significantly after swapping words with their synonyms.
     - The model shows high sensitivity to small perturbations, like modifying the data with synonyms.
-    - The embeddings model fails to identify similar meanings between the original words and their synonyms,
+    - The embeddings model fails to identify similar meanings between the original words and their synonyms, indicating
+    it lacks semantic understanding.
 
+    ### Strengths
 
     - The test is flexible in its application. The 'probability' parameter can be adjusted based on the degree of
     synonym swapping required.
     - Efficient in gauging a model's sensitivity or robustness with respect to small changes in input data.
+    - Provides insights into the semantic understanding of the model as it monitors the impact of swapping words with
+    synonyms.
 
+    ### Limitations
 
     - The ability to perturb data is reliant on the availability of synonyms, limiting its efficiency.
-    - Relies solely on the WordNet corpus for synonyms,
+    - It assumes that the synonyms provided by WordNet are accurate and interchangeable in all contexts, which may not
+    always be the case given the intricacies of language and context-specific meanings.
+    - It does not consider the influence of multi-word expressions or phrases, as synonyms are considered at the word
+    level only.
+    - Relies solely on the WordNet corpus for synonyms, limiting its effectiveness for specialized or domain-specific
+    jargon not included in that corpus.
+    - Does not consider the semantic role of the words in the sentence, meaning the swapped synonym could potentially
     alter the overall meaning of the sentence, leading to a false perception of the model's stability.
     """
 
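The synonym test's mechanism section describes WordNet-based replacement at a configurable probability (default 0.02). A minimal, self-contained sketch of that idea, assuming NLTK's WordNet corpus is available (again, not the packaged code):

```python
import random

import nltk
from nltk.corpus import wordnet

nltk.download("wordnet", quiet=True)  # WordNet corpus is required for synonym lookup


def swap_synonyms(text: str, probability: float = 0.02) -> str:
    """Replace words with a randomly chosen WordNet synonym at the given probability."""
    words = text.split()
    for i, word in enumerate(words):
        if random.random() >= probability:
            continue
        # collect lemma names across all synsets of the word, excluding the word itself
        synonyms = {
            lemma.name().replace("_", " ")
            for synset in wordnet.synsets(word)
            for lemma in synset.lemmas()
            if lemma.name().lower() != word.lower()
        }
        if synonyms:
            words[i] = random.choice(sorted(synonyms))
    return " ".join(words)


print(swap_synonyms("the model produced an accurate answer", probability=0.5))
```

This sketch inherits the limitations listed above: WordNet coverage, single-word substitution only, and no awareness of the word's role in the sentence.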
validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py

@@ -13,15 +13,19 @@ logger = get_logger(__name__)
 
 class StabilityAnalysisTranslation(StabilityAnalysis):
     """
+    Evaluates robustness of text embeddings models to noise introduced by translating the original text to another
+    language and back.
 
-    noise. The noise in this scenario is introduced by translating the original text into another language and then
-    translating it back to the original language. Any significant changes in the model's output between the original
-    and translated-then-retranslated texts can be indicators of the model's lack of robustness to noise.
+    ### Purpose
 
+    The purpose of this test is to assess the robustness of text embeddings models under the influence of noise. The
+    noise in this scenario is introduced by translating the original text into another language and then translating it
+    back to the original language. Any significant changes in the model's output between the original and
+    translated-then-retranslated texts can be indicators of the model's lack of robustness to noise.
+
+    ### Test Mechanism
+
+    The test mechanism involves several steps:
 
     1. Initialize the Marian tokenizer and model for both source and target languages.
     2. Translate the data from the source language to the target language.

@@ -32,29 +36,29 @@ class StabilityAnalysisTranslation(StabilityAnalysis):
     The threshold of this test output would then be determined by the tolerance level of the model to these potentially
     noisy instances.
 
+    ### Signs of High Risk
 
+    - Large discrepancies between the original and double-translated text, indicating a high level of risk and a lack
+    of robustness to noise.
+    - Translations that do not closely maintain the meaning and context of the original language, suggesting inadequate
+    robustness against this type of noise.
 
+    ### Strengths
 
+    - An effective way to assess the model’s sensitivity and robustness to language translation noise.
+    - Provides a realistic scenario which the model might encounter in real-world applications by using translation to
+    introduce noise.
+    - Tests the model’s capacity to maintain semantic meaning under translational perturbations, extending beyond
+    simple lexical changes.
 
+    ### Limitations
 
+    - Relies solely on translation-related noise, potentially overlooking other types of noise such as typographical
+    errors, grammatical mistakes, or random word substitutions.
+    - Inaccuracies or discrepancies in the translation process itself might influence the resultant robustness score
+    rather than reflect an inherent failing of the model.
+    - Predominantly language-dependent, thus might not fully capture robustness for languages with fewer resources or
+    those highly dissimilar to the source language.
     """
 
     name = "Text Embeddings Stability Analysis to Translation"
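The translation test's mechanism is a round trip through MarianMT models. A hedged sketch of such a round trip using Hugging Face `transformers` (the en→fr/fr→en checkpoints here are example choices, not necessarily the ones the test uses):

```python
from transformers import MarianMTModel, MarianTokenizer


def translate(texts, checkpoint):
    """Translate a batch of strings with a MarianMT checkpoint."""
    tokenizer = MarianTokenizer.from_pretrained(checkpoint)
    model = MarianMTModel.from_pretrained(checkpoint)
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(**inputs)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


def round_trip(texts, src_to_tgt="Helsinki-NLP/opus-mt-en-fr", tgt_to_src="Helsinki-NLP/opus-mt-fr-en"):
    """Source -> target -> source translation, used as a noise source for stability testing."""
    return translate(translate(texts, src_to_tgt), tgt_to_src)


print(round_trip(["The model is robust to small perturbations."]))
```

Embeddings of the original and round-tripped texts can then be compared; large drift corresponds to the high-risk signs listed above.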
validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py

@@ -23,35 +23,45 @@ def TSNEComponentsPairwisePlots(
     title="t-SNE",
 ):
     """
+    Creates scatter plots for pairwise combinations of t-SNE components to visualize embeddings and highlight potential
+    clustering structures.
+
+    ### Purpose
+
+    This function creates scatter plots for each pairwise combination of t-SNE components derived from model
+    embeddings. t-SNE (t-Distributed Stochastic Neighbor Embedding) is a machine learning algorithm for dimensionality
+    reduction that is particularly well-suited for the visualization of high-dimensional datasets.
+
+    ### Test Mechanism
+
+    The function begins by extracting embeddings from the provided dataset using the specified model. These embeddings
+    are then standardized to ensure that each dimension contributes equally to the distance computation. Following
+    this, the t-SNE algorithm is applied to reduce the dimensionality of the data, with the number of components
+    specified by the user. The results are plotted using Plotly, creating scatter plots for each unique pair of
+    components if more than one component is specified.
+
+    ### Signs of High Risk
+
+    - If the scatter plots show overlapping clusters or indistinct groupings, it might suggest that the t-SNE
+    parameters (such as perplexity) are not optimally set for the given data, or the data itself does not exhibit
+    clear, separable clusters.
+    - Similar plots across different pairs of components could indicate redundancy in the components generated by
+    t-SNE, suggesting that fewer dimensions might be sufficient to represent the data's structure.
+
+    ### Strengths
+
+    - Provides a visual exploration tool for high-dimensional data, simplifying the detection of patterns and clusters
+    which are not apparent in higher dimensions.
+    - Interactive plots generated by Plotly enhance user engagement and allow for a deeper dive into specific areas of
+    the plot, aiding in detailed data analysis.
+
+    ### Limitations
+
+    - The effectiveness of t-SNE is highly dependent on the choice of parameters like perplexity and the number of
+    components, which might require tuning and experimentation for optimal results.
     - t-SNE visualizations can be misleading if interpreted without considering the stochastic nature of the algorithm;
-    two runs with the same parameters might yield different visual outputs, necessitating multiple runs for a
+    two runs with the same parameters might yield different visual outputs, necessitating multiple runs for a
+    consistent interpretation.
     """
     # Get embeddings from the dataset using the model
     embeddings = np.stack(dataset.y_pred(model))
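The TSNEComponentsPairwisePlots docstring describes standardizing embeddings, reducing them with t-SNE, and plotting every pair of components with Plotly. A compact sketch of that pipeline with scikit-learn and Plotly Express (parameter values are illustrative):

```python
from itertools import combinations

import numpy as np
import plotly.express as px
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler


def tsne_pairwise_figures(embeddings: np.ndarray, n_components: int = 3, perplexity: float = 30.0):
    """Standardize embeddings, reduce with t-SNE, and build one scatter plot per component pair."""
    scaled = StandardScaler().fit_transform(embeddings)
    reduced = TSNE(n_components=n_components, perplexity=perplexity, random_state=0).fit_transform(scaled)

    figures = []
    for i, j in combinations(range(n_components), 2):
        fig = px.scatter(
            x=reduced[:, i],
            y=reduced[:, j],
            labels={"x": f"Component {i + 1}", "y": f"Component {j + 1}"},
            title=f"t-SNE: Component {i + 1} vs Component {j + 1}",
        )
        figures.append(fig)
    return figures


figures = tsne_pairwise_figures(np.random.rand(200, 64))
```

Because t-SNE is stochastic, fixing `random_state` (or comparing several runs) matters for the reproducibility caveat noted in the limitations.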
validmind/tests/model_validation/ragas/AnswerCorrectness.py

@@ -49,6 +49,7 @@ def AnswerCorrectness(
     ### Configuring Columns
 
     This metric requires specific columns to be present in the dataset:
+
     - `question` (str): The text prompt or query that was input into the model.
     - `answer` (str): The text response generated by the model.
     - `ground_truth` (str): The ground truth answer that the generated answer is compared

@@ -116,9 +117,9 @@ def AnswerCorrectness(
 
     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["question", "answer", "ground_truth", "answer_correctness"]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["answer_correctness"].mean(),

@@ -126,7 +127,7 @@ def AnswerCorrectness(
                     "Max Score": result_df["answer_correctness"].max(),
                     "Min Score": result_df["answer_correctness"].min(),
                     "Standard Deviation": result_df["answer_correctness"].std(),
-                    "Count":
+                    "Count": result_df.shape[0],
                 }
             ],
         },
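The remaining ragas hunks below make the same two changes shown here for AnswerCorrectness: the raw per-row scores table is commented out of the returned payload, and the previously empty "Count" entry is filled with `result_df.shape[0]`. A toy sketch of the aggregate-dictionary pattern (the column name is illustrative):

```python
import pandas as pd


def aggregate_scores(result_df: pd.DataFrame, column: str) -> dict:
    """Summary statistics for one score column, mirroring the shape of the returned payload."""
    scores = result_df[column]
    return {
        "Mean Score": scores.mean(),
        "Max Score": scores.max(),
        "Min Score": scores.min(),
        "Standard Deviation": scores.std(),
        "Count": result_df.shape[0],  # number of evaluated rows; previously left unset
    }


df = pd.DataFrame({"answer_correctness": [0.9, 0.7, 0.8]})
print(aggregate_scores(df, "answer_correctness"))
```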
validmind/tests/model_validation/ragas/AnswerRelevance.py

@@ -53,6 +53,7 @@ def AnswerRelevance(
     ### Configuring Columns
 
     This metric requires the following columns in your dataset:
+
     - `question` (str): The text query that was input into the model.
     - `contexts` (List[str]): Any contextual information retrieved by the model before
     generating an answer.

@@ -120,9 +121,9 @@ def AnswerRelevance(
 
     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["question", "contexts", "answer", "answer_relevancy"]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["answer_relevancy"].mean(),

@@ -130,7 +131,7 @@ def AnswerRelevance(
                     "Max Score": result_df["answer_relevancy"].max(),
                     "Min Score": result_df["answer_relevancy"].min(),
                     "Standard Deviation": result_df["answer_relevancy"].std(),
-                    "Count":
+                    "Count": result_df.shape[0],
                 }
             ],
         },
validmind/tests/model_validation/ragas/AnswerSimilarity.py

@@ -42,6 +42,7 @@ def AnswerSimilarity(
     ### Configuring Columns
 
     This metric requires the following columns in your dataset:
+
     - `answer` (str): The text response generated by the model.
     - `ground_truth` (str): The ground truth answer that the generated answer is compared
     against.

@@ -105,9 +106,9 @@ def AnswerSimilarity(
 
     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["answer", "ground_truth", "answer_similarity"]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["answer_similarity"].mean(),

@@ -115,7 +116,7 @@ def AnswerSimilarity(
                     "Max Score": result_df["answer_similarity"].max(),
                     "Min Score": result_df["answer_similarity"].min(),
                     "Standard Deviation": result_df["answer_similarity"].std(),
-                    "Count":
+                    "Count": result_df.shape[0],
                 }
             ],
         },
validmind/tests/model_validation/ragas/AspectCritique.py

@@ -11,6 +11,8 @@ from validmind import tags, tasks
 
 from .utils import get_ragas_config, get_renamed_columns
 
+LOWER_IS_BETTER_ASPECTS = ["harmfulness", "maliciousness"]
+
 
 @tags("ragas", "llm", "qualitative")
 @tasks("text_summarization", "text_generation", "text_qa")

@@ -149,6 +151,11 @@ def AspectCritique(
         Dataset.from_pandas(df), metrics=all_aspects, **get_ragas_config()
     ).to_pandas()
 
+    # reverse the score for aspects where lower is better
+    for aspect in LOWER_IS_BETTER_ASPECTS:
+        if aspect in result_df.columns:
+            result_df[aspect] = 1 - result_df[aspect]
+
     df_melted = result_df.melt(
         id_vars=["question", "answer", "contexts"],
         value_vars=[aspect.name for aspect in all_aspects],
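The effect of the new `LOWER_IS_BETTER_ASPECTS` handling on a toy result frame, using the same flip shown in the hunk above (the aspect values and the `coherence` column are made up for illustration):

```python
import pandas as pd

LOWER_IS_BETTER_ASPECTS = ["harmfulness", "maliciousness"]

result_df = pd.DataFrame({"coherence": [0.8, 0.9], "harmfulness": [0.75, 0.25]})

# flip aspects where a lower raw score is better, so every reported column reads "higher is better"
for aspect in LOWER_IS_BETTER_ASPECTS:
    if aspect in result_df.columns:
        result_df[aspect] = 1 - result_df[aspect]

print(result_df)  # harmfulness becomes 0.25 and 0.75
```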
validmind/tests/model_validation/ragas/ContextEntityRecall.py

@@ -47,6 +47,7 @@ def ContextEntityRecall(
     ### Configuring Columns
 
     This metric requires the following columns in your dataset:
+
     - `contexts` (List[str]): A list of text contexts which will be evaluated to make
     sure if they contain the entities present in the ground truth.
     - `ground_truth` (str): The ground truth text from which the entities will be

@@ -113,13 +114,13 @@ def ContextEntityRecall(
 
     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     [
+            #         "contexts",
+            #         "ground_truth",
+            #         "context_entity_recall",
+            #     ]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["context_entity_recall"].mean(),

@@ -127,7 +128,7 @@ def ContextEntityRecall(
                     "Max Score": result_df["context_entity_recall"].max(),
                     "Min Score": result_df["context_entity_recall"].min(),
                     "Standard Deviation": result_df["context_entity_recall"].std(),
-                    "Count":
+                    "Count": result_df.shape[0],
                 }
             ],
         },
validmind/tests/model_validation/ragas/ContextPrecision.py

@@ -40,6 +40,7 @@ def ContextPrecision(
     ### Configuring Columns
 
     This metric requires the following columns in your dataset:
+
     - `question` (str): The text query that was input into the model.
     - `contexts` (List[str]): A list of text contexts which are retrieved and which
     will be evaluated to make sure they contain relevant info in the correct order.

@@ -107,9 +108,9 @@ def ContextPrecision(
 
     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["question", "contexts", "ground_truth", "context_precision"]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["context_precision"].mean(),

@@ -117,7 +118,7 @@ def ContextPrecision(
                     "Max Score": result_df["context_precision"].max(),
                     "Min Score": result_df["context_precision"].min(),
                     "Standard Deviation": result_df["context_precision"].std(),
-                    "Count":
+                    "Count": result_df.shape[0],
                 }
             ],
         },
validmind/tests/model_validation/ragas/ContextRecall.py

@@ -40,6 +40,7 @@ def ContextRecall(
     ### Configuring Columns
 
     This metric requires the following columns in your dataset:
+
     - `question` (str): The text query that was input into the model.
     - `contexts` (List[str]): A list of text contexts which are retrieved and which
     will be evaluated to make sure they contain all items in the ground truth.

@@ -107,9 +108,9 @@ def ContextRecall(
 
     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["question", "contexts", "ground_truth", "context_recall"]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["context_recall"].mean(),

@@ -117,7 +118,7 @@ def ContextRecall(
                     "Max Score": result_df["context_recall"].max(),
                     "Min Score": result_df["context_recall"].min(),
                     "Standard Deviation": result_df["context_recall"].std(),
-                    "Count":
+                    "Count": result_df.shape[0],
                 }
             ],
         },
validmind/tests/model_validation/ragas/Faithfulness.py

@@ -41,6 +41,7 @@ def Faithfulness(
     ### Configuring Columns
 
     This metric requires the following columns in your dataset:
+
     - `contexts` (List[str]): A list of text contexts which are retrieved to generate
     the answer.
     - `answer` (str): The response generated by the model which will be evaluated for

@@ -105,9 +106,9 @@ def Faithfulness(
 
     return (
         {
-            "Scores (will not be uploaded to UI)": result_df[
-            ],
+            # "Scores (will not be uploaded to UI)": result_df[
+            #     ["contexts", "answer", "faithfulness"]
+            # ],
             "Aggregate Scores": [
                 {
                     "Mean Score": result_df["faithfulness"].mean(),

@@ -115,7 +116,7 @@ def Faithfulness(
                     "Max Score": result_df["faithfulness"].max(),
                     "Min Score": result_df["faithfulness"].min(),
                     "Standard Deviation": result_df["faithfulness"].std(),
-                    "Count":
+                    "Count": result_df.shape[0],
                 }
             ],
         },
validmind/tests/model_validation/ragas/utils.py

@@ -5,11 +5,17 @@
 import os
 
 from validmind.ai.utils import get_client_and_model
+from validmind.client_config import client_config
 
 EMBEDDINGS_MODEL = "text-embedding-3-small"
 
 
 def get_ragas_config():
+    if not client_config.can_generate_llm_test_descriptions():
+        raise ValueError(
+            "LLM based descriptions are not enabled in the current configuration."
+        )
+
     # import here since its an optional dependency
     try:
         from langchain_openai import ChatOpenAI, OpenAIEmbeddings
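The new guard in `get_ragas_config` makes every ragas test fail fast when LLM-backed features are disabled. A minimal sketch of that behaviour, using a hypothetical stand-in for `validmind.client_config.client_config`:

```python
class _ClientConfig:
    """Hypothetical stand-in for validmind.client_config.client_config."""

    def __init__(self, llm_descriptions_enabled: bool):
        self.llm_descriptions_enabled = llm_descriptions_enabled

    def can_generate_llm_test_descriptions(self) -> bool:
        return self.llm_descriptions_enabled


client_config = _ClientConfig(llm_descriptions_enabled=False)


def get_ragas_config():
    # same guard as the hunk above: refuse to build an LLM config when the feature is off
    if not client_config.can_generate_llm_test_descriptions():
        raise ValueError(
            "LLM based descriptions are not enabled in the current configuration."
        )
    return {}  # the real function goes on to assemble LLM and embeddings settings


try:
    get_ragas_config()
except ValueError as exc:
    print(exc)
```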