validmind 2.5.8__py3-none-any.whl → 2.5.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +26 -7
- validmind/api_client.py +89 -43
- validmind/client.py +2 -2
- validmind/client_config.py +11 -14
- validmind/datasets/regression/fred_timeseries.py +67 -138
- validmind/template.py +1 -0
- validmind/test_suites/__init__.py +0 -2
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/test_suites/summarization.py +0 -1
- validmind/test_suites/time_series.py +0 -43
- validmind/tests/__types__.py +3 -13
- validmind/tests/data_validation/ACFandPACFPlot.py +15 -13
- validmind/tests/data_validation/ADF.py +31 -24
- validmind/tests/data_validation/AutoAR.py +9 -9
- validmind/tests/data_validation/AutoMA.py +23 -16
- validmind/tests/data_validation/AutoSeasonality.py +18 -16
- validmind/tests/data_validation/AutoStationarity.py +21 -16
- validmind/tests/data_validation/BivariateScatterPlots.py +67 -96
- validmind/tests/data_validation/ChiSquaredFeaturesTable.py +82 -124
- validmind/tests/data_validation/ClassImbalance.py +15 -12
- validmind/tests/data_validation/DFGLSArch.py +19 -13
- validmind/tests/data_validation/DatasetDescription.py +17 -11
- validmind/tests/data_validation/DatasetSplit.py +7 -5
- validmind/tests/data_validation/DescriptiveStatistics.py +28 -21
- validmind/tests/data_validation/Duplicates.py +33 -25
- validmind/tests/data_validation/EngleGrangerCoint.py +35 -33
- validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +59 -71
- validmind/tests/data_validation/HighCardinality.py +19 -12
- validmind/tests/data_validation/HighPearsonCorrelation.py +27 -22
- validmind/tests/data_validation/IQROutliersBarPlot.py +13 -10
- validmind/tests/data_validation/IQROutliersTable.py +40 -36
- validmind/tests/data_validation/IsolationForestOutliers.py +21 -14
- validmind/tests/data_validation/KPSS.py +34 -29
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +22 -15
- validmind/tests/data_validation/MissingValues.py +32 -27
- validmind/tests/data_validation/MissingValuesBarPlot.py +25 -21
- validmind/tests/data_validation/PearsonCorrelationMatrix.py +71 -84
- validmind/tests/data_validation/PhillipsPerronArch.py +37 -30
- validmind/tests/data_validation/RollingStatsPlot.py +31 -23
- validmind/tests/data_validation/ScatterPlot.py +63 -78
- validmind/tests/data_validation/SeasonalDecompose.py +38 -34
- validmind/tests/data_validation/Skewness.py +35 -37
- validmind/tests/data_validation/SpreadPlot.py +35 -35
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +23 -17
- validmind/tests/data_validation/TabularDateTimeHistograms.py +21 -13
- validmind/tests/data_validation/TabularDescriptionTables.py +51 -16
- validmind/tests/data_validation/TabularNumericalHistograms.py +25 -22
- validmind/tests/data_validation/TargetRateBarPlots.py +21 -14
- validmind/tests/data_validation/TimeSeriesDescription.py +25 -18
- validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +23 -17
- validmind/tests/data_validation/TimeSeriesFrequency.py +24 -17
- validmind/tests/data_validation/TimeSeriesHistogram.py +33 -32
- validmind/tests/data_validation/TimeSeriesLinePlot.py +17 -10
- validmind/tests/data_validation/TimeSeriesMissingValues.py +15 -10
- validmind/tests/data_validation/TimeSeriesOutliers.py +37 -33
- validmind/tests/data_validation/TooManyZeroValues.py +16 -11
- validmind/tests/data_validation/UniqueRows.py +11 -6
- validmind/tests/data_validation/WOEBinPlots.py +23 -16
- validmind/tests/data_validation/WOEBinTable.py +35 -30
- validmind/tests/data_validation/ZivotAndrewsArch.py +34 -28
- validmind/tests/data_validation/nlp/CommonWords.py +21 -14
- validmind/tests/data_validation/nlp/Hashtags.py +27 -20
- validmind/tests/data_validation/nlp/LanguageDetection.py +33 -14
- validmind/tests/data_validation/nlp/Mentions.py +21 -15
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +32 -9
- validmind/tests/data_validation/nlp/Punctuations.py +24 -20
- validmind/tests/data_validation/nlp/Sentiment.py +27 -8
- validmind/tests/data_validation/nlp/StopWords.py +26 -19
- validmind/tests/data_validation/nlp/TextDescription.py +36 -35
- validmind/tests/data_validation/nlp/Toxicity.py +32 -9
- validmind/tests/decorator.py +81 -42
- validmind/tests/model_validation/BertScore.py +36 -27
- validmind/tests/model_validation/BleuScore.py +25 -19
- validmind/tests/model_validation/ClusterSizeDistribution.py +38 -34
- validmind/tests/model_validation/ContextualRecall.py +35 -13
- validmind/tests/model_validation/FeaturesAUC.py +32 -13
- validmind/tests/model_validation/MeteorScore.py +46 -33
- validmind/tests/model_validation/ModelMetadata.py +32 -64
- validmind/tests/model_validation/ModelPredictionResiduals.py +75 -73
- validmind/tests/model_validation/RegardScore.py +30 -14
- validmind/tests/model_validation/RegressionResidualsPlot.py +10 -5
- validmind/tests/model_validation/RougeScore.py +36 -30
- validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +30 -14
- validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +27 -30
- validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +68 -63
- validmind/tests/model_validation/TokenDisparity.py +31 -23
- validmind/tests/model_validation/ToxicityScore.py +26 -17
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +24 -20
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +30 -27
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +7 -5
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +32 -23
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +7 -5
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +15 -11
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +29 -29
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +34 -25
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +38 -26
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +40 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +18 -17
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +40 -45
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +17 -19
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +29 -25
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +38 -28
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -4
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +7 -0
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +9 -8
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -4
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -4
- validmind/tests/model_validation/ragas/utils.py +6 -0
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +19 -12
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +22 -17
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +27 -25
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +7 -5
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +40 -78
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +15 -17
- validmind/tests/model_validation/sklearn/CompletenessScore.py +17 -11
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +22 -15
- validmind/tests/model_validation/sklearn/FeatureImportance.py +95 -0
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +7 -7
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +19 -12
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +35 -30
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +10 -5
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +32 -32
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +23 -23
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +15 -10
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +26 -19
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +38 -18
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +31 -25
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +8 -6
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +24 -17
- validmind/tests/model_validation/sklearn/ROCCurve.py +12 -7
- validmind/tests/model_validation/sklearn/RegressionErrors.py +74 -130
- validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +27 -12
- validmind/tests/model_validation/sklearn/{RegressionModelsPerformanceComparison.py → RegressionPerformance.py} +18 -20
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +55 -93
- validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +32 -13
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +36 -32
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -5
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +27 -19
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +25 -18
- validmind/tests/model_validation/sklearn/VMeasure.py +14 -13
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +7 -5
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +24 -18
- validmind/tests/model_validation/statsmodels/BoxPierce.py +14 -10
- validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +73 -104
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +19 -12
- validmind/tests/model_validation/statsmodels/GINITable.py +44 -77
- validmind/tests/model_validation/statsmodels/JarqueBera.py +27 -22
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +33 -34
- validmind/tests/model_validation/statsmodels/LJungBox.py +32 -28
- validmind/tests/model_validation/statsmodels/Lilliefors.py +27 -24
- validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +87 -119
- validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +100 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +14 -9
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +17 -13
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +46 -43
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +38 -36
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +30 -28
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +18 -11
- validmind/tests/model_validation/statsmodels/RunsTest.py +32 -28
- validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +75 -107
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +15 -8
- validmind/tests/ongoing_monitoring/FeatureDrift.py +10 -6
- validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +31 -25
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +29 -21
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +31 -23
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +16 -14
- validmind/tests/prompt_validation/Conciseness.py +7 -5
- validmind/tests/prompt_validation/Delimitation.py +23 -22
- validmind/tests/prompt_validation/NegativeInstruction.py +7 -5
- validmind/tests/prompt_validation/Robustness.py +12 -10
- validmind/tests/prompt_validation/Specificity.py +13 -11
- validmind/tests/prompt_validation/ai_powered_test.py +6 -0
- validmind/tests/run.py +68 -23
- validmind/unit_metrics/__init__.py +81 -144
- validmind/unit_metrics/classification/{sklearn/Accuracy.py → Accuracy.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/F1.py → F1.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/Precision.py → Precision.py} +1 -1
- validmind/unit_metrics/classification/{sklearn/ROC_AUC.py → ROC_AUC.py} +1 -2
- validmind/unit_metrics/classification/{sklearn/Recall.py → Recall.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/AdjustedRSquaredScore.py → AdjustedRSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -1
- validmind/unit_metrics/regression/HuberLoss.py +1 -1
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanAbsoluteError.py → MeanAbsoluteError.py} +1 -1
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -1
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -1
- validmind/unit_metrics/regression/{sklearn/MeanSquaredError.py → MeanSquaredError.py} +1 -1
- validmind/unit_metrics/regression/QuantileLoss.py +1 -1
- validmind/unit_metrics/regression/{sklearn/RSquaredScore.py → RSquaredScore.py} +1 -1
- validmind/unit_metrics/regression/{sklearn/RootMeanSquaredError.py → RootMeanSquaredError.py} +1 -1
- validmind/vm_models/dataset/dataset.py +2 -0
- validmind/vm_models/figure.py +5 -0
- validmind/vm_models/test/result_wrapper.py +93 -132
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/METADATA +1 -1
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/RECORD +203 -210
- validmind/tests/data_validation/ANOVAOneWayTable.py +0 -138
- validmind/tests/data_validation/BivariateFeaturesBarPlots.py +0 -142
- validmind/tests/data_validation/BivariateHistograms.py +0 -117
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +0 -124
- validmind/tests/data_validation/MissingValuesRisk.py +0 -88
- validmind/tests/model_validation/ModelMetadataComparison.py +0 -59
- validmind/tests/model_validation/sklearn/FeatureImportanceComparison.py +0 -83
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +0 -135
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +0 -103
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/LICENSE +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/WHEEL +0 -0
- {validmind-2.5.8.dist-info → validmind-2.5.15.dist-info}/entry_points.txt +0 -0
@@ -13,32 +13,34 @@ class CosineSimilarityDistribution(Metric):
|
|
13
13
|
Assesses the similarity between predicted text embeddings from a model using a Cosine Similarity distribution
|
14
14
|
histogram.
|
15
15
|
|
16
|
-
|
16
|
+
### Purpose
|
17
|
+
|
17
18
|
This metric is used to assess the degree of similarity between the embeddings produced by a text embedding model
|
18
19
|
using Cosine Similarity. Cosine Similarity is a measure that calculates the cosine of the angle between two
|
19
20
|
vectors. This metric is predominantly used in text analysis — in this case, to determine how closely the predicted
|
20
21
|
text embeddings align with one another.
|
21
22
|
|
22
|
-
|
23
|
+
### Test Mechanism
|
24
|
+
|
23
25
|
The implementation starts by computing the cosine similarity between the predicted values of the model's test
|
24
26
|
dataset. These cosine similarity scores are then plotted on a histogram with 100 bins to visualize the distribution
|
25
27
|
of the scores. The x-axis of the histogram represents the computed Cosine Similarity.
|
26
28
|
|
27
|
-
|
29
|
+
### Signs of High Risk
|
28
30
|
|
29
31
|
- If the cosine similarity scores cluster close to 1 or -1, it may indicate overfitting, as the model's predictions
|
30
32
|
are almost perfectly aligned (or, near -1, directly opposed). This could suggest that the model is not generalizable.
|
31
33
|
- A broad spread of cosine similarity scores across the histogram may indicate a potential issue with the model's
|
32
34
|
ability to generate consistent embeddings.
|
33
35
|
|
34
|
-
|
36
|
+
### Strengths
|
35
37
|
|
36
38
|
- Provides a visual representation of the model's performance which is easily interpretable.
|
37
39
|
- Can help identify patterns, trends, and outliers in the model's alignment of predicted text embeddings.
|
38
40
|
- Useful in measuring the similarity between vectors in multi-dimensional space, important in the case of text
|
39
41
|
embeddings.
|
40
42
|
|
41
|
-
|
43
|
+
### Limitations
|
42
44
|
|
43
45
|
- Only evaluates the similarity between outputs. It does not provide insight into the model's ability to correctly
|
44
46
|
classify or predict.
|
@@ -23,33 +23,42 @@ def CosineSimilarityHeatmap(
|
|
23
23
|
"""
|
24
24
|
Generates an interactive heatmap to visualize the cosine similarities among embeddings derived from a given model.
|
25
25
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
26
|
+
### Purpose
|
27
|
+
|
28
|
+
This function is designed to visually analyze the cosine similarities of embeddings from a specific model. Cosine
|
29
|
+
similarity, a measure of the cosine of the angle between two vectors, aids in understanding the orientation and
|
30
|
+
similarity of vectors in multi-dimensional space. This is particularly valuable for exploring text embeddings and
|
31
|
+
their relative similarities among documents, words, or phrases.
|
32
|
+
|
33
|
+
### Test Mechanism
|
34
|
+
|
35
|
+
The function operates through a sequence of steps to visualize cosine similarities. Initially, embeddings are
|
36
|
+
extracted for each dataset entry using the designated model. Following this, the function computes the pairwise
|
37
|
+
cosine similarities among these embeddings. The computed similarities are then displayed in an interactive heatmap.
|
38
|
+
|
39
|
+
### Signs of High Risk
|
40
|
+
|
41
|
+
- High similarity values (close to 1) across the heatmap might not always be indicative of a risk; however, in
|
42
|
+
contexts where diverse perspectives or features are desired, this could suggest a lack of diversity in the model's
|
43
|
+
learning process or potential redundancy.
|
42
44
|
- Similarly, low similarity values (close to -1) indicate strong dissimilarity, which could be beneficial in
|
43
45
|
scenarios demanding diverse outputs. However, in cases where consistency is needed, these low values might
|
44
|
-
highlight that the model is unable to capture a coherent set of features from the data, potentially leading to poor
|
46
|
+
highlight that the model is unable to capture a coherent set of features from the data, potentially leading to poor
|
47
|
+
performance on related tasks.
|
48
|
+
|
49
|
+
### Strengths
|
50
|
+
|
51
|
+
- Provides an interactive and intuitive visual representation of embedding similarities, facilitating easy
|
52
|
+
exploration and analysis.
|
53
|
+
- Allows customization of visual elements such as title, axis labels, and color scale to suit specific analytical
|
54
|
+
needs and preferences.
|
45
55
|
|
46
|
-
|
47
|
-
- Provides an interactive and intuitive visual representation of embedding similarities, facilitating easy exploration and analysis.
|
48
|
-
- Allows customization of visual elements such as title, axis labels, and color scale to suit specific analytical needs and preferences.
|
56
|
+
### Limitations
|
49
57
|
|
50
|
-
|
51
|
-
|
52
|
-
- The interpretation of the heatmap heavily relies on the appropriate setting of the color scale, as incorrect
|
58
|
+
- As the number of embeddings increases, the effectiveness of the heatmap might diminish due to overcrowding,
|
59
|
+
making it hard to discern detailed similarities.
|
60
|
+
- The interpretation of the heatmap heavily relies on the appropriate setting of the color scale, as incorrect
|
61
|
+
settings can lead to misleading visual interpretations.
|
53
62
|
"""
|
54
63
|
|
55
64
|
embeddings = np.stack(dataset.y_pred(model))
|
@@ -13,26 +13,28 @@ class DescriptiveAnalytics(Metric):
|
|
13
13
|
Evaluates statistical properties of text embeddings in an ML model via mean, median, and standard deviation
|
14
14
|
histograms.
|
15
15
|
|
16
|
-
|
16
|
+
### Purpose
|
17
|
+
|
17
18
|
This metric, Descriptive Analytics for Text Embeddings Models, is employed to comprehend the fundamental properties
|
18
19
|
and statistical characteristics of the embeddings in a Machine Learning model. It measures the dimensionality as
|
19
20
|
well as the statistical distributions of embedding values including the mean, median, and standard deviation.
|
20
21
|
|
21
|
-
|
22
|
+
### Test Mechanism
|
23
|
+
|
22
24
|
The test mechanism involves using the 'DescriptiveAnalytics' class provided in the code which includes the 'run'
|
23
25
|
function. This function computes three statistical measures - mean, median, and standard deviation of the test
|
24
26
|
predictions from the model. It generates and caches three separate histograms showing the distribution of these
|
25
27
|
measures. Each histogram visualizes the measure's distribution across the embedding values. Therefore, the method
|
26
28
|
does not utilize a grading scale or threshold; it is fundamentally a visual and data exploration tool.
|
27
29
|
|
28
|
-
|
30
|
+
### Signs of High Risk
|
29
31
|
|
30
32
|
- Abnormal patterns or values in the distributions of the statistical measures. This may include skewed
|
31
33
|
distributions or a significant number of outliers.
|
32
34
|
- Very high standard deviation values which indicate a high degree of variability in the data.
|
33
35
|
- The mean and median values are vastly different, suggesting skewed data.
|
34
36
|
|
35
|
-
|
37
|
+
### Strengths
|
36
38
|
|
37
39
|
- Provides a visual and quantifiable understanding of the embeddings' statistical characteristics, allowing for a
|
38
40
|
comprehensive evaluation.
|
@@ -41,7 +43,7 @@ class DescriptiveAnalytics(Metric):
|
|
41
43
|
- It considers three key statistical measures (mean, median, and standard deviation), offering a more well-rounded
|
42
44
|
understanding of the data.
|
43
45
|
|
44
|
-
|
46
|
+
### Limitations
|
45
47
|
|
46
48
|
- The method does not offer an explicit measure of model performance or accuracy, as it mainly focuses on
|
47
49
|
understanding data properties.
|
@@ -12,24 +12,28 @@ class EmbeddingsVisualization2D(Metric):
|
|
12
12
|
"""
|
13
13
|
Visualizes 2D representation of text embeddings generated by a model using t-SNE technique.
|
14
14
|
|
15
|
-
|
16
|
-
a text embedding machine learning model. By doing so, it aids in analyzing the embedding space created by the model
|
17
|
-
and helps in understanding how the learned embeddings are distributed and how they relate to each other.
|
15
|
+
### Purpose
|
18
16
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
samples, the perplexity is adjusted to be one less than the number of samples. Following the reduction of
|
23
|
-
dimensionality, a scatter plot is produced depicting each embedding as a data point in the visualized 2D plane.
|
17
|
+
The objective of this metric is to provide a visual 2D representation of the embeddings created by a text embedding
|
18
|
+
machine learning model. By doing so, it aids in analyzing the embedding space created by the model and helps in
|
19
|
+
understanding how the learned embeddings are distributed and how they relate to each other.
|
24
20
|
|
25
|
-
|
21
|
+
### Test Mechanism
|
22
|
+
|
23
|
+
This metric uses the t-Distributed Stochastic Neighbor Embedding (t-SNE) technique, which is a tool for visualizing
|
24
|
+
high-dimensional data by reducing the dimensionality to 2. The perplexity parameter for t-SNE is set to the value
|
25
|
+
provided by the user. If the input perplexity value is greater than the number of samples, the perplexity is
|
26
|
+
adjusted to be one less than the number of samples. Following the reduction of dimensionality, a scatter plot is
|
27
|
+
produced depicting each embedding as a data point in the visualized 2D plane.
|
28
|
+
|
29
|
+
### Signs of High Risk
|
26
30
|
|
27
31
|
- If the embeddings are highly concentrated in a specific region of the plane, it might indicate that the model is
|
28
32
|
not learning diverse representations of the text.
|
29
33
|
- Wide gaps or partitions in the visualization could suggest that the model is over-segmenting in the embedding
|
30
34
|
space and may lead to poor generalization.
|
31
35
|
|
32
|
-
|
36
|
+
### Strengths
|
33
37
|
|
34
38
|
- Offers a powerful visual tool that can assist in understanding and interpreting high-dimensional embeddings,
|
35
39
|
which could otherwise be difficult to visualize.
|
@@ -37,7 +41,7 @@ class EmbeddingsVisualization2D(Metric):
|
|
37
41
|
- t-SNE visualization helps in focusing on local structures and preserves the proximity of points that are close
|
38
42
|
together in the original high-dimensional space.
|
39
43
|
|
40
|
-
|
44
|
+
### Limitations
|
41
45
|
|
42
46
|
- The reduction of high-dimensional data to 2D can result in loss of some information, which may lead to
|
43
47
|
misinterpretation.
|
@@ -16,41 +16,41 @@ from validmind import tags, tasks
|
|
16
16
|
@tasks("text_qa", "text_generation", "text_summarization")
|
17
17
|
def EuclideanDistanceComparison(dataset, models):
|
18
18
|
"""
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
Euclidean Distance
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
pair, including
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
- Extremely low distances across different models might indicate redundancy, suggesting that
|
43
|
-
|
44
|
-
|
45
|
-
|
19
|
+
Assesses and visualizes the dissimilarity between model embeddings using Euclidean distance, providing insights
|
20
|
+
into model behavior and potential redundancy or diversity.
|
21
|
+
|
22
|
+
### Purpose
|
23
|
+
|
24
|
+
The Euclidean Distance Comparison test aims to analyze and compare the embeddings produced by different models. By
|
25
|
+
measuring the Euclidean distance between vectors in Euclidean space, it provides a metric to assess the magnitude
|
26
|
+
of dissimilarity between embeddings created by different models. This is crucial for tasks that require models to
|
27
|
+
produce distinct responses or feature separations.
|
28
|
+
|
29
|
+
### Test Mechanism
|
30
|
+
|
31
|
+
The test computes the embeddings for each model using the provided dataset and calculates the Euclidean distance
|
32
|
+
for every possible pair of models. It generates a distance matrix where each element represents the Euclidean
|
33
|
+
distance between two model embeddings. This matrix is then visualized through bar charts, showing the distance
|
34
|
+
distribution for each model pair. Additionally, it compiles a table with descriptive statistics such as mean,
|
35
|
+
median, standard deviation, minimum, and maximum distances for each model pair, including references to the
|
36
|
+
compared models.
|
37
|
+
|
38
|
+
### Signs of High Risk
|
39
|
+
|
40
|
+
- Very high distance values could suggest that models are focusing on entirely different features or aspects of the
|
41
|
+
data, which might be undesirable for ensemble methods or when a consensus is required.
|
42
|
+
- Extremely low distances across different models might indicate redundancy, suggesting that models are not
|
43
|
+
providing diverse enough perspectives on the data.
|
44
|
+
|
45
|
+
### Strengths
|
46
46
|
|
47
47
|
- Provides a clear and quantifiable measure of how different the embeddings from various models are.
|
48
48
|
- Useful for identifying outlier models or those that behave significantly differently from others in a group.
|
49
49
|
|
50
|
-
|
50
|
+
### Limitations
|
51
51
|
|
52
52
|
- Euclidean distance can be sensitive to the scale of the data, meaning that preprocessing steps like normalization
|
53
|
-
|
53
|
+
might be necessary to ensure meaningful comparisons.
|
54
54
|
- Does not consider the orientation or angle between vectors, focusing purely on magnitude differences.
|
55
55
|
"""
|
56
56
|
|
@@ -23,31 +23,40 @@ def EuclideanDistanceHeatmap(
|
|
23
23
|
"""
|
24
24
|
Generates an interactive heatmap to visualize the Euclidean distances among embeddings derived from a given model.
|
25
25
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
-
|
50
|
-
|
26
|
+
### Purpose
|
27
|
+
|
28
|
+
This function visualizes the Euclidean distances between embeddings generated by a model, offering insights into
|
29
|
+
the absolute differences between data points. Euclidean distance, a fundamental metric in data analysis, measures
|
30
|
+
the straight-line distance between two points in Euclidean space. It is particularly useful for understanding
|
31
|
+
spatial relationships and clustering tendencies in high-dimensional data.
|
32
|
+
|
33
|
+
### Test Mechanism
|
34
|
+
|
35
|
+
The function operates through a streamlined process: firstly, embeddings are extracted for each dataset entry using
|
36
|
+
the specified model. Subsequently, it computes the pairwise Euclidean distances among these embeddings. The results
|
37
|
+
are then visualized in an interactive heatmap format, where each cell's color intensity correlates with the
|
38
|
+
distance magnitude between pairs of embeddings, providing a visual assessment of these distances.
|
39
|
+
|
40
|
+
### Signs of High Risk
|
41
|
+
|
42
|
+
- Uniformly low distances across the heatmap might suggest a lack of variability in the data or model overfitting,
|
43
|
+
where the model fails to distinguish between distinct data points effectively.
|
44
|
+
- Excessive variability in distances could indicate inconsistent data representation, potentially leading to
|
45
|
+
unreliable model predictions.
|
46
|
+
|
47
|
+
### Strengths
|
48
|
+
|
49
|
+
- Provides a direct, intuitive visual representation of distances between embeddings, aiding in the detection of
|
50
|
+
patterns or anomalies.
|
51
|
+
- Allows customization of visual aspects such as the heatmap's title, axis labels, and color scale, adapting to
|
52
|
+
various analytical needs.
|
53
|
+
|
54
|
+
### Limitations
|
55
|
+
|
56
|
+
- The interpretation of distances can be sensitive to the scale of data; normalization might be necessary for
|
57
|
+
meaningful analysis.
|
58
|
+
- Large datasets may lead to dense, cluttered heatmaps, making it difficult to discern individual distances,
|
59
|
+
potentially requiring techniques like data sampling or dimensionality reduction for clearer visualization.
|
51
60
|
"""
|
52
61
|
|
53
62
|
embeddings = np.stack(dataset.y_pred(model))
|
@@ -17,32 +17,44 @@ from validmind import tags, tasks
|
|
17
17
|
@tasks("text_qa", "text_generation", "text_summarization")
|
18
18
|
def PCAComponentsPairwisePlots(dataset, model, n_components=3):
|
19
19
|
"""
|
20
|
-
Generates scatter plots for pairwise combinations of principal component analysis (PCA) components of model
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
20
|
+
Generates scatter plots for pairwise combinations of principal component analysis (PCA) components of model
|
21
|
+
embeddings.
|
22
|
+
|
23
|
+
### Purpose
|
24
|
+
|
25
|
+
This function visualizes the principal components of embeddings derived from a specified model. Principal Component
|
26
|
+
Analysis (PCA) is a statistical technique that emphasizes variation and uncovers strong patterns in a dataset. It
|
27
|
+
transforms the original variables into new, uncorrelated variables (principal components) that maximize variance.
|
28
|
+
|
29
|
+
### Test Mechanism
|
30
|
+
|
31
|
+
The function follows a sequential process to visualize PCA components effectively. It starts by extracting
|
32
|
+
embeddings from the dataset, utilizing the model specified by the user. These embeddings are then standardized to
|
33
|
+
ensure zero mean and unit variance, which is crucial to prevent any single feature from dominating due to
|
34
|
+
scale—this standardization is a critical preprocessing step for PCA. Following this, the function calculates the
|
35
|
+
specified number of principal components. The core of the visualization process involves creating scatter plots for
|
36
|
+
each pairwise combination of these principal components.
|
37
|
+
|
38
|
+
### Signs of High Risk
|
39
|
+
|
40
|
+
- If the principal components do not account for a significant portion of the variance, it may suggest that PCA is
|
41
|
+
not capturing the essential structures of the data.
|
42
|
+
- Similarity in scatter plots across different pairs of components could indicate redundancy in the components,
|
43
|
+
suggesting that fewer dimensions might be sufficient to represent the data.
|
44
|
+
|
45
|
+
### Strengths
|
46
|
+
|
47
|
+
- Enables a simplified visualization of multivariate data, helping to identify patterns across many variables
|
48
|
+
effectively.
|
49
|
+
- Provides a clear depiction of the directions of maximum variance in the data, which is valuable for feature
|
50
|
+
selection and dimensionality reduction.
|
51
|
+
|
52
|
+
### Limitations
|
53
|
+
|
54
|
+
- PCA's effectiveness hinges on the scaling of the variables; improper standardization can lead to misleading
|
55
|
+
interpretations.
|
56
|
+
- The interpretation of principal components can be challenging, especially if they capture less significant
|
57
|
+
variances or are difficult to relate back to the original features.
|
46
58
|
"""
|
47
59
|
|
48
60
|
# Get embeddings from the dataset using the model
|
@@ -23,7 +23,46 @@ logger = get_logger(__name__)
|
|
23
23
|
|
24
24
|
|
25
25
|
class StabilityAnalysis(ThresholdTest):
|
26
|
-
"""
|
26
|
+
"""
|
27
|
+
Assesses the stability of embeddings generated by a model when faced with perturbed input data to ensure robustness
|
28
|
+
and consistency.
|
29
|
+
|
30
|
+
### Purpose
|
31
|
+
|
32
|
+
The Embedding Stability test evaluates the robustness of the embeddings generated by a model when the input text is
|
33
|
+
perturbed. By comparing the cosine similarities between the original and perturbed embeddings, it gauges the
|
34
|
+
model's ability to maintain consistent semantic representations under slight variations in the input data.
|
35
|
+
|
36
|
+
### Test Mechanism
|
37
|
+
|
38
|
+
This test works by:
|
39
|
+
|
40
|
+
- Perturbing the original text data.
|
41
|
+
- Generating embeddings for both the original and perturbed datasets using the model.
|
42
|
+
- Calculating the cosine similarities between the original and perturbed embeddings.
|
43
|
+
- Analyzing the distribution of these similarities (mean, min, max, median, and standard deviation).
|
44
|
+
- Determining the test result based on whether the mean similarity exceeds a predefined threshold (default is 0.7).
|
45
|
+
|
46
|
+
### Signs of High Risk
|
47
|
+
|
48
|
+
- Mean cosine similarity below the threshold (default is 0.7).
|
49
|
+
- Large standard deviation of cosine similarities, indicating inconsistency.
|
50
|
+
- Minimum similarity score significantly lower than expected.
|
51
|
+
- Failure to pass the threshold test based on the mean similarity.
|
52
|
+
|
53
|
+
### Strengths
|
54
|
+
|
55
|
+
- Provides a quantitative measure of embedding stability.
|
56
|
+
- Helps in identifying weaknesses in the model's ability to handle minor input variations.
|
57
|
+
- Visualization of similarity distributions aids in comprehensive analysis.
|
58
|
+
- Easy to interpret results with clear pass/fail criteria.
|
59
|
+
|
60
|
+
### Limitations
|
61
|
+
|
62
|
+
- Relies on the chosen perturbation method, which may not cover all possible variations in real-world data.
|
63
|
+
- Thresholds for similarity might need adjustment based on specific application requirements.
|
64
|
+
- Cosine similarity, while useful, may not capture all aspects of semantic stability.
|
65
|
+
"""
|
27
66
|
|
28
67
|
required_inputs = ["model", "dataset"]
|
29
68
|
default_params = {
|
@@ -9,37 +9,38 @@ from .StabilityAnalysis import StabilityAnalysis
|
|
9
9
|
|
10
10
|
class StabilityAnalysisKeyword(StabilityAnalysis):
|
11
11
|
"""
|
12
|
-
|
12
|
+
Evaluates robustness of embedding models to keyword swaps in the test dataset.
|
13
13
|
|
14
|
-
|
15
|
-
so that any instances of the key words in the test dataset will be replaced
|
16
|
-
with the corresponding value.
|
14
|
+
### Purpose
|
17
15
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
maintain performance stability even when the input data slightly deviates, imitating real-world variability.
|
16
|
+
This test metric is used to evaluate the robustness of text embedding machine learning models to keyword swaps. A
|
17
|
+
keyword swap is a scenario where instances of certain specified keywords in the dataset are replaced with other
|
18
|
+
specified words (usually synonyms). The purpose of this metric is to ensure that these models maintain performance
|
19
|
+
stability even when the input data slightly deviates, imitating real-world variability.
|
23
20
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
21
|
+
### Test Mechanism
|
22
|
+
|
23
|
+
The test mechanism involves a perturbation of the dataset used in testing the model. Each instance of a specific
|
24
|
+
word found in the dataset is replaced with the corresponding word as specified in a 'keyword_dict' mapping. The
|
25
|
+
model is then re-run with the perturbed dataset and its results are compared with the original results. This
|
26
|
+
comparison quantifies the extent to which keyword swaps impact the model's performance.
|
27
|
+
|
28
|
+
### Signs of High Risk
|
29
29
|
|
30
|
-
**Signs of High Risk:**
|
31
30
|
- A significant drop in model performance after keyword swaps indicates a high risk of model failure in real-world
|
32
31
|
scenarios.
|
33
32
|
- The model results being heavily reliant on specific word choices instead of capturing the context properly.
|
34
33
|
|
35
|
-
|
34
|
+
### Strengths
|
35
|
+
|
36
36
|
- This test provides a way to measure model robustness to small changes in input data, which reinforces its
|
37
37
|
applicability and reliability in real-world scenarios.
|
38
38
|
- This test encourages a model to understand the context of a sentence rather than memorizing specific words.
|
39
39
|
- It helps to detect overfitting - a situation where a model performs well on training data but poorly on new or
|
40
40
|
slightly altered data.
|
41
41
|
|
42
|
-
|
42
|
+
### Limitations
|
43
|
+
|
43
44
|
- It may not fully address semantic differences that can be introduced through keyword swaps. That is, the
|
44
45
|
replacement words might not preserve the exact semantic meaning of the original words.
|
45
46
|
- It only tests for changes in keywords (word-level alterations) and might not expose model limitations related to
|