validmind 2.1.0__py3-none-any.whl → 2.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. validmind/__version__.py +1 -1
  2. validmind/ai.py +3 -3
  3. validmind/api_client.py +2 -3
  4. validmind/client.py +68 -25
  5. validmind/datasets/llm/rag/__init__.py +11 -0
  6. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
  7. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
  8. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
  9. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
  10. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
  11. validmind/datasets/llm/rag/rfp.py +41 -0
  12. validmind/html_templates/__init__.py +0 -0
  13. validmind/html_templates/content_blocks.py +89 -14
  14. validmind/models/__init__.py +7 -4
  15. validmind/models/foundation.py +8 -34
  16. validmind/models/function.py +51 -0
  17. validmind/models/huggingface.py +16 -46
  18. validmind/models/metadata.py +42 -0
  19. validmind/models/pipeline.py +66 -0
  20. validmind/models/pytorch.py +8 -42
  21. validmind/models/r_model.py +33 -82
  22. validmind/models/sklearn.py +39 -38
  23. validmind/template.py +8 -26
  24. validmind/tests/__init__.py +43 -20
  25. validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
  26. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
  27. validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
  28. validmind/tests/data_validation/Duplicates.py +1 -1
  29. validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
  30. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
  31. validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
  32. validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
  33. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
  34. validmind/tests/data_validation/nlp/Punctuations.py +11 -12
  35. validmind/tests/data_validation/nlp/Sentiment.py +57 -0
  36. validmind/tests/data_validation/nlp/Toxicity.py +45 -0
  37. validmind/tests/decorator.py +2 -2
  38. validmind/tests/model_validation/BertScore.py +100 -98
  39. validmind/tests/model_validation/BleuScore.py +93 -64
  40. validmind/tests/model_validation/ContextualRecall.py +74 -91
  41. validmind/tests/model_validation/MeteorScore.py +86 -74
  42. validmind/tests/model_validation/RegardScore.py +103 -121
  43. validmind/tests/model_validation/RougeScore.py +118 -0
  44. validmind/tests/model_validation/TokenDisparity.py +84 -121
  45. validmind/tests/model_validation/ToxicityScore.py +109 -123
  46. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
  47. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
  48. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
  49. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
  50. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
  51. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
  52. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
  53. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
  54. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
  55. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
  56. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
  57. validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
  58. validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
  59. validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
  60. validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
  61. validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
  62. validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
  63. validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
  64. validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
  65. validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
  66. validmind/tests/model_validation/ragas/utils.py +66 -0
  67. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
  68. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
  69. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
  70. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
  71. validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
  72. validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
  73. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
  74. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +14 -12
  75. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
  76. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
  77. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
  78. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
  79. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
  80. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
  81. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
  82. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
  83. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
  84. validmind/unit_metrics/__init__.py +26 -49
  85. validmind/unit_metrics/composite.py +5 -1
  86. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
  87. validmind/utils.py +56 -6
  88. validmind/vm_models/__init__.py +1 -1
  89. validmind/vm_models/dataset/__init__.py +7 -0
  90. validmind/vm_models/dataset/dataset.py +558 -0
  91. validmind/vm_models/dataset/utils.py +146 -0
  92. validmind/vm_models/model.py +97 -72
  93. validmind/vm_models/test/result_wrapper.py +61 -24
  94. validmind/vm_models/test_context.py +1 -1
  95. validmind/vm_models/test_suite/summary.py +3 -4
  96. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/METADATA +5 -3
  97. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/RECORD +100 -75
  98. validmind/models/catboost.py +0 -33
  99. validmind/models/statsmodels.py +0 -50
  100. validmind/models/xgboost.py +0 -30
  101. validmind/tests/model_validation/BertScoreAggregate.py +0 -90
  102. validmind/tests/model_validation/RegardHistogram.py +0 -148
  103. validmind/tests/model_validation/RougeMetrics.py +0 -147
  104. validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
  105. validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
  106. validmind/tests/model_validation/ToxicityHistogram.py +0 -136
  107. validmind/vm_models/dataset.py +0 -1303
  108. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/LICENSE +0 -0
  109. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/WHEEL +0 -0
  110. {validmind-2.1.0.dist-info → validmind-2.2.2.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,96 @@
1
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
2
+ # See the LICENSE file in the root of this repository for details.
3
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
+
5
+ from itertools import combinations
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ import plotly.express as px
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
+
12
+
13
def CosineSimilarityComparison(dataset, models):
    """
    Computes pairwise cosine similarities between model embeddings and visualizes the results through bar charts,
    alongside compiling a comprehensive table of descriptive statistics for each model pair.

    **Purpose:**
    This function is designed to analyze and compare the embeddings produced by different models using Cosine Similarity.
    Cosine Similarity, a measure calculating the cosine of the angle between two vectors, is widely used to determine
    the alignment or similarity between vectors in high-dimensional spaces, such as text embeddings. This analysis helps
    to understand how similar or different the models' predictions are in terms of embedding generation.

    **Test Mechanism:**
    The function retrieves the embeddings each model produced for the provided dataset. For every possible pair of
    models it then calculates a cosine similarity matrix in which each element is the similarity between one dataset
    row's embedding from the first model and one dataset row's embedding from the second model. The function flattens
    this matrix and uses it to create a bar chart for each model pair, visualizing their similarity distribution.
    Additionally, it compiles a table with descriptive statistics (mean, median, standard deviation, minimum, and
    maximum) for the similarities of each pair, including a reference to the compared models.

    **Signs of High Risk:**

    - A high concentration of cosine similarity values close to 1 could suggest that the models are producing very
    similar embeddings, which could be a sign of redundancy or lack of diversity in model training or design.
    - Conversely, very low similarity values near -1 indicate strong dissimilarity, potentially highlighting models
    that are too divergent, possibly focusing on very different features of the data.

    **Strengths:**

    - Enables detailed comparisons between multiple models' embedding strategies through visual and statistical means.
    - Helps identify which models produce similar or dissimilar embeddings, useful for tasks requiring model diversity.
    - Provides quantitative and visual feedback on the degree of similarity, enhancing interpretability of model
    behavior in embedding spaces.

    **Limitations:**

    - The analysis is confined to the comparison of embeddings and does not assess the overall performance of the models
    in terms of their primary tasks (e.g., classification, regression).
    - Assumes that the models are suitable for generating comparable embeddings, which might not always be the case,
    especially across different types of models.
    - Interpretation of results is heavily dependent on the understanding of Cosine Similarity and the nature of high-dimensional
    embedding spaces.
    """
    # Inputs: `dataset` must expose `y_pred(model)` returning one embedding per
    # row; `models` is an iterable of objects carrying an `input_id`.
    # Returns: a tuple whose first item is the statistics DataFrame, followed by
    # one Plotly bar chart per model pair (no charts when fewer than two models).

    # Materialize once so models can be addressed by position; the cache is keyed
    # by position because model objects are not guaranteed to be hashable.
    models = list(models)
    embeddings_by_position = {}

    def _embeddings(position):
        # Stack a model's embeddings the first time it is needed and reuse the
        # array for every other pair the model takes part in.
        if position not in embeddings_by_position:
            embeddings_by_position[position] = np.stack(
                dataset.y_pred(models[position])
            )
        return embeddings_by_position[position]

    figures = []
    # One dict of descriptive statistics per model pair
    all_stats = []

    # Generate all pairs of models for comparison
    for position_A, position_B in combinations(range(len(models)), 2):
        model_A, model_B = models[position_A], models[position_B]
        embeddings_A = _embeddings(position_A)
        embeddings_B = _embeddings(position_B)

        # Cross-similarity matrix: rows follow model A's embeddings, columns
        # follow model B's embeddings
        similarity_matrix = cosine_similarity(embeddings_A, embeddings_B)
        similarities = similarity_matrix.flatten()

        # Generate statistics and add model combination as a column
        all_stats.append(
            {
                "Combination": f"{model_A.input_id} vs {model_B.input_id}",
                "Mean": np.mean(similarities),
                "Median": np.median(similarities),
                "Standard Deviation": np.std(similarities),
                "Minimum": np.min(similarities),
                "Maximum": np.max(similarities),
            }
        )

        # Generate an index for each similarity value
        indices = range(len(similarities))

        # Create the bar chart using Plotly
        fig = px.bar(
            x=indices,
            y=similarities,
            labels={"x": "Pair Index", "y": "Cosine Similarity"},
            title=f"Cosine Similarity - {model_A.input_id} vs {model_B.input_id}",
        )
        fig.update_layout(xaxis_title="Pair Index", yaxis_title="Cosine Similarity")
        figures.append(fig)

    # Create a DataFrame from all collected statistics
    stats_df = pd.DataFrame(all_stats)

    return (stats_df, *figures)
@@ -0,0 +1,71 @@
1
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
2
+ # See the LICENSE file in the root of this repository for details.
3
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
+
5
+ import numpy as np
6
+ import plotly.express as px
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+
9
+
10
def CosineSimilarityHeatmap(
    dataset,
    model,
    title="Cosine Similarity Matrix",
    color="Cosine Similarity",
    xaxis_title="Index",
    yaxis_title="Index",
    color_scale="Blues",
):
    """
    Generates an interactive heatmap to visualize the cosine similarities among embeddings derived from a given model.

    **Purpose:**
    This function is designed to visually analyze the cosine similarities of embeddings from a specific model.
    Cosine similarity, a measure of the cosine of the angle between two vectors, aids in understanding the
    orientation and similarity of vectors in multi-dimensional space. This is particularly valuable for exploring
    text embeddings and their relative similarities among documents, words, or phrases.

    **Test Mechanism:**
    The function operates through a sequence of steps to visualize cosine similarities. Initially,
    the embeddings the designated model produced for each dataset entry are retrieved. Following this,
    the function computes the pairwise cosine similarities among these embeddings. The computed similarities
    are then displayed in an interactive heatmap.

    **Signs of High Risk:**
    - High similarity values (close to 1) across the heatmap might not always be indicative of a risk;
    however, in contexts where diverse perspectives or features are desired, this could suggest a lack of
    diversity in the model's learning process or potential redundancy.
    - Similarly, low similarity values (close to -1) indicate strong dissimilarity, which could be beneficial in
    scenarios demanding diverse outputs. However, in cases where consistency is needed, these low values might
    highlight that the model is unable to capture a coherent set of features from the data, potentially leading to poor performance on related tasks.

    **Strengths:**
    - Provides an interactive and intuitive visual representation of embedding similarities, facilitating easy exploration and analysis.
    - Allows customization of visual elements such as title, axis labels, and color scale to suit specific analytical needs and preferences.

    **Limitations:**
    - As the number of embeddings increases, the effectiveness of the heatmap might diminish due to overcrowding, making it hard to discern detailed similarities.
    - The interpretation of the heatmap heavily relies on the appropriate setting of the color scale, as incorrect settings can lead to misleading visual interpretations.
    """
    # One embedding per dataset row, stacked into a single 2-D array
    vectors = np.stack(dataset.y_pred(model))

    # Axis and colorbar captions shared by the heatmap
    axis_labels = {"x": xaxis_title, "y": yaxis_title, "color": color}

    # Square matrix comparing every embedding with every other embedding,
    # rendered with the numeric value printed in each cell
    heatmap = px.imshow(
        cosine_similarity(vectors),
        labels=axis_labels,
        text_auto=True,
        aspect="auto",
        color_continuous_scale=color_scale,
    )

    # The model's input_id is appended so figures from different models stay
    # distinguishable
    layout_options = {
        "title": f"{title} - {model.input_id}",
        "xaxis_title": xaxis_title,
        "yaxis_title": yaxis_title,
    }
    heatmap.update_layout(**layout_options)

    return heatmap
@@ -0,0 +1,92 @@
1
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
2
+ # See the LICENSE file in the root of this repository for details.
3
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
+
5
+ from itertools import combinations
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ import plotly.express as px
10
+ from sklearn.metrics.pairwise import euclidean_distances
11
+
12
+
13
def EuclideanDistanceComparison(dataset, models):
    """
    Computes pairwise Euclidean distances between model embeddings and visualizes the results through bar charts,
    alongside compiling a comprehensive table of descriptive statistics for each model pair.

    **Purpose:**
    This function is designed to analyze and compare the embeddings produced by different models using Euclidean Distance.
    Euclidean Distance measures the "ordinary" straight-line distance between two points in Euclidean space, providing a
    straightforward metric to assess the absolute differences between vectors. This analysis helps in understanding the
    magnitude of dissimilarity between the embeddings generated by different models, which is crucial for tasks that require
    distinctive model responses or feature separations.

    **Test Mechanism:**
    The function retrieves the embeddings each model produced for the provided dataset. For every possible pair of
    models it then calculates a distance matrix in which each element is the Euclidean distance between one dataset
    row's embedding from the first model and one dataset row's embedding from the second model. The function flattens
    this matrix and uses it to create a bar chart for each model pair, visualizing their distance distribution.
    Additionally, it compiles a table with descriptive statistics (mean, median, standard deviation, minimum, and
    maximum) for the distances of each pair, including a reference to the compared models.

    **Signs of High Risk:**

    - Very high distance values could suggest that the models are focusing on completely different features or aspects
    of the data, which might be undesirable for ensemble methods or similar applications where some degree of
    consensus is expected.
    - Extremely low distances across different models might indicate redundancy, suggesting that the models are not
    providing diverse enough perspectives on the data.

    **Strengths:**

    - Provides a clear and quantifiable measure of how different the embeddings from various models are.
    - Useful for identifying outlier models or those that behave significantly differently from others in a group.

    **Limitations:**

    - Euclidean distance can be sensitive to the scale of the data, meaning that preprocessing steps like normalization
    might be necessary to ensure meaningful comparisons.
    - Does not consider the orientation or angle between vectors, focusing purely on magnitude differences.
    """
    # Inputs: `dataset` must expose `y_pred(model)` returning one embedding per
    # row; `models` is an iterable of objects carrying an `input_id`.
    # Returns: a tuple whose first item is the statistics DataFrame, followed by
    # one Plotly bar chart per model pair (no charts when fewer than two models).

    # Materialize once so models can be addressed by position; the cache is keyed
    # by position because model objects are not guaranteed to be hashable.
    models = list(models)
    embeddings_by_position = {}

    def _embeddings(position):
        # Stack a model's embeddings the first time it is needed and reuse the
        # array for every other pair the model takes part in.
        if position not in embeddings_by_position:
            embeddings_by_position[position] = np.stack(
                dataset.y_pred(models[position])
            )
        return embeddings_by_position[position]

    figures = []
    # One dict of descriptive statistics per model pair
    all_stats = []

    # Generate all pairs of models for comparison
    for position_A, position_B in combinations(range(len(models)), 2):
        model_A, model_B = models[position_A], models[position_B]
        embeddings_A = _embeddings(position_A)
        embeddings_B = _embeddings(position_B)

        # Cross-distance matrix: rows follow model A's embeddings, columns
        # follow model B's embeddings
        distance_matrix = euclidean_distances(embeddings_A, embeddings_B)
        distances = distance_matrix.flatten()

        # Generate statistics and add model combination as a column
        all_stats.append(
            {
                "Combination": f"{model_A.input_id} vs {model_B.input_id}",
                "Mean": np.mean(distances),
                "Median": np.median(distances),
                "Standard Deviation": np.std(distances),
                "Minimum": np.min(distances),
                "Maximum": np.max(distances),
            }
        )

        # Generate an index for each distance value
        indices = range(len(distances))

        # Create the bar chart using Plotly
        fig = px.bar(
            x=indices,
            y=distances,
            labels={"x": "Pair Index", "y": "Euclidean Distance"},
            title=f"Euclidean Distance - {model_A.input_id} vs {model_B.input_id}",
        )
        fig.update_layout(xaxis_title="Pair Index", yaxis_title="Euclidean Distance")
        figures.append(fig)

    # Create a DataFrame from all collected statistics
    stats_df = pd.DataFrame(all_stats)

    return (stats_df, *figures)
@@ -0,0 +1,69 @@
1
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
2
+ # See the LICENSE file in the root of this repository for details.
3
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
+
5
+ import numpy as np
6
+ import plotly.express as px
7
+ from sklearn.metrics.pairwise import euclidean_distances
8
+
9
+
10
def EuclideanDistanceHeatmap(
    dataset,
    model,
    title="Euclidean Distance Matrix",
    color="Euclidean Distance",
    xaxis_title="Index",
    yaxis_title="Index",
    color_scale="Blues",
):
    """
    Generates an interactive heatmap to visualize the Euclidean distances among embeddings derived from a given model.

    **Purpose:**
    This function visualizes the Euclidean distances between embeddings generated by a model, offering insights into the
    absolute differences between data points. Euclidean distance, a fundamental metric in data analysis, measures the
    straight-line distance between two points in Euclidean space. It is particularly useful for understanding spatial
    relationships and clustering tendencies in high-dimensional data.

    **Test Mechanism:**
    The function operates through a streamlined process: firstly, the embeddings the specified model produced for each dataset entry are retrieved.
    Subsequently, it computes the pairwise Euclidean distances among these embeddings. The results are then visualized in an interactive heatmap format,
    where each cell's color intensity correlates with the distance magnitude between pairs of embeddings, providing a visual assessment of these distances.

    **Signs of High Risk:**
    - Uniform Distances: Uniformly low distances across the heatmap might suggest a lack of variability in the data or
    model overfitting, where the model fails to distinguish between distinct data points effectively.
    - High Variability: Conversely, excessive variability in distances could indicate inconsistent data representation,
    potentially leading to unreliable model predictions.

    **Strengths:**
    - Provides a direct, intuitive visual representation of distances between embeddings, aiding in the detection of patterns or anomalies.
    - Allows customization of visual aspects such as the heatmap's title, axis labels, and color scale, adapting to various analytical needs.

    **Limitations:**
    - The interpretation of distances can be sensitive to the scale of data; normalization might be necessary for meaningful analysis.
    - Large datasets may lead to dense, cluttered heatmaps, making it difficult to discern individual distances, potentially requiring
    techniques like data sampling or dimensionality reduction for clearer visualization.
    """
    # One embedding per dataset row, stacked into a single 2-D array
    vectors = np.stack(dataset.y_pred(model))

    # Axis and colorbar captions shared by the heatmap
    axis_labels = {"x": xaxis_title, "y": yaxis_title, "color": color}

    # Square matrix holding the distance from every embedding to every other
    # embedding, rendered with the numeric value printed in each cell
    heatmap = px.imshow(
        euclidean_distances(vectors),
        labels=axis_labels,
        text_auto=True,
        aspect="auto",
        color_continuous_scale=color_scale,
    )

    # The model's input_id is appended so figures from different models stay
    # distinguishable
    layout_options = {
        "title": f"{title} - {model.input_id}",
        "xaxis_title": xaxis_title,
        "yaxis_title": yaxis_title,
    }
    heatmap.update_layout(**layout_options)

    return heatmap
@@ -0,0 +1,78 @@
1
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
2
+ # See the LICENSE file in the root of this repository for details.
3
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
+
5
+ import itertools
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ import plotly.express as px
10
+ from sklearn.decomposition import PCA
11
+ from sklearn.preprocessing import StandardScaler
12
+
13
+
14
def PCAComponentsPairwisePlots(dataset, model, n_components=3):
    """
    Generates scatter plots for pairwise combinations of principal component analysis (PCA) components of model embeddings.

    **Purpose:**
    This function visualizes the principal components of embeddings derived from a specified model. Principal Component Analysis (PCA)
    is a statistical technique that emphasizes variation and uncovers strong patterns in a dataset.
    It transforms the original variables into new, uncorrelated variables (principal components) that maximize variance.

    **Test Mechanism:**
    The function follows a sequential process to visualize PCA components effectively.
    It starts by retrieving the embeddings produced for the dataset by the model specified by the user.
    These embeddings are then standardized to ensure zero mean and unit variance, which is crucial to prevent
    any single feature from dominating due to scale—this standardization is a critical preprocessing step for PCA.
    Following this, the function calculates the specified number of principal components.
    The core of the visualization process involves creating scatter plots for each pairwise combination of these principal components.

    **Signs of High Risk:**
    - If the principal components do not account for a significant portion of the variance, it may suggest that PCA is not capturing the essential structures of the data.
    - Similarity in scatter plots across different pairs of components could indicate redundancy in the components, suggesting that fewer dimensions might be sufficient to represent the data.

    **Strengths:**
    - Enables a simplified visualization of multivariate data, helping to identify patterns across many variables effectively.
    - Provides a clear depiction of the directions of maximum variance in the data, which is valuable for feature selection and dimensionality reduction.

    **Limitations:**
    - PCA's effectiveness hinges on the scaling of the variables; improper standardization can lead to misleading interpretations.
    - The interpretation of principal components can be challenging, especially if they capture less significant variances or are difficult to relate back to the original features.
    """
    # Inputs: `dataset` must expose `y_pred(model)` returning one embedding per
    # row; `n_components` is forwarded unchanged to scikit-learn's PCA.
    # Returns: a tuple of Plotly scatter figures, one per pair of components
    # (empty when fewer than two components are produced).

    # Get embeddings from the dataset using the model
    embeddings = np.stack(dataset.y_pred(model))

    # Standardize the embeddings
    embeddings_scaled = StandardScaler().fit_transform(embeddings)

    # Perform PCA
    pca_results = PCA(n_components=n_components).fit_transform(embeddings_scaled)

    # Take the component count from the projection itself rather than from
    # `n_components`: PCA also accepts non-integer settings (e.g. a variance
    # fraction), for which `range(n_components)` would fail. For integer
    # settings the two counts are the same.
    component_count = pca_results.shape[1]
    component_numbers = range(1, component_count + 1)

    # Prepare DataFrame for Plotly
    pca_df = pd.DataFrame(
        pca_results, columns=[f"PC{number}" for number in component_numbers]
    )

    # Fall back to a placeholder when the model carries no input_id
    model_label = getattr(model, "input_id", "Unknown Model")

    # Create plots for each pair of principal components
    plots = []
    for pc1, pc2 in itertools.combinations(component_numbers, 2):
        fig = px.scatter(
            pca_df,
            x=f"PC{pc1}",
            y=f"PC{pc2}",
            title=f"{model_label} (PC{pc1} vs PC{pc2})",
            labels={
                f"PC{pc1}": f"Principal Component {pc1}",
                f"PC{pc2}": f"Principal Component {pc2}",
            },
        )
        plots.append(fig)

    # Return the list of plots as a tuple
    return tuple(plots)
@@ -6,8 +6,10 @@ from abc import abstractmethod
6
6
  from typing import List
7
7
 
8
8
  import numpy as np
9
+ import plotly.express as px
9
10
  from sklearn.metrics.pairwise import cosine_similarity
10
11
 
12
+ from validmind.logging import get_logger
11
13
  from validmind.vm_models import (
12
14
  Figure,
13
15
  ResultSummary,
@@ -17,13 +19,14 @@ from validmind.vm_models import (
17
19
  ThresholdTestResult,
18
20
  )
19
21
 
22
+ logger = get_logger(__name__)
23
+
20
24
 
21
25
  class StabilityAnalysis(ThresholdTest):
22
26
  """Base class for embeddings stability analysis tests"""
23
27
 
24
28
  required_inputs = ["model", "dataset"]
25
29
  default_params = {
26
- "text_column": None,
27
30
  "mean_similarity_threshold": 0.7,
28
31
  }
29
32
  metadata = {
@@ -61,25 +64,22 @@ class StabilityAnalysis(ThresholdTest):
61
64
 
62
65
  def run(self):
63
66
  # Perturb the test dataset
64
- col = self.params.get("text_column")
65
-
66
- if col is None:
67
- raise ValueError(
68
- "The `text_column` parameter must be provided to the StabilityAnalysis test."
69
- )
67
+ original = self.inputs.dataset.df
68
+ perturbed = original.copy()
69
+ perturbed.update(
70
+ perturbed.select_dtypes(include="object").applymap(self.perturb_data)
71
+ )
70
72
 
71
- original_data_df = self.inputs.dataset.df[col]
72
- perturbed_data_df = original_data_df.copy()
73
- perturbed_data_df = perturbed_data_df.apply(self.perturb_data)
73
+ logger.debug(f"Original data: {original}")
74
+ logger.debug(f"Perturbed data: {perturbed}")
74
75
 
75
76
  # Compute embeddings for the original and perturbed dataset
76
- original_embeddings = self.inputs.model.predict(original_data_df)
77
- perturbed_embeddings = self.inputs.model.predict(perturbed_data_df)
77
+ original_embeddings = self.inputs.dataset.y_pred(self.inputs.model)
78
+ perturbed_embeddings = np.stack(self.inputs.model.predict(perturbed))
78
79
 
79
80
  # Compute cosine similarities between original and perturbed embeddings
80
81
  similarities = cosine_similarity(
81
- original_embeddings,
82
- perturbed_embeddings,
82
+ original_embeddings, perturbed_embeddings
83
83
  ).diagonal()
84
84
 
85
85
  mean = np.mean(similarities)
@@ -91,15 +91,26 @@ class StabilityAnalysis(ThresholdTest):
91
91
  # Determine if the test passed based on the mean similarity and threshold
92
92
  passed = mean > self.params["mean_similarity_threshold"]
93
93
 
94
- # Plot the distribution of cosine similarities using plotly
95
- import plotly.express as px
96
-
97
- fig = px.histogram(
98
- x=similarities.flatten(),
99
- nbins=100,
100
- title="Cosine Similarity Distribution",
101
- labels={"x": "Cosine Similarity"},
102
- )
94
+ figures = [
95
+ px.histogram(
96
+ x=similarities.flatten(),
97
+ nbins=100,
98
+ title="Cosine Similarity Distribution",
99
+ labels={"x": "Cosine Similarity"},
100
+ ),
101
+ px.density_contour(
102
+ x=similarities.flatten(),
103
+ nbinsx=100,
104
+ title="Cosine Similarity Density",
105
+ labels={"x": "Cosine Similarity"},
106
+ marginal_x="histogram",
107
+ ),
108
+ px.box(
109
+ x=similarities.flatten(),
110
+ labels={"x": "Cosine Similarity"},
111
+ title="Cosine Similarity Box Plot",
112
+ ),
113
+ ]
103
114
 
104
115
  # For this example, we are not caching the results as done in the reference `run` method
105
116
  return self.cache_results(
@@ -121,6 +132,7 @@ class StabilityAnalysis(ThresholdTest):
121
132
  key=self.name,
122
133
  figure=fig,
123
134
  )
135
+ for fig in figures
124
136
  ],
125
137
  passed=passed,
126
138
  )
@@ -55,6 +55,9 @@ class StabilityAnalysisKeyword(StabilityAnalysis):
55
55
  }
56
56
 
57
57
  def perturb_data(self, data: str):
58
+ if not isinstance(data, str):
59
+ return data
60
+
58
61
  # Tokenize the string
59
62
  tokens = re.findall(r"[\w']+[.,!?;]?|[\w']+", data)
60
63
  modified_tokens = []
@@ -114,9 +114,15 @@ class StabilityAnalysisRandomNoise(StabilityAnalysis):
114
114
  name = "Text Embeddings Stability Analysis to Random Noise"
115
115
  default_params = {
116
116
  **StabilityAnalysis.default_params,
117
+ "probability": 0.02,
117
118
  }
118
119
 
119
- def perturb_data(self, data, probability=0.02):
120
+ def perturb_data(self, data):
121
+ if not isinstance(data, str):
122
+ return data
123
+
124
+ probability = self.params["probability"]
125
+
120
126
  # Tokenize the string based on spaces
121
127
  words = data.split()
122
128
 
@@ -65,6 +65,9 @@ class StabilityAnalysisSynonyms(StabilityAnalysis):
65
65
  }
66
66
 
67
67
  def perturb_data(self, data):
68
+ if not isinstance(data, str):
69
+ return data
70
+
68
71
  # download the nltk wordnet
69
72
  nltk.download("wordnet", quiet=True)
70
73
 
@@ -61,6 +61,9 @@ class StabilityAnalysisTranslation(StabilityAnalysis):
61
61
  }
62
62
 
63
63
  def perturb_data(self, data: str):
64
+ if not isinstance(data, str):
65
+ return data
66
+
64
67
  source_lang = self.params["source_lang"]
65
68
  target_lang = self.params["target_lang"]
66
69
 
@@ -0,0 +1,99 @@
1
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
2
+ # See the LICENSE file in the root of this repository for details.
3
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
4
+
5
+ import itertools
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ import plotly.express as px
10
+ from sklearn.manifold import TSNE
11
+ from sklearn.preprocessing import StandardScaler
12
+
13
+
14
def TSNEComponentsPairwisePlots(
    dataset,
    model,
    n_components=2,
    perplexity=30,
    title="t-SNE",
):
    """
    Plots individual scatter plots for pairwise combinations of t-SNE components of embeddings.

    **Purpose:**
    This function creates scatter plots for each pairwise combination of t-SNE components derived from model embeddings.
    t-SNE (t-Distributed Stochastic Neighbor Embedding) is a machine learning algorithm for dimensionality reduction that
    is particularly well-suited for the visualization of high-dimensional datasets.

    **Test Mechanism:**
    The function begins by extracting embeddings from the provided dataset using the specified model.
    These embeddings are then standardized to ensure that each dimension contributes equally to the distance computation.
    Following this, the t-SNE algorithm is applied to reduce the dimensionality of the data, with the number of components
    specified by the user. The results are plotted using Plotly, creating scatter plots for each unique pair of components
    if more than one component is specified. When a single component is requested, that component is plotted against itself.

    **Signs of High Risk:**
    - If the scatter plots show overlapping clusters or indistinct groupings, it might suggest that the
    t-SNE parameters (such as perplexity) are not optimally set for the given data, or the data itself does not exhibit clear, separable clusters.
    - Similar plots across different pairs of components could indicate redundancy in the components generated by t-SNE,
    suggesting that fewer dimensions might be sufficient to represent the data's structure.

    **Strengths:**
    - Provides a visual exploration tool for high-dimensional data, simplifying the detection of patterns and clusters which are not apparent in higher dimensions.
    - Interactive plots generated by Plotly enhance user engagement and allow for a deeper dive into specific areas of the plot, aiding in detailed data analysis.

    **Limitations:**
    - The effectiveness of t-SNE is highly dependent on the choice of parameters like perplexity and the number of components,
    which might require tuning and experimentation for optimal results.
    - t-SNE visualizations can be misleading if interpreted without considering the stochastic nature of the algorithm;
    two runs with the same parameters might yield different visual outputs, necessitating multiple runs for a consistent interpretation.
    - For more than three components the exact (non-approximated) t-SNE solver is used, which is noticeably slower on large datasets.

    Args:
        dataset: Object exposing `y_pred(model)`; the returned values are stacked with
            `np.stack` (presumably one embedding vector per row - confirm against caller).
        model: Model whose embeddings are read from `dataset`. Its `input_id` attribute,
            when present, is used in the plot titles.
        n_components: Number of t-SNE components to compute.
        perplexity: Perplexity passed to scikit-learn's `TSNE`.
        title: Prefix used for every plot title.

    Returns:
        A tuple of Plotly figures, one per unique pair of components (a single figure
        when `n_components` is 1).
    """

    # Get embeddings from the dataset using the model
    embeddings = np.stack(dataset.y_pred(model))

    # Standardize the embeddings
    embeddings_scaled = StandardScaler().fit_transform(embeddings)

    # scikit-learn's default Barnes-Hut solver relies on quad-/oct-trees and rejects
    # n_components > 3; fall back to the exact solver so that any number of
    # components can be requested instead of raising a ValueError.
    method = "exact" if n_components > 3 else "barnes_hut"

    # Perform t-SNE
    tsne = TSNE(n_components=n_components, perplexity=perplexity, method=method)
    tsne_results = tsne.fit_transform(embeddings_scaled)

    # Prepare DataFrame for Plotly
    component_names = [f"Component {i + 1}" for i in range(n_components)]
    tsne_df = pd.DataFrame(tsne_results, columns=component_names)

    # The title is identical for every figure, so build it once
    plot_title = f"{title} - {getattr(model, 'input_id', 'Unknown Model')}"

    # Every unique pair of components gets its own plot; with a single component
    # there is no pair, so that component is plotted against itself.
    if n_components > 1:
        axis_pairs = list(itertools.combinations(component_names, 2))
    else:
        axis_pairs = [("Component 1", "Component 1")]

    # Return the plots as a tuple
    return tuple(
        px.scatter(
            tsne_df,
            x=x_name,
            y=y_name,
            title=plot_title,
            labels={x_name: x_name, y_name: y_name},
        )
        for x_name, y_name in axis_pairs
    )