validmind 2.1.1__py3-none-any.whl → 2.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. validmind/__version__.py +1 -1
  2. validmind/ai.py +72 -49
  3. validmind/api_client.py +42 -16
  4. validmind/client.py +68 -25
  5. validmind/datasets/llm/rag/__init__.py +11 -0
  6. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +30 -0
  7. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +30 -0
  8. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +53 -0
  9. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +53 -0
  10. validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +53 -0
  11. validmind/datasets/llm/rag/rfp.py +41 -0
  12. validmind/errors.py +1 -1
  13. validmind/html_templates/__init__.py +0 -0
  14. validmind/html_templates/content_blocks.py +89 -14
  15. validmind/models/__init__.py +7 -4
  16. validmind/models/foundation.py +8 -34
  17. validmind/models/function.py +51 -0
  18. validmind/models/huggingface.py +16 -46
  19. validmind/models/metadata.py +42 -0
  20. validmind/models/pipeline.py +66 -0
  21. validmind/models/pytorch.py +8 -42
  22. validmind/models/r_model.py +33 -82
  23. validmind/models/sklearn.py +39 -38
  24. validmind/template.py +8 -26
  25. validmind/tests/__init__.py +43 -20
  26. validmind/tests/data_validation/ANOVAOneWayTable.py +1 -1
  27. validmind/tests/data_validation/ChiSquaredFeaturesTable.py +1 -1
  28. validmind/tests/data_validation/DescriptiveStatistics.py +2 -4
  29. validmind/tests/data_validation/Duplicates.py +1 -1
  30. validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
  31. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +1 -1
  32. validmind/tests/data_validation/TargetRateBarPlots.py +1 -1
  33. validmind/tests/data_validation/nlp/LanguageDetection.py +59 -0
  34. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +48 -0
  35. validmind/tests/data_validation/nlp/Punctuations.py +11 -12
  36. validmind/tests/data_validation/nlp/Sentiment.py +57 -0
  37. validmind/tests/data_validation/nlp/Toxicity.py +45 -0
  38. validmind/tests/decorator.py +12 -7
  39. validmind/tests/model_validation/BertScore.py +100 -98
  40. validmind/tests/model_validation/BleuScore.py +93 -64
  41. validmind/tests/model_validation/ContextualRecall.py +74 -91
  42. validmind/tests/model_validation/MeteorScore.py +86 -74
  43. validmind/tests/model_validation/RegardScore.py +103 -121
  44. validmind/tests/model_validation/RougeScore.py +118 -0
  45. validmind/tests/model_validation/TokenDisparity.py +84 -121
  46. validmind/tests/model_validation/ToxicityScore.py +109 -123
  47. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +96 -0
  48. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +71 -0
  49. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +92 -0
  50. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +69 -0
  51. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +78 -0
  52. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +35 -23
  53. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +3 -0
  54. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +7 -1
  55. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +3 -0
  56. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +3 -0
  57. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +99 -0
  58. validmind/tests/model_validation/ragas/AnswerCorrectness.py +131 -0
  59. validmind/tests/model_validation/ragas/AnswerRelevance.py +134 -0
  60. validmind/tests/model_validation/ragas/AnswerSimilarity.py +119 -0
  61. validmind/tests/model_validation/ragas/AspectCritique.py +167 -0
  62. validmind/tests/model_validation/ragas/ContextEntityRecall.py +133 -0
  63. validmind/tests/model_validation/ragas/ContextPrecision.py +123 -0
  64. validmind/tests/model_validation/ragas/ContextRecall.py +123 -0
  65. validmind/tests/model_validation/ragas/ContextRelevancy.py +114 -0
  66. validmind/tests/model_validation/ragas/Faithfulness.py +119 -0
  67. validmind/tests/model_validation/ragas/utils.py +66 -0
  68. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +3 -7
  69. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +8 -9
  70. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +5 -10
  71. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +3 -2
  72. validmind/tests/model_validation/sklearn/ROCCurve.py +2 -1
  73. validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
  74. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +2 -3
  75. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +7 -11
  76. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +3 -4
  77. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +1 -1
  78. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +1 -1
  79. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +1 -1
  80. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +1 -1
  81. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +1 -1
  82. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +1 -1
  83. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +1 -1
  84. validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +5 -6
  85. validmind/unit_metrics/__init__.py +26 -49
  86. validmind/unit_metrics/composite.py +13 -7
  87. validmind/unit_metrics/regression/sklearn/AdjustedRSquaredScore.py +1 -1
  88. validmind/utils.py +99 -6
  89. validmind/vm_models/__init__.py +1 -1
  90. validmind/vm_models/dataset/__init__.py +7 -0
  91. validmind/vm_models/dataset/dataset.py +560 -0
  92. validmind/vm_models/dataset/utils.py +146 -0
  93. validmind/vm_models/model.py +97 -72
  94. validmind/vm_models/test/metric.py +9 -24
  95. validmind/vm_models/test/result_wrapper.py +124 -28
  96. validmind/vm_models/test/threshold_test.py +10 -28
  97. validmind/vm_models/test_context.py +1 -1
  98. validmind/vm_models/test_suite/summary.py +3 -4
  99. {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/METADATA +5 -3
  100. {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/RECORD +103 -78
  101. validmind/models/catboost.py +0 -33
  102. validmind/models/statsmodels.py +0 -50
  103. validmind/models/xgboost.py +0 -30
  104. validmind/tests/model_validation/BertScoreAggregate.py +0 -90
  105. validmind/tests/model_validation/RegardHistogram.py +0 -148
  106. validmind/tests/model_validation/RougeMetrics.py +0 -147
  107. validmind/tests/model_validation/RougeMetricsAggregate.py +0 -133
  108. validmind/tests/model_validation/SelfCheckNLIScore.py +0 -112
  109. validmind/tests/model_validation/ToxicityHistogram.py +0 -136
  110. validmind/vm_models/dataset.py +0 -1303
  111. {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/LICENSE +0 -0
  112. {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/WHEEL +0 -0
  113. {validmind-2.1.1.dist-info → validmind-2.2.4.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,92 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ from itertools import combinations
+
+ import numpy as np
+ import pandas as pd
+ import plotly.express as px
+ from sklearn.metrics.pairwise import euclidean_distances
+
+
+ def EuclideanDistanceComparison(dataset, models):
+     """
+     Computes pairwise Euclidean distances between model embeddings and visualizes the results through bar charts,
+     alongside compiling a comprehensive table of descriptive statistics for each model pair.
+
+     **Purpose:**
+     This function is designed to analyze and compare the embeddings produced by different models using Euclidean Distance.
+     Euclidean Distance measures the "ordinary" straight-line distance between two points in Euclidean space, providing a
+     straightforward metric to assess the absolute differences between vectors. This analysis helps in understanding the
+     magnitude of dissimilarity between the embeddings generated by different models, which is crucial for tasks that require
+     distinctive model responses or feature separations.
+
+     **Test Mechanism:**
+     The function begins by computing the embeddings for each model using the provided dataset. It then calculates the
+     Euclidean distance for every possible pair of models, generating a distance matrix. Each element of this matrix
+     represents the Euclidean distance between two model embeddings. The function flattens this matrix and uses it to
+     create a bar chart for each model pair, visualizing their distance distribution. Additionally, it compiles a table
+     with descriptive statistics (mean, median, standard deviation, minimum, and maximum) for the distances of each
+     pair, including a reference to the compared models.
+
+     **Signs of High Risk:**
+
+     - Very high distance values could suggest that the models are focusing on completely different features or aspects
+     of the data, which might be undesirable for ensemble methods or similar applications where some degree of
+     consensus is expected.
+     - Extremely low distances across different models might indicate redundancy, suggesting that the models are not
+     providing diverse enough perspectives on the data.
+
+     **Strengths:**
+
+     - Provides a clear and quantifiable measure of how different the embeddings from various models are.
+     - Useful for identifying outlier models or those that behave significantly differently from others in a group.
+
+     **Limitations:**
+
+     - Euclidean distance can be sensitive to the scale of the data, meaning that preprocessing steps like normalization
+     might be necessary to ensure meaningful comparisons.
+     - Does not consider the orientation or angle between vectors, focusing purely on magnitude differences.
+     """
+
+     figures = []
+     all_stats = []
+
+     # Generate all pairs of models for comparison
+     for model_A, model_B in combinations(models, 2):
+         embeddings_A = np.stack(dataset.y_pred(model_A))
+         embeddings_B = np.stack(dataset.y_pred(model_B))
+
+         # Calculate pairwise Euclidean distances
+         distance_matrix = euclidean_distances(embeddings_A, embeddings_B)
+         distances = distance_matrix.flatten()
+
+         # Generate statistics and add model combination as a column
+         stats_data = {
+             "Combination": f"{model_A.input_id} vs {model_B.input_id}",
+             "Mean": np.mean(distances),
+             "Median": np.median(distances),
+             "Standard Deviation": np.std(distances),
+             "Minimum": np.min(distances),
+             "Maximum": np.max(distances),
+         }
+         all_stats.append(stats_data)
+
+         # Generate an index for each distance value
+         indices = range(len(distances))
+
+         # Create the bar chart using Plotly
+         fig = px.bar(
+             x=indices,
+             y=distances,
+             labels={"x": "Pair Index", "y": "Euclidean Distance"},
+             title=f"Euclidean Distance - {model_A.input_id} vs {model_B.input_id}",
+         )
+         fig.update_layout(xaxis_title="Pair Index", yaxis_title="Euclidean Distance")
+         figures.append(fig)
+
+     # Create a DataFrame from all collected statistics
+     stats_df = pd.DataFrame(all_stats)
+
+     return (stats_df, *tuple(figures))
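As context for the new test above, here is a minimal usage sketch. It assumes the standard `vm.tests.run_test` entry point with the test ID derived from the file path; `vm_ds`, `vm_model_a`, and `vm_model_b` are placeholder objects that would come from `vm.init_dataset` / `vm.init_model`, with embedding predictions already assigned to the dataset.

```python
import validmind as vm

# Placeholder setup (assumed, not shown in this diff):
# vm_model_a / vm_model_b registered via vm.init_model(..., input_id=...)
# vm_ds created via vm.init_dataset(...) and given embedding predictions
# with vm_ds.assign_predictions(model=...) for each model.

result = vm.tests.run_test(
    "validmind.model_validation.embeddings.EuclideanDistanceComparison",
    inputs={"dataset": vm_ds, "models": [vm_model_a, vm_model_b]},
)
```

The result bundles the descriptive-statistics table and one bar chart per model pair, matching the `(stats_df, *figures)` tuple returned by the function body.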
@@ -0,0 +1,69 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ import numpy as np
+ import plotly.express as px
+ from sklearn.metrics.pairwise import euclidean_distances
+
+
+ def EuclideanDistanceHeatmap(
+     dataset,
+     model,
+     title="Euclidean Distance Matrix",
+     color="Euclidean Distance",
+     xaxis_title="Index",
+     yaxis_title="Index",
+     color_scale="Blues",
+ ):
+     """
+     Generates an interactive heatmap to visualize the Euclidean distances among embeddings derived from a given model.
+
+     **Purpose:**
+     This function visualizes the Euclidean distances between embeddings generated by a model, offering insights into the
+     absolute differences between data points. Euclidean distance, a fundamental metric in data analysis, measures the
+     straight-line distance between two points in Euclidean space. It is particularly useful for understanding spatial
+     relationships and clustering tendencies in high-dimensional data.
+
+     **Test Mechanism:**
+     The function operates through a streamlined process: firstly, embeddings are extracted for each dataset entry using the specified model.
+     Subsequently, it computes the pairwise Euclidean distances among these embeddings. The results are then visualized in an interactive heatmap format,
+     where each cell's color intensity correlates with the distance magnitude between pairs of embeddings, providing a visual assessment of these distances.
+
+     **Signs of High Risk:**
+     - Uniform Distances: Uniformly low distances across the heatmap might suggest a lack of variability in the data or
+     model overfitting, where the model fails to distinguish between distinct data points effectively.
+     - High Variability: Conversely, excessive variability in distances could indicate inconsistent data representation,
+     potentially leading to unreliable model predictions.
+
+     **Strengths:**
+     - Provides a direct, intuitive visual representation of distances between embeddings, aiding in the detection of patterns or anomalies.
+     - Allows customization of visual aspects such as the heatmap's title, axis labels, and color scale, adapting to various analytical needs.
+
+     **Limitations:**
+     - The interpretation of distances can be sensitive to the scale of data; normalization might be necessary for meaningful analysis.
+     - Large datasets may lead to dense, cluttered heatmaps, making it difficult to discern individual distances, potentially requiring
+     techniques like data sampling or dimensionality reduction for clearer visualization.
+     """
+
+     embeddings = np.stack(dataset.y_pred(model))
+
+     # Calculate pairwise Euclidean distance
+     distance_matrix = euclidean_distances(embeddings)
+
+     # Create the heatmap using Plotly
+     fig = px.imshow(
+         distance_matrix,
+         labels=dict(x=xaxis_title, y=yaxis_title, color=color),
+         text_auto=True,
+         aspect="auto",
+         color_continuous_scale=color_scale,
+     )
+
+     fig.update_layout(
+         title=f"{title} - {model.input_id}",
+         xaxis_title=xaxis_title,
+         yaxis_title=yaxis_title,
+     )
+
+     return fig
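The heatmap test exposes its labels and color scale as keyword parameters, so a run can be customized without touching the code. A hedged sketch, again assuming `vm.tests.run_test` and placeholder `vm_ds` / `vm_model` inputs:

```python
import validmind as vm

# The params below map directly onto the function's keyword arguments
vm.tests.run_test(
    "validmind.model_validation.embeddings.EuclideanDistanceHeatmap",
    inputs={"dataset": vm_ds, "model": vm_model},  # placeholder inputs
    params={"title": "Test-set distances", "color_scale": "Viridis"},
)
```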
@@ -0,0 +1,78 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ import itertools
+
+ import numpy as np
+ import pandas as pd
+ import plotly.express as px
+ from sklearn.decomposition import PCA
+ from sklearn.preprocessing import StandardScaler
+
+
+ def PCAComponentsPairwisePlots(dataset, model, n_components=3):
+     """
+     Generates scatter plots for pairwise combinations of principal component analysis (PCA) components of model embeddings.
+
+     **Purpose:**
+     This function visualizes the principal components of embeddings derived from a specified model. Principal Component Analysis (PCA)
+     is a statistical technique that emphasizes variation and uncovers strong patterns in a dataset.
+     It transforms the original variables into new, uncorrelated variables (principal components) that maximize variance.
+
+     **Test Mechanism:**
+     The function follows a sequential process to visualize PCA components effectively.
+     It starts by extracting embeddings from the dataset, utilizing the model specified by the user.
+     These embeddings are then standardized to ensure zero mean and unit variance, which is crucial to prevent
+     any single feature from dominating due to scale—this standardization is a critical preprocessing step for PCA.
+     Following this, the function calculates the specified number of principal components.
+     The core of the visualization process involves creating scatter plots for each pairwise combination of these principal components.
+
+     **Signs of High Risk:**
+     - If the principal components do not account for a significant portion of the variance, it may suggest that PCA is not capturing the essential structures of the data.
+     - Similarity in scatter plots across different pairs of components could indicate redundancy in the components, suggesting that fewer dimensions might be sufficient to represent the data.
+
+     **Strengths:**
+     - Enables a simplified visualization of multivariate data, helping to identify patterns across many variables effectively.
+     - Provides a clear depiction of the directions of maximum variance in the data, which is valuable for feature selection and dimensionality reduction.
+
+     **Limitations:**
+     - PCA's effectiveness hinges on the scaling of the variables; improper standardization can lead to misleading interpretations.
+     - The interpretation of principal components can be challenging, especially if they capture less significant variances or are difficult to relate back to the original features.
+     """
+
+     # Get embeddings from the dataset using the model
+     embeddings = np.stack(dataset.y_pred(model))
+
+     # Standardize the embeddings
+     scaler = StandardScaler()
+     embeddings_scaled = scaler.fit_transform(embeddings)
+
+     # Perform PCA
+     pca = PCA(n_components=n_components)
+     pca_results = pca.fit_transform(embeddings_scaled)
+
+     # Prepare DataFrame for Plotly
+     pca_df = pd.DataFrame(
+         pca_results, columns=[f"PC{i+1}" for i in range(n_components)]
+     )
+
+     # List to store each plot
+     plots = []
+
+     # Create plots for each pair of principal components
+     for pc1, pc2 in itertools.combinations(range(1, n_components + 1), 2):
+         fig = px.scatter(
+             pca_df,
+             x=f"PC{pc1}",
+             y=f"PC{pc2}",
+             title=f"{getattr(model, 'input_id', 'Unknown Model')} (PC{pc1} vs PC{pc2})",
+             labels={
+                 f"PC{pc1}": f"Principal Component {pc1}",
+                 f"PC{pc2}": f"Principal Component {pc2}",
+             },
+         )
+         plots.append(fig)
+
+     # Return the list of plots as a tuple
+     return tuple(plots)
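Since the docstring flags low explained variance as a risk sign, it can help to check the variance captured by the same PCA pipeline the test uses. A small sketch using only the sklearn calls already imported above; `embeddings` stands in for `np.stack(dataset.y_pred(model))`:

```python
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# embeddings: placeholder for np.stack(dataset.y_pred(model))
embeddings_scaled = StandardScaler().fit_transform(embeddings)
pca = PCA(n_components=3).fit(embeddings_scaled)

# Cumulative share of variance captured by PC1..PC3; low values mean the
# pairwise plots above show only a thin slice of the data's structure
print(pca.explained_variance_ratio_.cumsum())
```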
@@ -6,8 +6,10 @@ from abc import abstractmethod
  from typing import List
 
  import numpy as np
+ import plotly.express as px
  from sklearn.metrics.pairwise import cosine_similarity
 
+ from validmind.logging import get_logger
  from validmind.vm_models import (
      Figure,
      ResultSummary,
@@ -17,13 +19,14 @@ from validmind.vm_models import (
      ThresholdTestResult,
  )
 
+ logger = get_logger(__name__)
+
 
  class StabilityAnalysis(ThresholdTest):
      """Base class for embeddings stability analysis tests"""
 
      required_inputs = ["model", "dataset"]
      default_params = {
-         "text_column": None,
          "mean_similarity_threshold": 0.7,
      }
      metadata = {
@@ -61,25 +64,22 @@ class StabilityAnalysis(ThresholdTest):
 
      def run(self):
          # Perturb the test dataset
-         col = self.params.get("text_column")
-
-         if col is None:
-             raise ValueError(
-                 "The `text_column` parameter must be provided to the StabilityAnalysis test."
-             )
+         original = self.inputs.dataset.df
+         perturbed = original.copy()
+         perturbed.update(
+             perturbed.select_dtypes(include="object").applymap(self.perturb_data)
+         )
 
-         original_data_df = self.inputs.dataset.df[col]
-         perturbed_data_df = original_data_df.copy()
-         perturbed_data_df = perturbed_data_df.apply(self.perturb_data)
+         logger.debug(f"Original data: {original}")
+         logger.debug(f"Perturbed data: {perturbed}")
 
          # Compute embeddings for the original and perturbed dataset
-         original_embeddings = self.inputs.model.predict(original_data_df)
-         perturbed_embeddings = self.inputs.model.predict(perturbed_data_df)
+         original_embeddings = self.inputs.dataset.y_pred(self.inputs.model)
+         perturbed_embeddings = np.stack(self.inputs.model.predict(perturbed))
 
          # Compute cosine similarities between original and perturbed embeddings
          similarities = cosine_similarity(
-             original_embeddings,
-             perturbed_embeddings,
+             original_embeddings, perturbed_embeddings
          ).diagonal()
 
          mean = np.mean(similarities)
@@ -91,15 +91,26 @@ class StabilityAnalysis(ThresholdTest):
          # Determine if the test passed based on the mean similarity and threshold
          passed = mean > self.params["mean_similarity_threshold"]
 
-         # Plot the distribution of cosine similarities using plotly
-         import plotly.express as px
-
-         fig = px.histogram(
-             x=similarities.flatten(),
-             nbins=100,
-             title="Cosine Similarity Distribution",
-             labels={"x": "Cosine Similarity"},
-         )
+         figures = [
+             px.histogram(
+                 x=similarities.flatten(),
+                 nbins=100,
+                 title="Cosine Similarity Distribution",
+                 labels={"x": "Cosine Similarity"},
+             ),
+             px.density_contour(
+                 x=similarities.flatten(),
+                 nbinsx=100,
+                 title="Cosine Similarity Density",
+                 labels={"x": "Cosine Similarity"},
+                 marginal_x="histogram",
+             ),
+             px.box(
+                 x=similarities.flatten(),
+                 labels={"x": "Cosine Similarity"},
+                 title="Cosine Similarity Box Plot",
+             ),
+         ]
 
          # For this example, we are not caching the results as done in the reference `run` method
          return self.cache_results(
@@ -121,6 +132,7 @@ class StabilityAnalysis(ThresholdTest):
                      key=self.name,
                      figure=fig,
                  )
+                 for fig in figures
              ],
              passed=passed,
          )
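With the `text_column` parameter gone, the base class now perturbs every object-dtype column and compares dataset-assigned embeddings against re-predicted ones, so a custom perturbation only needs to implement `perturb_data`. A minimal sketch of a hypothetical subclass, mirroring the non-string guard the built-in subclasses add below:

```python
from validmind.tests.model_validation.embeddings.StabilityAnalysis import (
    StabilityAnalysis,
)


class StabilityAnalysisWordReversal(StabilityAnalysis):
    """Hypothetical perturbation: reverse the word order of each text cell."""

    name = "Text Embeddings Stability Analysis to Word Reversal"

    def perturb_data(self, data):
        # Non-string cells pass through untouched, since perturb_data is now
        # applied to every object-dtype column rather than one text column
        if not isinstance(data, str):
            return data
        return " ".join(reversed(data.split()))
```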
@@ -55,6 +55,9 @@ class StabilityAnalysisKeyword(StabilityAnalysis):
      }
 
      def perturb_data(self, data: str):
+         if not isinstance(data, str):
+             return data
+
          # Tokenize the string
          tokens = re.findall(r"[\w']+[.,!?;]?|[\w']+", data)
          modified_tokens = []
@@ -114,9 +114,15 @@ class StabilityAnalysisRandomNoise(StabilityAnalysis):
      name = "Text Embeddings Stability Analysis to Random Noise"
      default_params = {
          **StabilityAnalysis.default_params,
+         "probability": 0.02,
      }
 
-     def perturb_data(self, data, probability=0.02):
+     def perturb_data(self, data):
+         if not isinstance(data, str):
+             return data
+
+         probability = self.params["probability"]
+
          # Tokenize the string based on spaces
          words = data.split()
 
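Because the noise probability moved from a hard-coded method default into `default_params`, it is now tunable per run. A sketch of a hypothetical invocation, assuming `vm.tests.run_test` and placeholder inputs:

```python
import validmind as vm

# "probability" and "mean_similarity_threshold" both live in default_params
# now, so either can be overridden without subclassing
vm.tests.run_test(
    "validmind.model_validation.embeddings.StabilityAnalysisRandomNoise",
    inputs={"model": vm_model, "dataset": vm_ds},  # placeholders
    params={"probability": 0.05, "mean_similarity_threshold": 0.8},
)
```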
@@ -65,6 +65,9 @@ class StabilityAnalysisSynonyms(StabilityAnalysis):
      }
 
      def perturb_data(self, data):
+         if not isinstance(data, str):
+             return data
+
          # download the nltk wordnet
          nltk.download("wordnet", quiet=True)
 
@@ -61,6 +61,9 @@ class StabilityAnalysisTranslation(StabilityAnalysis):
      }
 
      def perturb_data(self, data: str):
+         if not isinstance(data, str):
+             return data
+
          source_lang = self.params["source_lang"]
          target_lang = self.params["target_lang"]
 
@@ -0,0 +1,99 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ import itertools
+
+ import numpy as np
+ import pandas as pd
+ import plotly.express as px
+ from sklearn.manifold import TSNE
+ from sklearn.preprocessing import StandardScaler
+
+
+ def TSNEComponentsPairwisePlots(
+     dataset,
+     model,
+     n_components=2,
+     perplexity=30,
+     title="t-SNE",
+ ):
+     """
+     Plots individual scatter plots for pairwise combinations of t-SNE components of embeddings.
+
+     **Purpose:**
+     This function creates scatter plots for each pairwise combination of t-SNE components derived from model embeddings.
+     t-SNE (t-Distributed Stochastic Neighbor Embedding) is a machine learning algorithm for dimensionality reduction that
+     is particularly well-suited for the visualization of high-dimensional datasets.
+
+     **Test Mechanism:**
+     The function begins by extracting embeddings from the provided dataset using the specified model.
+     These embeddings are then standardized to ensure that each dimension contributes equally to the distance computation.
+     Following this, the t-SNE algorithm is applied to reduce the dimensionality of the data, with the number of components
+     specified by the user. The results are plotted using Plotly, creating scatter plots for each unique pair of components
+     if more than one component is specified.
+
+     **Signs of High Risk:**
+     - If the scatter plots show overlapping clusters or indistinct groupings, it might suggest that the
+     t-SNE parameters (such as perplexity) are not optimally set for the given data, or the data itself does not exhibit clear, separable clusters.
+     - Similar plots across different pairs of components could indicate redundancy in the components generated by t-SNE,
+     suggesting that fewer dimensions might be sufficient to represent the data's structure.
+
+     **Strengths:**
+     - Provides a visual exploration tool for high-dimensional data, simplifying the detection of patterns and clusters which are not apparent in higher dimensions.
+     - Interactive plots generated by Plotly enhance user engagement and allow for a deeper dive into specific areas of the plot, aiding in detailed data analysis.
+
+     **Limitations:**
+     - The effectiveness of t-SNE is highly dependent on the choice of parameters like perplexity and the number of components,
+     which might require tuning and experimentation for optimal results.
+     - t-SNE visualizations can be misleading if interpreted without considering the stochastic nature of the algorithm;
+     two runs with the same parameters might yield different visual outputs, necessitating multiple runs for a consistent interpretation.
+     """
+
+     # Get embeddings from the dataset using the model
+     embeddings = np.stack(dataset.y_pred(model))
+
+     # Standardize the embeddings
+     scaler = StandardScaler()
+     embeddings_scaled = scaler.fit_transform(embeddings)
+
+     # Perform t-SNE
+     tsne = TSNE(n_components=n_components, perplexity=perplexity)
+     tsne_results = tsne.fit_transform(embeddings_scaled)
+
+     # Prepare DataFrame for Plotly
+     tsne_df = pd.DataFrame(
+         tsne_results, columns=[f"Component {i+1}" for i in range(n_components)]
+     )
+
+     # List to store each plot
+     plots = []
+
+     # Create plots for each pair of t-SNE components (if n_components > 1)
+     if n_components > 1:
+         for comp1, comp2 in itertools.combinations(range(1, n_components + 1), 2):
+             fig = px.scatter(
+                 tsne_df,
+                 x=f"Component {comp1}",
+                 y=f"Component {comp2}",
+                 title=f"{title} - {getattr(model, 'input_id', 'Unknown Model')}",
+                 labels={
+                     f"Component {comp1}": f"Component {comp1}",
+                     f"Component {comp2}": f"Component {comp2}",
+                 },
+             )
+             plots.append(fig)
+     else:
+         fig = px.scatter(
+             tsne_df,
+             x="Component 1",
+             y="Component 1",
+             title=f"{title} - {getattr(model, 'input_id', 'Unknown Model')}",
+             labels={
+                 "Component 1": "Component 1",
+             },
+         )
+         plots.append(fig)
+
+     # Return the list of plots as a tuple
+     return tuple(plots)
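The docstring's caveat about stochasticity is easy to verify: the test constructs `TSNE` without a seed, so repeated runs can produce different layouts. A sketch of the underlying sklearn behavior, with `embeddings` as a placeholder array; pinning `random_state` makes the projection reproducible:

```python
import numpy as np
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# embeddings: placeholder for np.stack(dataset.y_pred(model))
scaled = StandardScaler().fit_transform(embeddings)

# Same data + same seed -> identical layout; omitting random_state (as the
# test does) means two runs may differ, so compare repeated runs with care
run_a = TSNE(n_components=2, perplexity=30, random_state=0).fit_transform(scaled)
run_b = TSNE(n_components=2, perplexity=30, random_state=0).fit_transform(scaled)
assert np.allclose(run_a, run_b)
```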
@@ -0,0 +1,131 @@
+ # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+ # See the LICENSE file in the root of this repository for details.
+ # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+ import warnings
+
+ import plotly.express as px
+ from datasets import Dataset
+ from ragas import evaluate
+ from ragas.metrics import answer_correctness
+
+ from validmind import tags, tasks
+
+ from .utils import get_renamed_columns
+
+
+ @tags("ragas", "llm")
+ @tasks("text_qa", "text_generation", "text_summarization")
+ def AnswerCorrectness(
+     dataset,
+     question_column="question",
+     answer_column="answer",
+     ground_truth_column="ground_truth",
+ ):
+     """
+     Evaluates the correctness of answers in a dataset with respect to the provided ground
+     truths and visualizes the results in a histogram.
+
+     The assessment of Answer Correctness involves gauging the accuracy of the generated
+     answer when compared to the ground truth. This evaluation relies on the `ground truth`
+     and the `answer`, with scores ranging from 0 to 1. A higher score indicates a closer
+     alignment between the generated answer and the ground truth, signifying better
+     correctness.
+
+     Answer correctness encompasses two critical aspects: semantic similarity between the
+     generated answer and the ground truth, as well as factual similarity. These aspects
+     are combined using a weighted scheme to formulate the answer correctness score. Users
+     also have the option to employ a `threshold` value to round the resulting score to
+     a binary value (0 or 1) based on the threshold.
+
+     Factual correctness quantifies the factual overlap between the generated answer and
+     the ground truth answer. This is done using the concepts of:
+
+     - TP (True Positive): Facts or statements that are present in both the ground truth
+     and the generated answer.
+     - FP (False Positive): Facts or statements that are present in the generated answer
+     but not in the ground truth.
+     - FN (False Negative): Facts or statements that are present in the ground truth but
+     not in the generated answer.
+
+     ### Configuring Columns
+
+     This metric requires specific columns to be present in the dataset:
+     - `question` (str): The text prompt or query that was input into the model.
+     - `answer` (str): The text response generated by the model.
+     - `ground_truth` (str): The ground truth answer that the generated answer is compared
+     against.
+
+     If the above data is not in the appropriate column, you can specify different column
+     names for these fields using the parameters `question_column`, `answer_column`, and
+     `ground_truth_column`.
+
+     For example, if your dataset has this data stored in different columns, you can
+     pass the following parameters:
+     ```python
+     params = {
+         "question_column": "input_text",
+         "answer_column": "output_text",
+         "ground_truth_column": "human_answer",
+     }
+     ```
+
+     If answer and contexts are stored as a dictionary in another column, specify the
+     column and key like this:
+     ```python
+     pred_col = dataset.prediction_column(model)
+     params = {
+         "answer_column": f"{pred_col}.generated_answer",
+         "ground_truth_column": f"{pred_col}.contexts",
+     }
+     ```
+
+     For more complex data structures, you can use a function to extract the answers:
+     ```python
+     pred_col = dataset.prediction_column(model)
+     params = {
+         "answer_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
+         "ground_truth_column": lambda row: [row[pred_col]["context_message"]],
+     }
+     ```
+     """
+     warnings.filterwarnings(
+         "ignore",
+         category=FutureWarning,
+         message="promote has been superseded by promote_options='default'.",
+     )
+
+     required_columns = {
+         "question": question_column,
+         "answer": answer_column,
+         "ground_truth": ground_truth_column,
+     }
+
+     df = get_renamed_columns(dataset.df, required_columns)
+
+     result_df = evaluate(
+         Dataset.from_pandas(df), metrics=[answer_correctness]
+     ).to_pandas()
+
+     fig_histogram = px.histogram(x=result_df["answer_correctness"].to_list(), nbins=10)
+     fig_box = px.box(x=result_df["answer_correctness"].to_list())
+
+     return (
+         {
+             "Scores": result_df[
+                 ["question", "answer", "ground_truth", "answer_correctness"]
+             ],
+             "Aggregate Scores": [
+                 {
+                     "Mean Score": result_df["answer_correctness"].mean(),
+                     "Median Score": result_df["answer_correctness"].median(),
+                     "Max Score": result_df["answer_correctness"].max(),
+                     "Min Score": result_df["answer_correctness"].min(),
+                     "Standard Deviation": result_df["answer_correctness"].std(),
+                     "Count": len(result_df),
+                 }
+             ],
+         },
+         fig_histogram,
+         fig_box,
+     )
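Tying the column-mapping docs together, a hedged sketch of running this metric against a dataset whose fields use non-default names; the params come straight from the docstring example above, while `vm.tests.run_test` and `vm_ds` are assumed setup:

```python
import validmind as vm

vm.tests.run_test(
    "validmind.model_validation.ragas.AnswerCorrectness",
    inputs={"dataset": vm_ds},  # placeholder VM dataset
    params={
        "question_column": "input_text",
        "answer_column": "output_text",
        "ground_truth_column": "human_answer",
    },
)
```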