validmind 2.5.24__py3-none-any.whl → 2.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.7.dist-info/METADATA +137 -0
  179. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.24.dist-info/METADATA +0 -118
  196. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
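Notable in the list above: several ragas tests were renamed (`AspectCritique → AspectCritic`, `AnswerRelevance → ResponseRelevancy`, `AnswerSimilarity → SemanticSimilarity`) and `DFGLSArch` became `DickeyFullerGLS`, so code that runs tests by ID needs the new names. A minimal sketch of such an update, assuming `run_test` keeps the test-ID-plus-`inputs` calling convention it had in 2.5.x; the dataset input ID below is a placeholder:

```python
from validmind.tests import run_test

# 2.5.24 test ID: "validmind.model_validation.ragas.AspectCritique"
# 2.6.7 test ID:  "validmind.model_validation.ragas.AspectCritic"
result = run_test(
    "validmind.model_validation.ragas.AspectCritic",
    inputs={"dataset": "my_rag_dataset"},  # placeholder dataset input_id
)
```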
validmind/tests/data_validation/AutoSeasonality.py (deleted)
@@ -1,190 +0,0 @@
- # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
- # See the LICENSE file in the root of this repository for details.
- # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
- import numpy as np
- import pandas as pd
- from statsmodels.tsa.seasonal import seasonal_decompose
-
- from validmind.logging import get_logger
- from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
-
- logger = get_logger(__name__)
-
-
- class AutoSeasonality(Metric):
-     """
-     Automatically identifies and quantifies optimal seasonality in time series data to improve forecasting model
-     performance.
-
-     ### Purpose
-
-     The AutoSeasonality test aims to automatically detect and identify the best seasonal order or period for each
-     variable in a time series dataset. This detection helps to quantify periodic patterns and seasonality that reoccur
-     at fixed intervals in the data. Understanding the seasonality component can drastically improve prediction
-     accuracy, which is especially significant for forecasting-based models.
-
-     ### Test Mechanism
-
-     This test uses the seasonal decomposition method from the Statsmodels Python library. The function takes the
-     'additive' model type for each variable and applies it within the prescribed range of 'min_period' and
-     'max_period'. It decomposes the seasonality for each period in the range and calculates the mean residual error for
-     each period. The seasonal period that results in the minimum residuals is marked as the 'Best Period'. The test
-     results include the 'Best Period', the calculated residual errors, and a determination of 'Seasonality' or 'No
-     Seasonality'.
-
-     ### Signs of High Risk
-
-     - If the optimal seasonal period (or 'Best Period') is consistently at the maximum or minimum limit of the offered
-     range for a majority of variables, it may suggest that the range set does not adequately capture the true seasonal
-     pattern in the series.
-     - A high average 'Residual Error' for the selected 'Best Period' could indicate issues with the model's performance.
-
-     ### Strengths
-
-     - The metric offers an automatic approach to identifying and quantifying the optimal seasonality, providing a
-     robust method for analyzing time series datasets.
-     - It is applicable to multiple variables in a dataset, providing a comprehensive evaluation of each variable's
-     seasonality.
-     - The use of concrete and measurable statistical methods improves the objectivity and reproducibility of the model.
-
-     ### Limitations
-
-     - This AutoSeasonality metric may not be suitable if the time series data exhibits random walk behavior or lacks
-     clear seasonality, as the seasonal decomposition model may not be appropriate.
-     - The defined range for the seasonal period (min_period and max_period) can influence the outcomes. If the actual
-     seasonality period lies outside this range, this method will not be able to identify the true seasonal order.
-     - This metric may not be able to fully interpret complex patterns that go beyond the simple additive model for
-     seasonal decomposition.
-     - The tool may incorrectly infer seasonality if random fluctuations in the data match the predefined seasonal
-     period range.
-     """
-
-     name = "auto_seasonality"
-     required_inputs = ["dataset"]
-     default_params = {"min_period": 1, "max_period": 4}
-     tasks = ["regression"]
-     tags = [
-         "time_series_data",
-         "forecasting",
-         "statistical_test",
-         "statsmodels",
-         "seasonality",
-     ]
-
-     def evaluate_seasonal_periods(self, series, min_period, max_period):
-         seasonal_periods = []
-         residual_errors = []
-
-         for period in range(min_period, max_period + 1):
-             try:
-                 sd = seasonal_decompose(series, model="additive", period=period)
-                 residual_error = np.abs(sd.resid.dropna()).mean()
-
-                 seasonal_periods.append(period)
-                 residual_errors.append(residual_error)
-             except Exception as e:
-                 logger.error(f"Error evaluating period {period} for series: {e}")
-
-         return seasonal_periods, residual_errors
-
-     def run(self):
-         # Parse input parameters
-         if "min_period" not in self.params:
-             raise ValueError("min_period must be provided in params")
-         min_period = int(self.params["min_period"])
-
-         if "max_period" not in self.params:
-             raise ValueError("max_period must be provided in params")
-         max_period = int(self.params["max_period"])
-
-         df = self.inputs.dataset.df
-
-         # Create an empty DataFrame to store the results
-         summary_auto_seasonality = pd.DataFrame()
-
-         for col_name, col in df.items():
-             series = col.dropna()
-
-             # Evaluate seasonal periods
-             seasonal_periods, residual_errors = self.evaluate_seasonal_periods(
-                 series, min_period, max_period
-             )
-
-             for i, period in enumerate(seasonal_periods):
-                 decision = "Seasonality" if period > 1 else "No Seasonality"
-                 summary_auto_seasonality = pd.concat(
-                     [
-                         summary_auto_seasonality,
-                         pd.DataFrame(
-                             [
-                                 {
-                                     "Variable": col_name,
-                                     "Seasonal Period": period,
-                                     "Residual Error": residual_errors[i],
-                                     "Decision": decision,
-                                 }
-                             ]
-                         ),
-                     ],
-                     ignore_index=True,
-                 )
-
-         # Convert the 'Seasonal Period' column to integer
-         summary_auto_seasonality["Seasonal Period"] = summary_auto_seasonality[
-             "Seasonal Period"
-         ].astype(int)
-
-         # Create a DataFrame to store the best seasonality period for each variable
-         best_seasonality_period = pd.DataFrame()
-
-         for variable in summary_auto_seasonality["Variable"].unique():
-             temp_df = summary_auto_seasonality[
-                 summary_auto_seasonality["Variable"] == variable
-             ]
-             best_row = temp_df[
-                 temp_df["Residual Error"] == temp_df["Residual Error"].min()
-             ]
-             best_seasonality_period = pd.concat([best_seasonality_period, best_row])
-
-         # Rename the 'Seasonal Period' column to 'Best Period'
-         best_seasonality_period = best_seasonality_period.rename(
-             columns={"Seasonal Period": "Best Period"}
-         )
-
-         # Convert the 'Best Period' column to integer
-         best_seasonality_period["Best Period"] = best_seasonality_period[
-             "Best Period"
-         ].astype(int)
-
-         return self.cache_results(
-             {
-                 "auto_seasonality": summary_auto_seasonality.to_dict(orient="records"),
-                 "best_seasonality_period": best_seasonality_period.to_dict(
-                     orient="records"
-                 ),
-             }
-         )
-
-     def summary(self, metric_value):
-         """
-         Build one table for summarizing the auto seasonality results
-         and another for the best seasonality period results
-         """
-         summary_auto_seasonality = metric_value["auto_seasonality"]
-         best_seasonality_period = metric_value["best_seasonality_period"]
-
-         return ResultSummary(
-             results=[
-                 ResultTable(
-                     data=summary_auto_seasonality,
-                     metadata=ResultTableMetadata(title="Auto Seasonality Results"),
-                 ),
-                 ResultTable(
-                     data=best_seasonality_period,
-                     metadata=ResultTableMetadata(
-                         title="Best Seasonality Period Results"
-                     ),
-                 ),
-             ]
-         )
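For reference, the heuristic that the removed `AutoSeasonality` test implemented is small enough to reproduce outside the framework: decompose each series additively for every candidate period and keep the period with the lowest mean absolute residual. A standalone sketch of that idea, with synthetic data and a hypothetical `best_seasonal_period` helper (not part of the validmind API):

```python
# Sketch of the removed AutoSeasonality heuristic: try each candidate period,
# decompose additively, and keep the period with the smallest mean |residual|.
import numpy as np
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose


def best_seasonal_period(series, min_period=2, max_period=6):
    errors = {}
    for period in range(min_period, max_period + 1):
        try:
            result = seasonal_decompose(series.dropna(), model="additive", period=period)
            errors[period] = np.abs(result.resid.dropna()).mean()
        except Exception:  # some periods may be invalid for short series
            continue
    return min(errors, key=errors.get) if errors else None


# Synthetic series with a repeating length-4 pattern plus noise
rng = np.random.default_rng(0)
values = np.tile([10.0, 12.0, 15.0, 11.0], 30) + rng.normal(0, 0.5, 120)
print(best_seasonal_period(pd.Series(values)))  # expected: 4
```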
validmind/tests/metadata.py (deleted)
@@ -1,59 +0,0 @@
- # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
- # See the LICENSE file in the root of this repository for details.
- # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
- import pandas as pd
-
- from validmind.utils import format_dataframe
-
- from .load import list_tests
-
-
- def list_tags():
-     """
-     List unique tags from all test classes.
-     """
-
-     unique_tags = set()
-
-     for test in list_tests(__as_class=True):
-         unique_tags.update(test.tags)
-
-     return list(unique_tags)
-
-
- def list_tasks_and_tags():
-     """
-     List all task types and their associated tags, with one row per task type and
-     all tags for a task type in one row.
-
-     Returns:
-         pandas.DataFrame: A DataFrame with 'Task Type' and concatenated 'Tags'.
-     """
-     task_tags_dict = {}
-
-     for test in list_tests(__as_class=True):
-         for task in test.tasks:
-             task_tags_dict.setdefault(task, set()).update(test.tags)
-
-     return format_dataframe(
-         pd.DataFrame(
-             [
-                 {"Task": task, "Tags": ", ".join(tags)}
-                 for task, tags in task_tags_dict.items()
-             ]
-         )
-     )
-
-
- def list_tasks():
-     """
-     List unique tasks from all test classes.
-     """
-
-     unique_tasks = set()
-
-     for test in list_tests(__as_class=True):
-         unique_tasks.update(test.tasks)
-
-     return list(unique_tasks)
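The removed `metadata` helpers simply aggregate the `tasks` and `tags` attributes of the registered tests. A generic sketch of that aggregation over any iterable of test-like objects (the attribute names mirror the removed code; the input collection here is hypothetical):

```python
# Collect the unique tags associated with each task type, as the removed
# list_tasks_and_tags() helper did, but for an arbitrary collection of tests.
from collections import defaultdict


def tasks_and_tags(tests):
    task_tags = defaultdict(set)
    for test in tests:
        for task in getattr(test, "tasks", []):
            task_tags[task].update(getattr(test, "tags", []))
    return {task: ", ".join(sorted(tags)) for task, tags in task_tags.items()}
```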
validmind/tests/model_validation/embeddings/StabilityAnalysis.py (deleted)
@@ -1,176 +0,0 @@
- # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
- # See the LICENSE file in the root of this repository for details.
- # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
- from abc import abstractmethod
- from typing import List
-
- import numpy as np
- import plotly.express as px
- from sklearn.metrics.pairwise import cosine_similarity
-
- from validmind.logging import get_logger
- from validmind.vm_models import (
-     Figure,
-     ResultSummary,
-     ResultTable,
-     ResultTableMetadata,
-     ThresholdTest,
-     ThresholdTestResult,
- )
-
- logger = get_logger(__name__)
-
-
- class StabilityAnalysis(ThresholdTest):
-     """
-     Assesses the stability of embeddings generated by a model when faced with perturbed input data to ensure robustness
-     and consistency.
-
-     ### Purpose
-
-     The Embedding Stability test evaluates the robustness of the embeddings generated by a model when the input text is
-     perturbed. By comparing the cosine similarities between the original and perturbed embeddings, it gauges the
-     model's ability to maintain consistent semantic representations under slight variations in the input data.
-
-     ### Test Mechanism
-
-     This test works by:
-
-     - Perturbing the original text data.
-     - Generating embeddings for both the original and perturbed datasets using the model.
-     - Calculating the cosine similarities between the original and perturbed embeddings.
-     - Analyzing the distribution of these similarities (mean, min, max, median, and standard deviation).
-     - Determining the test result based on whether the mean similarity exceeds a predefined threshold (default is 0.7).
-
-     ### Signs of High Risk
-
-     - Mean cosine similarity below the threshold (default is 0.7).
-     - Large standard deviation of cosine similarities, indicating inconsistency.
-     - Minimum similarity score significantly lower than expected.
-     - Failure to pass the threshold test based on the mean similarity.
-
-     ### Strengths
-
-     - Provides a quantitative measure of embedding stability.
-     - Helps in identifying weaknesses in the model's ability to handle minor input variations.
-     - Visualization of similarity distributions aids in comprehensive analysis.
-     - Easy to interpret results with clear pass/fail criteria.
-
-     ### Limitations
-
-     - Relies on the chosen perturbation method, which may not cover all possible variations in real-world data.
-     - Thresholds for similarity might need adjustment based on specific application requirements.
-     - Cosine similarity, while useful, may not capture all aspects of semantic stability.
-     """
-
-     required_inputs = ["model", "dataset"]
-     default_params = {
-         "mean_similarity_threshold": 0.7,
-     }
-     tasks = ["feature_extraction"]
-     tags = ["llm", "text_data", "embeddings", "visualization"]
-
-     @abstractmethod
-     def perturb_data(self, data: str) -> str:
-         """Perturb a string of text (overriden by subclasses)"""
-         pass
-
-     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-         results_table = [
-             {
-                 "Mean Similarity": result.values["mean_similarity"],
-                 "Min Similarity": result.values["min_similarity"],
-                 "Max Similarity": result.values["max_similarity"],
-                 "Median Similarity": result.values["median_similarity"],
-                 "Std Similarity": result.values["std_similarity"],
-                 "Pass/Fail": "Pass" if result.passed else "Fail",
-             }
-             for result in results
-         ]
-         return ResultSummary(
-             results=[
-                 ResultTable(
-                     data=results_table,
-                     metadata=ResultTableMetadata(
-                         title="Stability Analysis Results for Embeddings Model"
-                     ),
-                 )
-             ]
-         )
-
-     def run(self):
-         # Perturb the test dataset
-         text_column = self.inputs.dataset.text_column
-         original = self.inputs.dataset.df[[text_column]]
-         perturbed = original.copy()
-         perturbed.update(
-             perturbed.select_dtypes(include="object").applymap(self.perturb_data)
-         )
-
-         logger.debug(f"Original data: {original}")
-         logger.debug(f"Perturbed data: {perturbed}")
-
-         # Compute embeddings for the original and perturbed dataset
-         original_embeddings = self.inputs.dataset.y_pred(self.inputs.model)
-         perturbed_embeddings = np.stack(self.inputs.model.predict(perturbed))
-
-         # Compute cosine similarities between original and perturbed embeddings
-         similarities = cosine_similarity(
-             original_embeddings, perturbed_embeddings
-         ).diagonal()
-
-         mean = np.mean(similarities)
-         min = np.min(similarities)
-         max = np.max(similarities)
-         median = np.median(similarities)
-         std = np.std(similarities)
-
-         # Determine if the test passed based on the mean similarity and threshold
-         passed = mean > self.params["mean_similarity_threshold"]
-
-         figures = [
-             px.histogram(
-                 x=similarities.flatten(),
-                 nbins=100,
-                 title="Cosine Similarity Distribution",
-                 labels={"x": "Cosine Similarity"},
-             ),
-             px.density_contour(
-                 x=similarities.flatten(),
-                 nbinsx=100,
-                 title="Cosine Similarity Density",
-                 labels={"x": "Cosine Similarity"},
-                 marginal_x="histogram",
-             ),
-             px.box(
-                 x=similarities.flatten(),
-                 labels={"x": "Cosine Similarity"},
-                 title="Cosine Similarity Box Plot",
-             ),
-         ]
-
-         # For this example, we are not caching the results as done in the reference `run` method
-         return self.cache_results(
-             [
-                 ThresholdTestResult(
-                     passed=passed,
-                     values={
-                         "mean_similarity": mean,
-                         "min_similarity": min,
-                         "max_similarity": max,
-                         "median_similarity": median,
-                         "std_similarity": std,
-                     },
-                 )
-             ],
-             figures=[
-                 Figure(
-                     for_object=self,
-                     key=self.name,
-                     figure=fig,
-                 )
-                 for fig in figures
-             ],
-             passed=passed,
-         )
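The check performed by the removed `StabilityAnalysis` base class reduces to a row-wise cosine-similarity comparison between original and perturbed embeddings, passing when the mean similarity clears a threshold (0.7 by default). A minimal sketch of that check on synthetic vectors:

```python
# Row-wise cosine similarity between original and perturbed embeddings,
# with the mean-similarity pass/fail rule used by the removed base class.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(42)
original = rng.normal(size=(100, 64))                                # stand-in embeddings
perturbed = original + rng.normal(scale=0.05, size=original.shape)   # mild perturbation

similarities = cosine_similarity(original, perturbed).diagonal()
passed = similarities.mean() > 0.7
print(f"mean={similarities.mean():.3f}, std={similarities.std():.3f}, passed={passed}")
```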
validmind/tests/model_validation/ragas/ContextUtilization.py (deleted)
@@ -1,161 +0,0 @@
- # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
- # See the LICENSE file in the root of this repository for details.
- # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
- import warnings
-
- import plotly.express as px
- from datasets import Dataset
-
- from validmind import tags, tasks
- from validmind.errors import MissingDependencyError
-
- from .utils import get_ragas_config, get_renamed_columns
-
- try:
-     from ragas import evaluate
-     from ragas.metrics import context_utilization
- except ImportError as e:
-     raise MissingDependencyError(
-         "Missing required package `ragas` for ContextUtilization. "
-         "Please run `pip install validmind[llm]` to use LLM tests",
-         required_dependencies=["ragas"],
-         extra="llm",
-     ) from e
-
-
- @tags("ragas", "llm", "retrieval_performance")
- @tasks("text_qa", "text_generation", "text_summarization", "text_classification")
- def ContextUtilization(
-     dataset,
-     question_column: str = "question",
-     contexts_column: str = "contexts",
-     answer_column: str = "answer",
- ):  # noqa: B950
-     """
-     Assesses how effectively relevant context chunks are utilized in generating answers by evaluating their ranking
-     within the provided contexts.
-
-     ### Purpose
-
-     The Context Utilization test evaluates whether all of the answer-relevant items present in the contexts are ranked
-     higher within the provided retrieval results. This metric is essential for assessing the performance of models,
-     especially those involved in tasks such as text QA, text generation, text summarization, and text classification.
-
-     ### Test Mechanism
-
-     The test calculates Context Utilization using the formula:
-
-     $$
-     \\text{Context Utilization@K} = \\frac{\\sum_{k=1}^{K} \\left( \\text{Precision@k} \\times v_k \\right)}{\\text{Total number of relevant items in the top } K \\text{ results}}
-     $$
-     $$
-     \\text{Precision@k} = {\\text{true positives@k} \\over (\\text{true positives@k} + \\text{false positives@k})}
-     $$
-
-     Where $K$ is the total number of chunks in `contexts` and $v_k \\in \\{0, 1\\}$ is the relevance indicator at rank $k$.
-
-
-     This test uses columns for questions, contexts, and answers from the dataset and computes context utilization
-     scores, generating a histogram and box plot for visualization.
-
-     #### Configuring Columns
-
-     This metric requires the following columns in your dataset:
-
-     - `question` (str): The text query that was input into the model.
-     - `contexts` (List[str]): A list of text contexts which are retrieved and which will be evaluated to
-     make sure they contain relevant info in the correct order.
-     - `answer` (str): The llm-generated response for the input `question`.
-
-     If the above data is not in the appropriate column, you can specify different column
-     names for these fields using the parameters `question_column`, `contexts_column`
-     and `ground_truth_column`.
-
-     For example, if your dataset has this data stored in different columns, you can
-     pass the following parameters:
-     ```python
-     {
-         "question_column": "question",
-         "contexts_column": "context_info"
-         "ground_truth_column": "my_ground_truth_col",
-     }
-     ```
-
-     If the data is stored as a dictionary in another column, specify the column and key
-     like this:
-     ```python
-     pred_col = dataset.prediction_column(model)
-     params = {
-         "contexts_column": f"{pred_col}.contexts",
-         "ground_truth_column": "my_ground_truth_col",
-     }
-     ```
-
-     For more complex situations, you can use a function to extract the data:
-     ```python
-     pred_col = dataset.prediction_column(model)
-     params = {
-         "contexts_column": lambda x: [x[pred_col]["context_message"]],
-         "ground_truth_column": "my_ground_truth_col",
-     }
-     ```
-
-     ### Signs of High Risk
-
-     - Very low mean or median context utilization scores, indicating poor usage of retrieved contexts.
-     - High standard deviation, suggesting inconsistent model performance.
-     - Low or minimal max scores, pointing to the model's failure to rank relevant contexts at top positions.
-
-     ### Strengths
-
-     - Quantifies the rank of relevant context chunks in generating responses.
-     - Provides clear visualizations through histograms and box plots for ease of interpretation.
-     - Adapts to different dataset schema by allowing configurable column names.
-
-     ### Limitations
-
-     - Assumes the relevance of context chunks is binary and may not capture nuances of partial relevance.
-     - Requires proper context retrieval to be effective; irrelevant context chunks can skew the results.
-     - Dependent on large sample sizes to provide stable and reliable estimates of utilization performance.
-     """
-     warnings.filterwarnings(
-         "ignore",
-         category=FutureWarning,
-         message="promote has been superseded by promote_options='default'.",
-     )
-
-     required_columns = {
-         "question": question_column,
-         "contexts": contexts_column,
-         "answer": answer_column,
-     }
-
-     df = get_renamed_columns(dataset._df, required_columns)
-
-     result_df = evaluate(
-         Dataset.from_pandas(df), metrics=[context_utilization], **get_ragas_config()
-     ).to_pandas()
-
-     fig_histogram = px.histogram(x=result_df["context_utilization"].to_list(), nbins=10)
-     fig_box = px.box(x=result_df["context_utilization"].to_list())
-
-     return (
-         {
-             # "Scores (will not be uploaded to UI)": result_df[
-             #     ["question", "contexts", "answer", "context_utilization"]
-             # ],
-             "Aggregate Scores": [
-                 {
-                     "Mean Score": result_df["context_utilization"].mean(),
-                     "Median Score": result_df["context_utilization"].median(),
-                     "Max Score": result_df["context_utilization"].max(),
-                     "Min Score": result_df["context_utilization"].min(),
-                     "Standard Deviation": result_df["context_utilization"].std(),
-                     "Count": result_df.shape[0],
-                 }
-             ],
-         },
-         fig_histogram,
-         fig_box,
-     )
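The formula quoted in the removed docstring is the average of Precision@k over the ranks at which a relevant context chunk appears (i.e. average precision over the retrieved chunks). A toy computation with a made-up relevance vector:

```python
# Average Precision@k over the relevant ranks, as in the Context Utilization@K
# formula above; `relevance` is a list of 0/1 flags in retrieval rank order.
def context_utilization_at_k(relevance):
    hits, score = 0, 0.0
    for k, v in enumerate(relevance, start=1):
        if v:
            hits += 1
            score += hits / k  # Precision@k, counted only at relevant ranks
    return score / hits if hits else 0.0


print(context_utilization_at_k([1, 0, 1, 0]))  # (1/1 + 2/3) / 2 ≈ 0.833
```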
validmind/tests/model_validation/sklearn/ClusterPerformance.py (deleted)
@@ -1,80 +0,0 @@
- # Copyright © 2023-2024 ValidMind Inc. All rights reserved.
- # See the LICENSE file in the root of this repository for details.
- # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
-
- from dataclasses import dataclass
-
- from validmind.vm_models import Metric
-
-
- @dataclass
- class ClusterPerformance(Metric):
-     """
-     Evaluates and compares a clustering model's performance on training and testing datasets using multiple defined
-     metrics.
-
-     ### Purpose
-
-     The Cluster Performance test evaluates the performance of a clustering model on both the training and testing
-     datasets. It assesses how well the model defines, forms, and distinguishes clusters of data.
-
-     ### Test Mechanism
-
-     The test mechanism involves predicting the clusters of the training and testing datasets using the clustering
-     model. After prediction, performance metrics defined in the `metric_info()` method are calculated against the true
-     labels of the datasets. The results for each metric for both datasets are then collated and returned in a
-     summarized table form listing each metric along with its corresponding train and test values.
-
-     ### Signs of High Risk
-
-     - High discrepancy between the performance metric values on the training and testing datasets.
-     - Low performance metric values on both the training and testing datasets.
-     - Consistent deterioration of performance across different metrics.
-
-     ### Strengths
-
-     - Tests the model's performance on both training and testing datasets, helping to identify overfitting or
-     underfitting.
-     - Allows for the use of a broad range of performance metrics, providing a comprehensive evaluation.
-     - Returns a summarized table, making it easy to compare performance across different metrics and datasets.
-
-     ### Limitations
-
-     - The `metric_info()` method needs to be properly overridden in a subclass and metrics must be manually defined.
-     - The test may not capture the model's performance well if clusters are not well-separated or the model struggles
-     with certain clusters.
-     - Does not consider the computational and time complexity of the model.
-     - Binary comparison (train and test) might not capture performance changes under different circumstances or dataset
-     categories.
-     """
-
-     name = "cluster_performance_metrics"
-     required_inputs = ["model", "dataset"]
-     tasks = ["clustering"]
-     tags = [
-         "sklearn",
-         "model_performance",
-     ]
-
-     def cluster_performance_metrics(self, y_true_train, y_pred_train, metric_info):
-         y_true_train = y_true_train.astype(y_pred_train.dtype).flatten()
-         results = []
-         for metric_name, metric_fcn in metric_info.items():
-             train_value = metric_fcn(list(y_true_train), y_pred_train)
-             results.append({metric_name: train_value})
-         return results
-
-     def metric_info(self):
-         raise NotImplementedError
-
-     def run(self):
-         y_true_train = self.inputs.dataset.y
-         class_pred_train = self.inputs.dataset.y_pred(self.inputs.model)
-         y_true_train = y_true_train.astype(class_pred_train.dtype)
-
-         results = self.cluster_performance_metrics(
-             y_true_train,
-             class_pred_train,
-             self.metric_info(),
-         )
-         return self.cache_results(metric_value=results)
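The removed `ClusterPerformance` base class describes comparing clustering metrics against true labels on training and test data; its subclasses supplied the metrics via `metric_info()`. A sketch of the same comparison using sklearn metrics directly, on synthetic data (not the validmind API):

```python
# Train-vs-test comparison of clustering quality metrics against known labels.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score
from sklearn.model_selection import train_test_split

X, y = make_blobs(n_samples=600, centers=3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X_train)

for name, metric in [("ARI", adjusted_rand_score), ("AMI", adjusted_mutual_info_score)]:
    train_score = metric(y_train, model.predict(X_train))
    test_score = metric(y_test, model.predict(X_test))
    print(f"{name}: train={train_score:.3f}, test={test_score:.3f}")
```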