validmind 2.5.24__py3-none-any.whl → 2.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.7.dist-info/METADATA +137 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.24.dist-info/METADATA +0 -118
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
validmind/tests/data_validation/AutoSeasonality.py
DELETED
@@ -1,190 +0,0 @@
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

import numpy as np
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose

from validmind.logging import get_logger
from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata

logger = get_logger(__name__)


class AutoSeasonality(Metric):
    """
    Automatically identifies and quantifies optimal seasonality in time series data to improve forecasting model
    performance.

    ### Purpose

    The AutoSeasonality test aims to automatically detect and identify the best seasonal order or period for each
    variable in a time series dataset. This detection helps to quantify periodic patterns and seasonality that reoccur
    at fixed intervals in the data. Understanding the seasonality component can drastically improve prediction
    accuracy, which is especially significant for forecasting-based models.

    ### Test Mechanism

    This test uses the seasonal decomposition method from the Statsmodels Python library. The function takes the
    'additive' model type for each variable and applies it within the prescribed range of 'min_period' and
    'max_period'. It decomposes the seasonality for each period in the range and calculates the mean residual error for
    each period. The seasonal period that results in the minimum residuals is marked as the 'Best Period'. The test
    results include the 'Best Period', the calculated residual errors, and a determination of 'Seasonality' or 'No
    Seasonality'.

    ### Signs of High Risk

    - If the optimal seasonal period (or 'Best Period') is consistently at the maximum or minimum limit of the offered
    range for a majority of variables, it may suggest that the range set does not adequately capture the true seasonal
    pattern in the series.
    - A high average 'Residual Error' for the selected 'Best Period' could indicate issues with the model's performance.

    ### Strengths

    - The metric offers an automatic approach to identifying and quantifying the optimal seasonality, providing a
    robust method for analyzing time series datasets.
    - It is applicable to multiple variables in a dataset, providing a comprehensive evaluation of each variable's
    seasonality.
    - The use of concrete and measurable statistical methods improves the objectivity and reproducibility of the model.

    ### Limitations

    - This AutoSeasonality metric may not be suitable if the time series data exhibits random walk behavior or lacks
    clear seasonality, as the seasonal decomposition model may not be appropriate.
    - The defined range for the seasonal period (min_period and max_period) can influence the outcomes. If the actual
    seasonality period lies outside this range, this method will not be able to identify the true seasonal order.
    - This metric may not be able to fully interpret complex patterns that go beyond the simple additive model for
    seasonal decomposition.
    - The tool may incorrectly infer seasonality if random fluctuations in the data match the predefined seasonal
    period range.
    """

    name = "auto_seasonality"
    required_inputs = ["dataset"]
    default_params = {"min_period": 1, "max_period": 4}
    tasks = ["regression"]
    tags = [
        "time_series_data",
        "forecasting",
        "statistical_test",
        "statsmodels",
        "seasonality",
    ]

    def evaluate_seasonal_periods(self, series, min_period, max_period):
        seasonal_periods = []
        residual_errors = []

        for period in range(min_period, max_period + 1):
            try:
                sd = seasonal_decompose(series, model="additive", period=period)
                residual_error = np.abs(sd.resid.dropna()).mean()

                seasonal_periods.append(period)
                residual_errors.append(residual_error)
            except Exception as e:
                logger.error(f"Error evaluating period {period} for series: {e}")

        return seasonal_periods, residual_errors

    def run(self):
        # Parse input parameters
        if "min_period" not in self.params:
            raise ValueError("min_period must be provided in params")
        min_period = int(self.params["min_period"])

        if "max_period" not in self.params:
            raise ValueError("max_period must be provided in params")
        max_period = int(self.params["max_period"])

        df = self.inputs.dataset.df

        # Create an empty DataFrame to store the results
        summary_auto_seasonality = pd.DataFrame()

        for col_name, col in df.items():
            series = col.dropna()

            # Evaluate seasonal periods
            seasonal_periods, residual_errors = self.evaluate_seasonal_periods(
                series, min_period, max_period
            )

            for i, period in enumerate(seasonal_periods):
                decision = "Seasonality" if period > 1 else "No Seasonality"
                summary_auto_seasonality = pd.concat(
                    [
                        summary_auto_seasonality,
                        pd.DataFrame(
                            [
                                {
                                    "Variable": col_name,
                                    "Seasonal Period": period,
                                    "Residual Error": residual_errors[i],
                                    "Decision": decision,
                                }
                            ]
                        ),
                    ],
                    ignore_index=True,
                )

        # Convert the 'Seasonal Period' column to integer
        summary_auto_seasonality["Seasonal Period"] = summary_auto_seasonality[
            "Seasonal Period"
        ].astype(int)

        # Create a DataFrame to store the best seasonality period for each variable
        best_seasonality_period = pd.DataFrame()

        for variable in summary_auto_seasonality["Variable"].unique():
            temp_df = summary_auto_seasonality[
                summary_auto_seasonality["Variable"] == variable
            ]
            best_row = temp_df[
                temp_df["Residual Error"] == temp_df["Residual Error"].min()
            ]
            best_seasonality_period = pd.concat([best_seasonality_period, best_row])

        # Rename the 'Seasonal Period' column to 'Best Period'
        best_seasonality_period = best_seasonality_period.rename(
            columns={"Seasonal Period": "Best Period"}
        )

        # Convert the 'Best Period' column to integer
        best_seasonality_period["Best Period"] = best_seasonality_period[
            "Best Period"
        ].astype(int)

        return self.cache_results(
            {
                "auto_seasonality": summary_auto_seasonality.to_dict(orient="records"),
                "best_seasonality_period": best_seasonality_period.to_dict(
                    orient="records"
                ),
            }
        )

    def summary(self, metric_value):
        """
        Build one table for summarizing the auto seasonality results
        and another for the best seasonality period results
        """
        summary_auto_seasonality = metric_value["auto_seasonality"]
        best_seasonality_period = metric_value["best_seasonality_period"]

        return ResultSummary(
            results=[
                ResultTable(
                    data=summary_auto_seasonality,
                    metadata=ResultTableMetadata(title="Auto Seasonality Results"),
                ),
                ResultTable(
                    data=best_seasonality_period,
                    metadata=ResultTableMetadata(
                        title="Best Seasonality Period Results"
                    ),
                ),
            ]
        )
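The core of the removed test is simple: decompose each series additively over a range of candidate periods and keep the period with the smallest mean absolute residual. A minimal standalone sketch of that idea, outside the `Metric` class; the helper name and the synthetic series are illustrative, not part of the package:

```python
import numpy as np
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose


def best_seasonal_period(series: pd.Series, min_period: int = 1, max_period: int = 4):
    """Return the candidate period with the lowest mean absolute residual (or None)."""
    errors = {}
    for period in range(min_period, max_period + 1):
        try:
            sd = seasonal_decompose(series, model="additive", period=period)
            errors[period] = np.abs(sd.resid.dropna()).mean()
        except Exception:
            continue  # skip periods the decomposition cannot handle, as the original test did
    return min(errors, key=errors.get) if errors else None


# Synthetic daily series with a strong period-4 cycle plus noise
values = np.sin(2 * np.pi * np.arange(120) / 4) + np.random.normal(0, 0.1, 120)
series = pd.Series(values, index=pd.date_range("2020-01-01", periods=120, freq="D"))
print(best_seasonal_period(series, min_period=1, max_period=6))  # expected: 4
```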
validmind/tests/metadata.py
DELETED
@@ -1,59 +0,0 @@
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

import pandas as pd

from validmind.utils import format_dataframe

from .load import list_tests


def list_tags():
    """
    List unique tags from all test classes.
    """

    unique_tags = set()

    for test in list_tests(__as_class=True):
        unique_tags.update(test.tags)

    return list(unique_tags)


def list_tasks_and_tags():
    """
    List all task types and their associated tags, with one row per task type and
    all tags for a task type in one row.

    Returns:
        pandas.DataFrame: A DataFrame with 'Task Type' and concatenated 'Tags'.
    """
    task_tags_dict = {}

    for test in list_tests(__as_class=True):
        for task in test.tasks:
            task_tags_dict.setdefault(task, set()).update(test.tags)

    return format_dataframe(
        pd.DataFrame(
            [
                {"Task": task, "Tags": ", ".join(tags)}
                for task, tags in task_tags_dict.items()
            ]
        )
    )


def list_tasks():
    """
    List unique tasks from all test classes.
    """

    unique_tasks = set()

    for test in list_tests(__as_class=True):
        unique_tasks.update(test.tasks)

    return list(unique_tasks)
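Before this release, the tag and task helpers above could be imported straight from `validmind.tests.metadata`. A short usage sketch as it would have worked against 2.5.x; the example output values are placeholders drawn from tags and tasks that appear elsewhere in this diff:

```python
# Applies to validmind 2.5.x, where validmind/tests/metadata.py still existed
from validmind.tests.metadata import list_tags, list_tasks, list_tasks_and_tags

print(list_tags())            # e.g. ["time_series_data", "forecasting", ...]
print(list_tasks())           # e.g. ["regression", "clustering", ...]
print(list_tasks_and_tags())  # DataFrame with one row per task and its comma-joined tags
```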
validmind/tests/model_validation/embeddings/StabilityAnalysis.py
DELETED
@@ -1,176 +0,0 @@
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from abc import abstractmethod
from typing import List

import numpy as np
import plotly.express as px
from sklearn.metrics.pairwise import cosine_similarity

from validmind.logging import get_logger
from validmind.vm_models import (
    Figure,
    ResultSummary,
    ResultTable,
    ResultTableMetadata,
    ThresholdTest,
    ThresholdTestResult,
)

logger = get_logger(__name__)


class StabilityAnalysis(ThresholdTest):
    """
    Assesses the stability of embeddings generated by a model when faced with perturbed input data to ensure robustness
    and consistency.

    ### Purpose

    The Embedding Stability test evaluates the robustness of the embeddings generated by a model when the input text is
    perturbed. By comparing the cosine similarities between the original and perturbed embeddings, it gauges the
    model's ability to maintain consistent semantic representations under slight variations in the input data.

    ### Test Mechanism

    This test works by:

    - Perturbing the original text data.
    - Generating embeddings for both the original and perturbed datasets using the model.
    - Calculating the cosine similarities between the original and perturbed embeddings.
    - Analyzing the distribution of these similarities (mean, min, max, median, and standard deviation).
    - Determining the test result based on whether the mean similarity exceeds a predefined threshold (default is 0.7).

    ### Signs of High Risk

    - Mean cosine similarity below the threshold (default is 0.7).
    - Large standard deviation of cosine similarities, indicating inconsistency.
    - Minimum similarity score significantly lower than expected.
    - Failure to pass the threshold test based on the mean similarity.

    ### Strengths

    - Provides a quantitative measure of embedding stability.
    - Helps in identifying weaknesses in the model's ability to handle minor input variations.
    - Visualization of similarity distributions aids in comprehensive analysis.
    - Easy to interpret results with clear pass/fail criteria.

    ### Limitations

    - Relies on the chosen perturbation method, which may not cover all possible variations in real-world data.
    - Thresholds for similarity might need adjustment based on specific application requirements.
    - Cosine similarity, while useful, may not capture all aspects of semantic stability.
    """

    required_inputs = ["model", "dataset"]
    default_params = {
        "mean_similarity_threshold": 0.7,
    }
    tasks = ["feature_extraction"]
    tags = ["llm", "text_data", "embeddings", "visualization"]

    @abstractmethod
    def perturb_data(self, data: str) -> str:
        """Perturb a string of text (overriden by subclasses)"""
        pass

    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
        results_table = [
            {
                "Mean Similarity": result.values["mean_similarity"],
                "Min Similarity": result.values["min_similarity"],
                "Max Similarity": result.values["max_similarity"],
                "Median Similarity": result.values["median_similarity"],
                "Std Similarity": result.values["std_similarity"],
                "Pass/Fail": "Pass" if result.passed else "Fail",
            }
            for result in results
        ]
        return ResultSummary(
            results=[
                ResultTable(
                    data=results_table,
                    metadata=ResultTableMetadata(
                        title="Stability Analysis Results for Embeddings Model"
                    ),
                )
            ]
        )

    def run(self):
        # Perturb the test dataset
        text_column = self.inputs.dataset.text_column
        original = self.inputs.dataset.df[[text_column]]
        perturbed = original.copy()
        perturbed.update(
            perturbed.select_dtypes(include="object").applymap(self.perturb_data)
        )

        logger.debug(f"Original data: {original}")
        logger.debug(f"Perturbed data: {perturbed}")

        # Compute embeddings for the original and perturbed dataset
        original_embeddings = self.inputs.dataset.y_pred(self.inputs.model)
        perturbed_embeddings = np.stack(self.inputs.model.predict(perturbed))

        # Compute cosine similarities between original and perturbed embeddings
        similarities = cosine_similarity(
            original_embeddings, perturbed_embeddings
        ).diagonal()

        mean = np.mean(similarities)
        min = np.min(similarities)
        max = np.max(similarities)
        median = np.median(similarities)
        std = np.std(similarities)

        # Determine if the test passed based on the mean similarity and threshold
        passed = mean > self.params["mean_similarity_threshold"]

        figures = [
            px.histogram(
                x=similarities.flatten(),
                nbins=100,
                title="Cosine Similarity Distribution",
                labels={"x": "Cosine Similarity"},
            ),
            px.density_contour(
                x=similarities.flatten(),
                nbinsx=100,
                title="Cosine Similarity Density",
                labels={"x": "Cosine Similarity"},
                marginal_x="histogram",
            ),
            px.box(
                x=similarities.flatten(),
                labels={"x": "Cosine Similarity"},
                title="Cosine Similarity Box Plot",
            ),
        ]

        # For this example, we are not caching the results as done in the reference `run` method
        return self.cache_results(
            [
                ThresholdTestResult(
                    passed=passed,
                    values={
                        "mean_similarity": mean,
                        "min_similarity": min,
                        "max_similarity": max,
                        "median_similarity": median,
                        "std_similarity": std,
                    },
                )
            ],
            figures=[
                Figure(
                    for_object=self,
                    key=self.name,
                    figure=fig,
                )
                for fig in figures
            ],
            passed=passed,
        )
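The pass/fail logic of this removed base class reduces to comparing the diagonal of a cosine-similarity matrix against a mean threshold. A self-contained sketch of just that computation; the embedding arrays below are random stand-ins for real model output:

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(0)
original_embeddings = rng.normal(size=(100, 384))
# Simulate perturbed inputs by adding small noise to the original embeddings
perturbed_embeddings = original_embeddings + rng.normal(scale=0.01, size=(100, 384))

# Row i of original vs row i of perturbed: take the diagonal of the similarity matrix
similarities = cosine_similarity(original_embeddings, perturbed_embeddings).diagonal()

mean_similarity_threshold = 0.7  # default used by the removed StabilityAnalysis test
passed = similarities.mean() > mean_similarity_threshold
print(f"mean={similarities.mean():.3f} min={similarities.min():.3f} passed={passed}")
```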
validmind/tests/model_validation/ragas/ContextUtilization.py
DELETED
@@ -1,161 +0,0 @@
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

import warnings

import plotly.express as px
from datasets import Dataset

from validmind import tags, tasks
from validmind.errors import MissingDependencyError

from .utils import get_ragas_config, get_renamed_columns

try:
    from ragas import evaluate
    from ragas.metrics import context_utilization
except ImportError as e:
    raise MissingDependencyError(
        "Missing required package `ragas` for ContextUtilization. "
        "Please run `pip install validmind[llm]` to use LLM tests",
        required_dependencies=["ragas"],
        extra="llm",
    ) from e


@tags("ragas", "llm", "retrieval_performance")
@tasks("text_qa", "text_generation", "text_summarization", "text_classification")
def ContextUtilization(
    dataset,
    question_column: str = "question",
    contexts_column: str = "contexts",
    answer_column: str = "answer",
):  # noqa: B950
    """
    Assesses how effectively relevant context chunks are utilized in generating answers by evaluating their ranking
    within the provided contexts.

    ### Purpose

    The Context Utilization test evaluates whether all of the answer-relevant items present in the contexts are ranked
    higher within the provided retrieval results. This metric is essential for assessing the performance of models,
    especially those involved in tasks such as text QA, text generation, text summarization, and text classification.

    ### Test Mechanism

    The test calculates Context Utilization using the formula:

    $$
    \\text{Context Utilization@K} = \\frac{\\sum_{k=1}^{K} \\left( \\text{Precision@k} \\times v_k \\right)}{\\text{Total number of relevant items in the top } K \\text{ results}}
    $$
    $$
    \\text{Precision@k} = {\\text{true positives@k} \\over (\\text{true positives@k} + \\text{false positives@k})}
    $$

    Where $K$ is the total number of chunks in `contexts` and $v_k \\in \\{0, 1\\}$ is the relevance indicator at rank $k$.

    This test uses columns for questions, contexts, and answers from the dataset and computes context utilization
    scores, generating a histogram and box plot for visualization.

    #### Configuring Columns

    This metric requires the following columns in your dataset:

    - `question` (str): The text query that was input into the model.
    - `contexts` (List[str]): A list of text contexts which are retrieved and which will be evaluated to
    make sure they contain relevant info in the correct order.
    - `answer` (str): The llm-generated response for the input `question`.

    If the above data is not in the appropriate column, you can specify different column
    names for these fields using the parameters `question_column`, `contexts_column`
    and `ground_truth_column`.

    For example, if your dataset has this data stored in different columns, you can
    pass the following parameters:
    ```python
    {
        "question_column": "question",
        "contexts_column": "context_info"
        "ground_truth_column": "my_ground_truth_col",
    }
    ```

    If the data is stored as a dictionary in another column, specify the column and key
    like this:
    ```python
    pred_col = dataset.prediction_column(model)
    params = {
        "contexts_column": f"{pred_col}.contexts",
        "ground_truth_column": "my_ground_truth_col",
    }
    ```

    For more complex situations, you can use a function to extract the data:
    ```python
    pred_col = dataset.prediction_column(model)
    params = {
        "contexts_column": lambda x: [x[pred_col]["context_message"]],
        "ground_truth_column": "my_ground_truth_col",
    }
    ```

    ### Signs of High Risk

    - Very low mean or median context utilization scores, indicating poor usage of retrieved contexts.
    - High standard deviation, suggesting inconsistent model performance.
    - Low or minimal max scores, pointing to the model's failure to rank relevant contexts at top positions.

    ### Strengths

    - Quantifies the rank of relevant context chunks in generating responses.
    - Provides clear visualizations through histograms and box plots for ease of interpretation.
    - Adapts to different dataset schema by allowing configurable column names.

    ### Limitations

    - Assumes the relevance of context chunks is binary and may not capture nuances of partial relevance.
    - Requires proper context retrieval to be effective; irrelevant context chunks can skew the results.
    - Dependent on large sample sizes to provide stable and reliable estimates of utilization performance.
    """
    warnings.filterwarnings(
        "ignore",
        category=FutureWarning,
        message="promote has been superseded by promote_options='default'.",
    )

    required_columns = {
        "question": question_column,
        "contexts": contexts_column,
        "answer": answer_column,
    }

    df = get_renamed_columns(dataset._df, required_columns)

    result_df = evaluate(
        Dataset.from_pandas(df), metrics=[context_utilization], **get_ragas_config()
    ).to_pandas()

    fig_histogram = px.histogram(x=result_df["context_utilization"].to_list(), nbins=10)
    fig_box = px.box(x=result_df["context_utilization"].to_list())

    return (
        {
            # "Scores (will not be uploaded to UI)": result_df[
            #     ["question", "contexts", "answer", "context_utilization"]
            # ],
            "Aggregate Scores": [
                {
                    "Mean Score": result_df["context_utilization"].mean(),
                    "Median Score": result_df["context_utilization"].median(),
                    "Max Score": result_df["context_utilization"].max(),
                    "Min Score": result_df["context_utilization"].min(),
                    "Standard Deviation": result_df["context_utilization"].std(),
                    "Count": result_df.shape[0],
                }
            ],
        },
        fig_histogram,
        fig_box,
    )
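The formula quoted in the docstring above can be spelled out for a single example. The sketch below evaluates Context Utilization@K on a hand-labelled relevance vector; in practice ragas derives the per-chunk relevance labels with an LLM, so the `[1, 0, 1]` labels here are purely illustrative:

```python
from typing import List


def context_utilization_at_k(relevance: List[int]) -> float:
    """Sum of Precision@k over ranks k where v_k = 1, divided by the number of relevant chunks."""
    precisions = []
    true_positives = 0
    for k, v_k in enumerate(relevance, start=1):
        true_positives += v_k
        if v_k:  # only ranks holding a relevant chunk contribute to the numerator
            precisions.append(true_positives / k)
    total_relevant = sum(relevance)
    return sum(precisions) / total_relevant if total_relevant else 0.0


# Three retrieved chunks: the 1st and 3rd judged relevant, the 2nd not
print(context_utilization_at_k([1, 0, 1]))  # (1/1 + 2/3) / 2 ≈ 0.833
```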
validmind/tests/model_validation/sklearn/ClusterPerformance.py
DELETED
@@ -1,80 +0,0 @@
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from dataclasses import dataclass

from validmind.vm_models import Metric


@dataclass
class ClusterPerformance(Metric):
    """
    Evaluates and compares a clustering model's performance on training and testing datasets using multiple defined
    metrics.

    ### Purpose

    The Cluster Performance test evaluates the performance of a clustering model on both the training and testing
    datasets. It assesses how well the model defines, forms, and distinguishes clusters of data.

    ### Test Mechanism

    The test mechanism involves predicting the clusters of the training and testing datasets using the clustering
    model. After prediction, performance metrics defined in the `metric_info()` method are calculated against the true
    labels of the datasets. The results for each metric for both datasets are then collated and returned in a
    summarized table form listing each metric along with its corresponding train and test values.

    ### Signs of High Risk

    - High discrepancy between the performance metric values on the training and testing datasets.
    - Low performance metric values on both the training and testing datasets.
    - Consistent deterioration of performance across different metrics.

    ### Strengths

    - Tests the model's performance on both training and testing datasets, helping to identify overfitting or
    underfitting.
    - Allows for the use of a broad range of performance metrics, providing a comprehensive evaluation.
    - Returns a summarized table, making it easy to compare performance across different metrics and datasets.

    ### Limitations

    - The `metric_info()` method needs to be properly overridden in a subclass and metrics must be manually defined.
    - The test may not capture the model's performance well if clusters are not well-separated or the model struggles
    with certain clusters.
    - Does not consider the computational and time complexity of the model.
    - Binary comparison (train and test) might not capture performance changes under different circumstances or dataset
    categories.
    """

    name = "cluster_performance_metrics"
    required_inputs = ["model", "dataset"]
    tasks = ["clustering"]
    tags = [
        "sklearn",
        "model_performance",
    ]

    def cluster_performance_metrics(self, y_true_train, y_pred_train, metric_info):
        y_true_train = y_true_train.astype(y_pred_train.dtype).flatten()
        results = []
        for metric_name, metric_fcn in metric_info.items():
            train_value = metric_fcn(list(y_true_train), y_pred_train)
            results.append({metric_name: train_value})
        return results

    def metric_info(self):
        raise NotImplementedError

    def run(self):
        y_true_train = self.inputs.dataset.y
        class_pred_train = self.inputs.dataset.y_pred(self.inputs.model)
        y_true_train = y_true_train.astype(class_pred_train.dtype)

        results = self.cluster_performance_metrics(
            y_true_train,
            class_pred_train,
            self.metric_info(),
        )
        return self.cache_results(metric_value=results)
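As the docstring notes, this base class only did useful work once `metric_info()` was overridden. A minimal sketch of what a concrete subclass could have looked like against the 2.5.x class API; the subclass name and metric selection are illustrative, not code shipped in the package:

```python
# Illustrative only: a concrete subclass written against the removed 2.5.x base class
from dataclasses import dataclass

from sklearn import metrics

from validmind.tests.model_validation.sklearn.ClusterPerformance import ClusterPerformance


@dataclass
class MyClusterPerformance(ClusterPerformance):
    name = "my_cluster_performance"

    def metric_info(self):
        # Mapping of display name -> metric callable, consumed by
        # cluster_performance_metrics() in the base class
        return {
            "Adjusted Rand Index": metrics.adjusted_rand_score,
            "Completeness Score": metrics.completeness_score,
            "V Measure": metrics.v_measure_score,
        }
```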