validmind 2.5.25__py3-none-any.whl → 2.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.7.dist-info/METADATA +137 -0
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.25.dist-info/METADATA +0 -118
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
- {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
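Several tests were renamed in this release rather than removed outright: DFGLSArch → DickeyFullerGLS, AspectCritique → AspectCritic, AnswerRelevance → ResponseRelevancy, and AnswerSimilarity → SemanticSimilarity, while AutoSeasonality, ContextUtilization, and ClusterPerformance were dropped. Below is a small, self-contained sketch of how a documentation script could remap the old test IDs; the ID strings follow the module paths listed above, but verify them against the 2.6.7 test catalog before relying on them.

```python
# Sketch only: old -> new test IDs implied by the file renames above.
# Verify these IDs against the 2.6.7 test catalog before use.
RENAMED_TESTS = {
    "validmind.data_validation.DFGLSArch": "validmind.data_validation.DickeyFullerGLS",
    "validmind.model_validation.ragas.AspectCritique": "validmind.model_validation.ragas.AspectCritic",
    "validmind.model_validation.ragas.AnswerRelevance": "validmind.model_validation.ragas.ResponseRelevancy",
    "validmind.model_validation.ragas.AnswerSimilarity": "validmind.model_validation.ragas.SemanticSimilarity",
}


def migrate_test_id(test_id: str) -> str:
    """Map a 2.5.x test ID to its 2.6.x name, leaving all other IDs unchanged."""
    return RENAMED_TESTS.get(test_id, test_id)


print(migrate_test_id("validmind.model_validation.ragas.AspectCritique"))
```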
--- a/validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py
+++ b/validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py
@@ -2,23 +2,24 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-
 import numpy as np
 import pandas as pd
 import plotly.graph_objects as go
 from sklearn.metrics import r2_score
 from sklearn.utils import check_random_state
 
-from validmind …
+from validmind import tags, tasks
 from validmind.logging import get_logger
-from validmind.vm_models import …
+from validmind.vm_models import VMDataset, VMModel
 
 logger = get_logger(__name__)
 
 
-@ …
-
+@tags("statsmodels", "feature_importance", "visualization")
+@tasks("regression")
+def RegressionPermutationFeatureImportance(
+    dataset: VMDataset, model: VMModel, fontsize: int = 12, figure_height: int = 500
+):
     """
     Assesses the significance of each feature in a model by evaluating the impact on model performance when feature
     values are randomly rearranged.
@@ -55,79 +56,45 @@ class RegressionPermutationFeatureImportance(Metric):
     features.
     - Assumes independence of features when calculating importance, which might not always hold true.
     """
-    [old lines 58-89 are truncated in this diff view]
-                x_shuffled[column] = check_random_state(0).permutation(
-                    x_shuffled[column]
-                )
-                permuted_performance = r2_score(y, model.predict(x_shuffled))
-                shuffled_scores.append(baseline_performance - permuted_performance)
-
-            importances.loc[column] = {
-                "Importance": np.mean(shuffled_scores),
-                "Std Dev": np.std(shuffled_scores),
-            }
-
-        sorted_idx = importances["Importance"].argsort()
-
-        # Plotting the results
-        fig = go.Figure()
-        fig.add_trace(
-            go.Bar(
-                y=importances.index[sorted_idx],
-                x=importances.loc[importances.index[sorted_idx], "Importance"],
-                orientation="h",
-                error_x=dict(
-                    type="data",
-                    array=importances.loc[importances.index[sorted_idx], "Std Dev"],
-                ),
-            )
-        )
-        fig.update_layout(
-            title_text="Permutation Feature Importances",
-            yaxis=dict(
-                tickmode="linear", dtick=1, tickfont=dict(size=self.params["fontsize"])
+    y_true = dataset.y
+
+    baseline_performance = r2_score(y_true, dataset.y_pred(model))
+
+    importances = pd.DataFrame(
+        index=dataset.feature_columns, columns=["Importance", "Std Dev"]
+    )
+
+    for column in dataset.feature_columns:
+        shuffled_scores = []
+        for _ in range(30):  # Default number of shuffles
+            x_shuffled = dataset.x_df()
+            x_shuffled[column] = check_random_state(0).permutation(x_shuffled[column])
+            permuted_performance = r2_score(y_true, model.predict(x_shuffled))
+            shuffled_scores.append(baseline_performance - permuted_performance)
+
+        importances.loc[column] = {
+            "Importance": np.mean(shuffled_scores),
+            "Std Dev": np.std(shuffled_scores),
+        }
+
+    sorted_idx = importances["Importance"].argsort()
+
+    fig = go.Figure()
+    fig.add_trace(
+        go.Bar(
+            y=importances.index[sorted_idx],
+            x=importances.loc[importances.index[sorted_idx], "Importance"],
+            orientation="h",
+            error_x=dict(
+                type="data",
+                array=importances.loc[importances.index[sorted_idx], "Std Dev"],
             ),
-            height=self.params["figure_height"],
-        )
-
-        return self.cache_results(
-            metric_value=importances.to_dict(),
-            figures=[
-                Figure(
-                    for_object=self,
-                    key="regression_pfi",
-                    figure=fig,
-                ),
-            ],
         )
+    )
+    fig.update_layout(
+        title_text="Permutation Feature Importances",
+        yaxis=dict(tickmode="linear", dtick=1, tickfont=dict(size=fontsize)),
+        height=figure_height,
+    )
+
+    return fig
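The two hunks above illustrate the migration pattern that recurs throughout this release: class-based `Metric` subclasses become plain functions decorated with `@tags`/`@tasks` that take `dataset`/`model` inputs and keyword parameters directly and return a figure (or tables) instead of calling `self.cache_results`. A hedged sketch of invoking the refactored test follows; the `run_test` keyword names follow the usual ValidMind calling convention, `vm_dataset`/`vm_model` are placeholders for objects created with `vm.init_dataset()`/`vm.init_model()`, and the trailing `.log()` call is an assumption about the new result API.

```python
# Sketch only, not taken from the package: running the refactored test.
from validmind.tests import run_test

result = run_test(
    "validmind.model_validation.statsmodels.RegressionPermutationFeatureImportance",
    inputs={"dataset": vm_dataset, "model": vm_model},  # placeholders for your VM objects
    params={"fontsize": 12, "figure_height": 500},      # the new keyword parameters
)
result.log()  # assumption: results can still be logged to the platform this way
```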
--- a/validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py
+++ b/validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py
@@ -52,14 +52,16 @@ def TargetPredictionDistributionPlot(datasets, model):
 
     fig = plt.figure()
     plot = sns.kdeplot(
-        pred_ref["Reference Prediction"],
+        pred_ref["Reference Prediction"], fill=True, label="Reference Prediction"
     )
     plot = sns.kdeplot(
-        pred_monitor["Monitoring Prediction"],
+        pred_monitor["Monitoring Prediction"], fill=True, label="Monitor Prediction"
     )
     plot.set(
         xlabel="Prediction", title="Distribution of Reference & Monitor Predictions"
     )
     plot.legend()
 
+    plt.close()
+
     return fig
--- /dev/null
+++ b/validmind/tests/output.py
@@ -0,0 +1,120 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Union
+from uuid import uuid4
+
+import numpy as np
+import pandas as pd
+
+from validmind.vm_models.figure import (
+    Figure,
+    is_matplotlib_figure,
+    is_plotly_figure,
+    is_png_image,
+)
+from validmind.vm_models.result import ResultTable, TestResult
+
+
+class OutputHandler(ABC):
+    """Base class for handling different types of test outputs"""
+
+    @abstractmethod
+    def can_handle(self, item: Any) -> bool:
+        """Check if this handler can process the given item"""
+        pass
+
+    @abstractmethod
+    def process(self, item: Any, result: TestResult) -> None:
+        """Process the item and update the TestResult"""
+        pass
+
+
+class BooleanOutputHandler(OutputHandler):
+    def can_handle(self, item: Any) -> bool:
+        return isinstance(item, (bool, np.bool_))
+
+    def process(self, item: Any, result: TestResult) -> None:
+        if result.passed is not None:
+            raise ValueError("Test returned more than one boolean value")
+        result.passed = bool(item)
+
+
+class MetricOutputHandler(OutputHandler):
+    def can_handle(self, item: Any) -> bool:
+        return isinstance(item, (int, float))
+
+    def process(self, item: Any, result: TestResult) -> None:
+        if result.metric is not None:
+            raise ValueError("Only one unit metric may be returned per test.")
+        result.metric = item
+
+
+class FigureOutputHandler(OutputHandler):
+    def can_handle(self, item: Any) -> bool:
+        return (
+            isinstance(item, Figure)
+            or is_matplotlib_figure(item)
+            or is_plotly_figure(item)
+            or is_png_image(item)
+        )
+
+    def process(self, item: Any, result: TestResult) -> None:
+        if isinstance(item, Figure):
+            result.add_figure(item)
+        else:
+            random_id = str(uuid4())[:4]
+            result.add_figure(
+                Figure(
+                    key=f"{result.result_id}:{random_id}",
+                    figure=item,
+                    ref_id=result.ref_id,
+                )
+            )
+
+
+class TableOutputHandler(OutputHandler):
+    def can_handle(self, item: Any) -> bool:
+        return isinstance(item, (list, pd.DataFrame, dict, ResultTable))
+
+    def process(
+        self,
+        item: Union[List[Dict[str, Any]], pd.DataFrame, Dict[str, Any], ResultTable],
+        result: TestResult,
+    ) -> None:
+        tables = item if isinstance(item, dict) else {"": item}
+
+        for table_name, table_data in tables.items():
+            # if already a ResultTable, add it directly
+            if isinstance(table_data, ResultTable):
+                result.add_table(table_data)
+                continue
+
+            if not isinstance(table_data, (list, pd.DataFrame)):
+                raise ValueError(
+                    "Invalid table format: must be a list of dictionaries or a DataFrame"
+                )
+
+            if isinstance(table_data, list):
+                table_data = pd.DataFrame(table_data)
+
+            result.add_table(ResultTable(data=table_data, title=table_name or None))
+
+
+def process_output(item: Any, result: TestResult) -> None:
+    """Process a single test output item and update the TestResult."""
+    handlers = [
+        BooleanOutputHandler(),
+        MetricOutputHandler(),
+        FigureOutputHandler(),
+        TableOutputHandler(),
+    ]
+
+    for handler in handlers:
+        if handler.can_handle(item):
+            handler.process(item, result)
+            return
+
+    raise ValueError(f"Invalid test output type: {type(item)}")
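The new `validmind/tests/output.py` routes whatever a functional test returns onto the `TestResult`: booleans set pass/fail, scalars become a unit metric, figures are wrapped, and lists, DataFrames, dicts, or `ResultTable` objects become tables. The following self-contained sketch mirrors that chain-of-responsibility dispatch with stand-in types; it is not the validmind API, and it omits the figure and dict-of-tables cases for brevity.

```python
# Self-contained sketch of the dispatch idea; SimpleResult stands in for
# validmind's TestResult and is not part of the library.
from dataclasses import dataclass, field
from typing import Any, List, Optional

import pandas as pd


@dataclass
class SimpleResult:
    passed: Optional[bool] = None
    metric: Optional[float] = None
    tables: List[pd.DataFrame] = field(default_factory=list)


def dispatch(item: Any, result: SimpleResult) -> None:
    # Mirror the handler ordering above: booleans, then scalars, then tables.
    if isinstance(item, bool):
        result.passed = item
    elif isinstance(item, (int, float)):
        result.metric = float(item)
    elif isinstance(item, (list, pd.DataFrame)):
        result.tables.append(pd.DataFrame(item))
    else:
        raise ValueError(f"Invalid test output type: {type(item)}")


result = SimpleResult()
for output in ([{"Score": 8, "Pass/Fail": "Pass"}], 0.92, True):
    dispatch(output, result)

print(result.passed, result.metric, len(result.tables))  # True 0.92 1
```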
--- a/validmind/tests/prompt_validation/Bias.py
+++ b/validmind/tests/prompt_validation/Bias.py
@@ -2,19 +2,8 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from …
-from typing import List
-
-import pandas as pd
-
+from validmind import tags, tasks
 from validmind.errors import MissingRequiredTestInputError
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
 
 from .ai_powered_test import (
     call_model,
@@ -23,9 +12,40 @@ from .ai_powered_test import (
     missing_prompt_message,
 )
 
+SYSTEM = """
+You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different best practices. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
+
+Consider the following best practice documentation:
+```
+Its been shown that, when passing examples to an LLM via the prompt for the purpose of Sentiment Analysis, a distribution with a majority of positive examples led the LLM to classify ambiguous sentences as "Positive", and vice versa. However, for more straightforward tasks, such as sentiment classification, biases may be harder to induce. The order of presentation also appeared to influence the LLM's outputs, especially when combined with a skewed distribution of exemplars.
+Tactics for Avoiding Bias in Prompting:
+1. Balanced Distribution: Avoid heavily skewed distributions of examples. Instead, offer amore balanced number of examples for each label.
+2. Randomize Order: To prevent order-induced biases, shuffle the sequence of examples ensuring no fixed pattern, such as presenting all positive examples first.
+3. Continued Experimentation: Regularly test and tweak prompts, especially for complex tasks where the LLM might not have extensive knowledge. This iterative process helps in identifying potential pitfalls and refining prompting strategies.
+Example:
+For instance, using 8 positive and 2 negative examples led the LLM to classify an ambiguous sentence "I feel something" as "Positive". However, with a more balanced distribution, the LLM showed unbiased behavior.
+```
+
+Score the bias of the user-submitted prompt. Return a score from 1 to 10 where 10 is a perfect score. Also provide a short explanation for your score.
+
+Response Format:
+```
+Score: <score>
+Explanation: <explanation>
+```
+""".strip()
+
+USER = '''
+Prompt:
+"""
+{prompt_to_test}
+"""
+'''.strip()
+
 
-@ …
-
+@tags("llm", "few_shot")
+@tasks("text_classification", "text_summarization")
+def Bias(model, min_threshold=7):
     """
     Assesses potential bias in a Large Language Model by analyzing the distribution and order of exemplars in the
     prompt.
@@ -74,87 +94,24 @@ class Bias(ThresholdTest):
     - The use of a grading mechanism to gauge bias may not be entirely accurate in every case, particularly when the
     difference between threshold and score is narrow.
     """
-    [old lines 77-97 are truncated in this diff view]
-Score the bias of the user-submitted prompt. Return a score from 1 to 10 where 10 is a perfect score. Also provide a short explanation for your score.
-
-Response Format:
-```
-Score: <score>
-Explanation: <explanation>
-```
-""".strip()
-    user_prompt = '''
-Prompt:
-"""
-{prompt_to_test}
-"""
-'''.strip()
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        result = results[0]
-        results_table = [
-            {
-                "Score": result.values["score"],
-                "Threshold": result.values["threshold"],
-                "Explanation": result.values["explanation"],
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-        ]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=pd.DataFrame(results_table),
-                    metadata=ResultTableMetadata(
-                        title="Bias Test on Prompt",
-                    ),
-                )
-            ]
-        )
-
-    def run(self):
-        if not hasattr(self.inputs.model, "prompt"):
-            raise MissingRequiredTestInputError(missing_prompt_message)
-
-        response = call_model(
-            system_prompt=self.system_prompt,
-            user_prompt=self.user_prompt.format(
-                prompt_to_test=self.inputs.model.prompt.template
-            ),
-        )
-        score = get_score(response)
-        explanation = get_explanation(response)
-
-        passed = score > self.params["min_threshold"]
-        results = [
-            ThresholdTestResult(
-                passed=passed,
-                values={
-                    "score": score,
-                    "explanation": explanation,
-                    "threshold": self.params["min_threshold"],
-                },
-            )
-        ]
-
-        return self.cache_results(results, passed=passed)
+    if not hasattr(model, "prompt"):
+        raise MissingRequiredTestInputError(missing_prompt_message)
+
+    response = call_model(
+        system_prompt=SYSTEM,
+        user_prompt=USER.format(prompt_to_test=model.prompt.template),
+    )
+
+    score = get_score(response)
+    explanation = get_explanation(response)
+
+    passed = score > min_threshold
+
+    return [
+        {
+            "Score": score,
+            "Explanation": explanation,
+            "Threshold": min_threshold,
+            "Pass/Fail": "Pass" if passed else "Fail",
+        }
+    ], passed
--- a/validmind/tests/prompt_validation/Clarity.py
+++ b/validmind/tests/prompt_validation/Clarity.py
@@ -2,19 +2,8 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from …
-from typing import List
-
-import pandas as pd
-
+from validmind import tags, tasks
 from validmind.errors import MissingRequiredTestInputError
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
 
 from .ai_powered_test import (
     call_model,
@@ -23,9 +12,41 @@ from .ai_powered_test import (
     missing_prompt_message,
 )
 
+SYSTEM = """
+You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
+
+Consider the following documentation on prompt clarity guidelines when evaluating the prompt:
+'''
+Clear prompts minimize the room for misinterpretation, allowing the LLM to generate more relevant and accurate responses. Ambiguous or vague instructions might leave the model guessing, leading to suboptimal outputs.
+
+Tactics for Ensuring Clarity that will be referenced during evaluation:
+1. Detail Inclusion: Provide essential details or context to prevent the LLM from making assumptions.
+2. Adopt a Persona: Use system messages to specify the desired persona for the LLM's responses.
+3. Specify Steps: For certain tasks, delineate the required steps explicitly, helping the model in sequential understanding.
+4. Provide Examples: While general instructions are efficient, in some scenarios, "few-shot" prompting or style examples can guide the LLM more effectively.
+5. Determine Output Length: Define the targeted length of the response, whether in terms of paragraphs, bullet points, or other units. While word counts aren't always precise, specifying formats like paragraphs can offer more predictable results.
+'''
+
+Score the clarity of the user-submitted prompt. Return a score from 1 to 10 where 10 is a perfect score. Also provide a short explanation for your score.
 
-
-
+Response Format:
+```
+Score: <score>
+Explanation: <explanation>
+```
+"""
+
+USER = """
+Prompt:
+'''
+{prompt_to_test}
+'''
+"""
+
+
+@tags("llm", "zero_shot", "few_shot")
+@tasks("text_classification", "text_summarization")
+def Clarity(model, min_threshold=7):
     """
     Evaluates and scores the clarity of prompts in a Large Language Model based on specified guidelines.
 
@@ -62,88 +83,24 @@ class Clarity(ThresholdTest):
     examples, and specification of output length) contribute equally to clarity, which might not always be the case
     - The evaluation may not be as effective if used on non-textual models
     """
-    [old lines 65-86 are truncated in this diff view]
-Score the clarity of the user-submitted prompt. Return a score from 1 to 10 where 10 is a perfect score. Also provide a short explanation for your score.
-
-Response Format:
-```
-Score: <score>
-Explanation: <explanation>
-```
-""".strip()
-    user_prompt = '''
-Prompt:
-"""
-{prompt_to_test}
-"""
-'''.strip()
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        result = results[0]
-        results_table = [
-            {
-                "Score": result.values["score"],
-                "Threshold": result.values["threshold"],
-                "Explanation": result.values["explanation"],
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-        ]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=pd.DataFrame(results_table),
-                    metadata=ResultTableMetadata(
-                        title="Clarity Test for LLM Prompt",
-                    ),
-                )
-            ]
-        )
-
-    def run(self):
-        if not hasattr(self.inputs.model, "prompt"):
-            raise MissingRequiredTestInputError(missing_prompt_message)
-
-        response = call_model(
-            system_prompt=self.system_prompt,
-            user_prompt=self.user_prompt.format(
-                prompt_to_test=self.inputs.model.prompt.template
-            ),
-        )
-        score = get_score(response)
-        explanation = get_explanation(response)
-
-        passed = score > self.params["min_threshold"]
-        results = [
-            ThresholdTestResult(
-                passed=passed,
-                values={
-                    "score": score,
-                    "explanation": explanation,
-                    "threshold": self.params["min_threshold"],
-                },
-            )
-        ]
-
-        return self.cache_results(results, passed=passed)
+    if not hasattr(model, "prompt"):
+        raise MissingRequiredTestInputError(missing_prompt_message)
+
+    response = call_model(
+        system_prompt=SYSTEM,
+        user_prompt=USER.format(prompt_to_test=model.prompt.template),
+    )
+
+    score = get_score(response)
+    explanation = get_explanation(response)
+
+    passed = score > min_threshold
+
+    return [
+        {
+            "Score": score,
+            "Explanation": explanation,
+            "Threshold": min_threshold,
+            "Pass/Fail": "Pass" if passed else "Fail",
+        }
+    ], passed
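Bias and Clarity (and the other prompt-validation tests listed above) now share the same functional shape: a decorated function that returns a results table plus a boolean, which the new output handling turns into a pass/fail test result. A hedged sketch of running one of them follows; the test ID is derived from the module path, the `inputs`/`params` keywords follow the usual `run_test` convention, and `vm_model` is a placeholder for a model initialized with a prompt template (without one, the test raises `MissingRequiredTestInputError`, as shown above).

```python
# Sketch only, not taken from the package: running a refactored prompt test.
from validmind.tests import run_test

result = run_test(
    "validmind.prompt_validation.Bias",
    inputs={"model": vm_model},   # placeholder: a prompt-bearing VM model
    params={"min_threshold": 7},  # pass if the LLM-assigned score exceeds 7
)
```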