validmind 2.5.24__py3-none-any.whl → 2.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.7.dist-info/METADATA +137 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.24.dist-info/METADATA +0 -118
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
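Several tests are renamed in this release (`DFGLSArch` → `DickeyFullerGLS`, `AspectCritique` → `AspectCritic`, `AnswerRelevance` → `ResponseRelevancy`, `AnswerSimilarity` → `SemanticSimilarity`). A minimal migration sketch for callers that reference these tests by ID, assuming the IDs mirror the module paths listed above (the `validmind.<package>.<TestName>` ID format is an assumption, not something this diff confirms):

```python
# Assumed old-ID -> new-ID mapping for the tests renamed in 2.6.7;
# the "validmind.<package>.<TestName>" ID format is an assumption.
RENAMED_TESTS = {
    "validmind.data_validation.DFGLSArch": "validmind.data_validation.DickeyFullerGLS",
    "validmind.model_validation.ragas.AspectCritique": "validmind.model_validation.ragas.AspectCritic",
    "validmind.model_validation.ragas.AnswerRelevance": "validmind.model_validation.ragas.ResponseRelevancy",
    "validmind.model_validation.ragas.AnswerSimilarity": "validmind.model_validation.ragas.SemanticSimilarity",
}


def migrate_test_id(test_id: str) -> str:
    """Map a 2.5.24 test ID to its 2.6.7 equivalent, if it was renamed."""
    return RENAMED_TESTS.get(test_id, test_id)
```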
validmind/tests/prompt_validation/Robustness.py

@@ -2,33 +2,72 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
-from typing import List
-
 import pandas as pd

+from validmind import tags, tasks
 from validmind.errors import MissingRequiredTestInputError, SkipTestError
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)

 from .ai_powered_test import call_model, missing_prompt_message

+SYSTEM = """
+You are a prompt evaluation researcher AI who is tasked with testing the robustness of LLM prompts.
+
+Consider the following guidelines:
+'''
+LLM prompts are used to guide a model to generate specific outputs or solve specific tasks.
+These prompts can be more or less robust, meaning that they can be more or less susceptible to breaking especially when the output needs to be a specific type.
+A robust prompt ensures consistent performance and reduces the likelihood of unexpected or off-tangent outputs.
+This consistency is vital for applications where predictability and reliability of the LLM's response are paramount.
+'''
+
+Consider the user-submitted prompt template and generate an input for the variable in the template (denoted by brackets) that tests the robustness of the prompt.
+Contradictions, edge cases, typos, bad phrasing, distracting, complex or out-of-place words and phrases are just some of the strategies you can use when generating inputs.
+Be creative and think step-by-step how you would break the prompt.
+Then generate {num_tests} inputs for the user-submitted prompt template that would break the prompt.
+Each input should be different from the others.
+Each input should be retured as a new line in your response.
+Respond only with the values to be inserted into the prompt template and do not include quotes, explanations or any extra text.
+
+Example:
+
+User-provided prompt:
+```
+Analyse the following sentence and output its sentiment:
+\\{sentence\\}
+```
+
+Your response (generated inputs):
+```
+I am a happy cat
+You are a bad person
+My name is bob
+James' friend is really sick
+```
+"""

-
-
+USER = """
+Prompt:
+```
+{prompt_to_test}
+```
+Input:
+"""
+
+
+@tags("llm", "zero_shot", "few_shot")
+@tasks("text_classification", "text_summarization")
+def Robustness(model, dataset, num_tests=10):
     """
-    Assesses the robustness of prompts provided to a Large Language Model under varying conditions and contexts.
+    Assesses the robustness of prompts provided to a Large Language Model under varying conditions and contexts. This test
+    specifically measures the model's ability to generate correct classifications with the given prompt even when the
+    inputs are edge cases or otherwise difficult to classify.

     ### Purpose

     The Robustness test is meant to evaluate the resilience and reliability of prompts provided to a Language Learning
     Model (LLM). The aim of this test is to guarantee that the prompts consistently generate accurate and expected
-    outputs, even in diverse or challenging scenarios.
+    outputs, even in diverse or challenging scenarios. This test is only applicable to LLM-powered text classification
+    tasks where the prompt has a single input variable.

     ### Test Mechanism

@@ -58,114 +97,34 @@ class Robustness(ThresholdTest):
     - The test may not account for all potential conditions or alterations that could show up in practical use
       scenarios.
     """
+    if not hasattr(model, "prompt"):
+        raise MissingRequiredTestInputError(missing_prompt_message)

-
-
-    default_params = {"num_tests": 10}
-    tasks = ["text_classification", "text_summarization"]
-    tags = ["llm", "zero_shot", "few_shot"]
-
-    system_prompt = '''
-You are a prompt evaluation researcher AI who is tasked with testing the robustness of LLM prompts.
+    if not model.prompt.variables or len(model.prompt.variables) > 1:
+        raise SkipTestError("Robustness only supports single-variable prompts for now")

-
-
-
-
-```
-
-Consider the user-submitted prompt template and generate an input for the variable in the template (denoted by brackets) that tests the robustness of the prompt.
-Contradictions, edge cases, typos, bad phrasing, distracting, complex or out-of-place words and phrases are just some of the strategies you can use when generating inputs.
-Be creative and think step-by-step how you would break the prompt. Then generate an input for the user-submitted prompt template that would break the prompt.
-Respond only with the value to be inserted into the prompt template and do not include quotes, explanations or any extra text.
-Example:
-Prompt:
-"""
-Analyse the following sentence and output its sentiment\n{sentence}
-"""
-Input:
-Nonsense string that has no sentiment
-'''.strip()
-    user_prompt = '''
-Prompt:
-"""
-{prompt_to_test}
-"""
-Input:
-'''.strip()
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        results_table = [
-            {
-                "Test Input": result.values["input"],
-                "Model Output": result.values["output"],
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-            for result in results
-        ]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=pd.DataFrame(results_table),
-                    metadata=ResultTableMetadata(
-                        title="Robustness Test on Prompt",
-                    ),
-                )
-            ]
+    target_class_labels = dataset.target_classes()
+    if len(target_class_labels) > 10:
+        raise SkipTestError(
+            "Too many target classes to test robustness. Skipping test."
         )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        results = []
-
-        for _ in range(self.params["num_tests"]):
-            response = call_model(
-                system_prompt=self.system_prompt,
-                user_prompt=self.user_prompt.format(
-                    variables="\n".join(self.inputs.model.prompt.variables),
-                    prompt_to_test=self.inputs.model.prompt.template,
-                ),
-            )
-
-            test_input_df = pd.DataFrame(
-                [response],
-                columns=self.inputs.model.prompt.variables,
-            )
-            result = self.inputs.model.predict(test_input_df)[0]
-
-            fail = False
-            if result not in target_class_labels:
-                fail = True
-
-            results.append(
-                ThresholdTestResult(
-                    passed=not fail,
-                    values={
-                        "input": response,
-                        "output": result,
-                    },
-                )
-            )
-
-        return self.cache_results(
-            results, passed=all([result.passed for result in results])
-        )
+    generated_inputs = call_model(
+        system_prompt=SYSTEM.format(num_tests=num_tests),
+        user_prompt=USER.format(prompt_to_test=model.prompt.template),
+    ).split("\n")
+
+    responses = model.predict(
+        pd.DataFrame(generated_inputs, columns=model.prompt.variables)
+    )
+
+    results = [
+        {
+            "Generated Input": generated_input,
+            "Model Response": response,
+            "Pass/Fail": "Pass" if response in target_class_labels else "Fail",
+        }
+        for generated_input, response in zip(generated_inputs, responses)
+    ]
+
+    return results, all(result["Pass/Fail"] == "Pass" for result in results)
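The hunks above replace the class-based `ThresholdTest` implementation with a decorated function that takes `model` and `dataset` directly and returns a `(results, passed)` pair: the list of dicts becomes the result table and the boolean becomes the test's pass/fail status. A minimal usage sketch, assuming the standard `validmind.tests.run_test` entry point and an already-initialized prompt-bearing model and classification dataset (`vm_model` and `vm_dataset` are placeholders, not defined in this diff):

```python
import validmind as vm

# vm_model is assumed to expose .prompt (with a single template variable) and
# .predict; vm_dataset is assumed to expose .target_classes(). Both would be
# registered via vm.init_model / vm.init_dataset before running the test.
result = vm.tests.run_test(
    "validmind.prompt_validation.Robustness",
    inputs={"model": vm_model, "dataset": vm_dataset},
    params={"num_tests": 10},
)
```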
validmind/tests/prompt_validation/Specificity.py

@@ -2,19 +2,8 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from
-from typing import List
-
-import pandas as pd
-
+from validmind import tags, tasks
 from validmind.errors import MissingRequiredTestInputError
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)

 from .ai_powered_test import (
     call_model,
@@ -23,9 +12,47 @@ from .ai_powered_test import (
     missing_prompt_message,
 )

+SYSTEM = """
+You are a prompt evaluation AI.
+You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics.
+You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
+
+Consider the following documentation regarding specificity in prompts and utilize it to grade the user-submitted prompt:
+```
+Prompts that are detailed and descriptive often yield better and more accurate results from an LLM.
+Rather than relying on specific keywords or tokens, it's crucial to have a well-structured and descriptive prompt.
+Including relevant examples within the prompt can be particularly effective, guiding the LLM to produce outputs in desired formats.
+However, it's essential to strike a balance. While prompts need to be detailed, they shouldn't be overloaded with unnecessary information.
+The emphasis should always be on relevancy and conciseness, considering there are limitations to how long a prompt can be.
+
+Example:
+Imagine wanting an LLM to extract specific details from a given text.
+A vague prompt might yield varied results.
+However, with a prompt like, "Extract the names of all characters and the cities they visited from the text", the LLM is guided more precisely towards the desired information extraction.
+```
+
+Score the specificity of the user-submitted prompt.
+Return a score from 1 to 10 where 10 is a perfect score.
+Also provide a short explanation for your score.
+
+Response Format:
+```
+Score: <score>
+Explanation: <explanation>
+```
+"""
+
+USER = """
+Prompt:
+'''
+{prompt_to_test}
+'''
+"""

-
-
+
+@tags("llm", "zero_shot", "few_shot")
+@tasks("text_classification", "text_summarization")
+def Specificity(model, min_threshold=7):
     """
     Evaluates and scores the specificity of prompts provided to a Large Language Model (LLM), based on clarity, detail,
     and relevance.
@@ -64,84 +91,23 @@ class Specificity(ThresholdTest):
     - Striking a balance between specificity and verbosity can be challenging, as overly detailed prompts might confuse
       or mislead the model
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Response Format:
-```
-Score: <score>
-Explanation: <explanation>
-```
-""".strip()
-    user_prompt = '''
-Prompt:
-"""
-{prompt_to_test}
-"""
-'''.strip()
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        result = results[0]
-        results_table = [
-            {
-                "Score": result.values["score"],
-                "Threshold": result.values["threshold"],
-                "Explanation": result.values["explanation"],
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-        ]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=pd.DataFrame(results_table),
-                    metadata=ResultTableMetadata(
-                        title="Specificity Test for LLM Prompt",
-                    ),
-                )
-            ]
-        )
-
-    def run(self):
-        if not hasattr(self.inputs.model, "prompt"):
-            raise MissingRequiredTestInputError(missing_prompt_message)
-
-        response = call_model(
-            system_prompt=self.system_prompt,
-            user_prompt=self.user_prompt.format(
-                prompt_to_test=self.inputs.model.prompt.template
-            ),
-        )
-        score = get_score(response)
-        explanation = get_explanation(response)
-
-        passed = score > self.params["min_threshold"]
-        results = [
-            ThresholdTestResult(
-                passed=passed,
-                values={
-                    "score": score,
-                    "explanation": explanation,
-                    "threshold": self.params["min_threshold"],
-                },
-            )
-        ]
-
-        return self.cache_results(results, passed=passed)
+    if not hasattr(model, "prompt"):
+        raise MissingRequiredTestInputError(missing_prompt_message)
+
+    response = call_model(
+        system_prompt=SYSTEM,
+        user_prompt=USER.format(prompt_to_test=model.prompt.template),
+    )
+    score = get_score(response)
+    explanation = get_explanation(response)
+
+    passed = score > min_threshold
+
+    return [
+        {
+            "Score": score,
+            "Threshold": min_threshold,
+            "Explanation": explanation,
+            "Pass/Fail": "Pass" if passed else "Fail",
+        }
+    ], passed
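As with Robustness, the `summary`/`run` machinery is gone: Specificity now returns a plain one-row table plus a boolean, where `score > min_threshold` means a score exactly at the threshold fails. A shape-only illustration of that return value (the score and explanation below are invented for this example, not produced by the test):

```python
import pandas as pd

# Example of the (table, passed) pair the new Specificity function returns.
table, passed = (
    [
        {
            "Score": 8,
            "Threshold": 7,
            "Explanation": "Prompt names the exact fields to extract.",
            "Pass/Fail": "Pass",
        }
    ],
    True,
)

# The old class built its summary with pd.DataFrame(results_table); the same
# list-of-dicts shape still converts directly to a table.
print(pd.DataFrame(table))
```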
validmind/tests/prompt_validation/ai_powered_test.py

@@ -36,8 +36,8 @@ def call_model(
         client.chat.completions.create(
             model=model,
             messages=[
-                {"role": "system", "content": system_prompt},
-                {"role": "user", "content": user_prompt},
+                {"role": "system", "content": system_prompt.strip("\n").strip()},
+                {"role": "user", "content": user_prompt.strip("\n").strip()},
             ],
             temperature=temperature,
             seed=seed,
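This change strips leading and trailing newlines and whitespace from the prompts at call time, which is what lets the new module-level `SYSTEM`/`USER` strings above be defined with surrounding blank lines (the old class attributes called `.strip()` at definition time instead). A small self-contained illustration of the stripping behaviour:

```python
# Module-level prompt defined with triple quotes, as in the new test files;
# it starts and ends with a newline.
SYSTEM = """
You are a prompt evaluation AI.
"""

# The call-time cleanup added in ai_powered_test.call_model:
cleaned = SYSTEM.strip("\n").strip()
assert cleaned == "You are a prompt evaluation AI."
```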