validmind 2.5.24__py3-none-any.whl → 2.6.7__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- validmind/__init__.py +8 -17
- validmind/__version__.py +1 -1
- validmind/ai/test_descriptions.py +66 -85
- validmind/ai/test_result_description/context.py +2 -2
- validmind/ai/utils.py +26 -1
- validmind/api_client.py +43 -79
- validmind/client.py +5 -7
- validmind/client_config.py +1 -1
- validmind/datasets/__init__.py +1 -1
- validmind/datasets/classification/customer_churn.py +7 -5
- validmind/datasets/nlp/__init__.py +2 -2
- validmind/errors.py +6 -10
- validmind/html_templates/content_blocks.py +18 -16
- validmind/logging.py +21 -16
- validmind/tests/__init__.py +28 -5
- validmind/tests/__types__.py +186 -170
- validmind/tests/_store.py +7 -21
- validmind/tests/comparison.py +362 -0
- validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
- validmind/tests/data_validation/ADF.py +49 -83
- validmind/tests/data_validation/AutoAR.py +59 -96
- validmind/tests/data_validation/AutoMA.py +59 -96
- validmind/tests/data_validation/AutoStationarity.py +66 -114
- validmind/tests/data_validation/ClassImbalance.py +48 -117
- validmind/tests/data_validation/DatasetDescription.py +180 -209
- validmind/tests/data_validation/DatasetSplit.py +50 -75
- validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
- validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
- validmind/tests/data_validation/Duplicates.py +21 -90
- validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
- validmind/tests/data_validation/HighCardinality.py +32 -80
- validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
- validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
- validmind/tests/data_validation/IQROutliersTable.py +40 -80
- validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
- validmind/tests/data_validation/KPSS.py +33 -81
- validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
- validmind/tests/data_validation/MissingValues.py +17 -58
- validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
- validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
- validmind/tests/data_validation/RollingStatsPlot.py +50 -81
- validmind/tests/data_validation/SeasonalDecompose.py +102 -184
- validmind/tests/data_validation/Skewness.py +27 -64
- validmind/tests/data_validation/SpreadPlot.py +34 -57
- validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
- validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
- validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
- validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
- validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
- validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
- validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
- validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
- validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
- validmind/tests/data_validation/TooManyZeroValues.py +21 -70
- validmind/tests/data_validation/UniqueRows.py +23 -62
- validmind/tests/data_validation/WOEBinPlots.py +83 -109
- validmind/tests/data_validation/WOEBinTable.py +28 -69
- validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
- validmind/tests/data_validation/nlp/CommonWords.py +49 -57
- validmind/tests/data_validation/nlp/Hashtags.py +27 -49
- validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
- validmind/tests/data_validation/nlp/Mentions.py +32 -63
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
- validmind/tests/data_validation/nlp/Punctuations.py +63 -47
- validmind/tests/data_validation/nlp/Sentiment.py +4 -0
- validmind/tests/data_validation/nlp/StopWords.py +62 -91
- validmind/tests/data_validation/nlp/TextDescription.py +116 -159
- validmind/tests/data_validation/nlp/Toxicity.py +12 -4
- validmind/tests/decorator.py +33 -242
- validmind/tests/load.py +212 -153
- validmind/tests/model_validation/BertScore.py +13 -7
- validmind/tests/model_validation/BleuScore.py +4 -0
- validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
- validmind/tests/model_validation/ContextualRecall.py +3 -0
- validmind/tests/model_validation/FeaturesAUC.py +43 -74
- validmind/tests/model_validation/MeteorScore.py +3 -0
- validmind/tests/model_validation/RegardScore.py +5 -1
- validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
- validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
- validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
- validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
- validmind/tests/model_validation/embeddings/utils.py +53 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
- validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
- validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
- validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
- validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
- validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
- validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
- validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
- validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
- validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
- validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
- validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
- validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
- validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
- validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
- validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
- validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
- validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
- validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
- validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
- validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
- validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
- validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
- validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
- validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
- validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
- validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
- validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
- validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
- validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
- validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
- validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
- validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
- validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
- validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
- validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
- validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
- validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
- validmind/tests/output.py +120 -0
- validmind/tests/prompt_validation/Bias.py +55 -98
- validmind/tests/prompt_validation/Clarity.py +56 -99
- validmind/tests/prompt_validation/Conciseness.py +63 -101
- validmind/tests/prompt_validation/Delimitation.py +48 -89
- validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
- validmind/tests/prompt_validation/Robustness.py +80 -121
- validmind/tests/prompt_validation/Specificity.py +61 -95
- validmind/tests/prompt_validation/ai_powered_test.py +2 -2
- validmind/tests/run.py +314 -496
- validmind/tests/test_providers.py +109 -79
- validmind/tests/utils.py +91 -0
- validmind/unit_metrics/__init__.py +16 -155
- validmind/unit_metrics/classification/F1.py +1 -0
- validmind/unit_metrics/classification/Precision.py +1 -0
- validmind/unit_metrics/classification/ROC_AUC.py +1 -0
- validmind/unit_metrics/classification/Recall.py +1 -0
- validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
- validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
- validmind/unit_metrics/regression/HuberLoss.py +1 -0
- validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
- validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
- validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
- validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
- validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
- validmind/unit_metrics/regression/QuantileLoss.py +1 -0
- validmind/unit_metrics/regression/RSquaredScore.py +2 -1
- validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
- validmind/utils.py +66 -17
- validmind/vm_models/__init__.py +2 -17
- validmind/vm_models/dataset/dataset.py +31 -4
- validmind/vm_models/figure.py +7 -37
- validmind/vm_models/model.py +3 -0
- validmind/vm_models/result/__init__.py +7 -0
- validmind/vm_models/result/result.jinja +21 -0
- validmind/vm_models/result/result.py +337 -0
- validmind/vm_models/result/utils.py +160 -0
- validmind/vm_models/test_suite/runner.py +16 -54
- validmind/vm_models/test_suite/summary.py +3 -3
- validmind/vm_models/test_suite/test.py +43 -77
- validmind/vm_models/test_suite/test_suite.py +8 -40
- validmind-2.6.7.dist-info/METADATA +137 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
- validmind/tests/data_validation/AutoSeasonality.py +0 -190
- validmind/tests/metadata.py +0 -59
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
- validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
- validmind/unit_metrics/composite.py +0 -238
- validmind/vm_models/test/metric.py +0 -98
- validmind/vm_models/test/metric_result.py +0 -61
- validmind/vm_models/test/output_template.py +0 -55
- validmind/vm_models/test/result_summary.py +0 -76
- validmind/vm_models/test/result_wrapper.py +0 -488
- validmind/vm_models/test/test.py +0 -103
- validmind/vm_models/test/threshold_test.py +0 -106
- validmind/vm_models/test/threshold_test_result.py +0 -75
- validmind/vm_models/test_context.py +0 -259
- validmind-2.5.24.dist-info/METADATA +0 -118
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
- {validmind-2.5.24.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0
validmind/tests/prompt_validation/Conciseness.py

@@ -2,19 +2,8 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from
-from typing import List
-
-import pandas as pd
-
+from validmind import tags, tasks
 from validmind.errors import MissingRequiredTestInputError
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)

 from .ai_powered_test import (
     call_model,
@@ -23,9 +12,49 @@ from .ai_powered_test import (
     missing_prompt_message,
 )

+SYSTEM = """
+You are a prompt evaluation AI.
+You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics.
+You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
+
+Consider the following documentation regarding conciseness in prompts and utilize it to grade the user-submitted prompt:
+'''
+While detailed prompts can guide an LLM towards accurate results, excessive details can clutter the instruction and potentially lead to undesired outputs.
+Concise prompts are straightforward, reducing ambiguity and focusing the LLM's attention on the primary task.
+This is especially important considering there are limitations to the length of prompts that can be fed to an LLM.
+
+For an LLM tasked with summarizing a document, a verbose prompt might introduce unnecessary constraints or biases.
+A concise, effective prompt like:
+"Provide a brief summary highlighting the main points of the document"
+ensures that the LLM captures the essence of the content without being sidetracked.
+
+For example this prompt:
+"The description for this product should be fairly short, a few sentences only, and not too much more."
+could be better written like this:
+"Use a 3 to 5 sentence paragraph to describe this product."
+'''
+
+Score the user-submitted prompt on a scale of 1 to 10, with 10 being the best possible score.
+Provide an explanation for your score.
+
+Response Format:
+```
+Score: <score>
+Explanation: <explanation>
+```
+"""

-
-
+USER = """
+Prompt:
+```
+{prompt_to_test}
+```
+"""
+
+
+@tags("llm", "zero_shot", "few_shot")
+@tasks("text_classification", "text_summarization")
+def Conciseness(model, min_threshold=7):
     """
     Analyzes and grades the conciseness of prompts provided to a Large Language Model.

@@ -62,90 +91,23 @@ class Conciseness(ThresholdTest):
     - The predefined threshold for conciseness could be subjective and might need adjustment based on application.
     - The test is dependent on the LLM's understanding of conciseness, which might vary from model to model.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-"
-
-
-
-
-could be better written like this:
-"Use a 3 to 5 sentence paragraph to describe this product."
-'''
-
-Score the user-submitted prompt on a scale of 1 to 10, with 10 being the best possible score. Provide an explanation for your score.
-
-Response Format:
-```
-Score: <score>
-Explanation: <explanation>
-```
-""".strip()
-    user_prompt = '''
-Prompt:
-"""
-{prompt_to_test}
-"""
-'''.strip()
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        result = results[0]
-        results_table = [
-            {
-                "Score": result.values["score"],
-                "Threshold": result.values["threshold"],
-                "Explanation": result.values["explanation"],
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-        ]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=pd.DataFrame(results_table),
-                    metadata=ResultTableMetadata(
-                        title="Conciseness Test for LLM Prompt",
-                    ),
-                )
-            ]
-        )
-
-    def run(self):
-        if not hasattr(self.inputs.model, "prompt"):
-            raise MissingRequiredTestInputError(missing_prompt_message)
-
-        response = call_model(
-            system_prompt=self.system_prompt,
-            user_prompt=self.user_prompt.format(
-                prompt_to_test=self.inputs.model.prompt.template
-            ),
-        )
-        score = get_score(response)
-        explanation = get_explanation(response)
-
-        passed = score > self.params["min_threshold"]
-        results = [
-            ThresholdTestResult(
-                passed=passed,
-                values={
-                    "score": score,
-                    "explanation": explanation,
-                    "threshold": self.params["min_threshold"],
-                },
-            )
-        ]
-
-        return self.cache_results(results, passed=passed)
+    if not hasattr(model, "prompt"):
+        raise MissingRequiredTestInputError(missing_prompt_message)
+
+    response = call_model(
+        system_prompt=SYSTEM,
+        user_prompt=USER.format(prompt_to_test=model.prompt.template),
+    )
+    score = get_score(response)
+    explanation = get_explanation(response)
+
+    passed = score > min_threshold
+
+    return [
+        {
+            "Score": score,
+            "Threshold": min_threshold,
+            "Explanation": explanation,
+            "Pass/Fail": "Pass" if passed else "Fail",
+        }
+    ], passed
validmind/tests/prompt_validation/Delimitation.py

@@ -2,19 +2,8 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from
-from typing import List
-
-import pandas as pd
-
+from validmind import tags, tasks
 from validmind.errors import MissingRequiredTestInputError
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)

 from .ai_powered_test import (
     call_model,
@@ -23,9 +12,34 @@ from .ai_powered_test import (
     missing_prompt_message,
 )

+SYSTEM = """
+You are a prompt evaluation AI.
+You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics.
+You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
+
+LLM Prompts that include different sections and user inputs should be properly delimitated.
+Ideally, the prompt should use triple quotes or backticks or at least single quotes around any user input, reference text or code block etc.
+This is to ensure that the prompt is parsed correctly by the model, different pieces of the prompt are understood as separate and any user-provided inputs are not interpreted as part of the prompt.
+Identify any issues in the user-submitted prompt and give a score from 1 to 10, where 10 is a perfect score, based on the number and severity of issues.
+
+Response Format:
+```
+Score: <score>
+Explanation: <explanation>
+```
+"""
+
+USER = """
+Prompt:
+'''
+{prompt_to_test}
+'''
+"""

-
-
+
+@tags("llm", "zero_shot", "few_shot")
+@tasks("text_classification", "text_summarization")
+def Delimitation(model, min_threshold=7):
     """
     Evaluates the proper use of delimiters in prompts provided to Large Language Models.

@@ -63,78 +77,23 @@ class Delimitation(ThresholdTest):
     - The preset score threshold may not be refined enough for complex tasks and prompts, requiring regular manual
     adjustment.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    user_prompt = '''
-Prompt:
-"""
-{prompt_to_test}
-"""
-'''.strip()
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        result = results[0]
-        results_table = [
-            {
-                "Score": result.values["score"],
-                "Threshold": result.values["threshold"],
-                "Explanation": result.values["explanation"],
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-        ]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=pd.DataFrame(results_table),
-                    metadata=ResultTableMetadata(
-                        title="Delimination Test for LLM Prompt",
-                    ),
-                )
-            ]
-        )
-
-    def run(self):
-        if not hasattr(self.inputs.model, "prompt"):
-            raise MissingRequiredTestInputError(missing_prompt_message)
-
-        response = call_model(
-            system_prompt=self.system_prompt,
-            user_prompt=self.user_prompt.format(
-                prompt_to_test=self.inputs.model.prompt.template
-            ),
-        )
-        score = get_score(response)
-        explanation = get_explanation(response)
-
-        passed = score > self.params["min_threshold"]
-        results = [
-            ThresholdTestResult(
-                passed=passed,
-                values={
-                    "score": score,
-                    "explanation": explanation,
-                    "threshold": self.params["min_threshold"],
-                },
-            )
-        ]
-
-        return self.cache_results(results, passed=passed)
+    if not hasattr(model, "prompt"):
+        raise MissingRequiredTestInputError(missing_prompt_message)
+
+    response = call_model(
+        system_prompt=SYSTEM,
+        user_prompt=USER.format(prompt_to_test=model.prompt.template),
+    )
+    score = get_score(response)
+    explanation = get_explanation(response)
+
+    passed = score > min_threshold
+
+    return [
+        {
+            "Score": score,
+            "Threshold": min_threshold,
+            "Explanation": explanation,
+            "Pass/Fail": "Pass" if passed else "Fail",
+        }
+    ], passed
validmind/tests/prompt_validation/NegativeInstruction.py

@@ -2,19 +2,8 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from
-from typing import List
-
-import pandas as pd
-
+from validmind import tags, tasks
 from validmind.errors import MissingRequiredTestInputError
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)

 from .ai_powered_test import (
     call_model,
@@ -23,9 +12,47 @@ from .ai_powered_test import (
     missing_prompt_message,
 )

+SYSTEM = """
+You are a prompt evaluation AI.
+You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics.
+You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
+
+Consider the following documentation regarding negative instructions in prompts and utilize it to grade the user-submitted prompt:
+'''
+Best practices for LLM prompt engineering suggest that positive instructions should be preferred over negative instructions.
+For example, instead of saying "Don't do X", it is better to say "Do Y".
+This is because the model is more likely to generate the desired output if it is given a positive instruction.
+Prompts that are phrased in the affirmative, emphasizing what to do, tend to direct the LLM more clearly than those that focus on what not to do.
+Negative instructions can lead to ambiguities and undesired model responses.
+By emphasizing clarity and proactive guidance, we optimize the chances of obtaining relevant and targeted responses from the LLM.
+
+Example:
+Consider a scenario involving a chatbot designed to recommend movies.
+An instruction framed as, "Don't recommend movies that are horror or thriller", might cause the LLM to fixate on the genres mentioned,inadvertently producing undesired results.
+On the other hand, a positively-framed prompt like, "Recommend family-friendly movies or romantic comedies" provides clear guidance on the desired output.
+'''
+
+Based on this best practice, please score the user-submitted prompt on a scale of 1-10, where 10 is a perfect score.
+Provide an explanation for your score.
+
+Response Format:
+```
+Score: <score>
+Explanation: <explanation>
+```
+"""
+
+USER = """
+Prompt:
+'''
+{prompt_to_test}
+'''
+"""
+

-@
-
+@tags("llm", "zero_shot", "few_shot")
+@tasks("text_classification", "text_summarization")
+def NegativeInstruction(model, min_threshold=7):
     """
     Evaluates and grades the use of affirmative, proactive language over negative instructions in LLM prompts.

@@ -68,85 +95,24 @@ class NegativeInstruction(ThresholdTest):
     - The effectiveness of the test hinges significantly on the predetermined threshold level, which can be subjective
     and may need to be adjusted according to specific use-cases.
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Response Format:
-```
-Score: <score>
-Explanation: <explanation>
-```
-""".strip()
-    user_prompt = '''
-Prompt:
-"""
-{prompt_to_test}
-"""
-'''.strip()
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        result = results[0]
-        results_table = [
-            {
-                "Score": result.values["score"],
-                "Threshold": result.values["threshold"],
-                "Explanation": result.values["explanation"],
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-        ]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=pd.DataFrame(results_table),
-                    metadata=ResultTableMetadata(
-                        title="Negative Instruction Test on Prompt",
-                    ),
-                )
-            ]
-        )
-
-    def run(self):
-        if not hasattr(self.inputs.model, "prompt"):
-            raise MissingRequiredTestInputError(missing_prompt_message)
-
-        response = call_model(
-            system_prompt=self.system_prompt,
-            user_prompt=self.user_prompt.format(
-                prompt_to_test=self.inputs.model.prompt.template
-            ),
-        )
-        score = get_score(response)
-        explanation = get_explanation(response)
-
-        passed = score > self.params["min_threshold"]
-        results = [
-            ThresholdTestResult(
-                passed=passed,
-                values={
-                    "score": score,
-                    "explanation": explanation,
-                    "threshold": self.params["min_threshold"],
-                },
-            )
-        ]
-
-        return self.cache_results(results, passed=passed)
+    if not hasattr(model, "prompt"):
+        raise MissingRequiredTestInputError(missing_prompt_message)
+
+    response = call_model(
+        system_prompt=SYSTEM,
+        user_prompt=USER.format(prompt_to_test=model.prompt.template),
+    )
+    score = get_score(response)
+    explanation = get_explanation(response)
+
+    passed = score > min_threshold
+    result = [
+        {
+            "Score": score,
+            "Threshold": min_threshold,
+            "Explanation": explanation,
+            "Pass/Fail": "Pass" if passed else "Fail",
+        }
+    ]
+
+    return result, passed
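Each prompt_validation diff above follows the same refactor: the ThresholdTest subclasses, with their summary()/run() methods and ResultSummary/ResultTable/ThresholdTestResult plumbing, become plain functions decorated with @tags and @tasks that return their result table and pass/fail flag directly. The sketch below is an illustrative guess at how the refactored Conciseness function could be exercised in isolation; the SimpleNamespace stand-in is hypothetical and only mimics the single attribute the test reads (model.prompt.template). In normal use the function is discovered and executed through the library's test harness (validmind/tests/load.py and run.py in the file list above), and call_model still requires an LLM connection to be configured.

# Illustrative sketch only, not part of the package. The stub object below
# stands in for a ValidMind model that exposes a prompt template; everything
# else the test needs comes from the refactored module itself.
from types import SimpleNamespace

from validmind.tests.prompt_validation.Conciseness import Conciseness

stub_model = SimpleNamespace(
    prompt=SimpleNamespace(
        template="Use a 3 to 5 sentence paragraph to describe this product."
    )
)

# The functional test now returns (table rows, passed) instead of caching a
# ThresholdTestResult on a test instance.
rows, passed = Conciseness(stub_model, min_threshold=7)
print(rows[0]["Score"], rows[0]["Pass/Fail"])

The framework-side counterpart of this change is visible in the file list: the validmind/vm_models/test/* result classes are removed, while a new validmind/vm_models/result package is added, presumably assembling result objects from the plain tables and booleans that tests now return.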