validmind 2.5.25__py3-none-any.whl → 2.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.8.dist-info/METADATA +137 -0
  179. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.25.dist-info/METADATA +0 -118
  196. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.25.dist-info → validmind-2.6.8.dist-info}/entry_points.txt +0 -0
validmind/tests/prompt_validation/Conciseness.py +63 -101

@@ -2,19 +2,8 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- from dataclasses import dataclass
- from typing import List
-
- import pandas as pd
-
+ from validmind import tags, tasks
  from validmind.errors import MissingRequiredTestInputError
- from validmind.vm_models import (
-     ResultSummary,
-     ResultTable,
-     ResultTableMetadata,
-     ThresholdTest,
-     ThresholdTestResult,
- )

  from .ai_powered_test import (
      call_model,
@@ -23,9 +12,49 @@ from .ai_powered_test import (
      missing_prompt_message,
  )

+ SYSTEM = """
+ You are a prompt evaluation AI.
+ You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics.
+ You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
+
+ Consider the following documentation regarding conciseness in prompts and utilize it to grade the user-submitted prompt:
+ '''
+ While detailed prompts can guide an LLM towards accurate results, excessive details can clutter the instruction and potentially lead to undesired outputs.
+ Concise prompts are straightforward, reducing ambiguity and focusing the LLM's attention on the primary task.
+ This is especially important considering there are limitations to the length of prompts that can be fed to an LLM.
+
+ For an LLM tasked with summarizing a document, a verbose prompt might introduce unnecessary constraints or biases.
+ A concise, effective prompt like:
+ "Provide a brief summary highlighting the main points of the document"
+ ensures that the LLM captures the essence of the content without being sidetracked.
+
+ For example this prompt:
+ "The description for this product should be fairly short, a few sentences only, and not too much more."
+ could be better written like this:
+ "Use a 3 to 5 sentence paragraph to describe this product."
+ '''
+
+ Score the user-submitted prompt on a scale of 1 to 10, with 10 being the best possible score.
+ Provide an explanation for your score.
+
+ Response Format:
+ ```
+ Score: <score>
+ Explanation: <explanation>
+ ```
+ """

- @dataclass
- class Conciseness(ThresholdTest):
+ USER = """
+ Prompt:
+ ```
+ {prompt_to_test}
+ ```
+ """
+
+
+ @tags("llm", "zero_shot", "few_shot")
+ @tasks("text_classification", "text_summarization")
+ def Conciseness(model, min_threshold=7):
      """
      Analyzes and grades the conciseness of prompts provided to a Large Language Model.

@@ -62,90 +91,23 @@ class Conciseness(ThresholdTest):
      - The predefined threshold for conciseness could be subjective and might need adjustment based on application.
      - The test is dependent on the LLM’s understanding of conciseness, which might vary from model to model.
      """
-
-     name = "conciseness"
-     required_inputs = ["model.prompt"]
-     default_params = {"min_threshold": 7}
-     tasks = ["text_classification", "text_summarization"]
-     tags = ["llm", "zero_shot", "few_shot"]
-
-     system_prompt = """
- You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
-
- Consider the following documentation regarding conciseness in prompts and utilize it to grade the user-submitted prompt:
- '''
- While detailed prompts can guide an LLM towards accurate results, excessive details can clutter the instruction and potentially lead to undesired outputs. Concise prompts are straightforward, reducing ambiguity and focusing the LLM's attention on the primary task. This is especially important considering there are limitations to the length of prompts that can be fed to an LLM.
-
- For an LLM tasked with summarizing a document, a verbose prompt might introduce unnecessary constraints or biases. A concise, effective prompt like:
- "Provide a brief summary highlighting the main points of the document"
- ensures that the LLM captures the essence of the content without being sidetracked.
-
- For example this prompt:
- "The description for this product should be fairly short, a few sentences only, and not too much more."
- could be better written like this:
- "Use a 3 to 5 sentence paragraph to describe this product."
- '''
-
- Score the user-submitted prompt on a scale of 1 to 10, with 10 being the best possible score. Provide an explanation for your score.
-
- Response Format:
- ```
- Score: <score>
- Explanation: <explanation>
- ```
-     """.strip()
-     user_prompt = '''
- Prompt:
- """
- {prompt_to_test}
- """
-     '''.strip()
-
-     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-         result = results[0]
-         results_table = [
-             {
-                 "Score": result.values["score"],
-                 "Threshold": result.values["threshold"],
-                 "Explanation": result.values["explanation"],
-                 "Pass/Fail": "Pass" if result.passed else "Fail",
-             }
-         ]
-
-         return ResultSummary(
-             results=[
-                 ResultTable(
-                     data=pd.DataFrame(results_table),
-                     metadata=ResultTableMetadata(
-                         title="Conciseness Test for LLM Prompt",
-                     ),
-                 )
-             ]
-         )
-
-     def run(self):
-         if not hasattr(self.inputs.model, "prompt"):
-             raise MissingRequiredTestInputError(missing_prompt_message)
-
-         response = call_model(
-             system_prompt=self.system_prompt,
-             user_prompt=self.user_prompt.format(
-                 prompt_to_test=self.inputs.model.prompt.template
-             ),
-         )
-         score = get_score(response)
-         explanation = get_explanation(response)
-
-         passed = score > self.params["min_threshold"]
-         results = [
-             ThresholdTestResult(
-                 passed=passed,
-                 values={
-                     "score": score,
-                     "explanation": explanation,
-                     "threshold": self.params["min_threshold"],
-                 },
-             )
-         ]
-
-         return self.cache_results(results, passed=passed)
+     if not hasattr(model, "prompt"):
+         raise MissingRequiredTestInputError(missing_prompt_message)
+
+     response = call_model(
+         system_prompt=SYSTEM,
+         user_prompt=USER.format(prompt_to_test=model.prompt.template),
+     )
+     score = get_score(response)
+     explanation = get_explanation(response)
+
+     passed = score > min_threshold
+
+     return [
+         {
+             "Score": score,
+             "Threshold": min_threshold,
+             "Explanation": explanation,
+             "Pass/Fail": "Pass" if passed else "Fail",
+         }
+     ], passed
validmind/tests/prompt_validation/Delimitation.py +48 -89

@@ -2,19 +2,8 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- from dataclasses import dataclass
- from typing import List
-
- import pandas as pd
-
+ from validmind import tags, tasks
  from validmind.errors import MissingRequiredTestInputError
- from validmind.vm_models import (
-     ResultSummary,
-     ResultTable,
-     ResultTableMetadata,
-     ThresholdTest,
-     ThresholdTestResult,
- )

  from .ai_powered_test import (
      call_model,
@@ -23,9 +12,34 @@ from .ai_powered_test import (
      missing_prompt_message,
  )

+ SYSTEM = """
+ You are a prompt evaluation AI.
+ You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics.
+ You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
+
+ LLM Prompts that include different sections and user inputs should be properly delimitated.
+ Ideally, the prompt should use triple quotes or backticks or at least single quotes around any user input, reference text or code block etc.
+ This is to ensure that the prompt is parsed correctly by the model, different pieces of the prompt are understood as separate and any user-provided inputs are not interpreted as part of the prompt.
+ Identify any issues in the user-submitted prompt and give a score from 1 to 10, where 10 is a perfect score, based on the number and severity of issues.
+
+ Response Format:
+ ```
+ Score: <score>
+ Explanation: <explanation>
+ ```
+ """
+
+ USER = """
+ Prompt:
+ '''
+ {prompt_to_test}
+ '''
+ """

- @dataclass
- class Delimitation(ThresholdTest):
+
+ @tags("llm", "zero_shot", "few_shot")
+ @tasks("text_classification", "text_summarization")
+ def Delimitation(model, min_threshold=7):
      """
      Evaluates the proper use of delimiters in prompts provided to Large Language Models.

@@ -63,78 +77,23 @@ class Delimitation(ThresholdTest):
      - The preset score threshold may not be refined enough for complex tasks and prompts, requiring regular manual
      adjustment.
      """
-
-     name = "delimitation"
-     required_inputs = ["model.prompt"]
-     default_params = {"min_threshold": 7}
-     tasks = ["text_classification", "text_summarization"]
-     tags = ["llm", "zero_shot", "few_shot"]
-
-     system_prompt = """
- You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
-
- LLM Prompts that include different sections and user inputs should be properly delimitated. Ideally, the prompt should use triple quotes or backticks or at least single quotes around any user input, reference text or code block etc.
- This is to ensure that the prompt is parsed correctly by the model, different pieces of the prompt are understood as separate and any user-provided inputs are not interpreted as part of the prompt.
- Identify any issues in the user-submitted prompt and give a score from 1 to 10, where 10 is a perfect score, based on the number and severity of issues.
-
- Response Format:
- ```
- Score: <score>
- Explanation: <explanation>
- ```
-     """.strip()
-     user_prompt = '''
- Prompt:
- """
- {prompt_to_test}
- """
-     '''.strip()
-
-     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-         result = results[0]
-         results_table = [
-             {
-                 "Score": result.values["score"],
-                 "Threshold": result.values["threshold"],
-                 "Explanation": result.values["explanation"],
-                 "Pass/Fail": "Pass" if result.passed else "Fail",
-             }
-         ]
-
-         return ResultSummary(
-             results=[
-                 ResultTable(
-                     data=pd.DataFrame(results_table),
-                     metadata=ResultTableMetadata(
-                         title="Delimination Test for LLM Prompt",
-                     ),
-                 )
-             ]
-         )
-
-     def run(self):
-         if not hasattr(self.inputs.model, "prompt"):
-             raise MissingRequiredTestInputError(missing_prompt_message)
-
-         response = call_model(
-             system_prompt=self.system_prompt,
-             user_prompt=self.user_prompt.format(
-                 prompt_to_test=self.inputs.model.prompt.template
-             ),
-         )
-         score = get_score(response)
-         explanation = get_explanation(response)
-
-         passed = score > self.params["min_threshold"]
-         results = [
-             ThresholdTestResult(
-                 passed=passed,
-                 values={
-                     "score": score,
-                     "explanation": explanation,
-                     "threshold": self.params["min_threshold"],
-                 },
-             )
-         ]
-
-         return self.cache_results(results, passed=passed)
+     if not hasattr(model, "prompt"):
+         raise MissingRequiredTestInputError(missing_prompt_message)
+
+     response = call_model(
+         system_prompt=SYSTEM,
+         user_prompt=USER.format(prompt_to_test=model.prompt.template),
+     )
+     score = get_score(response)
+     explanation = get_explanation(response)
+
+     passed = score > min_threshold
+
+     return [
+         {
+             "Score": score,
+             "Threshold": min_threshold,
+             "Explanation": explanation,
+             "Pass/Fail": "Pass" if passed else "Fail",
+         }
+     ], passed
validmind/tests/prompt_validation/NegativeInstruction.py +62 -96

@@ -2,19 +2,8 @@
  # See the LICENSE file in the root of this repository for details.
  # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

- from dataclasses import dataclass
- from typing import List
-
- import pandas as pd
-
+ from validmind import tags, tasks
  from validmind.errors import MissingRequiredTestInputError
- from validmind.vm_models import (
-     ResultSummary,
-     ResultTable,
-     ResultTableMetadata,
-     ThresholdTest,
-     ThresholdTestResult,
- )

  from .ai_powered_test import (
      call_model,
@@ -23,9 +12,47 @@ from .ai_powered_test import (
      missing_prompt_message,
  )

+ SYSTEM = """
+ You are a prompt evaluation AI.
+ You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics.
+ You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
+
+ Consider the following documentation regarding negative instructions in prompts and utilize it to grade the user-submitted prompt:
+ '''
+ Best practices for LLM prompt engineering suggest that positive instructions should be preferred over negative instructions.
+ For example, instead of saying "Don't do X", it is better to say "Do Y".
+ This is because the model is more likely to generate the desired output if it is given a positive instruction.
+ Prompts that are phrased in the affirmative, emphasizing what to do, tend to direct the LLM more clearly than those that focus on what not to do.
+ Negative instructions can lead to ambiguities and undesired model responses.
+ By emphasizing clarity and proactive guidance, we optimize the chances of obtaining relevant and targeted responses from the LLM.
+
+ Example:
+ Consider a scenario involving a chatbot designed to recommend movies.
+ An instruction framed as, "Don't recommend movies that are horror or thriller", might cause the LLM to fixate on the genres mentioned,inadvertently producing undesired results.
+ On the other hand, a positively-framed prompt like, "Recommend family-friendly movies or romantic comedies" provides clear guidance on the desired output.
+ '''
+
+ Based on this best practice, please score the user-submitted prompt on a scale of 1-10, where 10 is a perfect score.
+ Provide an explanation for your score.
+
+ Response Format:
+ ```
+ Score: <score>
+ Explanation: <explanation>
+ ```
+ """
+
+ USER = """
+ Prompt:
+ '''
+ {prompt_to_test}
+ '''
+ """
+

- @dataclass
- class NegativeInstruction(ThresholdTest):
+ @tags("llm", "zero_shot", "few_shot")
+ @tasks("text_classification", "text_summarization")
+ def NegativeInstruction(model, min_threshold=7):
      """
      Evaluates and grades the use of affirmative, proactive language over negative instructions in LLM prompts.

@@ -68,85 +95,24 @@ class NegativeInstruction(ThresholdTest):
      - The effectiveness of the test hinges significantly on the predetermined threshold level, which can be subjective
      and may need to be adjusted according to specific use-cases.
      """
-
-     name = "negative_instruction"
-     required_inputs = ["model.prompt"]
-     default_params = {"min_threshold": 7}
-     tasks = ["text_classification", "text_summarization"]
-     tags = ["llm", "zero_shot", "few_shot"]
-
-     system_prompt = """
- You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
-
- Consider the following documentation regarding negative instructions in prompts and utilize it to grade the user-submitted prompt:
- '''
- Best practices for LLM prompt engineering suggest that positive instructions should be preferred over negative instructions. For example, instead of saying "Don't do X", it is better to say "Do Y". This is because the model is more likely to generate the desired output if it is given a positive instruction.
- Prompts that are phrased in the affirmative, emphasizing what to do, tend to direct the LLM more clearly than those that focus on what not to do. Negative instructions can lead to ambiguities and undesired model responses. By emphasizing clarity and proactive guidance, we optimize the chances of obtaining relevant and targeted responses from the LLM.
- Example:
- Consider a scenario involving a chatbot designed to recommend movies. An instruction framed as, "Don't recommend movies that are horror or thriller" might cause the LLM to fixate on the genres mentioned, inadvertently producing undesired results. On the other hand, a positively-framed prompt like, "Recommend family-friendly movies or romantic comedies" provides clear guidance on the desired output.
- '''
-
- Based on this best practice, please score the user-submitted prompt on a scale of 1-10, where 10 is a perfect score.
- Provide an explanation for your score.
-
- Response Format:
- ```
- Score: <score>
- Explanation: <explanation>
- ```
-     """.strip()
-     user_prompt = '''
- Prompt:
- """
- {prompt_to_test}
- """
-     '''.strip()
-
-     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-         result = results[0]
-         results_table = [
-             {
-                 "Score": result.values["score"],
-                 "Threshold": result.values["threshold"],
-                 "Explanation": result.values["explanation"],
-                 "Pass/Fail": "Pass" if result.passed else "Fail",
-             }
-         ]
-
-         return ResultSummary(
-             results=[
-                 ResultTable(
-                     data=pd.DataFrame(results_table),
-                     metadata=ResultTableMetadata(
-                         title="Negative Instruction Test on Prompt",
-                     ),
-                 )
-             ]
-         )
-
-     def run(self):
-         if not hasattr(self.inputs.model, "prompt"):
-             raise MissingRequiredTestInputError(missing_prompt_message)
-
-         response = call_model(
-             system_prompt=self.system_prompt,
-             user_prompt=self.user_prompt.format(
-                 prompt_to_test=self.inputs.model.prompt.template
-             ),
-         )
-         score = get_score(response)
-         explanation = get_explanation(response)
-
-         passed = score > self.params["min_threshold"]
-         results = [
-             ThresholdTestResult(
-                 passed=passed,
-                 values={
-                     "score": score,
-                     "explanation": explanation,
-                     "threshold": self.params["min_threshold"],
-                 },
-             )
-         ]
-
-         return self.cache_results(results, passed=passed)
+     if not hasattr(model, "prompt"):
+         raise MissingRequiredTestInputError(missing_prompt_message)
+
+     response = call_model(
+         system_prompt=SYSTEM,
+         user_prompt=USER.format(prompt_to_test=model.prompt.template),
+     )
+     score = get_score(response)
+     explanation = get_explanation(response)
+
+     passed = score > min_threshold
+     result = [
+         {
+             "Score": score,
+             "Threshold": min_threshold,
+             "Explanation": explanation,
+             "Pass/Fail": "Pass" if passed else "Fail",
+         }
+     ]
+
+     return result, passed
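
All three diffs above show the same migration: the class-based ThresholdTest implementations (with name/required_inputs/default_params attributes, a run() method, and a summary() that built ResultSummary tables) are replaced by plain functions decorated with @tags and @tasks that return a results table plus a pass/fail flag. A minimal sketch of exercising the refactored Conciseness test directly is shown below; it assumes the tags/tasks decorators leave the function callable as-is and that an LLM backend is configured for call_model, and the SimpleNamespace stand-in for a ValidMind model input is hypothetical.

from types import SimpleNamespace

from validmind.tests.prompt_validation.Conciseness import Conciseness

# Hypothetical stand-in for a ValidMind model input that exposes prompt.template.
dummy_model = SimpleNamespace(
    prompt=SimpleNamespace(
        template="Use a 3 to 5 sentence paragraph to describe this product."
    )
)

# The refactored test returns a (results_table, passed) tuple instead of calling
# self.cache_results(...): one row with Score, Threshold, Explanation, Pass/Fail.
table, passed = Conciseness(dummy_model, min_threshold=7)
print(passed, table[0]["Score"], table[0]["Explanation"])

In the old API the same check ran through ThresholdTest.run() and surfaced its table via summary(); here the table is just a list of dicts returned alongside the passed flag, presumably rendered by the new validmind/vm_models/result module added in this release.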