validmind 2.5.25__py3-none-any.whl → 2.6.7__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to a supported public registry. It is provided for informational purposes only and reflects the package contents as they appear in that registry.
Files changed (198)
  1. validmind/__init__.py +8 -17
  2. validmind/__version__.py +1 -1
  3. validmind/ai/test_descriptions.py +66 -85
  4. validmind/ai/test_result_description/context.py +2 -2
  5. validmind/ai/utils.py +26 -1
  6. validmind/api_client.py +43 -79
  7. validmind/client.py +5 -7
  8. validmind/client_config.py +1 -1
  9. validmind/datasets/__init__.py +1 -1
  10. validmind/datasets/classification/customer_churn.py +7 -5
  11. validmind/datasets/nlp/__init__.py +2 -2
  12. validmind/errors.py +6 -10
  13. validmind/html_templates/content_blocks.py +18 -16
  14. validmind/logging.py +21 -16
  15. validmind/tests/__init__.py +28 -5
  16. validmind/tests/__types__.py +186 -170
  17. validmind/tests/_store.py +7 -21
  18. validmind/tests/comparison.py +362 -0
  19. validmind/tests/data_validation/ACFandPACFPlot.py +44 -73
  20. validmind/tests/data_validation/ADF.py +49 -83
  21. validmind/tests/data_validation/AutoAR.py +59 -96
  22. validmind/tests/data_validation/AutoMA.py +59 -96
  23. validmind/tests/data_validation/AutoStationarity.py +66 -114
  24. validmind/tests/data_validation/ClassImbalance.py +48 -117
  25. validmind/tests/data_validation/DatasetDescription.py +180 -209
  26. validmind/tests/data_validation/DatasetSplit.py +50 -75
  27. validmind/tests/data_validation/DescriptiveStatistics.py +59 -85
  28. validmind/tests/data_validation/{DFGLSArch.py → DickeyFullerGLS.py} +44 -76
  29. validmind/tests/data_validation/Duplicates.py +21 -90
  30. validmind/tests/data_validation/EngleGrangerCoint.py +53 -75
  31. validmind/tests/data_validation/HighCardinality.py +32 -80
  32. validmind/tests/data_validation/HighPearsonCorrelation.py +29 -97
  33. validmind/tests/data_validation/IQROutliersBarPlot.py +63 -94
  34. validmind/tests/data_validation/IQROutliersTable.py +40 -80
  35. validmind/tests/data_validation/IsolationForestOutliers.py +41 -63
  36. validmind/tests/data_validation/KPSS.py +33 -81
  37. validmind/tests/data_validation/LaggedCorrelationHeatmap.py +47 -95
  38. validmind/tests/data_validation/MissingValues.py +17 -58
  39. validmind/tests/data_validation/MissingValuesBarPlot.py +61 -87
  40. validmind/tests/data_validation/PhillipsPerronArch.py +56 -79
  41. validmind/tests/data_validation/RollingStatsPlot.py +50 -81
  42. validmind/tests/data_validation/SeasonalDecompose.py +102 -184
  43. validmind/tests/data_validation/Skewness.py +27 -64
  44. validmind/tests/data_validation/SpreadPlot.py +34 -57
  45. validmind/tests/data_validation/TabularCategoricalBarPlots.py +46 -65
  46. validmind/tests/data_validation/TabularDateTimeHistograms.py +23 -45
  47. validmind/tests/data_validation/TabularNumericalHistograms.py +27 -46
  48. validmind/tests/data_validation/TargetRateBarPlots.py +54 -93
  49. validmind/tests/data_validation/TimeSeriesFrequency.py +48 -133
  50. validmind/tests/data_validation/TimeSeriesHistogram.py +24 -3
  51. validmind/tests/data_validation/TimeSeriesLinePlot.py +29 -47
  52. validmind/tests/data_validation/TimeSeriesMissingValues.py +59 -135
  53. validmind/tests/data_validation/TimeSeriesOutliers.py +54 -171
  54. validmind/tests/data_validation/TooManyZeroValues.py +21 -70
  55. validmind/tests/data_validation/UniqueRows.py +23 -62
  56. validmind/tests/data_validation/WOEBinPlots.py +83 -109
  57. validmind/tests/data_validation/WOEBinTable.py +28 -69
  58. validmind/tests/data_validation/ZivotAndrewsArch.py +33 -75
  59. validmind/tests/data_validation/nlp/CommonWords.py +49 -57
  60. validmind/tests/data_validation/nlp/Hashtags.py +27 -49
  61. validmind/tests/data_validation/nlp/LanguageDetection.py +7 -13
  62. validmind/tests/data_validation/nlp/Mentions.py +32 -63
  63. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +89 -14
  64. validmind/tests/data_validation/nlp/Punctuations.py +63 -47
  65. validmind/tests/data_validation/nlp/Sentiment.py +4 -0
  66. validmind/tests/data_validation/nlp/StopWords.py +62 -91
  67. validmind/tests/data_validation/nlp/TextDescription.py +116 -159
  68. validmind/tests/data_validation/nlp/Toxicity.py +12 -4
  69. validmind/tests/decorator.py +33 -242
  70. validmind/tests/load.py +212 -153
  71. validmind/tests/model_validation/BertScore.py +13 -7
  72. validmind/tests/model_validation/BleuScore.py +4 -0
  73. validmind/tests/model_validation/ClusterSizeDistribution.py +24 -47
  74. validmind/tests/model_validation/ContextualRecall.py +3 -0
  75. validmind/tests/model_validation/FeaturesAUC.py +43 -74
  76. validmind/tests/model_validation/MeteorScore.py +3 -0
  77. validmind/tests/model_validation/RegardScore.py +5 -1
  78. validmind/tests/model_validation/RegressionResidualsPlot.py +54 -75
  79. validmind/tests/model_validation/embeddings/ClusterDistribution.py +10 -33
  80. validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +11 -29
  81. validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +19 -31
  82. validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +40 -49
  83. validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +29 -15
  84. validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +25 -11
  85. validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +28 -13
  86. validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +67 -38
  87. validmind/tests/model_validation/embeddings/utils.py +53 -0
  88. validmind/tests/model_validation/ragas/AnswerCorrectness.py +37 -32
  89. validmind/tests/model_validation/ragas/{AspectCritique.py → AspectCritic.py} +33 -27
  90. validmind/tests/model_validation/ragas/ContextEntityRecall.py +44 -41
  91. validmind/tests/model_validation/ragas/ContextPrecision.py +40 -35
  92. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +133 -0
  93. validmind/tests/model_validation/ragas/ContextRecall.py +40 -35
  94. validmind/tests/model_validation/ragas/Faithfulness.py +42 -30
  95. validmind/tests/model_validation/ragas/NoiseSensitivity.py +59 -35
  96. validmind/tests/model_validation/ragas/{AnswerRelevance.py → ResponseRelevancy.py} +52 -41
  97. validmind/tests/model_validation/ragas/{AnswerSimilarity.py → SemanticSimilarity.py} +39 -34
  98. validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +13 -16
  99. validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +13 -16
  100. validmind/tests/model_validation/sklearn/ClassifierPerformance.py +51 -89
  101. validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +31 -61
  102. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +118 -83
  103. validmind/tests/model_validation/sklearn/CompletenessScore.py +13 -16
  104. validmind/tests/model_validation/sklearn/ConfusionMatrix.py +62 -94
  105. validmind/tests/model_validation/sklearn/FeatureImportance.py +7 -8
  106. validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +12 -15
  107. validmind/tests/model_validation/sklearn/HomogeneityScore.py +12 -15
  108. validmind/tests/model_validation/sklearn/HyperParametersTuning.py +23 -53
  109. validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +60 -74
  110. validmind/tests/model_validation/sklearn/MinimumAccuracy.py +16 -84
  111. validmind/tests/model_validation/sklearn/MinimumF1Score.py +22 -72
  112. validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +29 -78
  113. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +52 -82
  114. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +51 -145
  115. validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +60 -78
  116. validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +130 -172
  117. validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +26 -55
  118. validmind/tests/model_validation/sklearn/ROCCurve.py +43 -77
  119. validmind/tests/model_validation/sklearn/RegressionPerformance.py +41 -94
  120. validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +47 -136
  121. validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +164 -208
  122. validmind/tests/model_validation/sklearn/SilhouettePlot.py +54 -99
  123. validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +50 -124
  124. validmind/tests/model_validation/sklearn/VMeasure.py +12 -15
  125. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +225 -281
  126. validmind/tests/model_validation/statsmodels/AutoARIMA.py +40 -45
  127. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +22 -47
  128. validmind/tests/model_validation/statsmodels/Lilliefors.py +17 -28
  129. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +37 -81
  130. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +37 -105
  131. validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +62 -166
  132. validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +57 -119
  133. validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +20 -57
  134. validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +47 -80
  135. validmind/tests/ongoing_monitoring/PredictionCorrelation.py +2 -0
  136. validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +4 -2
  137. validmind/tests/output.py +120 -0
  138. validmind/tests/prompt_validation/Bias.py +55 -98
  139. validmind/tests/prompt_validation/Clarity.py +56 -99
  140. validmind/tests/prompt_validation/Conciseness.py +63 -101
  141. validmind/tests/prompt_validation/Delimitation.py +48 -89
  142. validmind/tests/prompt_validation/NegativeInstruction.py +62 -96
  143. validmind/tests/prompt_validation/Robustness.py +80 -121
  144. validmind/tests/prompt_validation/Specificity.py +61 -95
  145. validmind/tests/prompt_validation/ai_powered_test.py +2 -2
  146. validmind/tests/run.py +314 -496
  147. validmind/tests/test_providers.py +109 -79
  148. validmind/tests/utils.py +91 -0
  149. validmind/unit_metrics/__init__.py +16 -155
  150. validmind/unit_metrics/classification/F1.py +1 -0
  151. validmind/unit_metrics/classification/Precision.py +1 -0
  152. validmind/unit_metrics/classification/ROC_AUC.py +1 -0
  153. validmind/unit_metrics/classification/Recall.py +1 -0
  154. validmind/unit_metrics/regression/AdjustedRSquaredScore.py +1 -0
  155. validmind/unit_metrics/regression/GiniCoefficient.py +1 -0
  156. validmind/unit_metrics/regression/HuberLoss.py +1 -0
  157. validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +1 -0
  158. validmind/unit_metrics/regression/MeanAbsoluteError.py +1 -0
  159. validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +1 -0
  160. validmind/unit_metrics/regression/MeanBiasDeviation.py +1 -0
  161. validmind/unit_metrics/regression/MeanSquaredError.py +1 -0
  162. validmind/unit_metrics/regression/QuantileLoss.py +1 -0
  163. validmind/unit_metrics/regression/RSquaredScore.py +2 -1
  164. validmind/unit_metrics/regression/RootMeanSquaredError.py +1 -0
  165. validmind/utils.py +66 -17
  166. validmind/vm_models/__init__.py +2 -17
  167. validmind/vm_models/dataset/dataset.py +31 -4
  168. validmind/vm_models/figure.py +7 -37
  169. validmind/vm_models/model.py +3 -0
  170. validmind/vm_models/result/__init__.py +7 -0
  171. validmind/vm_models/result/result.jinja +21 -0
  172. validmind/vm_models/result/result.py +337 -0
  173. validmind/vm_models/result/utils.py +160 -0
  174. validmind/vm_models/test_suite/runner.py +16 -54
  175. validmind/vm_models/test_suite/summary.py +3 -3
  176. validmind/vm_models/test_suite/test.py +43 -77
  177. validmind/vm_models/test_suite/test_suite.py +8 -40
  178. validmind-2.6.7.dist-info/METADATA +137 -0
  179. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/RECORD +182 -189
  180. validmind/tests/data_validation/AutoSeasonality.py +0 -190
  181. validmind/tests/metadata.py +0 -59
  182. validmind/tests/model_validation/embeddings/StabilityAnalysis.py +0 -176
  183. validmind/tests/model_validation/ragas/ContextUtilization.py +0 -161
  184. validmind/tests/model_validation/sklearn/ClusterPerformance.py +0 -80
  185. validmind/unit_metrics/composite.py +0 -238
  186. validmind/vm_models/test/metric.py +0 -98
  187. validmind/vm_models/test/metric_result.py +0 -61
  188. validmind/vm_models/test/output_template.py +0 -55
  189. validmind/vm_models/test/result_summary.py +0 -76
  190. validmind/vm_models/test/result_wrapper.py +0 -488
  191. validmind/vm_models/test/test.py +0 -103
  192. validmind/vm_models/test/threshold_test.py +0 -106
  193. validmind/vm_models/test/threshold_test_result.py +0 -75
  194. validmind/vm_models/test_context.py +0 -259
  195. validmind-2.5.25.dist-info/METADATA +0 -118
  196. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/LICENSE +0 -0
  197. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/WHEEL +0 -0
  198. {validmind-2.5.25.dist-info → validmind-2.6.7.dist-info}/entry_points.txt +0 -0

validmind/tests/prompt_validation/Robustness.py

@@ -2,33 +2,72 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-from typing import List
-
 import pandas as pd
 
+from validmind import tags, tasks
 from validmind.errors import MissingRequiredTestInputError, SkipTestError
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
 
 from .ai_powered_test import call_model, missing_prompt_message
 
+SYSTEM = """
+You are a prompt evaluation researcher AI who is tasked with testing the robustness of LLM prompts.
+
+Consider the following guidelines:
+'''
+LLM prompts are used to guide a model to generate specific outputs or solve specific tasks.
+These prompts can be more or less robust, meaning that they can be more or less susceptible to breaking especially when the output needs to be a specific type.
+A robust prompt ensures consistent performance and reduces the likelihood of unexpected or off-tangent outputs.
+This consistency is vital for applications where predictability and reliability of the LLM's response are paramount.
+'''
+
+Consider the user-submitted prompt template and generate an input for the variable in the template (denoted by brackets) that tests the robustness of the prompt.
+Contradictions, edge cases, typos, bad phrasing, distracting, complex or out-of-place words and phrases are just some of the strategies you can use when generating inputs.
+Be creative and think step-by-step how you would break the prompt.
+Then generate {num_tests} inputs for the user-submitted prompt template that would break the prompt.
+Each input should be different from the others.
+Each input should be retured as a new line in your response.
+Respond only with the values to be inserted into the prompt template and do not include quotes, explanations or any extra text.
+
+Example:
+
+User-provided prompt:
+```
+Analyse the following sentence and output its sentiment:
+\\{sentence\\}
+```
+
+Your response (generated inputs):
+```
+I am a happy cat
+You are a bad person
+My name is bob
+James' friend is really sick
+```
+"""
 
-@dataclass
-class Robustness(ThresholdTest):
+USER = """
+Prompt:
+```
+{prompt_to_test}
+```
+Input:
+"""
+
+
+@tags("llm", "zero_shot", "few_shot")
+@tasks("text_classification", "text_summarization")
+def Robustness(model, dataset, num_tests=10):
     """
-    Assesses the robustness of prompts provided to a Large Language Model under varying conditions and contexts.
+    Assesses the robustness of prompts provided to a Large Language Model under varying conditions and contexts. This test
+    specifically measures the model's ability to generate correct classifications with the given prompt even when the
+    inputs are edge cases or otherwise difficult to classify.
 
     ### Purpose
 
     The Robustness test is meant to evaluate the resilience and reliability of prompts provided to a Language Learning
     Model (LLM). The aim of this test is to guarantee that the prompts consistently generate accurate and expected
-    outputs, even in diverse or challenging scenarios.
+    outputs, even in diverse or challenging scenarios. This test is only applicable to LLM-powered text classification
+    tasks where the prompt has a single input variable.
 
     ### Test Mechanism
 
@@ -58,114 +97,34 @@ class Robustness(ThresholdTest):
     - The test may not account for all potential conditions or alterations that could show up in practical use
     scenarios.
     """
+    if not hasattr(model, "prompt"):
+        raise MissingRequiredTestInputError(missing_prompt_message)
 
-    name = "robustness"
-    required_inputs = ["model"]
-    default_params = {"num_tests": 10}
-    tasks = ["text_classification", "text_summarization"]
-    tags = ["llm", "zero_shot", "few_shot"]
-
-    system_prompt = '''
-    You are a prompt evaluation researcher AI who is tasked with testing the robustness of LLM prompts.
+    if not model.prompt.variables or len(model.prompt.variables) > 1:
+        raise SkipTestError("Robustness only supports single-variable prompts for now")
 
-    Consider the following guidelines:
-    ```
-    LLM prompts are used to guide a model to generate specific outputs or solve specific tasks. These prompts can be more or less robust, meaning that they can be more or less susceptible to breaking especially when the output needs to be a specific type.
-    A robust prompt ensures consistent performance and reduces the likelihood of unexpected or off-tangent outputs. This consistency is vital for applications where predictability and reliability of the LLM's response are paramount.
-    ```
-
-    Consider the user-submitted prompt template and generate an input for the variable in the template (denoted by brackets) that tests the robustness of the prompt.
-    Contradictions, edge cases, typos, bad phrasing, distracting, complex or out-of-place words and phrases are just some of the strategies you can use when generating inputs.
-    Be creative and think step-by-step how you would break the prompt. Then generate an input for the user-submitted prompt template that would break the prompt.
-    Respond only with the value to be inserted into the prompt template and do not include quotes, explanations or any extra text.
-    Example:
-    Prompt:
-    """
-    Analyse the following sentence and output its sentiment\n{sentence}
-    """
-    Input:
-    Nonsense string that has no sentiment
-    '''.strip()
-    user_prompt = '''
-    Prompt:
-    """
-    {prompt_to_test}
-    """
-    Input:
-    '''.strip()
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        results_table = [
-            {
-                "Test Input": result.values["input"],
-                "Model Output": result.values["output"],
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-            for result in results
-        ]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=pd.DataFrame(results_table),
-                    metadata=ResultTableMetadata(
-                        title="Robustness Test on Prompt",
-                    ),
-                )
-            ]
+    target_class_labels = dataset.target_classes()
+    if len(target_class_labels) > 10:
+        raise SkipTestError(
+            "Too many target classes to test robustness. Skipping test."
         )
 
-    def run(self):
-        if not hasattr(self.inputs.model, "prompt"):
-            raise MissingRequiredTestInputError(missing_prompt_message)
-
-        # TODO: add support for multi-variable prompts
-        if (
-            not self.inputs.model.prompt.variables
-            or len(self.inputs.model.prompt.variables) > 1
-        ):
-            raise SkipTestError(
-                "Robustness only supports single-variable prompts for now"
-            )
-
-        target_class_labels = self.inputs.dataset.target_classes()
-        # Guard against too many classes (maybe not a classification model)
-        if len(target_class_labels) > 10:
-            raise SkipTestError(
-                "Too many target classes to test robustness. Skipping test."
-            )
-
-        results = []
-
-        for _ in range(self.params["num_tests"]):
-            response = call_model(
-                system_prompt=self.system_prompt,
-                user_prompt=self.user_prompt.format(
-                    variables="\n".join(self.inputs.model.prompt.variables),
-                    prompt_to_test=self.inputs.model.prompt.template,
-                ),
-            )
-
-            test_input_df = pd.DataFrame(
-                [response],
-                columns=self.inputs.model.prompt.variables,
-            )
-            result = self.inputs.model.predict(test_input_df)[0]
-
-            fail = False
-            if result not in target_class_labels:
-                fail = True
-
-            results.append(
-                ThresholdTestResult(
-                    passed=not fail,
-                    values={
-                        "input": response,
-                        "output": result,
-                    },
-                )
-            )
-
-        return self.cache_results(
-            results, passed=all([result.passed for result in results])
-        )
+    generated_inputs = call_model(
+        system_prompt=SYSTEM.format(num_tests=num_tests),
+        user_prompt=USER.format(prompt_to_test=model.prompt.template),
+    ).split("\n")
+
+    responses = model.predict(
+        pd.DataFrame(generated_inputs, columns=model.prompt.variables)
+    )
+
+    results = [
+        {
+            "Generated Input": generated_input,
+            "Model Response": response,
+            "Pass/Fail": "Pass" if response in target_class_labels else "Fail",
+        }
+        for generated_input, response in zip(generated_inputs, responses)
+    ]
+
+    return results, all(result["Pass/Fail"] == "Pass" for result in results)
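
Both Robustness.py hunks above replace the class-based ThresholdTest (with its summary/run methods and default_params) with a plain decorated function whose inputs and parameters are supplied at run time. A minimal sketch of how the refactored test might be invoked, assuming run_test keeps its documented inputs/params signature and that vm_model and vm_dataset are placeholder objects previously registered with vm.init_model and vm.init_dataset:

import validmind as vm

# vm_model is assumed to expose a single-variable prompt and vm_dataset a
# classification target; both names are placeholders, not objects defined here.
result = vm.tests.run_test(
    "validmind.prompt_validation.Robustness",
    inputs={"model": vm_model, "dataset": vm_dataset},
    params={"num_tests": 5},  # overrides the num_tests=10 default in the new signature
)
result.log()  # assumed: the result object can still be logged to the ValidMind platform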

validmind/tests/prompt_validation/Specificity.py

@@ -2,19 +2,8 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from dataclasses import dataclass
-from typing import List
-
-import pandas as pd
-
+from validmind import tags, tasks
 from validmind.errors import MissingRequiredTestInputError
-from validmind.vm_models import (
-    ResultSummary,
-    ResultTable,
-    ResultTableMetadata,
-    ThresholdTest,
-    ThresholdTestResult,
-)
 
 from .ai_powered_test import (
     call_model,
@@ -23,9 +12,47 @@ from .ai_powered_test import (
     missing_prompt_message,
 )
 
+SYSTEM = """
+You are a prompt evaluation AI.
+You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics.
+You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
+
+Consider the following documentation regarding specificity in prompts and utilize it to grade the user-submitted prompt:
+```
+Prompts that are detailed and descriptive often yield better and more accurate results from an LLM.
+Rather than relying on specific keywords or tokens, it's crucial to have a well-structured and descriptive prompt.
+Including relevant examples within the prompt can be particularly effective, guiding the LLM to produce outputs in desired formats.
+However, it's essential to strike a balance. While prompts need to be detailed, they shouldn't be overloaded with unnecessary information.
+The emphasis should always be on relevancy and conciseness, considering there are limitations to how long a prompt can be.
+
+Example:
+Imagine wanting an LLM to extract specific details from a given text.
+A vague prompt might yield varied results.
+However, with a prompt like, "Extract the names of all characters and the cities they visited from the text", the LLM is guided more precisely towards the desired information extraction.
+```
+
+Score the specificity of the user-submitted prompt.
+Return a score from 1 to 10 where 10 is a perfect score.
+Also provide a short explanation for your score.
+
+Response Format:
+```
+Score: <score>
+Explanation: <explanation>
+```
+"""
+
+USER = """
+Prompt:
+'''
+{prompt_to_test}
+'''
+"""
 
-@dataclass
-class Specificity(ThresholdTest):
+
+@tags("llm", "zero_shot", "few_shot")
+@tasks("text_classification", "text_summarization")
+def Specificity(model, min_threshold=7):
     """
     Evaluates and scores the specificity of prompts provided to a Large Language Model (LLM), based on clarity, detail,
     and relevance.
@@ -64,84 +91,23 @@ class Specificity(ThresholdTest):
     - Striking a balance between specificity and verbosity can be challenging, as overly detailed prompts might confuse
     or mislead the model
     """
-
-    name = "specificity"
-    required_inputs = ["model.prompt"]
-    default_params = {"min_threshold": 7}
-    tasks = ["text_classification", "text_summarization"]
-    tags = ["llm", "zero_shot", "few_shot"]
-
-    system_prompt = """
-    You are a prompt evaluation AI. You are aware of all prompt engineering best practices and can score prompts based on how well they satisfy different metrics. You analyse the prompts step-by-step based on provided documentation and provide a score and an explanation for how you produced that score.
-
-    Consider the following documentation regarding specificity in prompts and utilize it to grade the user-submitted prompt:
-    ```
-    Prompts that are detailed and descriptive often yield better and more accurate results from an LLM. Rather than relying on specific keywords or tokens, it's crucial to have a well-structured and descriptive prompt. Including relevant examples within the prompt can be particularly effective, guiding the LLM to produce outputs in desired formats. However, it's essential to strike a balance. While prompts need to be detailed, they shouldn't be overloaded with unnecessary information. The emphasis should always be on relevancy and conciseness, considering there are limitations to how long a prompt can be.
-
-    Example:
-    Imagine wanting an LLM to extract specific details from a given text. A vague prompt might yield varied results. However, with a prompt like, "Extract the names of all characters and the cities they visited from the text", the LLM is guided more precisely towards the desired information extraction.
-    ```
-
-    Score the specificity of the user-submitted prompt. Return a score from 1 to 10 where 10 is a perfect score. Also provide a short explanation for your score
-
-    Response Format:
-    ```
-    Score: <score>
-    Explanation: <explanation>
-    ```
-    """.strip()
-    user_prompt = '''
-    Prompt:
-    """
-    {prompt_to_test}
-    """
-    '''.strip()
-
-    def summary(self, results: List[ThresholdTestResult], all_passed: bool):
-        result = results[0]
-        results_table = [
-            {
-                "Score": result.values["score"],
-                "Threshold": result.values["threshold"],
-                "Explanation": result.values["explanation"],
-                "Pass/Fail": "Pass" if result.passed else "Fail",
-            }
-        ]
-
-        return ResultSummary(
-            results=[
-                ResultTable(
-                    data=pd.DataFrame(results_table),
-                    metadata=ResultTableMetadata(
-                        title="Specificity Test for LLM Prompt",
-                    ),
-                )
-            ]
-        )
-
-    def run(self):
-        if not hasattr(self.inputs.model, "prompt"):
-            raise MissingRequiredTestInputError(missing_prompt_message)
-
-        response = call_model(
-            system_prompt=self.system_prompt,
-            user_prompt=self.user_prompt.format(
-                prompt_to_test=self.inputs.model.prompt.template
-            ),
-        )
-        score = get_score(response)
-        explanation = get_explanation(response)
-
-        passed = score > self.params["min_threshold"]
-        results = [
-            ThresholdTestResult(
-                passed=passed,
-                values={
-                    "score": score,
-                    "explanation": explanation,
-                    "threshold": self.params["min_threshold"],
-                },
-            )
-        ]
-
-        return self.cache_results(results, passed=passed)
+    if not hasattr(model, "prompt"):
+        raise MissingRequiredTestInputError(missing_prompt_message)
+
+    response = call_model(
+        system_prompt=SYSTEM,
+        user_prompt=USER.format(prompt_to_test=model.prompt.template),
+    )
+    score = get_score(response)
+    explanation = get_explanation(response)
+
+    passed = score > min_threshold
+
+    return [
+        {
+            "Score": score,
+            "Threshold": min_threshold,
+            "Explanation": explanation,
+            "Pass/Fail": "Pass" if passed else "Fail",
+        }
+    ], passed
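
The Specificity.py hunks follow the same migration: the summary and run methods collapse into a single function that returns its results table together with a pass/fail boolean. A hedged sketch of a custom test written against that same contract, registered through the vm.test decorator (the test ID, metric, and threshold below are illustrative and not part of the package):

import validmind as vm
from validmind import tags, tasks


@vm.test("my_custom_tests.PromptLength")  # hypothetical custom test ID
@tags("llm")
@tasks("text_classification")
def PromptLength(model, max_chars=2000):
    """Checks that the prompt template stays within a character budget."""
    length = len(model.prompt.template)
    passed = length <= max_chars

    # Returning (rows, passed) mirrors the tuple returned by the refactored
    # Robustness and Specificity tests above.
    return [
        {
            "Characters": length,
            "Threshold": max_chars,
            "Pass/Fail": "Pass" if passed else "Fail",
        }
    ], passed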

validmind/tests/prompt_validation/ai_powered_test.py

@@ -36,8 +36,8 @@ def call_model(
         client.chat.completions.create(
             model=model,
             messages=[
-                {"role": "system", "content": system_prompt},
-                {"role": "user", "content": user_prompt},
+                {"role": "system", "content": system_prompt.strip("\n").strip()},
+                {"role": "user", "content": user_prompt.strip("\n").strip()},
             ],
             temperature=temperature,
             seed=seed,
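
The two-line change to ai_powered_test.py strips the leading and trailing newlines that the new module-level SYSTEM and USER templates carry before the messages are sent to the chat completions endpoint. A standalone illustration of the effect (not code from the package):

SYSTEM = """
You are a prompt evaluation AI.
"""

# The triple-quoted template starts and ends with a newline; the new call site
# removes that surrounding whitespace before building the message content.
print(repr(SYSTEM.strip("\n").strip()))  # 'You are a prompt evaluation AI.'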