validmind 2.2.5__py3-none-any.whl → 2.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/{ai.py → ai/test_descriptions.py} +127 -69
- validmind/ai/utils.py +104 -0
- validmind/api_client.py +70 -31
- validmind/client.py +5 -5
- validmind/logging.py +38 -32
- validmind/models/foundation.py +10 -6
- validmind/models/function.py +3 -1
- validmind/models/metadata.py +1 -1
- validmind/test_suites/__init__.py +1 -7
- validmind/test_suites/regression.py +0 -16
- validmind/test_suites/statsmodels_timeseries.py +1 -1
- validmind/tests/data_validation/ACFandPACFPlot.py +36 -27
- validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py +42 -13
- validmind/tests/data_validation/BivariateScatterPlots.py +38 -41
- validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py +67 -11
- validmind/tests/data_validation/HeatmapFeatureCorrelations.py +1 -1
- validmind/tests/data_validation/HighPearsonCorrelation.py +12 -3
- validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
- validmind/tests/{model_validation/statsmodels → data_validation}/KPSS.py +64 -11
- validmind/tests/{model_validation/statsmodels → data_validation}/PhillipsPerronArch.py +65 -11
- validmind/tests/data_validation/ScatterPlot.py +1 -1
- validmind/tests/data_validation/SeasonalDecompose.py +12 -7
- validmind/tests/data_validation/TabularDateTimeHistograms.py +29 -33
- validmind/tests/data_validation/WOEBinPlots.py +1 -1
- validmind/tests/data_validation/WOEBinTable.py +1 -1
- validmind/tests/{model_validation/statsmodels → data_validation}/ZivotAndrewsArch.py +65 -11
- validmind/tests/data_validation/nlp/CommonWords.py +1 -1
- validmind/tests/data_validation/nlp/Hashtags.py +1 -1
- validmind/tests/data_validation/nlp/Mentions.py +1 -1
- validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -1
- validmind/tests/data_validation/nlp/Punctuations.py +1 -1
- validmind/tests/data_validation/nlp/Sentiment.py +1 -1
- validmind/tests/data_validation/nlp/TextDescription.py +5 -1
- validmind/tests/data_validation/nlp/Toxicity.py +1 -1
- validmind/tests/decorator.py +1 -1
- validmind/tests/model_validation/FeaturesAUC.py +5 -3
- validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +4 -0
- validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +4 -0
- validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +4 -0
- validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +4 -0
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +3 -3
- validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
- validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
- validmind/tests/model_validation/ragas/AspectCritique.py +14 -8
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextPrecision.py +4 -5
- validmind/tests/model_validation/ragas/ContextRecall.py +3 -4
- validmind/tests/model_validation/ragas/ContextRelevancy.py +5 -4
- validmind/tests/model_validation/ragas/Faithfulness.py +6 -5
- validmind/tests/model_validation/ragas/utils.py +35 -9
- validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
- validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +1 -1
- validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +6 -8
- validmind/tests/model_validation/sklearn/RegressionErrors.py +1 -1
- validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +14 -8
- validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
- validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -1
- validmind/tests/model_validation/statsmodels/GINITable.py +1 -1
- validmind/tests/model_validation/statsmodels/JarqueBera.py +1 -1
- validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +1 -1
- validmind/tests/model_validation/statsmodels/LJungBox.py +1 -1
- validmind/tests/model_validation/statsmodels/Lilliefors.py +1 -1
- validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +4 -0
- validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +9 -4
- validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -2
- validmind/tests/model_validation/statsmodels/RunsTest.py +1 -1
- validmind/tests/model_validation/statsmodels/ShapiroWilk.py +1 -1
- validmind/tests/prompt_validation/Bias.py +14 -11
- validmind/tests/prompt_validation/Clarity.py +14 -11
- validmind/tests/prompt_validation/Conciseness.py +14 -11
- validmind/tests/prompt_validation/Delimitation.py +14 -11
- validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
- validmind/tests/prompt_validation/Robustness.py +11 -11
- validmind/tests/prompt_validation/Specificity.py +14 -11
- validmind/tests/prompt_validation/ai_powered_test.py +53 -75
- validmind/unit_metrics/composite.py +2 -1
- validmind/utils.py +4 -49
- validmind/vm_models/dataset/dataset.py +17 -3
- validmind/vm_models/dataset/utils.py +2 -2
- validmind/vm_models/model.py +1 -1
- validmind/vm_models/test/metric.py +1 -8
- validmind/vm_models/test/result_wrapper.py +27 -34
- validmind/vm_models/test/test.py +3 -0
- validmind/vm_models/test/threshold_test.py +1 -1
- validmind/vm_models/test_suite/runner.py +12 -6
- validmind/vm_models/test_suite/summary.py +18 -7
- validmind/vm_models/test_suite/test.py +13 -20
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/METADATA +1 -1
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/RECORD +95 -104
- validmind/tests/data_validation/DefaultRatesbyRiskBandPlot.py +0 -114
- validmind/tests/data_validation/PiTCreditScoresHistogram.py +0 -150
- validmind/tests/data_validation/PiTPDHistogram.py +0 -152
- validmind/tests/model_validation/statsmodels/ADFTest.py +0 -88
- validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py +0 -198
- validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py +0 -151
- validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +0 -146
- validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +0 -144
- validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +0 -127
- validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +0 -130
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/LICENSE +0 -0
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/WHEEL +0 -0
- {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/entry_points.txt +0 -0
validmind/tests/prompt_validation/Bias.py CHANGED
@@ -7,6 +7,7 @@ from typing import List

 import pandas as pd

+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )

-from .ai_powered_test import
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)


 @dataclass
-class Bias(ThresholdTest
+class Bias(ThresholdTest):
     """
     Evaluates bias in a Large Language Model based on the order and distribution of exemplars in a prompt.

@@ -103,12 +109,6 @@ Prompt:
 """
 '''.strip()

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -132,14 +132,17 @@ Prompt:
         )

     def run(self):
-
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
             ),
         )
-        score =
-        explanation =
+        score = get_score(response)
+        explanation = get_explanation(response)

         passed = score > self.params["min_threshold"]
         results = [
validmind/tests/prompt_validation/Clarity.py CHANGED
@@ -7,6 +7,7 @@ from typing import List

 import pandas as pd

+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )

-from .ai_powered_test import
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)


 @dataclass
-class Clarity(ThresholdTest
+class Clarity(ThresholdTest):
     """
     Evaluates and scores the clarity of prompts in a Large Language Model based on specified guidelines.

@@ -93,12 +99,6 @@ Prompt:
 """
 '''.strip()

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -122,14 +122,17 @@ Prompt:
         )

     def run(self):
-
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
             ),
         )
-        score =
-        explanation =
+        score = get_score(response)
+        explanation = get_explanation(response)

         passed = score > self.params["min_threshold"]
         results = [
validmind/tests/prompt_validation/Conciseness.py CHANGED
@@ -7,6 +7,7 @@ from typing import List

 import pandas as pd

+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )

-from .ai_powered_test import
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)


 @dataclass
-class Conciseness(ThresholdTest
+class Conciseness(ThresholdTest):
     """
     Analyzes and grades the conciseness of prompts provided to a Large Language Model.

@@ -95,12 +101,6 @@ Prompt:
 """
 '''.strip()

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -124,14 +124,17 @@ Prompt:
         )

     def run(self):
-
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
             ),
         )
-        score =
-        explanation =
+        score = get_score(response)
+        explanation = get_explanation(response)

         passed = score > self.params["min_threshold"]
         results = [
validmind/tests/prompt_validation/Delimitation.py CHANGED
@@ -7,6 +7,7 @@ from typing import List

 import pandas as pd

+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )

-from .ai_powered_test import
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)


 @dataclass
-class Delimitation(ThresholdTest
+class Delimitation(ThresholdTest):
     """
     Evaluates the proper use of delimiters in prompts provided to Large Language Models.

@@ -85,12 +91,6 @@ Prompt:
 """
 '''.strip()

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -114,14 +114,17 @@ Prompt:
         )

     def run(self):
-
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
             ),
         )
-        score =
-        explanation =
+        score = get_score(response)
+        explanation = get_explanation(response)

         passed = score > self.params["min_threshold"]
         results = [
validmind/tests/prompt_validation/NegativeInstruction.py CHANGED
@@ -7,6 +7,7 @@ from typing import List

 import pandas as pd

+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )

-from .ai_powered_test import
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)


 @dataclass
-class NegativeInstruction(ThresholdTest
+class NegativeInstruction(ThresholdTest):
     """
     Evaluates and grades the use of affirmative, proactive language over negative instructions in LLM prompts.

@@ -96,12 +102,6 @@ Prompt:
 """
 '''.strip()

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -125,14 +125,17 @@ Prompt:
         )

     def run(self):
-
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
             ),
         )
-        score =
-        explanation =
+        score = get_score(response)
+        explanation = get_explanation(response)

         passed = score > self.params["min_threshold"]
         results = [
validmind/tests/prompt_validation/Robustness.py CHANGED
@@ -7,7 +7,7 @@ from typing import List

 import pandas as pd

-from validmind.errors import SkipTestError
+from validmind.errors import MissingRequiredTestInputError, SkipTestError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -16,11 +16,11 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )

-from .ai_powered_test import
+from .ai_powered_test import call_model, missing_prompt_message


 @dataclass
-class Robustness(ThresholdTest
+class Robustness(ThresholdTest):
     """
     Assesses the robustness of prompts provided to a Large Language Model under varying conditions and contexts.

@@ -94,12 +94,6 @@ Prompt:
 Input:
 '''.strip()

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         results_table = [
             {
@@ -122,8 +116,14 @@ Input:
         )

     def run(self):
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
         # TODO: add support for multi-variable prompts
-        if
+        if (
+            not self.inputs.model.prompt.variables
+            or len(self.inputs.model.prompt.variables) > 1
+        ):
             raise SkipTestError(
                 "Robustness only supports single-variable prompts for now"
             )
@@ -138,7 +138,7 @@ Input:
         results = []

         for _ in range(self.params["num_tests"]):
-            response =
+            response = call_model(
                 system_prompt=self.system_prompt,
                 user_prompt=self.user_prompt.format(
                     variables="\n".join(self.inputs.model.prompt.variables),
validmind/tests/prompt_validation/Specificity.py CHANGED
@@ -7,6 +7,7 @@ from typing import List

 import pandas as pd

+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )

-from .ai_powered_test import
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)


 @dataclass
-class Specificity(ThresholdTest
+class Specificity(ThresholdTest):
     """
     Evaluates and scores the specificity of prompts provided to a Large Language Model (LLM), based on clarity,
     detail, and relevance.
@@ -91,12 +97,6 @@ Prompt:
 """
 '''.strip()

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -120,14 +120,17 @@ Prompt:
         )

     def run(self):
-
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
             ),
         )
-        score =
-        explanation =
+        score = get_score(response)
+        explanation = get_explanation(response)

         passed = score > self.params["min_threshold"]
         results = [
validmind/tests/prompt_validation/ai_powered_test.py CHANGED
@@ -2,90 +2,68 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-import os
 import re

-from
+from validmind.ai.utils import get_client_and_model
+
+missing_prompt_message = """
+Cannot run prompt validation tests on a model with no prompt.
+You can set a prompt when creating a vm_model object like this:
+my_vm_model = vm.init_model(
+    predict_fn=call_model,
+    prompt=Prompt(
+        template="<your-prompt-here>",
+        variables=[],
+    ),
+    input_id="my_llm_model",
+)
+"""
+
+
+def call_model(
+    system_prompt: str, user_prompt: str, temperature: float = 0.0, seed: int = 42
+):
+    """Call LLM with the given prompts and return the response"""
+    client, model = get_client_and_model()
+
+    return (
+        client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt},
+            ],
+            temperature=temperature,
+            seed=seed,
+        )
+        .choices[0]
+        .message.content
+    )


-
-"""
-
-"""
+def get_score(response: str):
+    """Get just the score from the response string
+    TODO: use json response mode instead of this

-
-
-
-    model_name = None
-
-    def __init__(self, *args, **kwargs):
-        if "OPENAI_API_KEY" in os.environ:
-            self.client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
-            self.model_name = os.environ.get("VM_OPENAI_MODEL", "gpt-3.5-turbo")
-
-        elif "AZURE_OPENAI_KEY" in os.environ:
-            if "AZURE_OPENAI_ENDPOINT" not in os.environ:
-                raise ValueError(
-                    "AZURE_OPENAI_ENDPOINT must be set to run LLM tests with Azure"
-                )
-
-            if "AZURE_OPENAI_MODEL" not in os.environ:
-                raise ValueError(
-                    "AZURE_OPENAI_MODEL must be set to run LLM tests with Azure"
-                )
-
-            self.client = AzureOpenAI(
-                azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
-                api_key=os.environ.get("AZURE_OPENAI_KEY"),
-                api_version=os.environ.get("AZURE_OPENAI_VERSION", "2023-05-15"),
-            )
-            self.model_name = os.environ.get("AZURE_OPENAI_MODEL")
-
-        else:
-            raise ValueError(
-                "OPENAI_API_KEY or AZURE_OPENAI_KEY must be set to run LLM tests"
-            )
-
-    def call_model(self, user_prompt: str, system_prompt: str = None):
-        """
-        Call an LLM with the passed prompts and return the response. We're using GPT4 for now.
-        """
-        return (
-            self.client.chat.completions.create(
-                model=self.model_name,
-                messages=[
-                    {"role": "system", "content": system_prompt},
-                    {"role": "user", "content": user_prompt},
-                ],
-                temperature=0.0,
-                seed=42,
-            )
-            .choices[0]
-            .message.content
-        )
-
-    def get_score(self, response: str):
-        """
-        Get just the numeric data in the response string and convert it to an int
+    e.g. "Score: 8\nExplanation: <some-explanation>" -> 8
+    """
+    score = re.search(r"Score: (\d+)", response)

-
-""
-        score = re.search(r"Score: (\d+)", response)
+    if not score:
+        raise ValueError("Could not find score in response")

-
-        raise ValueError("Could not find score in response")
+    return int(score.group(1))

-        return int(score.group(1))

-
-
-
+def get_explanation(response: str):
+    """Get just the explanation from the response string
+    TODO: use json response mode instead of this

-
-
-
+    e.g. "Score: 8\nExplanation: <some-explanation>" -> "<some-explanation>"
+    """
+    explanation = re.search(r"Explanation: (.+)", response, re.DOTALL)

-
-
+    if not explanation:
+        raise ValueError("Could not find explanation in response")

-
+    return explanation.group(1).strip().strip("`")
validmind/unit_metrics/composite.py CHANGED
@@ -6,9 +6,10 @@ from dataclasses import dataclass
 from typing import List, Tuple, Union
 from uuid import uuid4

+from ..ai.test_descriptions import get_description_metadata
 from ..logging import get_logger
 from ..tests.decorator import _inspect_signature
-from ..utils import
+from ..utils import run_async, test_id_to_name
 from ..vm_models.test.metric import Metric
 from ..vm_models.test.metric_result import MetricResult
 from ..vm_models.test.result_summary import ResultSummary, ResultTable
validmind/utils.py CHANGED
@@ -6,7 +6,6 @@ import asyncio
 import difflib
 import json
 import math
-import os
 import re
 import sys
 from platform import python_version
@@ -26,8 +25,8 @@ from matplotlib.axes._axes import _log as matplotlib_axes_logger
 from numpy import ndarray
 from tabulate import tabulate

-from .ai import generate_description
 from .html_templates.content_blocks import math_jax_snippet, python_syntax_highlighting
+from .logging import get_logger

 DEFAULT_BIG_NUMBER_DECIMALS = 2
 DEFAULT_SMALL_NUMBER_DECIMALS = 4
@@ -50,6 +49,8 @@ params = {
 pylab.rcParams.update(params)
 #################################

+logger = get_logger(__name__)
+

 def is_notebook() -> bool:
     """
@@ -307,7 +308,7 @@ def run_async_check(func, *args, **kwargs):
             if task.get_name() == name:
                 return task

-        return run_async(func, name=name, *args, **kwargs)
+        return run_async(func, name=name, *args, **kwargs)  # noqa B026

     except RuntimeError:
         pass
@@ -457,49 +458,3 @@ def md_to_html(md: str, mathml=False) -> str:
     )

     return html
-
-
-def get_description_metadata(
-    test_id,
-    default_description,
-    summary=None,
-    figures=None,
-    prefix="metric_description",
-):
-    """Get Metadata Dictionary for a Test or Metric Result
-
-    Generates an LLM interpretation of the test results or uses the default
-    description and returns a metadata object that can be logged with the test results.
-
-    To enable LLM-generated descriptions, set the VALIDMIND_LLM_DESCRIPTIONS_ENABLED
-    environment variable to "true". The default description will be used if LLM
-    descriptions are disabled.
-
-    Note: Either the summary or figures must be provided to generate the description.
-
-    Args:
-        test_id (str): The test ID
-        default_description (str): The default description for the test
-        summary (Any): The test summary or results to interpret
-        figures (List[Figure]): The figures to attach to the test suite result
-        prefix (str): The prefix to use for the content ID (Default: "metric_description")
-
-    Returns:
-        dict: The metadata object to be logged with the test results
-    """
-    if os.environ.get("VALIDMIND_LLM_DESCRIPTIONS_ENABLED", "false").lower() == "true":
-        revision_name = "Generated by ValidMind AI"
-        description = generate_description(
-            test_name=test_id,
-            test_description=default_description,
-            test_summary=summary,
-            figures=figures,
-        )
-    else:
-        revision_name = "Default Description"
-        description = default_description
-
-    return {
-        "content_id": f"{prefix}:{test_id}::{revision_name}",
-        "text": description,
-    }