validmind 2.2.5__py3-none-any.whl → 2.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. validmind/__version__.py +1 -1
  2. validmind/{ai.py → ai/test_descriptions.py} +127 -69
  3. validmind/ai/utils.py +104 -0
  4. validmind/api_client.py +70 -31
  5. validmind/client.py +5 -5
  6. validmind/logging.py +38 -32
  7. validmind/models/foundation.py +10 -6
  8. validmind/models/function.py +3 -1
  9. validmind/models/metadata.py +1 -1
  10. validmind/test_suites/__init__.py +1 -7
  11. validmind/test_suites/regression.py +0 -16
  12. validmind/test_suites/statsmodels_timeseries.py +1 -1
  13. validmind/tests/data_validation/ACFandPACFPlot.py +36 -27
  14. validmind/tests/{model_validation/statsmodels → data_validation}/ADF.py +42 -13
  15. validmind/tests/data_validation/BivariateScatterPlots.py +38 -41
  16. validmind/tests/{model_validation/statsmodels → data_validation}/DFGLSArch.py +67 -11
  17. validmind/tests/data_validation/HeatmapFeatureCorrelations.py +1 -1
  18. validmind/tests/data_validation/HighPearsonCorrelation.py +12 -3
  19. validmind/tests/data_validation/IsolationForestOutliers.py +2 -2
  20. validmind/tests/{model_validation/statsmodels → data_validation}/KPSS.py +64 -11
  21. validmind/tests/{model_validation/statsmodels → data_validation}/PhillipsPerronArch.py +65 -11
  22. validmind/tests/data_validation/ScatterPlot.py +1 -1
  23. validmind/tests/data_validation/SeasonalDecompose.py +12 -7
  24. validmind/tests/data_validation/TabularDateTimeHistograms.py +29 -33
  25. validmind/tests/data_validation/WOEBinPlots.py +1 -1
  26. validmind/tests/data_validation/WOEBinTable.py +1 -1
  27. validmind/tests/{model_validation/statsmodels → data_validation}/ZivotAndrewsArch.py +65 -11
  28. validmind/tests/data_validation/nlp/CommonWords.py +1 -1
  29. validmind/tests/data_validation/nlp/Hashtags.py +1 -1
  30. validmind/tests/data_validation/nlp/Mentions.py +1 -1
  31. validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +2 -1
  32. validmind/tests/data_validation/nlp/Punctuations.py +1 -1
  33. validmind/tests/data_validation/nlp/Sentiment.py +1 -1
  34. validmind/tests/data_validation/nlp/TextDescription.py +5 -1
  35. validmind/tests/data_validation/nlp/Toxicity.py +1 -1
  36. validmind/tests/decorator.py +1 -1
  37. validmind/tests/model_validation/FeaturesAUC.py +5 -3
  38. validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +4 -0
  39. validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +4 -0
  40. validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +4 -0
  41. validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +4 -0
  42. validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +4 -0
  43. validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +4 -0
  44. validmind/tests/model_validation/ragas/AnswerCorrectness.py +3 -3
  45. validmind/tests/model_validation/ragas/AnswerRelevance.py +5 -4
  46. validmind/tests/model_validation/ragas/AnswerSimilarity.py +5 -4
  47. validmind/tests/model_validation/ragas/AspectCritique.py +14 -8
  48. validmind/tests/model_validation/ragas/ContextEntityRecall.py +3 -4
  49. validmind/tests/model_validation/ragas/ContextPrecision.py +4 -5
  50. validmind/tests/model_validation/ragas/ContextRecall.py +3 -4
  51. validmind/tests/model_validation/ragas/ContextRelevancy.py +5 -4
  52. validmind/tests/model_validation/ragas/Faithfulness.py +6 -5
  53. validmind/tests/model_validation/ragas/utils.py +35 -9
  54. validmind/tests/model_validation/sklearn/ClusterPerformance.py +2 -2
  55. validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +1 -1
  56. validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +6 -8
  57. validmind/tests/model_validation/sklearn/RegressionErrors.py +1 -1
  58. validmind/tests/model_validation/sklearn/RegressionModelsPerformanceComparison.py +14 -8
  59. validmind/tests/model_validation/sklearn/RegressionR2Square.py +1 -1
  60. validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +1 -1
  61. validmind/tests/model_validation/statsmodels/GINITable.py +1 -1
  62. validmind/tests/model_validation/statsmodels/JarqueBera.py +1 -1
  63. validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +1 -1
  64. validmind/tests/model_validation/statsmodels/LJungBox.py +1 -1
  65. validmind/tests/model_validation/statsmodels/Lilliefors.py +1 -1
  66. validmind/tests/model_validation/statsmodels/RegressionCoeffsPlot.py +4 -0
  67. validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +9 -4
  68. validmind/tests/model_validation/statsmodels/RegressionModelsCoeffs.py +2 -2
  69. validmind/tests/model_validation/statsmodels/RunsTest.py +1 -1
  70. validmind/tests/model_validation/statsmodels/ShapiroWilk.py +1 -1
  71. validmind/tests/prompt_validation/Bias.py +14 -11
  72. validmind/tests/prompt_validation/Clarity.py +14 -11
  73. validmind/tests/prompt_validation/Conciseness.py +14 -11
  74. validmind/tests/prompt_validation/Delimitation.py +14 -11
  75. validmind/tests/prompt_validation/NegativeInstruction.py +14 -11
  76. validmind/tests/prompt_validation/Robustness.py +11 -11
  77. validmind/tests/prompt_validation/Specificity.py +14 -11
  78. validmind/tests/prompt_validation/ai_powered_test.py +53 -75
  79. validmind/unit_metrics/composite.py +2 -1
  80. validmind/utils.py +4 -49
  81. validmind/vm_models/dataset/dataset.py +17 -3
  82. validmind/vm_models/dataset/utils.py +2 -2
  83. validmind/vm_models/model.py +1 -1
  84. validmind/vm_models/test/metric.py +1 -8
  85. validmind/vm_models/test/result_wrapper.py +27 -34
  86. validmind/vm_models/test/test.py +3 -0
  87. validmind/vm_models/test/threshold_test.py +1 -1
  88. validmind/vm_models/test_suite/runner.py +12 -6
  89. validmind/vm_models/test_suite/summary.py +18 -7
  90. validmind/vm_models/test_suite/test.py +13 -20
  91. {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/METADATA +1 -1
  92. {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/RECORD +95 -104
  93. validmind/tests/data_validation/DefaultRatesbyRiskBandPlot.py +0 -114
  94. validmind/tests/data_validation/PiTCreditScoresHistogram.py +0 -150
  95. validmind/tests/data_validation/PiTPDHistogram.py +0 -152
  96. validmind/tests/model_validation/statsmodels/ADFTest.py +0 -88
  97. validmind/tests/model_validation/statsmodels/FeatureImportanceAndSignificance.py +0 -198
  98. validmind/tests/model_validation/statsmodels/PDRatingClassPlot.py +0 -151
  99. validmind/tests/model_validation/statsmodels/RegressionModelInsampleComparison.py +0 -146
  100. validmind/tests/model_validation/statsmodels/RegressionModelOutsampleComparison.py +0 -144
  101. validmind/tests/model_validation/statsmodels/RegressionModelsPerformance.py +0 -127
  102. validmind/tests/model_validation/statsmodels/ResidualsVisualInspection.py +0 -130
  103. {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/LICENSE +0 -0
  104. {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/WHEEL +0 -0
  105. {validmind-2.2.5.dist-info → validmind-2.3.1.dist-info}/entry_points.txt +0 -0
validmind/tests/prompt_validation/Bias.py CHANGED
@@ -7,6 +7,7 @@ from typing import List
 
 import pandas as pd
 
+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )
 
-from .ai_powered_test import AIPoweredTest
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)
 
 
 @dataclass
-class Bias(ThresholdTest, AIPoweredTest):
+class Bias(ThresholdTest):
     """
     Evaluates bias in a Large Language Model based on the order and distribution of exemplars in a prompt.
 
@@ -103,12 +109,6 @@ Prompt:
 """
 '''.strip()
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -132,14 +132,17 @@ Prompt:
         )
 
     def run(self):
-        response = self.call_model(
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
             ),
         )
-        score = self.get_score(response)
-        explanation = self.get_explanation(response)
+        score = get_score(response)
+        explanation = get_explanation(response)
 
         passed = score > self.params["min_threshold"]
         results = [
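
Note: the same refactor is applied to each prompt validation test below (Clarity, Conciseness, Delimitation, NegativeInstruction, Robustness, Specificity): the AIPoweredTest mixin is dropped in favor of module-level helpers, and a model without a prompt now fails fast with MissingRequiredTestInputError. A minimal usage sketch follows; the import path for Prompt and the exact run_test test ID are assumptions not confirmed by this diff, while the init_model call mirrors the snippet embedded in missing_prompt_message.

import validmind as vm
from validmind.models import Prompt  # assumed import path for the Prompt dataclass
from validmind.tests import run_test

def call_llm(input_text):
    return "stub response"  # stand-in for a real inference function

vm_model = vm.init_model(
    predict_fn=call_llm,
    prompt=Prompt(
        template="Summarize the following text: {text}",
        variables=["text"],
    ),
    input_id="my_llm_model",
)

# Omitting prompt= above now raises MissingRequiredTestInputError with the guidance
# text from missing_prompt_message instead of a late attribute error inside run().
run_test("validmind.prompt_validation.Bias", inputs={"model": vm_model})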
validmind/tests/prompt_validation/Clarity.py CHANGED
@@ -7,6 +7,7 @@ from typing import List
 
 import pandas as pd
 
+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )
 
-from .ai_powered_test import AIPoweredTest
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)
 
 
 @dataclass
-class Clarity(ThresholdTest, AIPoweredTest):
+class Clarity(ThresholdTest):
     """
     Evaluates and scores the clarity of prompts in a Large Language Model based on specified guidelines.
 
@@ -93,12 +99,6 @@ Prompt:
 """
 '''.strip()
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -122,14 +122,17 @@ Prompt:
         )
 
     def run(self):
-        response = self.call_model(
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
            ),
         )
-        score = self.get_score(response)
-        explanation = self.get_explanation(response)
+        score = get_score(response)
+        explanation = get_explanation(response)
 
         passed = score > self.params["min_threshold"]
         results = [
validmind/tests/prompt_validation/Conciseness.py CHANGED
@@ -7,6 +7,7 @@ from typing import List
 
 import pandas as pd
 
+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )
 
-from .ai_powered_test import AIPoweredTest
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)
 
 
 @dataclass
-class Conciseness(ThresholdTest, AIPoweredTest):
+class Conciseness(ThresholdTest):
     """
     Analyzes and grades the conciseness of prompts provided to a Large Language Model.
 
@@ -95,12 +101,6 @@ Prompt:
 """
 '''.strip()
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -124,14 +124,17 @@ Prompt:
         )
 
     def run(self):
-        response = self.call_model(
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
            ),
         )
-        score = self.get_score(response)
-        explanation = self.get_explanation(response)
+        score = get_score(response)
+        explanation = get_explanation(response)
 
         passed = score > self.params["min_threshold"]
         results = [
validmind/tests/prompt_validation/Delimitation.py CHANGED
@@ -7,6 +7,7 @@ from typing import List
 
 import pandas as pd
 
+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )
 
-from .ai_powered_test import AIPoweredTest
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)
 
 
 @dataclass
-class Delimitation(ThresholdTest, AIPoweredTest):
+class Delimitation(ThresholdTest):
     """
     Evaluates the proper use of delimiters in prompts provided to Large Language Models.
 
@@ -85,12 +91,6 @@ Prompt:
 """
 '''.strip()
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -114,14 +114,17 @@ Prompt:
         )
 
     def run(self):
-        response = self.call_model(
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
            ),
         )
-        score = self.get_score(response)
-        explanation = self.get_explanation(response)
+        score = get_score(response)
+        explanation = get_explanation(response)
 
         passed = score > self.params["min_threshold"]
         results = [
validmind/tests/prompt_validation/NegativeInstruction.py CHANGED
@@ -7,6 +7,7 @@ from typing import List
 
 import pandas as pd
 
+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )
 
-from .ai_powered_test import AIPoweredTest
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)
 
 
 @dataclass
-class NegativeInstruction(ThresholdTest, AIPoweredTest):
+class NegativeInstruction(ThresholdTest):
     """
     Evaluates and grades the use of affirmative, proactive language over negative instructions in LLM prompts.
 
@@ -96,12 +102,6 @@ Prompt:
 """
 '''.strip()
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -125,14 +125,17 @@ Prompt:
         )
 
     def run(self):
-        response = self.call_model(
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
            ),
         )
-        score = self.get_score(response)
-        explanation = self.get_explanation(response)
+        score = get_score(response)
+        explanation = get_explanation(response)
 
         passed = score > self.params["min_threshold"]
         results = [
validmind/tests/prompt_validation/Robustness.py CHANGED
@@ -7,7 +7,7 @@ from typing import List
 
 import pandas as pd
 
-from validmind.errors import SkipTestError
+from validmind.errors import MissingRequiredTestInputError, SkipTestError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -16,11 +16,11 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )
 
-from .ai_powered_test import AIPoweredTest
+from .ai_powered_test import call_model, missing_prompt_message
 
 
 @dataclass
-class Robustness(ThresholdTest, AIPoweredTest):
+class Robustness(ThresholdTest):
     """
     Assesses the robustness of prompts provided to a Large Language Model under varying conditions and contexts.
 
@@ -94,12 +94,6 @@ Prompt:
 Input:
 '''.strip()
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         results_table = [
             {
@@ -122,8 +116,14 @@ Input:
         )
 
     def run(self):
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
         # TODO: add support for multi-variable prompts
-        if len(self.inputs.model.prompt.variables) > 1:
+        if (
+            not self.inputs.model.prompt.variables
+            or len(self.inputs.model.prompt.variables) > 1
+        ):
             raise SkipTestError(
                 "Robustness only supports single-variable prompts for now"
             )
@@ -138,7 +138,7 @@ Input:
         results = []
 
         for _ in range(self.params["num_tests"]):
-            response = self.call_model(
+            response = call_model(
                 system_prompt=self.system_prompt,
                 user_prompt=self.user_prompt.format(
                     variables="\n".join(self.inputs.model.prompt.variables),
validmind/tests/prompt_validation/Specificity.py CHANGED
@@ -7,6 +7,7 @@ from typing import List
 
 import pandas as pd
 
+from validmind.errors import MissingRequiredTestInputError
 from validmind.vm_models import (
     ResultSummary,
     ResultTable,
@@ -15,11 +16,16 @@ from validmind.vm_models import (
     ThresholdTestResult,
 )
 
-from .ai_powered_test import AIPoweredTest
+from .ai_powered_test import (
+    call_model,
+    get_explanation,
+    get_score,
+    missing_prompt_message,
+)
 
 
 @dataclass
-class Specificity(ThresholdTest, AIPoweredTest):
+class Specificity(ThresholdTest):
     """
     Evaluates and scores the specificity of prompts provided to a Large Language Model (LLM), based on clarity,
     detail, and relevance.
@@ -91,12 +97,6 @@ Prompt:
 """
 '''.strip()
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)  # Call ThresholdTest.__init__
-        AIPoweredTest.__init__(
-            self, *args, **kwargs
-        )  # Explicitly call AIPoweredTest.__init__
-
     def summary(self, results: List[ThresholdTestResult], all_passed: bool):
         result = results[0]
         results_table = [
@@ -120,14 +120,17 @@ Prompt:
         )
 
     def run(self):
-        response = self.call_model(
+        if not hasattr(self.inputs.model, "prompt"):
+            raise MissingRequiredTestInputError(missing_prompt_message)
+
+        response = call_model(
             system_prompt=self.system_prompt,
             user_prompt=self.user_prompt.format(
                 prompt_to_test=self.inputs.model.prompt.template
            ),
         )
-        score = self.get_score(response)
-        explanation = self.get_explanation(response)
+        score = get_score(response)
+        explanation = get_explanation(response)
 
         passed = score > self.params["min_threshold"]
         results = [
validmind/tests/prompt_validation/ai_powered_test.py CHANGED
@@ -2,90 +2,68 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-import os
 import re
 
-from openai import AzureOpenAI, OpenAI
+from validmind.ai.utils import get_client_and_model
+
+missing_prompt_message = """
+Cannot run prompt validation tests on a model with no prompt.
+You can set a prompt when creating a vm_model object like this:
+my_vm_model = vm.init_model(
+    predict_fn=call_model,
+    prompt=Prompt(
+        template="<your-prompt-here>",
+        variables=[],
+    ),
+    input_id="my_llm_model",
+)
+"""
+
+
+def call_model(
+    system_prompt: str, user_prompt: str, temperature: float = 0.0, seed: int = 42
+):
+    """Call LLM with the given prompts and return the response"""
+    client, model = get_client_and_model()
+
+    return (
+        client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt},
+            ],
+            temperature=temperature,
+            seed=seed,
+        )
+        .choices[0]
+        .message.content
+    )
 
 
-class AIPoweredTest:
-    """
-    Base class for tests powered by an LLM
-    """
+def get_score(response: str):
+    """Get just the score from the response string
+    TODO: use json response mode instead of this
 
-    api_key = None
-    client = None
-    endpoint = None
-    model_name = None
-
-    def __init__(self, *args, **kwargs):
-        if "OPENAI_API_KEY" in os.environ:
-            self.client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
-            self.model_name = os.environ.get("VM_OPENAI_MODEL", "gpt-3.5-turbo")
-
-        elif "AZURE_OPENAI_KEY" in os.environ:
-            if "AZURE_OPENAI_ENDPOINT" not in os.environ:
-                raise ValueError(
-                    "AZURE_OPENAI_ENDPOINT must be set to run LLM tests with Azure"
-                )
-
-            if "AZURE_OPENAI_MODEL" not in os.environ:
-                raise ValueError(
-                    "AZURE_OPENAI_MODEL must be set to run LLM tests with Azure"
-                )
-
-            self.client = AzureOpenAI(
-                azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
-                api_key=os.environ.get("AZURE_OPENAI_KEY"),
-                api_version=os.environ.get("AZURE_OPENAI_VERSION", "2023-05-15"),
-            )
-            self.model_name = os.environ.get("AZURE_OPENAI_MODEL")
-
-        else:
-            raise ValueError(
-                "OPENAI_API_KEY or AZURE_OPENAI_KEY must be set to run LLM tests"
-            )
-
-    def call_model(self, user_prompt: str, system_prompt: str = None):
-        """
-        Call an LLM with the passed prompts and return the response. We're using GPT4 for now.
-        """
-        return (
-            self.client.chat.completions.create(
-                model=self.model_name,
-                messages=[
-                    {"role": "system", "content": system_prompt},
-                    {"role": "user", "content": user_prompt},
-                ],
-                temperature=0.0,
-                seed=42,
-            )
-            .choices[0]
-            .message.content
-        )
-
-    def get_score(self, response: str):
-        """
-        Get just the numeric data in the response string and convert it to an int
+    e.g. "Score: 8\nExplanation: <some-explanation>" -> 8
+    """
+    score = re.search(r"Score: (\d+)", response)
 
-        e.g. "Score: 8\nExplanation: <some-explanation>" -> 8
-        """
-        score = re.search(r"Score: (\d+)", response)
+    if not score:
+        raise ValueError("Could not find score in response")
 
-        if not score:
-            raise ValueError("Could not find score in response")
+    return int(score.group(1))
 
-        return int(score.group(1))
 
-    def get_explanation(self, response: str):
-        """
-        Get just the explanation from the response string
+def get_explanation(response: str):
+    """Get just the explanation from the response string
+    TODO: use json response mode instead of this
 
-        e.g. "Score: 8\nExplanation: <some-explanation>" -> "<some-explanation>"
-        """
-        explanation = re.search(r"Explanation: (.+)", response, re.DOTALL)
+    e.g. "Score: 8\nExplanation: <some-explanation>" -> "<some-explanation>"
+    """
+    explanation = re.search(r"Explanation: (.+)", response, re.DOTALL)
 
-        if not explanation:
-            raise ValueError("Could not find explanation in response")
+    if not explanation:
+        raise ValueError("Could not find explanation in response")
 
-        return explanation.group(1)
+    return explanation.group(1).strip().strip("`")
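
The AIPoweredTest mixin is replaced by plain module-level functions, with client and model selection delegated to the new validmind/ai/utils.py (get_client_and_model). A rough sketch of how the helpers compose; how credentials are resolved (OpenAI vs. Azure environment variables) now lives inside get_client_and_model and is assumed rather than shown in this diff:

from validmind.tests.prompt_validation.ai_powered_test import (
    call_model,
    get_explanation,
    get_score,
)

response = call_model(
    system_prompt="You are a prompt reviewer. Reply in the form 'Score: <1-10>\nExplanation: <reason>'.",
    user_prompt="Rate this prompt: 'Summarize the following text: {text}'",
)
score = get_score(response)              # int parsed from "Score: N"
explanation = get_explanation(response)  # text after "Explanation:", now stripped of whitespace and backticks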
validmind/unit_metrics/composite.py CHANGED
@@ -6,9 +6,10 @@ from dataclasses import dataclass
 from typing import List, Tuple, Union
 from uuid import uuid4
 
+from ..ai.test_descriptions import get_description_metadata
 from ..logging import get_logger
 from ..tests.decorator import _inspect_signature
-from ..utils import get_description_metadata, run_async, test_id_to_name
+from ..utils import run_async, test_id_to_name
 from ..vm_models.test.metric import Metric
 from ..vm_models.test.metric_result import MetricResult
 from ..vm_models.test.result_summary import ResultSummary, ResultTable
validmind/utils.py CHANGED
@@ -6,7 +6,6 @@ import asyncio
 import difflib
 import json
 import math
-import os
 import re
 import sys
 from platform import python_version
@@ -26,8 +25,8 @@ from matplotlib.axes._axes import _log as matplotlib_axes_logger
 from numpy import ndarray
 from tabulate import tabulate
 
-from .ai import generate_description
 from .html_templates.content_blocks import math_jax_snippet, python_syntax_highlighting
+from .logging import get_logger
 
 DEFAULT_BIG_NUMBER_DECIMALS = 2
 DEFAULT_SMALL_NUMBER_DECIMALS = 4
@@ -50,6 +49,8 @@ params = {
 pylab.rcParams.update(params)
 #################################
 
+logger = get_logger(__name__)
+
 
 def is_notebook() -> bool:
     """
@@ -307,7 +308,7 @@ def run_async_check(func, *args, **kwargs):
             if task.get_name() == name:
                 return task
 
-        return run_async(func, name=name, *args, **kwargs)
+        return run_async(func, name=name, *args, **kwargs)  # noqa B026
 
     except RuntimeError:
         pass
@@ -457,49 +458,3 @@ def md_to_html(md: str, mathml=False) -> str:
     )
 
     return html
-
-
-def get_description_metadata(
-    test_id,
-    default_description,
-    summary=None,
-    figures=None,
-    prefix="metric_description",
-):
-    """Get Metadata Dictionary for a Test or Metric Result
-
-    Generates an LLM interpretation of the test results or uses the default
-    description and returns a metadata object that can be logged with the test results.
-
-    To enable LLM-generated descriptions, set the VALIDMIND_LLM_DESCRIPTIONS_ENABLED
-    environment variable to "true". The default description will be used if LLM
-    descriptions are disabled.
-
-    Note: Either the summary or figures must be provided to generate the description.
-
-    Args:
-        test_id (str): The test ID
-        default_description (str): The default description for the test
-        summary (Any): The test summary or results to interpret
-        figures (List[Figure]): The figures to attach to the test suite result
-        prefix (str): The prefix to use for the content ID (Default: "metric_description")
-
-    Returns:
-        dict: The metadata object to be logged with the test results
-    """
-    if os.environ.get("VALIDMIND_LLM_DESCRIPTIONS_ENABLED", "false").lower() == "true":
-        revision_name = "Generated by ValidMind AI"
-        description = generate_description(
-            test_name=test_id,
-            test_description=default_description,
-            test_summary=summary,
-            figures=figures,
-        )
-    else:
-        revision_name = "Default Description"
-        description = default_description
-
-    return {
-        "content_id": f"{prefix}:{test_id}::{revision_name}",
-        "text": description,
-    }
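
get_description_metadata moves out of validmind.utils into the new validmind.ai package (see the validmind/unit_metrics/composite.py import change above and file 2 in the listing, ai.py → ai/test_descriptions.py). A behavior sketch assuming the signature is unchanged at its new location; the test ID and summary values below are illustrative only:

import os
from validmind.ai.test_descriptions import get_description_metadata

os.environ["VALIDMIND_LLM_DESCRIPTIONS_ENABLED"] = "true"  # opt in to LLM-generated text

metadata = get_description_metadata(
    test_id="validmind.data_validation.ADF",                            # illustrative test ID
    default_description="Augmented Dickey-Fuller test for stationarity.",
    summary={"adf_statistic": -3.2, "p_value": 0.02},                    # illustrative values
)
# -> {"content_id": "metric_description:<test_id>::<revision name>", "text": "<description>"}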