validmind 2.8.20__py3-none-any.whl → 2.8.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +3 -0
- validmind/__version__.py +1 -1
- validmind/ai/utils.py +89 -0
- validmind/api_client.py +4 -0
- validmind/client.py +3 -0
- validmind/experimental/__init__.py +0 -0
- validmind/experimental/agents.py +65 -0
- validmind/template.py +3 -2
- validmind/tests/data_validation/MutualInformation.py +14 -2
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -1
- validmind/tests/model_validation/ragas/AspectCritic.py +5 -1
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +5 -1
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -1
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +5 -1
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -1
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -1
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +3 -1
- validmind/tests/model_validation/ragas/ResponseRelevancy.py +6 -4
- validmind/tests/model_validation/ragas/SemanticSimilarity.py +5 -1
- validmind/tests/model_validation/ragas/utils.py +4 -24
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +11 -1
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +13 -0
- validmind/tests/prompt_validation/Bias.py +2 -1
- validmind/tests/prompt_validation/Clarity.py +2 -1
- validmind/tests/prompt_validation/Conciseness.py +2 -1
- validmind/tests/prompt_validation/Delimitation.py +2 -1
- validmind/tests/prompt_validation/NegativeInstruction.py +2 -1
- validmind/tests/prompt_validation/Robustness.py +3 -2
- validmind/tests/prompt_validation/Specificity.py +2 -1
- validmind/tests/prompt_validation/ai_powered_test.py +18 -17
- validmind/vm_models/dataset/dataset.py +64 -27
- validmind/vm_models/result/__init__.py +16 -2
- validmind/vm_models/result/result.py +127 -14
- {validmind-2.8.20.dist-info → validmind-2.8.26.dist-info}/METADATA +4 -3
- {validmind-2.8.20.dist-info → validmind-2.8.26.dist-info}/RECORD +38 -36
- {validmind-2.8.20.dist-info → validmind-2.8.26.dist-info}/WHEEL +1 -1
- {validmind-2.8.20.dist-info → validmind-2.8.26.dist-info}/LICENSE +0 -0
- {validmind-2.8.20.dist-info → validmind-2.8.26.dist-info}/entry_points.txt +0 -0
validmind/__init__.py
CHANGED
@@ -53,6 +53,7 @@ from .client import ( # noqa: E402
     run_documentation_tests,
     run_test_suite,
 )
+from .experimental import agents as experimental_agent
 from .tests.decorator import tags, tasks, test
 from .tests.run import print_env
 from .utils import is_notebook, parse_version
@@ -126,4 +127,6 @@ __all__ = [ # noqa
     "unit_metrics",
     "test_suites",
     "log_text",
+    # experimental features
+    "experimental_agent",
 ]
validmind/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = "2.8.20"
+__version__ = "2.8.25"
validmind/ai/utils.py
CHANGED
@@ -15,6 +15,10 @@ logger = get_logger(__name__)

 __client = None
 __model = None
+__judge_llm = None
+__judge_embeddings = None
+EMBEDDINGS_MODEL = "text-embedding-3-small"
+
 # can be None, True or False (ternary to represent initial state, ack and failed ack)
 __ack = None

@@ -105,6 +109,91 @@ def get_client_and_model():
     return __client, __model


+def get_judge_config(judge_llm=None, judge_embeddings=None):
+    try:
+        from langchain_core.embeddings import Embeddings
+        from langchain_core.language_models.chat_models import BaseChatModel
+        from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+
+        from validmind.models.function import FunctionModel
+    except ImportError:
+        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
+
+    if judge_llm is not None or judge_embeddings is not None:
+        if isinstance(judge_llm, FunctionModel) and judge_llm is not None:
+            if isinstance(judge_llm.model, BaseChatModel):
+                judge_llm = judge_llm.model
+            else:
+                raise ValueError(
+                    "The ValidMind FunctionModel provided does not have a LangChain-compatible LLM as its model attribute. "
+                    "To use the default ValidMind LLM, do not set the judge_llm/judge_embeddings parameters, "
+                    "ensure that you are connected to the ValidMind API, and confirm ValidMind AI is enabled for your account."
+                )
+        if isinstance(judge_embeddings, FunctionModel) and judge_embeddings is not None:
+            if isinstance(judge_embeddings.model, Embeddings):
+                judge_embeddings = judge_embeddings.model
+            else:
+                raise ValueError(
+                    "The ValidMind FunctionModel provided does not have a LangChain-compatible embeddings model as its model attribute. "
+                    "To use the default ValidMind embeddings, do not set the judge_embeddings parameter, "
+                    "ensure that you are connected to the ValidMind API, and confirm ValidMind AI is enabled for your account."
+                )
+
+        if (isinstance(judge_llm, BaseChatModel) or judge_llm is None) and (
+            isinstance(judge_embeddings, Embeddings) or judge_embeddings is None
+        ):
+            return judge_llm, judge_embeddings
+        else:
+            raise ValueError(
+                "Provided judge LLM/embeddings are not LangChain compatible. Ensure the judge LLM/embeddings provided are instances of "
+                "LangChain BaseChatModel and Embeddings. To use the default ValidMind LLM, do not set the judge_llm/judge_embeddings parameters, "
+                "ensure that you are connected to the ValidMind API, and confirm ValidMind AI is enabled for your account."
+            )
+
+    # grab default values if not passed at run time
+    global __judge_llm, __judge_embeddings
+    if __judge_llm and __judge_embeddings:
+        return __judge_llm, __judge_embeddings
+
+    client, model = get_client_and_model()
+    os.environ["OPENAI_API_BASE"] = str(client.base_url)
+
+    __judge_llm = ChatOpenAI(api_key=client.api_key, model=model)
+    __judge_embeddings = OpenAIEmbeddings(
+        api_key=client.api_key, model=EMBEDDINGS_MODEL
+    )
+
+    return __judge_llm, __judge_embeddings
+
+
+def set_judge_config(judge_llm, judge_embeddings):
+    global __judge_llm, __judge_embeddings
+    try:
+        from langchain_core.embeddings import Embeddings
+        from langchain_core.language_models.chat_models import BaseChatModel
+
+        from validmind.models.function import FunctionModel
+    except ImportError:
+        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
+    if isinstance(judge_llm, BaseChatModel) and isinstance(
+        judge_embeddings, Embeddings
+    ):
+        __judge_llm = judge_llm
+        __judge_embeddings = judge_embeddings
+    elif isinstance(judge_llm, FunctionModel) and isinstance(
+        judge_embeddings, FunctionModel
+    ):
+        __judge_llm = judge_llm.model
+        __judge_embeddings = judge_embeddings.model
+    else:
+        raise ValueError(
+            "Provided judge LLM/embeddings are not LangChain compatible. Ensure the judge LLM/embeddings provided are instances of "
+            "LangChain BaseChatModel and Embeddings. To use the default ValidMind LLM, do not set the judge_llm/judge_embeddings parameters, "
+            "ensure that you are connected to the ValidMind API, and confirm ValidMind AI is enabled for your account."
+        )
+
+
 def is_configured():
     global __ack

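The new judge configuration can be registered once and then reused by any LLM-as-judge test. A minimal sketch, assuming the `validmind[llm]` extra is installed and an OpenAI-compatible key is available (the model names are illustrative):

from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from validmind.ai.utils import get_judge_config, set_judge_config

# Register a custom judge globally; both objects must be LangChain-compatible
# (BaseChatModel / Embeddings), otherwise set_judge_config raises ValueError.
set_judge_config(
    judge_llm=ChatOpenAI(model="gpt-4o-mini"),
    judge_embeddings=OpenAIEmbeddings(model="text-embedding-3-small"),
)

# Later calls without explicit arguments fall back to the registered judge,
# or to the ValidMind-hosted defaults if nothing was registered.
llm, embeddings = get_judge_config()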
validmind/api_client.py
CHANGED
@@ -448,6 +448,7 @@ async def alog_metric(
     params: Optional[Dict[str, Any]] = None,
     recorded_at: Optional[str] = None,
     thresholds: Optional[Dict[str, Any]] = None,
+    passed: Optional[bool] = None,
 ):
     """See log_metric for details."""
     if not key or not isinstance(key, str):
@@ -476,6 +477,7 @@ async def alog_metric(
             "params": params or {},
             "recorded_at": recorded_at,
             "thresholds": thresholds or {},
+            "passed": passed if passed is not None else None,
         },
         cls=NumpyEncoder,
         allow_nan=False,
@@ -493,6 +495,7 @@ def log_metric(
     params: Optional[Dict[str, Any]] = None,
     recorded_at: Optional[str] = None,
     thresholds: Optional[Dict[str, Any]] = None,
+    passed: Optional[bool] = None,
 ):
     """Logs a unit metric.

@@ -518,6 +521,7 @@ def log_metric(
         params=params,
         recorded_at=recorded_at,
         thresholds=thresholds,
+        passed=passed,
     )


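A minimal sketch of the new `passed` flag on unit-metric logging, assuming an active ValidMind connection (the metric key and threshold payload are illustrative):

from validmind.api_client import log_metric

accuracy = 0.87
threshold = 0.80

# Alongside thresholds, a metric can now carry an explicit pass/fail flag.
log_metric(
    key="accuracy",
    value=accuracy,
    thresholds={"min_accuracy": threshold},
    passed=accuracy >= threshold,
)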
validmind/client.py
CHANGED
@@ -61,6 +61,7 @@ def init_dataset(
     class_labels: Optional[Dict[str, Any]] = None,
     type: Optional[str] = None,
     input_id: Optional[str] = None,
+    copy_data: bool = True,
     __log: bool = True,
 ) -> VMDataset:
     """
@@ -92,6 +93,7 @@ def init_dataset(
             this will be set to `dataset` but if you are passing this dataset as a
             test input using some other key than `dataset`, then you should set
             this to the same key.
+        copy_data (bool, optional): Whether to copy the data. Defaults to True.
         __log (bool): Whether to log the input. Defaults to True.

     Raises:
@@ -121,6 +123,7 @@ def init_dataset(
             extra_columns=extra_columns,
             target_class_labels=class_labels,
             date_time_index=date_time_index,
+            copy_data=copy_data,
         )
     elif isinstance(dataset, pl.DataFrame):
         vm_dataset = PolarsDataset(
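A minimal sketch of the new `copy_data` option, which lets callers skip the defensive DataFrame copy for large datasets (the column names are illustrative):

import pandas as pd

import validmind as vm

df = pd.DataFrame({"age": [25, 40, 31], "income": [48000, 92000, 67000], "default": [0, 1, 0]})

# copy_data=False avoids duplicating a large frame in memory; the VMDataset then
# references the original object, so avoid mutating df after initialization.
vm_ds = vm.init_dataset(
    dataset=df,
    input_id="credit_training_dataset",
    target_column="default",
    copy_data=False,
)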
validmind/experimental/__init__.py
ADDED (file without changes)
validmind/experimental/agents.py
ADDED
@@ -0,0 +1,65 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+"""
+Agent interface for all text generation tasks
+"""
+
+import requests
+
+from validmind.api_client import _get_api_headers, _get_url, raise_api_error
+from validmind.utils import is_html, md_to_html
+from validmind.vm_models.result import TextGenerationResult
+
+
+def run_task(
+    task: str,
+    input: dict,
+    show: bool = True,
+) -> TextGenerationResult:
+    """
+    Run text generation tasks using AI models.
+
+    Args:
+        task (str): Type of text generation task to run. Currently supports:
+            - 'code_explainer': Generates natural language explanations of code
+        input (dict): Input parameters for the generation task:
+            - For code_explainer: Must contain 'source_code' and optional parameters
+        show (bool): Whether to display the generated result. Defaults to True.
+
+    Returns:
+        TextGenerationResult: Result object containing the generated text and metadata
+
+    Raises:
+        ValueError: If an unsupported task is provided
+        requests.exceptions.RequestException: If the API request fails
+    """
+    if task == "code_explainer" or task == "qualitative_text_generation":
+        r = requests.post(
+            url=_get_url(f"ai/generate/{task}"),
+            headers=_get_api_headers(),
+            json=input,
+        )
+
+        if r.status_code != 200:
+            raise_api_error(r.text)
+
+        generated_text = r.json()["content"]
+    else:
+        raise ValueError(f"Unsupported task: {task}")
+
+    if not is_html(generated_text):
+        generated_text = md_to_html(generated_text, mathml=True)
+
+    # Create a test result with the generated text
+    result = TextGenerationResult(
+        result_type=f"{task}",
+        description=generated_text,
+        title=f"Text Generation: {task}",
+        doc=f"Generated {task}",
+    )
+    if show:
+        result.show()
+
+    return result
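A minimal usage sketch of the new experimental agent interface, assuming `vm.init()` has already established an API connection (the code snippet passed in is illustrative):

import validmind as vm

source_code = '''
def monthly_payment(principal, annual_rate, months):
    rate = annual_rate / 12
    return principal * rate / (1 - (1 + rate) ** -months)
'''

# Sends the snippet to the ValidMind backend and returns a TextGenerationResult;
# show=True (the default) renders the explanation inline in a notebook.
result = vm.experimental_agent.run_task(
    task="code_explainer",
    input={"source_code": source_code},
    show=False,
)
print(result.description)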
validmind/template.py
CHANGED
@@ -53,8 +53,9 @@ def _convert_sections_to_section_tree(

     if start_section_id and not section_tree:
         raise ValueError(f"Section {start_section_id} not found in template")
-
-
+    # sort the section tree by the order of the sections in the template (if provided)
+    # set the order to 9999 for the sections that do not have an order
+    return sorted(section_tree, key=lambda x: x.get("order", 9999))


 def _create_content_widget(content: Dict[str, Any]) -> Widget:
validmind/tests/data_validation/MutualInformation.py
CHANGED
@@ -68,8 +68,20 @@ def MutualInformation(
     if task not in ["classification", "regression"]:
         raise ValueError("task must be either 'classification' or 'regression'")

-
-
+    # Check if numeric features exist
+    if not dataset.feature_columns_numeric:
+        raise ValueError(
+            "No numeric features found in dataset. Mutual Information test requires numeric features."
+        )
+
+    # Check if target column exists
+    if not dataset.target_column:
+        raise ValueError(
+            "Target column is required for Mutual Information calculation but was not provided."
+        )
+
+    X = dataset._df[dataset.feature_columns_numeric]
+    y = dataset._df[dataset.target_column]

     # Select appropriate MI function based on task type
     if task == "classification":
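A minimal sketch of running the test after this change; a dataset without numeric feature columns, or without a target column, now fails fast with a ValueError (the input ID is illustrative):

import validmind as vm

vm.tests.run_test(
    "validmind.data_validation.MutualInformation",
    inputs={"dataset": vm_ds},          # a previously initialized VMDataset
    params={"task": "classification"},  # or "regression"
)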
validmind/tests/model_validation/ragas/AnswerCorrectness.py
CHANGED
@@ -34,6 +34,8 @@ def AnswerCorrectness(
     user_input_column="user_input",
     response_column="response",
     reference_column="reference",
+    judge_llm=None,
+    judge_embeddings=None,
 ):
     """
     Evaluates the correctness of answers in a dataset with respect to the provided ground
@@ -118,7 +120,9 @@ def AnswerCorrectness(
     df = get_renamed_columns(dataset._df, required_columns)

     result_df = evaluate(
-        Dataset.from_pandas(df),
+        Dataset.from_pandas(df),
+        metrics=[answer_correctness()],
+        **get_ragas_config(judge_llm, judge_embeddings)
     ).to_pandas()

     score_column = "answer_correctness"
validmind/tests/model_validation/ragas/AspectCritic.py
CHANGED
@@ -51,6 +51,8 @@ def AspectCritic(
         "maliciousness",
     ],
     additional_aspects: list = None,
+    judge_llm=None,
+    judge_embeddings=None,
 ):
     """
     Evaluates generations against the following aspects: harmfulness, maliciousness,
@@ -158,7 +160,9 @@ def AspectCritic(
     all_aspects = [built_in_aspects[aspect] for aspect in aspects] + custom_aspects

     result_df = evaluate(
-        Dataset.from_pandas(df),
+        Dataset.from_pandas(df),
+        metrics=all_aspects,
+        **get_ragas_config(judge_llm, judge_embeddings)
     ).to_pandas()

     # reverse the score for aspects where lower is better
validmind/tests/model_validation/ragas/ContextEntityRecall.py
CHANGED
@@ -33,6 +33,8 @@ def ContextEntityRecall(
     dataset,
     retrieved_contexts_column: str = "retrieved_contexts",
     reference_column: str = "reference",
+    judge_llm=None,
+    judge_embeddings=None,
 ):
     """
     Evaluates the context entity recall for dataset entries and visualizes the results.
@@ -113,7 +115,9 @@ def ContextEntityRecall(
     df = get_renamed_columns(dataset._df, required_columns)

     result_df = evaluate(
-        Dataset.from_pandas(df),
+        Dataset.from_pandas(df),
+        metrics=[context_entity_recall()],
+        **get_ragas_config(judge_llm, judge_embeddings)
     ).to_pandas()

     score_column = "context_entity_recall"
validmind/tests/model_validation/ragas/ContextPrecision.py
CHANGED
@@ -34,6 +34,8 @@ def ContextPrecision(
     user_input_column: str = "user_input",
     retrieved_contexts_column: str = "retrieved_contexts",
     reference_column: str = "reference",
+    judge_llm=None,
+    judge_embeddings=None,
 ): # noqa: B950
     """
     Context Precision is a metric that evaluates whether all of the ground-truth
@@ -109,7 +111,9 @@ def ContextPrecision(
     df = get_renamed_columns(dataset._df, required_columns)

     result_df = evaluate(
-        Dataset.from_pandas(df),
+        Dataset.from_pandas(df),
+        metrics=[context_precision()],
+        **get_ragas_config(judge_llm, judge_embeddings)
     ).to_pandas()

     score_column = "llm_context_precision_with_reference"
validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py
CHANGED
@@ -34,6 +34,8 @@ def ContextPrecisionWithoutReference(
     user_input_column: str = "user_input",
     retrieved_contexts_column: str = "retrieved_contexts",
     response_column: str = "response",
+    judge_llm=None,
+    judge_embeddings=None,
 ): # noqa: B950
     """
     Context Precision Without Reference is a metric used to evaluate the relevance of
@@ -104,7 +106,9 @@ def ContextPrecisionWithoutReference(
     df = get_renamed_columns(dataset._df, required_columns)

     result_df = evaluate(
-        Dataset.from_pandas(df),
+        Dataset.from_pandas(df),
+        metrics=[context_precision()],
+        **get_ragas_config(judge_llm, judge_embeddings)
     ).to_pandas()

     score_column = "llm_context_precision_without_reference"
validmind/tests/model_validation/ragas/ContextRecall.py
CHANGED
@@ -34,6 +34,8 @@ def ContextRecall(
     user_input_column: str = "user_input",
     retrieved_contexts_column: str = "retrieved_contexts",
     reference_column: str = "reference",
+    judge_llm=None,
+    judge_embeddings=None,
 ):
     """
     Context recall measures the extent to which the retrieved context aligns with the
@@ -109,7 +111,9 @@ def ContextRecall(
     df = get_renamed_columns(dataset._df, required_columns)

     result_df = evaluate(
-        Dataset.from_pandas(df),
+        Dataset.from_pandas(df),
+        metrics=[context_recall()],
+        **get_ragas_config(judge_llm, judge_embeddings)
     ).to_pandas()

     score_column = "context_recall"
validmind/tests/model_validation/ragas/Faithfulness.py
CHANGED
@@ -34,6 +34,8 @@ def Faithfulness(
     user_input_column="user_input",
     response_column="response",
     retrieved_contexts_column="retrieved_contexts",
+    judge_llm=None,
+    judge_embeddings=None,
 ): # noqa
     """
     Evaluates the faithfulness of the generated answers with respect to retrieved contexts.
@@ -114,7 +116,9 @@ def Faithfulness(
     df = get_renamed_columns(dataset._df, required_columns)

     result_df = evaluate(
-        Dataset.from_pandas(df),
+        Dataset.from_pandas(df),
+        metrics=[faithfulness()],
+        **get_ragas_config(judge_llm, judge_embeddings)
     ).to_pandas()

     score_column = "faithfulness"
validmind/tests/model_validation/ragas/NoiseSensitivity.py
CHANGED
@@ -38,6 +38,8 @@ def NoiseSensitivity(
     reference_column="reference",
     focus="relevant",
     user_input_column="user_input",
+    judge_llm=None,
+    judge_embeddings=None,
 ):
     """
     Assesses the sensitivity of a Large Language Model (LLM) to noise in retrieved context by measuring how often it
@@ -149,7 +151,7 @@ def NoiseSensitivity(
     result_df = evaluate(
         Dataset.from_pandas(df),
         metrics=[noise_sensitivity(focus=focus)],
-        **get_ragas_config(),
+        **get_ragas_config(judge_llm, judge_embeddings),
     ).to_pandas()

     score_column = f"noise_sensitivity_{focus}"
validmind/tests/model_validation/ragas/ResponseRelevancy.py
CHANGED
@@ -34,6 +34,8 @@ def ResponseRelevancy(
     user_input_column="user_input",
     retrieved_contexts_column=None,
     response_column="response",
+    judge_llm=None,
+    judge_embeddings=None,
 ):
     """
     Assesses how pertinent the generated answer is to the given prompt.
@@ -44,8 +46,8 @@ def ResponseRelevancy(
     relevancy. This metric is computed using the `user_input`, the `retrieved_contexts`
     and the `response`.

-    The Response Relevancy is defined as the mean cosine
-    `user_input` to a number of
+    The Response Relevancy is defined as the mean cosine similarity of the original
+    `user_input` to a number of artificial questions, which are generated (reverse-engineered)
     based on the `response`:

     $$
@@ -62,7 +64,7 @@ def ResponseRelevancy(

     **Note**: *This is a reference-free metric, meaning that it does not require a
     `ground_truth` answer to compare against. A similar metric that does evaluate the
-    correctness of a generated
+    correctness of a generated answer with respect to a `ground_truth` answer is
     `validmind.model_validation.ragas.AnswerCorrectness`.*

     ### Configuring Columns
@@ -128,7 +130,7 @@ def ResponseRelevancy(
     result_df = evaluate(
         Dataset.from_pandas(df),
         metrics=metrics,
-        **get_ragas_config(),
+        **get_ragas_config(judge_llm, judge_embeddings),
     ).to_pandas()

     score_column = "answer_relevancy"
validmind/tests/model_validation/ragas/SemanticSimilarity.py
CHANGED
@@ -33,6 +33,8 @@ def SemanticSimilarity(
     dataset,
     response_column="response",
     reference_column="reference",
+    judge_llm=None,
+    judge_embeddings=None,
 ):
     """
     Calculates the semantic similarity between generated responses and ground truths
@@ -107,7 +109,9 @@ def SemanticSimilarity(
     df = get_renamed_columns(dataset._df, required_columns)

     result_df = evaluate(
-        Dataset.from_pandas(df),
+        Dataset.from_pandas(df),
+        metrics=[semantic_similarity()],
+        **get_ragas_config(judge_llm, judge_embeddings)
     ).to_pandas()

     score_column = "semantic_similarity"
validmind/tests/model_validation/ragas/utils.py
CHANGED
@@ -2,34 +2,14 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-import os
-
-from validmind.ai.utils import get_client_and_model, is_configured
+from validmind.ai.utils import get_judge_config

 EMBEDDINGS_MODEL = "text-embedding-3-small"


-def get_ragas_config():
-
-
-        from langchain_openai import ChatOpenAI, OpenAIEmbeddings
-    except ImportError:
-        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
-
-    if not is_configured():
-        raise ValueError(
-            "LLM is not configured. Please set an `OPENAI_API_KEY` environment variable "
-            "or ensure that you are connected to the ValidMind API and ValidMind AI is "
-            "enabled for your account."
-        )
-
-    client, model = get_client_and_model()
-    os.environ["OPENAI_API_BASE"] = str(client.base_url)
-
-    return {
-        "llm": ChatOpenAI(api_key=client.api_key, model=model),
-        "embeddings": OpenAIEmbeddings(api_key=client.api_key, model=EMBEDDINGS_MODEL),
-    }
+def get_ragas_config(judge_llm=None, judge_embeddings=None):
+    judge_llm, judge_embeddings = get_judge_config(judge_llm, judge_embeddings)
+    return {"llm": judge_llm, "embeddings": judge_embeddings}


 def make_sub_col_udf(root_col, sub_col):
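A minimal sketch of passing a custom judge through to one of the ragas tests; any LangChain-compatible chat model and embeddings pair should work (the model names and dataset input are illustrative):

import validmind as vm
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

judge_llm = ChatOpenAI(model="gpt-4o-mini")
judge_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

vm.tests.run_test(
    "validmind.model_validation.ragas.Faithfulness",
    inputs={"dataset": vm_ds},  # a previously initialized VMDataset with RAG columns
    params={
        "judge_llm": judge_llm,              # forwarded into get_ragas_config()
        "judge_embeddings": judge_embeddings,
    },
)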
validmind/tests/model_validation/sklearn/OverfitDiagnosis.py
CHANGED
@@ -220,6 +220,16 @@ def OverfitDiagnosis(
     - May not capture more subtle forms of overfitting that do not exceed the threshold.
     - Assumes that the binning of features adequately represents the data segments.
     """
+
+    numeric_and_categorical_feature_columns = (
+        datasets[0].feature_columns_numeric + datasets[0].feature_columns_categorical
+    )
+
+    if not numeric_and_categorical_feature_columns:
+        raise ValueError(
+            "No valid numeric or categorical columns found in features_columns"
+        )
+
     is_classification = bool(datasets[0].probability_column(model))

     if not metric:
@@ -246,7 +256,7 @@ def OverfitDiagnosis(
     figures = []
     results_headers = ["slice", "shape", "feature", metric]

-    for feature_column in
+    for feature_column in numeric_and_categorical_feature_columns:
         bins = 10
         if feature_column in datasets[0].feature_columns_categorical:
             bins = len(train_df[feature_column].unique())
validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py
CHANGED
@@ -211,6 +211,19 @@ def WeakspotsDiagnosis(
     improvement.
     """
     feature_columns = features_columns or datasets[0].feature_columns
+    numeric_and_categorical_columns = (
+        datasets[0].feature_columns_numeric + datasets[0].feature_columns_categorical
+    )
+
+    feature_columns = [
+        col for col in feature_columns if col in numeric_and_categorical_columns
+    ]
+
+    if not feature_columns:
+        raise ValueError(
+            "No valid numeric or categorical columns found in features_columns"
+        )
+
     if not all(col in datasets[0].feature_columns for col in feature_columns):
         raise ValueError(
             "Column(s) provided in features_columns do not exist in the dataset"
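A minimal sketch of running one of these diagnosis tests after the change (the dataset and model inputs are illustrative, previously initialized ValidMind objects):

import validmind as vm

# Non-numeric, non-categorical feature columns (e.g. free text) are now filtered out,
# and the tests raise ValueError early if no usable columns remain.
vm.tests.run_test(
    "validmind.model_validation.sklearn.WeakspotsDiagnosis",
    inputs={
        "datasets": [vm_train_ds, vm_test_ds],
        "model": vm_model,
    },
)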
validmind/tests/prompt_validation/Bias.py
CHANGED
@@ -45,7 +45,7 @@ Prompt:

 @tags("llm", "few_shot")
 @tasks("text_classification", "text_summarization")
-def Bias(model, min_threshold=7):
+def Bias(model, min_threshold=7, judge_llm=None):
     """
     Assesses potential bias in a Large Language Model by analyzing the distribution and order of exemplars in the
     prompt.
@@ -100,6 +100,7 @@ def Bias(model, min_threshold=7):
     response = call_model(
         system_prompt=SYSTEM,
         user_prompt=USER.format(prompt_to_test=model.prompt.template),
+        judge_llm=judge_llm,
     )

     score = get_score(response)
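A minimal sketch of supplying a custom judge to this prompt validation test; the same pattern applies to the other prompt validation tests below (the judge model name and model input are illustrative):

import validmind as vm
from langchain_openai import ChatOpenAI

vm.tests.run_test(
    "validmind.prompt_validation.Bias",
    inputs={"model": vm_llm_model},  # a previously initialized model with a prompt template
    params={
        "min_threshold": 7,
        "judge_llm": ChatOpenAI(model="gpt-4o-mini"),  # forwarded to call_model()
    },
)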
validmind/tests/prompt_validation/Clarity.py
CHANGED
@@ -46,7 +46,7 @@ Prompt:

 @tags("llm", "zero_shot", "few_shot")
 @tasks("text_classification", "text_summarization")
-def Clarity(model, min_threshold=7):
+def Clarity(model, min_threshold=7, judge_llm=None):
     """
     Evaluates and scores the clarity of prompts in a Large Language Model based on specified guidelines.

@@ -89,6 +89,7 @@ def Clarity(model, min_threshold=7):
     response = call_model(
         system_prompt=SYSTEM,
         user_prompt=USER.format(prompt_to_test=model.prompt.template),
+        judge_llm=judge_llm,
     )

     score = get_score(response)
validmind/tests/prompt_validation/Conciseness.py
CHANGED
@@ -54,7 +54,7 @@ Prompt:

 @tags("llm", "zero_shot", "few_shot")
 @tasks("text_classification", "text_summarization")
-def Conciseness(model, min_threshold=7):
+def Conciseness(model, min_threshold=7, judge_llm=None):
     """
     Analyzes and grades the conciseness of prompts provided to a Large Language Model.

@@ -97,6 +97,7 @@ def Conciseness(model, min_threshold=7):
     response = call_model(
         system_prompt=SYSTEM,
         user_prompt=USER.format(prompt_to_test=model.prompt.template),
+        judge_llm=judge_llm,
     )
     score = get_score(response)
     explanation = get_explanation(response)
validmind/tests/prompt_validation/Delimitation.py
CHANGED
@@ -39,7 +39,7 @@ Prompt:

 @tags("llm", "zero_shot", "few_shot")
 @tasks("text_classification", "text_summarization")
-def Delimitation(model, min_threshold=7):
+def Delimitation(model, min_threshold=7, judge_llm=None):
     """
     Evaluates the proper use of delimiters in prompts provided to Large Language Models.

@@ -83,6 +83,7 @@ def Delimitation(model, min_threshold=7):
     response = call_model(
         system_prompt=SYSTEM,
         user_prompt=USER.format(prompt_to_test=model.prompt.template),
+        judge_llm=judge_llm,
     )
     score = get_score(response)
     explanation = get_explanation(response)