validmind 2.8.22__py3-none-any.whl → 2.8.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__init__.py +3 -0
- validmind/__version__.py +1 -1
- validmind/ai/utils.py +89 -0
- validmind/api_client.py +4 -0
- validmind/experimental/__init__.py +0 -0
- validmind/experimental/agents.py +65 -0
- validmind/tests/data_validation/MutualInformation.py +14 -2
- validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -1
- validmind/tests/model_validation/ragas/AspectCritic.py +5 -1
- validmind/tests/model_validation/ragas/ContextEntityRecall.py +5 -1
- validmind/tests/model_validation/ragas/ContextPrecision.py +5 -1
- validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +5 -1
- validmind/tests/model_validation/ragas/ContextRecall.py +5 -1
- validmind/tests/model_validation/ragas/Faithfulness.py +5 -1
- validmind/tests/model_validation/ragas/NoiseSensitivity.py +3 -1
- validmind/tests/model_validation/ragas/ResponseRelevancy.py +6 -4
- validmind/tests/model_validation/ragas/SemanticSimilarity.py +5 -1
- validmind/tests/model_validation/ragas/utils.py +4 -24
- validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +11 -1
- validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +13 -0
- validmind/tests/prompt_validation/Bias.py +2 -1
- validmind/tests/prompt_validation/Clarity.py +2 -1
- validmind/tests/prompt_validation/Conciseness.py +2 -1
- validmind/tests/prompt_validation/Delimitation.py +2 -1
- validmind/tests/prompt_validation/NegativeInstruction.py +2 -1
- validmind/tests/prompt_validation/Robustness.py +3 -2
- validmind/tests/prompt_validation/Specificity.py +2 -1
- validmind/tests/prompt_validation/ai_powered_test.py +18 -17
- validmind/vm_models/result/__init__.py +16 -2
- validmind/vm_models/result/result.py +127 -14
- {validmind-2.8.22.dist-info → validmind-2.8.26.dist-info}/METADATA +4 -3
- {validmind-2.8.22.dist-info → validmind-2.8.26.dist-info}/RECORD +35 -33
- {validmind-2.8.22.dist-info → validmind-2.8.26.dist-info}/WHEEL +1 -1
- {validmind-2.8.22.dist-info → validmind-2.8.26.dist-info}/LICENSE +0 -0
- {validmind-2.8.22.dist-info → validmind-2.8.26.dist-info}/entry_points.txt +0 -0
validmind/__init__.py
CHANGED
@@ -53,6 +53,7 @@ from .client import ( # noqa: E402
     run_documentation_tests,
     run_test_suite,
 )
+from .experimental import agents as experimental_agent
 from .tests.decorator import tags, tasks, test
 from .tests.run import print_env
 from .utils import is_notebook, parse_version
@@ -126,4 +127,6 @@ __all__ = [ # noqa
     "unit_metrics",
     "test_suites",
     "log_text",
+    # experimental features
+    "experimental_agent",
 ]
validmind/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = "2.8.
+__version__ = "2.8.25"
validmind/ai/utils.py
CHANGED
@@ -15,6 +15,10 @@ logger = get_logger(__name__)
 
 __client = None
 __model = None
+__judge_llm = None
+__judge_embeddings = None
+EMBEDDINGS_MODEL = "text-embedding-3-small"
+
 # can be None, True or False (ternary to represent initial state, ack and failed ack)
 __ack = None
 
@@ -105,6 +109,91 @@ def get_client_and_model():
     return __client, __model
 
 
+def get_judge_config(judge_llm=None, judge_embeddings=None):
+    try:
+        from langchain_core.embeddings import Embeddings
+        from langchain_core.language_models.chat_models import BaseChatModel
+        from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+
+        from validmind.models.function import FunctionModel
+    except ImportError:
+        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
+
+    if judge_llm is not None or judge_embeddings is not None:
+        if isinstance(judge_llm, FunctionModel) and judge_llm is not None:
+            if isinstance(judge_llm.model, BaseChatModel):
+                judge_llm = judge_llm.model
+            else:
+                raise ValueError(
+                    "The ValidMind Functional model provided does not have have a langchain compatible LLM as a model attribute."
+                    "To use default ValidMind LLM, do not set judge_llm/judge_embedding parameter, "
+                    "ensure that you are connected to the ValidMind API and confirm ValidMind AI is enabled for your account."
+                )
+        if isinstance(judge_embeddings, FunctionModel) and judge_embeddings is not None:
+            if isinstance(judge_llm.model, BaseChatModel):
+                judge_embeddings = judge_embeddings.model
+            else:
+                raise ValueError(
+                    "The ValidMind Functional model provided does not have have a langchain compatible embeddings model as a model attribute."
+                    "To use default ValidMind LLM, do not set judge_embedding parameter, "
+                    "ensure that you are connected to the ValidMind API and confirm ValidMind AI is enabled for your account."
+                )
+
+        if (isinstance(judge_llm, BaseChatModel) or judge_llm is None) and (
+            isinstance(judge_embeddings, Embeddings) or judge_embeddings is None
+        ):
+            return judge_llm, judge_embeddings
+        else:
+            raise ValueError(
+                "Provided Judge LLM/Embeddings are not Langchain compatible. Ensure the judge LLM/embedding provided are an instance of "
+                "Langchain BaseChatModel and LangchainEmbeddings. To use default ValidMind LLM, do not set judge_llm/judge_embedding parameter, "
+                "ensure that you are connected to the ValidMind API and confirm ValidMind AI is enabled for your account."
+            )
+
+    # grab default values if not passed at run time
+    global __judge_llm, __judge_embeddings
+    if __judge_llm and __judge_embeddings:
+        return __judge_llm, __judge_embeddings
+
+    client, model = get_client_and_model()
+    os.environ["OPENAI_API_BASE"] = str(client.base_url)
+
+    __judge_llm = ChatOpenAI(api_key=client.api_key, model=model)
+    __judge_embeddings = OpenAIEmbeddings(
+        api_key=client.api_key, model=EMBEDDINGS_MODEL
+    )
+
+    return __judge_llm, __judge_embeddings
+
+
+def set_judge_config(judge_llm, judge_embeddings):
+    global __judge_llm, __judge_embeddings
+    try:
+        from langchain_core.embeddings import Embeddings
+        from langchain_core.language_models.chat_models import BaseChatModel
+
+        from validmind.models.function import FunctionModel
+    except ImportError:
+        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
+    if isinstance(judge_llm, BaseChatModel) and isinstance(
+        judge_embeddings, Embeddings
+    ):
+        __judge_llm = judge_llm
+        __judge_embeddings = judge_embeddings
+    # Assuming 'your_object' is the object you want to check
+    elif isinstance(judge_llm, FunctionModel) and isinstance(
+        judge_embeddings, FunctionModel
+    ):
+        __judge_llm = judge_llm.model
+        __judge_embeddings = judge_embeddings.model
+    else:
+        raise ValueError(
+            "Provided Judge LLM/Embeddings are not Langchain compatible. Ensure the judge LLM/embedding provided are an instance of "
+            "Langchain BaseChatModel and LangchainEmbeddings. To use default ValidMind LLM, do not set judge_llm/judge_embedding parameter, "
+            "ensure that you are connected to the ValidMind API and confirm ValidMind AI is enabled for your account."
+        )
+
+
 def is_configured():
     global __ack
 
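The two helpers added above let callers supply their own judge LLM and embeddings for AI-powered tests instead of the ValidMind-hosted defaults. A minimal sketch of how they might be used, assuming the `validmind[llm]` extra is installed so `langchain_openai` is importable; the model names are placeholders:

```python
# Sketch only: assumes `pip install validmind[llm]` (langchain-openai available)
# and valid OpenAI credentials in the environment.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from validmind.ai.utils import get_judge_config, set_judge_config

# Register a custom judge once; subsequent calls to get_judge_config() without
# arguments return these instead of the ValidMind-hosted defaults.
set_judge_config(
    judge_llm=ChatOpenAI(model="gpt-4o-mini"),  # placeholder model name
    judge_embeddings=OpenAIEmbeddings(model="text-embedding-3-small"),
)

judge_llm, judge_embeddings = get_judge_config()
```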
validmind/api_client.py
CHANGED
@@ -448,6 +448,7 @@ async def alog_metric(
     params: Optional[Dict[str, Any]] = None,
     recorded_at: Optional[str] = None,
     thresholds: Optional[Dict[str, Any]] = None,
+    passed: Optional[bool] = None,
 ):
     """See log_metric for details."""
     if not key or not isinstance(key, str):
@@ -476,6 +477,7 @@ async def alog_metric(
             "params": params or {},
             "recorded_at": recorded_at,
             "thresholds": thresholds or {},
+            "passed": passed if passed is not None else None,
         },
         cls=NumpyEncoder,
         allow_nan=False,
@@ -493,6 +495,7 @@ def log_metric(
     params: Optional[Dict[str, Any]] = None,
     recorded_at: Optional[str] = None,
     thresholds: Optional[Dict[str, Any]] = None,
+    passed: Optional[bool] = None,
 ):
     """Logs a unit metric.
 
@@ -518,6 +521,7 @@ def log_metric(
         params=params,
         recorded_at=recorded_at,
         thresholds=thresholds,
+        passed=passed,
     )
 
 
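The new `passed` flag lets a logged unit metric carry an explicit pass/fail verdict alongside its value and thresholds. A hedged sketch of the call, assuming an initialized ValidMind session; the metric key, value, and threshold are illustrative:

```python
from validmind.api_client import log_metric  # assumes vm.init(...) has already been called

auc = 0.83  # illustrative metric value
log_metric(
    key="auc_score",               # illustrative metric key
    value=auc,
    thresholds={"min_auc": 0.7},
    passed=auc >= 0.7,             # new in this release: explicit pass/fail flag
)
```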
validmind/experimental/__init__.py
File without changes
validmind/experimental/agents.py
ADDED
@@ -0,0 +1,65 @@
+# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
+# See the LICENSE file in the root of this repository for details.
+# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
+
+"""
+Agent interface for all text generation tasks
+"""
+
+import requests
+
+from validmind.api_client import _get_api_headers, _get_url, raise_api_error
+from validmind.utils import is_html, md_to_html
+from validmind.vm_models.result import TextGenerationResult
+
+
+def run_task(
+    task: str,
+    input: dict,
+    show: bool = True,
+) -> TextGenerationResult:
+    """
+    Run text generation tasks using AI models.
+
+    Args:
+        task (str): Type of text generation task to run. Currently supports:
+            - 'code_explainer': Generates natural language explanations of code
+        input (dict): Input parameters for the generation task:
+            - For code_explainer: Must contain 'source_code' and optional parameters
+        show (bool): Whether to display the generated result. Defaults to True.
+
+    Returns:
+        TextGenerationResult: Result object containing the generated text and metadata
+
+    Raises:
+        ValueError: If an unsupported task is provided
+        requests.exceptions.RequestException: If the API request fails
+    """
+    if task == "code_explainer" or task == "qualitative_text_generation":
+        r = requests.post(
+            url=_get_url(f"ai/generate/{task}"),
+            headers=_get_api_headers(),
+            json=input,
+        )
+
+        if r.status_code != 200:
+            raise_api_error(r.text)
+
+        generated_text = r.json()["content"]
+    else:
+        raise ValueError(f"Unsupported task: {task}")
+
+    if not is_html(generated_text):
+        generated_text = md_to_html(generated_text, mathml=True)
+
+    # Create a test result with the generated text
+    result = TextGenerationResult(
+        result_type=f"{task}",
+        description=generated_text,
+        title=f"Text Generation: {task}",
+        doc=f"Generated {task}",
+    )
+    if show:
+        result.show()
+
+    return result
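Combined with the `experimental_agent` alias exported from `validmind/__init__.py`, the new module can be exercised roughly as below. This is a sketch only: it assumes an initialized ValidMind API connection, and the source snippet and content ID are illustrative.

```python
import validmind as vm  # assumes vm.init(...) has been called with valid credentials

result = vm.experimental_agent.run_task(
    task="code_explainer",
    input={"source_code": "def add(a, b):\n    return a + b"},  # illustrative input
    show=False,
)

# TextGenerationResult.log() stores the generated text under the given content ID
result.log(content_id="code_explainer_add_function")
```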
validmind/tests/data_validation/MutualInformation.py
CHANGED
@@ -68,8 +68,20 @@ def MutualInformation(
     if task not in ["classification", "regression"]:
         raise ValueError("task must be either 'classification' or 'regression'")
 
-
-
+    # Check if numeric features exist
+    if not dataset.feature_columns_numeric:
+        raise ValueError(
+            "No numeric features found in dataset. Mutual Information test requires numeric features."
+        )
+
+    # Check if target column exists
+    if not dataset.target_column:
+        raise ValueError(
+            "Target column is required for Mutual Information calculation but was not provided."
+        )
+
+    X = dataset._df[dataset.feature_columns_numeric]
+    y = dataset._df[dataset.target_column]
 
     # Select appropriate MI function based on task type
     if task == "classification":
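With the checks above, the test fails fast with a clear message when the dataset has no numeric features or no target column, instead of erroring further downstream. A sketch of invoking it through the standard test runner; the toy DataFrame and column names are illustrative, and an initialized ValidMind session is assumed:

```python
import pandas as pd
import validmind as vm
from validmind.tests import run_test

# assumes vm.init(...) has been called
df = pd.DataFrame({"x1": [0, 1, 2, 3], "x2": [0.1, 0.9, 0.4, 0.7], "y": [0, 1, 0, 1]})
vm_dataset = vm.init_dataset(dataset=df, target_column="y")  # numeric features + target present

result = run_test(
    "validmind.data_validation.MutualInformation",
    inputs={"dataset": vm_dataset},
    params={"task": "classification"},
)
```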
validmind/tests/model_validation/ragas/AnswerCorrectness.py
CHANGED
@@ -34,6 +34,8 @@ def AnswerCorrectness(
     user_input_column="user_input",
     response_column="response",
     reference_column="reference",
+    judge_llm=None,
+    judge_embeddings=None,
 ):
     """
     Evaluates the correctness of answers in a dataset with respect to the provided ground
@@ -118,7 +120,9 @@ def AnswerCorrectness(
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
+        Dataset.from_pandas(df),
+        metrics=[answer_correctness()],
+        **get_ragas_config(judge_llm, judge_embeddings)
     ).to_pandas()
 
     score_column = "answer_correctness"
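All of the ragas-based tests in this release gain the same optional `judge_llm` / `judge_embeddings` parameters, forwarded to `get_ragas_config`. A sketch of overriding the judge for one of them; it assumes the `validmind[llm]` extra and an existing `vm_rag_dataset` with the expected `user_input`, `response`, and `reference` columns:

```python
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from validmind.tests import run_test

result = run_test(
    "validmind.model_validation.ragas.AnswerCorrectness",
    inputs={"dataset": vm_rag_dataset},  # assumed pre-built VMDataset
    params={
        "judge_llm": ChatOpenAI(model="gpt-4o-mini"),  # placeholder judge model
        "judge_embeddings": OpenAIEmbeddings(model="text-embedding-3-small"),
    },
)
```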
validmind/tests/model_validation/ragas/AspectCritic.py
CHANGED
@@ -51,6 +51,8 @@ def AspectCritic(
         "maliciousness",
     ],
     additional_aspects: list = None,
+    judge_llm=None,
+    judge_embeddings=None,
 ):
     """
     Evaluates generations against the following aspects: harmfulness, maliciousness,
@@ -158,7 +160,9 @@ def AspectCritic(
     all_aspects = [built_in_aspects[aspect] for aspect in aspects] + custom_aspects
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
+        Dataset.from_pandas(df),
+        metrics=all_aspects,
+        **get_ragas_config(judge_llm, judge_embeddings)
     ).to_pandas()
 
     # reverse the score for aspects where lower is better
validmind/tests/model_validation/ragas/ContextEntityRecall.py
CHANGED
@@ -33,6 +33,8 @@ def ContextEntityRecall(
     dataset,
     retrieved_contexts_column: str = "retrieved_contexts",
     reference_column: str = "reference",
+    judge_llm=None,
+    judge_embeddings=None,
 ):
     """
     Evaluates the context entity recall for dataset entries and visualizes the results.
@@ -113,7 +115,9 @@ def ContextEntityRecall(
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
+        Dataset.from_pandas(df),
+        metrics=[context_entity_recall()],
+        **get_ragas_config(judge_llm, judge_embeddings)
     ).to_pandas()
 
     score_column = "context_entity_recall"
validmind/tests/model_validation/ragas/ContextPrecision.py
CHANGED
@@ -34,6 +34,8 @@ def ContextPrecision(
     user_input_column: str = "user_input",
     retrieved_contexts_column: str = "retrieved_contexts",
     reference_column: str = "reference",
+    judge_llm=None,
+    judge_embeddings=None,
 ):  # noqa: B950
     """
     Context Precision is a metric that evaluates whether all of the ground-truth
@@ -109,7 +111,9 @@ def ContextPrecision(
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
+        Dataset.from_pandas(df),
+        metrics=[context_precision()],
+        **get_ragas_config(judge_llm, judge_embeddings)
     ).to_pandas()
 
     score_column = "llm_context_precision_with_reference"
validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py
CHANGED
@@ -34,6 +34,8 @@ def ContextPrecisionWithoutReference(
     user_input_column: str = "user_input",
     retrieved_contexts_column: str = "retrieved_contexts",
     response_column: str = "response",
+    judge_llm=None,
+    judge_embeddings=None,
 ):  # noqa: B950
     """
     Context Precision Without Reference is a metric used to evaluate the relevance of
@@ -104,7 +106,9 @@ def ContextPrecisionWithoutReference(
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
+        Dataset.from_pandas(df),
+        metrics=[context_precision()],
+        **get_ragas_config(judge_llm, judge_embeddings)
     ).to_pandas()
 
     score_column = "llm_context_precision_without_reference"
validmind/tests/model_validation/ragas/ContextRecall.py
CHANGED
@@ -34,6 +34,8 @@ def ContextRecall(
     user_input_column: str = "user_input",
     retrieved_contexts_column: str = "retrieved_contexts",
     reference_column: str = "reference",
+    judge_llm=None,
+    judge_embeddings=None,
 ):
     """
     Context recall measures the extent to which the retrieved context aligns with the
@@ -109,7 +111,9 @@ def ContextRecall(
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
+        Dataset.from_pandas(df),
+        metrics=[context_recall()],
+        **get_ragas_config(judge_llm, judge_embeddings)
     ).to_pandas()
 
     score_column = "context_recall"
validmind/tests/model_validation/ragas/Faithfulness.py
CHANGED
@@ -34,6 +34,8 @@ def Faithfulness(
     user_input_column="user_input",
     response_column="response",
     retrieved_contexts_column="retrieved_contexts",
+    judge_llm=None,
+    judge_embeddings=None,
 ):  # noqa
     """
     Evaluates the faithfulness of the generated answers with respect to retrieved contexts.
@@ -114,7 +116,9 @@ def Faithfulness(
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
+        Dataset.from_pandas(df),
+        metrics=[faithfulness()],
+        **get_ragas_config(judge_llm, judge_embeddings)
     ).to_pandas()
 
     score_column = "faithfulness"
validmind/tests/model_validation/ragas/NoiseSensitivity.py
CHANGED
@@ -38,6 +38,8 @@ def NoiseSensitivity(
     reference_column="reference",
     focus="relevant",
     user_input_column="user_input",
+    judge_llm=None,
+    judge_embeddings=None,
 ):
     """
     Assesses the sensitivity of a Large Language Model (LLM) to noise in retrieved context by measuring how often it
@@ -149,7 +151,7 @@ def NoiseSensitivity(
     result_df = evaluate(
         Dataset.from_pandas(df),
         metrics=[noise_sensitivity(focus=focus)],
-        **get_ragas_config(),
+        **get_ragas_config(judge_llm, judge_embeddings),
     ).to_pandas()
 
     score_column = f"noise_sensitivity_{focus}"
validmind/tests/model_validation/ragas/ResponseRelevancy.py
CHANGED
@@ -34,6 +34,8 @@ def ResponseRelevancy(
     user_input_column="user_input",
     retrieved_contexts_column=None,
     response_column="response",
+    judge_llm=None,
+    judge_embeddings=None,
 ):
     """
     Assesses how pertinent the generated answer is to the given prompt.
@@ -44,8 +46,8 @@ def ResponseRelevancy(
     relevancy. This metric is computed using the `user_input`, the `retrieved_contexts`
     and the `response`.
 
-    The Response Relevancy is defined as the mean cosine
-    `user_input` to a number of
+    The Response Relevancy is defined as the mean cosine similarity of the original
+    `user_input` to a number of artificial questions, which are generated (reverse-engineered)
     based on the `response`:
 
     $$
@@ -62,7 +64,7 @@ def ResponseRelevancy(
 
     **Note**: *This is a reference-free metric, meaning that it does not require a
     `ground_truth` answer to compare against. A similar metric that does evaluate the
-    correctness of a generated
+    correctness of a generated answers with respect to a `ground_truth` answer is
     `validmind.model_validation.ragas.AnswerCorrectness`.*
 
     ### Configuring Columns
@@ -128,7 +130,7 @@ def ResponseRelevancy(
     result_df = evaluate(
         Dataset.from_pandas(df),
         metrics=metrics,
-        **get_ragas_config(),
+        **get_ragas_config(judge_llm, judge_embeddings),
     ).to_pandas()
 
     score_column = "answer_relevancy"
validmind/tests/model_validation/ragas/SemanticSimilarity.py
CHANGED
@@ -33,6 +33,8 @@ def SemanticSimilarity(
     dataset,
     response_column="response",
     reference_column="reference",
+    judge_llm=None,
+    judge_embeddings=None,
 ):
     """
     Calculates the semantic similarity between generated responses and ground truths
@@ -107,7 +109,9 @@ def SemanticSimilarity(
     df = get_renamed_columns(dataset._df, required_columns)
 
     result_df = evaluate(
-        Dataset.from_pandas(df),
+        Dataset.from_pandas(df),
+        metrics=[semantic_similarity()],
+        **get_ragas_config(judge_llm, judge_embeddings)
     ).to_pandas()
 
     score_column = "semantic_similarity"
validmind/tests/model_validation/ragas/utils.py
CHANGED
@@ -2,34 +2,14 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-import
-
-from validmind.ai.utils import get_client_and_model, is_configured
+from validmind.ai.utils import get_judge_config
 
 EMBEDDINGS_MODEL = "text-embedding-3-small"
 
 
-def get_ragas_config():
-
-
-        from langchain_openai import ChatOpenAI, OpenAIEmbeddings
-    except ImportError:
-        raise ImportError("Please run `pip install validmind[llm]` to use LLM tests")
-
-    if not is_configured():
-        raise ValueError(
-            "LLM is not configured. Please set an `OPENAI_API_KEY` environment variable "
-            "or ensure that you are connected to the ValidMind API and ValidMind AI is "
-            "enabled for your account."
-        )
-
-    client, model = get_client_and_model()
-    os.environ["OPENAI_API_BASE"] = str(client.base_url)
-
-    return {
-        "llm": ChatOpenAI(api_key=client.api_key, model=model),
-        "embeddings": OpenAIEmbeddings(api_key=client.api_key, model=EMBEDDINGS_MODEL),
-    }
+def get_ragas_config(judge_llm=None, judge_embeddings=None):
+    judge_llm, judge_embeddings = get_judge_config(judge_llm, judge_embeddings)
+    return {"llm": judge_llm, "embeddings": judge_embeddings}
 
 
 def make_sub_col_udf(root_col, sub_col):
validmind/tests/model_validation/sklearn/OverfitDiagnosis.py
CHANGED
@@ -220,6 +220,16 @@ def OverfitDiagnosis(
     - May not capture more subtle forms of overfitting that do not exceed the threshold.
     - Assumes that the binning of features adequately represents the data segments.
     """
+
+    numeric_and_categorical_feature_columns = (
+        datasets[0].feature_columns_numeric + datasets[0].feature_columns_categorical
+    )
+
+    if not numeric_and_categorical_feature_columns:
+        raise ValueError(
+            "No valid numeric or categorical columns found in features_columns"
+        )
+
     is_classification = bool(datasets[0].probability_column(model))
 
     if not metric:
@@ -246,7 +256,7 @@ def OverfitDiagnosis(
     figures = []
     results_headers = ["slice", "shape", "feature", metric]
 
-    for feature_column in
+    for feature_column in numeric_and_categorical_feature_columns:
         bins = 10
         if feature_column in datasets[0].feature_columns_categorical:
            bins = len(train_df[feature_column].unique())
validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py
CHANGED
@@ -211,6 +211,19 @@ def WeakspotsDiagnosis(
     improvement.
     """
     feature_columns = features_columns or datasets[0].feature_columns
+    numeric_and_categorical_columns = (
+        datasets[0].feature_columns_numeric + datasets[0].feature_columns_categorical
+    )
+
+    feature_columns = [
+        col for col in feature_columns if col in numeric_and_categorical_columns
+    ]
+
+    if not feature_columns:
+        raise ValueError(
+            "No valid numeric or categorical columns found in features_columns"
+        )
+
     if not all(col in datasets[0].feature_columns for col in feature_columns):
         raise ValueError(
             "Column(s) provided in features_columns do not exist in the dataset"
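Both diagnosis tests now restrict themselves to numeric and categorical feature columns and raise early when none remain. A sketch of running one of them; `vm_train_ds`, `vm_test_ds`, and `vm_model` are assumed to come from `vm.init_dataset` / `vm.init_model`, and the column names are illustrative:

```python
from validmind.tests import run_test

result = run_test(
    "validmind.model_validation.sklearn.WeakspotsDiagnosis",
    inputs={"datasets": [vm_train_ds, vm_test_ds], "model": vm_model},  # assumed inputs
    params={"features_columns": ["age", "income"]},  # only numeric/categorical columns are kept
)
```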
validmind/tests/prompt_validation/Bias.py
CHANGED
@@ -45,7 +45,7 @@ Prompt:
 
 @tags("llm", "few_shot")
 @tasks("text_classification", "text_summarization")
-def Bias(model, min_threshold=7):
+def Bias(model, min_threshold=7, judge_llm=None):
     """
     Assesses potential bias in a Large Language Model by analyzing the distribution and order of exemplars in the
     prompt.
@@ -100,6 +100,7 @@ def Bias(model, min_threshold=7):
     response = call_model(
         system_prompt=SYSTEM,
         user_prompt=USER.format(prompt_to_test=model.prompt.template),
+        judge_llm=judge_llm,
     )
 
     score = get_score(response)
validmind/tests/prompt_validation/Clarity.py
CHANGED
@@ -46,7 +46,7 @@ Prompt:
 
 @tags("llm", "zero_shot", "few_shot")
 @tasks("text_classification", "text_summarization")
-def Clarity(model, min_threshold=7):
+def Clarity(model, min_threshold=7, judge_llm=None):
     """
     Evaluates and scores the clarity of prompts in a Large Language Model based on specified guidelines.
 
@@ -89,6 +89,7 @@ def Clarity(model, min_threshold=7):
     response = call_model(
         system_prompt=SYSTEM,
         user_prompt=USER.format(prompt_to_test=model.prompt.template),
+        judge_llm=judge_llm,
     )
 
     score = get_score(response)
validmind/tests/prompt_validation/Conciseness.py
CHANGED
@@ -54,7 +54,7 @@ Prompt:
 
 @tags("llm", "zero_shot", "few_shot")
 @tasks("text_classification", "text_summarization")
-def Conciseness(model, min_threshold=7):
+def Conciseness(model, min_threshold=7, judge_llm=None):
     """
     Analyzes and grades the conciseness of prompts provided to a Large Language Model.
 
@@ -97,6 +97,7 @@ def Conciseness(model, min_threshold=7):
     response = call_model(
         system_prompt=SYSTEM,
         user_prompt=USER.format(prompt_to_test=model.prompt.template),
+        judge_llm=judge_llm,
     )
     score = get_score(response)
     explanation = get_explanation(response)
validmind/tests/prompt_validation/Delimitation.py
CHANGED
@@ -39,7 +39,7 @@ Prompt:
 
 @tags("llm", "zero_shot", "few_shot")
 @tasks("text_classification", "text_summarization")
-def Delimitation(model, min_threshold=7):
+def Delimitation(model, min_threshold=7, judge_llm=None):
     """
     Evaluates the proper use of delimiters in prompts provided to Large Language Models.
 
@@ -83,6 +83,7 @@ def Delimitation(model, min_threshold=7):
     response = call_model(
         system_prompt=SYSTEM,
         user_prompt=USER.format(prompt_to_test=model.prompt.template),
+        judge_llm=judge_llm,
     )
     score = get_score(response)
     explanation = get_explanation(response)
validmind/tests/prompt_validation/NegativeInstruction.py
CHANGED
@@ -52,7 +52,7 @@ Prompt:
 
 @tags("llm", "zero_shot", "few_shot")
 @tasks("text_classification", "text_summarization")
-def NegativeInstruction(model, min_threshold=7):
+def NegativeInstruction(model, min_threshold=7, judge_llm=None):
     """
     Evaluates and grades the use of affirmative, proactive language over negative instructions in LLM prompts.
 
@@ -101,6 +101,7 @@ def NegativeInstruction(model, min_threshold=7):
     response = call_model(
         system_prompt=SYSTEM,
         user_prompt=USER.format(prompt_to_test=model.prompt.template),
+        judge_llm=judge_llm,
     )
     score = get_score(response)
     explanation = get_explanation(response)
validmind/tests/prompt_validation/Robustness.py
CHANGED
@@ -25,7 +25,7 @@ Contradictions, edge cases, typos, bad phrasing, distracting, complex or out-of-
 Be creative and think step-by-step how you would break the prompt.
 Then generate {num_tests} inputs for the user-submitted prompt template that would break the prompt.
 Each input should be different from the others.
-Each input should be
+Each input should be returned as a new line in your response.
 Respond only with the values to be inserted into the prompt template and do not include quotes, explanations or any extra text.
 
 Example:
@@ -56,7 +56,7 @@ Input:
 
 @tags("llm", "zero_shot", "few_shot")
 @tasks("text_classification", "text_summarization")
-def Robustness(model, dataset, num_tests=10):
+def Robustness(model, dataset, num_tests=10, judge_llm=None):
     """
     Assesses the robustness of prompts provided to a Large Language Model under varying conditions and contexts. This test
     specifically measures the model's ability to generate correct classifications with the given prompt even when the
@@ -112,6 +112,7 @@ def Robustness(model, dataset, num_tests=10):
     generated_inputs = call_model(
         system_prompt=SYSTEM.format(num_tests=num_tests),
         user_prompt=USER.format(prompt_to_test=model.prompt.template),
+        judge_llm=judge_llm,
     ).split("\n")
 
     responses = model.predict(
validmind/tests/prompt_validation/Specificity.py
CHANGED
@@ -52,7 +52,7 @@ Prompt:
 
 @tags("llm", "zero_shot", "few_shot")
 @tasks("text_classification", "text_summarization")
-def Specificity(model, min_threshold=7):
+def Specificity(model, min_threshold=7, judge_llm=None):
     """
     Evaluates and scores the specificity of prompts provided to a Large Language Model (LLM), based on clarity, detail,
     and relevance.
@@ -97,6 +97,7 @@ def Specificity(model, min_threshold=7):
     response = call_model(
         system_prompt=SYSTEM,
         user_prompt=USER.format(prompt_to_test=model.prompt.template),
+        judge_llm=judge_llm,
     )
     score = get_score(response)
     explanation = get_explanation(response)
validmind/tests/prompt_validation/ai_powered_test.py
CHANGED
@@ -4,7 +4,7 @@
 
 import re
 
-from validmind.ai.utils import
+from validmind.ai.utils import get_judge_config, is_configured
 
 missing_prompt_message = """
 Cannot run prompt validation tests on a model with no prompt.
@@ -21,7 +21,12 @@ my_vm_model = vm.init_model(
 
 
 def call_model(
-    system_prompt: str,
+    system_prompt: str,
+    user_prompt: str,
+    temperature: float = 0.0,
+    seed: int = 42,
+    judge_llm=None,
+    judge_embeddings=None,
 ):
     """Call LLM with the given prompts and return the response"""
     if not is_configured():
@@ -31,21 +36,17 @@ def call_model(
             "enabled for your account."
         )
 
-
-
-
-
-
-
-
-
-
-
-
-        )
-        .choices[0]
-        .message.content
-    )
+    judge_llm, judge_embeddings = get_judge_config(judge_llm, judge_embeddings)
+    messages = [
+        ("system", system_prompt.strip("\n").strip()),
+        ("user", user_prompt.strip("\n").strip()),
+    ]
+
+    return judge_llm.invoke(
+        messages,
+        temperature=temperature,
+        seed=seed,
+    ).content
 
 
 def get_score(response: str):
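Every prompt validation test now threads an optional `judge_llm` through `call_model`, which resolves it via `get_judge_config` and falls back to the ValidMind-hosted judge when it is omitted. A sketch of supplying a custom judge; it assumes the `validmind[llm]` extra and a `vm_model` initialized with a prompt:

```python
from langchain_openai import ChatOpenAI

from validmind.tests import run_test

result = run_test(
    "validmind.prompt_validation.Clarity",
    inputs={"model": vm_model},  # assumed VMModel created via vm.init_model(..., prompt=...)
    params={
        "min_threshold": 7,
        "judge_llm": ChatOpenAI(model="gpt-4o-mini"),  # placeholder judge model
    },
)
```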
validmind/vm_models/result/__init__.py
CHANGED
@@ -2,6 +2,20 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from .result import
+from .result import (
+    ErrorResult,
+    RawData,
+    Result,
+    ResultTable,
+    TestResult,
+    TextGenerationResult,
+)
 
-__all__ = [
+__all__ = [
+    "ErrorResult",
+    "RawData",
+    "Result",
+    "ResultTable",
+    "TestResult",
+    "TextGenerationResult",
+]
validmind/vm_models/result/result.py
CHANGED
@@ -129,6 +129,7 @@ class Result:
 
     result_id: str = None
     name: str = None
+    result_type: str = None
 
     def __str__(self) -> str:
         """May be overridden by subclasses."""
@@ -445,6 +446,7 @@ class TestResult(Result):
     async def log_async(
         self,
         section_id: str = None,
+        content_id: str = None,
         position: int = None,
         config: Dict[str, bool] = None,
     ):
@@ -464,7 +466,7 @@ class TestResult(Result):
             )
         )
 
-        if self.tables
+        if self.tables:
             tasks.append(
                 api_client.alog_test_result(
                     result=self.serialize(),
@@ -473,30 +475,32 @@ class TestResult(Result):
                     config=config,
                 )
             )
-
+        if self.figures:
             tasks.extend(
                 [api_client.alog_figure(figure) for figure in (self.figures or [])]
             )
+        if self.description:
+            revision_name = (
+                AI_REVISION_NAME
+                if self._was_description_generated
+                else DEFAULT_REVISION_NAME
+            )
 
-
-
-
-        if
-        else
-
-
-        tasks.append(
-            update_metadata(
-                content_id=f"test_description:{self.result_id}::{revision_name}",
-                text=self.description,
-            )
+            tasks.append(
+                update_metadata(
+                    content_id=f"{content_id}::{revision_name}"
+                    if content_id
+                    else f"test_description:{self.result_id}::{revision_name}",
+                    text=self.description,
                 )
+            )
 
         return await asyncio.gather(*tasks)
 
     def log(
         self,
         section_id: str = None,
+        content_id: str = None,
         position: int = None,
         unsafe: bool = False,
         config: Dict[str, bool] = None,
@@ -506,6 +510,7 @@ class TestResult(Result):
         Args:
             section_id (str): The section ID within the model document to insert the
                 test result.
+            content_id (str): The content ID to log the result to.
             position (int): The position (index) within the section to insert the test
                 result.
             unsafe (bool): If True, log the result even if it contains sensitive data
@@ -533,6 +538,7 @@ class TestResult(Result):
         run_async(
             self.log_async,
             section_id=section_id,
+            content_id=content_id,
            position=position,
            config=config,
        )
@@ -568,3 +574,110 @@ class TestResult(Result):
         raise InvalidParameterError(
             f"Values for config keys must be boolean. Non-boolean values found for keys: {', '.join(non_bool_keys)}"
         )
+
+
+@dataclass
+class TextGenerationResult(Result):
+    """Test result."""
+
+    name: str = "Text Generation Result"
+    ref_id: str = None
+    title: Optional[str] = None
+    doc: Optional[str] = None
+    description: Optional[Union[str, DescriptionFuture]] = None
+    params: Optional[Dict[str, Any]] = None
+    metadata: Optional[Dict[str, Any]] = None
+    _was_description_generated: bool = False
+
+    def __post_init__(self):
+        if self.ref_id is None:
+            self.ref_id = str(uuid4())
+
+    def __repr__(self) -> str:
+        attrs = [
+            attr
+            for attr in [
+                "doc",
+                "description",
+                "params",
+            ]
+            if getattr(self, attr) is not None
+            and (
+                len(getattr(self, attr)) > 0
+                if isinstance(getattr(self, attr), list)
+                else True
+            )
+        ]
+
+        return f'TextGenerationResult("{self.result_id}", {", ".join(attrs)})'
+
+    def __getattribute__(self, name):
+        # lazy load description if its a DescriptionFuture (generated in background)
+        if name == "description":
+            description = super().__getattribute__("description")
+
+            if isinstance(description, DescriptionFuture):
+                self._was_description_generated = True
+                self.description = description.get_description()
+
+        return super().__getattribute__(name)
+
+    @property
+    def test_name(self) -> str:
+        """Get the test name, using custom title if available."""
+        return self.title or test_id_to_name(self.result_id)
+
+    def to_widget(self):
+        template_data = {
+            "test_name": self.test_name,
+            "description": self.description.replace("h3", "strong"),
+            "params": (
+                json.dumps(self.params, cls=NumpyEncoder, indent=2)
+                if self.params
+                else None
+            ),
+        }
+        rendered = get_result_template().render(**template_data)
+
+        widgets = [HTML(rendered)]
+
+        return VBox(widgets)
+
+    def serialize(self):
+        """Serialize the result for the API."""
+        return {
+            "test_name": self.result_id,
+            "title": self.title,
+            "ref_id": self.ref_id,
+            "params": self.params,
+            "metadata": self.metadata,
+        }
+
+    async def log_async(
+        self,
+        content_id: str = None,
+    ):
+        return await asyncio.gather(
+            update_metadata(
+                content_id=f"{content_id}",
+                text=self.description,
+            )
+        )
+
+    def log(
+        self,
+        content_id: str = None,
+    ):
+        """Log the result to ValidMind.
+
+        Args:
+            section_id (str): The section ID within the model document to insert the
+                test result.
+            content_id (str): The content ID to log the result to.
+            position (int): The position (index) within the section to insert the test
+                result.
+        """
+        run_async(
+            self.log_async,
+            content_id=content_id,
+        )
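The new `content_id` argument on `TestResult.log` / `log_async` routes the test description to a caller-chosen content block instead of the default `test_description:<result_id>` key, and the new `TextGenerationResult` uses the same mechanism. A sketch, assuming `result` is a `TestResult` returned by `run_test`; the IDs are illustrative:

```python
# Assumes `result` came from validmind.tests.run_test(...)
result.log(
    section_id="model_development",       # illustrative section of the documentation template
    content_id="custom_mi_description",   # new: description is stored under this content ID
)
```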
{validmind-2.8.22.dist-info → validmind-2.8.26.dist-info}/METADATA
CHANGED
@@ -1,11 +1,11 @@
 Metadata-Version: 2.3
 Name: validmind
-Version: 2.8.
+Version: 2.8.26
 Summary: ValidMind Library
 License: Commercial License
 Author: Andres Rodriguez
 Author-email: andres@validmind.ai
-Requires-Python: >=3.
+Requires-Python: >=3.9.0,<3.12
 Classifier: License :: Other/Proprietary License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.9
@@ -22,6 +22,7 @@ Requires-Dist: bert-score (>=0.3.13)
 Requires-Dist: catboost
 Requires-Dist: datasets (>=2.10.0,<3.0.0)
 Requires-Dist: evaluate
+Requires-Dist: h11 (>=0.16.0)
 Requires-Dist: ipywidgets
 Requires-Dist: kaleido (>=0.2.1,!=0.2.1.post1)
 Requires-Dist: langchain-openai (>=0.1.8) ; extra == "all" or extra == "llm"
@@ -53,7 +54,7 @@ Requires-Dist: statsmodels
 Requires-Dist: tabulate (>=0.8.9,<0.9.0)
 Requires-Dist: textblob (>=0.18.0.post0,<0.19.0)
 Requires-Dist: tiktoken
-Requires-Dist: torch (
+Requires-Dist: torch (==2.7.0) ; extra == "all" or extra == "llm" or extra == "pytorch"
 Requires-Dist: tqdm
 Requires-Dist: transformers (>=4.32.0,<5.0.0) ; extra == "all" or extra == "huggingface" or extra == "llm"
 Requires-Dist: xgboost (>=1.5.2,<3)
{validmind-2.8.22.dist-info → validmind-2.8.26.dist-info}/RECORD
CHANGED
@@ -1,8 +1,8 @@
-validmind/__init__.py,sha256=
-validmind/__version__.py,sha256=
+validmind/__init__.py,sha256=7nOHbSRUKtpIuHvf6oQd4D9_R8oh1PQ2CkeU62S14A0,4329
+validmind/__version__.py,sha256=XTvkYgNn06R6oLNbCxMlIA63zR9bQTVXFHPFNHviOyA,23
 validmind/ai/test_descriptions.py,sha256=eBF09MAyqAAD-Ah7vxXVRbHxOmGx5_10ZkoJmMvEaEA,7123
-validmind/ai/utils.py,sha256=
-validmind/api_client.py,sha256=
+validmind/ai/utils.py,sha256=m0ru4h7z8Fe1dEOtXoonhgYKtdLMSEUakwacoATZbrs,8493
+validmind/api_client.py,sha256=WNAdiYc2NctAFc2itLdz-0mf3_4JPghW4pPF_VAn5jw,16970
 validmind/client.py,sha256=XKb4uc7yXVV_3NH9-zTrS9jCbLPX2zZZU12vKKlSpIc,19049
 validmind/client_config.py,sha256=O1gopTaNADM4ZVPj383AJTjcpjdxyEvUQY5cFt7nbIs,1366
 validmind/datasets/__init__.py,sha256=c0hQZN_6GrUEJxdFHdQaEsQrSYNABG84ZCY0H-PzOZk,260
@@ -60,6 +60,8 @@ validmind/datasets/regression/models/fred_loan_rates_model_3.pkl,sha256=IogZPcUQ
 validmind/datasets/regression/models/fred_loan_rates_model_4.pkl,sha256=cSxhpcrI4hCbxCwZwE2-nr7KObbWpDii3NzpECoXmmM,48292
 validmind/datasets/regression/models/fred_loan_rates_model_5.pkl,sha256=FkNLHq9xkPMbYks_vyMjFL371mw9SQYbP1iX9lY4Ljo,60343
 validmind/errors.py,sha256=yluOjbvvurjIAVpN6V1L5R1f_aXr7mmTrTFjzmgn_Uw,8268
+validmind/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+validmind/experimental/agents.py,sha256=UAn62qflCYnzS1m2XL_y3xUNaw0PJr9dRvNb-c-rUtI,2040
 validmind/html_templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 validmind/html_templates/content_blocks.py,sha256=vFMRS4Ogq4RZq88WzG3teNEOq3U4OLgLDzD3lBx4h-g,4050
 validmind/input_registry.py,sha256=bgZqJhrBCDoTV54Eq6YhNcU9yn5GjH0aidDwrnKm_pI,1043
@@ -118,7 +120,7 @@ validmind/tests/data_validation/LJungBox.py,sha256=jytIC1iOaV3g5kxEQ93RPftp1mO0H
 validmind/tests/data_validation/LaggedCorrelationHeatmap.py,sha256=Irh8SvFQELqqq2FPR5PgUbcdjuCgIgB7FaZHkyCxu7Y,4571
 validmind/tests/data_validation/MissingValues.py,sha256=elEhhwXTD68B8iLB2HTgAK-oM_i5yzJ8v32atK-F5ro,2962
 validmind/tests/data_validation/MissingValuesBarPlot.py,sha256=BB-yO2uXjWIqy_bNJ_rJ8oosHTzMl7acGIYDGUy69dI,5572
-validmind/tests/data_validation/MutualInformation.py,sha256=
+validmind/tests/data_validation/MutualInformation.py,sha256=8Sp8K75dP-F24l_WeqRpykri5--E1GTQLxnTarLhNUc,5157
 validmind/tests/data_validation/PearsonCorrelationMatrix.py,sha256=YU9WD3VURjzXyYvCTXcgZnFvmg2rjTOMwKtiZ57ZMJg,3873
 validmind/tests/data_validation/PhillipsPerronArch.py,sha256=4abwhMBcdxTxY9aMogL5hEvCyATnvHb66mGssE1AJuk,4254
 validmind/tests/data_validation/ProtectedClassesCombination.py,sha256=KOsSciNplk1A9DI-wS-m5qKm5u-7gDDDfceEusZiufo,6920
@@ -196,17 +198,17 @@ validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py,sha256=
 validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py,sha256=Y1N4AeYlzD2Mpcvd4BWVIOIWzHjycWxSRYp8J_gr5_k,5839
 validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py,sha256=M47btgIGdXon8F7phqqcXrExnO3DvHi-NSBdgDjy_OE,4752
 validmind/tests/model_validation/embeddings/utils.py,sha256=Hr8jpVB0YfaOEYsO_tiwhU1UgXoJFHHlRqFcHDNXHoU,1896
-validmind/tests/model_validation/ragas/AnswerCorrectness.py,sha256=
-validmind/tests/model_validation/ragas/AspectCritic.py,sha256=
-validmind/tests/model_validation/ragas/ContextEntityRecall.py,sha256=
-validmind/tests/model_validation/ragas/ContextPrecision.py,sha256=
-validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py,sha256=
-validmind/tests/model_validation/ragas/ContextRecall.py,sha256=
-validmind/tests/model_validation/ragas/Faithfulness.py,sha256=
-validmind/tests/model_validation/ragas/NoiseSensitivity.py,sha256=
-validmind/tests/model_validation/ragas/ResponseRelevancy.py,sha256=
-validmind/tests/model_validation/ragas/SemanticSimilarity.py,sha256=
-validmind/tests/model_validation/ragas/utils.py,sha256=
+validmind/tests/model_validation/ragas/AnswerCorrectness.py,sha256=unX2l4aVnRJTkGooKZ90HbLpxthvKqiLCDQvvuviexU,5682
+validmind/tests/model_validation/ragas/AspectCritic.py,sha256=ejtznzu-tWFy7Ex0AgYts4HSBqYVWQNKxPIZ_nv6MQM,7087
+validmind/tests/model_validation/ragas/ContextEntityRecall.py,sha256=9Uyr7d2zSFxWehHKq5DKt_g-vZRQQreJcgl59f5txNQ,5520
+validmind/tests/model_validation/ragas/ContextPrecision.py,sha256=kNswHsi2gtl88gGi9gUE40AGeqJS4X-M60zFAatnvHQ,5323
+validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py,sha256=HjP3Asb8tvnEHUnWLu3Z6eZi_mr8pmVfboXoowwuvQU,5042
+validmind/tests/model_validation/ragas/ContextRecall.py,sha256=oRNFXSFKuceK5zJj2KOJiYh5BK5xLm55ELRPTyHV5No,5209
+validmind/tests/model_validation/ragas/Faithfulness.py,sha256=cTRmMcW7zymBWDt8y9eywLmJsUHq7QU1I6xZPTQlkTw,5496
+validmind/tests/model_validation/ragas/NoiseSensitivity.py,sha256=9ZDoJbrd9L3yVhEyJBTyEbE6NC-yPne_gQQPeQyE2fY,6606
+validmind/tests/model_validation/ragas/ResponseRelevancy.py,sha256=mJoELbkNNNd9UhL-cJz27sif6_i1tyVZagIBFr51Xqo,5759
+validmind/tests/model_validation/ragas/SemanticSimilarity.py,sha256=ZjtLS7GRqVxYbY5PGzhGUDXJLKH8ItyH-5dDcu--nEQ,5008
+validmind/tests/model_validation/ragas/utils.py,sha256=BN07JJ2egOEzQmO8w6afsrhXs_uQ0RoYAPuOAaKFXrY,2785
 validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py,sha256=UXhiNK6ZakRgln968y3jAMgNVsj5LpgGCnSHDRUFrWw,2926
 validmind/tests/model_validation/sklearn/AdjustedRandIndex.py,sha256=7zlFapC21nVqXYc73FQxR0XeTit2l-h7F76xCFS9FUQ,2756
 validmind/tests/model_validation/sklearn/CalibrationCurve.py,sha256=EmW8UvT6gcBC-dw6zr43MbAGTBeVTPruHjYu5GWB5p4,4232
@@ -226,7 +228,7 @@ validmind/tests/model_validation/sklearn/MinimumF1Score.py,sha256=pQn9p15AUo5ref
 validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py,sha256=6YSITuOkQwq1UcxqTWHnjrpPTfN_9Mny-wDWdRFD8I4,3825
 validmind/tests/model_validation/sklearn/ModelParameters.py,sha256=CF3cZGJLxiABnf1CQ_u_iX_ylgvpElH3jF2DBXbXZJY,3060
 validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py,sha256=wDxGUXgfzLA80wfjoRz7CzHO8NiQfuJyxIfuVFOuLYA,4658
-validmind/tests/model_validation/sklearn/OverfitDiagnosis.py,sha256=
+validmind/tests/model_validation/sklearn/OverfitDiagnosis.py,sha256=FaTxA_OpTUxv92Zhi8oZ0KrUYjUWlVrte7HYgQqktPk,10557
 validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py,sha256=zdZe3f6n5WmWei9T2IUJGjYPjI0YRSoSXza5rfrYZ48,4226
 validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py,sha256=zm2aP_auO6khOmei_or_HhnFgMFQmeU6IgYJuuXYkHM,9045
 validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py,sha256=WF3htr_Z5BnVbxMV4Ehx_BUSWYXhouaqQ45MUbuU9co,3821
@@ -242,7 +244,7 @@ validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py,sha256=lvP
 validmind/tests/model_validation/sklearn/SilhouettePlot.py,sha256=F2RMMm1ilEwj6hGfF50n_9_n3JnQhwJBxQl8hY6xjuk,5076
 validmind/tests/model_validation/sklearn/TrainingTestDegradation.py,sha256=tSi2pnWxqSMkakebTLPHGHGn_7YSukPocNhVDixDul8,4519
 validmind/tests/model_validation/sklearn/VMeasure.py,sha256=2zkB6W4oYWPr03SETwjaQCle3_dGDItCqa3DQ4qRLcM,2841
-validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py,sha256=
+validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py,sha256=DtRhJZ1NIpzIf7F4jXOo5XUX7g2VRvZGmWHIBYuCeaE,12055
 validmind/tests/model_validation/sklearn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 validmind/tests/model_validation/statsmodels/AutoARIMA.py,sha256=4QNcEEY_iqt6wCzYwsBwZQ-aacZ1erX5uHbPtKmbTJk,4896
 validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py,sha256=yXouMfH8JWrD3o6IAoHjAeXHuj-nVSxV-_SVw8SBePw,4886
@@ -277,15 +279,15 @@ validmind/tests/ongoing_monitoring/ScoreBandsDrift.py,sha256=4nqu3yfiIlhs7RG-6Eg
 validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py,sha256=OIU6wEIH4VbwhCo6Qirl0YnzxQQbbhMhjFiMnmKoatA,7260
 validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py,sha256=xDPh6KWSy8YXmHV8SI0IpSVv42R-1H3ZpjMM0zHTDNs,5141
 validmind/tests/output.py,sha256=UXSZDiW_GD411QP2F9r2Vh7uXtb5a2y990bNRZhnZVQ,6153
-validmind/tests/prompt_validation/Bias.py,sha256=
-validmind/tests/prompt_validation/Clarity.py,sha256=
-validmind/tests/prompt_validation/Conciseness.py,sha256=
-validmind/tests/prompt_validation/Delimitation.py,sha256=
-validmind/tests/prompt_validation/NegativeInstruction.py,sha256=
-validmind/tests/prompt_validation/Robustness.py,sha256=
-validmind/tests/prompt_validation/Specificity.py,sha256=
+validmind/tests/prompt_validation/Bias.py,sha256=eYUlVPn3iqMJ4lnQbZxYty6UxvLd0sbPdbY1EtywV3w,5845
+validmind/tests/prompt_validation/Clarity.py,sha256=OlzhESenRUftai2L1fVI4rftLflp2_ztwr_llq3rvbU,4990
+validmind/tests/prompt_validation/Conciseness.py,sha256=RQkC3jH9c96PBYCBk6-MLuEqstkkWFOegunEcHsNyis,4734
+validmind/tests/prompt_validation/Delimitation.py,sha256=NXa1ScNrIejiGOslncrXavzM4k-vRmAloCmrZyxAWfg,4102
+validmind/tests/prompt_validation/NegativeInstruction.py,sha256=Miasr4VQ-sCA06_lCCI4oRf_AKNjh7jV_CvtmnYxUso,5417
+validmind/tests/prompt_validation/Robustness.py,sha256=Qy82t7dtdpCcR-SPDpD43QaHomDI6e2IJIBBO-MCdwA,5848
+validmind/tests/prompt_validation/Specificity.py,sha256=VtUU7t2vDT8_fWiz9OiTEsMN-yjn3dNU7couwcbMdik,4857
 validmind/tests/prompt_validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-validmind/tests/prompt_validation/ai_powered_test.py,sha256=
+validmind/tests/prompt_validation/ai_powered_test.py,sha256=sWMf9fRXAkOpI5JYdhLHmJXlwnthjUDZGoLgCZlQZxo,2240
 validmind/tests/run.py,sha256=ftUCywJbHQ0vbZ7096iz8yq0htLbQbOWhvURjCqiqog,14211
 validmind/tests/test_providers.py,sha256=S0_yNYAor_MX5joRJntrVjV8J3ypvUcaaSqtkBqhOsI,6021
 validmind/tests/utils.py,sha256=sPnk9HWIb0IoySqL88h7uP3LixfrfKFgFebnyTUP5EE,3950
@@ -314,17 +316,17 @@ validmind/vm_models/dataset/utils.py,sha256=g6mBPrBmVYf8wJAlTxeg9DTiNvts4ZaaT5mb
 validmind/vm_models/figure.py,sha256=ZMO_nIIleNhkBV1vJeF_UUsVDCzrXNOYwV1Lbg9E0XY,6303
 validmind/vm_models/input.py,sha256=nTBQB6aqirhF-0Gmg5mYc4_vNyypvbYUfahMovcK02M,1095
 validmind/vm_models/model.py,sha256=s9pPIprHrju-qmGbzOZBcSHjZ_xgSv5ACXk92U1hEFY,6489
-validmind/vm_models/result/__init__.py,sha256=
+validmind/vm_models/result/__init__.py,sha256=c0vMWMSY0O6ZeSCf0HfrWAI5t--4FKLEW5cZ2EZ70Ms,443
 validmind/vm_models/result/result.jinja,sha256=Yvovwm5gInCBukFRlvJXNlDIUpl2eFz4dz1lS3Sn_Gc,311
-validmind/vm_models/result/result.py,sha256=
+validmind/vm_models/result/result.py,sha256=NRb90F1kYHeYitItiUKYVZ32d81UKq6X1RH9DHMI9Fo,21282
 validmind/vm_models/result/utils.py,sha256=kjU8yaDBX874gebdKLA2KcCyW6ojk_nSTBZxHG7Gszc,4155
 validmind/vm_models/test_suite/__init__.py,sha256=tfTYd8yvzsDXzk5WDKMwCzPAbvkVUyEvtY5z5BPy-zk,215
 validmind/vm_models/test_suite/runner.py,sha256=JqW8LW4X1Ri2C6wSsAGSki-JxGUGV8zmruOnxybmZ1s,5432
 validmind/vm_models/test_suite/summary.py,sha256=7P4zhfeU7a3I1MMBn8f7s-2lzdAz7U4y6LblpR89_vE,5401
 validmind/vm_models/test_suite/test.py,sha256=C8xPGKSyYF9oMJ3VegwFJDF7cwYlIgtQoQ7nzXIS1uc,3914
 validmind/vm_models/test_suite/test_suite.py,sha256=CciC6IhrLEeWwcpY3Np8EmQCB8XEF2ljwEXcvmNYgZc,5090
-validmind-2.8.
-validmind-2.8.
-validmind-2.8.
-validmind-2.8.
-validmind-2.8.
+validmind-2.8.26.dist-info/LICENSE,sha256=XonPUfwjvrC5Ombl3y-ko0Wubb1xdG_7nzvIbkZRKHw,35772
+validmind-2.8.26.dist-info/METADATA,sha256=cXBgejYYCohKO95F4HEa-uamzw0cLjgKr8LlBhMQ0eA,6061
+validmind-2.8.26.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+validmind-2.8.26.dist-info/entry_points.txt,sha256=HuW7YyOv9u_OEWpViQXtv0nfoI67uieJHawKWA4Hv9A,76
+validmind-2.8.26.dist-info/RECORD,,
{validmind-2.8.22.dist-info → validmind-2.8.26.dist-info}/LICENSE
File without changes
{validmind-2.8.22.dist-info → validmind-2.8.26.dist-info}/entry_points.txt
File without changes