deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
@@ -2,11 +2,14 @@ from __future__ import annotations
 import inspect
 import random
 import re
+import statistics
 from typing import (
     Any,
     Callable,
     List,
     Optional,
+    Protocol,
+    Sequence,
     Tuple,
     TYPE_CHECKING,
     Union,
@@ -17,11 +20,13 @@ from typing import (
 from deepeval.errors import DeepEvalError
 from deepeval.metrics.base_metric import BaseMetric, BaseConversationalMetric
 from deepeval.prompt.prompt import Prompt
-from deepeval.prompt.api import
-from deepeval.
+from deepeval.prompt.api import PromptMessage
+from deepeval.optimizer.types import (
+    ModelCallback,
     ModuleId,
     PromptConfigurationId,
     PromptConfiguration,
+    PromptConfigSnapshot,
     OptimizationReport,
 )
 
@@ -54,7 +59,7 @@ def split_goldens(
         pareto_size: Number of items to allocate to the Pareto set bound between [0, len(goldens)].
         random_state: A shared `random.Random` instance that provides the source
             of randomness. For reproducible runs, pass the same object used by
-            the GEPA loop constructed from `
+            the GEPA loop constructed from `GEPA.random_seed`
 
     Returns:
         (d_feedback, d_pareto)
@@ -151,87 +156,22 @@ def normalize_seed_prompts(
     return mapping
 
 
-def build_model_callback_kwargs(
-    *,
-    # scoring context
-    golden: Optional[Union["Golden", "ConversationalGolden"]] = None,
-    # rewriter context
-    feedback_text: Optional[str] = None,
-    # shared
-    prompt: Optional[Prompt] = None,
-    prompt_type: Optional[str] = None,
-    prompt_text: Optional[str] = None,
-    prompt_messages: Optional[List["PromptMessage"]] = None,
-) -> Dict[str, Any]:
-    """
-    Build a superset of kwargs for GEPA model callbacks.
-
-    All keys are present in the dict so callbacks can declare any subset of:
-
-        hook: str  # injected by (a_)invoke_model_callback
-        prompt: Prompt
-        prompt_type: str
-        prompt_text: str
-        prompt_messages: List[PromptMessage]
-        golden: Golden | ConversationalGolden
-        feedback_text: str
-
-    Non applicable fields are set to None.
-    """
-    return {
-        # scoring context
-        "golden": golden,
-        # rewriter context
-        "feedback_text": feedback_text,
-        # shared
-        "prompt": prompt,
-        "prompt_text": prompt_text,
-        "prompt_messages": prompt_messages,
-    }
-
-
 def invoke_model_callback(
     *,
-
-
-
-
-        str,
-        Dict,
-        Tuple[Union[str, Dict], float],
-    ],
-    ],
-    candidate_kwargs: Dict[str, Any],
-) -> Union[
-    str,
-    Dict,
-    Tuple[Union[str, Dict], float],
-]:
+    model_callback: ModelCallback,
+    prompt: Prompt,
+    golden: Union["Golden", "ConversationalGolden"],
+) -> str:
     """
     Call a user provided model_callback in a synchronous context.
 
-
-    - Injects `hook` if the callback declares it.
-    - Raises if the callback returns an awaitable; callers must use async
-      helpers for async callbacks.
+    Raises if the callback returns an awaitable.
     """
-
-    supported = set(sig.parameters.keys())
-
-    filtered = {
-        key: value
-        for key, value in candidate_kwargs.items()
-        if key in supported
-    }
-
-    if "hook" in supported:
-        filtered["hook"] = hook
-
-    result = model_callback(**filtered)
+    result = model_callback(prompt, golden)
     if inspect.isawaitable(result):
         raise DeepEvalError(
             "model_callback returned an awaitable from a synchronous context. "
-            "Either declare the callback as `async def` and use async
+            "Either declare the callback as `async def` and use async optimization, or call "
             "`model.generate(...)` instead of `model.a_generate(...)` inside a sync callback."
         )
     return result
@@ -239,41 +179,16 @@ def invoke_model_callback(
 
 
 async def a_invoke_model_callback(
     *,
-
-
-
-
-        str,
-        Dict,
-        Tuple[Union[str, Dict], float],
-    ],
-    ],
-    candidate_kwargs: Dict[str, Any],
-) -> Union[
-    str,
-    Dict,
-    Tuple[Union[str, Dict], float],
-]:
+    model_callback: ModelCallback,
+    prompt: Prompt,
+    golden: Union["Golden", "ConversationalGolden"],
+) -> str:
     """
     Call a user provided model_callback in an async context.
 
-
-    - Injects `hook` if the callback declares it.
-    - Supports both sync and async callbacks.
+    Supports both sync and async callbacks.
     """
-
-    supported = set(sig.parameters.keys())
-
-    filtered = {
-        key: value
-        for key, value in candidate_kwargs.items()
-        if key in supported
-    }
-
-    if "hook" in supported:
-        filtered["hook"] = hook
-
-    result = model_callback(**filtered)
+    result = model_callback(prompt, golden)
     if inspect.isawaitable(result):
         return await result
     return result
@@ -288,58 +203,17 @@ def build_prompt_config_snapshots(
     prompt_configurations_by_id: Dict[
         PromptConfigurationId, "PromptConfiguration"
     ],
-) -> Dict[PromptConfigurationId,
+) -> Dict[PromptConfigurationId, PromptConfigSnapshot]:
     """
-    Build
-
-    Shape matches the docs for `prompt_configurations`:
-
-        {
-            "<config_id>": {
-                "parent": "<parent_id or None>",
-                "prompts": {
-                    "<module_id>": {
-                        "type": "TEXT",
-                        "text_template": "...",
-                    }
-                    # or
-                    "<module_id>": {
-                        "type": "LIST",
-                        "messages": [
-                            {"role": "system", "content": "..."},
-                            ...
-                        ],
-                    },
-                },
-            },
-            ...
-        }
+    Build snapshots of all prompt configurations.
     """
-    snapshots: Dict[PromptConfigurationId,
+    snapshots: Dict[PromptConfigurationId, PromptConfigSnapshot] = {}
 
     for cfg_id, cfg in prompt_configurations_by_id.items():
-
-
-
-
-            messages = [
-                {"role": msg.role, "content": (msg.content or "")}
-                for msg in (prompt.messages_template or [])
-            ]
-            prompts_snapshot[module_id] = {
-                "type": "LIST",
-                "messages": messages,
-            }
-        else:
-            prompts_snapshot[module_id] = {
-                "type": "TEXT",
-                "text_template": (prompt.text_template or ""),
-            }
-
-        snapshots[cfg_id] = {
-            "parent": cfg.parent,
-            "prompts": prompts_snapshot,
-        }
+        snapshots[cfg_id] = PromptConfigSnapshot(
+            parent=cfg.parent,
+            prompts=dict(cfg.prompts),
+        )
 
     return snapshots
 
@@ -494,17 +368,8 @@ def validate_sequence_of(
 def validate_callback(
     *,
     component: str,
-    model_callback: Optional[
-
-            ...,
-            Union[
-                str,
-                Dict,
-                Tuple[Union[str, Dict], float],
-            ],
-        ]
-    ],
-) -> Callable[..., Union[str, Dict, Tuple[Union[str, Dict], float]]]:
+    model_callback: Optional[ModelCallback],
+) -> ModelCallback:
     """
     Ensure that `model_callback` is provided.
 
@@ -596,3 +461,20 @@ def validate_int_in_range(
         )
 
     return value
+
+
+##############
+# Aggregates #
+##############
+
+
+class Aggregator(Protocol):
+    def __call__(self, scores: Sequence[float]) -> float: ...
+
+
+def mean_of_all(scores: Sequence[float]) -> float:
+    return statistics.fmean(scores) if scores else 0.0
+
+
+def median_of_all(scores: Sequence[float]) -> float:
+    return statistics.median(scores) if scores else 0.0
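For orientation on the change above: the optimizer's callback contract is now a fixed positional signature rather than the removed kwargs-filtering and `hook` machinery, matching the `ModelCallback` type imported from `deepeval.optimizer.types`. A minimal sketch of a conforming callback follows; the `Golden` import path and the echo body are illustrative assumptions, not part of this diff:

from deepeval.dataset import Golden
from deepeval.prompt.prompt import Prompt


def my_model_callback(prompt: Prompt, golden: Golden) -> str:
    # Swap this body for a real LLM call; the contract only requires that the
    # callback accept (prompt, golden) and return the model output as a string.
    return f"{prompt.text_template}\n\nUser input: {golden.input}"

Declaring the callback as `async def` is also supported on the async path, since `a_invoke_model_callback` awaits the result, while the synchronous path raises a `DeepEvalError` if it receives an awaitable.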
deepeval/prompt/prompt.py
CHANGED

@@ -4,7 +4,7 @@ import json
 import os
 
 from enum import Enum
-from typing import Optional, List, Dict, Type, Literal
+from typing import Optional, List, Dict, Type, Literal
 from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
 from rich.console import Console
 from pydantic import BaseModel, ValidationError
@@ -34,10 +34,6 @@ from deepeval.prompt.utils import (
 from deepeval.confident.api import Api, Endpoints, HttpMethods
 from deepeval.constants import HIDDEN_DIR
 
-
-if TYPE_CHECKING:
-    from deepeval.optimization.types import OptimizationReport
-
 logger = logging.getLogger(__name__)
 
 portalocker = None
@@ -117,6 +113,7 @@ class Prompt:
         model_settings: Optional[ModelSettings] = None,
         output_type: Optional[OutputType] = None,
         output_schema: Optional[Type[BaseModel]] = None,
+        interpolation_type: Optional[PromptInterpolationType] = None,
     ):
         if text_template and messages_template:
             raise TypeError(
@@ -129,7 +126,9 @@ class Prompt:
         self.output_type: Optional[OutputType] = output_type
         self.output_schema: Optional[Type[BaseModel]] = output_schema
         self.label: Optional[str] = None
-        self.interpolation_type:
+        self.interpolation_type: PromptInterpolationType = (
+            interpolation_type or PromptInterpolationType.FSTRING
+        )
 
         self._version = None
         self._prompt_version_id: Optional[str] = None
@@ -145,9 +144,6 @@ class Prompt:
         elif messages_template:
             self.type = PromptType.LIST
 
-        # updated after optimization runs
-        self.optimization_report: Optional["OptimizationReport"] = None
-
     def __del__(self):
         """Cleanup polling tasks when instance is destroyed"""
         try:

deepeval/simulator/conversation_simulator.py
CHANGED

@@ -20,6 +20,7 @@ from deepeval.simulator.template import (
     ConversationSimulatorTemplate,
 )
 from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.utils import MULTIMODAL_SUPPORTED_MODELS
 from deepeval.simulator.schema import (
     SimulatedInput,
     ConversationCompletion,
@@ -94,6 +95,26 @@ class ConversationSimulator:
                 )
             )
         else:
+            multimodal = any(
+                [golden.multimodal for golden in conversational_goldens]
+            )
+            if multimodal:
+                if (
+                    not self.simulator_model
+                    or not self.simulator_model.supports_multimodal()
+                ):
+                    if (
+                        self.simulator_model
+                        and type(self.simulator_model)
+                        in MULTIMODAL_SUPPORTED_MODELS
+                    ):
+                        raise ValueError(
+                            f"The evaluation model {self.simulator_model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {self.simulator_model.__class__.__name__} provider includes {', '.join(self.simulator_model.__class__.valid_multimodal_models)}."
+                        )
+                    else:
+                        raise ValueError(
+                            f"The evaluation model {self.simulator_model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS])}"
+                        )
             conversational_test_cases: List[ConversationalTestCase] = []
             for conversation_index, golden in enumerate(
                 conversational_goldens
@@ -124,6 +145,28 @@ class ConversationSimulator:
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
     ) -> List[ConversationalTestCase]:
+
+        multimodal = any(
+            [golden.multimodal for golden in conversational_goldens]
+        )
+        if multimodal:
+            if (
+                not self.simulator_model
+                or not self.simulator_model.supports_multimodal()
+            ):
+                if (
+                    self.simulator_model
+                    and type(self.simulator_model)
+                    in MULTIMODAL_SUPPORTED_MODELS
+                ):
+                    raise ValueError(
+                        f"The evaluation model {self.simulator_model.name} does not support multimodal evaluations at the moment. Available multi-modal models for the {self.simulator_model.__class__.__name__} provider includes {', '.join(self.simulator_model.__class__.valid_multimodal_models)}."
+                    )
+                else:
+                    raise ValueError(
+                        f"The evaluation model {self.simulator_model.name} does not support multimodal inputs, please use one of the following evaluation models: {', '.join([cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS])}"
+                    )
+
         self.simulation_cost = 0 if self.using_native_model else None
 
     async def simulate_conversations(
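The same multimodal pre-flight check is added to both simulation entry points above. Distilled into a standalone sketch for readability (the helper name is hypothetical; everything it calls appears in this diff):

from deepeval.metrics.utils import MULTIMODAL_SUPPORTED_MODELS


def check_multimodal_support(simulator_model, conversational_goldens) -> None:
    # Mirrors the added guard: goldens flagged as multimodal require a
    # simulator model whose supports_multimodal() returns True.
    if not any(golden.multimodal for golden in conversational_goldens):
        return
    if simulator_model is not None and simulator_model.supports_multimodal():
        return
    raise ValueError(
        "Multimodal conversational goldens need a multimodal-capable simulator "
        f"model; supported providers: {', '.join(cls.__name__ for cls in MULTIMODAL_SUPPORTED_MODELS)}"
    )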
deepeval/simulator/template.py
CHANGED

@@ -7,6 +7,13 @@ from deepeval.test_case import Turn
 
 
 class ConversationSimulatorTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
 
     @staticmethod
     def simulate_first_user_turn(
@@ -23,6 +30,8 @@ class ConversationSimulatorTemplate:
 3. Avoid providing excessive details upfront; the goal is to initiate the conversation and build rapport, not to solve it in the first message.
 4. The message should be concise, ideally no more than 1-3 sentences.
 
+{ConversationSimulatorTemplate.multimodal_rules}
+
 IMPORTANT: The output must be formatted as a JSON object with a single key `simulated_input`, where the value is the generated opening message in {language}.
 
 Example Language: english
@@ -61,6 +70,8 @@ class ConversationSimulatorTemplate:
 3. Keep the tone consistent with the previous user inputs.
 4. The generated user input should be concise, ideally no more than 1-2 sentences.
 
+{ConversationSimulatorTemplate.multimodal_rules}
+
 IMPORTANT: The output must be formatted as a JSON object with a single key `simulated_input`,
 where the value is the generated user input in {language}.
 
@@ -101,6 +112,8 @@ class ConversationSimulatorTemplate:
 2. If the expected outcome has been met, mark the conversation as complete.
 3. If not, mark it as incomplete and briefly describe what remains to be done.
 
+{ConversationSimulatorTemplate.multimodal_rules}
+
 IMPORTANT: The output must be formatted as a JSON object with two keys:
 `is_complete` (a boolean) and `reason` (a string).
 
deepeval/test_case/__init__.py
CHANGED

@@ -3,13 +3,13 @@ from .llm_test_case import (
     LLMTestCaseParams,
     ToolCall,
     ToolCallParams,
+    MLLMImage,
 )
 from .conversational_test_case import (
     ConversationalTestCase,
     Turn,
     TurnParams,
 )
-from .mllm_test_case import MLLMTestCase, MLLMTestCaseParams, MLLMImage
 from .arena_test_case import ArenaTestCase, Contestant
 from .mcp import (
     MCPServer,
@@ -31,8 +31,6 @@ __all__ = [
     "MCPPromptCall",
     "MCPResourceCall",
     "MCPToolCall",
-    "MLLMTestCase",
-    "MLLMTestCaseParams",
     "MLLMImage",
     "ArenaTestCase",
     "Contestant",
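The import-level consequence of this change: `MLLMTestCase` and `MLLMTestCaseParams` are no longer exported from `deepeval.test_case` (and the `mllm_test_case` module itself is deleted per the file list above), while `MLLMImage` remains available, now re-exported from the `llm_test_case` module. A quick sketch of the surviving import surface; how images attach to `LLMTestCase` is not shown in this hunk, so it is not assumed here:

from deepeval.test_case import LLMTestCase, MLLMImage  # still importable in 3.7.6

# from deepeval.test_case import MLLMTestCase  # no longer exported in 3.7.6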
deepeval/test_case/api.py
CHANGED

@@ -10,7 +10,6 @@ from deepeval.test_run.api import (
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
-    MLLMTestCase,
     Turn,
 )
 from deepeval.constants import PYTEST_RUN_TEST_NAME
@@ -29,10 +28,11 @@ def create_api_turn(turn: Turn, index: int) -> TurnApi:
 
 
 def create_api_test_case(
-    test_case: Union[LLMTestCase, ConversationalTestCase
+    test_case: Union[LLMTestCase, ConversationalTestCase],
    trace: Optional[TraceApi] = None,
    index: Optional[int] = None,
 ) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
+
     if isinstance(test_case, ConversationalTestCase):
         order = (
             test_case._dataset_rank
@@ -59,8 +59,10 @@
             context=test_case.context,
             tags=test_case.tags,
             comments=test_case.comments,
+            imagesMapping=test_case._get_images_mapping(),
             additionalMetadata=test_case.additional_metadata,
         )
+
         api_test_case.turns = [
             create_api_turn(
                 turn=turn,
@@ -84,48 +86,27 @@
         name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
         metrics_data = []
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    elif isinstance(test_case, MLLMTestCase):
-        api_test_case = LLMApiTestCase(
-            name=name,
-            input="",
-            multimodalInput=test_case.input,
-            multimodalActualOutput=test_case.actual_output,
-            multimodalExpectedOutput=test_case.expected_output,
-            multimodalRetrievalContext=test_case.retrieval_context,
-            multimodalContext=test_case.context,
-            toolsCalled=test_case.tools_called,
-            expectedTools=test_case.expected_tools,
-            tokenCost=test_case.token_cost,
-            completionTime=test_case.completion_time,
-            success=success,
-            metricsData=metrics_data,
-            runDuration=None,
-            evaluationCost=None,
-            order=order,
-            additionalMetadata=test_case.additional_metadata,
-            comments=test_case.comments,
-        )
+    api_test_case = LLMApiTestCase(
+        name=name,
+        input=test_case.input,
+        actualOutput=test_case.actual_output,
+        expectedOutput=test_case.expected_output,
+        retrievalContext=test_case.retrieval_context,
+        context=test_case.context,
+        imagesMapping=test_case._get_images_mapping(),
+        toolsCalled=test_case.tools_called,
+        expectedTools=test_case.expected_tools,
+        tokenCost=test_case.token_cost,
+        completionTime=test_case.completion_time,
+        success=success,
+        metricsData=metrics_data,
+        runDuration=None,
+        evaluationCost=None,
+        order=order,
+        additionalMetadata=test_case.additional_metadata,
+        comments=test_case.comments,
+        tags=test_case.tags,
+        trace=trace,
+    )
     # llm_test_case_lookup_map[instance_id] = api_test_case
     return api_test_case

deepeval/test_case/arena_test_case.py
CHANGED

@@ -1,7 +1,7 @@
 from typing import List, Dict, Optional, Union
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pydantic import BaseModel
-
+import re
 from deepeval.test_case import (
     LLMTestCase,
 )
@@ -19,6 +19,7 @@ class Contestant(BaseModel):
 @dataclass
 class ArenaTestCase:
     contestants: List[Contestant]
+    multimodal: bool = field(default=False)
 
     def __post_init__(self):
         contestant_names = [contestant.name for contestant in self.contestants]
@@ -38,6 +39,10 @@ class ArenaTestCase:
                 "All contestants must have the same 'expected_output'."
             )
 
+        for contestant in self.contestants:
+            if contestant.test_case.multimodal:
+                self.multimodal = True
+
 
 class Arena:
     test_cases: List[ArenaTestCase]
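Note that `ArenaTestCase.multimodal` is a derived flag: `__post_init__` flips it to True whenever any contestant's `test_case.multimodal` is True, so callers do not set it themselves. A short sketch; `contestant_a` and `contestant_b` are placeholders for `Contestant` objects built exactly as in earlier releases:

from deepeval.test_case import ArenaTestCase

case = ArenaTestCase(contestants=[contestant_a, contestant_b])
# After __post_init__, case.multimodal is True only if some contestant's
# test_case.multimodal is True; text-only arenas keep the default False.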