deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +70 -26
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +52 -28
- deepeval/models/embedding_models/local_embedding_model.py +18 -14
- deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
- deepeval/models/embedding_models/openai_embedding_model.py +40 -21
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +44 -23
- deepeval/models/llms/azure_model.py +121 -36
- deepeval/models/llms/deepseek_model.py +18 -13
- deepeval/models/llms/gemini_model.py +129 -43
- deepeval/models/llms/grok_model.py +18 -13
- deepeval/models/llms/kimi_model.py +18 -13
- deepeval/models/llms/litellm_model.py +42 -22
- deepeval/models/llms/local_model.py +12 -7
- deepeval/models/llms/ollama_model.py +114 -12
- deepeval/models/llms/openai_model.py +137 -41
- deepeval/models/llms/portkey_model.py +24 -7
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +46 -1
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/{optimization → optimizer}/utils.py
CHANGED

@@ -2,11 +2,14 @@ from __future__ import annotations
 import inspect
 import random
 import re
+import statistics
 from typing import (
     Any,
     Callable,
     List,
     Optional,
+    Protocol,
+    Sequence,
     Tuple,
     TYPE_CHECKING,
     Union,
@@ -17,11 +20,13 @@ from typing import (
 from deepeval.errors import DeepEvalError
 from deepeval.metrics.base_metric import BaseMetric, BaseConversationalMetric
 from deepeval.prompt.prompt import Prompt
-from deepeval.prompt.api import
-from deepeval.optimization.types import (
+from deepeval.prompt.api import PromptMessage
+from deepeval.optimizer.types import (
+    ModelCallback,
     ModuleId,
     PromptConfigurationId,
     PromptConfiguration,
+    PromptConfigSnapshot,
     OptimizationReport,
 )

@@ -54,7 +59,7 @@ def split_goldens(
     pareto_size: Number of items to allocate to the Pareto set bound between [0, len(goldens)].
     random_state: A shared `random.Random` instance that provides the source
         of randomness. For reproducible runs, pass the same object used by
-        the GEPA loop constructed from `
+        the GEPA loop constructed from `GEPA.random_seed`

 Returns:
     (d_feedback, d_pareto)
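For orientation, a minimal sketch of how the updated `split_goldens` might be driven; the positional `goldens` parameter is assumed from the docstring above, and only `pareto_size` and `random_state` are confirmed by this hunk:

    import random

    from deepeval.optimizer.utils import split_goldens

    goldens = [...]  # hypothetical list of Golden objects

    # Reuse the same random.Random the GEPA loop builds from GEPA.random_seed
    # so the feedback/Pareto split is reproducible across runs.
    rng = random.Random(42)
    d_feedback, d_pareto = split_goldens(goldens, pareto_size=5, random_state=rng)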
@@ -151,87 +156,22 @@ def normalize_seed_prompts(
     return mapping


-def build_model_callback_kwargs(
-    *,
-    # scoring context
-    golden: Optional[Union["Golden", "ConversationalGolden"]] = None,
-    # rewriter context
-    feedback_text: Optional[str] = None,
-    # shared
-    prompt: Optional[Prompt] = None,
-    prompt_type: Optional[str] = None,
-    prompt_text: Optional[str] = None,
-    prompt_messages: Optional[List["PromptMessage"]] = None,
-) -> Dict[str, Any]:
-    """
-    Build a superset of kwargs for GEPA model callbacks.
-
-    All keys are present in the dict so callbacks can declare any subset of:
-
-        hook: str  # injected by (a_)invoke_model_callback
-        prompt: Prompt
-        prompt_type: str
-        prompt_text: str
-        prompt_messages: List[PromptMessage]
-        golden: Golden | ConversationalGolden
-        feedback_text: str
-
-    Non applicable fields are set to None.
-    """
-    return {
-        # scoring context
-        "golden": golden,
-        # rewriter context
-        "feedback_text": feedback_text,
-        # shared
-        "prompt": prompt,
-        "prompt_text": prompt_text,
-        "prompt_messages": prompt_messages,
-    }
-
-
 def invoke_model_callback(
     *,
-    hook: str,
-    model_callback: Callable[
-        ...,
-        Union[
-            str,
-            Dict,
-            Tuple[Union[str, Dict], float],
-        ],
-    ],
-    candidate_kwargs: Dict[str, Any],
-) -> Union[
-    str,
-    Dict,
-    Tuple[Union[str, Dict], float],
-]:
+    model_callback: ModelCallback,
+    prompt: Prompt,
+    golden: Union["Golden", "ConversationalGolden"],
+) -> str:
     """
     Call a user provided model_callback in a synchronous context.

-
-    - Injects `hook` if the callback declares it.
-    - Raises if the callback returns an awaitable; callers must use async
-      helpers for async callbacks.
+    Raises if the callback returns an awaitable.
     """
-    sig = inspect.signature(model_callback)
-    supported = set(sig.parameters.keys())
-
-    filtered = {
-        key: value
-        for key, value in candidate_kwargs.items()
-        if key in supported
-    }
-
-    if "hook" in supported:
-        filtered["hook"] = hook
-
-    result = model_callback(**filtered)
+    result = model_callback(prompt, golden)
     if inspect.isawaitable(result):
         raise DeepEvalError(
             "model_callback returned an awaitable from a synchronous context. "
-            "Either declare the callback as `async def` and use async
+            "Either declare the callback as `async def` and use async optimization, or call "
             "`model.generate(...)` instead of `model.a_generate(...)` inside a sync callback."
         )
     return result
@@ -239,41 +179,16 @@ def invoke_model_callback(


 async def a_invoke_model_callback(
     *,
-    hook: str,
-    model_callback: Callable[
-        ...,
-        Union[
-            str,
-            Dict,
-            Tuple[Union[str, Dict], float],
-        ],
-    ],
-    candidate_kwargs: Dict[str, Any],
-) -> Union[
-    str,
-    Dict,
-    Tuple[Union[str, Dict], float],
-]:
+    model_callback: ModelCallback,
+    prompt: Prompt,
+    golden: Union["Golden", "ConversationalGolden"],
+) -> str:
     """
     Call a user provided model_callback in an async context.

-
-    - Injects `hook` if the callback declares it.
-    - Supports both sync and async callbacks.
+    Supports both sync and async callbacks.
     """
-    sig = inspect.signature(model_callback)
-    supported = set(sig.parameters.keys())
-
-    filtered = {
-        key: value
-        for key, value in candidate_kwargs.items()
-        if key in supported
-    }
-
-    if "hook" in supported:
-        filtered["hook"] = hook
-
-    result = model_callback(**filtered)
+    result = model_callback(prompt, golden)
     if inspect.isawaitable(result):
         return await result
     return result
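Taken together, these two hunks replace the old kwargs-introspection protocol (`hook` injection plus `candidate_kwargs` filtering) with a fixed two-argument signature. A hedged sketch of the new contract, assuming `ModelCallback` is the `(prompt, golden) -> str` callable type imported from `deepeval.optimizer.types` above; `model` is a hypothetical deepeval LLM wrapper whose `generate`/`a_generate` names come from the error message in the hunk:

    from deepeval.dataset import Golden
    from deepeval.prompt import Prompt

    def score_callback(prompt: Prompt, golden: Golden) -> str:
        # Produce the model output for this golden using the candidate prompt.
        return model.generate(prompt.interpolate(input=golden.input))

    async def a_score_callback(prompt: Prompt, golden: Golden) -> str:
        return await model.a_generate(prompt.interpolate(input=golden.input))

`invoke_model_callback` raises if handed the async variant from a sync context, while `a_invoke_model_callback` accepts either and awaits as needed.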
@@ -288,58 +203,17 @@ def build_prompt_config_snapshots(
     prompt_configurations_by_id: Dict[
         PromptConfigurationId, "PromptConfiguration"
     ],
-) -> Dict[PromptConfigurationId,
+) -> Dict[PromptConfigurationId, PromptConfigSnapshot]:
     """
-    Build
-
-    Shape matches the docs for `prompt_configurations`:
-
-        {
-            "<config_id>": {
-                "parent": "<parent_id or None>",
-                "prompts": {
-                    "<module_id>": {
-                        "type": "TEXT",
-                        "text_template": "...",
-                    }
-                    # or
-                    "<module_id>": {
-                        "type": "LIST",
-                        "messages": [
-                            {"role": "system", "content": "..."},
-                            ...
-                        ],
-                    },
-                },
-            },
-            ...
-        }
+    Build snapshots of all prompt configurations.
     """
-    snapshots: Dict[PromptConfigurationId,
+    snapshots: Dict[PromptConfigurationId, PromptConfigSnapshot] = {}

     for cfg_id, cfg in prompt_configurations_by_id.items():
-
-
-
-
-                messages = [
-                    {"role": msg.role, "content": (msg.content or "")}
-                    for msg in (prompt.messages_template or [])
-                ]
-                prompts_snapshot[module_id] = {
-                    "type": "LIST",
-                    "messages": messages,
-                }
-            else:
-                prompts_snapshot[module_id] = {
-                    "type": "TEXT",
-                    "text_template": (prompt.text_template or ""),
-                }
-
-        snapshots[cfg_id] = {
-            "parent": cfg.parent,
-            "prompts": prompts_snapshot,
-        }
+        snapshots[cfg_id] = PromptConfigSnapshot(
+            parent=cfg.parent,
+            prompts=dict(cfg.prompts),
+        )

     return snapshots

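The snapshot payload changes from the nested-dict shape documented in the deleted docstring to a typed `PromptConfigSnapshot`; only the `parent` and `prompts` constructor fields are visible in this hunk. A hedged sketch, where `cfg` is a hypothetical `PromptConfiguration` mirroring the loop body above:

    snapshot = PromptConfigSnapshot(
        parent=cfg.parent,           # parent configuration id, or None
        prompts=dict(cfg.prompts),   # shallow copy: module_id -> Prompt
    )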
@@ -494,17 +368,8 @@ def validate_sequence_of(
 def validate_callback(
     *,
     component: str,
-    model_callback: Optional[
-        Callable[
-            ...,
-            Union[
-                str,
-                Dict,
-                Tuple[Union[str, Dict], float],
-            ],
-        ]
-    ],
-) -> Callable[..., Union[str, Dict, Tuple[Union[str, Dict], float]]]:
+    model_callback: Optional[ModelCallback],
+) -> ModelCallback:
     """
     Ensure that `model_callback` is provided.

@@ -596,3 +461,20 @@ def validate_int_in_range(
     )

     return value
+
+
+##############
+# Aggregates #
+##############
+
+
+class Aggregator(Protocol):
+    def __call__(self, scores: Sequence[float]) -> float: ...
+
+
+def mean_of_all(scores: Sequence[float]) -> float:
+    return statistics.fmean(scores) if scores else 0.0
+
+
+def median_of_all(scores: Sequence[float]) -> float:
+    return statistics.median(scores) if scores else 0.0
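The new aggregate helpers guard the empty case instead of letting `statistics` raise, and anything matching the `Aggregator` protocol can be swapped in. A quick check of the behavior defined above, using only the stdlib calls the hunk relies on:

    from statistics import fmean, median

    scores = [0.2, 0.5, 0.9]
    fmean(scores)   # 0.5333..., what mean_of_all(scores) returns
    median(scores)  # 0.5, what median_of_all(scores) returns
    # mean_of_all([]) and median_of_all([]) both return 0.0 rather than
    # raising StatisticsError, per the `if scores else 0.0` guard.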
deepeval/prompt/prompt.py
CHANGED
@@ -4,7 +4,7 @@ import json
 import os

 from enum import Enum
-from typing import Optional, List, Dict, Type, Literal
+from typing import Optional, List, Dict, Type, Literal
 from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
 from rich.console import Console
 from pydantic import BaseModel, ValidationError
@@ -34,10 +34,6 @@ from deepeval.prompt.utils import (
 from deepeval.confident.api import Api, Endpoints, HttpMethods
 from deepeval.constants import HIDDEN_DIR

-
-if TYPE_CHECKING:
-    from deepeval.optimization.types import OptimizationReport
-
 logger = logging.getLogger(__name__)

 portalocker = None
@@ -117,6 +113,7 @@ class Prompt:
         model_settings: Optional[ModelSettings] = None,
         output_type: Optional[OutputType] = None,
         output_schema: Optional[Type[BaseModel]] = None,
+        interpolation_type: Optional[PromptInterpolationType] = None,
     ):
         if text_template and messages_template:
             raise TypeError(
@@ -129,7 +126,9 @@ class Prompt:
         self.output_type: Optional[OutputType] = output_type
         self.output_schema: Optional[Type[BaseModel]] = output_schema
         self.label: Optional[str] = None
-        self.interpolation_type:
+        self.interpolation_type: PromptInterpolationType = (
+            interpolation_type or PromptInterpolationType.FSTRING
+        )

         self._version = None
         self._prompt_version_id: Optional[str] = None
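`interpolation_type` is now a constructor argument with an FSTRING default instead of a bare attribute. A hedged sketch; the `alias` parameter and the `deepeval.prompt.api` import path for `PromptInterpolationType` are assumed from deepeval's usual Prompt usage, not shown in this hunk:

    from deepeval.prompt import Prompt
    from deepeval.prompt.api import PromptInterpolationType

    # Omitting interpolation_type falls back to FSTRING, per the `or` above.
    p = Prompt(alias="summarizer", text_template="Summarize: {text}")
    assert p.interpolation_type == PromptInterpolationType.FSTRING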
@@ -145,9 +144,6 @@ class Prompt:
         elif messages_template:
             self.type = PromptType.LIST

-        # updated after optimization runs
-        self.optimization_report: Optional["OptimizationReport"] = None
-
     def __del__(self):
         """Cleanup polling tasks when instance is destroyed"""
         try:
deepeval/test_case/__init__.py
CHANGED
@@ -3,13 +3,13 @@ from .llm_test_case import (
     LLMTestCaseParams,
     ToolCall,
     ToolCallParams,
+    MLLMImage,
 )
 from .conversational_test_case import (
     ConversationalTestCase,
     Turn,
     TurnParams,
 )
-from .mllm_test_case import MLLMTestCase, MLLMTestCaseParams, MLLMImage
 from .arena_test_case import ArenaTestCase, Contestant
 from .mcp import (
     MCPServer,
@@ -31,8 +31,6 @@ __all__ = [
     "MCPPromptCall",
     "MCPResourceCall",
     "MCPToolCall",
-    "MLLMTestCase",
-    "MLLMTestCaseParams",
     "MLLMImage",
     "ArenaTestCase",
     "Contestant",
deepeval/test_case/api.py
CHANGED
@@ -10,9 +10,9 @@ from deepeval.test_run.api import (
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
-    MLLMTestCase,
     Turn,
 )
+from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY
 from deepeval.constants import PYTEST_RUN_TEST_NAME

@@ -29,10 +29,12 @@ def create_api_turn(turn: Turn, index: int) -> TurnApi:


 def create_api_test_case(
-    test_case: Union[LLMTestCase, ConversationalTestCase
+    test_case: Union[LLMTestCase, ConversationalTestCase],
     trace: Optional[TraceApi] = None,
     index: Optional[int] = None,
 ) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
+    from deepeval.utils import convert_to_multi_modal_array
+
     if isinstance(test_case, ConversationalTestCase):
         order = (
             test_case._dataset_rank
@@ -84,7 +86,7 @@ def create_api_test_case(
     name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
     metrics_data = []

-    if isinstance(test_case, LLMTestCase):
+    if isinstance(test_case, LLMTestCase) and test_case.multimodal is False:
         api_test_case = LLMApiTestCase(
             name=name,
             input=test_case.input,
@@ -106,15 +108,15 @@ def create_api_test_case(
             comments=test_case.comments,
             trace=trace,
         )
-    elif isinstance(test_case,
+    elif isinstance(test_case, LLMTestCase) and test_case.multimodal:
         api_test_case = LLMApiTestCase(
             name=name,
-            input=
-
-
-
-
-
+            input=test_case.input,
+            actualOutput=test_case.actual_output,
+            expectedOutput=test_case.expected_output,
+            retrievalContext=test_case.retrieval_context,
+            context=test_case.context,
+            imagesMapping=_MLLM_IMAGE_REGISTRY,
             toolsCalled=test_case.tools_called,
             expectedTools=test_case.expected_tools,
             tokenCost=test_case.token_cost,
deepeval/test_case/conversational_test_case.py
CHANGED

@@ -9,7 +9,7 @@ from typing import List, Optional, Dict, Literal
 from copy import deepcopy
 from enum import Enum

-from deepeval.test_case import ToolCall
+from deepeval.test_case import ToolCall, MLLMImage
 from deepeval.test_case.mcp import (
     MCPServer,
     MCPPromptCall,
@@ -156,11 +156,29 @@ class ConversationalTestCase(BaseModel):
     comments: Optional[str] = Field(default=None)
     tags: Optional[List[str]] = Field(default=None)
     mcp_servers: Optional[List[MCPServer]] = Field(default=None)
+    multimodal: bool = False

     _dataset_rank: Optional[int] = PrivateAttr(default=None)
     _dataset_alias: Optional[str] = PrivateAttr(default=None)
     _dataset_id: Optional[str] = PrivateAttr(default=None)

+    @model_validator(mode="after")
+    def set_is_multimodal(self):
+        import re
+
+        if self.multimodal is True:
+            return self
+
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        self.multimodal = any(
+            [
+                re.search(pattern, turn.content) is not None
+                for turn in self.turns
+            ]
+        )
+
+        return self
+
     @model_validator(mode="before")
     def validate_input(cls, data):
         turns = data.get("turns")
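`ConversationalTestCase` picks up the same `multimodal` plumbing: the `after` validator scans each turn's content for the `[DEEPEVAL:IMAGE:...]` placeholder that `MLLMImage` (next file) stringifies to. A hedged sketch with a hypothetical remote image:

    from deepeval.test_case import ConversationalTestCase, MLLMImage, Turn

    img = MLLMImage(url="https://example.com/cat.png")
    tc = ConversationalTestCase(
        turns=[Turn(role="user", content=f"What is in this image? {img}")]
    )
    assert tc.multimodal  # placeholder found by set_is_multimodal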
deepeval/test_case/llm_test_case.py
CHANGED

@@ -9,7 +9,12 @@ from typing import List, Optional, Dict, Any
 from enum import Enum
 import json
 import uuid
-
+import re
+import os
+import mimetypes
+import base64
+from dataclasses import dataclass, field
+from urllib.parse import urlparse, unquote
 from deepeval.utils import make_model_config

 from deepeval.test_case.mcp import (
@@ -20,6 +25,128 @@ from deepeval.test_case.mcp import (
     validate_mcp_servers,
 )

+_MLLM_IMAGE_REGISTRY: Dict[str, "MLLMImage"] = {}
+
+
+@dataclass
+class MLLMImage:
+    dataBase64: Optional[str] = None
+    mimeType: Optional[str] = None
+    url: Optional[str] = None
+    local: Optional[bool] = None
+    filename: Optional[str] = None
+    _id: str = field(default_factory=lambda: uuid.uuid4().hex)
+
+    def __post_init__(self):
+
+        if not self.url and not self.dataBase64:
+            raise ValueError(
+                "You must provide either a 'url' or both 'dataBase64' and 'mimeType' to create an MLLMImage."
+            )
+
+        if self.dataBase64 is not None:
+            if self.mimeType is None:
+                raise ValueError(
+                    "mimeType must be provided when initializing from Base64 data."
+                )
+        else:
+            is_local = self.is_local_path(self.url)
+            if self.local is not None:
+                assert self.local == is_local, "Local path mismatch"
+            else:
+                self.local = is_local
+
+            # compute filename, mime_type, and Base64 data
+            if self.local:
+                path = self.process_url(self.url)
+                self.filename = os.path.basename(path)
+                self.mimeType = (
+                    mimetypes.guess_type(path)[0] or "application/octet-stream"
+                )
+                with open(path, "rb") as f:
+                    raw = f.read()
+                self.dataBase64 = base64.b64encode(raw).decode("ascii")
+            else:
+                self.filename = None
+                self.mimeType = None
+                self.dataBase64 = None
+
+        _MLLM_IMAGE_REGISTRY[self._id] = self
+
+    def _placeholder(self) -> str:
+        return f"[DEEPEVAL:IMAGE:{self._id}]"
+
+    def __str__(self) -> str:
+        return self._placeholder()
+
+    def __repr__(self) -> str:
+        return self._placeholder()
+
+    def __format__(self, format_spec: str) -> str:
+        return self._placeholder()
+
+    @staticmethod
+    def process_url(url: str) -> str:
+        if os.path.exists(url):
+            return url
+        parsed = urlparse(url)
+        if parsed.scheme == "file":
+            raw_path = (
+                f"//{parsed.netloc}{parsed.path}"
+                if parsed.netloc
+                else parsed.path
+            )
+            path = unquote(raw_path)
+            return path
+        return url
+
+    @staticmethod
+    def is_local_path(url: str) -> bool:
+        if os.path.exists(url):
+            return True
+        parsed = urlparse(url)
+        if parsed.scheme == "file":
+            raw_path = (
+                f"//{parsed.netloc}{parsed.path}"
+                if parsed.netloc
+                else parsed.path
+            )
+            path = unquote(raw_path)
+            return os.path.exists(path)
+        return False
+
+    def parse_multimodal_string(s: str):
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        matches = list(re.finditer(pattern, s))
+
+        result = []
+        last_end = 0
+
+        for m in matches:
+            start, end = m.span()
+
+            if start > last_end:
+                result.append(s[last_end:start])
+
+            img_id = m.group(1)
+
+            if img_id not in _MLLM_IMAGE_REGISTRY:
+                MLLMImage(url=img_id, _id=img_id)
+
+            result.append(_MLLM_IMAGE_REGISTRY[img_id])
+            last_end = end
+
+        if last_end < len(s):
+            result.append(s[last_end:])
+
+        return result
+
+    def as_data_uri(self) -> Optional[str]:
+        """Return the image as a data URI string, if Base64 data is available."""
+        if not self.dataBase64 or not self.mimeType:
+            return None
+        return f"data:{self.mimeType};base64,{self.dataBase64}"
+

 class LLMTestCaseParams(Enum):
     INPUT = "input"
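`MLLMImage` replaces the deleted `MLLMTestCase` path: every instance registers itself in `_MLLM_IMAGE_REGISTRY` and renders as a `[DEEPEVAL:IMAGE:<id>]` placeholder under `str()`, `repr()`, and f-string formatting, so images ride along inside ordinary string fields. A hedged sketch of the lifecycle (the file path is hypothetical):

    from deepeval.test_case import MLLMImage

    # Local file: __post_init__ reads it, guesses the MIME type, and stores
    # Base64 data so as_data_uri() works.
    local = MLLMImage(url="./receipt.png")
    local.mimeType        # e.g. "image/png"
    f"{local}"            # "[DEEPEVAL:IMAGE:<hex id>]"
    local.as_data_uri()   # "data:image/png;base64,...."

    # Remote URL: nothing is fetched; filename/mimeType/dataBase64 stay None.
    remote = MLLMImage(url="https://example.com/cat.png")
    remote.as_data_uri()  # None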
@@ -208,6 +335,7 @@ class LLMTestCase(BaseModel):
         serialization_alias="completionTime",
         validation_alias=AliasChoices("completionTime", "completion_time"),
     )
+    multimodal: bool = Field(default=False)
     name: Optional[str] = Field(default=None)
     tags: Optional[List[str]] = Field(default=None)
     mcp_servers: Optional[List[MCPServer]] = Field(default=None)
@@ -229,6 +357,29 @@ class LLMTestCase(BaseModel):
         default_factory=lambda: str(uuid.uuid4())
     )

+    @model_validator(mode="after")
+    def set_is_multimodal(self):
+        import re
+
+        if self.multimodal is True:
+            return self
+
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+
+        auto_detect = (
+            any(
+                [
+                    re.search(pattern, self.input or "") is not None,
+                    re.search(pattern, self.actual_output or "") is not None,
+                ]
+            )
+            if isinstance(self.input, str)
+            else self.multimodal
+        )
+
+        self.multimodal = auto_detect
+        return self
+
     @model_validator(mode="before")
     def validate_input(cls, data):
         input = data.get("input")
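The matching validator on `LLMTestCase` auto-detects the flag from `input` and `actual_output`, so single-modal construction is unchanged. A hedged sketch with a hypothetical image URL:

    from deepeval.test_case import LLMTestCase, MLLMImage

    img = MLLMImage(url="https://example.com/chart.png")
    tc = LLMTestCase(
        input=f"Describe this chart: {img}",
        actual_output="A bar chart of monthly revenue.",
    )
    assert tc.multimodal  # set by set_is_multimodal via the placeholder regex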
deepeval/test_case/utils.py
CHANGED
@@ -1,24 +1,20 @@
 from typing import Union, List

-from deepeval.test_case import LLMTestCase,
+from deepeval.test_case import LLMTestCase, ConversationalTestCase


 def check_valid_test_cases_type(
-    test_cases: Union[
-        List[Union[LLMTestCase, MLLMTestCase]], List[ConversationalTestCase]
-    ],
+    test_cases: Union[List[LLMTestCase], List[ConversationalTestCase]],
 ):
     llm_test_case_count = 0
     conversational_test_case_count = 0
     for test_case in test_cases:
-        if isinstance(test_case, LLMTestCase) or isinstance(
-            test_case, MLLMTestCase
-        ):
+        if isinstance(test_case, LLMTestCase):
             llm_test_case_count += 1
         else:
             conversational_test_case_count += 1

     if llm_test_case_count > 0 and conversational_test_case_count > 0:
         raise ValueError(
-            "You cannot supply a mixture of `LLMTestCase
+            "You cannot supply a mixture of `LLMTestCase`(s) and `ConversationalTestCase`(s) as the list of test cases."
         )
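With `MLLMTestCase` removed, the guard reduces to a straight two-way check; mixing the remaining kinds still raises. A hedged sketch:

    from deepeval.test_case import ConversationalTestCase, LLMTestCase, Turn
    from deepeval.test_case.utils import check_valid_test_cases_type

    llm = LLMTestCase(input="hi", actual_output="hello")
    conv = ConversationalTestCase(turns=[Turn(role="user", content="hi")])

    check_valid_test_cases_type([llm, llm])   # fine
    check_valid_test_cases_type([llm, conv])  # raises ValueError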