deepeval 3.7.3__py3-none-any.whl → 3.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/test.py +1 -1
- deepeval/config/settings.py +102 -13
- deepeval/evaluate/configs.py +1 -1
- deepeval/evaluate/execute.py +4 -1
- deepeval/metrics/answer_relevancy/template.py +4 -4
- deepeval/metrics/argument_correctness/template.py +2 -2
- deepeval/metrics/bias/template.py +3 -3
- deepeval/metrics/contextual_precision/template.py +6 -6
- deepeval/metrics/contextual_recall/template.py +2 -2
- deepeval/metrics/contextual_relevancy/template.py +3 -3
- deepeval/metrics/conversation_completeness/template.py +2 -2
- deepeval/metrics/conversational_dag/templates.py +4 -4
- deepeval/metrics/conversational_g_eval/template.py +4 -3
- deepeval/metrics/dag/templates.py +4 -4
- deepeval/metrics/faithfulness/template.py +4 -4
- deepeval/metrics/hallucination/template.py +4 -4
- deepeval/metrics/misuse/template.py +2 -2
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +7 -7
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +6 -6
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +2 -2
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +3 -3
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +9 -9
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +4 -4
- deepeval/metrics/non_advice/template.py +2 -2
- deepeval/metrics/pii_leakage/template.py +2 -2
- deepeval/metrics/prompt_alignment/template.py +4 -4
- deepeval/metrics/role_violation/template.py +2 -2
- deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
- deepeval/metrics/toxicity/template.py +4 -4
- deepeval/metrics/turn_relevancy/template.py +2 -2
- deepeval/models/embedding_models/azure_embedding_model.py +28 -15
- deepeval/models/embedding_models/local_embedding_model.py +23 -10
- deepeval/models/embedding_models/ollama_embedding_model.py +8 -6
- deepeval/models/embedding_models/openai_embedding_model.py +18 -2
- deepeval/models/llms/anthropic_model.py +17 -5
- deepeval/models/llms/azure_model.py +30 -18
- deepeval/models/llms/deepseek_model.py +22 -12
- deepeval/models/llms/gemini_model.py +120 -87
- deepeval/models/llms/grok_model.py +23 -16
- deepeval/models/llms/kimi_model.py +23 -12
- deepeval/models/llms/litellm_model.py +63 -25
- deepeval/models/llms/local_model.py +26 -18
- deepeval/models/llms/ollama_model.py +17 -7
- deepeval/models/llms/openai_model.py +22 -17
- deepeval/models/llms/portkey_model.py +132 -0
- deepeval/models/mlllms/azure_model.py +28 -19
- deepeval/models/mlllms/gemini_model.py +102 -73
- deepeval/models/mlllms/ollama_model.py +40 -9
- deepeval/models/mlllms/openai_model.py +65 -14
- deepeval/models/utils.py +48 -3
- deepeval/optimization/__init__.py +13 -0
- deepeval/optimization/adapters/__init__.py +2 -0
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +588 -0
- deepeval/optimization/aggregates.py +14 -0
- deepeval/optimization/configs.py +34 -0
- deepeval/optimization/copro/configs.py +31 -0
- deepeval/optimization/copro/loop.py +837 -0
- deepeval/optimization/gepa/__init__.py +7 -0
- deepeval/optimization/gepa/configs.py +115 -0
- deepeval/optimization/gepa/loop.py +677 -0
- deepeval/optimization/miprov2/configs.py +134 -0
- deepeval/optimization/miprov2/loop.py +785 -0
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +458 -0
- deepeval/optimization/policies/__init__.py +16 -0
- deepeval/optimization/policies/selection.py +166 -0
- deepeval/optimization/policies/tie_breaker.py +67 -0
- deepeval/optimization/prompt_optimizer.py +462 -0
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +33 -0
- deepeval/optimization/simba/loop.py +983 -0
- deepeval/optimization/simba/types.py +15 -0
- deepeval/optimization/types.py +361 -0
- deepeval/optimization/utils.py +598 -0
- deepeval/prompt/prompt.py +10 -5
- deepeval/test_run/cache.py +2 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +24 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/METADATA +1 -1
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/RECORD +84 -59
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/WHEEL +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/entry_points.txt +0 -0
deepeval/optimization/utils.py
ADDED

@@ -0,0 +1,598 @@
from __future__ import annotations
import inspect
import random
import re
from typing import (
    Any,
    Callable,
    List,
    Optional,
    Tuple,
    TYPE_CHECKING,
    Union,
    Dict,
    Set,
)

from deepeval.errors import DeepEvalError
from deepeval.metrics.base_metric import BaseMetric, BaseConversationalMetric
from deepeval.prompt.prompt import Prompt
from deepeval.prompt.api import PromptType, PromptMessage
from deepeval.optimization.types import (
    ModuleId,
    PromptConfigurationId,
    PromptConfiguration,
    OptimizationReport,
)


if TYPE_CHECKING:
    from deepeval.dataset.golden import Golden, ConversationalGolden
    from deepeval.prompt.api import PromptMessage


def split_goldens(
    goldens: Union[List[Golden], List[ConversationalGolden]],
    pareto_size: int,
    *,
    random_state: random.Random,
) -> Tuple[
    Union[List[Golden], List[ConversationalGolden]],
    Union[List[Golden], List[ConversationalGolden]],
]:
    """
    Split `goldens` into two disjoint parts:

    - d_feedback: items not selected for the Pareto validation set
    - d_pareto: `pareto_size` items for instance-wise Pareto scoring

    The selection is deterministic given `seed`. Within each split, the
    original order from `goldens` is preserved.

    Args:
        goldens: Full list/sequence of examples.
        pareto_size: Number of items to allocate to the Pareto set bound between [0, len(goldens)].
        random_state: A shared `random.Random` instance that provides the source
            of randomness. For reproducible runs, pass the same object used by
            the GEPA loop constructed from `GEPAConfig.random_seed`

    Returns:
        (d_feedback, d_pareto)
    """
    if pareto_size < 0:
        raise ValueError("pareto_size must be >= 0")

    total = len(goldens)

    if total == 0:
        # nothing to split
        return [], []

    # With a single example, we cannot form a meaningful feedback set.
    # callers like GEPARunner should enforce a minimum of 2 goldens for
    # optimization.
    if total == 1:
        return [], list(goldens)

    # For total >= 2, ensure that we always leave at least one example
    # for d_feedback. This keeps the splits disjoint while still honoring
    # pareto_size as a target up to (total - 1).
    chosen_size = min(pareto_size, total - 1)

    indices = list(range(total))
    random_state.shuffle(indices)

    pareto_indices = set(indices[:chosen_size])

    d_pareto = [goldens[i] for i in range(total) if i in pareto_indices]
    d_feedback = [goldens[i] for i in range(total) if i not in pareto_indices]

    return d_feedback, d_pareto

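As an aside (illustrative, not part of the diff): a minimal sketch of how `split_goldens` behaves, using plain strings as stand-ins for `Golden` objects, since the function only shuffles indices and slices the input list.

```python
import random

from deepeval.optimization.utils import split_goldens

# Plain strings stand in for Golden / ConversationalGolden objects.
goldens = ["g1", "g2", "g3", "g4", "g5"]

# A shared random.Random makes the split deterministic and reproducible.
rng = random.Random(42)
d_feedback, d_pareto = split_goldens(goldens, pareto_size=2, random_state=rng)

# The splits are disjoint, preserve the original order, and at least one
# item is always left in d_feedback.
assert len(d_pareto) == 2 and len(d_feedback) == 3
assert set(d_feedback).isdisjoint(d_pareto)
```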
################################
# Prompt normalization helpers #
################################


def _slug(text: str) -> str:
    slug = text.lower()
    slug = re.sub(r"[^a-z0-9]+", "-", slug)
    return slug.strip("-")


def generate_module_id(prompt: Prompt, index: int, existing: Set[str]) -> str:
    """
    Build a human readable module id stable within a single optimization run.
    Prefers alias/label; enrich with model settings provider and name; dedupe; cap to 64 chars.
    """
    parts: List[str] = []
    if prompt.alias:
        parts.append(str(prompt.alias))
    if prompt.label:
        parts.append(str(prompt.label))

    ms = prompt.model_settings
    if ms is not None:
        if ms.provider is not None:
            parts.append(ms.provider.value)
        if ms.name:
            parts.append(ms.name)

    base = "-".join(_slug(p) for p in parts if p) or f"module-{index+1}"
    base = base[:64] or f"module-{index+1}"

    candidate = base
    suffix = 2
    while candidate in existing:
        candidate = f"{base}-{suffix}"
        candidate = candidate[:64]
        suffix += 1

    existing.add(candidate)
    return candidate


def normalize_seed_prompts(
    seed_prompts: Union[Dict[ModuleId, Prompt], List[Prompt]],
) -> Dict[ModuleId, Prompt]:
    """
    Accept either {module_id: Prompt} or List[Prompt].
    If a list is given, generate human readable module ids.
    """
    if isinstance(seed_prompts, dict):
        return dict(seed_prompts)  # shallow copy

    mapping: Dict[ModuleId, Prompt] = {}
    used: Set[str] = set()
    for i, prompt in enumerate(seed_prompts):
        module_id = generate_module_id(prompt, i, used)
        mapping[module_id] = prompt
    return mapping

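A short sketch (not part of the diff) of the list-to-mapping behavior above, assuming a `Prompt` can be built from just a `text_template` as `inflate_prompts_from_report` further down does; prompts without an alias, label, or model settings fall back to positional ids.

```python
from deepeval.optimization.utils import normalize_seed_prompts
from deepeval.prompt.prompt import Prompt

# Two bare prompts: no alias, label, or model settings, so the generated
# module ids fall back to "module-1" and "module-2".
seed = [
    Prompt(text_template="Summarize the input."),
    Prompt(text_template="Answer concisely."),
]
modules = normalize_seed_prompts(seed)
assert list(modules.keys()) == ["module-1", "module-2"]

# Passing a dict instead returns a shallow copy with the ids unchanged.
modules = normalize_seed_prompts({"summarizer": seed[0]})
```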
def build_model_callback_kwargs(
    *,
    # scoring context
    golden: Optional[Union["Golden", "ConversationalGolden"]] = None,
    # rewriter context
    feedback_text: Optional[str] = None,
    # shared
    prompt: Optional[Prompt] = None,
    prompt_type: Optional[str] = None,
    prompt_text: Optional[str] = None,
    prompt_messages: Optional[List["PromptMessage"]] = None,
) -> Dict[str, Any]:
    """
    Build a superset of kwargs for GEPA model callbacks.

    All keys are present in the dict so callbacks can declare any subset of:

        hook: str  # injected by (a_)invoke_model_callback
        prompt: Prompt
        prompt_type: str
        prompt_text: str
        prompt_messages: List[PromptMessage]
        golden: Golden | ConversationalGolden
        feedback_text: str

    Non applicable fields are set to None.
    """
    return {
        # scoring context
        "golden": golden,
        # rewriter context
        "feedback_text": feedback_text,
        # shared
        "prompt": prompt,
        "prompt_text": prompt_text,
        "prompt_messages": prompt_messages,
    }


def invoke_model_callback(
    *,
    hook: str,
    model_callback: Callable[
        ...,
        Union[
            str,
            Dict,
            Tuple[Union[str, Dict], float],
        ],
    ],
    candidate_kwargs: Dict[str, Any],
) -> Union[
    str,
    Dict,
    Tuple[Union[str, Dict], float],
]:
    """
    Call a user provided model_callback in a synchronous context.

    - Filters kwargs to only those the callback accepts.
    - Injects `hook` if the callback declares it.
    - Raises if the callback returns an awaitable; callers must use async
      helpers for async callbacks.
    """
    sig = inspect.signature(model_callback)
    supported = set(sig.parameters.keys())

    filtered = {
        key: value
        for key, value in candidate_kwargs.items()
        if key in supported
    }

    if "hook" in supported:
        filtered["hook"] = hook

    result = model_callback(**filtered)
    if inspect.isawaitable(result):
        raise DeepEvalError(
            "model_callback returned an awaitable from a synchronous context. "
            "Either declare the callback as `async def` and use async GEPA, or call "
            "`model.generate(...)` instead of `model.a_generate(...)` inside a sync callback."
        )
    return result


async def a_invoke_model_callback(
    *,
    hook: str,
    model_callback: Callable[
        ...,
        Union[
            str,
            Dict,
            Tuple[Union[str, Dict], float],
        ],
    ],
    candidate_kwargs: Dict[str, Any],
) -> Union[
    str,
    Dict,
    Tuple[Union[str, Dict], float],
]:
    """
    Call a user provided model_callback in an async context.

    - Filters kwargs to only those the callback accepts.
    - Injects `hook` if the callback declares it.
    - Supports both sync and async callbacks.
    """
    sig = inspect.signature(model_callback)
    supported = set(sig.parameters.keys())

    filtered = {
        key: value
        for key, value in candidate_kwargs.items()
        if key in supported
    }

    if "hook" in supported:
        filtered["hook"] = hook

    result = model_callback(**filtered)
    if inspect.isawaitable(result):
        return await result
    return result

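An illustrative sketch (not part of the diff) of the kwarg-filtering contract above: the callback declares only the parameters it cares about, and `hook` is injected only when declared; the hook string used here is just an example value.

```python
from deepeval.optimization.utils import (
    build_model_callback_kwargs,
    invoke_model_callback,
)

# A sync callback that declares only a subset of the available kwargs.
def my_callback(prompt_text: str, hook: str) -> str:
    # In a real callback this would call a model; here we just echo.
    return f"[{hook}] {prompt_text}"

kwargs = build_model_callback_kwargs(prompt_text="Rate the answer from 1 to 5.")
output = invoke_model_callback(
    hook="scoring",  # example hook name
    model_callback=my_callback,
    candidate_kwargs=kwargs,
)
# `golden`, `feedback_text`, `prompt`, and `prompt_messages` are filtered
# out because my_callback does not declare them; `hook` is injected.
```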
282
|
+
###########
|
|
283
|
+
# Reports #
|
|
284
|
+
###########
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def build_prompt_config_snapshots(
|
|
288
|
+
prompt_configurations_by_id: Dict[
|
|
289
|
+
PromptConfigurationId, "PromptConfiguration"
|
|
290
|
+
],
|
|
291
|
+
) -> Dict[PromptConfigurationId, Dict[str, Any]]:
|
|
292
|
+
"""
|
|
293
|
+
Build a serializable snapshot of all prompt configurations.
|
|
294
|
+
|
|
295
|
+
Shape matches the docs for `prompt_configurations`:
|
|
296
|
+
|
|
297
|
+
{
|
|
298
|
+
"<config_id>": {
|
|
299
|
+
"parent": "<parent_id or None>",
|
|
300
|
+
"prompts": {
|
|
301
|
+
"<module_id>": {
|
|
302
|
+
"type": "TEXT",
|
|
303
|
+
"text_template": "...",
|
|
304
|
+
}
|
|
305
|
+
# or
|
|
306
|
+
"<module_id>": {
|
|
307
|
+
"type": "LIST",
|
|
308
|
+
"messages": [
|
|
309
|
+
{"role": "system", "content": "..."},
|
|
310
|
+
...
|
|
311
|
+
],
|
|
312
|
+
},
|
|
313
|
+
},
|
|
314
|
+
},
|
|
315
|
+
...
|
|
316
|
+
}
|
|
317
|
+
"""
|
|
318
|
+
snapshots: Dict[PromptConfigurationId, Dict[str, Any]] = {}
|
|
319
|
+
|
|
320
|
+
for cfg_id, cfg in prompt_configurations_by_id.items():
|
|
321
|
+
prompts_snapshot: Dict[str, Any] = {}
|
|
322
|
+
|
|
323
|
+
for module_id, prompt in cfg.prompts.items():
|
|
324
|
+
if prompt.type is PromptType.LIST:
|
|
325
|
+
messages = [
|
|
326
|
+
{"role": msg.role, "content": (msg.content or "")}
|
|
327
|
+
for msg in (prompt.messages_template or [])
|
|
328
|
+
]
|
|
329
|
+
prompts_snapshot[module_id] = {
|
|
330
|
+
"type": "LIST",
|
|
331
|
+
"messages": messages,
|
|
332
|
+
}
|
|
333
|
+
else:
|
|
334
|
+
prompts_snapshot[module_id] = {
|
|
335
|
+
"type": "TEXT",
|
|
336
|
+
"text_template": (prompt.text_template or ""),
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
snapshots[cfg_id] = {
|
|
340
|
+
"parent": cfg.parent,
|
|
341
|
+
"prompts": prompts_snapshot,
|
|
342
|
+
}
|
|
343
|
+
|
|
344
|
+
return snapshots
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def inflate_prompts_from_report(
|
|
348
|
+
report: OptimizationReport,
|
|
349
|
+
) -> Dict[str, Dict[str, Prompt]]:
|
|
350
|
+
"""
|
|
351
|
+
Build a mapping from configuration id -> { module_id -> Prompt }.
|
|
352
|
+
|
|
353
|
+
This is a convenience for users who want to work with real Prompt
|
|
354
|
+
instances instead of raw snapshots.
|
|
355
|
+
|
|
356
|
+
Returns:
|
|
357
|
+
{
|
|
358
|
+
"<config_id>": {
|
|
359
|
+
"<module_id>": Prompt(...),
|
|
360
|
+
...
|
|
361
|
+
},
|
|
362
|
+
...
|
|
363
|
+
}
|
|
364
|
+
"""
|
|
365
|
+
inflated: Dict[str, Dict[str, Prompt]] = {}
|
|
366
|
+
|
|
367
|
+
for cfg_id, cfg_snapshot in report.prompt_configurations.items():
|
|
368
|
+
module_prompts: Dict[str, Prompt] = {}
|
|
369
|
+
|
|
370
|
+
for module_id, module_snapshot in cfg_snapshot.prompts.items():
|
|
371
|
+
if module_snapshot.type == "TEXT":
|
|
372
|
+
module_prompts[module_id] = Prompt(
|
|
373
|
+
text_template=module_snapshot.text_template or ""
|
|
374
|
+
)
|
|
375
|
+
else: # "LIST"
|
|
376
|
+
messages = [
|
|
377
|
+
PromptMessage(role=m.role, content=m.content)
|
|
378
|
+
for m in module_snapshot.messages or []
|
|
379
|
+
]
|
|
380
|
+
module_prompts[module_id] = Prompt(messages_template=messages)
|
|
381
|
+
|
|
382
|
+
inflated[cfg_id] = module_prompts
|
|
383
|
+
|
|
384
|
+
return inflated
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def get_best_prompts_from_report(
|
|
388
|
+
report: OptimizationReport,
|
|
389
|
+
) -> Dict[str, Prompt]:
|
|
390
|
+
"""
|
|
391
|
+
Convenience wrapper returning the best configuration's module prompts.
|
|
392
|
+
"""
|
|
393
|
+
all_prompts = inflate_prompts_from_report(report)
|
|
394
|
+
return all_prompts.get(report.best_id, {})
|
|
395
|
+
|
|
396
|
+
|
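A hedged sketch (not part of the diff) of how a report produced by an optimization run might be consumed with the helpers above; it assumes `report` is an `OptimizationReport` returned elsewhere.

```python
from deepeval.optimization.types import OptimizationReport
from deepeval.optimization.utils import get_best_prompts_from_report


def print_best_prompts(report: OptimizationReport) -> None:
    # Inflate the winning configuration's snapshots back into Prompt objects.
    for module_id, prompt in get_best_prompts_from_report(report).items():
        if prompt.text_template is not None:
            print(module_id, "->", prompt.text_template)
        else:
            for message in prompt.messages_template or []:
                print(module_id, message.role, "->", message.content)
```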
##############
# Validation #
##############
def _format_type_names(types: Tuple[type, ...]) -> str:
    names = [t.__name__ for t in types]
    if len(names) == 1:
        return names[0]
    if len(names) == 2:
        return f"{names[0]} or {names[1]}"
    return ", ".join(names[:-1]) + f", or {names[-1]}"


def validate_instance(
    *,
    component: str,
    param_name: str,
    value: Any,
    expected_types: Union[type, Tuple[type, ...]],
    allow_none: bool = False,
) -> Any:
    """
    Generic type validator.

    - component: Intended to help identify what is being validated.
      e.g. "PromptOptimizer.__init__", "PromptOptimizer.optimize", etc.
    - param_name: the name of the parameter being validated
    - value: the actual value passed.
    - expected_types: a type or tuple of types to accept.
    - allow_none: if True, None is allowed and returned as-is.
    """
    if value is None and allow_none:
        return value

    if not isinstance(expected_types, tuple):
        expected_types = (expected_types,)

    if not isinstance(value, expected_types):
        expected_desc = _format_type_names(expected_types)
        raise DeepEvalError(
            f"{component} expected `{param_name}` to be an instance of "
            f"{expected_desc}, but received {type(value).__name__!r} instead."
        )
    return value


def validate_sequence_of(
    *,
    component: str,
    param_name: str,
    value: Any,
    expected_item_types: Union[type, Tuple[type, ...]],
    sequence_types: Tuple[type, ...] = (list, tuple),
    allow_none: bool = False,
) -> Any:
    """
    Generic container validator.

    - Ensures `value` is one of `sequence_types` (list by default).
    - Ensures each item is an instance of `expected_item_types`.

    Returns the original `value` on success.
    """
    if value is None:
        if allow_none:
            return value
        raise DeepEvalError(
            f"{component} expected `{param_name}` to be a "
            f"{_format_type_names(sequence_types)} of "
            f"{_format_type_names(expected_item_types if isinstance(expected_item_types, tuple) else (expected_item_types,))}, "
            "but received None instead."
        )

    if not isinstance(sequence_types, tuple):
        sequence_types = (sequence_types,)

    if not isinstance(value, sequence_types):
        expected_seq = _format_type_names(sequence_types)
        raise DeepEvalError(
            f"{component} expected `{param_name}` to be a {expected_seq}, "
            f"but received {type(value).__name__!r} instead."
        )

    if not isinstance(expected_item_types, tuple):
        expected_item_types = (expected_item_types,)

    for index, item in enumerate(value):
        if not isinstance(item, expected_item_types):
            expected_items = _format_type_names(expected_item_types)
            raise DeepEvalError(
                f"{component} expected all elements of `{param_name}` to be "
                f"instances of {expected_items}, but element at index {index} "
                f"has type {type(item).__name__!r}."
            )

    return value


def validate_callback(
    *,
    component: str,
    model_callback: Optional[
        Callable[
            ...,
            Union[
                str,
                Dict,
                Tuple[Union[str, Dict], float],
            ],
        ]
    ],
) -> Callable[..., Union[str, Dict, Tuple[Union[str, Dict], float]]]:
    """
    Ensure that `model_callback` is provided.

    - `model_callback` should be a callable that performs generation and
      returns the model output.

    Returns `model_callback` unchanged on success.
    """
    if model_callback is None:
        raise DeepEvalError(
            f"{component} requires a `model_callback`.\n\n"
            "supply a custom callable via `model_callback=` that performs "
            "generation and returns the model output."
        )
    return model_callback


def validate_metrics(
    *,
    component: str,
    metrics: Union[List[BaseMetric], List[BaseConversationalMetric]],
) -> Union[List[BaseMetric], List[BaseConversationalMetric]]:

    if metrics is None or not len(metrics):
        raise DeepEvalError(
            f"{component} requires a `metrics`.\n\n"
            "supply one or more DeepEval metrics via `metrics=`"
        )

    validate_sequence_of(
        component=component,
        param_name="metrics",
        value=metrics,
        expected_item_types=(BaseMetric, BaseConversationalMetric),
        sequence_types=(list, tuple),
    )
    return list(metrics)


def validate_int_in_range(
    *,
    component: str,
    param_name: str,
    value: int,
    min_inclusive: Optional[int] = None,
    max_exclusive: Optional[int] = None,
) -> int:
    """
    Validate that an int is within range [min_inclusive, max_exclusive).

    - If `min_inclusive` is not None, value must be >= min_inclusive.
    - If `max_exclusive` is not None, value must be < max_exclusive.

    Returns the validated int on success.
    """
    value = validate_instance(
        component=component,
        param_name=param_name,
        value=value,
        expected_types=int,
    )

    # Lower bound check
    if min_inclusive is not None and value < min_inclusive:
        if max_exclusive is None:
            raise DeepEvalError(
                f"{component} expected `{param_name}` to be >= {min_inclusive}, "
                f"but received {value!r} instead."
            )
        max_inclusive = max_exclusive - 1
        raise DeepEvalError(
            f"{component} expected `{param_name}` to be between "
            f"{min_inclusive} and {max_inclusive} (inclusive), "
            f"but received {value!r} instead."
        )

    # Upper bound check (half-open, < max_exclusive)
    if max_exclusive is not None and value >= max_exclusive:
        if min_inclusive is None:
            raise DeepEvalError(
                f"{component} expected `{param_name}` to be < {max_exclusive}, "
                f"but received {value!r} instead."
            )
        max_inclusive = max_exclusive - 1
        raise DeepEvalError(
            f"{component} expected `{param_name}` to be between "
            f"{min_inclusive} and {max_inclusive} (inclusive), "
            f"but received {value!r} instead."
        )

    return value
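For reference, a small sketch (not part of the diff) of how these validators are typically invoked; the component and parameter names are made up for illustration.

```python
from deepeval.optimization.utils import (
    validate_instance,
    validate_int_in_range,
    validate_sequence_of,
)

component = "MyOptimizer.__init__"  # hypothetical caller name

iterations = validate_int_in_range(
    component=component,
    param_name="iterations",
    value=5,
    min_inclusive=1,
    max_exclusive=101,  # accepts 1..100 inclusive
)
name = validate_instance(
    component=component,
    param_name="name",
    value="baseline",
    expected_types=str,
)
tags = validate_sequence_of(
    component=component,
    param_name="tags",
    value=["quick", "smoke"],
    expected_item_types=str,
)
# Any violation raises DeepEvalError naming the component and parameter,
# e.g. value=0 for `iterations` above.
```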
deepeval/prompt/prompt.py
CHANGED
@@ -4,12 +4,9 @@ import json
import os

from enum import Enum
-from typing import Optional, List, Dict, Type, Literal
+from typing import Optional, List, Dict, Type, Literal, TYPE_CHECKING
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
from rich.console import Console
-import time
-import json
-import os
from pydantic import BaseModel, ValidationError
import asyncio
import threading

@@ -38,6 +35,9 @@ from deepeval.confident.api import Api, Endpoints, HttpMethods
from deepeval.constants import HIDDEN_DIR


+if TYPE_CHECKING:
+    from deepeval.optimization.types import OptimizationReport
+
logger = logging.getLogger(__name__)

portalocker = None

@@ -145,6 +145,9 @@ class Prompt:
        elif messages_template:
            self.type = PromptType.LIST

+        # updated after optimization runs
+        self.optimization_report: Optional["OptimizationReport"] = None
+
    def __del__(self):
        """Cleanup polling tasks when instance is destroyed"""
        try:

@@ -178,7 +181,7 @@ class Prompt:
            content = f.read()
            try:
                data = json.loads(content)
-            except (json.JSONDecodeError
+            except (TypeError, json.JSONDecodeError):
                self.text_template = content
                return content

@@ -364,6 +367,8 @@ class Prompt:
                f.seek(0)
                f.truncate()
                json.dump(cache_data, f, cls=CustomEncoder)
+                f.flush()
+                os.fsync(f.fileno())
        except portalocker.exceptions.LockException:
            # If we can't acquire the lock, silently skip caching
            pass
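The `flush()` + `os.fsync()` lines added in this file (and in the test_run changes below) follow a common durability pattern; a generic sketch, not deepeval code:

```python
import json
import os


def write_json_durably(path: str, payload: dict) -> None:
    with open(path, "w") as f:
        json.dump(payload, f)
        # flush() drains Python's userspace buffer to the OS; os.fsync()
        # asks the OS to commit the bytes to disk before returning, so a
        # crash immediately afterwards cannot lose the write.
        f.flush()
        os.fsync(f.fileno())
```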
deepeval/test_run/cache.py
CHANGED
@@ -90,6 +90,8 @@ class CachedTestRun(BaseModel):
            # Pydantic version below 2.0
            body = self.dict(by_alias=True, exclude_none=True)
        json.dump(body, f, cls=CustomEncoder)
+        f.flush()
+        os.fsync(f.fileno())
        return self

    # load from file (this happens initially during a test run)
deepeval/test_run/test_run.py
CHANGED
@@ -406,9 +406,10 @@ class TestRun(BaseModel):
        try:
            body = self.model_dump(by_alias=True, exclude_none=True)
        except AttributeError:
-            # Pydantic version below 2.0
            body = self.dict(by_alias=True, exclude_none=True)
        json.dump(body, f, cls=TestRunEncoder)
+        f.flush()
+        os.fsync(f.fileno())
        return self

    @classmethod

@@ -515,6 +516,8 @@ class TestRunManager:
                    )
                    wrapper_data = {save_under_key: test_run_data}
                    json.dump(wrapper_data, file, cls=TestRunEncoder)
+                    file.flush()
+                    os.fsync(file.fileno())
                else:
                    self.test_run.save(file)
        except portalocker.exceptions.LockException:

@@ -527,6 +530,8 @@ class TestRunManager:
                LATEST_TEST_RUN_FILE_PATH, mode="w"
            ) as file:
                json.dump({LATEST_TEST_RUN_LINK_KEY: link}, file)
+                file.flush()
+                os.fsync(file.fileno())
        except portalocker.exceptions.LockException:
            pass
