deepeval 3.7.3__py3-none-any.whl → 3.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/test.py +1 -1
- deepeval/config/settings.py +102 -13
- deepeval/evaluate/configs.py +1 -1
- deepeval/evaluate/execute.py +4 -1
- deepeval/metrics/answer_relevancy/template.py +4 -4
- deepeval/metrics/argument_correctness/template.py +2 -2
- deepeval/metrics/bias/template.py +3 -3
- deepeval/metrics/contextual_precision/template.py +6 -6
- deepeval/metrics/contextual_recall/template.py +2 -2
- deepeval/metrics/contextual_relevancy/template.py +3 -3
- deepeval/metrics/conversation_completeness/template.py +2 -2
- deepeval/metrics/conversational_dag/templates.py +4 -4
- deepeval/metrics/conversational_g_eval/template.py +4 -3
- deepeval/metrics/dag/templates.py +4 -4
- deepeval/metrics/faithfulness/template.py +4 -4
- deepeval/metrics/hallucination/template.py +4 -4
- deepeval/metrics/misuse/template.py +2 -2
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +7 -7
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +6 -6
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +2 -2
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +3 -3
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +9 -9
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +4 -4
- deepeval/metrics/non_advice/template.py +2 -2
- deepeval/metrics/pii_leakage/template.py +2 -2
- deepeval/metrics/prompt_alignment/template.py +4 -4
- deepeval/metrics/role_violation/template.py +2 -2
- deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
- deepeval/metrics/toxicity/template.py +4 -4
- deepeval/metrics/turn_relevancy/template.py +2 -2
- deepeval/models/embedding_models/azure_embedding_model.py +28 -15
- deepeval/models/embedding_models/local_embedding_model.py +23 -10
- deepeval/models/embedding_models/ollama_embedding_model.py +8 -6
- deepeval/models/embedding_models/openai_embedding_model.py +18 -2
- deepeval/models/llms/anthropic_model.py +17 -5
- deepeval/models/llms/azure_model.py +30 -18
- deepeval/models/llms/deepseek_model.py +22 -12
- deepeval/models/llms/gemini_model.py +120 -87
- deepeval/models/llms/grok_model.py +23 -16
- deepeval/models/llms/kimi_model.py +23 -12
- deepeval/models/llms/litellm_model.py +63 -25
- deepeval/models/llms/local_model.py +26 -18
- deepeval/models/llms/ollama_model.py +17 -7
- deepeval/models/llms/openai_model.py +22 -17
- deepeval/models/llms/portkey_model.py +132 -0
- deepeval/models/mlllms/azure_model.py +28 -19
- deepeval/models/mlllms/gemini_model.py +102 -73
- deepeval/models/mlllms/ollama_model.py +40 -9
- deepeval/models/mlllms/openai_model.py +65 -14
- deepeval/models/utils.py +48 -3
- deepeval/optimization/__init__.py +13 -0
- deepeval/optimization/adapters/__init__.py +2 -0
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +588 -0
- deepeval/optimization/aggregates.py +14 -0
- deepeval/optimization/configs.py +34 -0
- deepeval/optimization/copro/configs.py +31 -0
- deepeval/optimization/copro/loop.py +837 -0
- deepeval/optimization/gepa/__init__.py +7 -0
- deepeval/optimization/gepa/configs.py +115 -0
- deepeval/optimization/gepa/loop.py +677 -0
- deepeval/optimization/miprov2/configs.py +134 -0
- deepeval/optimization/miprov2/loop.py +785 -0
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +458 -0
- deepeval/optimization/policies/__init__.py +16 -0
- deepeval/optimization/policies/selection.py +166 -0
- deepeval/optimization/policies/tie_breaker.py +67 -0
- deepeval/optimization/prompt_optimizer.py +462 -0
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +33 -0
- deepeval/optimization/simba/loop.py +983 -0
- deepeval/optimization/simba/types.py +15 -0
- deepeval/optimization/types.py +361 -0
- deepeval/optimization/utils.py +598 -0
- deepeval/prompt/prompt.py +10 -5
- deepeval/test_run/cache.py +2 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +24 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/METADATA +1 -1
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/RECORD +84 -59
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/WHEEL +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/entry_points.txt +0 -0
deepeval/optimization/adapters/deepeval_scoring_adapter.py

@@ -0,0 +1,588 @@
+from __future__ import annotations
+import asyncio
+import copy
+import inspect
+import json
+from functools import lru_cache
+from pydantic import BaseModel as PydanticBaseModel
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
+
+from deepeval.dataset.golden import Golden, ConversationalGolden
+from deepeval.errors import DeepEvalError
+from deepeval.metrics import (
+    BaseMetric,
+    BaseConversationalMetric,
+)
+from deepeval.test_case import (
+    LLMTestCase,
+    ConversationalTestCase,
+    MLLMTestCase,
+    Turn,
+)
+from deepeval.prompt.api import PromptType, PromptMessage
+from deepeval.prompt.prompt import Prompt
+
+from deepeval.optimization.types import (
+    PromptConfiguration,
+    Objective,
+    MeanObjective,
+    ModuleId,
+)
+from deepeval.optimization.utils import (
+    validate_callback,
+    validate_metrics,
+    invoke_model_callback,
+    a_invoke_model_callback,
+    build_model_callback_kwargs,
+)
+
+
+@lru_cache(maxsize=None)
+def _has_kwarg(func: Callable, keyword: str) -> bool:
+    """Return True if func accepts keyword or has **kwargs."""
+    try:
+        signature = inspect.signature(func)
+    except (ValueError, TypeError):
+        return False
+    for param in signature.parameters.values():
+        if param.kind == inspect.Parameter.VAR_KEYWORD:
+            return True
+    return keyword in signature.parameters
+
+
+def _measure_no_indicator(metric, test_case):
+    """Call metric.measure(test_case) with _show_indicator=False if supported."""
+    measure = getattr(metric, "measure")
+    if _has_kwarg(measure, "_show_indicator"):
+        return measure(test_case, _show_indicator=False)
+    return measure(test_case)
+
+
+async def _a_measure_no_indicator(metric, test_case):
+    """
+    Prefer metric.a_measure with fall back to metric.measure in a thread.
+    Always disable indicators when supported. This is to prevent interference
+    with the gepa indicator.
+    """
+    a_measure = getattr(metric, "a_measure", None)
+
+    if a_measure is not None:
+        call = (
+            a_measure(test_case, _show_indicator=False)
+            if _has_kwarg(a_measure, "_show_indicator")
+            else a_measure(test_case)
+        )
+        # Be resilient if impl returns a plain value
+        return await call if inspect.isawaitable(call) else call
+
+    # No async impl: run sync measure in a thread
+    loop = asyncio.get_running_loop()
+    return await loop.run_in_executor(
+        None, lambda: _measure_no_indicator(metric, test_case)
+    )
+
+
+class DeepEvalScoringAdapter:
+    """Scoring adapter backed by DeepEval metrics with a built-in generation step."""
+
+    DEFAULT_MODULE_ID: ModuleId = "__module__"
+
+    def __init__(
+        self,
+        *,
+        build_test_case: Optional[
+            Callable[
+                [Union[Golden, ConversationalGolden], str],
+                Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
+            ]
+        ] = None,
+        objective_scalar: Objective = MeanObjective(),
+        list_input_role: str = "user",
+    ):
+        self.model_callback: Optional[
+            Callable[
+                ...,
+                Union[
+                    str,
+                    Dict,
+                    Tuple[Union[str, Dict], float],
+                ],
+            ]
+        ] = None
+        self.metrics: Union[
+            List[BaseMetric], List[BaseConversationalMetric]
+        ] = []
+
+        self.build_test_case = build_test_case or self._default_build_test_case
+        self.objective_scalar = objective_scalar
+        self.list_input_role = list_input_role
+
+        # async
+        self._semaphore: Optional[asyncio.Semaphore] = None
+        self._throttle: float = 0.0
+
+    def set_model_callback(
+        self,
+        model_callback: Callable[
+            ...,
+            Union[
+                str,
+                Dict,
+                Tuple[Union[str, Dict], float],
+            ],
+        ],
+    ):
+        self.model_callback = validate_callback(
+            component="DeepEvalScoringAdapter",
+            model_callback=model_callback,
+        )
+
+    def set_metrics(
+        self,
+        metrics: Union[List[BaseMetric], List[BaseConversationalMetric]],
+    ):
+        self.metrics = validate_metrics(
+            component="DeepEvalScoringAdapter", metrics=metrics
+        )
+
+    #######################################
+    # prompt assembly & result unwrapping #
+    #######################################
+    def _primary_input_from_golden(
+        self, golden: Union[Golden, ConversationalGolden]
+    ) -> str:
+        """
+        Return the primary textual input to feed into the prompt for a given golden.
+
+        - For Golden: use `input`
+        - For ConversationalGolden: use `scenario`
+        """
+        if isinstance(golden, Golden):
+            return golden.input
+
+        if isinstance(golden, ConversationalGolden):
+            return golden.scenario
+
+        raise DeepEvalError(
+            "DeepEvalScoringAdapter expected golden to be a Golden or "
+            f"ConversationalGolden, but received {type(golden).__name__!r}."
+        )
+
+    def _compile_prompt_text(
+        self, prompt: Prompt, golden: Union[Golden, ConversationalGolden]
+    ) -> str:
+        user_input = self._primary_input_from_golden(golden)
+        base_text = prompt.text_template or ""
+        if not user_input:
+            return base_text.strip()
+        return f"{base_text}\n\n{user_input}".strip()
+
+    def _compile_prompt_messages(
+        self,
+        prompt: Prompt,
+        golden: Union[Golden, ConversationalGolden],
+    ) -> List[PromptMessage]:
+        """
+        Build the message contents for PromptType.LIST.
+
+        Starts from `prompt.messages_template` and appends a new PromptMessage with
+        the golden's `input` as the final message content.
+        """
+        messages_template = prompt.messages_template or []
+        compiled: List[PromptMessage] = list(messages_template)
+
+        user_input = self._primary_input_from_golden(golden)
+        if user_input:
+            compiled = compiled + [
+                PromptMessage(role=self.list_input_role, content=user_input)
+            ]
+
+        return compiled
+
+    def _build_callback_kwargs_for_prompt(
+        self,
+        prompt: Prompt,
+        golden: Union["Golden", "ConversationalGolden"],
+    ) -> Dict[str, Any]:
+        """
+        Decide whether to treat the prompt as TEXT or LIST and build the
+        corresponding callback kwargs.
+
+        - For TEXT prompts, we send: prompt_text=...
+        - For LIST prompts, we send: prompt_messages=[...]
+        """
+
+        if prompt.type is PromptType.LIST:
+            prompt_messages = self._compile_prompt_messages(prompt, golden)
+            return build_model_callback_kwargs(
+                prompt=prompt,
+                prompt_messages=prompt_messages,
+                golden=golden,
+            )
+
+        # Default to TEXT behaviour
+        prompt_text = self._compile_prompt_text(prompt, golden)
+        return build_model_callback_kwargs(
+            prompt=prompt,
+            prompt_text=prompt_text,
+            golden=golden,
+        )
+
+    def _unwrap_text(
+        self, result: Union[str, Dict, PydanticBaseModel, tuple]
+    ) -> str:
+        # DeepEval LLMs return (output, cost), unwrap if so.
+        if isinstance(result, tuple) and result:
+            result = result[0]
+        if isinstance(result, PydanticBaseModel):
+            return result.model_dump_json()
+        if isinstance(result, dict):
+            return json.dumps(result)
+        return str(result)
+
+    #####################
+    # Test case helpers #
+    #####################
+    def _default_build_test_case(
+        self, golden: Union[Golden, ConversationalGolden], actual: str
+    ) -> Union[LLMTestCase, ConversationalTestCase]:
+        """
+        Default conversion from Golden or ConversationalGolden into a DeepEval test case.
+
+        - Golden -> LLMTestCase
+        - ConversationalGolden -> ConversationalTestCase
+        """
+        if isinstance(golden, Golden):
+            return LLMTestCase(
+                input=golden.input,
+                expected_output=golden.expected_output,
+                actual_output=actual,
+                context=golden.context,
+                retrieval_context=golden.retrieval_context,
+                additional_metadata=golden.additional_metadata,
+                comments=golden.comments,
+                name=golden.name,
+                tools_called=golden.tools_called,
+                expected_tools=golden.expected_tools,
+            )
+
+        if isinstance(golden, ConversationalGolden):
+            # Start from any turns specified on the golden.
+            turns: List[Turn] = list(golden.turns or [])
+            assistant_role = "assistant"
+            user_role = "user"
+            if turns:
+                last = turns[-1]
+                if last.role == assistant_role:
+                    # Replace the last assistant turn's content with the model's actual output.
+                    turns[-1] = Turn(
+                        role=last.role,
+                        content=actual,
+                        user_id=last.user_id,
+                        retrieval_context=last.retrieval_context,
+                        tools_called=last.tools_called,
+                    )
+                else:
+                    # Append a new assistant turn with the actual output.
+                    turns.append(Turn(role=assistant_role, content=actual))
+            else:
+                # No turns provided: synthesize a minimal two-turn conversation.
+                turns = [
+                    Turn(role=user_role, content=golden.scenario),
+                    Turn(role=assistant_role, content=actual),
+                ]
+
+            return ConversationalTestCase(
+                turns=turns,
+                scenario=golden.scenario,
+                expected_outcome=golden.expected_outcome,
+                user_description=golden.user_description,
+                context=golden.context,
+                additional_metadata=golden.additional_metadata,
+                comments=golden.comments,
+                name=golden.name,
+            )
+
+        raise DeepEvalError(
+            "DeepEvalScoringAdapter._default_build_test_case expected a Golden "
+            f"or ConversationalGolden, but received {type(golden).__name__!r}."
+        )
+
+    ###################
+    # scoring helpers #
+    ###################
+
+    async def _bounded(self, coro):
+        if self._semaphore is None:
+            return await coro
+        async with self._semaphore:
+            res = await coro
+            if self._throttle:
+                await asyncio.sleep(self._throttle)
+            return res
+
+    async def _a_score_one(
+        self,
+        prompt_configuration: PromptConfiguration,
+        golden: Union[Golden, ConversationalGolden],
+    ) -> float:
+        # Clone metrics to avoid shared-state
+        metrics = [copy.copy(metric) for metric in self.metrics]
+        actual = await self.a_generate(prompt_configuration.prompts, golden)
+        test_case = self.build_test_case(golden, actual)
+        per_metric: Dict[str, float] = {}
+        for metric in metrics:
+            score = await _a_measure_no_indicator(metric, test_case)
+            per_metric[metric.__class__.__name__] = float(score)
+        return self.objective_scalar.scalarize(per_metric)
+
+    def _score_one(
+        self,
+        prompt_configuration: PromptConfiguration,
+        golden: Union[Golden, ConversationalGolden],
+    ) -> float:
+        metrics = [copy.copy(m) for m in self.metrics]
+        actual = self.generate(prompt_configuration.prompts, golden)
+        test_case = self.build_test_case(golden, actual)
+        per_metric: Dict[str, float] = {}
+        for metric in metrics:
+            score = _measure_no_indicator(metric, test_case)
+            per_metric[metric.__class__.__name__] = float(score)
+        return self.objective_scalar.scalarize(per_metric)
+
+    #################
+    # Configuration #
+    #################
+
+    def configure_async(
+        self, *, max_concurrent: int = 20, throttle_seconds: float = 0.0
+    ):
+        # The runner will call this once, but it is safe to recreate between runs
+        self._semaphore = asyncio.Semaphore(max_concurrent)
+        self._throttle = float(throttle_seconds)
+
+    ########################
+    # generation & scoring #
+    ########################
+
+    def generate(
+        self,
+        prompts_by_module: Dict[ModuleId, Prompt],
+        golden: Union[Golden, ConversationalGolden],
+    ) -> str:
+
+        if not prompts_by_module:
+            raise DeepEvalError(
+                "DeepEvalScoringAdapter.generate(...) received an empty "
+                "`prompts_by_module`; at least one Prompt is required."
+            )
+
+        validate_callback(
+            component="DeepEvalScoringAdapter",
+            model_callback=self.model_callback,
+        )
+        validate_metrics(
+            component="DeepEvalScoringAdapter", metrics=self.metrics
+        )
+
+        module_id = self._select_module_id_from_prompts(prompts_by_module)
+        prompt = prompts_by_module.get(module_id) or next(
+            iter(prompts_by_module.values())
+        )
+
+        candidate_kwargs = self._build_callback_kwargs_for_prompt(
+            prompt=prompt,
+            golden=golden,
+        )
+
+        result = invoke_model_callback(
+            hook="score_generate",
+            model_callback=self.model_callback,
+            candidate_kwargs=candidate_kwargs,
+        )
+
+        return self._unwrap_text(result)
+
+    async def a_generate(
+        self,
+        prompts_by_module: Dict[ModuleId, Prompt],
+        golden: Union[Golden, ConversationalGolden],
+    ) -> str:
+
+        if not prompts_by_module:
+            raise DeepEvalError(
+                "DeepEvalScoringAdapter.a_generate(...) received an empty "
+                "`prompts_by_module`; at least one Prompt is required."
+            )
+
+        validate_callback(
+            component="DeepEvalScoringAdapter",
+            model_callback=self.model_callback,
+        )
+        validate_metrics(
+            component="DeepEvalScoringAdapter", metrics=self.metrics
+        )
+
+        module_id = self._select_module_id_from_prompts(prompts_by_module)
+        prompt = prompts_by_module.get(module_id) or next(
+            iter(prompts_by_module.values())
+        )
+
+        candidate_kwargs = self._build_callback_kwargs_for_prompt(
+            prompt=prompt,
+            golden=golden,
+        )
+
+        result = await a_invoke_model_callback(
+            hook="score_generate",
+            model_callback=self.model_callback,
+            candidate_kwargs=candidate_kwargs,
+        )
+
+        return self._unwrap_text(result)
+
+    def score_on_pareto(
+        self,
+        prompt_configuration: PromptConfiguration,
+        d_pareto: Union[List[Golden], List[ConversationalGolden]],
+    ) -> List[float]:
+        return [
+            self._score_one(prompt_configuration, golden) for golden in d_pareto
+        ]
+
+    def minibatch_score(
+        self,
+        prompt_configuration: PromptConfiguration,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> float:
+        if not minibatch:
+            return 0.0
+
+        scores = [
+            self._score_one(prompt_configuration, golden)
+            for golden in minibatch
+        ]
+        return sum(scores) / len(scores)
+
+    def minibatch_feedback(
+        self,
+        prompt_configuration: PromptConfiguration,
+        module: ModuleId,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> str:
+        # default metric feedback (μ_f): concat metric.reason across minibatch and cap length
+        reasons: List[str] = []
+        for golden in minibatch:
+            actual = self.generate(prompt_configuration.prompts, golden)
+            test_case = self.build_test_case(golden, actual)
+            for metric in [copy.copy(m) for m in self.metrics]:
+                _ = _measure_no_indicator(metric, test_case)
+                if getattr(metric, "reason", None):
+                    reasons.append(str(metric.reason))
+        if not reasons:
+            return ""
+        unique: List[str] = []
+        seen = set()
+        for reason in reasons:
+            if reason not in seen:
+                unique.append(reason)
+                seen.add(reason)
+        return "\n---\n".join(
+            unique[:8]
+        )  # TODO: Make how much feedback configurable
+
+    async def a_score_on_pareto(
+        self,
+        prompt_configuration: PromptConfiguration,
+        d_pareto: Union[List[Golden], List[ConversationalGolden]],
+    ) -> List[float]:
+        tasks = [
+            self._bounded(self._a_score_one(prompt_configuration, golden))
+            for golden in d_pareto
+        ]
+        return await asyncio.gather(*tasks)
+
+    async def a_minibatch_score(
+        self,
+        prompt_configuration: PromptConfiguration,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> float:
+        tasks = [
+            self._bounded(self._a_score_one(prompt_configuration, golden))
+            for golden in minibatch
+        ]
+        scores = await asyncio.gather(*tasks)
+        return sum(scores) / len(scores) if scores else 0.0
+
+    async def a_minibatch_feedback(
+        self,
+        prompt_configuration: PromptConfiguration,
+        module: ModuleId,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> str:
+        async def reasons_one(golden) -> List[str]:
+            # Clone per task to avoid shared state
+            metrics = [copy.copy(metric) for metric in self.metrics]
+            # metrics = self.metrics
+            actual = await self.a_generate(prompt_configuration.prompts, golden)
+            test_case = self.build_test_case(golden, actual)
+            out: List[str] = []
+            for metric in metrics:
+                _ = await _a_measure_no_indicator(metric, test_case)
+                if getattr(metric, "reason", None):
+                    out.append(str(metric.reason))
+            return out
+
+        tasks = [self._bounded(reasons_one(golden)) for golden in minibatch]
+        nested = await asyncio.gather(*tasks)
+        reasons: List[str] = [reason for sub in nested for reason in sub]
+        if not reasons:
+            return ""
+        unique: List[str] = []
+        seen = set()
+        for reason in reasons:
+            if reason not in seen:
+                unique.append(reason)
+                seen.add(reason)
+        return "\n---\n".join(unique[:8])
+
+    def _select_module_id_from_prompts(
+        self, prompts_by_module: Dict[ModuleId, Prompt]
+    ) -> ModuleId:
+        """
+        Default module selection strategy:
+
+        - Prefer the synthetic '__module__' key when present
+        - Otherwise fall back to the first key in prompts_by_module.
+
+        Assumes `prompts_by_module` is non-empty; callers should validate that.
+        """
+        if self.DEFAULT_MODULE_ID in prompts_by_module:
+            return self.DEFAULT_MODULE_ID
+
+        # At this point we expect at least one key.
+        try:
+            return next(iter(prompts_by_module.keys()))
+        except StopIteration:
+            raise DeepEvalError(
+                "DeepEvalScoringAdapter._select_module_id_from_prompts(...) "
+                "received an empty `prompts_by_module`. At least one Prompt is required."
+            )
+
+    def select_module(
+        self, prompt_configuration: PromptConfiguration
+    ) -> ModuleId:
+        return self._select_module_id_from_prompts(prompt_configuration.prompts)
+
+    async def a_select_module(
+        self, prompt_configuration: PromptConfiguration
+    ) -> ModuleId:
+        return self.select_module(prompt_configuration)
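For orientation, here is a minimal sketch of how the new adapter might be wired up before an optimizer run, using only the methods added above (set_model_callback, set_metrics, configure_async). The callback body, the AnswerRelevancyMetric choice, and the sample golden are illustrative assumptions, not part of this diff; the exact keyword arguments the callback receives are assembled by build_model_callback_kwargs in deepeval/optimization/utils.py.

# Illustrative sketch only; names and behavior outside this diff are assumptions.
from deepeval.dataset.golden import Golden
from deepeval.metrics import AnswerRelevancyMetric  # assumed metric choice; needs the usual judge-model credentials
from deepeval.optimization.adapters.deepeval_scoring_adapter import (
    DeepEvalScoringAdapter,
)


def my_model_callback(**kwargs) -> str:
    # Per the docstrings above, TEXT prompts arrive as prompt_text=... and
    # LIST prompts as prompt_messages=[...]; call your own LLM here.
    prompt_text = kwargs.get("prompt_text", "")
    return f"(hypothetical model output for: {prompt_text[:40]})"


adapter = DeepEvalScoringAdapter()
adapter.set_model_callback(my_model_callback)
adapter.set_metrics([AnswerRelevancyMetric()])
adapter.configure_async(max_concurrent=10, throttle_seconds=0.0)

golden = Golden(input="What does deepeval do?")
# A PromptConfiguration (defined in deepeval/optimization/types.py) is then scored
# with adapter.minibatch_score(prompt_configuration, [golden]) or the async variants.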
deepeval/optimization/aggregates.py

@@ -0,0 +1,14 @@
+from typing import Protocol, Sequence
+import statistics
+
+
+class Aggregator(Protocol):
+    def __call__(self, scores: Sequence[float]) -> float: ...
+
+
+def mean_of_all(scores: Sequence[float]) -> float:
+    return statistics.fmean(scores) if scores else 0.0
+
+
+def median_of_all(scores: Sequence[float]) -> float:
+    return statistics.median(scores) if scores else 0.0
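The aggregator helpers above are plain functions that satisfy the Aggregator protocol structurally; a small sanity-check sketch with illustrative values:

from deepeval.optimization.aggregates import Aggregator, mean_of_all, median_of_all

agg: Aggregator = mean_of_all            # any (Sequence[float]) -> float callable qualifies
print(mean_of_all([0.0, 0.25, 1.0]))     # ≈ 0.417
print(median_of_all([0.0, 0.25, 1.0]))   # 0.25
print(mean_of_all([]))                   # 0.0 fallback for an empty score list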
deepeval/optimization/configs.py

@@ -0,0 +1,34 @@
+from __future__ import annotations
+from enum import Enum
+from pydantic import BaseModel, Field, conint
+from typing import Optional
+
+
+class OptimizerDisplayConfig(BaseModel):
+    """Display controls used by PromptOptimizer for all algorithms."""
+
+    show_indicator: bool = True
+    announce_ties: bool = Field(
+        False, description="Print a one-line note when a tie is detected"
+    )
+
+
+class PromptListMutationTargetType(Enum):
+    RANDOM = "random"
+    FIXED_INDEX = "fixed_index"
+
+
+# default all messages
+class PromptListMutationConfig(BaseModel):
+    target_type: PromptListMutationTargetType = (
+        PromptListMutationTargetType.RANDOM
+    )
+    # should be list
+    target_role: Optional[str] = Field(
+        default=None,
+        description="If set, restricts candidates to messages with this role (case insensitive).",
+    )
+    target_index: conint(ge=0) = Field(
+        default=0,
+        description="0-based index used when target_type == FIXED_INDEX.",
+    )
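To make the mutation-targeting knobs concrete, a small sketch of the two models above; the field values are illustrative, and pydantic v2's model_dump is assumed (the adapter earlier in this diff already relies on model_dump_json):

from deepeval.optimization.configs import (
    OptimizerDisplayConfig,
    PromptListMutationConfig,
    PromptListMutationTargetType,
)

display = OptimizerDisplayConfig(show_indicator=True, announce_ties=True)

# Always mutate the message at index 0 (e.g. a system message) instead of a random one.
mutation = PromptListMutationConfig(
    target_type=PromptListMutationTargetType.FIXED_INDEX,
    target_role="system",
    target_index=0,
)
print(display.model_dump(), mutation.model_dump())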
deepeval/optimization/copro/configs.py

@@ -0,0 +1,31 @@
+from __future__ import annotations
+from pydantic import Field, conint
+
+from deepeval.optimization.miprov2.configs import MIPROConfig
+
+
+class COPROConfig(MIPROConfig):
+    """
+    Configuration for COPRO style cooperative prompt optimization.
+
+    This extends MIPROConfig with settings that control the cooperative
+    sampling behavior.
+
+    The core MIPROConfig fields behave exactly the same as in MIPROv2.
+    """
+
+    population_size: conint(ge=1) = Field(
+        default=4,
+        description=(
+            "Maximum number of prompt candidates maintained in the active pool. "
+            "Once this limit is exceeded, lower scoring candidates are pruned."
+        ),
+    )
+
+    proposals_per_step: conint(ge=1) = Field(
+        default=4,
+        description=(
+            "Number of child prompts proposed cooperatively from the same "
+            "parent in each optimization iteration."
+        ),
+    )
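And a minimal sketch of constructing the new COPRO config, assuming the inherited MIPROConfig fields (not shown in this section) all carry defaults:

from deepeval.optimization.copro.configs import COPROConfig

# Keep at most 8 candidates in the pool and propose 6 children from the same parent per step.
config = COPROConfig(population_size=8, proposals_per_step=6)
print(config.population_size, config.proposals_per_step)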