deepeval-3.7.3-py3-none-any.whl → deepeval-3.7.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/test.py +1 -1
- deepeval/config/settings.py +102 -13
- deepeval/evaluate/configs.py +1 -1
- deepeval/evaluate/execute.py +4 -1
- deepeval/metrics/answer_relevancy/template.py +4 -4
- deepeval/metrics/argument_correctness/template.py +2 -2
- deepeval/metrics/bias/template.py +3 -3
- deepeval/metrics/contextual_precision/template.py +6 -6
- deepeval/metrics/contextual_recall/template.py +2 -2
- deepeval/metrics/contextual_relevancy/template.py +3 -3
- deepeval/metrics/conversation_completeness/template.py +2 -2
- deepeval/metrics/conversational_dag/templates.py +4 -4
- deepeval/metrics/conversational_g_eval/template.py +4 -3
- deepeval/metrics/dag/templates.py +4 -4
- deepeval/metrics/faithfulness/template.py +4 -4
- deepeval/metrics/hallucination/template.py +4 -4
- deepeval/metrics/misuse/template.py +2 -2
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +7 -7
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +6 -6
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +2 -2
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +3 -3
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +9 -9
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +4 -4
- deepeval/metrics/non_advice/template.py +2 -2
- deepeval/metrics/pii_leakage/template.py +2 -2
- deepeval/metrics/prompt_alignment/template.py +4 -4
- deepeval/metrics/role_violation/template.py +2 -2
- deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
- deepeval/metrics/toxicity/template.py +4 -4
- deepeval/metrics/turn_relevancy/template.py +2 -2
- deepeval/models/embedding_models/azure_embedding_model.py +28 -15
- deepeval/models/embedding_models/local_embedding_model.py +23 -10
- deepeval/models/embedding_models/ollama_embedding_model.py +8 -6
- deepeval/models/embedding_models/openai_embedding_model.py +18 -2
- deepeval/models/llms/anthropic_model.py +17 -5
- deepeval/models/llms/azure_model.py +30 -18
- deepeval/models/llms/deepseek_model.py +22 -12
- deepeval/models/llms/gemini_model.py +120 -87
- deepeval/models/llms/grok_model.py +23 -16
- deepeval/models/llms/kimi_model.py +23 -12
- deepeval/models/llms/litellm_model.py +63 -25
- deepeval/models/llms/local_model.py +26 -18
- deepeval/models/llms/ollama_model.py +17 -7
- deepeval/models/llms/openai_model.py +22 -17
- deepeval/models/llms/portkey_model.py +132 -0
- deepeval/models/mlllms/azure_model.py +28 -19
- deepeval/models/mlllms/gemini_model.py +102 -73
- deepeval/models/mlllms/ollama_model.py +40 -9
- deepeval/models/mlllms/openai_model.py +65 -14
- deepeval/models/utils.py +48 -3
- deepeval/optimization/__init__.py +13 -0
- deepeval/optimization/adapters/__init__.py +2 -0
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +588 -0
- deepeval/optimization/aggregates.py +14 -0
- deepeval/optimization/configs.py +34 -0
- deepeval/optimization/copro/configs.py +31 -0
- deepeval/optimization/copro/loop.py +837 -0
- deepeval/optimization/gepa/__init__.py +7 -0
- deepeval/optimization/gepa/configs.py +115 -0
- deepeval/optimization/gepa/loop.py +677 -0
- deepeval/optimization/miprov2/configs.py +134 -0
- deepeval/optimization/miprov2/loop.py +785 -0
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +458 -0
- deepeval/optimization/policies/__init__.py +16 -0
- deepeval/optimization/policies/selection.py +166 -0
- deepeval/optimization/policies/tie_breaker.py +67 -0
- deepeval/optimization/prompt_optimizer.py +462 -0
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +33 -0
- deepeval/optimization/simba/loop.py +983 -0
- deepeval/optimization/simba/types.py +15 -0
- deepeval/optimization/types.py +361 -0
- deepeval/optimization/utils.py +598 -0
- deepeval/prompt/prompt.py +10 -5
- deepeval/test_run/cache.py +2 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +24 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/METADATA +1 -1
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/RECORD +84 -59
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/WHEEL +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.4.dist-info}/entry_points.txt +0 -0
deepeval/optimization/mutations/prompt_rewriter.py (new file)
@@ -0,0 +1,458 @@
+from __future__ import annotations
+import json
+import random
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+from deepeval.errors import DeepEvalError
+from deepeval.optimization.types import (
+    MetricInfo,
+    ModuleId,
+)
+from deepeval.optimization.utils import (
+    a_invoke_model_callback,
+    invoke_model_callback,
+    validate_callback,
+    validate_int_in_range,
+    validate_instance,
+    build_model_callback_kwargs,
+)
+from deepeval.optimization.configs import (
+    PromptListMutationConfig,
+    PromptListMutationTargetType,
+)
+from deepeval.prompt.api import PromptType, PromptMessage
+from deepeval.prompt.prompt import Prompt
+
+
+##################
+# Common Helpers #
+##################
+def _summarize_prompt_for_rewrite(old_prompt: Prompt, max_chars: int) -> str:
+    """
+    Produce a human-readable summary of the current prompt for the
+    rewriter instruction block.
+
+    - For TEXT prompts, this is just `text_template`.
+    - For LIST prompts, this is a numbered list of (role, content) lines.
+    """
+
+    # LIST prompts: show each message with its role.
+    if old_prompt.type is PromptType.LIST and old_prompt.messages_template:
+        lines: List[str] = []
+        for message_index, message in enumerate(old_prompt.messages_template):
+            role = message.role or ""
+            content = message.content or ""
+            lines.append(f"[{message_index+1}] ({role}) {content}")
+        combined = "\n".join(lines)
+        return combined[:max_chars]
+
+    # Since it is not a LIST prompt, just use text_template.
+    text = old_prompt.text_template or ""
+    return text[:max_chars]
+
+
+def _select_list_target_index(
+    messages: List[PromptMessage],
+    config: PromptListMutationConfig,
+    random_state: random.Random,
+) -> int:
+    """
+    Select which list message index to rewrite, based on PromptListMutationConfig.
+
+    Rules:
+    - Start with all indices in scope.
+    - If target_role is set, restrict candidates to messages with that role
+      (case insensitive). If no messages match, fall back to all indices.
+    - target_type:
+        * FIRST: pick the first candidate index.
+        * RANDOM: pick a candidate via random_state.choice(candidates).
+        * FIXED_INDEX: use target_index when valid (and consistent with role
+          filter), otherwise fall back to the first candidate.
+    """
+    if not messages:
+        raise DeepEvalError(
+            "PromptRewriter._select_list_target_index expected at least one "
+            "message, but received an empty message list."
+        )
+
+    validate_instance(
+        component="PromptRewriter._select_list_target_index",
+        param_name="target_type",
+        value=config.target_type,
+        expected_types=PromptListMutationTargetType,
+    )
+
+    messages_length = len(messages)
+    candidate_indices = list(range(messages_length))
+
+    # Optional case insensitive role restriction
+    if config.target_role:
+        target_role_lower = config.target_role.lower()
+        filtered = [
+            index
+            for index, message in enumerate(messages)
+            if (message.role or "").lower() == target_role_lower
+        ]
+        if filtered:
+            candidate_indices = filtered
+
+    target_type = config.target_type
+
+    if target_type is PromptListMutationTargetType.RANDOM:
+        return random_state.choice(candidate_indices)
+
+    if target_type is PromptListMutationTargetType.FIXED_INDEX:
+        index = validate_int_in_range(
+            component="PromptRewriter._select_list_target_index",
+            param_name="target_index",
+            value=int(config.target_index),
+            min_inclusive=0,
+            max_exclusive=len(candidate_indices),
+        )
+        return candidate_indices[index]
+
+    # if you got this error it means that a new PromptListMutationTargetType was added,
+    # but not handled above
+    raise DeepEvalError(
+        "PromptRewriter._select_list_target_index received unsupported "
+        f"target_type={target_type!r}. Expected RANDOM or FIXED_INDEX."
+    )
+
+
+def _apply_rewritten_prompt(
+    old_prompt: Prompt,
+    new_text: str,
+    random_state: random.Random,
+    list_mutation_config: Optional[PromptListMutationConfig] = None,
+) -> Prompt:
+    """
+    Apply the rewritten text to a Prompt, preserving representation:
+
+    - For TEXT prompts, update `text_template`.
+    - For LIST prompts, rewrite the content of a single message while
+      keeping the number of messages the same.
+    - Preserve additonal Prompt meta such as `label` and `interpolation_type`
+    """
+    if not new_text:
+        return old_prompt
+
+    if old_prompt.type is PromptType.LIST and old_prompt.messages_template:
+        messages = old_prompt.messages_template
+        config = list_mutation_config or PromptListMutationConfig()
+
+        target_index = _select_list_target_index(
+            messages=messages,
+            config=config,
+            random_state=random_state,
+        )
+
+        new_messages: List[PromptMessage] = []
+        for message_index, message in enumerate(messages):
+            if message_index == target_index:
+                # Preserve the original role; do not inject a new one.
+                new_messages.append(
+                    PromptMessage(
+                        role=message.role,
+                        content=new_text,
+                    )
+                )
+            else:
+                new_messages.append(message)
+
+        new_prompt = Prompt(
+            alias=old_prompt.alias,
+            text_template=None,
+            messages_template=new_messages,
+            model_settings=old_prompt.model_settings,
+            output_type=old_prompt.output_type,
+            output_schema=old_prompt.output_schema,
+        )
+
+    else:
+        # Since it is not LIST, it must be TEXT type
+        new_prompt = Prompt(
+            alias=old_prompt.alias,
+            text_template=new_text,
+            model_settings=old_prompt.model_settings,
+            output_type=old_prompt.output_type,
+            output_schema=old_prompt.output_schema,
+        )
+
+    new_prompt.label = old_prompt.label
+    new_prompt.interpolation_type = old_prompt.interpolation_type
+    return new_prompt
+
+
+def _compose_prompt_messages(system_message: str, user_message: str) -> str:
+    """
+    Join system and user messages into a single prompt string.
+    Strips surrounding whitespace from each part; if the system message is
+    empty or absent, returns just the user message.
+    """
+    system_text = (system_message or "").strip()
+    user_text = (user_message or "").strip()
+    return f"{system_text}\n\n{user_text}" if system_text else user_text
+
+
+def _normalize_llm_output_to_text(
+    result: Union[str, Tuple[Union[str, dict], float], dict],
+) -> str:
+    """
+    Convert a DeepEval LLM generate() / a_generate() result to a clean string.
+
+    Accepted inputs:
+    - str -> returned as trimmed
+    - (str|dict, float_cost) -> first element extracted and normalized
+    - dict (e.g. JSON mode) -> JSON serialized with ensure_ascii=False
+
+    Fallback: if serialization fails, str(value).strip() is used.
+    """
+    output_value: Union[str, dict]
+    if isinstance(result, tuple):
+        output_value = result[0]
+    else:
+        output_value = result
+
+    if isinstance(output_value, str):
+        return output_value.strip()
+
+    try:
+        return json.dumps(output_value, ensure_ascii=False)
+    except Exception:
+        return str(output_value).strip()
+
+
+#################################
+# Rewriters for prompt mutation #
+#################################
+
+
+class PromptRewriter:
+    """
+    Uses a provided DeepEval model to rewrite the prompt for a module,
+    guided by feedback_text (μ_f).
+
+    For LIST prompts, the target message to rewrite is chosen according to
+    `list_mutation_config` and `random_state`.
+    """
+
+    def __init__(
+        self,
+        *,
+        max_chars: int = 4000,
+        list_mutation_config: Optional[PromptListMutationConfig] = None,
+        random_state: Optional[Union[int, random.Random]] = None,
+    ):
+        self.max_chars = max_chars
+        self.list_mutation_config = (
+            list_mutation_config or PromptListMutationConfig()
+        )
+
+        # Accept either an int seed or a Random instance.
+        if isinstance(random_state, int):
+            self.random_state: Optional[random.Random] = random.Random(
+                random_state
+            )
+        else:
+            self.random_state = random_state or random.Random()
+
+    def _compose_messages(
+        self, *, module_id: ModuleId, old_prompt: Prompt, feedback_text: str
+    ) -> Tuple[str, str]:
+        current_prompt_block = _summarize_prompt_for_rewrite(
+            old_prompt, self.max_chars
+        )
+        system_message = (
+            "You are refining a prompt used in a multi-step LLM pipeline. "
+            "Given the current prompt and concise feedback, produce a revised prompt "
+            "that addresses the issues while preserving intent and style. "
+            "Return only the new prompt text, no explanations."
+        )
+        user_message = f"""[Current Prompt]
+{current_prompt_block}
+
+[Feedback]
+{feedback_text[:self.max_chars]}
+
+[Instruction]
+Rewrite the prompt. Keep it concise and actionable. Do not include extraneous text.
+"""
+        return system_message, user_message
+
+    def rewrite(
+        self,
+        *,
+        model_callback: Callable[
+            ...,
+            Union[
+                str,
+                Dict,
+                Tuple[Union[str, Dict], float],
+            ],
+        ],
+        module_id: ModuleId,
+        old_prompt: Prompt,
+        feedback_text: str,
+    ) -> Prompt:
+        model_callback = validate_callback(
+            component="PromptRewriter",
+            model_callback=model_callback,
+        )
+        if not feedback_text.strip():
+            return old_prompt
+        system_message, user_message = self._compose_messages(
+            module_id=module_id,
+            old_prompt=old_prompt,
+            feedback_text=feedback_text,
+        )
+        merged_prompt_text = _compose_prompt_messages(
+            system_message, user_message
+        )
+
+        prompt_messages: Optional[List[PromptMessage]] = None
+        if old_prompt.type is PromptType.LIST and old_prompt.messages_template:
+            prompt_messages = old_prompt.messages_template
+
+        candidate_kwargs = build_model_callback_kwargs(
+            prompt=old_prompt,
+            prompt_text=merged_prompt_text,
+            prompt_messages=prompt_messages,
+            feedback_text=feedback_text,
+        )
+        out = invoke_model_callback(
+            hook="prompt_rewrite",
+            model_callback=model_callback,
+            candidate_kwargs=candidate_kwargs,
+        )
+
+        new_text = _normalize_llm_output_to_text(out)
+        return _apply_rewritten_prompt(
+            old_prompt,
+            new_text,
+            self.random_state,
+            self.list_mutation_config,
+        )
+
+    async def a_rewrite(
+        self,
+        *,
+        model_callback: Callable[
+            ...,
+            Union[
+                str,
+                Dict,
+                Tuple[Union[str, Dict], float],
+            ],
+        ],
+        module_id: ModuleId,
+        old_prompt: Prompt,
+        feedback_text: str,
+    ) -> Prompt:
+        model_callback = validate_callback(
+            component="PromptRewriter",
+            model_callback=model_callback,
+        )
+
+        if not feedback_text.strip():
+            return old_prompt
+
+        system_message, user_message = self._compose_messages(
+            module_id=module_id,
+            old_prompt=old_prompt,
+            feedback_text=feedback_text,
+        )
+        merged_prompt_text = _compose_prompt_messages(
+            system_message, user_message
+        )
+
+        prompt_messages: Optional[List[PromptMessage]] = None
+        if old_prompt.type is PromptType.LIST and old_prompt.messages_template:
+            prompt_messages = old_prompt.messages_template
+
+        candidate_kwargs = build_model_callback_kwargs(
+            prompt=old_prompt,
+            prompt_text=merged_prompt_text,
+            prompt_messages=prompt_messages,
+            feedback_text=feedback_text,
+        )
+        out = await a_invoke_model_callback(
+            hook="prompt_rewrite",
+            model_callback=model_callback,
+            candidate_kwargs=candidate_kwargs,
+        )
+
+        new_text = _normalize_llm_output_to_text(out)
+        return _apply_rewritten_prompt(
+            old_prompt,
+            new_text,
+            self.random_state,
+            self.list_mutation_config,
+        )
+
+
+class MetricAwareLLMRewriter(PromptRewriter):
+    """
+    Uses μ_f (feedback_text) and optional metric rubrics to rewrite a module prompt.
+    - metrics_info: optional list of MetricInfo(name, rubric). If provided, a
+      [Metric Rubrics] block is added to the prompt for stronger guidance.
+    """
+
+    def __init__(
+        self,
+        *,
+        metrics_info: Optional[List[MetricInfo]] = None,
+        max_chars: int = 4000,
+        max_metrics_in_prompt: int = 20,
+        list_mutation_config: Optional[PromptListMutationConfig] = None,
+        random_state: Optional[Union[int, random.Random]] = None,
+    ):
+        super().__init__(
+            max_chars=max_chars,
+            list_mutation_config=list_mutation_config,
+            random_state=random_state,
+        )
+        self.metrics_info = metrics_info or []
+        self.max_metrics_in_prompt = max_metrics_in_prompt
+
+    def _compose_messages(
+        self, *, module_id: ModuleId, old_prompt: Prompt, feedback_text: str
+    ) -> Tuple[str, str]:
+
+        current_prompt_block = _summarize_prompt_for_rewrite(
+            old_prompt, self.max_chars
+        )
+
+        # Optional rubrics block
+        rubric_block = ""
+        if self.metrics_info:
+            lines: List[str] = []
+            for metric in self.metrics_info[: self.max_metrics_in_prompt]:
+                if metric.rubric and metric.rubric.strip():
+                    lines.append(f"- {metric.name}: {metric.rubric.strip()}")
+                else:
+                    lines.append(
+                        f"- {metric.name}: Optimize for this metric’s quality criteria."
+                    )
+            rubric_block = "\n[Metric Rubrics]\n" + "\n".join(lines)
+
+        system_message = (
+            "You are refining a prompt used in a multi-step LLM pipeline. "
+            "Given the current prompt, concise feedback, and (optionally) metric rubrics, "
+            "produce a revised prompt that addresses the issues while preserving intent and style. "
+            "Return only the new prompt text, with no explanations."
+        )
+
+        user_message = f"""[Module]
+{module_id}
+
+[Current Prompt]
+{current_prompt_block}
+
+[Feedback]
+{feedback_text[:self.max_chars]}
+{rubric_block}
+
+[Instruction]
+Rewrite the prompt to better satisfy the metrics and address the feedback.
+Keep it concise, actionable, and faithful to the module’s role."""
+        return system_message, user_message
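For orientation, the sketch below shows one way the new PromptRewriter might be driven. It is illustrative only and not part of the 3.7.4 diff: it assumes ModuleId is a plain string identifier, that a Prompt can be built from just an alias and a text_template, and that the kwargs produced by build_model_callback_kwargs (not shown in this diff) include a "prompt_text" entry; my_model_callback is a hypothetical stand-in for whatever model call you use.

# Illustrative sketch only; not part of the 3.7.4 diff.
from deepeval.optimization.mutations.prompt_rewriter import PromptRewriter
from deepeval.prompt.prompt import Prompt


def my_model_callback(**kwargs) -> str:
    # Hypothetical callback: forward the rewrite instructions to your own LLM
    # and return its raw text. Assumes "prompt_text" is among the kwargs built
    # by build_model_callback_kwargs (not shown in this diff).
    rewrite_instructions = kwargs.get("prompt_text", "")
    return "Answer the question in one short paragraph and cite your sources."


rewriter = PromptRewriter(max_chars=2000, random_state=42)
old_prompt = Prompt(alias="qa-module", text_template="Answer the question.")
new_prompt = rewriter.rewrite(
    model_callback=my_model_callback,
    module_id="qa-module",
    old_prompt=old_prompt,
    feedback_text="Answers often lack citations; instruct the model to cite sources.",
)
print(new_prompt.text_template)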
deepeval/optimization/policies/__init__.py (new file)
@@ -0,0 +1,16 @@
+from .selection import (
+    pareto_frontier,
+    frequency_weights,
+    sample_by_frequency,
+    select_prompt_configuration_pareto,
+)
+from .tie_breaker import TieBreaker, pick_best_with_ties
+
+__all__ = [
+    "pareto_frontier",
+    "frequency_weights",
+    "sample_by_frequency",
+    "select_prompt_configuration_pareto",
+    "TieBreaker",
+    "pick_best_with_ties",
+]
deepeval/optimization/policies/selection.py (new file)
@@ -0,0 +1,166 @@
+from __future__ import annotations
+from typing import Dict, List, Sequence
+import random
+
+from deepeval.errors import DeepEvalError
+from deepeval.optimization.types import PromptConfigurationId, ScoreTable
+
+
+def _is_dominated(
+    candidate_scores: List[float], other_scores: List[float]
+) -> bool:
+    """
+    Return True if `candidate_scores` is dominated by `other_scores`:
+    (other >= candidate on all dimensions) AND (other > candidate on at least one).
+    """
+    other_ge_everywhere = all(
+        other_score >= candidate_score
+        for candidate_score, other_score in zip(candidate_scores, other_scores)
+    )
+    other_gt_somewhere = any(
+        other_score > candidate_score
+        for candidate_score, other_score in zip(candidate_scores, other_scores)
+    )
+    return other_ge_everywhere and other_gt_somewhere
+
+
+def pareto_frontier(
+    prompt_configuration_ids: Sequence[PromptConfigurationId],
+    score_table: ScoreTable,
+) -> List[PromptConfigurationId]:
+    """
+    Compute the set of non-dominated candidates given their scores.
+    Returns PromptConfigurationIds on the Pareto frontier.
+    """
+    frontier: List[PromptConfigurationId] = []
+    for prompt_configuration_id in prompt_configuration_ids:
+        candidate_vector = score_table[prompt_configuration_id]
+        dominated = False
+
+        # If any existing frontier member dominates this candidate, skip it.
+        for frontier_id in frontier:
+            if _is_dominated(candidate_vector, score_table[frontier_id]):
+                dominated = True
+                break
+        if dominated:
+            continue
+
+        # Remove any frontier member that is dominated by this candidate.
+        frontier = [
+            f_id
+            for f_id in frontier
+            if not _is_dominated(score_table[f_id], candidate_vector)
+        ]
+        frontier.append(prompt_configuration_id)
+
+    return frontier
+
+
+def frequency_weights(
+    score_table: ScoreTable,
+) -> Dict[PromptConfigurationId, int]:
+    """
+    Build best sets, remove dominated candidates, and count appearances.
+
+    Returns:
+        A map {prompt_configuration_id -> frequency} counting how often each
+        globally non-dominated prompt configuration appears among the instance
+        Pareto sets.
+    """
+    if not score_table:
+        return {}
+
+    # Assume all score vectors have the same length.
+    example_vector = next(iter(score_table.values()))
+    num_instances = len(example_vector)
+    all_candidates = list(score_table.keys())
+
+    per_instance_frontiers: List[List[PromptConfigurationId]] = []
+    for i in range(num_instances):
+        best_score_i = max(
+            score_table[prompt_configuration_id][i]
+            for prompt_configuration_id in all_candidates
+        )
+        winners_i = [
+            prompt_configuration_id
+            for prompt_configuration_id in all_candidates
+            if score_table[prompt_configuration_id][i] == best_score_i
+        ]
+
+        # Instance frontier among winners. We pass 1-D score vectors
+        # so this reduces to "all candidates with the max score at instance i",
+        instance_frontier = pareto_frontier(
+            winners_i,
+            {
+                prompt_configuration_id: [
+                    score_table[prompt_configuration_id][i]
+                ]
+                for prompt_configuration_id in winners_i
+            },
+        )
+        per_instance_frontiers.append(instance_frontier)
+
+    # Global candidate set appearing in any winners
+    candidate_union = sorted(
+        {
+            prompt_configuration_id
+            for winners in per_instance_frontiers
+            for prompt_configuration_id in winners
+        }
+    )
+    global_frontier = pareto_frontier(candidate_union, score_table)
+
+    # Count frequency only for candidates on the global frontier
+    frequency_by_prompt_config: Dict[PromptConfigurationId, int] = {
+        prompt_configuration_id: 0
+        for prompt_configuration_id in global_frontier
+    }
+    for winners in per_instance_frontiers:
+        for prompt_configuration_id in winners:
+            if prompt_configuration_id in frequency_by_prompt_config:
+                frequency_by_prompt_config[prompt_configuration_id] += 1
+
+    return frequency_by_prompt_config
+
+
+def sample_by_frequency(
+    frequency_by_prompt_config: Dict[PromptConfigurationId, int],
+    *,
+    random_state: random.Random,
+) -> PromptConfigurationId:
+    """
+    Sample a prompt configuration id with probability proportional to its frequency.
+    Falls back to uniform if the total weight is zero.
+    """
+    if not frequency_by_prompt_config:
+        raise DeepEvalError("No prompt configurations to sample.")
+
+    items = list(frequency_by_prompt_config.items())
+    total_weight = sum(weight for _, weight in items)
+
+    if total_weight == 0:
+        # Uniform fallback
+        return random_state.choice(
+            [prompt_configuration_id for prompt_configuration_id, _ in items]
+        )
+
+    r = random_state.uniform(0, total_weight)
+    cumulative = 0.0
+    for prompt_configuration_id, weight in items:
+        cumulative += weight
+        if r <= cumulative:
+            return prompt_configuration_id
+    return items[-1][0]
+
+
+def select_prompt_configuration_pareto(
+    score_table: ScoreTable, *, random_state: random.Random
+) -> PromptConfigurationId:
+    """
+    Frequency weighted sampling over the Pareto winners,
+    restricted to globally non-dominated prompt configurations. A configuration
+    is globally non-dominated if no other configuration dominates it using
+    the full vector.
+    """
+    freq = frequency_weights(score_table)
+    return sample_by_frequency(freq, random_state=random_state)
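To make the selection policy concrete, here is a small illustrative sketch (not part of the diff). It assumes PromptConfigurationId and ScoreTable behave like plain string keys mapping to per-instance score lists, which is how selection.py indexes them, and it imports through the deepeval.optimization.policies package added above.

# Illustrative sketch only; not part of the 3.7.4 diff.
import random

from deepeval.optimization.policies import (
    pareto_frontier,
    frequency_weights,
    select_prompt_configuration_pareto,
)

# Toy scores for three prompt configurations across two evaluation instances.
score_table = {
    "root": [0.70, 0.60],
    "child_a": [0.80, 0.55],  # better on instance 0, worse on instance 1
    "child_b": [0.65, 0.50],  # dominated by "root" on both instances
}

print(pareto_frontier(list(score_table.keys()), score_table))
# -> ['root', 'child_a']  ("child_b" is dominated by "root")

print(frequency_weights(score_table))
# -> {'child_a': 1, 'root': 1}  (each wins exactly one instance)

rng = random.Random(0)
print(select_prompt_configuration_pareto(score_table, random_state=rng))
# -> 'root' or 'child_a', sampled in proportion to how often each wins

The per-instance counting means configurations that win on more evaluation instances are sampled more often, while anything dominated across the full score vector is excluded outright.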
deepeval/optimization/policies/tie_breaker.py (new file)
@@ -0,0 +1,67 @@
+from __future__ import annotations
+from typing import Dict, List, Optional, Tuple
+from enum import Enum
+import random
+
+from deepeval.errors import DeepEvalError
+from deepeval.optimization.types import PromptConfigurationId
+
+
+class TieBreaker(str, Enum):
+    PREFER_ROOT = "prefer_root"
+    PREFER_CHILD = "prefer_child"
+    RANDOM = "random"
+
+
+def pick_best_with_ties(
+    totals: Dict[PromptConfigurationId, float],
+    parents_by_id: Dict[PromptConfigurationId, Optional[PromptConfigurationId]],
+    *,
+    random_state: random.Random,
+    tie_tolerance: float = 1e-9,
+    policy: TieBreaker = TieBreaker.PREFER_ROOT,
+) -> Tuple[PromptConfigurationId, List[PromptConfigurationId], float]:
+    """
+    Choose the best candidate by aggregate score with deterministic tie handling.
+
+    Returns: (chosen_id, tied_ids, max_score)
+    - tied_ids includes everyone within tie_tolerance of max_score
+    """
+    if not totals:
+        raise DeepEvalError("No candidate prompt configuration to choose from.")
+
+    max_score = max(totals.values())
+    tied = [
+        prompt_configuration_id
+        for prompt_configuration_id, score in totals.items()
+        if abs(score - max_score) <= tie_tolerance
+    ]
+
+    if len(tied) == 1:
+        return tied[0], tied, max_score
+
+    # Resolve tie by policy
+    if policy == TieBreaker.PREFER_CHILD:
+        # Prefer any non root. When multiple children exist, use the most recent
+        child_ids = [
+            prompt_configuration_id
+            for prompt_configuration_id in tied
+            if parents_by_id.get(prompt_configuration_id) is not None
+        ]
+        if child_ids:
+            # choose the newest child deterministically by order
+            for prompt_configuration_id in reversed(list(totals.keys())):
+                if prompt_configuration_id in child_ids:
+                    return prompt_configuration_id, tied, max_score
+
+    if policy == TieBreaker.RANDOM:
+        return random_state.choice(tied), tied, max_score
+
+    # by default prefer a root if present, otherwise the first tied
+    root_ids = [
+        prompt_configuration_id
+        for prompt_configuration_id in tied
+        if parents_by_id.get(prompt_configuration_id) is None
+    ]
+    chosen = root_ids[0] if root_ids else tied[0]
+    return chosen, tied, max_score
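And a matching sketch for the tie-breaking helper (again illustrative, not part of the diff), assuming configuration ids are plain strings and that a None parent marks the root configuration, as pick_best_with_ties expects:

# Illustrative sketch only; not part of the 3.7.4 diff.
import random

from deepeval.optimization.policies import TieBreaker, pick_best_with_ties

# Aggregate scores per prompt configuration; "root" and "child_a" are tied.
totals = {"root": 0.82, "child_a": 0.82, "child_b": 0.79}
# A None parent marks the root configuration.
parents_by_id = {"root": None, "child_a": "root", "child_b": "root"}

chosen, tied, best = pick_best_with_ties(
    totals,
    parents_by_id,
    random_state=random.Random(0),
    policy=TieBreaker.PREFER_ROOT,
)
print(chosen, tied, best)
# -> root ['root', 'child_a'] 0.82  (ties resolve toward the root)

chosen, _, _ = pick_best_with_ties(
    totals,
    parents_by_id,
    random_state=random.Random(0),
    policy=TieBreaker.PREFER_CHILD,
)
print(chosen)
# -> child_a  (the most recently added tied child wins)

The default PREFER_ROOT policy keeps the original configuration when a mutated child fails to improve on it, which keeps the optimization loops conservative about accepting rewrites.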