deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
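Note on the restructuring visible in the list above: the `deepeval/optimization` package is removed and its functionality reappears under `deepeval/optimizer` (with the algorithm loops moving to `optimizer/algorithms/`). A minimal sketch of the corresponding import-path change, inferred only from the file paths listed; the `PromptOptimizer` class name is taken from the `prompt_optimizer.py` module name and should be treated as an assumption:

```python
# Hedged sketch of the package move implied by the file list (3.7.4 -> 3.7.6).
# Old layout (removed in 3.7.6):
#   from deepeval.optimization.prompt_optimizer import PromptOptimizer  # assumed class name
# New layout (added in 3.7.6):
from deepeval.optimizer.prompt_optimizer import PromptOptimizer  # assumed class name
```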
@@ -1,458 +0,0 @@
- from __future__ import annotations
- import json
- import random
- from typing import Callable, Dict, List, Optional, Tuple, Union
-
- from deepeval.errors import DeepEvalError
- from deepeval.optimization.types import (
-     MetricInfo,
-     ModuleId,
- )
- from deepeval.optimization.utils import (
-     a_invoke_model_callback,
-     invoke_model_callback,
-     validate_callback,
-     validate_int_in_range,
-     validate_instance,
-     build_model_callback_kwargs,
- )
- from deepeval.optimization.configs import (
-     PromptListMutationConfig,
-     PromptListMutationTargetType,
- )
- from deepeval.prompt.api import PromptType, PromptMessage
- from deepeval.prompt.prompt import Prompt
-
-
- ##################
- # Common Helpers #
- ##################
- def _summarize_prompt_for_rewrite(old_prompt: Prompt, max_chars: int) -> str:
-     """
-     Produce a human-readable summary of the current prompt for the
-     rewriter instruction block.
-
-     - For TEXT prompts, this is just `text_template`.
-     - For LIST prompts, this is a numbered list of (role, content) lines.
-     """
-
-     # LIST prompts: show each message with its role.
-     if old_prompt.type is PromptType.LIST and old_prompt.messages_template:
-         lines: List[str] = []
-         for message_index, message in enumerate(old_prompt.messages_template):
-             role = message.role or ""
-             content = message.content or ""
-             lines.append(f"[{message_index+1}] ({role}) {content}")
-         combined = "\n".join(lines)
-         return combined[:max_chars]
-
-     # Since it is not a LIST prompt, just use text_template.
-     text = old_prompt.text_template or ""
-     return text[:max_chars]
-
-
- def _select_list_target_index(
-     messages: List[PromptMessage],
-     config: PromptListMutationConfig,
-     random_state: random.Random,
- ) -> int:
-     """
-     Select which list message index to rewrite, based on PromptListMutationConfig.
-
-     Rules:
-     - Start with all indices in scope.
-     - If target_role is set, restrict candidates to messages with that role
-       (case insensitive). If no messages match, fall back to all indices.
-     - target_type:
-       * FIRST: pick the first candidate index.
-       * RANDOM: pick a candidate via random_state.choice(candidates).
-       * FIXED_INDEX: use target_index when valid (and consistent with role
-         filter), otherwise fall back to the first candidate.
-     """
-     if not messages:
-         raise DeepEvalError(
-             "PromptRewriter._select_list_target_index expected at least one "
-             "message, but received an empty message list."
-         )
-
-     validate_instance(
-         component="PromptRewriter._select_list_target_index",
-         param_name="target_type",
-         value=config.target_type,
-         expected_types=PromptListMutationTargetType,
-     )
-
-     messages_length = len(messages)
-     candidate_indices = list(range(messages_length))
-
-     # Optional case insensitive role restriction
-     if config.target_role:
-         target_role_lower = config.target_role.lower()
-         filtered = [
-             index
-             for index, message in enumerate(messages)
-             if (message.role or "").lower() == target_role_lower
-         ]
-         if filtered:
-             candidate_indices = filtered
-
-     target_type = config.target_type
-
-     if target_type is PromptListMutationTargetType.RANDOM:
-         return random_state.choice(candidate_indices)
-
-     if target_type is PromptListMutationTargetType.FIXED_INDEX:
-         index = validate_int_in_range(
-             component="PromptRewriter._select_list_target_index",
-             param_name="target_index",
-             value=int(config.target_index),
-             min_inclusive=0,
-             max_exclusive=len(candidate_indices),
-         )
-         return candidate_indices[index]
-
-     # if you got this error it means that a new PromptListMutationTargetType was added,
-     # but not handled above
-     raise DeepEvalError(
-         "PromptRewriter._select_list_target_index received unsupported "
-         f"target_type={target_type!r}. Expected RANDOM or FIXED_INDEX."
-     )
-
-
- def _apply_rewritten_prompt(
-     old_prompt: Prompt,
-     new_text: str,
-     random_state: random.Random,
-     list_mutation_config: Optional[PromptListMutationConfig] = None,
- ) -> Prompt:
-     """
-     Apply the rewritten text to a Prompt, preserving representation:
-
-     - For TEXT prompts, update `text_template`.
-     - For LIST prompts, rewrite the content of a single message while
-       keeping the number of messages the same.
-     - Preserve additonal Prompt meta such as `label` and `interpolation_type`
-     """
-     if not new_text:
-         return old_prompt
-
-     if old_prompt.type is PromptType.LIST and old_prompt.messages_template:
-         messages = old_prompt.messages_template
-         config = list_mutation_config or PromptListMutationConfig()
-
-         target_index = _select_list_target_index(
-             messages=messages,
-             config=config,
-             random_state=random_state,
-         )
-
-         new_messages: List[PromptMessage] = []
-         for message_index, message in enumerate(messages):
-             if message_index == target_index:
-                 # Preserve the original role; do not inject a new one.
-                 new_messages.append(
-                     PromptMessage(
-                         role=message.role,
-                         content=new_text,
-                     )
-                 )
-             else:
-                 new_messages.append(message)
-
-         new_prompt = Prompt(
-             alias=old_prompt.alias,
-             text_template=None,
-             messages_template=new_messages,
-             model_settings=old_prompt.model_settings,
-             output_type=old_prompt.output_type,
-             output_schema=old_prompt.output_schema,
-         )
-
-     else:
-         # Since it is not LIST, it must be TEXT type
-         new_prompt = Prompt(
-             alias=old_prompt.alias,
-             text_template=new_text,
-             model_settings=old_prompt.model_settings,
-             output_type=old_prompt.output_type,
-             output_schema=old_prompt.output_schema,
-         )
-
-     new_prompt.label = old_prompt.label
-     new_prompt.interpolation_type = old_prompt.interpolation_type
-     return new_prompt
-
-
- def _compose_prompt_messages(system_message: str, user_message: str) -> str:
-     """
-     Join system and user messages into a single prompt string.
-     Strips surrounding whitespace from each part; if the system message is
-     empty or absent, returns just the user message.
-     """
-     system_text = (system_message or "").strip()
-     user_text = (user_message or "").strip()
-     return f"{system_text}\n\n{user_text}" if system_text else user_text
-
-
- def _normalize_llm_output_to_text(
-     result: Union[str, Tuple[Union[str, dict], float], dict],
- ) -> str:
-     """
-     Convert a DeepEval LLM generate() / a_generate() result to a clean string.
-
-     Accepted inputs:
-     - str -> returned as trimmed
-     - (str|dict, float_cost) -> first element extracted and normalized
-     - dict (e.g. JSON mode) -> JSON serialized with ensure_ascii=False
-
-     Fallback: if serialization fails, str(value).strip() is used.
-     """
-     output_value: Union[str, dict]
-     if isinstance(result, tuple):
-         output_value = result[0]
-     else:
-         output_value = result
-
-     if isinstance(output_value, str):
-         return output_value.strip()
-
-     try:
-         return json.dumps(output_value, ensure_ascii=False)
-     except Exception:
-         return str(output_value).strip()
-
-
- #################################
- # Rewriters for prompt mutation #
- #################################
-
-
- class PromptRewriter:
-     """
-     Uses a provided DeepEval model to rewrite the prompt for a module,
-     guided by feedback_text (μ_f).
-
-     For LIST prompts, the target message to rewrite is chosen according to
-     `list_mutation_config` and `random_state`.
-     """
-
-     def __init__(
-         self,
-         *,
-         max_chars: int = 4000,
-         list_mutation_config: Optional[PromptListMutationConfig] = None,
-         random_state: Optional[Union[int, random.Random]] = None,
-     ):
-         self.max_chars = max_chars
-         self.list_mutation_config = (
-             list_mutation_config or PromptListMutationConfig()
-         )
-
-         # Accept either an int seed or a Random instance.
-         if isinstance(random_state, int):
-             self.random_state: Optional[random.Random] = random.Random(
-                 random_state
-             )
-         else:
-             self.random_state = random_state or random.Random()
-
-     def _compose_messages(
-         self, *, module_id: ModuleId, old_prompt: Prompt, feedback_text: str
-     ) -> Tuple[str, str]:
-         current_prompt_block = _summarize_prompt_for_rewrite(
-             old_prompt, self.max_chars
-         )
-         system_message = (
-             "You are refining a prompt used in a multi-step LLM pipeline. "
-             "Given the current prompt and concise feedback, produce a revised prompt "
-             "that addresses the issues while preserving intent and style. "
-             "Return only the new prompt text, no explanations."
-         )
-         user_message = f"""[Current Prompt]
- {current_prompt_block}
-
- [Feedback]
- {feedback_text[:self.max_chars]}
-
- [Instruction]
- Rewrite the prompt. Keep it concise and actionable. Do not include extraneous text.
- """
-         return system_message, user_message
-
-     def rewrite(
-         self,
-         *,
-         model_callback: Callable[
-             ...,
-             Union[
-                 str,
-                 Dict,
-                 Tuple[Union[str, Dict], float],
-             ],
-         ],
-         module_id: ModuleId,
-         old_prompt: Prompt,
-         feedback_text: str,
-     ) -> Prompt:
-         model_callback = validate_callback(
-             component="PromptRewriter",
-             model_callback=model_callback,
-         )
-         if not feedback_text.strip():
-             return old_prompt
-         system_message, user_message = self._compose_messages(
-             module_id=module_id,
-             old_prompt=old_prompt,
-             feedback_text=feedback_text,
-         )
-         merged_prompt_text = _compose_prompt_messages(
-             system_message, user_message
-         )
-
-         prompt_messages: Optional[List[PromptMessage]] = None
-         if old_prompt.type is PromptType.LIST and old_prompt.messages_template:
-             prompt_messages = old_prompt.messages_template
-
-         candidate_kwargs = build_model_callback_kwargs(
-             prompt=old_prompt,
-             prompt_text=merged_prompt_text,
-             prompt_messages=prompt_messages,
-             feedback_text=feedback_text,
-         )
-         out = invoke_model_callback(
-             hook="prompt_rewrite",
-             model_callback=model_callback,
-             candidate_kwargs=candidate_kwargs,
-         )
-
-         new_text = _normalize_llm_output_to_text(out)
-         return _apply_rewritten_prompt(
-             old_prompt,
-             new_text,
-             self.random_state,
-             self.list_mutation_config,
-         )
-
-     async def a_rewrite(
-         self,
-         *,
-         model_callback: Callable[
-             ...,
-             Union[
-                 str,
-                 Dict,
-                 Tuple[Union[str, Dict], float],
-             ],
-         ],
-         module_id: ModuleId,
-         old_prompt: Prompt,
-         feedback_text: str,
-     ) -> Prompt:
-         model_callback = validate_callback(
-             component="PromptRewriter",
-             model_callback=model_callback,
-         )
-
-         if not feedback_text.strip():
-             return old_prompt
-
-         system_message, user_message = self._compose_messages(
-             module_id=module_id,
-             old_prompt=old_prompt,
-             feedback_text=feedback_text,
-         )
-         merged_prompt_text = _compose_prompt_messages(
-             system_message, user_message
-         )
-
-         prompt_messages: Optional[List[PromptMessage]] = None
-         if old_prompt.type is PromptType.LIST and old_prompt.messages_template:
-             prompt_messages = old_prompt.messages_template
-
-         candidate_kwargs = build_model_callback_kwargs(
-             prompt=old_prompt,
-             prompt_text=merged_prompt_text,
-             prompt_messages=prompt_messages,
-             feedback_text=feedback_text,
-         )
-         out = await a_invoke_model_callback(
-             hook="prompt_rewrite",
-             model_callback=model_callback,
-             candidate_kwargs=candidate_kwargs,
-         )
-
-         new_text = _normalize_llm_output_to_text(out)
-         return _apply_rewritten_prompt(
-             old_prompt,
-             new_text,
-             self.random_state,
-             self.list_mutation_config,
-         )
-
-
- class MetricAwareLLMRewriter(PromptRewriter):
-     """
-     Uses μ_f (feedback_text) and optional metric rubrics to rewrite a module prompt.
-     - metrics_info: optional list of MetricInfo(name, rubric). If provided, a
-       [Metric Rubrics] block is added to the prompt for stronger guidance.
-     """
-
-     def __init__(
-         self,
-         *,
-         metrics_info: Optional[List[MetricInfo]] = None,
-         max_chars: int = 4000,
-         max_metrics_in_prompt: int = 20,
-         list_mutation_config: Optional[PromptListMutationConfig] = None,
-         random_state: Optional[Union[int, random.Random]] = None,
-     ):
-         super().__init__(
-             max_chars=max_chars,
-             list_mutation_config=list_mutation_config,
-             random_state=random_state,
-         )
-         self.metrics_info = metrics_info or []
-         self.max_metrics_in_prompt = max_metrics_in_prompt
-
-     def _compose_messages(
-         self, *, module_id: ModuleId, old_prompt: Prompt, feedback_text: str
-     ) -> Tuple[str, str]:
-
-         current_prompt_block = _summarize_prompt_for_rewrite(
-             old_prompt, self.max_chars
-         )
-
-         # Optional rubrics block
-         rubric_block = ""
-         if self.metrics_info:
-             lines: List[str] = []
-             for metric in self.metrics_info[: self.max_metrics_in_prompt]:
-                 if metric.rubric and metric.rubric.strip():
-                     lines.append(f"- {metric.name}: {metric.rubric.strip()}")
-                 else:
-                     lines.append(
-                         f"- {metric.name}: Optimize for this metric’s quality criteria."
-                     )
-             rubric_block = "\n[Metric Rubrics]\n" + "\n".join(lines)
-
-         system_message = (
-             "You are refining a prompt used in a multi-step LLM pipeline. "
-             "Given the current prompt, concise feedback, and (optionally) metric rubrics, "
-             "produce a revised prompt that addresses the issues while preserving intent and style. "
-             "Return only the new prompt text, with no explanations."
-         )
-
-         user_message = f"""[Module]
- {module_id}
-
- [Current Prompt]
- {current_prompt_block}
-
- [Feedback]
- {feedback_text[:self.max_chars]}
- {rubric_block}
-
- [Instruction]
- Rewrite the prompt to better satisfy the metrics and address the feedback.
- Keep it concise, actionable, and faithful to the module’s role."""
-         return system_message, user_message
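The 458-line deletion above appears to correspond to `deepeval/optimization/mutations/prompt_rewriter.py` from the file list (the only entry with exactly 458 removed lines). For orientation, a small sketch of what its two pure helpers did, based only on the deleted source; the input values are arbitrary examples:

```python
# Behaviour of the removed helpers, illustrated with made-up inputs.
_compose_prompt_messages("  You are a helpful judge.  ", "Rate the answer.")
# -> "You are a helpful judge.\n\nRate the answer."
_compose_prompt_messages("", "Rate the answer.")
# -> "Rate the answer." (an empty system message is dropped)

_normalize_llm_output_to_text("  rewritten prompt  ")       # -> "rewritten prompt"
_normalize_llm_output_to_text(("rewritten prompt", 0.002))  # -> "rewritten prompt" (cost discarded)
_normalize_llm_output_to_text({"prompt": "rewritten"})      # -> '{"prompt": "rewritten"}'
```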
@@ -1,16 +0,0 @@
- from .selection import (
-     pareto_frontier,
-     frequency_weights,
-     sample_by_frequency,
-     select_prompt_configuration_pareto,
- )
- from .tie_breaker import TieBreaker, pick_best_with_ties
-
- __all__ = [
-     "pareto_frontier",
-     "frequency_weights",
-     "sample_by_frequency",
-     "select_prompt_configuration_pareto",
-     "TieBreaker",
-     "pick_best_with_ties",
- ]
@@ -1,67 +0,0 @@
- from __future__ import annotations
- from typing import Dict, List, Optional, Tuple
- from enum import Enum
- import random
-
- from deepeval.errors import DeepEvalError
- from deepeval.optimization.types import PromptConfigurationId
-
-
- class TieBreaker(str, Enum):
-     PREFER_ROOT = "prefer_root"
-     PREFER_CHILD = "prefer_child"
-     RANDOM = "random"
-
-
- def pick_best_with_ties(
-     totals: Dict[PromptConfigurationId, float],
-     parents_by_id: Dict[PromptConfigurationId, Optional[PromptConfigurationId]],
-     *,
-     random_state: random.Random,
-     tie_tolerance: float = 1e-9,
-     policy: TieBreaker = TieBreaker.PREFER_ROOT,
- ) -> Tuple[PromptConfigurationId, List[PromptConfigurationId], float]:
-     """
-     Choose the best candidate by aggregate score with deterministic tie handling.
-
-     Returns: (chosen_id, tied_ids, max_score)
-     - tied_ids includes everyone within tie_tolerance of max_score
-     """
-     if not totals:
-         raise DeepEvalError("No candidate prompt configuration to choose from.")
-
-     max_score = max(totals.values())
-     tied = [
-         prompt_configuration_id
-         for prompt_configuration_id, score in totals.items()
-         if abs(score - max_score) <= tie_tolerance
-     ]
-
-     if len(tied) == 1:
-         return tied[0], tied, max_score
-
-     # Resolve tie by policy
-     if policy == TieBreaker.PREFER_CHILD:
-         # Prefer any non root. When multiple children exist, use the most recent
-         child_ids = [
-             prompt_configuration_id
-             for prompt_configuration_id in tied
-             if parents_by_id.get(prompt_configuration_id) is not None
-         ]
-         if child_ids:
-             # choose the newest child deterministically by order
-             for prompt_configuration_id in reversed(list(totals.keys())):
-                 if prompt_configuration_id in child_ids:
-                     return prompt_configuration_id, tied, max_score
-
-     if policy == TieBreaker.RANDOM:
-         return random_state.choice(tied), tied, max_score
-
-     # by default prefer a root if present, otherwise the first tied
-     root_ids = [
-         prompt_configuration_id
-         for prompt_configuration_id in tied
-         if parents_by_id.get(prompt_configuration_id) is None
-     ]
-     chosen = root_ids[0] if root_ids else tied[0]
-     return chosen, tied, max_score
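The 67-line deletion above appears to be the removed `deepeval/optimization/policies/tie_breaker.py`. A minimal sketch of the tie-breaking behaviour it implemented, based only on the deleted source; the string IDs stand in for `PromptConfigurationId` and are hypothetical:

```python
import random

# Hypothetical candidate scores and parent links; "root" has no parent.
totals = {"root": 0.82, "child-1": 0.82, "child-2": 0.79}
parents = {"root": None, "child-1": "root", "child-2": "root"}

chosen, tied, best = pick_best_with_ties(
    totals, parents, random_state=random.Random(0)
)
# Default policy PREFER_ROOT: chosen == "root", tied == ["root", "child-1"], best == 0.82.
# With policy=TieBreaker.PREFER_CHILD the newest tied child ("child-1") would win instead.
```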