deepeval-3.7.4-py3-none-any.whl → deepeval-3.7.5-py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between the two published versions.
Files changed (155)
  1. deepeval/_version.py +1 -1
  2. deepeval/dataset/golden.py +54 -2
  3. deepeval/evaluate/evaluate.py +16 -8
  4. deepeval/evaluate/execute.py +70 -26
  5. deepeval/evaluate/utils.py +26 -22
  6. deepeval/integrations/pydantic_ai/agent.py +19 -2
  7. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  8. deepeval/metrics/__init__.py +14 -12
  9. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  10. deepeval/metrics/answer_relevancy/template.py +188 -92
  11. deepeval/metrics/base_metric.py +2 -5
  12. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  13. deepeval/metrics/contextual_precision/template.py +115 -66
  14. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  15. deepeval/metrics/contextual_recall/template.py +106 -55
  16. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  17. deepeval/metrics/contextual_relevancy/template.py +87 -58
  18. deepeval/metrics/dag/templates.py +2 -2
  19. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  20. deepeval/metrics/faithfulness/schema.py +1 -1
  21. deepeval/metrics/faithfulness/template.py +200 -115
  22. deepeval/metrics/g_eval/utils.py +2 -2
  23. deepeval/metrics/indicator.py +4 -4
  24. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  25. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  26. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  27. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  28. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  29. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  31. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  32. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  33. deepeval/metrics/ragas.py +3 -3
  34. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  35. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  36. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  37. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  38. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  39. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  40. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  41. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  42. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  43. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  44. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  45. deepeval/metrics/turn_faithfulness/template.py +218 -0
  46. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  47. deepeval/metrics/utils.py +39 -58
  48. deepeval/models/__init__.py +0 -12
  49. deepeval/models/base_model.py +16 -38
  50. deepeval/models/embedding_models/__init__.py +7 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +52 -28
  52. deepeval/models/embedding_models/local_embedding_model.py +18 -14
  53. deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
  54. deepeval/models/embedding_models/openai_embedding_model.py +40 -21
  55. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  56. deepeval/models/llms/anthropic_model.py +44 -23
  57. deepeval/models/llms/azure_model.py +121 -36
  58. deepeval/models/llms/deepseek_model.py +18 -13
  59. deepeval/models/llms/gemini_model.py +129 -43
  60. deepeval/models/llms/grok_model.py +18 -13
  61. deepeval/models/llms/kimi_model.py +18 -13
  62. deepeval/models/llms/litellm_model.py +42 -22
  63. deepeval/models/llms/local_model.py +12 -7
  64. deepeval/models/llms/ollama_model.py +114 -12
  65. deepeval/models/llms/openai_model.py +137 -41
  66. deepeval/models/llms/portkey_model.py +24 -7
  67. deepeval/models/llms/utils.py +5 -3
  68. deepeval/models/retry_policy.py +17 -14
  69. deepeval/models/utils.py +46 -1
  70. deepeval/optimizer/__init__.py +5 -0
  71. deepeval/optimizer/algorithms/__init__.py +6 -0
  72. deepeval/optimizer/algorithms/base.py +29 -0
  73. deepeval/optimizer/algorithms/configs.py +18 -0
  74. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  75. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  76. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  77. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  78. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  79. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  80. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  81. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  82. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  83. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  84. deepeval/{optimization → optimizer}/configs.py +5 -8
  85. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  86. deepeval/optimizer/prompt_optimizer.py +263 -0
  87. deepeval/optimizer/rewriter/__init__.py +5 -0
  88. deepeval/optimizer/rewriter/rewriter.py +124 -0
  89. deepeval/optimizer/rewriter/utils.py +214 -0
  90. deepeval/optimizer/scorer/__init__.py +5 -0
  91. deepeval/optimizer/scorer/base.py +86 -0
  92. deepeval/optimizer/scorer/scorer.py +316 -0
  93. deepeval/optimizer/scorer/utils.py +30 -0
  94. deepeval/optimizer/types.py +148 -0
  95. deepeval/{optimization → optimizer}/utils.py +47 -165
  96. deepeval/prompt/prompt.py +5 -9
  97. deepeval/test_case/__init__.py +1 -3
  98. deepeval/test_case/api.py +12 -10
  99. deepeval/test_case/conversational_test_case.py +19 -1
  100. deepeval/test_case/llm_test_case.py +152 -1
  101. deepeval/test_case/utils.py +4 -8
  102. deepeval/test_run/api.py +15 -14
  103. deepeval/test_run/test_run.py +3 -3
  104. deepeval/tracing/patchers.py +9 -4
  105. deepeval/tracing/tracing.py +2 -2
  106. deepeval/utils.py +65 -0
  107. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  108. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
  109. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  110. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  111. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  112. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  113. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  114. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  115. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  116. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  117. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  118. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  119. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  120. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  121. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  122. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  123. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  124. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  125. deepeval/models/mlllms/__init__.py +0 -4
  126. deepeval/models/mlllms/azure_model.py +0 -343
  127. deepeval/models/mlllms/gemini_model.py +0 -313
  128. deepeval/models/mlllms/ollama_model.py +0 -175
  129. deepeval/models/mlllms/openai_model.py +0 -309
  130. deepeval/optimization/__init__.py +0 -13
  131. deepeval/optimization/adapters/__init__.py +0 -2
  132. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  133. deepeval/optimization/aggregates.py +0 -14
  134. deepeval/optimization/copro/configs.py +0 -31
  135. deepeval/optimization/gepa/__init__.py +0 -7
  136. deepeval/optimization/gepa/configs.py +0 -115
  137. deepeval/optimization/miprov2/configs.py +0 -134
  138. deepeval/optimization/miprov2/loop.py +0 -785
  139. deepeval/optimization/mutations/__init__.py +0 -0
  140. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  141. deepeval/optimization/policies/__init__.py +0 -16
  142. deepeval/optimization/policies/tie_breaker.py +0 -67
  143. deepeval/optimization/prompt_optimizer.py +0 -462
  144. deepeval/optimization/simba/__init__.py +0 -0
  145. deepeval/optimization/simba/configs.py +0 -33
  146. deepeval/optimization/types.py +0 -361
  147. deepeval/test_case/mllm_test_case.py +0 -170
  148. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  149. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  150. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  152. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  153. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  154. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  155. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
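
Note on the module rename: the prompt-optimization code moves from deepeval.optimization (the deleted entries above) to the new deepeval.optimizer package (the added and renamed entries above). The sketch below is a rough orientation only, assuming import paths follow the new file locations; the symbol name PromptOptimizer is inferred from the prompt_optimizer.py file name and the 3.7.4 sources shown further down, and is not confirmed by this diff.

# Hedged sketch only: module paths are read from the renamed files listed above;
# the exported symbol names in 3.7.5 are assumptions, since this diff does not
# show the contents of the new deepeval/optimizer modules.
try:
    # 3.7.5+ layout (assumed), per deepeval/optimizer/prompt_optimizer.py
    from deepeval.optimizer.prompt_optimizer import PromptOptimizer
except ImportError:
    # 3.7.4 layout, removed in this release (deepeval/optimization/prompt_optimizer.py)
    from deepeval.optimization.prompt_optimizer import PromptOptimizer
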
--- a/deepeval/optimization/adapters/deepeval_scoring_adapter.py
+++ /dev/null
@@ -1,588 +0,0 @@
-from __future__ import annotations
-import asyncio
-import copy
-import inspect
-import json
-from functools import lru_cache
-from pydantic import BaseModel as PydanticBaseModel
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    List,
-    Optional,
-    Tuple,
-    Union,
-)
-
-from deepeval.dataset.golden import Golden, ConversationalGolden
-from deepeval.errors import DeepEvalError
-from deepeval.metrics import (
-    BaseMetric,
-    BaseConversationalMetric,
-)
-from deepeval.test_case import (
-    LLMTestCase,
-    ConversationalTestCase,
-    MLLMTestCase,
-    Turn,
-)
-from deepeval.prompt.api import PromptType, PromptMessage
-from deepeval.prompt.prompt import Prompt
-
-from deepeval.optimization.types import (
-    PromptConfiguration,
-    Objective,
-    MeanObjective,
-    ModuleId,
-)
-from deepeval.optimization.utils import (
-    validate_callback,
-    validate_metrics,
-    invoke_model_callback,
-    a_invoke_model_callback,
-    build_model_callback_kwargs,
-)
-
-
-@lru_cache(maxsize=None)
-def _has_kwarg(func: Callable, keyword: str) -> bool:
-    """Return True if func accepts keyword or has **kwargs."""
-    try:
-        signature = inspect.signature(func)
-    except (ValueError, TypeError):
-        return False
-    for param in signature.parameters.values():
-        if param.kind == inspect.Parameter.VAR_KEYWORD:
-            return True
-    return keyword in signature.parameters
-
-
-def _measure_no_indicator(metric, test_case):
-    """Call metric.measure(test_case) with _show_indicator=False if supported."""
-    measure = getattr(metric, "measure")
-    if _has_kwarg(measure, "_show_indicator"):
-        return measure(test_case, _show_indicator=False)
-    return measure(test_case)
-
-
-async def _a_measure_no_indicator(metric, test_case):
-    """
-    Prefer metric.a_measure with fall back to metric.measure in a thread.
-    Always disable indicators when supported. This is to prevent interference
-    with the gepa indicator.
-    """
-    a_measure = getattr(metric, "a_measure", None)
-
-    if a_measure is not None:
-        call = (
-            a_measure(test_case, _show_indicator=False)
-            if _has_kwarg(a_measure, "_show_indicator")
-            else a_measure(test_case)
-        )
-        # Be resilient if impl returns a plain value
-        return await call if inspect.isawaitable(call) else call
-
-    # No async impl: run sync measure in a thread
-    loop = asyncio.get_running_loop()
-    return await loop.run_in_executor(
-        None, lambda: _measure_no_indicator(metric, test_case)
-    )
-
-
-class DeepEvalScoringAdapter:
-    """Scoring adapter backed by DeepEval metrics with a built-in generation step."""
-
-    DEFAULT_MODULE_ID: ModuleId = "__module__"
-
-    def __init__(
-        self,
-        *,
-        build_test_case: Optional[
-            Callable[
-                [Union[Golden, ConversationalGolden], str],
-                Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
-            ]
-        ] = None,
-        objective_scalar: Objective = MeanObjective(),
-        list_input_role: str = "user",
-    ):
-        self.model_callback: Optional[
-            Callable[
-                ...,
-                Union[
-                    str,
-                    Dict,
-                    Tuple[Union[str, Dict], float],
-                ],
-            ]
-        ] = None
-        self.metrics: Union[
-            List[BaseMetric], List[BaseConversationalMetric]
-        ] = []
-
-        self.build_test_case = build_test_case or self._default_build_test_case
-        self.objective_scalar = objective_scalar
-        self.list_input_role = list_input_role
-
-        # async
-        self._semaphore: Optional[asyncio.Semaphore] = None
-        self._throttle: float = 0.0
-
-    def set_model_callback(
-        self,
-        model_callback: Callable[
-            ...,
-            Union[
-                str,
-                Dict,
-                Tuple[Union[str, Dict], float],
-            ],
-        ],
-    ):
-        self.model_callback = validate_callback(
-            component="DeepEvalScoringAdapter",
-            model_callback=model_callback,
-        )
-
-    def set_metrics(
-        self,
-        metrics: Union[List[BaseMetric], List[BaseConversationalMetric]],
-    ):
-        self.metrics = validate_metrics(
-            component="DeepEvalScoringAdapter", metrics=metrics
-        )
-
-    #######################################
-    # prompt assembly & result unwrapping #
-    #######################################
-    def _primary_input_from_golden(
-        self, golden: Union[Golden, ConversationalGolden]
-    ) -> str:
-        """
-        Return the primary textual input to feed into the prompt for a given golden.
-
-        - For Golden: use `input`
-        - For ConversationalGolden: use `scenario`
-        """
-        if isinstance(golden, Golden):
-            return golden.input
-
-        if isinstance(golden, ConversationalGolden):
-            return golden.scenario
-
-        raise DeepEvalError(
-            "DeepEvalScoringAdapter expected golden to be a Golden or "
-            f"ConversationalGolden, but received {type(golden).__name__!r}."
-        )
-
-    def _compile_prompt_text(
-        self, prompt: Prompt, golden: Union[Golden, ConversationalGolden]
-    ) -> str:
-        user_input = self._primary_input_from_golden(golden)
-        base_text = prompt.text_template or ""
-        if not user_input:
-            return base_text.strip()
-        return f"{base_text}\n\n{user_input}".strip()
-
-    def _compile_prompt_messages(
-        self,
-        prompt: Prompt,
-        golden: Union[Golden, ConversationalGolden],
-    ) -> List[PromptMessage]:
-        """
-        Build the message contents for PromptType.LIST.
-
-        Starts from `prompt.messages_template` and appends a new PromptMessage with
-        the golden's `input` as the final message content.
-        """
-        messages_template = prompt.messages_template or []
-        compiled: List[PromptMessage] = list(messages_template)
-
-        user_input = self._primary_input_from_golden(golden)
-        if user_input:
-            compiled = compiled + [
-                PromptMessage(role=self.list_input_role, content=user_input)
-            ]
-
-        return compiled
-
-    def _build_callback_kwargs_for_prompt(
-        self,
-        prompt: Prompt,
-        golden: Union["Golden", "ConversationalGolden"],
-    ) -> Dict[str, Any]:
-        """
-        Decide whether to treat the prompt as TEXT or LIST and build the
-        corresponding callback kwargs.
-
-        - For TEXT prompts, we send: prompt_text=...
-        - For LIST prompts, we send: prompt_messages=[...]
-        """
-
-        if prompt.type is PromptType.LIST:
-            prompt_messages = self._compile_prompt_messages(prompt, golden)
-            return build_model_callback_kwargs(
-                prompt=prompt,
-                prompt_messages=prompt_messages,
-                golden=golden,
-            )
-
-        # Default to TEXT behaviour
-        prompt_text = self._compile_prompt_text(prompt, golden)
-        return build_model_callback_kwargs(
-            prompt=prompt,
-            prompt_text=prompt_text,
-            golden=golden,
-        )
-
-    def _unwrap_text(
-        self, result: Union[str, Dict, PydanticBaseModel, tuple]
-    ) -> str:
-        # DeepEval LLMs return (output, cost), unwrap if so.
-        if isinstance(result, tuple) and result:
-            result = result[0]
-        if isinstance(result, PydanticBaseModel):
-            return result.model_dump_json()
-        if isinstance(result, dict):
-            return json.dumps(result)
-        return str(result)
-
-    #####################
-    # Test case helpers #
-    #####################
-    def _default_build_test_case(
-        self, golden: Union[Golden, ConversationalGolden], actual: str
-    ) -> Union[LLMTestCase, ConversationalTestCase]:
-        """
-        Default conversion from Golden or ConversationalGolden into a DeepEval test case.
-
-        - Golden -> LLMTestCase
-        - ConversationalGolden -> ConversationalTestCase
-        """
-        if isinstance(golden, Golden):
-            return LLMTestCase(
-                input=golden.input,
-                expected_output=golden.expected_output,
-                actual_output=actual,
-                context=golden.context,
-                retrieval_context=golden.retrieval_context,
-                additional_metadata=golden.additional_metadata,
-                comments=golden.comments,
-                name=golden.name,
-                tools_called=golden.tools_called,
-                expected_tools=golden.expected_tools,
-            )
-
-        if isinstance(golden, ConversationalGolden):
-            # Start from any turns specified on the golden.
-            turns: List[Turn] = list(golden.turns or [])
-            assistant_role = "assistant"
-            user_role = "user"
-            if turns:
-                last = turns[-1]
-                if last.role == assistant_role:
-                    # Replace the last assistant turn's content with the model's actual output.
-                    turns[-1] = Turn(
-                        role=last.role,
-                        content=actual,
-                        user_id=last.user_id,
-                        retrieval_context=last.retrieval_context,
-                        tools_called=last.tools_called,
-                    )
-                else:
-                    # Append a new assistant turn with the actual output.
-                    turns.append(Turn(role=assistant_role, content=actual))
-            else:
-                # No turns provided: synthesize a minimal two-turn conversation.
-                turns = [
-                    Turn(role=user_role, content=golden.scenario),
-                    Turn(role=assistant_role, content=actual),
-                ]
-
-            return ConversationalTestCase(
-                turns=turns,
-                scenario=golden.scenario,
-                expected_outcome=golden.expected_outcome,
-                user_description=golden.user_description,
-                context=golden.context,
-                additional_metadata=golden.additional_metadata,
-                comments=golden.comments,
-                name=golden.name,
-            )
-
-        raise DeepEvalError(
-            "DeepEvalScoringAdapter._default_build_test_case expected a Golden "
-            f"or ConversationalGolden, but received {type(golden).__name__!r}."
-        )
-
-    ###################
-    # scoring helpers #
-    ###################
-
-    async def _bounded(self, coro):
-        if self._semaphore is None:
-            return await coro
-        async with self._semaphore:
-            res = await coro
-            if self._throttle:
-                await asyncio.sleep(self._throttle)
-            return res
-
-    async def _a_score_one(
-        self,
-        prompt_configuration: PromptConfiguration,
-        golden: Union[Golden, ConversationalGolden],
-    ) -> float:
-        # Clone metrics to avoid shared-state
-        metrics = [copy.copy(metric) for metric in self.metrics]
-        actual = await self.a_generate(prompt_configuration.prompts, golden)
-        test_case = self.build_test_case(golden, actual)
-        per_metric: Dict[str, float] = {}
-        for metric in metrics:
-            score = await _a_measure_no_indicator(metric, test_case)
-            per_metric[metric.__class__.__name__] = float(score)
-        return self.objective_scalar.scalarize(per_metric)
-
-    def _score_one(
-        self,
-        prompt_configuration: PromptConfiguration,
-        golden: Union[Golden, ConversationalGolden],
-    ) -> float:
-        metrics = [copy.copy(m) for m in self.metrics]
-        actual = self.generate(prompt_configuration.prompts, golden)
-        test_case = self.build_test_case(golden, actual)
-        per_metric: Dict[str, float] = {}
-        for metric in metrics:
-            score = _measure_no_indicator(metric, test_case)
-            per_metric[metric.__class__.__name__] = float(score)
-        return self.objective_scalar.scalarize(per_metric)
-
-    #################
-    # Configuration #
-    #################
-
-    def configure_async(
-        self, *, max_concurrent: int = 20, throttle_seconds: float = 0.0
-    ):
-        # The runner will call this once, but it is safe to recreate between runs
-        self._semaphore = asyncio.Semaphore(max_concurrent)
-        self._throttle = float(throttle_seconds)
-
-    ########################
-    # generation & scoring #
-    ########################
-
-    def generate(
-        self,
-        prompts_by_module: Dict[ModuleId, Prompt],
-        golden: Union[Golden, ConversationalGolden],
-    ) -> str:
-
-        if not prompts_by_module:
-            raise DeepEvalError(
-                "DeepEvalScoringAdapter.generate(...) received an empty "
-                "`prompts_by_module`; at least one Prompt is required."
-            )
-
-        validate_callback(
-            component="DeepEvalScoringAdapter",
-            model_callback=self.model_callback,
-        )
-        validate_metrics(
-            component="DeepEvalScoringAdapter", metrics=self.metrics
-        )
-
-        module_id = self._select_module_id_from_prompts(prompts_by_module)
-        prompt = prompts_by_module.get(module_id) or next(
-            iter(prompts_by_module.values())
-        )
-
-        candidate_kwargs = self._build_callback_kwargs_for_prompt(
-            prompt=prompt,
-            golden=golden,
-        )
-
-        result = invoke_model_callback(
-            hook="score_generate",
-            model_callback=self.model_callback,
-            candidate_kwargs=candidate_kwargs,
-        )
-
-        return self._unwrap_text(result)
-
-    async def a_generate(
-        self,
-        prompts_by_module: Dict[ModuleId, Prompt],
-        golden: Union[Golden, ConversationalGolden],
-    ) -> str:
-
-        if not prompts_by_module:
-            raise DeepEvalError(
-                "DeepEvalScoringAdapter.a_generate(...) received an empty "
-                "`prompts_by_module`; at least one Prompt is required."
-            )
-
-        validate_callback(
-            component="DeepEvalScoringAdapter",
-            model_callback=self.model_callback,
-        )
-        validate_metrics(
-            component="DeepEvalScoringAdapter", metrics=self.metrics
-        )
-
-        module_id = self._select_module_id_from_prompts(prompts_by_module)
-        prompt = prompts_by_module.get(module_id) or next(
-            iter(prompts_by_module.values())
-        )
-
-        candidate_kwargs = self._build_callback_kwargs_for_prompt(
-            prompt=prompt,
-            golden=golden,
-        )
-
-        result = await a_invoke_model_callback(
-            hook="score_generate",
-            model_callback=self.model_callback,
-            candidate_kwargs=candidate_kwargs,
-        )
-
-        return self._unwrap_text(result)
-
-    def score_on_pareto(
-        self,
-        prompt_configuration: PromptConfiguration,
-        d_pareto: Union[List[Golden], List[ConversationalGolden]],
-    ) -> List[float]:
-        return [
-            self._score_one(prompt_configuration, golden) for golden in d_pareto
-        ]
-
-    def minibatch_score(
-        self,
-        prompt_configuration: PromptConfiguration,
-        minibatch: Union[List[Golden], List[ConversationalGolden]],
-    ) -> float:
-        if not minibatch:
-            return 0.0
-
-        scores = [
-            self._score_one(prompt_configuration, golden)
-            for golden in minibatch
-        ]
-        return sum(scores) / len(scores)
-
-    def minibatch_feedback(
-        self,
-        prompt_configuration: PromptConfiguration,
-        module: ModuleId,
-        minibatch: Union[List[Golden], List[ConversationalGolden]],
-    ) -> str:
-        # default metric feedback (μ_f): concat metric.reason across minibatch and cap length
-        reasons: List[str] = []
-        for golden in minibatch:
-            actual = self.generate(prompt_configuration.prompts, golden)
-            test_case = self.build_test_case(golden, actual)
-            for metric in [copy.copy(m) for m in self.metrics]:
-                _ = _measure_no_indicator(metric, test_case)
-                if getattr(metric, "reason", None):
-                    reasons.append(str(metric.reason))
-        if not reasons:
-            return ""
-        unique: List[str] = []
-        seen = set()
-        for reason in reasons:
-            if reason not in seen:
-                unique.append(reason)
-                seen.add(reason)
-        return "\n---\n".join(
-            unique[:8]
-        )  # TODO: Make how much feedback configurable
-
-    async def a_score_on_pareto(
-        self,
-        prompt_configuration: PromptConfiguration,
-        d_pareto: Union[List[Golden], List[ConversationalGolden]],
-    ) -> List[float]:
-        tasks = [
-            self._bounded(self._a_score_one(prompt_configuration, golden))
-            for golden in d_pareto
-        ]
-        return await asyncio.gather(*tasks)
-
-    async def a_minibatch_score(
-        self,
-        prompt_configuration: PromptConfiguration,
-        minibatch: Union[List[Golden], List[ConversationalGolden]],
-    ) -> float:
-        tasks = [
-            self._bounded(self._a_score_one(prompt_configuration, golden))
-            for golden in minibatch
-        ]
-        scores = await asyncio.gather(*tasks)
-        return sum(scores) / len(scores) if scores else 0.0
-
-    async def a_minibatch_feedback(
-        self,
-        prompt_configuration: PromptConfiguration,
-        module: ModuleId,
-        minibatch: Union[List[Golden], List[ConversationalGolden]],
-    ) -> str:
-        async def reasons_one(golden) -> List[str]:
-            # Clone per task to avoid shared state
-            metrics = [copy.copy(metric) for metric in self.metrics]
-            # metrics = self.metrics
-            actual = await self.a_generate(prompt_configuration.prompts, golden)
-            test_case = self.build_test_case(golden, actual)
-            out: List[str] = []
-            for metric in metrics:
-                _ = await _a_measure_no_indicator(metric, test_case)
-                if getattr(metric, "reason", None):
-                    out.append(str(metric.reason))
-            return out
-
-        tasks = [self._bounded(reasons_one(golden)) for golden in minibatch]
-        nested = await asyncio.gather(*tasks)
-        reasons: List[str] = [reason for sub in nested for reason in sub]
-        if not reasons:
-            return ""
-        unique: List[str] = []
-        seen = set()
-        for reason in reasons:
-            if reason not in seen:
-                unique.append(reason)
-                seen.add(reason)
-        return "\n---\n".join(unique[:8])
-
-    def _select_module_id_from_prompts(
-        self, prompts_by_module: Dict[ModuleId, Prompt]
-    ) -> ModuleId:
-        """
-        Default module selection strategy:
-
-        - Prefer the synthetic '__module__' key when present
-        - Otherwise fall back to the first key in prompts_by_module.
-
-        Assumes `prompts_by_module` is non-empty; callers should validate that.
-        """
-        if self.DEFAULT_MODULE_ID in prompts_by_module:
-            return self.DEFAULT_MODULE_ID
-
-        # At this point we expect at least one key.
-        try:
-            return next(iter(prompts_by_module.keys()))
-        except StopIteration:
-            raise DeepEvalError(
-                "DeepEvalScoringAdapter._select_module_id_from_prompts(...) "
-                "received an empty `prompts_by_module`. At least one Prompt is required."
-            )
-
-    def select_module(
-        self, prompt_configuration: PromptConfiguration
-    ) -> ModuleId:
-        return self._select_module_id_from_prompts(prompt_configuration.prompts)
-
-    async def a_select_module(
-        self, prompt_configuration: PromptConfiguration
-    ) -> ModuleId:
-        return self.select_module(prompt_configuration)
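
For orientation, a minimal usage sketch of the adapter deleted above follows, using only names that appear in the 3.7.4 source; the model callback is a hypothetical placeholder, and the closest 3.7.5 counterparts appear to be the new deepeval/optimizer/scorer modules, whose contents this diff does not show.

# Hypothetical sketch against the removed 3.7.4 API shown above; the callback
# below is a placeholder, and the PromptConfiguration/golden objects would come
# from the surrounding optimization loop.
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.optimization.adapters.deepeval_scoring_adapter import (
    DeepEvalScoringAdapter,
)

def my_model_callback(**kwargs) -> str:
    # Placeholder: call your own LLM with kwargs["prompt_text"] or
    # kwargs["prompt_messages"] and return its text output.
    return "model output for: " + str(kwargs.get("prompt_text", ""))

adapter = DeepEvalScoringAdapter()
adapter.set_model_callback(my_model_callback)
adapter.set_metrics([AnswerRelevancyMetric()])
adapter.configure_async(max_concurrent=10)
# score = adapter.minibatch_score(prompt_configuration, goldens)
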
--- a/deepeval/optimization/aggregates.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from typing import Protocol, Sequence
-import statistics
-
-
-class Aggregator(Protocol):
-    def __call__(self, scores: Sequence[float]) -> float: ...
-
-
-def mean_of_all(scores: Sequence[float]) -> float:
-    return statistics.fmean(scores) if scores else 0.0
-
-
-def median_of_all(scores: Sequence[float]) -> float:
-    return statistics.median(scores) if scores else 0.0
--- a/deepeval/optimization/copro/configs.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from __future__ import annotations
-from pydantic import Field, conint
-
-from deepeval.optimization.miprov2.configs import MIPROConfig
-
-
-class COPROConfig(MIPROConfig):
-    """
-    Configuration for COPRO style cooperative prompt optimization.
-
-    This extends MIPROConfig with settings that control the cooperative
-    sampling behavior.
-
-    The core MIPROConfig fields behave exactly the same as in MIPROv2.
-    """
-
-    population_size: conint(ge=1) = Field(
-        default=4,
-        description=(
-            "Maximum number of prompt candidates maintained in the active pool. "
-            "Once this limit is exceeded, lower scoring candidates are pruned."
-        ),
-    )
-
-    proposals_per_step: conint(ge=1) = Field(
-        default=4,
-        description=(
-            "Number of child prompts proposed cooperatively from the same "
-            "parent in each optimization iteration."
-        ),
-    )
--- a/deepeval/optimization/gepa/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from .configs import GEPAConfig
-from .loop import GEPARunner
-
-__all__ = [
-    "GEPAConfig",
-    "GEPARunner",
-]