deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/optimizer/rewriter/utils.py
@@ -0,0 +1,214 @@
+ from __future__ import annotations
+ import json
+ import random
+ from typing import List, Optional, Tuple, Union
+
+ from deepeval.errors import DeepEvalError
+ from deepeval.optimizer.utils import (
+     validate_int_in_range,
+     validate_instance,
+ )
+ from deepeval.optimizer.configs import (
+     MutationConfig,
+     MutationTargetType,
+ )
+ from deepeval.prompt.api import PromptType, PromptMessage
+ from deepeval.prompt.prompt import Prompt
+
+
+ ##################
+ # Common Helpers #
+ ##################
+ def _summarize_prompt_for_rewrite(old_prompt: Prompt, max_chars: int) -> str:
+     """
+     Produce a human-readable summary of the current prompt for the
+     rewriter instruction block.
+
+     - For TEXT prompts, this is just `text_template`.
+     - For LIST prompts, this is a numbered list of (role, content) lines.
+     """
+
+     # LIST prompts: show each message with its role.
+     if old_prompt.type is PromptType.LIST and old_prompt.messages_template:
+         lines: List[str] = []
+         for message_index, message in enumerate(old_prompt.messages_template):
+             role = message.role or ""
+             content = message.content or ""
+             lines.append(f"[{message_index+1}] ({role}) {content}")
+         combined = "\n".join(lines)
+         return combined[:max_chars]
+
+     # Since it is not a LIST prompt, just use text_template.
+     text = old_prompt.text_template or ""
+     return text[:max_chars]
+
+
+ def _select_list_target_index(
+     messages: List[PromptMessage],
+     config: MutationConfig,
+     random_state: random.Random,
+ ) -> int:
+     """
+     Select which list message index to rewrite, based on MutationConfig.
+
+     Rules:
+     - Start with all indices in scope.
+     - If target_role is set, restrict candidates to messages with that role
+       (case insensitive). If no messages match, fall back to all indices.
+     - target_type:
+       * FIRST: pick the first candidate index.
+       * RANDOM: pick a candidate via random_state.choice(candidates).
+       * FIXED_INDEX: use target_index when valid (and consistent with role
+         filter), otherwise fall back to the first candidate.
+     """
+     if not messages:
+         raise DeepEvalError(
+             "Rewriter._select_list_target_index expected at least one "
+             "message, but received an empty message list."
+         )
+
+     validate_instance(
+         component="Rewriter._select_list_target_index",
+         param_name="target_type",
+         value=config.target_type,
+         expected_types=MutationTargetType,
+     )
+
+     messages_length = len(messages)
+     candidate_indices = list(range(messages_length))
+
+     # Optional case insensitive role restriction
+     if config.target_role:
+         target_role_lower = config.target_role.lower()
+         filtered = [
+             index
+             for index, message in enumerate(messages)
+             if (message.role or "").lower() == target_role_lower
+         ]
+         if filtered:
+             candidate_indices = filtered
+
+     target_type = config.target_type
+
+     if target_type is MutationTargetType.RANDOM:
+         return random_state.choice(candidate_indices)
+
+     if target_type is MutationTargetType.FIXED_INDEX:
+         index = validate_int_in_range(
+             component="Rewriter._select_list_target_index",
+             param_name="target_index",
+             value=int(config.target_index),
+             min_inclusive=0,
+             max_exclusive=len(candidate_indices),
+         )
+         return candidate_indices[index]
+
+     # if you got this error it means that a new MutationTargetType was added,
+     # but not handled above
+     raise DeepEvalError(
+         "Rewriter._select_list_target_index received unsupported "
+         f"target_type={target_type!r}. Expected RANDOM or FIXED_INDEX."
+     )
+
+
+ def _apply_rewritten_prompt(
+     old_prompt: Prompt,
+     new_text: str,
+     random_state: random.Random,
+     list_mutation_config: Optional[MutationConfig] = None,
+ ) -> Prompt:
+     """
+     Apply the rewritten text to a Prompt, preserving representation:
+
+     - For TEXT prompts, update `text_template`.
+     - For LIST prompts, rewrite the content of a single message while
+       keeping the number of messages the same.
+     - Preserve additional Prompt metadata such as `label` and `interpolation_type`.
+     """
+     if not new_text:
+         return old_prompt
+
+     if old_prompt.type is PromptType.LIST and old_prompt.messages_template:
+         messages = old_prompt.messages_template
+         config = list_mutation_config or MutationConfig()
+
+         target_index = _select_list_target_index(
+             messages=messages,
+             config=config,
+             random_state=random_state,
+         )
+
+         new_messages: List[PromptMessage] = []
+         for message_index, message in enumerate(messages):
+             if message_index == target_index:
+                 # Preserve the original role; do not inject a new one.
+                 new_messages.append(
+                     PromptMessage(
+                         role=message.role,
+                         content=new_text,
+                     )
+                 )
+             else:
+                 new_messages.append(message)
+
+         new_prompt = Prompt(
+             alias=old_prompt.alias,
+             text_template=None,
+             messages_template=new_messages,
+             model_settings=old_prompt.model_settings,
+             output_type=old_prompt.output_type,
+             output_schema=old_prompt.output_schema,
+         )
+
+     else:
+         # Since it is not LIST, it must be TEXT type
+         new_prompt = Prompt(
+             alias=old_prompt.alias,
+             text_template=new_text,
+             model_settings=old_prompt.model_settings,
+             output_type=old_prompt.output_type,
+             output_schema=old_prompt.output_schema,
+         )
+
+     new_prompt.label = old_prompt.label
+     new_prompt.interpolation_type = old_prompt.interpolation_type
+     return new_prompt
+
+
+ def _compose_prompt_messages(system_message: str, user_message: str) -> str:
+     """
+     Join system and user messages into a single prompt string.
+     Strips surrounding whitespace from each part; if the system message is
+     empty or absent, returns just the user message.
+     """
+     system_text = (system_message or "").strip()
+     user_text = (user_message or "").strip()
+     return f"{system_text}\n\n{user_text}" if system_text else user_text
+
+
+ def _normalize_llm_output_to_text(
+     result: Union[str, Tuple[Union[str, dict], float], dict],
+ ) -> str:
+     """
+     Convert a DeepEval LLM generate() / a_generate() result to a clean string.
+
+     Accepted inputs:
+     - str -> returned as trimmed
+     - (str|dict, float_cost) -> first element extracted and normalized
+     - dict (e.g. JSON mode) -> JSON serialized with ensure_ascii=False
+
+     Fallback: if serialization fails, str(value).strip() is used.
+     """
+     output_value: Union[str, dict]
+     if isinstance(result, tuple):
+         output_value = result[0]
+     else:
+         output_value = result
+
+     if isinstance(output_value, str):
+         return output_value.strip()
+
+     try:
+         return json.dumps(output_value, ensure_ascii=False)
+     except Exception:
+         return str(output_value).strip()
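
For reference, the three input shapes accepted by `_normalize_llm_output_to_text` behave as in the short standalone sketch below; it mirrors the logic in the hunk above instead of importing the private helper, so the function name and sample values are illustrative only.

import json
from typing import Tuple, Union

def normalize_llm_output_to_text(
    result: Union[str, Tuple[Union[str, dict], float], dict],
) -> str:
    # Mirror of the rules above: unwrap (output, cost) tuples, trim strings,
    # JSON-serialize dicts, and fall back to str() when serialization fails.
    value = result[0] if isinstance(result, tuple) else result
    if isinstance(value, str):
        return value.strip()
    try:
        return json.dumps(value, ensure_ascii=False)
    except Exception:
        return str(value).strip()

assert normalize_llm_output_to_text("  hello  ") == "hello"
assert normalize_llm_output_to_text(("hi", 0.0012)) == "hi"                   # (output, cost) tuple
assert normalize_llm_output_to_text({"answer": "42"}) == '{"answer": "42"}'   # JSON-mode dict
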
deepeval/optimizer/scorer/__init__.py
@@ -0,0 +1,5 @@
+ from .scorer import Scorer
+
+ __all__ = [
+     "Scorer",
+ ]
deepeval/optimizer/scorer/base.py
@@ -0,0 +1,86 @@
+ from abc import ABC, abstractmethod
+ from typing import Union, List
+
+ from deepeval.optimizer.types import PromptConfiguration, ScoreVector
+ from deepeval.dataset.golden import Golden, ConversationalGolden
+
+ ModuleId = str
+
+
+ class BaseScorer(ABC):
+     """
+     Base scorer contract used by optimization runners.
+
+     Runners call into this adapter to:
+     - compute scores per-instance on some subset (score_on_pareto),
+     - compute minibatch means for selection and acceptance,
+     - generate feedback text used by the Rewriter.
+     """
+
+     # Sync
+     @abstractmethod
+     def score_pareto(
+         self,
+         prompt_configuration: PromptConfiguration,
+         d_pareto: Union[List[Golden], List[ConversationalGolden]],
+     ) -> ScoreVector:
+         """Return per-instance scores on D_pareto."""
+         raise NotImplementedError
+
+     @abstractmethod
+     def score_minibatch(
+         self,
+         prompt_configuration: PromptConfiguration,
+         minibatch: Union[List[Golden], List[ConversationalGolden]],
+     ) -> float:
+         """Return average score μ on a minibatch from D_feedback."""
+         raise NotImplementedError
+
+     @abstractmethod
+     def get_minibatch_feedback(
+         self,
+         prompt_configuration: PromptConfiguration,
+         module: ModuleId,
+         minibatch: Union[List[Golden], List[ConversationalGolden]],
+     ) -> str:
+         """Return μ_f text for the module (metric.reason + traces, etc.)."""
+         raise NotImplementedError
+
+     @abstractmethod
+     def select_module(
+         self, prompt_configuration: PromptConfiguration
+     ) -> ModuleId:
+         """Pick a module to mutate."""
+         raise NotImplementedError
+
+     # Async
+     @abstractmethod
+     async def a_score_pareto(
+         self,
+         prompt_configuration: PromptConfiguration,
+         d_pareto: Union[List[Golden], List[ConversationalGolden]],
+     ) -> ScoreVector:
+         raise NotImplementedError
+
+     @abstractmethod
+     async def a_score_minibatch(
+         self,
+         prompt_configuration: PromptConfiguration,
+         minibatch: Union[List[Golden], List[ConversationalGolden]],
+     ) -> float:
+         raise NotImplementedError
+
+     @abstractmethod
+     async def a_get_minibatch_feedback(
+         self,
+         prompt_configuration: PromptConfiguration,
+         module: ModuleId,
+         minibatch: Union[List[Golden], List[ConversationalGolden]],
+     ) -> str:
+         raise NotImplementedError
+
+     @abstractmethod
+     async def a_select_module(
+         self, prompt_configuration: PromptConfiguration
+     ) -> ModuleId:
+         raise NotImplementedError
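
To make the `BaseScorer` contract concrete, here is a minimal constant-scoring subclass sketch. It assumes the import paths shown in this hunk (`deepeval.optimizer.scorer.base`, `deepeval.optimizer.types`) resolve in an installed 3.7.5 build and that `PromptConfiguration` exposes a `prompts` mapping (as the `Scorer` hunk below suggests); the class and its scores are hypothetical.

from deepeval.optimizer.scorer.base import BaseScorer, ModuleId
from deepeval.optimizer.types import ScoreVector


class ConstantScorer(BaseScorer):
    """Hypothetical scorer that gives every instance the same score."""

    def score_pareto(self, prompt_configuration, d_pareto) -> ScoreVector:
        return [0.5 for _ in d_pareto]

    def score_minibatch(self, prompt_configuration, minibatch) -> float:
        return 0.5 if minibatch else 0.0

    def get_minibatch_feedback(self, prompt_configuration, module, minibatch) -> str:
        return "No feedback: constant scorer."

    def select_module(self, prompt_configuration) -> ModuleId:
        # `prompts` is inferred from how Scorer uses PromptConfiguration below.
        return next(iter(prompt_configuration.prompts))

    async def a_score_pareto(self, prompt_configuration, d_pareto) -> ScoreVector:
        return self.score_pareto(prompt_configuration, d_pareto)

    async def a_score_minibatch(self, prompt_configuration, minibatch) -> float:
        return self.score_minibatch(prompt_configuration, minibatch)

    async def a_get_minibatch_feedback(self, prompt_configuration, module, minibatch) -> str:
        return self.get_minibatch_feedback(prompt_configuration, module, minibatch)

    async def a_select_module(self, prompt_configuration) -> ModuleId:
        return self.select_module(prompt_configuration)


scorer = ConstantScorer()  # all abstract methods are overridden, so this instantiates
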
deepeval/optimizer/scorer/scorer.py
@@ -0,0 +1,316 @@
+ from __future__ import annotations
+ import asyncio
+ import copy
+ from typing import (
+     Callable,
+     Dict,
+     List,
+     Optional,
+     Union,
+ )
+
+ from deepeval.dataset.golden import Golden, ConversationalGolden
+ from deepeval.dataset.utils import (
+     convert_goldens_to_test_cases,
+     convert_convo_goldens_to_convo_test_cases,
+ )
+ from deepeval.errors import DeepEvalError
+ from deepeval.metrics import (
+     BaseMetric,
+     BaseConversationalMetric,
+ )
+ from deepeval.metrics.utils import copy_metrics
+ from deepeval.test_case import (
+     LLMTestCase,
+     ConversationalTestCase,
+     Turn,
+ )
+ from deepeval.prompt.prompt import Prompt
+
+ from deepeval.optimizer.types import (
+     ModelCallback,
+     PromptConfiguration,
+     Objective,
+     MeanObjective,
+     ModuleId,
+ )
+ from deepeval.optimizer.scorer.base import BaseScorer
+ from deepeval.optimizer.utils import (
+     validate_callback,
+     validate_metrics,
+     invoke_model_callback,
+     a_invoke_model_callback,
+ )
+ from deepeval.optimizer.scorer.utils import (
+     _measure_no_indicator,
+     _a_measure_no_indicator,
+ )
+
+
+ class Scorer(BaseScorer):
+     """
+     Scores prompts by running model_callback, building test cases,
+     running metrics, and aggregating scores.
+     """
+
+     DEFAULT_MODULE_ID: ModuleId = "__module__"
+
+     def __init__(
+         self,
+         model_callback: ModelCallback,
+         metrics: Union[List[BaseMetric], List[BaseConversationalMetric]],
+         max_concurrent: int,
+         throttle_seconds: float,
+         objective_scalar: Objective = MeanObjective(),
+     ):
+         self.model_callback = validate_callback(
+             component="Scorer",
+             model_callback=model_callback,
+         )
+         self.metrics = validate_metrics(component="Scorer", metrics=metrics)
+         self.objective_scalar = objective_scalar
+         self._semaphore = asyncio.Semaphore(max_concurrent)
+         self._throttle = float(throttle_seconds)
+
+     ########################
+     # generation & scoring #
+     ########################
+
+     def generate(
+         self,
+         prompts_by_module: Dict[ModuleId, Prompt],
+         golden: Union[Golden, ConversationalGolden],
+     ) -> str:
+         module_id = self._select_module_id_from_prompts(prompts_by_module)
+         prompt = prompts_by_module.get(module_id) or next(
+             iter(prompts_by_module.values())
+         )
+
+         return invoke_model_callback(
+             model_callback=self.model_callback,
+             prompt=prompt,
+             golden=golden,
+         )
+
+     async def a_generate(
+         self,
+         prompts_by_module: Dict[ModuleId, Prompt],
+         golden: Union[Golden, ConversationalGolden],
+     ) -> str:
+         module_id = self._select_module_id_from_prompts(prompts_by_module)
+         prompt = prompts_by_module.get(module_id) or next(
+             iter(prompts_by_module.values())
+         )
+
+         return await a_invoke_model_callback(
+             model_callback=self.model_callback,
+             prompt=prompt,
+             golden=golden,
+         )
+
+     def score_pareto(
+         self,
+         prompt_configuration: PromptConfiguration,
+         d_pareto: Union[List[Golden], List[ConversationalGolden]],
+     ) -> List[float]:
+         return [
+             self._score_one(prompt_configuration, golden) for golden in d_pareto
+         ]
+
+     def score_minibatch(
+         self,
+         prompt_configuration: PromptConfiguration,
+         minibatch: Union[List[Golden], List[ConversationalGolden]],
+     ) -> float:
+         if not minibatch:
+             return 0.0
+
+         scores = [
+             self._score_one(prompt_configuration, golden)
+             for golden in minibatch
+         ]
+         return sum(scores) / len(scores)
+
+     def get_minibatch_feedback(
+         self,
+         prompt_configuration: PromptConfiguration,
+         module: ModuleId,
+         minibatch: Union[List[Golden], List[ConversationalGolden]],
+     ) -> str:
+         # default metric feedback (μ_f): concat metric.reason across minibatch and cap length
+         reasons: List[str] = []
+         for golden in minibatch:
+             actual = self.generate(prompt_configuration.prompts, golden)
+             test_case = self._golden_to_test_case(golden, actual)
+             for metric in copy_metrics(self.metrics):
+                 _measure_no_indicator(metric=metric, test_case=test_case)
+                 if metric.reason:
+                     reasons.append(str(metric.reason))
+         if not reasons:
+             return ""
+         unique: List[str] = []
+         seen = set()
+         for reason in reasons:
+             if reason not in seen:
+                 unique.append(reason)
+                 seen.add(reason)
+         return "\n---\n".join(
+             unique[:8]
+         )  # TODO: Make how much feedback configurable
+
+     async def a_score_pareto(
+         self,
+         prompt_configuration: PromptConfiguration,
+         d_pareto: Union[List[Golden], List[ConversationalGolden]],
+     ) -> List[float]:
+         tasks = [
+             self._bounded(self._a_score_one(prompt_configuration, golden))
+             for golden in d_pareto
+         ]
+         return await asyncio.gather(*tasks)
+
+     async def a_score_minibatch(
+         self,
+         prompt_configuration: PromptConfiguration,
+         minibatch: Union[List[Golden], List[ConversationalGolden]],
+     ) -> float:
+         tasks = [
+             self._bounded(self._a_score_one(prompt_configuration, golden))
+             for golden in minibatch
+         ]
+         scores = await asyncio.gather(*tasks)
+         return sum(scores) / len(scores) if scores else 0.0
+
+     async def a_get_minibatch_feedback(
+         self,
+         prompt_configuration: PromptConfiguration,
+         module: ModuleId,
+         minibatch: Union[List[Golden], List[ConversationalGolden]],
+     ) -> str:
+         async def reasons_one(golden) -> List[str]:
+             # Clone per task to avoid shared state
+             metrics = copy_metrics(self.metrics)
+             # metrics = self.metrics
+             actual = await self.a_generate(prompt_configuration.prompts, golden)
+             test_case = self._golden_to_test_case(golden, actual)
+             out: List[str] = []
+             for metric in metrics:
+                 await _a_measure_no_indicator(metric, test_case)
+                 if metric.reason:
+                     out.append(str(metric.reason))
+             return out
+
+         tasks = [self._bounded(reasons_one(golden)) for golden in minibatch]
+         nested = await asyncio.gather(*tasks)
+         reasons: List[str] = [reason for sub in nested for reason in sub]
+         if not reasons:
+             return ""
+         unique: List[str] = []
+         seen = set()
+         for reason in reasons:
+             if reason not in seen:
+                 unique.append(reason)
+                 seen.add(reason)
+         return "\n---\n".join(unique[:8])
+
+     ###################
+     # scoring helpers #
+     ###################
+
+     def _golden_to_test_case(
+         self,
+         golden: Union[Golden, ConversationalGolden],
+         actual: str,
+     ) -> Union[LLMTestCase, ConversationalTestCase]:
+         """Convert a golden + actual output into a test case for metrics."""
+         if isinstance(golden, Golden):
+             golden.actual_output = actual
+             return convert_goldens_to_test_cases([golden])[0]
+
+         if isinstance(golden, ConversationalGolden):
+             # Build turns with actual output as assistant response
+             turns: List[Turn] = list(golden.turns or [])
+             if turns and turns[-1].role == "assistant":
+                 turns[-1] = Turn(role="assistant", content=actual)
+             elif turns:
+                 turns.append(Turn(role="assistant", content=actual))
+             else:
+                 turns = [
+                     Turn(role="assistant", content=actual),
+                 ]
+
+             golden.turns = turns
+             return convert_convo_goldens_to_convo_test_cases([golden])[0]
+
+     async def _bounded(self, coro):
+         if self._semaphore is None:
+             return await coro
+         async with self._semaphore:
+             res = await coro
+             if self._throttle:
+                 await asyncio.sleep(self._throttle)
+             return res
+
+     async def _a_score_one(
+         self,
+         prompt_configuration: PromptConfiguration,
+         golden: Union[Golden, ConversationalGolden],
+     ) -> float:
+         # Clone metrics to avoid shared-state
+         metrics = copy_metrics(self.metrics)
+         actual = await self.a_generate(prompt_configuration.prompts, golden)
+         test_case = self._golden_to_test_case(golden, actual)
+
+         per_metric: Dict[str, float] = {}
+         for metric in metrics:
+             score = await _a_measure_no_indicator(metric, test_case)
+             per_metric[metric.__class__.__name__] = float(score)
+         return self.objective_scalar.scalarize(per_metric)
+
+     def _score_one(
+         self,
+         prompt_configuration: PromptConfiguration,
+         golden: Union[Golden, ConversationalGolden],
+     ) -> float:
+         metrics = copy_metrics(self.metrics)
+         actual = self.generate(prompt_configuration.prompts, golden)
+         test_case = self._golden_to_test_case(golden, actual)
+
+         per_metric: Dict[str, float] = {}
+         for metric in metrics:
+             score = _measure_no_indicator(metric, test_case)
+             per_metric[metric.__class__.__name__] = float(score)
+         return self.objective_scalar.scalarize(per_metric)
+
+     def _select_module_id_from_prompts(
+         self, prompts_by_module: Dict[ModuleId, Prompt]
+     ) -> ModuleId:
+         """
+         Default module selection strategy:
+
+         - Prefer the synthetic '__module__' key when present
+         - Otherwise fall back to the first key in prompts_by_module.
+
+         Assumes `prompts_by_module` is non-empty; callers should validate that.
+         """
+         if self.DEFAULT_MODULE_ID in prompts_by_module:
+             return self.DEFAULT_MODULE_ID
+
+         # At this point we expect at least one key.
+         try:
+             return next(iter(prompts_by_module.keys()))
+         except StopIteration:
+             raise DeepEvalError(
+                 "Scorer._select_module_id_from_prompts(...) "
+                 "received an empty `prompts_by_module`. At least one Prompt is required."
+             )
+
+     def select_module(
+         self, prompt_configuration: PromptConfiguration
+     ) -> ModuleId:
+         return self._select_module_id_from_prompts(prompt_configuration.prompts)
+
+     async def a_select_module(
+         self, prompt_configuration: PromptConfiguration
+     ) -> ModuleId:
+         return self.select_module(prompt_configuration)
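
A usage sketch for the concrete `Scorer`, under stated assumptions: the `(prompt, golden) -> str` callback shape is inferred from the keyword arguments `invoke_model_callback` receives above, `Prompt(alias=..., text_template=...)` mirrors the constructor calls in the rewriter utils hunk, and `AnswerRelevancyMetric` stands in for any deepeval metric; none of this is verified against 3.7.5 beyond what this diff shows.

from deepeval.dataset.golden import Golden
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.optimizer.scorer import Scorer
from deepeval.prompt.prompt import Prompt


# Assumed callback shape: invoke_model_callback(...) passes `prompt` and `golden`
# as keyword arguments, so a (prompt, golden) -> str callable is one plausible form.
# A real application would call its own LLM pipeline here.
def my_app(prompt: Prompt, golden: Golden) -> str:
    return f"(stub answer for: {golden.input})"


scorer = Scorer(
    model_callback=my_app,
    metrics=[AnswerRelevancyMetric()],  # metrics still need an evaluation model/API key at measure time
    max_concurrent=5,
    throttle_seconds=0.0,
)

# generate() prefers the synthetic "__module__" key when selecting a prompt.
prompt = Prompt(alias="qa", text_template="Answer the question: {question}")
golden = Golden(input="What does DeepEval do?")
output = scorer.generate({"__module__": prompt}, golden)
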
deepeval/optimizer/scorer/utils.py
@@ -0,0 +1,30 @@
+ import inspect
+ from typing import Callable, Union
+
+ from deepeval.metrics import BaseConversationalMetric, BaseMetric
+ from deepeval.test_case import ConversationalTestCase, LLMTestCase
+
+
+ def _build_measure_kwargs(func: Callable) -> dict:
+     params = inspect.signature(func).parameters
+     kwargs = {}
+     for key in ("_show_indicator", "_in_component", "_log_metric_to_confident"):
+         if key in params:
+             kwargs[key] = False
+     return kwargs
+
+
+ def _measure_no_indicator(
+     metric: Union[BaseMetric, BaseConversationalMetric],
+     test_case: Union[LLMTestCase, ConversationalTestCase],
+ ):
+     kwargs = _build_measure_kwargs(metric.measure)
+     return metric.measure(test_case, **kwargs)
+
+
+ async def _a_measure_no_indicator(
+     metric: Union[BaseMetric, BaseConversationalMetric],
+     test_case: Union[LLMTestCase, ConversationalTestCase],
+ ):
+     kwargs = _build_measure_kwargs(metric.a_measure)
+     return await metric.a_measure(test_case, **kwargs)
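
The keyword filtering in `_build_measure_kwargs` can be illustrated with a standalone sketch that needs no deepeval import; the two stand-in `measure` functions below are hypothetical.

import inspect

def measure_a(test_case, _show_indicator=True, _in_component=False):
    ...

def measure_b(test_case):
    ...

def build_measure_kwargs(func) -> dict:
    # Same idea as above: only pass the private flags the callable actually declares.
    params = inspect.signature(func).parameters
    return {
        key: False
        for key in ("_show_indicator", "_in_component", "_log_metric_to_confident")
        if key in params
    }

print(build_measure_kwargs(measure_a))  # {'_show_indicator': False, '_in_component': False}
print(build_measure_kwargs(measure_b))  # {}
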