deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- deepeval/_version.py +1 -1
- deepeval/cli/test.py +1 -1
- deepeval/config/settings.py +102 -13
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/configs.py +1 -1
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +74 -27
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/argument_correctness/template.py +2 -2
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/bias/template.py +3 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/template.py +2 -2
- deepeval/metrics/conversational_dag/templates.py +4 -4
- deepeval/metrics/conversational_g_eval/template.py +4 -3
- deepeval/metrics/dag/templates.py +5 -5
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/hallucination/template.py +4 -4
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/misuse/template.py +2 -2
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/non_advice/template.py +2 -2
- deepeval/metrics/pii_leakage/template.py +2 -2
- deepeval/metrics/prompt_alignment/template.py +4 -4
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_violation/template.py +2 -2
- deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/toxicity/template.py +4 -4
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/turn_relevancy/template.py +2 -2
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +69 -32
- deepeval/models/embedding_models/local_embedding_model.py +39 -22
- deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
- deepeval/models/embedding_models/openai_embedding_model.py +50 -15
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +53 -20
- deepeval/models/llms/azure_model.py +140 -43
- deepeval/models/llms/deepseek_model.py +38 -23
- deepeval/models/llms/gemini_model.py +222 -103
- deepeval/models/llms/grok_model.py +39 -27
- deepeval/models/llms/kimi_model.py +39 -23
- deepeval/models/llms/litellm_model.py +103 -45
- deepeval/models/llms/local_model.py +35 -22
- deepeval/models/llms/ollama_model.py +129 -17
- deepeval/models/llms/openai_model.py +151 -50
- deepeval/models/llms/portkey_model.py +149 -0
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +94 -4
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/optimizer/algorithms/copro/copro.py +836 -0
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/optimizer/algorithms/simba/simba.py +999 -0
- deepeval/optimizer/algorithms/simba/types.py +15 -0
- deepeval/optimizer/configs.py +31 -0
- deepeval/optimizer/policies.py +227 -0
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/optimizer/utils.py +480 -0
- deepeval/prompt/prompt.py +7 -6
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/cache.py +2 -0
- deepeval/test_run/test_run.py +9 -4
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +89 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -334
- deepeval/models/mlllms/gemini_model.py +0 -284
- deepeval/models/mlllms/ollama_model.py +0 -144
- deepeval/models/mlllms/openai_model.py +0 -258
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/optimizer/rewriter/utils.py (new file)
@@ -0,0 +1,214 @@
+from __future__ import annotations
+import json
+import random
+from typing import List, Optional, Tuple, Union
+
+from deepeval.errors import DeepEvalError
+from deepeval.optimizer.utils import (
+    validate_int_in_range,
+    validate_instance,
+)
+from deepeval.optimizer.configs import (
+    MutationConfig,
+    MutationTargetType,
+)
+from deepeval.prompt.api import PromptType, PromptMessage
+from deepeval.prompt.prompt import Prompt
+
+
+##################
+# Common Helpers #
+##################
+def _summarize_prompt_for_rewrite(old_prompt: Prompt, max_chars: int) -> str:
+    """
+    Produce a human-readable summary of the current prompt for the
+    rewriter instruction block.
+
+    - For TEXT prompts, this is just `text_template`.
+    - For LIST prompts, this is a numbered list of (role, content) lines.
+    """
+
+    # LIST prompts: show each message with its role.
+    if old_prompt.type is PromptType.LIST and old_prompt.messages_template:
+        lines: List[str] = []
+        for message_index, message in enumerate(old_prompt.messages_template):
+            role = message.role or ""
+            content = message.content or ""
+            lines.append(f"[{message_index+1}] ({role}) {content}")
+        combined = "\n".join(lines)
+        return combined[:max_chars]
+
+    # Since it is not a LIST prompt, just use text_template.
+    text = old_prompt.text_template or ""
+    return text[:max_chars]
+
+
+def _select_list_target_index(
+    messages: List[PromptMessage],
+    config: MutationConfig,
+    random_state: random.Random,
+) -> int:
+    """
+    Select which list message index to rewrite, based on PromptListMutationConfig.
+
+    Rules:
+    - Start with all indices in scope.
+    - If target_role is set, restrict candidates to messages with that role
+      (case insensitive). If no messages match, fall back to all indices.
+    - target_type:
+        * FIRST: pick the first candidate index.
+        * RANDOM: pick a candidate via random_state.choice(candidates).
+        * FIXED_INDEX: use target_index when valid (and consistent with role
+          filter), otherwise fall back to the first candidate.
+    """
+    if not messages:
+        raise DeepEvalError(
+            "Rewriter._select_list_target_index expected at least one "
+            "message, but received an empty message list."
+        )
+
+    validate_instance(
+        component="Rewriter._select_list_target_index",
+        param_name="target_type",
+        value=config.target_type,
+        expected_types=MutationTargetType,
+    )
+
+    messages_length = len(messages)
+    candidate_indices = list(range(messages_length))
+
+    # Optional case insensitive role restriction
+    if config.target_role:
+        target_role_lower = config.target_role.lower()
+        filtered = [
+            index
+            for index, message in enumerate(messages)
+            if (message.role or "").lower() == target_role_lower
+        ]
+        if filtered:
+            candidate_indices = filtered
+
+    target_type = config.target_type
+
+    if target_type is MutationTargetType.RANDOM:
+        return random_state.choice(candidate_indices)
+
+    if target_type is MutationTargetType.FIXED_INDEX:
+        index = validate_int_in_range(
+            component="Rewriter._select_list_target_index",
+            param_name="target_index",
+            value=int(config.target_index),
+            min_inclusive=0,
+            max_exclusive=len(candidate_indices),
+        )
+        return candidate_indices[index]
+
+    # if you got this error it means that a new PromptListMutationTargetType was added,
+    # but not handled above
+    raise DeepEvalError(
+        "Rewriter._select_list_target_index received unsupported "
+        f"target_type={target_type!r}. Expected RANDOM or FIXED_INDEX."
+    )
+
+
+def _apply_rewritten_prompt(
+    old_prompt: Prompt,
+    new_text: str,
+    random_state: random.Random,
+    list_mutation_config: Optional[MutationConfig] = None,
+) -> Prompt:
+    """
+    Apply the rewritten text to a Prompt, preserving representation:
+
+    - For TEXT prompts, update `text_template`.
+    - For LIST prompts, rewrite the content of a single message while
+      keeping the number of messages the same.
+    - Preserve additonal Prompt meta such as `label` and `interpolation_type`
+    """
+    if not new_text:
+        return old_prompt
+
+    if old_prompt.type is PromptType.LIST and old_prompt.messages_template:
+        messages = old_prompt.messages_template
+        config = list_mutation_config or MutationConfig()
+
+        target_index = _select_list_target_index(
+            messages=messages,
+            config=config,
+            random_state=random_state,
+        )
+
+        new_messages: List[PromptMessage] = []
+        for message_index, message in enumerate(messages):
+            if message_index == target_index:
+                # Preserve the original role; do not inject a new one.
+                new_messages.append(
+                    PromptMessage(
+                        role=message.role,
+                        content=new_text,
+                    )
+                )
+            else:
+                new_messages.append(message)
+
+        new_prompt = Prompt(
+            alias=old_prompt.alias,
+            text_template=None,
+            messages_template=new_messages,
+            model_settings=old_prompt.model_settings,
+            output_type=old_prompt.output_type,
+            output_schema=old_prompt.output_schema,
+        )
+
+    else:
+        # Since it is not LIST, it must be TEXT type
+        new_prompt = Prompt(
+            alias=old_prompt.alias,
+            text_template=new_text,
+            model_settings=old_prompt.model_settings,
+            output_type=old_prompt.output_type,
+            output_schema=old_prompt.output_schema,
+        )
+
+    new_prompt.label = old_prompt.label
+    new_prompt.interpolation_type = old_prompt.interpolation_type
+    return new_prompt
+
+
+def _compose_prompt_messages(system_message: str, user_message: str) -> str:
+    """
+    Join system and user messages into a single prompt string.
+    Strips surrounding whitespace from each part; if the system message is
+    empty or absent, returns just the user message.
+    """
+    system_text = (system_message or "").strip()
+    user_text = (user_message or "").strip()
+    return f"{system_text}\n\n{user_text}" if system_text else user_text
+
+
+def _normalize_llm_output_to_text(
+    result: Union[str, Tuple[Union[str, dict], float], dict],
+) -> str:
+    """
+    Convert a DeepEval LLM generate() / a_generate() result to a clean string.
+
+    Accepted inputs:
+    - str -> returned as trimmed
+    - (str|dict, float_cost) -> first element extracted and normalized
+    - dict (e.g. JSON mode) -> JSON serialized with ensure_ascii=False
+
+    Fallback: if serialization fails, str(value).strip() is used.
+    """
+    output_value: Union[str, dict]
+    if isinstance(result, tuple):
+        output_value = result[0]
+    else:
+        output_value = result
+
+    if isinstance(output_value, str):
+        return output_value.strip()
+
+    try:
+        return json.dumps(output_value, ensure_ascii=False)
+    except Exception:
+        return str(output_value).strip()
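As a sanity check on the two pure helpers at the bottom of this file, here is a small usage sketch that is not part of the diff; it assumes this hunk is `deepeval/optimizer/rewriter/utils.py` (the only +214-line file in the listing above) and only calls functions defined in the hunk itself.

```python
from deepeval.optimizer.rewriter.utils import (
    _compose_prompt_messages,
    _normalize_llm_output_to_text,
)

# System + user text joined with a blank line in between.
combined = _compose_prompt_messages(
    system_message="You rewrite prompts to be clearer.",
    user_message="Rewrite: 'answer the question good'",
)
assert combined.startswith("You rewrite prompts")

# generate()/a_generate() results may be a str, a (str | dict, cost) tuple,
# or a dict; all are normalized to a trimmed string.
assert _normalize_llm_output_to_text("  rewritten text  ") == "rewritten text"
assert _normalize_llm_output_to_text(({"text": "hi"}, 0.001)) == '{"text": "hi"}'
```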
deepeval/optimizer/scorer/base.py (new file)
@@ -0,0 +1,86 @@
+from abc import ABC, abstractmethod
+from typing import Union, List
+
+from deepeval.optimizer.types import PromptConfiguration, ScoreVector
+from deepeval.dataset.golden import Golden, ConversationalGolden
+
+ModuleId = str
+
+
+class BaseScorer(ABC):
+    """
+    Base scorer contract used by optimization runners.
+
+    Runners call into this adapter to:
+    - compute scores per-instance on some subset (score_on_pareto),
+    - compute minibatch means for selection and acceptance,
+    - generate feedback text used by the Rewriter.
+    """
+
+    # Sync
+    @abstractmethod
+    def score_pareto(
+        self,
+        prompt_configuration: PromptConfiguration,
+        d_pareto: Union[List[Golden], List[ConversationalGolden]],
+    ) -> ScoreVector:
+        """Return per-instance scores on D_pareto."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def score_minibatch(
+        self,
+        prompt_configuration: PromptConfiguration,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> float:
+        """Return average score μ on a minibatch from D_feedback."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_minibatch_feedback(
+        self,
+        prompt_configuration: PromptConfiguration,
+        module: ModuleId,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> str:
+        """Return μ_f text for the module (metric.reason + traces, etc.)."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def select_module(
+        self, prompt_configuration: PromptConfiguration
+    ) -> ModuleId:
+        """Pick a module to mutate."""
+        raise NotImplementedError
+
+    # Async
+    @abstractmethod
+    async def a_score_pareto(
+        self,
+        prompt_configuration: PromptConfiguration,
+        d_pareto: Union[List[Golden], List[ConversationalGolden]],
+    ) -> ScoreVector:
+        raise NotImplementedError
+
+    @abstractmethod
+    async def a_score_minibatch(
+        self,
+        prompt_configuration: PromptConfiguration,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> float:
+        raise NotImplementedError
+
+    @abstractmethod
+    async def a_get_minibatch_feedback(
+        self,
+        prompt_configuration: PromptConfiguration,
+        module: ModuleId,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> str:
+        raise NotImplementedError
+
+    @abstractmethod
+    async def a_select_module(
+        self, prompt_configuration: PromptConfiguration
+    ) -> ModuleId:
+        raise NotImplementedError
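For orientation, a minimal hypothetical subclass that satisfies this contract could look like the sketch below; it is not part of the diff. The `ConstantScorer` name and the fixed scores are illustrative only, and the assumption that `prompt_configuration.prompts` maps module ids to `Prompt` objects is taken from how `scorer.py` (the next hunk) uses the same attribute.

```python
from typing import List

from deepeval.optimizer.scorer.base import BaseScorer, ModuleId


class ConstantScorer(BaseScorer):
    """Toy scorer returning fixed scores; a real adapter runs an app + metrics."""

    def score_pareto(self, prompt_configuration, d_pareto) -> List[float]:
        # One score per instance in D_pareto.
        return [1.0 for _ in d_pareto]

    def score_minibatch(self, prompt_configuration, minibatch) -> float:
        # Minibatch mean used for candidate selection/acceptance.
        return 1.0

    def get_minibatch_feedback(self, prompt_configuration, module, minibatch) -> str:
        # Free-form feedback text handed to the Rewriter.
        return "no feedback available"

    def select_module(self, prompt_configuration) -> ModuleId:
        # Assumption: prompt_configuration.prompts is the module -> Prompt mapping.
        return next(iter(prompt_configuration.prompts))

    # Async variants simply defer to the sync ones in this sketch.
    async def a_score_pareto(self, prompt_configuration, d_pareto) -> List[float]:
        return self.score_pareto(prompt_configuration, d_pareto)

    async def a_score_minibatch(self, prompt_configuration, minibatch) -> float:
        return self.score_minibatch(prompt_configuration, minibatch)

    async def a_get_minibatch_feedback(
        self, prompt_configuration, module, minibatch
    ) -> str:
        return self.get_minibatch_feedback(prompt_configuration, module, minibatch)

    async def a_select_module(self, prompt_configuration) -> ModuleId:
        return self.select_module(prompt_configuration)
```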
deepeval/optimizer/scorer/scorer.py (new file)
@@ -0,0 +1,316 @@
+from __future__ import annotations
+import asyncio
+import copy
+from typing import (
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Union,
+)
+
+from deepeval.dataset.golden import Golden, ConversationalGolden
+from deepeval.dataset.utils import (
+    convert_goldens_to_test_cases,
+    convert_convo_goldens_to_convo_test_cases,
+)
+from deepeval.errors import DeepEvalError
+from deepeval.metrics import (
+    BaseMetric,
+    BaseConversationalMetric,
+)
+from deepeval.metrics.utils import copy_metrics
+from deepeval.test_case import (
+    LLMTestCase,
+    ConversationalTestCase,
+    Turn,
+)
+from deepeval.prompt.prompt import Prompt
+
+from deepeval.optimizer.types import (
+    ModelCallback,
+    PromptConfiguration,
+    Objective,
+    MeanObjective,
+    ModuleId,
+)
+from deepeval.optimizer.scorer.base import BaseScorer
+from deepeval.optimizer.utils import (
+    validate_callback,
+    validate_metrics,
+    invoke_model_callback,
+    a_invoke_model_callback,
+)
+from deepeval.optimizer.scorer.utils import (
+    _measure_no_indicator,
+    _a_measure_no_indicator,
+)
+
+
+class Scorer(BaseScorer):
+    """
+    Scores prompts by running model_callback, building test cases,
+    running metrics, and aggregating scores.
+    """
+
+    DEFAULT_MODULE_ID: ModuleId = "__module__"
+
+    def __init__(
+        self,
+        model_callback: ModelCallback,
+        metrics: Union[List[BaseMetric], List[BaseConversationalMetric]],
+        max_concurrent: int,
+        throttle_seconds: float,
+        objective_scalar: Objective = MeanObjective(),
+    ):
+        self.model_callback = validate_callback(
+            component="Scorer",
+            model_callback=model_callback,
+        )
+        self.metrics = validate_metrics(component="Scorer", metrics=metrics)
+        self.objective_scalar = objective_scalar
+        self._semaphore = asyncio.Semaphore(max_concurrent)
+        self._throttle = float(throttle_seconds)
+
+    ########################
+    # generation & scoring #
+    ########################
+
+    def generate(
+        self,
+        prompts_by_module: Dict[ModuleId, Prompt],
+        golden: Union[Golden, ConversationalGolden],
+    ) -> str:
+        module_id = self._select_module_id_from_prompts(prompts_by_module)
+        prompt = prompts_by_module.get(module_id) or next(
+            iter(prompts_by_module.values())
+        )
+
+        return invoke_model_callback(
+            model_callback=self.model_callback,
+            prompt=prompt,
+            golden=golden,
+        )
+
+    async def a_generate(
+        self,
+        prompts_by_module: Dict[ModuleId, Prompt],
+        golden: Union[Golden, ConversationalGolden],
+    ) -> str:
+        module_id = self._select_module_id_from_prompts(prompts_by_module)
+        prompt = prompts_by_module.get(module_id) or next(
+            iter(prompts_by_module.values())
+        )
+
+        return await a_invoke_model_callback(
+            model_callback=self.model_callback,
+            prompt=prompt,
+            golden=golden,
+        )
+
+    def score_pareto(
+        self,
+        prompt_configuration: PromptConfiguration,
+        d_pareto: Union[List[Golden], List[ConversationalGolden]],
+    ) -> List[float]:
+        return [
+            self._score_one(prompt_configuration, golden) for golden in d_pareto
+        ]
+
+    def score_minibatch(
+        self,
+        prompt_configuration: PromptConfiguration,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> float:
+        if not minibatch:
+            return 0.0
+
+        scores = [
+            self._score_one(prompt_configuration, golden)
+            for golden in minibatch
+        ]
+        return sum(scores) / len(scores)
+
+    def get_minibatch_feedback(
+        self,
+        prompt_configuration: PromptConfiguration,
+        module: ModuleId,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> str:
+        # default metric feedback (μ_f): concat metric.reason across minibatch and cap length
+        reasons: List[str] = []
+        for golden in minibatch:
+            actual = self.generate(prompt_configuration.prompts, golden)
+            test_case = self._golden_to_test_case(golden, actual)
+            for metric in copy_metrics(self.metrics):
+                _measure_no_indicator(metric=metric, test_case=test_case)
+                if metric.reason:
+                    reasons.append(str(metric.reason))
+        if not reasons:
+            return ""
+        unique: List[str] = []
+        seen = set()
+        for reason in reasons:
+            if reason not in seen:
+                unique.append(reason)
+                seen.add(reason)
+        return "\n---\n".join(
+            unique[:8]
+        )  # TODO: Make how much feedback configurable
+
+    async def a_score_pareto(
+        self,
+        prompt_configuration: PromptConfiguration,
+        d_pareto: Union[List[Golden], List[ConversationalGolden]],
+    ) -> List[float]:
+        tasks = [
+            self._bounded(self._a_score_one(prompt_configuration, golden))
+            for golden in d_pareto
+        ]
+        return await asyncio.gather(*tasks)
+
+    async def a_score_minibatch(
+        self,
+        prompt_configuration: PromptConfiguration,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> float:
+        tasks = [
+            self._bounded(self._a_score_one(prompt_configuration, golden))
+            for golden in minibatch
+        ]
+        scores = await asyncio.gather(*tasks)
+        return sum(scores) / len(scores) if scores else 0.0
+
+    async def a_get_minibatch_feedback(
+        self,
+        prompt_configuration: PromptConfiguration,
+        module: ModuleId,
+        minibatch: Union[List[Golden], List[ConversationalGolden]],
+    ) -> str:
+        async def reasons_one(golden) -> List[str]:
+            # Clone per task to avoid shared state
+            metrics = copy_metrics(self.metrics)
+            # metrics = self.metrics
+            actual = await self.a_generate(prompt_configuration.prompts, golden)
+            test_case = self._golden_to_test_case(golden, actual)
+            out: List[str] = []
+            for metric in metrics:
+                await _a_measure_no_indicator(metric, test_case)
+                if metric.reason:
+                    out.append(str(metric.reason))
+            return out
+
+        tasks = [self._bounded(reasons_one(golden)) for golden in minibatch]
+        nested = await asyncio.gather(*tasks)
+        reasons: List[str] = [reason for sub in nested for reason in sub]
+        if not reasons:
+            return ""
+        unique: List[str] = []
+        seen = set()
+        for reason in reasons:
+            if reason not in seen:
+                unique.append(reason)
+                seen.add(reason)
+        return "\n---\n".join(unique[:8])
+
+    ###################
+    # scoring helpers #
+    ###################
+
+    def _golden_to_test_case(
+        self,
+        golden: Union[Golden, ConversationalGolden],
+        actual: str,
+    ) -> Union[LLMTestCase, ConversationalTestCase]:
+        """Convert a golden + actual output into a test case for metrics."""
+        if isinstance(golden, Golden):
+            golden.actual_output = actual
+            return convert_goldens_to_test_cases([golden])[0]
+
+        if isinstance(golden, ConversationalGolden):
+            # Build turns with actual output as assistant response
+            turns: List[Turn] = list(golden.turns or [])
+            if turns and turns[-1].role == "assistant":
+                turns[-1] = Turn(role="assistant", content=actual)
+            elif turns:
+                turns.append(Turn(role="assistant", content=actual))
+            else:
+                turns = [
+                    Turn(role="assistant", content=actual),
+                ]
+
+            golden.turns = turns
+            return convert_convo_goldens_to_convo_test_cases([golden])[0]
+
+    async def _bounded(self, coro):
+        if self._semaphore is None:
+            return await coro
+        async with self._semaphore:
+            res = await coro
+            if self._throttle:
+                await asyncio.sleep(self._throttle)
+            return res
+
+    async def _a_score_one(
+        self,
+        prompt_configuration: PromptConfiguration,
+        golden: Union[Golden, ConversationalGolden],
+    ) -> float:
+        # Clone metrics to avoid shared-state
+        metrics = copy_metrics(self.metrics)
+        actual = await self.a_generate(prompt_configuration.prompts, golden)
+        test_case = self._golden_to_test_case(golden, actual)
+
+        per_metric: Dict[str, float] = {}
+        for metric in metrics:
+            score = await _a_measure_no_indicator(metric, test_case)
+            per_metric[metric.__class__.__name__] = float(score)
+        return self.objective_scalar.scalarize(per_metric)
+
+    def _score_one(
+        self,
+        prompt_configuration: PromptConfiguration,
+        golden: Union[Golden, ConversationalGolden],
+    ) -> float:
+        metrics = copy_metrics(self.metrics)
+        actual = self.generate(prompt_configuration.prompts, golden)
+        test_case = self._golden_to_test_case(golden, actual)
+
+        per_metric: Dict[str, float] = {}
+        for metric in metrics:
+            score = _measure_no_indicator(metric, test_case)
+            per_metric[metric.__class__.__name__] = float(score)
+        return self.objective_scalar.scalarize(per_metric)
+
+    def _select_module_id_from_prompts(
+        self, prompts_by_module: Dict[ModuleId, Prompt]
+    ) -> ModuleId:
+        """
+        Default module selection strategy:
+
+        - Prefer the synthetic '__module__' key when present
+        - Otherwise fall back to the first key in prompts_by_module.
+
+        Assumes `prompts_by_module` is non-empty; callers should validate that.
+        """
+        if self.DEFAULT_MODULE_ID in prompts_by_module:
+            return self.DEFAULT_MODULE_ID
+
+        # At this point we expect at least one key.
+        try:
+            return next(iter(prompts_by_module.keys()))
+        except StopIteration:
+            raise DeepEvalError(
+                "Scorer._select_module_id_from_prompts(...) "
+                "received an empty `prompts_by_module`. At least one Prompt is required."
+            )
+
+    def select_module(
+        self, prompt_configuration: PromptConfiguration
+    ) -> ModuleId:
+        return self._select_module_id_from_prompts(prompt_configuration.prompts)
+
+    async def a_select_module(
+        self, prompt_configuration: PromptConfiguration
+    ) -> ModuleId:
+        return self.select_module(prompt_configuration)
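A hedged sketch of how this `Scorer` might be constructed and used; it is not part of the diff. The `(prompt, golden)` callback signature is an assumption here, since the real dispatch happens in `invoke_model_callback()` from `deepeval/optimizer/utils.py`, which this hunk only imports; `Golden`, `Prompt`, and `AnswerRelevancyMetric` are existing deepeval APIs, and `"qa-prompt"` is a made-up alias.

```python
from deepeval.dataset.golden import Golden
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.optimizer.scorer.scorer import Scorer
from deepeval.prompt.prompt import Prompt


def model_callback(prompt: Prompt, golden: Golden) -> str:
    # Run your own LLM app here and return the text the metrics should judge.
    return f"(model answer for: {golden.input})"


scorer = Scorer(
    model_callback=model_callback,
    metrics=[AnswerRelevancyMetric()],
    max_concurrent=5,      # bound for the internal asyncio.Semaphore
    throttle_seconds=0.0,  # optional sleep between bounded async calls
)

golden = Golden(input="What is the capital of France?")
prompts = {
    Scorer.DEFAULT_MODULE_ID: Prompt(
        alias="qa-prompt", text_template="Answer the question concisely."
    )
}

# generate() picks the prompt for the selected module and invokes the callback.
actual_output = scorer.generate(prompts, golden)

# score_minibatch()/score_pareto() additionally take a PromptConfiguration,
# which is defined in deepeval/optimizer/types.py and not shown in this diff.
```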
deepeval/optimizer/scorer/utils.py (new file)
@@ -0,0 +1,30 @@
+import inspect
+from typing import Callable, Union
+
+from deepeval.metrics import BaseConversationalMetric, BaseMetric
+from deepeval.test_case import ConversationalTestCase, LLMTestCase
+
+
+def _build_measure_kwargs(func: Callable) -> dict:
+    params = inspect.signature(func).parameters
+    kwargs = {}
+    for key in ("_show_indicator", "_in_component", "_log_metric_to_confident"):
+        if key in params:
+            kwargs[key] = False
+    return kwargs
+
+
+def _measure_no_indicator(
+    metric: Union[BaseMetric, BaseConversationalMetric],
+    test_case: Union[LLMTestCase, ConversationalTestCase],
+):
+    kwargs = _build_measure_kwargs(metric.measure)
+    return metric.measure(test_case, **kwargs)
+
+
+async def _a_measure_no_indicator(
+    metric: Union[BaseMetric, BaseConversationalMetric],
+    test_case: Union[LLMTestCase, ConversationalTestCase],
+):
+    kwargs = _build_measure_kwargs(metric.a_measure)
+    return await metric.a_measure(test_case, **kwargs)
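A brief usage sketch (not part of the diff) showing what these wrappers buy you: they call `measure()`/`a_measure()` with the indicator and logging flags set to `False`, but only pass the flags that the metric's signature actually declares.

```python
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.optimizer.scorer.utils import _measure_no_indicator
from deepeval.test_case import LLMTestCase

metric = AnswerRelevancyMetric()
test_case = LLMTestCase(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
)

# Equivalent to metric.measure(test_case, _show_indicator=False, ...) for the
# private flags this metric's measure() signature supports.
score = _measure_no_indicator(metric=metric, test_case=test_case)
```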