deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/test.py +1 -1
- deepeval/config/settings.py +102 -13
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/configs.py +1 -1
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +74 -27
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/argument_correctness/template.py +2 -2
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/bias/template.py +3 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/template.py +2 -2
- deepeval/metrics/conversational_dag/templates.py +4 -4
- deepeval/metrics/conversational_g_eval/template.py +4 -3
- deepeval/metrics/dag/templates.py +5 -5
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/hallucination/template.py +4 -4
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/misuse/template.py +2 -2
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/non_advice/template.py +2 -2
- deepeval/metrics/pii_leakage/template.py +2 -2
- deepeval/metrics/prompt_alignment/template.py +4 -4
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_violation/template.py +2 -2
- deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/toxicity/template.py +4 -4
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/turn_relevancy/template.py +2 -2
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +69 -32
- deepeval/models/embedding_models/local_embedding_model.py +39 -22
- deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
- deepeval/models/embedding_models/openai_embedding_model.py +50 -15
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +53 -20
- deepeval/models/llms/azure_model.py +140 -43
- deepeval/models/llms/deepseek_model.py +38 -23
- deepeval/models/llms/gemini_model.py +222 -103
- deepeval/models/llms/grok_model.py +39 -27
- deepeval/models/llms/kimi_model.py +39 -23
- deepeval/models/llms/litellm_model.py +103 -45
- deepeval/models/llms/local_model.py +35 -22
- deepeval/models/llms/ollama_model.py +129 -17
- deepeval/models/llms/openai_model.py +151 -50
- deepeval/models/llms/portkey_model.py +149 -0
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +94 -4
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/optimizer/algorithms/copro/copro.py +836 -0
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/optimizer/algorithms/simba/simba.py +999 -0
- deepeval/optimizer/algorithms/simba/types.py +15 -0
- deepeval/optimizer/configs.py +31 -0
- deepeval/optimizer/policies.py +227 -0
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/optimizer/utils.py +480 -0
- deepeval/prompt/prompt.py +7 -6
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/cache.py +2 -0
- deepeval/test_run/test_run.py +9 -4
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +89 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -334
- deepeval/models/mlllms/gemini_model.py +0 -284
- deepeval/models/mlllms/ollama_model.py +0 -144
- deepeval/models/mlllms/openai_model.py +0 -258
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/optimizer/algorithms/simba/types.py (new file)
@@ -0,0 +1,15 @@
+from enum import Enum
+
+
+class SIMBAStrategy(str, Enum):
+    """
+    Edit strategies used by SIMBA-style optimization.
+
+    - APPEND_DEMO: append one or more input/output demos distilled from the
+      current minibatch, similar in spirit to DSPy's `append_a_demo`.
+    - APPEND_RULE: append a concise natural-language rule distilled from
+      feedback, similar in spirit to DSPy's `append_a_rule`.
+    """
+
+    APPEND_DEMO = "append_demo"
+    APPEND_RULE = "append_rule"
deepeval/optimizer/configs.py (new file)
@@ -0,0 +1,31 @@
+from __future__ import annotations
+from enum import Enum
+from pydantic import BaseModel, Field, conint
+from typing import Optional
+from deepeval.evaluate.configs import AsyncConfig
+
+
+class DisplayConfig(BaseModel):
+    show_indicator: bool = True
+    announce_ties: bool = Field(
+        False, description="Print a one-line note when a tie is detected"
+    )
+
+
+class MutationTargetType(Enum):
+    RANDOM = "random"
+    FIXED_INDEX = "fixed_index"
+
+
+# default all messages
+class MutationConfig(BaseModel):
+    target_type: MutationTargetType = MutationTargetType.RANDOM
+    # should be list
+    target_role: Optional[str] = Field(
+        default=None,
+        description="If set, restricts candidates to messages with this role (case insensitive).",
+    )
+    target_index: conint(ge=0) = Field(
+        default=0,
+        description="0-based index used when target_type == FIXED_INDEX.",
+    )
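For reference, a small sketch of how these new config models might be instantiated. Field names and defaults are taken from the hunk above; the import path follows the `deepeval/optimizer/configs.py` entry in the file list.

```python
from deepeval.optimizer.configs import (
    DisplayConfig,
    MutationConfig,
    MutationTargetType,
)

# Show the progress indicator and also announce ties when they occur.
display_config = DisplayConfig(show_indicator=True, announce_ties=True)

# Rewrite only the third assistant message instead of a randomly chosen one.
mutation_config = MutationConfig(
    target_type=MutationTargetType.FIXED_INDEX,
    target_role="assistant",
    target_index=2,
)
```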
deepeval/optimizer/policies.py (new file)
@@ -0,0 +1,227 @@
+from __future__ import annotations
+from enum import Enum
+import random
+from typing import Dict, List, Sequence, Optional, Tuple
+
+from deepeval.errors import DeepEvalError
+from deepeval.optimizer.types import PromptConfigurationId, ScoreTable
+
+
+def _is_dominated(
+    candidate_scores: List[float], other_scores: List[float]
+) -> bool:
+    """
+    Return True if `candidate_scores` is dominated by `other_scores`:
+    (other >= candidate on all dimensions) AND (other > candidate on at least one).
+    """
+    other_ge_everywhere = all(
+        other_score >= candidate_score
+        for candidate_score, other_score in zip(candidate_scores, other_scores)
+    )
+    other_gt_somewhere = any(
+        other_score > candidate_score
+        for candidate_score, other_score in zip(candidate_scores, other_scores)
+    )
+    return other_ge_everywhere and other_gt_somewhere
+
+
+def pareto_frontier(
+    prompt_configuration_ids: Sequence[PromptConfigurationId],
+    score_table: ScoreTable,
+) -> List[PromptConfigurationId]:
+    """
+    Compute the set of non-dominated candidates given their scores.
+    Returns PromptConfigurationIds on the Pareto frontier.
+    """
+    frontier: List[PromptConfigurationId] = []
+    for prompt_configuration_id in prompt_configuration_ids:
+        candidate_vector = score_table[prompt_configuration_id]
+        dominated = False
+
+        # If any existing frontier member dominates this candidate, skip it.
+        for frontier_id in frontier:
+            if _is_dominated(candidate_vector, score_table[frontier_id]):
+                dominated = True
+                break
+        if dominated:
+            continue
+
+        # Remove any frontier member that is dominated by this candidate.
+        frontier = [
+            f_id
+            for f_id in frontier
+            if not _is_dominated(score_table[f_id], candidate_vector)
+        ]
+        frontier.append(prompt_configuration_id)
+
+    return frontier
+
+
+def frequency_weights(
+    score_table: ScoreTable,
+) -> Dict[PromptConfigurationId, int]:
+    """
+    Build best sets, remove dominated candidates, and count appearances.
+
+    Returns:
+        A map {prompt_configuration_id -> frequency} counting how often each
+        globally non-dominated prompt configuration appears among the instance
+        Pareto sets.
+    """
+    if not score_table:
+        return {}
+
+    # Assume all score vectors have the same length.
+    example_vector = next(iter(score_table.values()))
+    num_instances = len(example_vector)
+    all_candidates = list(score_table.keys())
+
+    per_instance_frontiers: List[List[PromptConfigurationId]] = []
+    for i in range(num_instances):
+        best_score_i = max(
+            score_table[prompt_configuration_id][i]
+            for prompt_configuration_id in all_candidates
+        )
+        winners_i = [
+            prompt_configuration_id
+            for prompt_configuration_id in all_candidates
+            if score_table[prompt_configuration_id][i] == best_score_i
+        ]
+
+        # Instance frontier among winners. We pass 1-D score vectors
+        # so this reduces to "all candidates with the max score at instance i",
+        instance_frontier = pareto_frontier(
+            winners_i,
+            {
+                prompt_configuration_id: [
+                    score_table[prompt_configuration_id][i]
+                ]
+                for prompt_configuration_id in winners_i
+            },
+        )
+        per_instance_frontiers.append(instance_frontier)
+
+    # Global candidate set appearing in any winners
+    candidate_union = sorted(
+        {
+            prompt_configuration_id
+            for winners in per_instance_frontiers
+            for prompt_configuration_id in winners
+        }
+    )
+    global_frontier = pareto_frontier(candidate_union, score_table)
+
+    # Count frequency only for candidates on the global frontier
+    frequency_by_prompt_config: Dict[PromptConfigurationId, int] = {
+        prompt_configuration_id: 0
+        for prompt_configuration_id in global_frontier
+    }
+    for winners in per_instance_frontiers:
+        for prompt_configuration_id in winners:
+            if prompt_configuration_id in frequency_by_prompt_config:
+                frequency_by_prompt_config[prompt_configuration_id] += 1
+
+    return frequency_by_prompt_config
+
+
+def sample_by_frequency(
+    frequency_by_prompt_config: Dict[PromptConfigurationId, int],
+    *,
+    random_state: random.Random,
+) -> PromptConfigurationId:
+    """
+    Sample a prompt configuration id with probability proportional to its frequency.
+    Falls back to uniform if the total weight is zero.
+    """
+    if not frequency_by_prompt_config:
+        raise DeepEvalError("No prompt configurations to sample.")
+
+    items = list(frequency_by_prompt_config.items())
+    total_weight = sum(weight for _, weight in items)
+
+    if total_weight == 0:
+        # Uniform fallback
+        return random_state.choice(
+            [prompt_configuration_id for prompt_configuration_id, _ in items]
+        )
+
+    r = random_state.uniform(0, total_weight)
+    cumulative = 0.0
+    for prompt_configuration_id, weight in items:
+        cumulative += weight
+        if r <= cumulative:
+            return prompt_configuration_id
+    return items[-1][0]
+
+
+def select_prompt_configuration_pareto(
+    score_table: ScoreTable, *, random_state: random.Random
+) -> PromptConfigurationId:
+    """
+    Frequency weighted sampling over the Pareto winners,
+    restricted to globally non-dominated prompt configurations. A configuration
+    is globally non-dominated if no other configuration dominates it using
+    the full vector.
+    """
+    freq = frequency_weights(score_table)
+    return sample_by_frequency(freq, random_state=random_state)
+
+
+class TieBreaker(str, Enum):
+    PREFER_ROOT = "prefer_root"
+    PREFER_CHILD = "prefer_child"
+    RANDOM = "random"
+
+
+def pick_best_with_ties(
+    totals: Dict[PromptConfigurationId, float],
+    parents_by_id: Dict[PromptConfigurationId, Optional[PromptConfigurationId]],
+    *,
+    random_state: random.Random,
+    tie_tolerance: float = 1e-9,
+    policy: TieBreaker = TieBreaker.PREFER_ROOT,
+) -> Tuple[PromptConfigurationId, List[PromptConfigurationId], float]:
+    """
+    Choose the best candidate by aggregate score with deterministic tie handling.
+
+    Returns: (chosen_id, tied_ids, max_score)
+    - tied_ids includes everyone within tie_tolerance of max_score
+    """
+    if not totals:
+        raise DeepEvalError("No candidate prompt configuration to choose from.")
+
+    max_score = max(totals.values())
+    tied = [
+        prompt_configuration_id
+        for prompt_configuration_id, score in totals.items()
+        if abs(score - max_score) <= tie_tolerance
+    ]
+
+    if len(tied) == 1:
+        return tied[0], tied, max_score
+
+    # Resolve tie by policy
+    if policy == TieBreaker.PREFER_CHILD:
+        # Prefer any non root. When multiple children exist, use the most recent
+        child_ids = [
+            prompt_configuration_id
+            for prompt_configuration_id in tied
+            if parents_by_id.get(prompt_configuration_id) is not None
+        ]
+        if child_ids:
+            # choose the newest child deterministically by order
+            for prompt_configuration_id in reversed(list(totals.keys())):
+                if prompt_configuration_id in child_ids:
+                    return prompt_configuration_id, tied, max_score
+
+    if policy == TieBreaker.RANDOM:
+        return random_state.choice(tied), tied, max_score
+
+    # by default prefer a root if present, otherwise the first tied
+    root_ids = [
+        prompt_configuration_id
+        for prompt_configuration_id in tied
+        if parents_by_id.get(prompt_configuration_id) is None
+    ]
+    chosen = root_ids[0] if root_ids else tied[0]
+    return chosen, tied, max_score
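A minimal sketch of how the Pareto helpers above compose, assuming prompt configuration ids are plain hashable keys (strings here) and a score table maps each id to its per-golden scores:

```python
import random

from deepeval.optimizer.policies import (
    frequency_weights,
    pareto_frontier,
    sample_by_frequency,
)

# Per-instance scores for three candidate prompt configurations.
score_table = {
    "root":    [0.60, 0.80, 0.70],
    "child_a": [0.90, 0.80, 0.40],  # best on instance 0, tied with "root" on 1
    "child_b": [0.50, 0.70, 0.30],  # dominated by "root" on every instance
}

pareto_frontier(list(score_table), score_table)  # ['root', 'child_a']
frequency_weights(score_table)                   # {'child_a': 2, 'root': 2}

# Sample a non-dominated configuration, weighted by how often it wins.
chosen = sample_by_frequency(
    frequency_weights(score_table), random_state=random.Random(0)
)
```

`select_prompt_configuration_pareto` chains the last two steps into a single call.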
deepeval/optimizer/prompt_optimizer.py (new file)
@@ -0,0 +1,263 @@
+from contextlib import contextmanager
+from typing import (
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
+
+from rich.progress import (
+    Progress,
+    SpinnerColumn,
+    BarColumn,
+    TextColumn,
+    TimeElapsedColumn,
+)
+
+from deepeval.dataset.golden import Golden, ConversationalGolden
+from deepeval.errors import DeepEvalError
+from deepeval.metrics import BaseConversationalMetric, BaseMetric
+from deepeval.metrics.utils import initialize_model
+from deepeval.models.base_model import DeepEvalBaseLLM
+from deepeval.optimizer.scorer import Scorer
+from deepeval.optimizer.rewriter import Rewriter
+from deepeval.optimizer.types import (
+    ModelCallback,
+    RunnerStatusType,
+)
+from deepeval.optimizer.utils import (
+    validate_callback,
+    validate_metrics,
+)
+from deepeval.optimizer.configs import (
+    DisplayConfig,
+    MutationConfig,
+    AsyncConfig,
+)
+from deepeval.prompt.prompt import Prompt
+from deepeval.utils import get_or_create_event_loop
+from deepeval.optimizer.algorithms import (
+    GEPA,
+    MIPROV2,
+    COPRO,
+    SIMBA,
+)
+from deepeval.optimizer.algorithms.configs import (
+    GEPA_REWRITE_INSTRUCTION_MAX_CHARS,
+    MIPROV2_REWRITE_INSTRUCTION_MAX_CHARS,
+)
+
+
+class PromptOptimizer:
+    def __init__(
+        self,
+        model_callback: ModelCallback,
+        metrics: Union[List[BaseMetric], List[BaseConversationalMetric]],
+        optimizer_model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+        algorithm: Union[GEPA, MIPROV2, COPRO, SIMBA] = GEPA(),
+        async_config: Optional[AsyncConfig] = AsyncConfig(),
+        display_config: Optional[DisplayConfig] = DisplayConfig(),
+        mutation_config: Optional[MutationConfig] = MutationConfig(),
+    ):
+        self.optimizer_model, self.using_native_model = initialize_model(
+            optimizer_model
+        )
+        self.model_callback = validate_callback(
+            component="PromptOptimizer",
+            model_callback=model_callback,
+        )
+        self.metrics = validate_metrics(
+            component="PromptOptimizer", metrics=metrics
+        )
+
+        self.async_config = async_config
+        self.display_config = display_config
+        self.mutation_config = mutation_config
+        self.algorithm = algorithm
+        self.optimization_report = None
+        self._configure_algorithm()
+
+        # Internal state used only when a progress indicator is active.
+        # Tuple is (Progress instance, task_id).
+        self._progress_state: Optional[Tuple[Progress, int]] = None
+
+    ##############
+    # Public API #
+    ##############
+
+    def optimize(
+        self,
+        prompt: Prompt,
+        goldens: Union[List[Golden], List[ConversationalGolden]],
+    ) -> Prompt:
+        if self.async_config.run_async:
+            loop = get_or_create_event_loop()
+            return loop.run_until_complete(
+                self.a_optimize(prompt=prompt, goldens=goldens)
+            )
+
+        try:
+            with self._progress_context():
+                best_prompt, self.optimization_report = self.algorithm.execute(
+                    prompt=prompt, goldens=goldens
+                )
+        except Exception as exc:
+            self._handle_optimization_error(exc)
+
+        return best_prompt
+
+    async def a_optimize(
+        self,
+        prompt: Prompt,
+        goldens: Union[List[Golden], List[ConversationalGolden]],
+    ) -> Prompt:
+        try:
+            with self._progress_context():
+                best_prompt, self.optimization_report = (
+                    await self.algorithm.a_execute(
+                        prompt=prompt, goldens=goldens
+                    )
+                )
+        except Exception as exc:
+            self._handle_optimization_error(exc)
+
+        return best_prompt
+
+    ####################
+    # Internal helpers #
+    ####################
+
+    def _configure_algorithm(self) -> None:
+        """Configure the algorithm with scorer, rewriter, and callbacks."""
+        self.algorithm.scorer = Scorer(
+            model_callback=self.model_callback,
+            metrics=self.metrics,
+            max_concurrent=self.async_config.max_concurrent,
+            throttle_seconds=float(self.async_config.throttle_value),
+        )
+
+        # Attach rewriter for mutation behavior
+        # GEPA uses internal constant; other algorithms use MIPROV2 constant
+        if isinstance(self.algorithm, GEPA):
+            max_chars = GEPA_REWRITE_INSTRUCTION_MAX_CHARS
+        else:
+            max_chars = MIPROV2_REWRITE_INSTRUCTION_MAX_CHARS
+        self.algorithm._rewriter = Rewriter(
+            optimizer_model=self.optimizer_model,
+            max_chars=max_chars,
+            list_mutation_config=self.mutation_config,
+            random_state=self.algorithm.random_state,
+        )
+
+        # Set status callback
+        self.algorithm.status_callback = self._on_status
+
+    @contextmanager
+    def _progress_context(self):
+        """Context manager that sets up progress indicator if enabled."""
+        if not self.display_config.show_indicator:
+            yield
+            return
+
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(bar_width=40),
+            TimeElapsedColumn(),
+            transient=True,
+        ) as progress:
+            task = progress.add_task(
+                f"Optimizing prompt with {self.algorithm.name}..."
+            )
+            self._progress_state = (progress, task)
+            try:
+                yield
+            finally:
+                self._progress_state = None
+
+    def _handle_optimization_error(self, exc: Exception) -> None:
+        """
+        Handle optimization errors by formatting and raising a user-friendly message.
+        """
+        total_steps: Optional[int] = None
+        iterations: Optional[int] = getattr(self.algorithm, "iterations", None)
+        if iterations is not None:
+            total_steps = int(iterations)
+
+        prefix = f"(iterations={iterations}) " if iterations is not None else ""
+        detail = (
+            f"{prefix}• error {exc.__class__.__name__}: {exc} "
+            "• halted before first iteration"
+        )
+
+        self._on_status(
+            RunnerStatusType.ERROR,
+            detail=detail,
+            step_index=None,
+            total_steps=total_steps,
+        )
+
+        algo = self.algorithm.name
+        raise DeepEvalError(f"[{algo}] {detail}") from None
+
+    def _on_status(
+        self,
+        kind: RunnerStatusType,
+        detail: str,
+        step_index: Optional[int] = None,
+        total_steps: Optional[int] = None,
+    ) -> None:
+        """
+        Unified status callback used by the algorithm.
+
+        - PROGRESS: update the progress bar description and position
+        - TIE: optionally print a tie message
+        - ERROR: print a concise error message and allow the run to halt
+        """
+        algo = self.algorithm.name
+
+        if kind is RunnerStatusType.ERROR:
+            if self._progress_state is not None:
+                progress, task = self._progress_state
+                if total_steps is not None:
+                    progress.update(task, total=total_steps)
+                description = self._format_progress_description(detail)
+                progress.update(task, description=description)
+            print(f"[{algo}] {detail}")
+            return
+
+        if kind is RunnerStatusType.TIE:
+            if not self.display_config.announce_ties:
+                return
+            print(f"[{algo}] {detail}")
+            return
+
+        if kind is not RunnerStatusType.PROGRESS:
+            return
+
+        if self._progress_state is None:
+            return
+
+        progress, task = self._progress_state
+
+        if total_steps is not None:
+            progress.update(task, total=total_steps)
+
+        if step_index is not None and step_index > 0:
+            progress.advance(task, 1)
+
+        description = self._format_progress_description(detail)
+        progress.update(task, description=description)
+
+    def _format_progress_description(self, detail: str) -> str:
+        """
+        Compose a human readable progress line using an algorithm agnostic
+        prefix and an algorithm specific detail string provided by the algorithm.
+        """
+        algo = self.algorithm.name
+        base = f"Optimizing prompt with {algo}"
+        if detail:
+            return f"{base} [rgb(25,227,160)]{detail}[/]"
+        return base
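Putting the pieces together, a hedged sketch of constructing the new `PromptOptimizer`. The constructor arguments follow the `__init__` signature above; the `model_callback` signature shown here is only an assumption for illustration, since `validate_callback` lives in `deepeval/optimizer/utils.py`, which this diff does not display.

```python
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.optimizer.algorithms import GEPA
from deepeval.optimizer.prompt_optimizer import PromptOptimizer


def model_callback(prompt, golden):
    # Hypothetical callback: run your own app/LLM with the candidate prompt
    # against a golden and return its output. The exact contract is defined
    # by validate_callback in deepeval/optimizer/utils.py, not by this diff.
    return "generated answer"


optimizer = PromptOptimizer(
    model_callback=model_callback,
    metrics=[AnswerRelevancyMetric()],
    algorithm=GEPA(),  # MIPROV2, COPRO, and SIMBA are also accepted
)
# best_prompt = optimizer.optimize(prompt=prompt, goldens=goldens)
```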
deepeval/optimizer/rewriter/rewriter.py (new file)
@@ -0,0 +1,124 @@
+from __future__ import annotations
+import random
+from typing import Optional, Tuple, Union
+
+from deepeval.models.base_model import DeepEvalBaseLLM
+from deepeval.optimizer.types import (
+    ModuleId,
+)
+from deepeval.optimizer.configs import (
+    MutationConfig,
+)
+from deepeval.prompt.prompt import Prompt
+from deepeval.optimizer.rewriter.utils import (
+    _summarize_prompt_for_rewrite,
+    _compose_prompt_messages,
+    _normalize_llm_output_to_text,
+    _apply_rewritten_prompt,
+)
+
+
+class Rewriter:
+    """
+    Uses a provided DeepEval model to rewrite the prompt for a module,
+    guided by feedback_text (μ_f).
+
+    For LIST prompts, the target message to rewrite is chosen according to
+    `list_mutation_config` and `random_state`.
+    """
+
+    def __init__(
+        self,
+        optimizer_model: DeepEvalBaseLLM,
+        max_chars: int = 4000,
+        list_mutation_config: Optional[MutationConfig] = None,
+        random_state: Optional[Union[int, random.Random]] = None,
+    ):
+        self.optimizer_model = optimizer_model
+        self.max_chars = max_chars
+        self.list_mutation_config = list_mutation_config or MutationConfig()
+
+        # Accept either an int seed or a Random instance.
+        if isinstance(random_state, int):
+            self.random_state: Optional[random.Random] = random.Random(
+                random_state
+            )
+        else:
+            self.random_state = random_state or random.Random()
+
+    def _compose_messages(
+        self, *, module_id: ModuleId, old_prompt: Prompt, feedback_text: str
+    ) -> Tuple[str, str]:
+        current_prompt_block = _summarize_prompt_for_rewrite(
+            old_prompt, self.max_chars
+        )
+        system_message = (
+            "You are refining a prompt used in a multi-step LLM pipeline. "
+            "Given the current prompt and concise feedback, produce a revised prompt "
+            "that addresses the issues while preserving intent and style. "
+            "Return only the new prompt text, no explanations."
+        )
+        user_message = f"""[Current Prompt]
+{current_prompt_block}
+
+[Feedback]
+{feedback_text[:self.max_chars]}
+
+[Instruction]
+Rewrite the prompt. Keep it concise and actionable. Do not include extraneous text.
+"""
+        return system_message, user_message
+
+    def rewrite(
+        self,
+        module_id: ModuleId,
+        old_prompt: Prompt,
+        feedback_text: str,
+    ) -> Prompt:
+        if not feedback_text.strip():
+            return old_prompt
+
+        system_message, user_message = self._compose_messages(
+            module_id=module_id,
+            old_prompt=old_prompt,
+            feedback_text=feedback_text,
+        )
+        merged_prompt_text = _compose_prompt_messages(
+            system_message, user_message
+        )
+
+        out = self.optimizer_model.generate(merged_prompt_text)
+        new_text = _normalize_llm_output_to_text(out)
+        return _apply_rewritten_prompt(
+            old_prompt,
+            new_text,
+            self.random_state,
+            self.list_mutation_config,
+        )
+
+    async def a_rewrite(
+        self,
+        module_id: ModuleId,
+        old_prompt: Prompt,
+        feedback_text: str,
+    ) -> Prompt:
+        if not feedback_text.strip():
+            return old_prompt
+
+        system_message, user_message = self._compose_messages(
+            module_id=module_id,
+            old_prompt=old_prompt,
+            feedback_text=feedback_text,
+        )
+        merged_prompt_text = _compose_prompt_messages(
+            system_message, user_message
+        )
+
+        out = await self.optimizer_model.a_generate(merged_prompt_text)
+        new_text = _normalize_llm_output_to_text(out)
+        return _apply_rewritten_prompt(
+            old_prompt,
+            new_text,
+            self.random_state,
+            self.list_mutation_config,
+        )