deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +70 -26
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +52 -28
- deepeval/models/embedding_models/local_embedding_model.py +18 -14
- deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
- deepeval/models/embedding_models/openai_embedding_model.py +40 -21
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +44 -23
- deepeval/models/llms/azure_model.py +121 -36
- deepeval/models/llms/deepseek_model.py +18 -13
- deepeval/models/llms/gemini_model.py +129 -43
- deepeval/models/llms/grok_model.py +18 -13
- deepeval/models/llms/kimi_model.py +18 -13
- deepeval/models/llms/litellm_model.py +42 -22
- deepeval/models/llms/local_model.py +12 -7
- deepeval/models/llms/ollama_model.py +114 -12
- deepeval/models/llms/openai_model.py +137 -41
- deepeval/models/llms/portkey_model.py +24 -7
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +46 -1
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0

deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py}

```diff
@@ -30,59 +30,109 @@ from typing import (
     Union,
 )
 
+from deepeval.models.base_model import DeepEvalBaseLLM
+
 from deepeval.errors import DeepEvalError
 from deepeval.dataset.golden import ConversationalGolden, Golden
-from deepeval.
-from deepeval.
+from deepeval.optimizer.utils import Aggregator, mean_of_all
+from deepeval.optimizer.types import (
     AcceptedIterationDict,
     ModuleId,
-
+    OptimizationReport,
     PromptConfiguration,
     PromptConfigurationId,
-
+    RunnerStatusCallback,
     RunnerStatusType,
     ScoreTable,
-    ScoringAdapter,
 )
-from deepeval.
+from deepeval.optimizer.scorer.base import BaseScorer
+from deepeval.optimizer.algorithms.base import BaseAlgorithm
+from deepeval.optimizer.utils import build_prompt_config_snapshots
 from deepeval.prompt.api import PromptType
 from deepeval.prompt.prompt import Prompt
-from deepeval.
+from deepeval.optimizer.rewriter import Rewriter
 
-from .configs import
-
+from deepeval.optimizer.algorithms.configs import (
+    MIPROV2_MIN_DELTA,
+    MIPROV2_REWRITE_INSTRUCTION_MAX_CHARS,
+    SIMBA_DEMO_INPUT_MAX_CHARS,
+)
+from deepeval.optimizer.algorithms.simba.types import SIMBAStrategy
 
 
-class
+class SIMBA(BaseAlgorithm):
     """
     SIMBA-style cooperative prompt optimization loop with sync/async execution.
 
     This runner is intentionally low level and does not know about metrics,
-    models, or async configs. It relies on a preconfigured
-
-
-
-
-
-
-
-
+    models, or async configs. It relies on a preconfigured Scorer and
+    Rewriter, which are typically constructed by PromptOptimizer.
+
+    Parameters
+    ----------
+    iterations : int
+        Total number of optimization trials. Default is 5.
+    minibatch_size : int
+        Number of examples drawn per iteration. Default is 8.
+    random_seed : int, optional
+        RNG seed for reproducibility. If None, derived from time.time_ns().
+    exploration_probability : float
+        Epsilon greedy exploration rate. Default is 0.2.
+    full_eval_every : int, optional
+        Fully evaluate best candidate every N trials. Default is 5.
+    population_size : int
+        Maximum number of candidates in the pool. Default is 4.
+    proposals_per_step : int
+        Number of child prompts proposed per iteration. Default is 4.
+    max_demos_per_proposal : int
+        Maximum demos from minibatch for APPEND_DEMO strategy. Default is 3.
     """
 
+    name = "SIMBA"
     SINGLE_MODULE_ID: ModuleId = "__module__"
 
     def __init__(
         self,
-
-
+        iterations: int = 5,
+        minibatch_size: int = 8,
+        random_seed: Optional[int] = None,
+        exploration_probability: float = 0.2,
+        full_eval_every: Optional[int] = 5,
+        population_size: int = 4,
+        proposals_per_step: int = 4,
+        max_demos_per_proposal: int = 3,
         aggregate_instances: Aggregator = mean_of_all,
-
+        scorer: Optional[BaseScorer] = None,
     ) -> None:
-
+        # Validate parameters
+        if iterations < 1:
+            raise ValueError("iterations must be >= 1")
+        if minibatch_size < 1:
+            raise ValueError("minibatch_size must be >= 1")
+        if exploration_probability < 0.0 or exploration_probability > 1.0:
+            raise ValueError(
+                "exploration_probability must be >= 0.0 and <= 1.0"
+            )
+        if full_eval_every is not None and full_eval_every < 1:
+            raise ValueError("full_eval_every must be >= 1")
+        if population_size < 1:
+            raise ValueError("population_size must be >= 1")
+        if proposals_per_step < 1:
+            raise ValueError("proposals_per_step must be >= 1")
+        if max_demos_per_proposal < 0:
+            raise ValueError("max_demos_per_proposal must be >= 0")
+
+        self.iterations = iterations
+        self.minibatch_size = minibatch_size
+        self.exploration_probability = exploration_probability
+        self.full_eval_every = full_eval_every
+        self.population_size = population_size
+        self.proposals_per_step = proposals_per_step
+        self.max_demos_per_proposal = max_demos_per_proposal
         self.aggregate_instances = aggregate_instances
-        self.
+        self.scorer = scorer
 
-        if
+        if max_demos_per_proposal > 0:
             self._strategies = [
                 SIMBAStrategy.APPEND_DEMO,
                 SIMBAStrategy.APPEND_RULE,
@@ -90,31 +140,25 @@ class SIMBARunner:
         else:
             self._strategies = [SIMBAStrategy.APPEND_RULE]
 
-        #
-
-
+        # If no seed provided, use time-based seed
+        if random_seed is None:
+            random_seed = time.time_ns()
+        self.random_seed = random_seed
+        self.random_state = random.Random(random_seed)
 
         # Runtime state to be reset between runs
        self.reset_state()
 
         # Status callback set by PromptOptimizer:
         # (kind, step_index, total_steps, detail) -> None
-        self.status_callback: Optional[
-
-        # Model callback used by the rewriter set by PromptOptimizer.
-        self.model_callback: Optional[
-            Callable[
-                ...,
-                Union[
-                    str,
-                    Dict,
-                    Tuple[Union[str, Dict], float],
-                ],
-            ]
-        ] = None
+        self.status_callback: Optional[RunnerStatusCallback] = None
 
-        #
-
+        # Optimizer model used by the rewriter for prompt mutation.
+        # Set by PromptOptimizer.
+        self.optimizer_model: Optional["DeepEvalBaseLLM"] = None
+
+        # Lazy-loaded Rewriter set by PromptOptimizer
+        self._rewriter: Optional[Rewriter] = None
 
     ##############
     # Public API #
```
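The constructor above now takes plain keyword arguments in place of the old config object and validates each one eagerly. A minimal construction sketch based on the signature in this hunk; the import path is an assumption derived from the renamed module, and `PromptOptimizer` normally performs this wiring (including assigning a `scorer`):

```python
from deepeval.optimizer.algorithms.simba.simba import SIMBA  # assumed path

algorithm = SIMBA(
    iterations=10,                # total optimization trials
    minibatch_size=8,             # goldens drawn per iteration
    random_seed=42,               # omit to derive a seed from time.time_ns()
    exploration_probability=0.2,  # epsilon-greedy exploration rate
    full_eval_every=5,            # fully evaluate the best candidate every 5 trials
    population_size=4,            # cap on the candidate pool
    proposals_per_step=4,         # child prompts proposed per iteration
    max_demos_per_proposal=3,     # demos appended under APPEND_DEMO
)

# Out-of-range values fail fast, e.g. SIMBA(iterations=0)
# raises ValueError("iterations must be >= 1").
```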
```diff
@@ -122,10 +166,9 @@ class SIMBARunner:
 
     def execute(
         self,
-        *,
         prompt: Prompt,
         goldens: Union[List[Golden], List[ConversationalGolden]],
-    ) -> Tuple[Prompt,
+    ) -> Tuple[Prompt, OptimizationReport]:
         """
         Synchronous SIMBA run from a full list of goldens.
 
@@ -140,8 +183,7 @@ class SIMBARunner:
                 "the optimizer."
             )
 
-        self.
-        self._ensure_rewriter()
+        self._ensure_scorer()
         self.reset_state()
 
         # Seed candidate pool with the root prompt configuration.
@@ -167,7 +209,7 @@ class SIMBARunner:
         # candidate on the first iteration.
         if not self._minibatch_score_counts:
             seed_minibatch = self._draw_minibatch(goldens)
-            root_score = self.
+            root_score = self.scorer.score_minibatch(
                 root_prompt_configuration, seed_minibatch
             )
             self._record_minibatch_score(
@@ -182,7 +224,7 @@ class SIMBARunner:
 
             # Compute shared feedback for this parent/minibatch that will be
             # used by all SIMBA proposals in this iteration.
-            feedback_text = self.
+            feedback_text = self.scorer.get_minibatch_feedback(
                 parent_prompt_configuration, selected_module_id, minibatch
             )
 
@@ -190,10 +232,10 @@ class SIMBARunner:
                 parent_prompt_configuration.id
             )
             jitter = 1e-6
-            min_delta = max(
+            min_delta = max(MIPROV2_MIN_DELTA, jitter)
 
             # 2. Generate multiple SIMBA child prompts and evaluate them.
-            num_proposals = int(self.
+            num_proposals = int(self.proposals_per_step)
             for _ in range(num_proposals):
                 strategy = self._sample_strategy()
                 child_prompt = self._generate_child_prompt(
@@ -213,7 +255,7 @@ class SIMBARunner:
                     child_prompt,
                 )
 
-                child_score = self.
+                child_score = self.scorer.score_minibatch(
                     child_prompt_configuration, minibatch
                 )
 
@@ -238,8 +280,8 @@ class SIMBARunner:
 
             self.trial_index += 1
             if (
-                self.
-                and self.trial_index % self.
+                self.full_eval_every is not None
+                and self.trial_index % self.full_eval_every == 0
             ):
                 self._full_evaluate_best(goldens)
 
@@ -255,7 +297,7 @@ class SIMBARunner:
         prompt_config_snapshots = build_prompt_config_snapshots(
             self.prompt_configurations_by_id
         )
-        report =
+        report = OptimizationReport(
             optimization_id=self.optimization_id,
             best_id=best.id,
             accepted_iterations=accepted_iterations,
@@ -263,14 +305,13 @@ class SIMBARunner:
             parents=self.parents_by_id,
             prompt_configurations=prompt_config_snapshots,
         )
-        return best.prompts[self.SINGLE_MODULE_ID], report
+        return best.prompts[self.SINGLE_MODULE_ID], report
 
     async def a_execute(
         self,
-        *,
         prompt: Prompt,
         goldens: Union[List[Golden], List[ConversationalGolden]],
-    ) -> Tuple[Prompt,
+    ) -> Tuple[Prompt, OptimizationReport]:
         """
         Asynchronous twin of execute().
         """
```
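With the keyword-only `*` marker removed, `prompt` and `goldens` can be passed positionally, and the return type is now pinned to `Tuple[Prompt, OptimizationReport]`. A hedged usage sketch, reusing `algorithm` from the earlier snippet and assuming an existing `Prompt` instance `my_prompt` plus a configured `scorer` (the `Golden` fields shown are illustrative):

```python
from deepeval.dataset.golden import Golden

goldens = [
    Golden(input="Summarize this ticket.", expected_output="A short summary."),
]

# _ensure_scorer() raises DeepEvalError if algorithm.scorer was never assigned.
best_prompt, report = algorithm.execute(my_prompt, goldens)  # positional args now
print(report.best_id)
```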
```diff
@@ -282,8 +323,7 @@ class SIMBARunner:
                 "the optimizer."
             )
 
-        self.
-        self._ensure_rewriter()
+        self._ensure_scorer()
         self.reset_state()
 
         seed_prompts_by_module = {self.SINGLE_MODULE_ID: prompt}
@@ -303,7 +343,7 @@ class SIMBARunner:
 
         if not self._minibatch_score_counts:
             seed_minibatch = self._draw_minibatch(goldens)
-            root_score = await self.
+            root_score = await self.scorer.a_score_minibatch(
                 root_prompt_configuration, seed_minibatch
             )
             self._record_minibatch_score(
@@ -315,7 +355,7 @@ class SIMBARunner:
 
             minibatch = self._draw_minibatch(goldens)
 
-            feedback_text = await self.
+            feedback_text = await self.scorer.a_get_minibatch_feedback(
                 parent_prompt_configuration, selected_module_id, minibatch
             )
 
@@ -323,9 +363,9 @@ class SIMBARunner:
                 parent_prompt_configuration.id
             )
             jitter = 1e-6
-            min_delta = max(
+            min_delta = max(MIPROV2_MIN_DELTA, jitter)
 
-            num_proposals = int(self.
+            num_proposals = int(self.proposals_per_step)
             for _ in range(num_proposals):
                 strategy = self._sample_strategy()
                 child_prompt = await self._a_generate_child_prompt(
@@ -344,7 +384,7 @@ class SIMBARunner:
                     child_prompt,
                 )
 
-                child_score = await self.
+                child_score = await self.scorer.a_score_minibatch(
                     child_prompt_configuration, minibatch
                 )
 
@@ -366,8 +406,8 @@ class SIMBARunner:
 
             self.trial_index += 1
             if (
-                self.
-                and self.trial_index % self.
+                self.full_eval_every is not None
+                and self.trial_index % self.full_eval_every == 0
             ):
                 await self._a_full_evaluate_best(goldens)
 
@@ -382,7 +422,7 @@ class SIMBARunner:
         prompt_config_snapshots = build_prompt_config_snapshots(
             self.prompt_configurations_by_id
         )
-        report =
+        report = OptimizationReport(
             optimization_id=self.optimization_id,
             best_id=best.id,
             accepted_iterations=accepted_iterations,
@@ -390,7 +430,7 @@ class SIMBARunner:
             parents=self.parents_by_id,
             prompt_configurations=prompt_config_snapshots,
         )
-        return best.prompts[self.SINGLE_MODULE_ID], report
+        return best.prompts[self.SINGLE_MODULE_ID], report
 
     ###################
     # State & helpers #
```
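`a_execute` is the asynchronous twin with the same signature and return shape, awaiting the scorer's `a_score_minibatch` / `a_get_minibatch_feedback` methods instead. A minimal driver sketch under the same assumptions as the earlier snippets:

```python
import asyncio

async def optimize():
    # Same positional signature and (Prompt, OptimizationReport) result
    # as the synchronous execute().
    best_prompt, report = await algorithm.a_execute(my_prompt, goldens)
    return best_prompt, report

best_prompt, report = asyncio.run(optimize())
```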
```diff
@@ -414,25 +454,14 @@ class SIMBARunner:
         # Trial counter (used for full_eval_every).
         self.trial_index: int = 0
 
-    def
-        if self.
+    def _ensure_scorer(self) -> None:
+        if self.scorer is None:
             raise DeepEvalError(
-                "SIMBARunner requires a `
-                "Construct one (for example,
-                "PromptOptimizer and assign it to `runner.
+                "SIMBARunner requires a `scorer`. "
+                "Construct one (for example, Scorer) in "
+                "PromptOptimizer and assign it to `runner.scorer`."
             )
 
-    def _ensure_rewriter(self) -> None:
-        if self._rewriter is not None:
-            return
-
-        # Default basic PromptRewriter; PromptOptimizer can override this and
-        # pass a configured instance (e.g. with list-mutation config).
-        self._rewriter = PromptRewriter(
-            max_chars=self.config.rewrite_instruction_max_chars,
-            random_state=self.random_state,
-        )
-
     def _prompts_equivalent(
         self,
         old_prompt: Prompt,
@@ -484,9 +513,7 @@ class SIMBARunner:
 
         # If we exceed the population size, iteratively prune the worst
         # (by mean minibatch score), never removing the current best.
-        while (
-            len(self.prompt_configurations_by_id) > self.config.population_size
-        ):
+        while len(self.prompt_configurations_by_id) > self.population_size:
             best_id: Optional[PromptConfigurationId] = None
             best_score = float("-inf")
             for cand_id in self.prompt_configurations_by_id.keys():
@@ -611,7 +638,7 @@ class SIMBARunner:
                 "SIMBARunner has an empty candidate pool; this should not happen."
             )
 
-        eps = float(self.
+        eps = float(self.exploration_probability)
         if eps > 0.0 and self.random_state.random() < eps:
             chosen_id = self.random_state.choice(candidate_ids)
         else:
```
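Candidate selection now reads `exploration_probability` straight off the instance. Only the explore branch is visible in the hunk; a self-contained sketch of the epsilon-greedy rule, where the exploit branch (picking the best mean minibatch score) is an assumption:

```python
import random
from typing import Dict, List

def sample_parent(
    candidate_ids: List[str],
    mean_scores: Dict[str, float],
    eps: float,
    rng: random.Random,
) -> str:
    # Explore: with probability eps, pick a uniformly random candidate.
    if eps > 0.0 and rng.random() < eps:
        return rng.choice(candidate_ids)
    # Exploit (assumed): pick the candidate with the highest mean score.
    return max(candidate_ids, key=lambda cid: mean_scores.get(cid, float("-inf")))
```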
```diff
@@ -624,23 +651,14 @@ class SIMBARunner:
         goldens: Union[List[Golden], List[ConversationalGolden]],
     ) -> Union[List[Golden], List[ConversationalGolden]]:
         """
-        Determine effective minibatch size
-
+        Determine effective minibatch size, bounded by the available goldens,
+        and sample with replacement.
         """
         n = len(goldens)
         if n <= 0:
             return []
 
-
-            size = self.config.minibatch_size
-        else:
-            dynamic = max(1, int(round(n * self.config.minibatch_ratio)))
-            size = max(
-                self.config.minibatch_min_size,
-                min(dynamic, self.config.minibatch_max_size),
-            )
-
-        size = max(1, min(size, n))
+        size = min(self.minibatch_size, n)
 
         return [goldens[self.random_state.randrange(0, n)] for _ in range(size)]
 
```
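The rewritten `_draw_minibatch` drops the old ratio/min/max sizing in favor of a single bound, then samples indices with replacement, so one golden can appear more than once in a minibatch. A standalone sketch of the same behavior (names here are illustrative, not part of the library):

```python
import random

def draw_minibatch(goldens, minibatch_size, rng):
    # Bound the batch by the available goldens, then sample
    # indices with replacement.
    n = len(goldens)
    if n <= 0:
        return []
    size = min(minibatch_size, n)
    return [goldens[rng.randrange(0, n)] for _ in range(size)]

rng = random.Random(0)
batch = draw_minibatch(["g1", "g2", "g3"], minibatch_size=3, rng=rng)
# Possible result: ['g2', 'g2', 'g1'] — duplicates are allowed.
```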
```diff
@@ -655,7 +673,7 @@ class SIMBARunner:
         if best.id in self.pareto_score_table:
             return
 
-        scores = await self.
+        scores = await self.scorer.a_score_pareto(best, goldens)
         self.pareto_score_table[best.id] = scores
 
     def _full_evaluate_best(
@@ -669,7 +687,7 @@ class SIMBARunner:
         if best.id in self.pareto_score_table:
             return
 
-        scores = self.
+        scores = self.scorer.score_pareto(best, goldens)
         self.pareto_score_table[best.id] = scores
 
     async def _a_generate_child_prompt(
@@ -694,7 +712,6 @@ class SIMBARunner:
         )
 
         new_prompt = await self._rewriter.a_rewrite(
-            model_callback=self.model_callback,
             module_id=selected_module_id,
             old_prompt=old_prompt,
             feedback_text=strategy_feedback,
@@ -730,7 +747,6 @@ class SIMBARunner:
         )
 
         new_prompt = self._rewriter.rewrite(
-            model_callback=self.model_callback,
             module_id=selected_module_id,
             old_prompt=old_prompt,
             feedback_text=strategy_feedback,
@@ -761,7 +777,7 @@ class SIMBARunner:
         Truncate strategy instructions + feedback to the configured character
         budget so the rewriter prompt does not explode.
         """
-        max_chars =
+        max_chars = MIPROV2_REWRITE_INSTRUCTION_MAX_CHARS
         if max_chars <= 0:
             return text
         if len(text) <= max_chars:
@@ -788,15 +804,15 @@ class SIMBARunner:
         Context <- " ".join(golden.context) if present
         Output <- golden.expected_outcome
 
-        All text segments are independently truncated to `
+        All text segments are independently truncated to `SIMBA_DEMO_INPUT_MAX_CHARS`.
         """
-        max_demos = self.
+        max_demos = self.max_demos_per_proposal
         if max_demos <= 0:
             return ""
 
         lines: List[str] = []
         demo_limit = min(max_demos, len(minibatch))
-        max_chars =
+        max_chars = SIMBA_DEMO_INPUT_MAX_CHARS
 
         for golden in minibatch[:demo_limit]:
             if isinstance(golden, Golden):
```
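The truncation helper now takes its budget from the module-level `MIPROV2_REWRITE_INSTRUCTION_MAX_CHARS` constant rather than a config field. Only the guard clauses appear in the hunk; a sketch that assumes plain head truncation once the budget is exceeded:

```python
def truncate_to_budget(text: str, max_chars: int) -> str:
    # A non-positive budget disables truncation entirely.
    if max_chars <= 0:
        return text
    # Within budget: return unchanged.
    if len(text) <= max_chars:
        return text
    # Assumption: keep the head of the string; the real tail handling
    # (e.g. an ellipsis marker) is not visible in the diff.
    return text[:max_chars]
```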
```diff
@@ -843,7 +859,7 @@ class SIMBARunner:
     ) -> str:
         """
         Construct a strategy-specific feedback string that is passed into
-
+        Rewriter.rewrite / a_rewrite.
 
         - APPEND_RULE: emphasize extracting a concise rule from metric feedback.
         - APPEND_DEMO: emphasize appending concrete demos built from goldens.
@@ -934,7 +950,7 @@ class SIMBARunner:
         self,
         simba_iteration: Callable[[], bool],
     ) -> None:
-        total_iterations = self.
+        total_iterations = self.iterations
        remaining_iterations = total_iterations
        iteration = 0
        self._update_progress(
@@ -960,7 +976,7 @@ class SIMBARunner:
         self,
         a_simba_iteration: Callable[[], Awaitable[bool]],
     ) -> None:
-        total_iterations = self.
+        total_iterations = self.iterations
        remaining_iterations = total_iterations
        iteration = 0
        self._update_progress(
```
deepeval/{optimization → optimizer}/configs.py

```diff
@@ -2,27 +2,24 @@ from __future__ import annotations
 from enum import Enum
 from pydantic import BaseModel, Field, conint
 from typing import Optional
+from deepeval.evaluate.configs import AsyncConfig
 
 
-class
-    """Display controls used by PromptOptimizer for all algorithms."""
-
+class DisplayConfig(BaseModel):
     show_indicator: bool = True
     announce_ties: bool = Field(
         False, description="Print a one-line note when a tie is detected"
     )
 
 
-class
+class MutationTargetType(Enum):
     RANDOM = "random"
     FIXED_INDEX = "fixed_index"
 
 
 # default all messages
-class
-    target_type:
-        PromptListMutationTargetType.RANDOM
-    )
+class MutationConfig(BaseModel):
+    target_type: MutationTargetType = MutationTargetType.RANDOM
     # should be list
     target_role: Optional[str] = Field(
         default=None,
```
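In `configs.py`, the models lose their longer `PromptList...` prefixes: `DisplayConfig`, `MutationTargetType`, and `MutationConfig`. A hypothetical instantiation of the renamed pydantic models (the import path is assumed from the relocated module):

```python
from deepeval.optimizer.configs import (  # assumed path
    DisplayConfig,
    MutationConfig,
    MutationTargetType,
)

# Plain pydantic models, so keyword construction and validation apply.
display = DisplayConfig(show_indicator=True, announce_ties=True)
mutation = MutationConfig(target_type=MutationTargetType.FIXED_INDEX)
```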
deepeval/{optimization/policies/selection.py → optimizer/policies.py}

```diff
@@ -1,9 +1,10 @@
 from __future__ import annotations
-from
+from enum import Enum
 import random
+from typing import Dict, List, Sequence, Optional, Tuple
 
 from deepeval.errors import DeepEvalError
-from deepeval.
+from deepeval.optimizer.types import PromptConfigurationId, ScoreTable
 
 
 def _is_dominated(
```
```diff
@@ -164,3 +165,63 @@ def select_prompt_configuration_pareto(
     """
     freq = frequency_weights(score_table)
     return sample_by_frequency(freq, random_state=random_state)
+
+
+class TieBreaker(str, Enum):
+    PREFER_ROOT = "prefer_root"
+    PREFER_CHILD = "prefer_child"
+    RANDOM = "random"
+
+
+def pick_best_with_ties(
+    totals: Dict[PromptConfigurationId, float],
+    parents_by_id: Dict[PromptConfigurationId, Optional[PromptConfigurationId]],
+    *,
+    random_state: random.Random,
+    tie_tolerance: float = 1e-9,
+    policy: TieBreaker = TieBreaker.PREFER_ROOT,
+) -> Tuple[PromptConfigurationId, List[PromptConfigurationId], float]:
+    """
+    Choose the best candidate by aggregate score with deterministic tie handling.
+
+    Returns: (chosen_id, tied_ids, max_score)
+    - tied_ids includes everyone within tie_tolerance of max_score
+    """
+    if not totals:
+        raise DeepEvalError("No candidate prompt configuration to choose from.")
+
+    max_score = max(totals.values())
+    tied = [
+        prompt_configuration_id
+        for prompt_configuration_id, score in totals.items()
+        if abs(score - max_score) <= tie_tolerance
+    ]
+
+    if len(tied) == 1:
+        return tied[0], tied, max_score
+
+    # Resolve tie by policy
+    if policy == TieBreaker.PREFER_CHILD:
+        # Prefer any non root. When multiple children exist, use the most recent
+        child_ids = [
+            prompt_configuration_id
+            for prompt_configuration_id in tied
+            if parents_by_id.get(prompt_configuration_id) is not None
+        ]
+        if child_ids:
+            # choose the newest child deterministically by order
+            for prompt_configuration_id in reversed(list(totals.keys())):
+                if prompt_configuration_id in child_ids:
+                    return prompt_configuration_id, tied, max_score
+
+    if policy == TieBreaker.RANDOM:
+        return random_state.choice(tied), tied, max_score
+
+    # by default prefer a root if present, otherwise the first tied
+    root_ids = [
+        prompt_configuration_id
+        for prompt_configuration_id in tied
+        if parents_by_id.get(prompt_configuration_id) is None
+    ]
+    chosen = root_ids[0] if root_ids else tied[0]
+    return chosen, tied, max_score
```
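`TieBreaker` and `pick_best_with_ties` land in `optimizer/policies.py`, apparently absorbing the deleted `optimization/policies/tie_breaker.py`. Since the full function body is shown above, its behavior can be exercised directly; a usage sketch with two tied candidates (the import path is assumed from the relocated module):

```python
import random
from deepeval.optimizer.policies import TieBreaker, pick_best_with_ties  # assumed path

totals = {"root": 0.90, "child-1": 0.90, "child-2": 0.75}
parents = {"root": None, "child-1": "root", "child-2": "root"}

chosen, tied, max_score = pick_best_with_ties(
    totals,
    parents,
    random_state=random.Random(0),
    policy=TieBreaker.PREFER_ROOT,
)
# "root" and "child-1" tie at 0.90; PREFER_ROOT resolves to the root,
# PREFER_CHILD would resolve to "child-1", RANDOM picks among the tied set.
assert chosen == "root" and max_score == 0.90
```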