opik-optimizer 2.1.3__py3-none-any.whl → 2.2.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- opik_optimizer/__init__.py +0 -2
- opik_optimizer/base_optimizer.py +313 -144
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +31 -4
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +23 -3
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +122 -95
- opik_optimizer/evolutionary_optimizer/mcp.py +11 -6
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +25 -5
- opik_optimizer/evolutionary_optimizer/population_ops.py +26 -10
- opik_optimizer/evolutionary_optimizer/reporting.py +5 -5
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +53 -99
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +4 -4
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +345 -201
- opik_optimizer/gepa_optimizer/reporting.py +291 -22
- opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +90 -167
- opik_optimizer/hierarchical_reflective_optimizer/prompts.py +7 -1
- opik_optimizer/hierarchical_reflective_optimizer/reporting.py +168 -75
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +185 -205
- opik_optimizer/meta_prompt_optimizer/reporting.py +4 -4
- opik_optimizer/mipro_optimizer/__init__.py +2 -2
- opik_optimizer/mipro_optimizer/_lm.py +4 -4
- opik_optimizer/mipro_optimizer/{_mipro_optimizer_v2.py → mipro_optimizer_v2.py} +1 -7
- opik_optimizer/mipro_optimizer/utils.py +1 -0
- opik_optimizer/optimizable_agent.py +7 -4
- opik_optimizer/optimization_config/chat_prompt.py +7 -10
- opik_optimizer/parameter_optimizer/parameter_optimizer.py +188 -40
- opik_optimizer/parameter_optimizer/reporting.py +148 -0
- opik_optimizer/reporting_utils.py +60 -15
- opik_optimizer/utils/__init__.py +3 -0
- opik_optimizer/utils/candidate_utils.py +52 -0
- opik_optimizer/utils/core.py +35 -2
- opik_optimizer/utils/prompt_segments.py +1 -2
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/METADATA +2 -3
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/RECORD +36 -36
- opik_optimizer/evolutionary_optimizer/llm_support.py +0 -136
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +0 -680
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/WHEEL +0 -0
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/top_level.txt +0 -0

opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py

@@ -5,8 +5,8 @@ import opik
 import litellm
 from litellm.caching import Cache
 from litellm.types.caching import LiteLLMCacheType
+from opik import opik_context
 from opik.evaluation.evaluation_result import EvaluationResult
-from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
 from opik.evaluation import evaluator as opik_evaluator

 from typing import Any, TypeVar
@@ -51,16 +51,17 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
     complex prompt that you want to systematically refine based on understanding why it fails.

     Args:
-
-
-
-
+        model: LiteLLM model name for the optimization algorithm (reasoning and analysis)
+        model_parameters: Optional dict of LiteLLM parameters for optimizer's internal LLM calls.
+            Common params: temperature, max_tokens, max_completion_tokens, top_p.
+            See: https://docs.litellm.ai/docs/completion/input
         max_parallel_batches: Maximum number of batches to process concurrently during
-            hierarchical root cause analysis
-        batch_size: Number of test cases per batch for root cause analysis
-
-
-
+            hierarchical root cause analysis
+        batch_size: Number of test cases per batch for root cause analysis
+        convergence_threshold: Stop if relative improvement is below this threshold
+        n_threads: Number of parallel threads for evaluation
+        verbose: Controls internal logging/progress bars (0=off, 1=on)
+        seed: Random seed for reproducibility
     """

     DEFAULT_ROUNDS = 10
@@ -69,139 +70,34 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):

     def __init__(
         self,
-
-
-        verbose: int = 1,
-        seed: int = 42,
+        model: str = "gpt-4o",
+        model_parameters: dict[str, Any] | None = None,
         max_parallel_batches: int = 5,
         batch_size: int = 25,
-        max_iterations: int = DEFAULT_MAX_ITERATIONS,
         convergence_threshold: float = DEFAULT_CONVERGENCE_THRESHOLD,
-
+        n_threads: int = 12,
+        verbose: int = 1,
+        seed: int = 42,
     ):
         super().__init__(
-            model=
+            model=model, verbose=verbose, seed=seed, model_parameters=model_parameters
         )
-        self.
-        self.num_threads = num_threads
+        self.n_threads = n_threads
         self.max_parallel_batches = max_parallel_batches
         self.batch_size = batch_size
-        self.max_iterations = max_iterations
         self.convergence_threshold = convergence_threshold
+        self._should_stop_optimization = False  # Flag to exit all loops

         # Initialize hierarchical analyzer
         self._hierarchical_analyzer = HierarchicalRootCauseAnalyzer(
             call_model_fn=self._call_model_async,
-            reasoning_model=self.
+            reasoning_model=self.model,
             seed=self.seed,
             max_parallel_batches=self.max_parallel_batches,
             batch_size=self.batch_size,
             verbose=self.verbose,
         )

-    def _prepare_model_params(
-        self,
-        model_kwargs: dict[str, Any],
-        response_model: type[T] | None = None,
-    ) -> dict[str, Any]:
-        """
-        Prepare parameters for LiteLLM call by filtering and adding monitoring.
-
-        Args:
-            model_kwargs: Additional model parameters
-            response_model: Optional Pydantic model for structured output
-
-        Returns:
-            Dictionary of parameters ready for litellm.completion/acompletion
-        """
-        current_model_kwargs = self.model_kwargs.copy()
-        current_model_kwargs.update(model_kwargs)
-
-        # Filter out optimizer-specific kwargs that shouldn't be passed to LiteLLM
-        filtered_call_kwargs = current_model_kwargs.copy()
-        filtered_call_kwargs.pop("n_trials", None)
-        filtered_call_kwargs.pop("n_samples", None)
-        filtered_call_kwargs.pop("n_iterations", None)
-        filtered_call_kwargs.pop("min_examples", None)
-        filtered_call_kwargs.pop("max_examples", None)
-        filtered_call_kwargs.pop("project_name", None)
-
-        final_params_for_litellm = (
-            opik_litellm_monitor.try_add_opik_monitoring_to_params(filtered_call_kwargs)
-        )
-
-        # Add structured output support if response_model is provided
-        # According to LiteLLM docs: https://docs.litellm.ai/docs/completion/json_mode
-        # Pass the Pydantic model directly to response_format
-        if response_model is not None:
-            final_params_for_litellm["response_format"] = response_model
-
-        return final_params_for_litellm
-
-    def _parse_response(
-        self,
-        response: Any,
-        response_model: type[T] | None = None,
-    ) -> T | str:
-        """
-        Parse LiteLLM response, with optional structured output parsing.
-
-        Args:
-            response: The response from litellm.completion/acompletion
-            response_model: Optional Pydantic model for structured output
-
-        Returns:
-            If response_model is provided, returns an instance of that model.
-            Otherwise, returns the raw string response.
-        """
-        content = response.choices[0].message.content
-
-        # When using structured outputs with Pydantic models, LiteLLM automatically
-        # parses the response. Parse the JSON string into the Pydantic model
-        if response_model is not None:
-            return response_model.model_validate_json(content)
-
-        return content
-
-    @_throttle.rate_limited(_rate_limiter)
-    def _call_model(
-        self,
-        model: str,
-        messages: list[dict[str, str]],
-        seed: int,
-        model_kwargs: dict[str, Any],
-        response_model: type[T] | None = None,
-    ) -> T | str:
-        """
-        Call the LLM model with optional structured output.
-
-        Args:
-            model: The model to use for the call
-            messages: List of message dictionaries with 'role' and 'content' keys
-            seed: Random seed for reproducibility
-            model_kwargs: Additional model parameters
-            response_model: Optional Pydantic model for structured output
-
-        Returns:
-            If response_model is provided, returns an instance of that model.
-            Otherwise, returns the raw string response.
-        """
-        self.increment_llm_counter()
-
-        final_params_for_litellm = self._prepare_model_params(
-            model_kwargs, response_model
-        )
-
-        response = litellm.completion(
-            model=model,
-            messages=messages,
-            seed=seed,
-            num_retries=6,
-            **final_params_for_litellm,
-        )
-
-        return self._parse_response(response, response_model)
-
     @_throttle.rate_limited(_rate_limiter)
     async def _call_model_async(
         self,
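
For orientation, here is a minimal construction sketch based on the new __init__ signature above. Parameter names and defaults are taken from this hunk; the top-level import path is an assumption and may differ.

```python
# Sketch only: constructing the optimizer with the 2.2.x signature shown above.
from opik_optimizer import HierarchicalReflectiveOptimizer  # assumed import path

optimizer = HierarchicalReflectiveOptimizer(
    model="gpt-4o",                         # LiteLLM model used for reasoning and analysis
    model_parameters={"temperature": 0.2},  # forwarded to the optimizer's internal LLM calls
    max_parallel_batches=5,
    batch_size=25,
    n_threads=12,                           # replaces the removed num_threads attribute
    verbose=1,
    seed=42,
)
```
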
@@ -212,7 +108,10 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         response_model: type[T] | None = None,
     ) -> T | str:
         """
-
+        Adapter for async LLM calls with HierarchicalRootCauseAnalyzer signature.
+
+        This adapter translates the analyzer's expected signature to the base class
+        _call_model_async signature, ensuring project_name and tags are properly set.

         Args:
             model: The model to use for the call
@@ -225,22 +124,16 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             If response_model is provided, returns an instance of that model.
             Otherwise, returns the raw string response.
         """
-
-
-        final_params_for_litellm = self._prepare_model_params(
-            model_kwargs, response_model
-        )
-
-        response = await litellm.acompletion(
-            model=model,
+        # Call the base class async method which properly handles project_name and tags
+        return await super()._call_model_async(
             messages=messages,
+            model=model,
             seed=seed,
-
-
+            response_model=response_model,
+            is_reasoning=True,
+            **model_kwargs,
         )

-        return self._parse_response(response, response_model)
-
     def get_optimizer_metadata(self) -> dict[str, Any]:
         """
         Get metadata about the optimizer configuration.
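
As a rough illustration of the adapter contract (argument names follow the signature visible in this diff; SomeSchema is a hypothetical Pydantic model), the analyzer's call_model_fn is awaited roughly like this:

```python
# Illustration only: awaiting the adapter the way HierarchicalRootCauseAnalyzer would.
parsed = await optimizer._call_model_async(
    model=optimizer.model,
    messages=[{"role": "user", "content": "Summarize this failure batch."}],
    seed=optimizer.seed,
    model_kwargs={},            # extra LiteLLM kwargs, forwarded via **model_kwargs
    response_model=SomeSchema,  # hypothetical Pydantic model for structured output
)
```
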
@@ -249,10 +142,9 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             Dictionary containing optimizer-specific configuration
         """
         return {
-            "
-            "
+            "model": self.model,
+            "n_threads": self.n_threads,
             "max_parallel_batches": self.max_parallel_batches,
-            "max_iterations": self.max_iterations,
             "convergence_threshold": self.convergence_threshold,
             "seed": self.seed,
             "verbose": self.verbose,
@@ -330,6 +222,12 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):

             cleaned_model_output = raw_model_output.strip()

+            # Add tags to trace for optimization tracking
+            if self.current_optimization_id:
+                opik_context.update_current_trace(
+                    tags=[self.current_optimization_id, "Evaluation"]
+                )
+
             result = {
                 mappers.EVALUATED_LLM_TASK_OUTPUT: cleaned_model_output,
             }
@@ -344,10 +242,11 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             dataset=dataset,
             task=llm_task,
             scoring_metrics=[_create_metric_class(metric)],
-            task_threads=self.
+            task_threads=self.n_threads,
             nb_samples=n_samples,
             experiment_config=experiment_config,
             verbose=self.verbose,
+            project_name=self.project_name,
         )

         return result
@@ -403,10 +302,9 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         )

         improve_prompt_response = self._call_model(
-            model=self.reasoning_model,
             messages=[{"role": "user", "content": improve_prompt_prompt}],
+            model=self.model,
             seed=attempt_seed,
-            model_kwargs={},
             response_model=ImprovedPrompt,
         )

@@ -461,7 +359,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         improved_chat_prompt = chat_prompt.ChatPrompt(
             name=prompt.name,
             messages=messages_as_dicts,
-            tools=
+            tools=best_prompt.tools,
+            function_map=best_prompt.function_map,
         )

         # Evaluate improved prompt
@@ -503,23 +402,28 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         n_samples: int | None = None,
         auto_continue: bool = False,
         agent_class: type[OptimizableAgent] | None = None,
+        project_name: str = "Optimization",
+        max_trials: int = DEFAULT_MAX_ITERATIONS,
         max_retries: int = 2,
+        *args: Any,
         **kwargs: Any,
     ) -> OptimizationResult:
         # Reset counters at the start of optimization
-        self.
-
-        # Configure prompt model if not set
-        self.configure_prompt_model(prompt)
+        self._reset_counters()
+        self._should_stop_optimization = False  # Reset stop flag

         # Setup agent class
-        self.agent_class = self.
+        self.agent_class = self._setup_agent_class(prompt, agent_class)
+
+        # Set project name from parameter
+        self.project_name = project_name

         optimization = self.opik_client.create_optimization(
             dataset_name=dataset.name,
             objective_name=getattr(metric, "__name__", str(metric)),
             metadata={"optimizer": self.__class__.__name__},
         )
+        self.current_optimization_id = optimization.id
         logger.debug(f"Created optimization with ID: {optimization.id}")

         reporting.display_header(
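
A hedged usage sketch of the updated entry point: only project_name, max_trials, and max_retries are confirmed by this hunk; the method name and the prompt/dataset/metric arguments are assumed from the optimizer's usual optimize_prompt API.

```python
# Sketch only: passing the newly added optimize_prompt arguments.
result = optimizer.optimize_prompt(
    prompt=prompt,                # chat_prompt.ChatPrompt built elsewhere (assumed)
    dataset=dataset,              # opik Dataset (assumed)
    metric=metric,                # scoring function (assumed)
    n_samples=50,
    project_name="Optimization",  # new: Opik project used for evaluation traces
    max_trials=10,                # new: hard budget on improvement trials
    max_retries=2,
)
```
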
@@ -535,7 +439,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                 "n_samples": n_samples,
                 "auto_continue": auto_continue,
                 "max_retries": max_retries,
-                "
+                "max_trials": max_trials,
                 "convergence_threshold": self.convergence_threshold,
             },
             verbose=self.verbose,
@@ -569,9 +473,20 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         # Multi-iteration optimization loop
         iteration = 0
         previous_iteration_score = initial_score
+        trials_used = 0

-
-
+        while trials_used < max_trials:
+            iteration += 1
+            logger.info(
+                f"Starting iteration {iteration} (trials: {trials_used}/{max_trials})"
+            )
+
+            # Check if we should stop (flag set by inner loops)
+            if self._should_stop_optimization:
+                logger.info(
+                    f"Stopping optimization: reached max_trials limit ({max_trials})."
+                )
+                break

             with reporting.display_optimization_iteration(
                 iteration=iteration, verbose=self.verbose
@@ -616,7 +531,16 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                 improved_score = None

                 for attempt in range(1, max_attempts + 1):
-                    #
+                    # Check if we've reached the trial limit before starting a new trial
+                    if trials_used >= max_trials:
+                        logger.info(
+                            f"Reached max_trials limit ({max_trials}) during failure mode '{root_cause.name}'. "
+                            f"Stopping optimization."
+                        )
+                        self._should_stop_optimization = True
+                        break
+
+                    # Generate and evaluate improvement (this is 1 trial)
                     (
                         improved_chat_prompt,
                         improved_score,
@@ -633,6 +557,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                         attempt=attempt,
                         max_attempts=max_attempts,
                     )
+                    trials_used += 1

                     # Check if we got improvement
                     if improved_score > best_score:
@@ -642,7 +567,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                         break

                     # No improvement - should we retry?
-                    if attempt < max_attempts:
+                    if attempt < max_attempts and trials_used < max_trials:
                         reporting.display_retry_attempt(
                             attempt=attempt,
                             max_attempts=max_attempts,
@@ -654,6 +579,10 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                         f"No improvement after {attempt} attempts for '{root_cause.name}'"
                     )

+                # Break out of failure mode loop if flag is set
+                if self._should_stop_optimization:
+                    break
+
                 # Check if final result is an improvement
                 if (
                     improved_score is not None
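
The hunks above thread a single trial budget through both loops. A simplified, self-contained restatement of that control flow (names here are illustrative, not the library's API):

```python
def run_with_trial_budget(max_trials: int, max_attempts: int, run_one_trial) -> int:
    """Every generate-and-evaluate attempt consumes one trial from a shared budget."""
    trials_used = 0
    should_stop = False
    while trials_used < max_trials and not should_stop:  # outer iteration loop
        for attempt in range(1, max_attempts + 1):       # retry loop per failure mode
            if trials_used >= max_trials:
                should_stop = True                       # propagate the stop to the outer loop
                break
            run_one_trial()
            trials_used += 1
    return trials_used
```
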
@@ -731,29 +660,23 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):

        # Prepare details for the result
        details = {
-            "model":
+            "model": self.model,
            "temperature": (best_prompt.model_kwargs or {}).get("temperature")
-            or self.
-            "
-            "num_threads": self.num_threads,
+            or self.model_parameters.get("temperature"),
+            "n_threads": self.n_threads,
            "max_parallel_batches": self.max_parallel_batches,
            "max_retries": max_retries,
            "n_samples": n_samples,
            "auto_continue": auto_continue,
-            "
+            "max_trials": max_trials,
            "convergence_threshold": self.convergence_threshold,
            "iterations_completed": iteration,
+            "trials_used": trials_used,
        }

        # Extract tool prompts if tools exist
-
-
-        tool_prompts = {
-            tool.get("function", {}).get("name", f"tool_{idx}"): tool.get(
-                "function", {}
-            ).get("description", "")
-            for idx, tool in enumerate(final_tools)
-        }
+        final_tools = getattr(best_prompt, "tools", None)
+        tool_prompts = self._extract_tool_prompts(final_tools)

        return OptimizationResult(
            optimizer=self.__class__.__name__,
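
For reference, the inline mapping removed in this hunk, restated as a standalone sketch of what a helper like _extract_tool_prompts presumably does (the actual helper may differ; the None guard is an assumption):

```python
def extract_tool_prompts(final_tools):
    """Map each tool's function name to its description, mirroring the removed inline dict."""
    if not final_tools:  # assumption: tolerate prompts without tools
        return None
    return {
        tool.get("function", {}).get("name", f"tool_{idx}"): tool.get(
            "function", {}
        ).get("description", "")
        for idx, tool in enumerate(final_tools)
    }
```
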

opik_optimizer/hierarchical_reflective_optimizer/prompts.py

@@ -14,6 +14,8 @@ TEST RESULTS:
 {formatted_batch}
 ```

+Important constraint: Base your analysis exclusively on the TEST RESULTS shown above. Do not infer, speculate, or hypothesize failure modes that are not directly evidenced in the provided results.
+
 Think through the failures systematically:

 1. IDENTIFY: List all distinct types of failures you observe in the test results
@@ -86,6 +88,10 @@ INSTRUCTIONS FOR IMPROVING THE PROMPT:

 4. **Maintain Structure**: Keep the same message structure (role and content format). Only modify the content where necessary.

-5. **
+5. **Do NOT Add Messages**: Do not add new messages to the prompt. Only modify existing messages. The number of messages in the prompt must remain exactly the same.
+
+6. **Be Specific**: Ensure your changes provide concrete, actionable guidance that directly addresses the identified failure mode.
+
+Do not remove any variables or placeholders from any prompt message. You can reposition them within the same message content if needed but never remove them.

 Provide your reasoning for the changes you made, explaining WHY each change addresses the failure mode, and then provide the improved prompt."""