opik-optimizer 2.1.2__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +2 -2
- opik_optimizer/base_optimizer.py +314 -145
- opik_optimizer/evolutionary_optimizer/crossover_ops.py +31 -4
- opik_optimizer/evolutionary_optimizer/evaluation_ops.py +23 -3
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +122 -95
- opik_optimizer/evolutionary_optimizer/mcp.py +11 -6
- opik_optimizer/evolutionary_optimizer/mutation_ops.py +25 -5
- opik_optimizer/evolutionary_optimizer/population_ops.py +26 -10
- opik_optimizer/evolutionary_optimizer/reporting.py +5 -5
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +53 -99
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +4 -4
- opik_optimizer/gepa_optimizer/gepa_optimizer.py +183 -172
- opik_optimizer/gepa_optimizer/reporting.py +164 -22
- opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +221 -245
- opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py +38 -14
- opik_optimizer/hierarchical_reflective_optimizer/prompts.py +7 -1
- opik_optimizer/hierarchical_reflective_optimizer/reporting.py +287 -132
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +185 -205
- opik_optimizer/meta_prompt_optimizer/reporting.py +4 -4
- opik_optimizer/mipro_optimizer/__init__.py +2 -2
- opik_optimizer/mipro_optimizer/_lm.py +4 -4
- opik_optimizer/mipro_optimizer/{_mipro_optimizer_v2.py → mipro_optimizer_v2.py} +1 -7
- opik_optimizer/mipro_optimizer/utils.py +1 -0
- opik_optimizer/multi_metric_objective.py +33 -0
- opik_optimizer/optimizable_agent.py +7 -4
- opik_optimizer/optimization_config/chat_prompt.py +7 -10
- opik_optimizer/parameter_optimizer/parameter_optimizer.py +188 -40
- opik_optimizer/parameter_optimizer/reporting.py +148 -0
- opik_optimizer/reporting_utils.py +42 -15
- opik_optimizer/task_evaluator.py +26 -9
- opik_optimizer/utils/core.py +16 -2
- opik_optimizer/utils/prompt_segments.py +1 -2
- {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/METADATA +2 -3
- {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/RECORD +37 -37
- opik_optimizer/evolutionary_optimizer/llm_support.py +0 -136
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +0 -680
- {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/top_level.txt +0 -0
The hunks below are from opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py (2.1.2 → 2.2.0):

@@ -1,4 +1,3 @@
-from opik.environment import get_tqdm_for_current_environment
 import os
 import logging
 
@@ -6,8 +5,8 @@ import opik
 import litellm
 from litellm.caching import Cache
 from litellm.types.caching import LiteLLMCacheType
+from opik import opik_context
 from opik.evaluation.evaluation_result import EvaluationResult
-from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
 from opik.evaluation import evaluator as opik_evaluator
 
 from typing import Any, TypeVar
@@ -29,8 +28,6 @@ from .types import (
 )
 from .prompts import IMPROVE_PROMPT_TEMPLATE
 
-tqdm = get_tqdm_for_current_environment()
-
 # Using disk cache for LLM calls
 disk_cache_dir = os.path.expanduser("~/.litellm_cache")
 litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=disk_cache_dir)
@@ -54,149 +51,53 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
     complex prompt that you want to systematically refine based on understanding why it fails.
 
     Args:
-
-
-
-
+        model: LiteLLM model name for the optimization algorithm (reasoning and analysis)
+        model_parameters: Optional dict of LiteLLM parameters for optimizer's internal LLM calls.
+            Common params: temperature, max_tokens, max_completion_tokens, top_p.
+            See: https://docs.litellm.ai/docs/completion/input
         max_parallel_batches: Maximum number of batches to process concurrently during
-            hierarchical root cause analysis
-        batch_size: Number of test cases per batch for root cause analysis
-
+            hierarchical root cause analysis
+        batch_size: Number of test cases per batch for root cause analysis
+        convergence_threshold: Stop if relative improvement is below this threshold
+        n_threads: Number of parallel threads for evaluation
+        verbose: Controls internal logging/progress bars (0=off, 1=on)
+        seed: Random seed for reproducibility
     """
 
     DEFAULT_ROUNDS = 10
+    DEFAULT_MAX_ITERATIONS = 5
+    DEFAULT_CONVERGENCE_THRESHOLD = 0.01  # Stop if improvement is less than 1%
 
     def __init__(
         self,
-
-
-        verbose: int = 1,
-        seed: int = 42,
+        model: str = "gpt-4o",
+        model_parameters: dict[str, Any] | None = None,
         max_parallel_batches: int = 5,
         batch_size: int = 25,
-
+        convergence_threshold: float = DEFAULT_CONVERGENCE_THRESHOLD,
+        n_threads: int = 12,
+        verbose: int = 1,
+        seed: int = 42,
     ):
         super().__init__(
-            model=
+            model=model, verbose=verbose, seed=seed, model_parameters=model_parameters
        )
-        self.
-        self.num_threads = num_threads
+        self.n_threads = n_threads
         self.max_parallel_batches = max_parallel_batches
         self.batch_size = batch_size
+        self.convergence_threshold = convergence_threshold
+        self._should_stop_optimization = False  # Flag to exit all loops
 
         # Initialize hierarchical analyzer
         self._hierarchical_analyzer = HierarchicalRootCauseAnalyzer(
             call_model_fn=self._call_model_async,
-            reasoning_model=self.
+            reasoning_model=self.model,
             seed=self.seed,
             max_parallel_batches=self.max_parallel_batches,
             batch_size=self.batch_size,
             verbose=self.verbose,
         )
 
-    def _prepare_model_params(
-        self,
-        model_kwargs: dict[str, Any],
-        response_model: type[T] | None = None,
-    ) -> dict[str, Any]:
-        """
-        Prepare parameters for LiteLLM call by filtering and adding monitoring.
-
-        Args:
-            model_kwargs: Additional model parameters
-            response_model: Optional Pydantic model for structured output
-
-        Returns:
-            Dictionary of parameters ready for litellm.completion/acompletion
-        """
-        current_model_kwargs = self.model_kwargs.copy()
-        current_model_kwargs.update(model_kwargs)
-
-        # Filter out optimizer-specific kwargs that shouldn't be passed to LiteLLM
-        filtered_call_kwargs = current_model_kwargs.copy()
-        filtered_call_kwargs.pop("n_trials", None)
-        filtered_call_kwargs.pop("n_samples", None)
-        filtered_call_kwargs.pop("n_iterations", None)
-        filtered_call_kwargs.pop("min_examples", None)
-        filtered_call_kwargs.pop("max_examples", None)
-        filtered_call_kwargs.pop("project_name", None)
-
-        final_params_for_litellm = (
-            opik_litellm_monitor.try_add_opik_monitoring_to_params(filtered_call_kwargs)
-        )
-
-        # Add structured output support if response_model is provided
-        # According to LiteLLM docs: https://docs.litellm.ai/docs/completion/json_mode
-        # Pass the Pydantic model directly to response_format
-        if response_model is not None:
-            final_params_for_litellm["response_format"] = response_model
-
-        return final_params_for_litellm
-
-    def _parse_response(
-        self,
-        response: Any,
-        response_model: type[T] | None = None,
-    ) -> T | str:
-        """
-        Parse LiteLLM response, with optional structured output parsing.
-
-        Args:
-            response: The response from litellm.completion/acompletion
-            response_model: Optional Pydantic model for structured output
-
-        Returns:
-            If response_model is provided, returns an instance of that model.
-            Otherwise, returns the raw string response.
-        """
-        content = response.choices[0].message.content
-
-        # When using structured outputs with Pydantic models, LiteLLM automatically
-        # parses the response. Parse the JSON string into the Pydantic model
-        if response_model is not None:
-            return response_model.model_validate_json(content)
-
-        return content
-
-    @_throttle.rate_limited(_rate_limiter)
-    def _call_model(
-        self,
-        model: str,
-        messages: list[dict[str, str]],
-        seed: int,
-        model_kwargs: dict[str, Any],
-        response_model: type[T] | None = None,
-    ) -> T | str:
-        """
-        Call the LLM model with optional structured output.
-
-        Args:
-            model: The model to use for the call
-            messages: List of message dictionaries with 'role' and 'content' keys
-            seed: Random seed for reproducibility
-            model_kwargs: Additional model parameters
-            response_model: Optional Pydantic model for structured output
-
-        Returns:
-            If response_model is provided, returns an instance of that model.
-            Otherwise, returns the raw string response.
-        """
-        self.increment_llm_counter()
-
-        final_params_for_litellm = self._prepare_model_params(
-            model_kwargs, response_model
-        )
-
-        response = litellm.completion(
-            model=model,
-            messages=messages,
-            seed=seed,
-            num_retries=6,
-            **final_params_for_litellm,
-        )
-
-        return self._parse_response(response, response_model)
-
     @_throttle.rate_limited(_rate_limiter)
     async def _call_model_async(
         self,
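For reference, constructing the optimizer under the 2.2.0 signature shown above would look roughly like the sketch below. This is based only on the parameters in this hunk; it assumes the class is re-exported from the package root (otherwise import it from opik_optimizer.hierarchical_reflective_optimizer), and the model name and parameter values are just examples.

from opik_optimizer import HierarchicalReflectiveOptimizer

optimizer = HierarchicalReflectiveOptimizer(
    model="gpt-4o",                          # optimizer's own reasoning/analysis model
    model_parameters={"temperature": 0.2},   # forwarded to the optimizer's internal LiteLLM calls
    max_parallel_batches=5,
    batch_size=25,
    convergence_threshold=0.01,              # stop when relative improvement drops below 1%
    n_threads=12,                            # evaluation parallelism (renamed from num_threads)
    verbose=1,
    seed=42,
)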
@@ -207,7 +108,10 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         response_model: type[T] | None = None,
     ) -> T | str:
         """
-
+        Adapter for async LLM calls with HierarchicalRootCauseAnalyzer signature.
+
+        This adapter translates the analyzer's expected signature to the base class
+        _call_model_async signature, ensuring project_name and tags are properly set.
 
         Args:
             model: The model to use for the call
@@ -220,22 +124,16 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             If response_model is provided, returns an instance of that model.
             Otherwise, returns the raw string response.
         """
-
-
-        final_params_for_litellm = self._prepare_model_params(
-            model_kwargs, response_model
-        )
-
-        response = await litellm.acompletion(
-            model=model,
+        # Call the base class async method which properly handles project_name and tags
+        return await super()._call_model_async(
             messages=messages,
+            model=model,
             seed=seed,
-
-
+            response_model=response_model,
+            is_reasoning=True,
+            **model_kwargs,
         )
 
-        return self._parse_response(response, response_model)
-
     def get_optimizer_metadata(self) -> dict[str, Any]:
         """
         Get metadata about the optimizer configuration.
@@ -244,9 +142,10 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             Dictionary containing optimizer-specific configuration
         """
         return {
-            "
-            "
+            "model": self.model,
+            "n_threads": self.n_threads,
             "max_parallel_batches": self.max_parallel_batches,
+            "convergence_threshold": self.convergence_threshold,
             "seed": self.seed,
             "verbose": self.verbose,
         }
@@ -323,6 +222,12 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
 
             cleaned_model_output = raw_model_output.strip()
 
+            # Add tags to trace for optimization tracking
+            if self.current_optimization_id:
+                opik_context.update_current_trace(
+                    tags=[self.current_optimization_id, "Evaluation"]
+                )
+
             result = {
                 mappers.EVALUATED_LLM_TASK_OUTPUT: cleaned_model_output,
             }
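The new block uses Opik's opik_context helper to tag the trace produced by the evaluation task. A minimal standalone sketch of the same pattern is shown below; the function name and tag values are illustrative and not part of the package.

import opik
from opik import opik_context

@opik.track
def evaluated_task(item: dict) -> dict:
    # Inside a tracked function, tag the current trace so evaluation runs
    # can be grouped under a given optimization id in the Opik UI.
    opik_context.update_current_trace(tags=["my-optimization-id", "Evaluation"])
    return {"output": "..."}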
@@ -337,10 +242,11 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             dataset=dataset,
             task=llm_task,
             scoring_metrics=[_create_metric_class(metric)],
-            task_threads=self.
+            task_threads=self.n_threads,
             nb_samples=n_samples,
             experiment_config=experiment_config,
             verbose=self.verbose,
+            project_name=self.project_name,
         )
 
         return result
@@ -396,10 +302,9 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         )
 
         improve_prompt_response = self._call_model(
-            model=self.reasoning_model,
             messages=[{"role": "user", "content": improve_prompt_prompt}],
+            model=self.model,
             seed=attempt_seed,
-            model_kwargs={},
             response_model=ImprovedPrompt,
         )
 
@@ -417,7 +322,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         n_samples: int | None,
         attempt: int,
         max_attempts: int,
-    ) -> tuple[chat_prompt.ChatPrompt, float]:
+    ) -> tuple[chat_prompt.ChatPrompt, float, EvaluationResult]:
         """
         Generate and evaluate a single improvement attempt for a failure mode.
 
@@ -434,7 +339,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             max_attempts: Total number of attempts
 
         Returns:
-            Tuple of (improved_prompt, improved_score)
+            Tuple of (improved_prompt, improved_score, improved_experiment_result)
         """
         # Generate improvement with progress indication
         with reporting.display_prompt_improvement(
@@ -454,7 +359,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             improved_chat_prompt = chat_prompt.ChatPrompt(
                 name=prompt.name,
                 messages=messages_as_dicts,
-                tools=
+                tools=best_prompt.tools,
+                function_map=best_prompt.function_map,
             )
 
             # Evaluate improved prompt
@@ -485,7 +391,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             ) / len(improved_experiment_result.test_results)
             improved_reporter.set_score(improved_score)
 
-            return improved_chat_prompt, improved_score
+            return improved_chat_prompt, improved_score, improved_experiment_result
 
     def optimize_prompt(
         self,
@@ -496,23 +402,28 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         n_samples: int | None = None,
         auto_continue: bool = False,
         agent_class: type[OptimizableAgent] | None = None,
+        project_name: str = "Optimization",
+        max_trials: int = DEFAULT_MAX_ITERATIONS,
         max_retries: int = 2,
+        *args: Any,
         **kwargs: Any,
     ) -> OptimizationResult:
         # Reset counters at the start of optimization
-        self.
-
-        # Configure prompt model if not set
-        self.configure_prompt_model(prompt)
+        self._reset_counters()
+        self._should_stop_optimization = False  # Reset stop flag
 
         # Setup agent class
-        self.agent_class = self.
+        self.agent_class = self._setup_agent_class(prompt, agent_class)
+
+        # Set project name from parameter
+        self.project_name = project_name
 
         optimization = self.opik_client.create_optimization(
             dataset_name=dataset.name,
             objective_name=getattr(metric, "__name__", str(metric)),
             metadata={"optimizer": self.__class__.__name__},
         )
+        self.current_optimization_id = optimization.id
         logger.debug(f"Created optimization with ID: {optimization.id}")
 
         reporting.display_header(
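Given the new parameters, a call to optimize_prompt in 2.2.0 looks roughly like the sketch below; prompt, dataset, and metric are placeholder objects (a chat_prompt.ChatPrompt, an Opik dataset, and a scoring callable) rather than anything defined in this diff.

result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=dataset,
    metric=metric,
    n_samples=100,
    project_name="Optimization",  # new: Opik project that receives traces and experiments
    max_trials=5,                 # new: hard budget on generate-and-evaluate trials
    max_retries=2,
)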
@@ -528,6 +439,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                 "n_samples": n_samples,
                 "auto_continue": auto_continue,
                 "max_retries": max_retries,
+                "max_trials": max_trials,
+                "convergence_threshold": self.convergence_threshold,
             },
             verbose=self.verbose,
             tools=getattr(prompt, "tools", None),
@@ -557,53 +470,82 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             prompt.get_messages()
         )  # Store copy of initial messages for diff
 
-        #
-
-
-
-        # Perform hierarchical root cause analysis
-        with reporting.display_root_cause_analysis(
-            verbose=self.verbose
-        ) as analysis_reporter:
-            hierarchical_analysis = self._hierarchical_root_cause_analysis(
-                experiment_result
-            )
-            analysis_reporter.set_completed(
-                total_test_cases=hierarchical_analysis.total_test_cases,
-                num_batches=hierarchical_analysis.num_batches,
-            )
+        # Multi-iteration optimization loop
+        iteration = 0
+        previous_iteration_score = initial_score
+        trials_used = 0
 
-
-
-
-
-
-
-
+        while trials_used < max_trials:
+            iteration += 1
+            logger.info(
+                f"Starting iteration {iteration} (trials: {trials_used}/{max_trials})"
+            )
+
+            # Check if we should stop (flag set by inner loops)
+            if self._should_stop_optimization:
+                logger.info(
+                    f"Stopping optimization: reached max_trials limit ({max_trials})."
                 )
+                break
+
+            with reporting.display_optimization_iteration(
+                iteration=iteration, verbose=self.verbose
+            ) as iteration_reporter:
+                # Perform hierarchical root cause analysis
+                with reporting.display_root_cause_analysis(
+                    verbose=self.verbose
+                ) as analysis_reporter:
+                    hierarchical_analysis = self._hierarchical_root_cause_analysis(
+                        experiment_result
+                    )
+                    analysis_reporter.set_completed(
+                        total_test_cases=hierarchical_analysis.total_test_cases,
+                        num_batches=hierarchical_analysis.num_batches,
+                    )
 
-
-
-
-
+                # Display hierarchical synthesis and failure modes
+                if self.verbose:
+                    reporting.display_hierarchical_synthesis(
+                        total_test_cases=hierarchical_analysis.total_test_cases,
+                        num_batches=hierarchical_analysis.num_batches,
+                        synthesis_notes=hierarchical_analysis.synthesis_notes,
+                        verbose=self.verbose,
+                    )
 
-
-
-
-        ):
-            logger.debug(
-                f"Addressing failure mode {idx}/{len(hierarchical_analysis.unified_failure_modes)}: {root_cause.name}"
+                    reporting.display_failure_modes(
+                        failure_modes=hierarchical_analysis.unified_failure_modes,
+                        verbose=self.verbose,
                    )
 
-        #
-
-
-
+                # Generate improved prompt for each failure mode
+                for idx, root_cause in enumerate(
+                    hierarchical_analysis.unified_failure_modes, 1
+                ):
+                    logger.debug(
+                        f"Addressing failure mode {idx}/{len(hierarchical_analysis.unified_failure_modes)}: {root_cause.name}"
+                    )
 
-
-
-            improved_chat_prompt
-
+                    # Try multiple attempts if needed
+                    max_attempts = max_retries + 1
+                    improved_chat_prompt = None
+                    improved_score = None
+
+                    for attempt in range(1, max_attempts + 1):
+                        # Check if we've reached the trial limit before starting a new trial
+                        if trials_used >= max_trials:
+                            logger.info(
+                                f"Reached max_trials limit ({max_trials}) during failure mode '{root_cause.name}'. "
+                                f"Stopping optimization."
+                            )
+                            self._should_stop_optimization = True
+                            break
+
+                        # Generate and evaluate improvement (this is 1 trial)
+                        (
+                            improved_chat_prompt,
+                            improved_score,
+                            improved_experiment_result,
+                        ) = self._generate_and_evaluate_improvement(
                             root_cause=root_cause,
                             best_prompt=best_prompt,
                             best_score=best_score,
@@ -615,64 +557,91 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                             attempt=attempt,
                             max_attempts=max_attempts,
                         )
-
+                        trials_used += 1
+
+                        # Check if we got improvement
+                        if improved_score > best_score:
+                            logger.info(
+                                f"Improvement found for '{root_cause.name}' on attempt {attempt}"
+                            )
+                            break
+
+                        # No improvement - should we retry?
+                        if attempt < max_attempts and trials_used < max_trials:
+                            reporting.display_retry_attempt(
+                                attempt=attempt,
+                                max_attempts=max_attempts,
+                                failure_mode_name=root_cause.name,
+                                verbose=self.verbose,
+                            )
+                        else:
+                            logger.debug(
+                                f"No improvement after {attempt} attempts for '{root_cause.name}'"
+                            )
+
+                    # Break out of failure mode loop if flag is set
+                    if self._should_stop_optimization:
+                        break
 
-            # Check if
-            if
-
-
+                    # Check if final result is an improvement
+                    if (
+                        improved_score is not None
+                        and improved_chat_prompt is not None
+                        and improved_score > best_score
+                    ):
+                        improvement = self._calculate_improvement(
+                            improved_score, best_score
                        )
-                break
 
-
-
-
-
-
-                    failure_mode_name=root_cause.name,
+                        # Display improvement for this iteration
+                        reporting.display_iteration_improvement(
+                            improvement=improvement,
+                            current_score=improved_score,
+                            best_score=best_score,
                             verbose=self.verbose,
                         )
+
+                        # Update best
+                        best_score = improved_score
+                        best_prompt = improved_chat_prompt
+                        best_messages = improved_chat_prompt.get_messages()
+                        experiment_result = improved_experiment_result
+                        logger.info(
+                            f"Updated best prompt after addressing '{root_cause.name}'"
+                        )
                    else:
                        logger.debug(
-                f"
+                            f"Keeping previous best prompt, no improvement from '{root_cause.name}'"
                        )
 
-        #
-
-
-
-
-        ):
-            improvement = self._calculate_improvement(
-                improved_score, best_score
-            )
-
-            # Display improvement for this iteration
-            reporting.display_iteration_improvement(
-                improvement=improvement,
-                current_score=improved_score,
-                best_score=best_score,
-                verbose=self.verbose,
-            )
+                # Mark iteration complete
+                improved_since_start = best_score > initial_score
+                iteration_reporter.iteration_complete(
+                    best_score=best_score, improved=improved_since_start
+                )
 
-
-
-
-
-            logger.info(
-                f"Updated best prompt after addressing '{root_cause.name}'"
-            )
-        else:
-            logger.debug(
-                f"Keeping previous best prompt, no improvement from '{root_cause.name}'"
-            )
+            # Check for convergence after iteration
+            iteration_improvement = self._calculate_improvement(
+                best_score, previous_iteration_score
+            )
 
-
-
-
-                best_score=best_score, improved=improved_since_start
+            logger.info(
+                f"Iteration {iteration} complete. Score: {best_score:.4f}, "
+                f"Improvement: {iteration_improvement:.2%}"
            )
 
+            # Stop if improvement is below convergence threshold
+            if abs(iteration_improvement) < self.convergence_threshold:
+                logger.info(
+                    f"Convergence achieved: improvement ({iteration_improvement:.2%}) "
+                    f"below threshold ({self.convergence_threshold:.2%}). "
+                    f"Stopping after {iteration} iterations."
+                )
+                break
+
+            # Update previous score for next iteration
+            previous_iteration_score = best_score
+
         # Display final optimization result with diff
         reporting.display_optimized_prompt_diff(
             initial_messages=initial_messages,
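The convergence check above depends on _calculate_improvement, whose implementation is not shown in this diff. Assuming it returns the relative change between the current and previous iteration scores, the stopping rule behaves like this small sketch (the helper name and sample scores are illustrative only):

def relative_improvement(new_score: float, old_score: float) -> float:
    # Assumed behaviour of _calculate_improvement: relative change vs. the previous iteration.
    if old_score == 0:
        return 0.0 if new_score == 0 else float("inf")
    return (new_score - old_score) / old_score

convergence_threshold = 0.01  # DEFAULT_CONVERGENCE_THRESHOLD
improvement = relative_improvement(new_score=0.842, old_score=0.838)  # ~0.48%
if abs(improvement) < convergence_threshold:
    print(f"Converged: {improvement:.2%} is below {convergence_threshold:.2%}")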
@@ -682,25 +651,32 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             verbose=self.verbose,
         )
 
+        # Update optimization status to completed
+        try:
+            optimization.update(status="completed")
+            logger.info(f"Optimization {optimization.id} status updated to completed.")
+        except Exception as e:
+            logger.warning(f"Failed to update optimization status: {e}")
+
         # Prepare details for the result
         details = {
-            "
-            "
+            "model": self.model,
+            "temperature": (best_prompt.model_kwargs or {}).get("temperature")
+            or self.model_parameters.get("temperature"),
+            "n_threads": self.n_threads,
             "max_parallel_batches": self.max_parallel_batches,
             "max_retries": max_retries,
             "n_samples": n_samples,
             "auto_continue": auto_continue,
+            "max_trials": max_trials,
+            "convergence_threshold": self.convergence_threshold,
+            "iterations_completed": iteration,
+            "trials_used": trials_used,
         }
 
         # Extract tool prompts if tools exist
-
-
-        tool_prompts = {
-            tool.get("function", {}).get("name", f"tool_{idx}"): tool.get(
-                "function", {}
-            ).get("description", "")
-            for idx, tool in enumerate(final_tools)
-        }
+        final_tools = getattr(best_prompt, "tools", None)
+        tool_prompts = self._extract_tool_prompts(final_tools)
 
         return OptimizationResult(
             optimizer=self.__class__.__name__,