opik-optimizer 2.1.2__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. opik_optimizer/__init__.py +2 -2
  2. opik_optimizer/base_optimizer.py +314 -145
  3. opik_optimizer/evolutionary_optimizer/crossover_ops.py +31 -4
  4. opik_optimizer/evolutionary_optimizer/evaluation_ops.py +23 -3
  5. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +122 -95
  6. opik_optimizer/evolutionary_optimizer/mcp.py +11 -6
  7. opik_optimizer/evolutionary_optimizer/mutation_ops.py +25 -5
  8. opik_optimizer/evolutionary_optimizer/population_ops.py +26 -10
  9. opik_optimizer/evolutionary_optimizer/reporting.py +5 -5
  10. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +53 -99
  11. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +4 -4
  12. opik_optimizer/gepa_optimizer/gepa_optimizer.py +183 -172
  13. opik_optimizer/gepa_optimizer/reporting.py +164 -22
  14. opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +221 -245
  15. opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py +38 -14
  16. opik_optimizer/hierarchical_reflective_optimizer/prompts.py +7 -1
  17. opik_optimizer/hierarchical_reflective_optimizer/reporting.py +287 -132
  18. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +185 -205
  19. opik_optimizer/meta_prompt_optimizer/reporting.py +4 -4
  20. opik_optimizer/mipro_optimizer/__init__.py +2 -2
  21. opik_optimizer/mipro_optimizer/_lm.py +4 -4
  22. opik_optimizer/mipro_optimizer/{_mipro_optimizer_v2.py → mipro_optimizer_v2.py} +1 -7
  23. opik_optimizer/mipro_optimizer/utils.py +1 -0
  24. opik_optimizer/multi_metric_objective.py +33 -0
  25. opik_optimizer/optimizable_agent.py +7 -4
  26. opik_optimizer/optimization_config/chat_prompt.py +7 -10
  27. opik_optimizer/parameter_optimizer/parameter_optimizer.py +188 -40
  28. opik_optimizer/parameter_optimizer/reporting.py +148 -0
  29. opik_optimizer/reporting_utils.py +42 -15
  30. opik_optimizer/task_evaluator.py +26 -9
  31. opik_optimizer/utils/core.py +16 -2
  32. opik_optimizer/utils/prompt_segments.py +1 -2
  33. {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/METADATA +2 -3
  34. {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/RECORD +37 -37
  35. opik_optimizer/evolutionary_optimizer/llm_support.py +0 -136
  36. opik_optimizer/mipro_optimizer/mipro_optimizer.py +0 -680
  37. {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/WHEEL +0 -0
  38. {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/licenses/LICENSE +0 -0
  39. {opik_optimizer-2.1.2.dist-info → opik_optimizer-2.2.0.dist-info}/top_level.txt +0 -0
opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py

@@ -1,4 +1,3 @@
-from opik.environment import get_tqdm_for_current_environment
 import os
 import logging
 
@@ -6,8 +5,8 @@ import opik
 import litellm
 from litellm.caching import Cache
 from litellm.types.caching import LiteLLMCacheType
+from opik import opik_context
 from opik.evaluation.evaluation_result import EvaluationResult
-from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
 from opik.evaluation import evaluator as opik_evaluator
 
 from typing import Any, TypeVar
@@ -29,8 +28,6 @@ from .types import (
 )
 from .prompts import IMPROVE_PROMPT_TEMPLATE
 
-tqdm = get_tqdm_for_current_environment()
-
 # Using disk cache for LLM calls
 disk_cache_dir = os.path.expanduser("~/.litellm_cache")
 litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=disk_cache_dir)
@@ -54,149 +51,53 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
     complex prompt that you want to systematically refine based on understanding why it fails.
 
     Args:
-        reasoning_model: LiteLLM model name for reasoning and analysis (default: "openai/gpt-4.1")
-        num_threads: Number of parallel threads for evaluation (default: 12)
-        verbose: Controls internal logging/progress bars (0=off, 1=on) (default: 1)
-        seed: Random seed for reproducibility (default: 42)
+        model: LiteLLM model name for the optimization algorithm (reasoning and analysis)
+        model_parameters: Optional dict of LiteLLM parameters for optimizer's internal LLM calls.
+            Common params: temperature, max_tokens, max_completion_tokens, top_p.
+            See: https://docs.litellm.ai/docs/completion/input
         max_parallel_batches: Maximum number of batches to process concurrently during
-            hierarchical root cause analysis (default: 5)
-        batch_size: Number of test cases per batch for root cause analysis (default: 25)
-        **model_kwargs: Additional arguments passed to the LLM model
+            hierarchical root cause analysis
+        batch_size: Number of test cases per batch for root cause analysis
+        convergence_threshold: Stop if relative improvement is below this threshold
+        n_threads: Number of parallel threads for evaluation
+        verbose: Controls internal logging/progress bars (0=off, 1=on)
+        seed: Random seed for reproducibility
     """
 
     DEFAULT_ROUNDS = 10
+    DEFAULT_MAX_ITERATIONS = 5
+    DEFAULT_CONVERGENCE_THRESHOLD = 0.01  # Stop if improvement is less than 1%
 
     def __init__(
         self,
-        reasoning_model: str = "openai/gpt-4.1",
-        num_threads: int = 12,
-        verbose: int = 1,
-        seed: int = 42,
+        model: str = "gpt-4o",
+        model_parameters: dict[str, Any] | None = None,
         max_parallel_batches: int = 5,
         batch_size: int = 25,
-        **model_kwargs: Any,
+        convergence_threshold: float = DEFAULT_CONVERGENCE_THRESHOLD,
+        n_threads: int = 12,
+        verbose: int = 1,
+        seed: int = 42,
     ):
         super().__init__(
-            model=reasoning_model, verbose=verbose, seed=seed, **model_kwargs
+            model=model, verbose=verbose, seed=seed, model_parameters=model_parameters
         )
-        self.reasoning_model = reasoning_model
-        self.num_threads = num_threads
+        self.n_threads = n_threads
         self.max_parallel_batches = max_parallel_batches
         self.batch_size = batch_size
+        self.convergence_threshold = convergence_threshold
+        self._should_stop_optimization = False  # Flag to exit all loops
 
         # Initialize hierarchical analyzer
         self._hierarchical_analyzer = HierarchicalRootCauseAnalyzer(
             call_model_fn=self._call_model_async,
-            reasoning_model=self.reasoning_model,
+            reasoning_model=self.model,
             seed=self.seed,
             max_parallel_batches=self.max_parallel_batches,
             batch_size=self.batch_size,
             verbose=self.verbose,
         )
 
-    def _prepare_model_params(
-        self,
-        model_kwargs: dict[str, Any],
-        response_model: type[T] | None = None,
-    ) -> dict[str, Any]:
-        """
-        Prepare parameters for LiteLLM call by filtering and adding monitoring.
-
-        Args:
-            model_kwargs: Additional model parameters
-            response_model: Optional Pydantic model for structured output
-
-        Returns:
-            Dictionary of parameters ready for litellm.completion/acompletion
-        """
-        current_model_kwargs = self.model_kwargs.copy()
-        current_model_kwargs.update(model_kwargs)
-
-        # Filter out optimizer-specific kwargs that shouldn't be passed to LiteLLM
-        filtered_call_kwargs = current_model_kwargs.copy()
-        filtered_call_kwargs.pop("n_trials", None)
-        filtered_call_kwargs.pop("n_samples", None)
-        filtered_call_kwargs.pop("n_iterations", None)
-        filtered_call_kwargs.pop("min_examples", None)
-        filtered_call_kwargs.pop("max_examples", None)
-        filtered_call_kwargs.pop("project_name", None)
-
-        final_params_for_litellm = (
-            opik_litellm_monitor.try_add_opik_monitoring_to_params(filtered_call_kwargs)
-        )
-
-        # Add structured output support if response_model is provided
-        # According to LiteLLM docs: https://docs.litellm.ai/docs/completion/json_mode
-        # Pass the Pydantic model directly to response_format
-        if response_model is not None:
-            final_params_for_litellm["response_format"] = response_model
-
-        return final_params_for_litellm
-
-    def _parse_response(
-        self,
-        response: Any,
-        response_model: type[T] | None = None,
-    ) -> T | str:
-        """
-        Parse LiteLLM response, with optional structured output parsing.
-
-        Args:
-            response: The response from litellm.completion/acompletion
-            response_model: Optional Pydantic model for structured output
-
-        Returns:
-            If response_model is provided, returns an instance of that model.
-            Otherwise, returns the raw string response.
-        """
-        content = response.choices[0].message.content
-
-        # When using structured outputs with Pydantic models, LiteLLM automatically
-        # parses the response. Parse the JSON string into the Pydantic model
-        if response_model is not None:
-            return response_model.model_validate_json(content)
-
-        return content
-
-    @_throttle.rate_limited(_rate_limiter)
-    def _call_model(
-        self,
-        model: str,
-        messages: list[dict[str, str]],
-        seed: int,
-        model_kwargs: dict[str, Any],
-        response_model: type[T] | None = None,
-    ) -> T | str:
-        """
-        Call the LLM model with optional structured output.
-
-        Args:
-            model: The model to use for the call
-            messages: List of message dictionaries with 'role' and 'content' keys
-            seed: Random seed for reproducibility
-            model_kwargs: Additional model parameters
-            response_model: Optional Pydantic model for structured output
-
-        Returns:
-            If response_model is provided, returns an instance of that model.
-            Otherwise, returns the raw string response.
-        """
-        self.increment_llm_counter()
-
-        final_params_for_litellm = self._prepare_model_params(
-            model_kwargs, response_model
-        )
-
-        response = litellm.completion(
-            model=model,
-            messages=messages,
-            seed=seed,
-            num_retries=6,
-            **final_params_for_litellm,
-        )
-
-        return self._parse_response(response, response_model)
-
     @_throttle.rate_limited(_rate_limiter)
     async def _call_model_async(
         self,
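The constructor rework above is the headline API change in this release: `reasoning_model`/`num_threads` become `model`/`n_threads`, the `**model_kwargs` catch-all is replaced by an explicit `model_parameters` dict, and `convergence_threshold` is new. A minimal before/after sketch of calling code, assuming the optimizer is re-exported from the package root like the other optimizers; parameter values are illustrative, not package defaults:

```python
from opik_optimizer import HierarchicalReflectiveOptimizer

# 2.1.2
optimizer = HierarchicalReflectiveOptimizer(
    reasoning_model="openai/gpt-4.1",
    num_threads=12,
    temperature=0.0,  # was absorbed by **model_kwargs
)

# 2.2.0
optimizer = HierarchicalReflectiveOptimizer(
    model="gpt-4o",
    model_parameters={"temperature": 0.0},  # explicit LiteLLM params
    n_threads=12,
    convergence_threshold=0.01,  # new: stop below 1% relative improvement
)
```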
@@ -207,7 +108,10 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         response_model: type[T] | None = None,
     ) -> T | str:
         """
-        Async version of _call_model using litellm.acompletion.
+        Adapter for async LLM calls with HierarchicalRootCauseAnalyzer signature.
+
+        This adapter translates the analyzer's expected signature to the base class
+        _call_model_async signature, ensuring project_name and tags are properly set.
 
         Args:
             model: The model to use for the call
@@ -220,22 +124,16 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             If response_model is provided, returns an instance of that model.
             Otherwise, returns the raw string response.
         """
-        self.increment_llm_counter()
-
-        final_params_for_litellm = self._prepare_model_params(
-            model_kwargs, response_model
-        )
-
-        response = await litellm.acompletion(
-            model=model,
+        # Call the base class async method which properly handles project_name and tags
+        return await super()._call_model_async(
             messages=messages,
+            model=model,
             seed=seed,
-            num_retries=6,
-            **final_params_for_litellm,
+            response_model=response_model,
+            is_reasoning=True,
+            **model_kwargs,
         )
 
-        return self._parse_response(response, response_model)
-
     def get_optimizer_metadata(self) -> dict[str, Any]:
         """
         Get metadata about the optimizer configuration.
@@ -244,9 +142,10 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             Dictionary containing optimizer-specific configuration
         """
         return {
-            "reasoning_model": self.reasoning_model,
-            "num_threads": self.num_threads,
+            "model": self.model,
+            "n_threads": self.n_threads,
             "max_parallel_batches": self.max_parallel_batches,
+            "convergence_threshold": self.convergence_threshold,
             "seed": self.seed,
             "verbose": self.verbose,
         }
@@ -323,6 +222,12 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
 
             cleaned_model_output = raw_model_output.strip()
 
+            # Add tags to trace for optimization tracking
+            if self.current_optimization_id:
+                opik_context.update_current_trace(
+                    tags=[self.current_optimization_id, "Evaluation"]
+                )
+
             result = {
                 mappers.EVALUATED_LLM_TASK_OUTPUT: cleaned_model_output,
             }
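The tagging block added here uses Opik's `opik_context.update_current_trace` to stamp each evaluation trace with the optimization run id. A standalone sketch of the same pattern, assuming the public `opik.track` decorator; the function body and tag values are hypothetical:

```python
import opik
from opik import opik_context

@opik.track
def llm_task(dataset_item: dict) -> dict:
    # A tracked call has an active trace; tag it so evaluation traces can be
    # filtered by optimization run in the Opik UI.
    opik_context.update_current_trace(tags=["optimization-1234", "Evaluation"])
    return {"output": "..."}
```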
@@ -337,10 +242,11 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             dataset=dataset,
             task=llm_task,
             scoring_metrics=[_create_metric_class(metric)],
-            task_threads=self.num_threads,
+            task_threads=self.n_threads,
             nb_samples=n_samples,
             experiment_config=experiment_config,
             verbose=self.verbose,
+            project_name=self.project_name,
         )
 
         return result
@@ -396,10 +302,9 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         )
 
         improve_prompt_response = self._call_model(
-            model=self.reasoning_model,
             messages=[{"role": "user", "content": improve_prompt_prompt}],
+            model=self.model,
             seed=attempt_seed,
-            model_kwargs={},
             response_model=ImprovedPrompt,
         )
 
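`_call_model` now comes from the base class, but the structured-output contract is the one the deleted helpers implemented (see the removed `_prepare_model_params`/`_parse_response` above): pass the Pydantic class to LiteLLM's `response_format` and validate the returned JSON string. A minimal sketch of that round trip; the `ImprovedPrompt` fields here are hypothetical:

```python
import litellm
from pydantic import BaseModel

class ImprovedPrompt(BaseModel):
    reasoning: str                  # hypothetical field
    messages: list[dict[str, str]]  # hypothetical field

response = litellm.completion(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Improve this prompt ..."}],
    response_format=ImprovedPrompt,  # LiteLLM JSON mode with a Pydantic model
)
# LiteLLM returns the structured output as a JSON string; validate it back
# into the Pydantic model, as the removed _parse_response did.
improved = ImprovedPrompt.model_validate_json(response.choices[0].message.content)
```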
@@ -417,7 +322,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         n_samples: int | None,
         attempt: int,
         max_attempts: int,
-    ) -> tuple[chat_prompt.ChatPrompt, float]:
+    ) -> tuple[chat_prompt.ChatPrompt, float, EvaluationResult]:
         """
         Generate and evaluate a single improvement attempt for a failure mode.
 
@@ -434,7 +339,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             max_attempts: Total number of attempts
 
         Returns:
-            Tuple of (improved_prompt, improved_score)
+            Tuple of (improved_prompt, improved_score, improved_experiment_result)
         """
         # Generate improvement with progress indication
         with reporting.display_prompt_improvement(
@@ -454,7 +359,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         improved_chat_prompt = chat_prompt.ChatPrompt(
             name=prompt.name,
             messages=messages_as_dicts,
-            tools=prompt.tools,
+            tools=best_prompt.tools,
+            function_map=best_prompt.function_map,
         )
 
         # Evaluate improved prompt
@@ -485,7 +391,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             ) / len(improved_experiment_result.test_results)
             improved_reporter.set_score(improved_score)
 
-        return improved_chat_prompt, improved_score
+        return improved_chat_prompt, improved_score, improved_experiment_result
 
     def optimize_prompt(
         self,
@@ -496,23 +402,28 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         n_samples: int | None = None,
         auto_continue: bool = False,
         agent_class: type[OptimizableAgent] | None = None,
+        project_name: str = "Optimization",
+        max_trials: int = DEFAULT_MAX_ITERATIONS,
         max_retries: int = 2,
+        *args: Any,
         **kwargs: Any,
     ) -> OptimizationResult:
         # Reset counters at the start of optimization
-        self.reset_counters()
-
-        # Configure prompt model if not set
-        self.configure_prompt_model(prompt)
+        self._reset_counters()
+        self._should_stop_optimization = False  # Reset stop flag
 
         # Setup agent class
-        self.agent_class = self.setup_agent_class(prompt, agent_class)
+        self.agent_class = self._setup_agent_class(prompt, agent_class)
+
+        # Set project name from parameter
+        self.project_name = project_name
 
         optimization = self.opik_client.create_optimization(
             dataset_name=dataset.name,
             objective_name=getattr(metric, "__name__", str(metric)),
             metadata={"optimizer": self.__class__.__name__},
         )
+        self.current_optimization_id = optimization.id
         logger.debug(f"Created optimization with ID: {optimization.id}")
 
         reporting.display_header(
@@ -528,6 +439,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                 "n_samples": n_samples,
                 "auto_continue": auto_continue,
                 "max_retries": max_retries,
+                "max_trials": max_trials,
+                "convergence_threshold": self.convergence_threshold,
             },
             verbose=self.verbose,
             tools=getattr(prompt, "tools", None),
@@ -557,53 +470,82 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             prompt.get_messages()
         )  # Store copy of initial messages for diff
 
-        # Iteration 1: Analyze and improve (structure ready for future multi-iteration support)
-        with reporting.display_optimization_iteration(
-            iteration=1, verbose=self.verbose
-        ) as iteration_reporter:
-            # Perform hierarchical root cause analysis
-            with reporting.display_root_cause_analysis(
-                verbose=self.verbose
-            ) as analysis_reporter:
-                hierarchical_analysis = self._hierarchical_root_cause_analysis(
-                    experiment_result
-                )
-                analysis_reporter.set_completed(
-                    total_test_cases=hierarchical_analysis.total_test_cases,
-                    num_batches=hierarchical_analysis.num_batches,
-                )
+        # Multi-iteration optimization loop
+        iteration = 0
+        previous_iteration_score = initial_score
+        trials_used = 0
 
-            # Display hierarchical synthesis and failure modes
-            if self.verbose:
-                reporting.display_hierarchical_synthesis(
-                    total_test_cases=hierarchical_analysis.total_test_cases,
-                    num_batches=hierarchical_analysis.num_batches,
-                    synthesis_notes=hierarchical_analysis.synthesis_notes,
-                    verbose=self.verbose,
+        while trials_used < max_trials:
+            iteration += 1
+            logger.info(
+                f"Starting iteration {iteration} (trials: {trials_used}/{max_trials})"
+            )
+
+            # Check if we should stop (flag set by inner loops)
+            if self._should_stop_optimization:
+                logger.info(
+                    f"Stopping optimization: reached max_trials limit ({max_trials})."
                 )
+                break
+
+            with reporting.display_optimization_iteration(
+                iteration=iteration, verbose=self.verbose
+            ) as iteration_reporter:
+                # Perform hierarchical root cause analysis
+                with reporting.display_root_cause_analysis(
+                    verbose=self.verbose
+                ) as analysis_reporter:
+                    hierarchical_analysis = self._hierarchical_root_cause_analysis(
+                        experiment_result
+                    )
+                    analysis_reporter.set_completed(
+                        total_test_cases=hierarchical_analysis.total_test_cases,
+                        num_batches=hierarchical_analysis.num_batches,
+                    )
 
-            reporting.display_failure_modes(
-                failure_modes=hierarchical_analysis.unified_failure_modes,
-                verbose=self.verbose,
-            )
+                # Display hierarchical synthesis and failure modes
+                if self.verbose:
+                    reporting.display_hierarchical_synthesis(
+                        total_test_cases=hierarchical_analysis.total_test_cases,
+                        num_batches=hierarchical_analysis.num_batches,
+                        synthesis_notes=hierarchical_analysis.synthesis_notes,
+                        verbose=self.verbose,
+                    )
 
-            # Generate improved prompt for each failure mode
-            for idx, root_cause in enumerate(
-                hierarchical_analysis.unified_failure_modes, 1
-            ):
-                logger.debug(
-                    f"Addressing failure mode {idx}/{len(hierarchical_analysis.unified_failure_modes)}: {root_cause.name}"
+                reporting.display_failure_modes(
+                    failure_modes=hierarchical_analysis.unified_failure_modes,
+                    verbose=self.verbose,
                 )
 
-                # Try multiple attempts if needed
-                max_attempts = max_retries + 1
-                improved_chat_prompt = None
-                improved_score = None
+                # Generate improved prompt for each failure mode
+                for idx, root_cause in enumerate(
+                    hierarchical_analysis.unified_failure_modes, 1
+                ):
+                    logger.debug(
+                        f"Addressing failure mode {idx}/{len(hierarchical_analysis.unified_failure_modes)}: {root_cause.name}"
+                    )
 
-                for attempt in range(1, max_attempts + 1):
-                    # Generate and evaluate improvement
-                    improved_chat_prompt, improved_score = (
-                        self._generate_and_evaluate_improvement(
+                    # Try multiple attempts if needed
+                    max_attempts = max_retries + 1
+                    improved_chat_prompt = None
+                    improved_score = None
+
+                    for attempt in range(1, max_attempts + 1):
+                        # Check if we've reached the trial limit before starting a new trial
+                        if trials_used >= max_trials:
+                            logger.info(
+                                f"Reached max_trials limit ({max_trials}) during failure mode '{root_cause.name}'. "
+                                f"Stopping optimization."
+                            )
+                            self._should_stop_optimization = True
+                            break
+
+                        # Generate and evaluate improvement (this is 1 trial)
+                        (
+                            improved_chat_prompt,
+                            improved_score,
+                            improved_experiment_result,
+                        ) = self._generate_and_evaluate_improvement(
                             root_cause=root_cause,
                             best_prompt=best_prompt,
                             best_score=best_score,
@@ -615,64 +557,91 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                             attempt=attempt,
                             max_attempts=max_attempts,
                         )
-                    )
+                        trials_used += 1
+
+                        # Check if we got improvement
+                        if improved_score > best_score:
+                            logger.info(
+                                f"Improvement found for '{root_cause.name}' on attempt {attempt}"
+                            )
+                            break
+
+                        # No improvement - should we retry?
+                        if attempt < max_attempts and trials_used < max_trials:
+                            reporting.display_retry_attempt(
+                                attempt=attempt,
+                                max_attempts=max_attempts,
+                                failure_mode_name=root_cause.name,
+                                verbose=self.verbose,
+                            )
+                        else:
+                            logger.debug(
+                                f"No improvement after {attempt} attempts for '{root_cause.name}'"
+                            )
+
+                    # Break out of failure mode loop if flag is set
+                    if self._should_stop_optimization:
+                        break
 
-                    # Check if we got improvement
-                    if improved_score > best_score:
-                        logger.info(
-                            f"Improvement found for '{root_cause.name}' on attempt {attempt}"
+                    # Check if final result is an improvement
+                    if (
+                        improved_score is not None
+                        and improved_chat_prompt is not None
+                        and improved_score > best_score
+                    ):
+                        improvement = self._calculate_improvement(
+                            improved_score, best_score
                         )
-                        break
 
-                    # No improvement - should we retry?
-                    if attempt < max_attempts:
-                        reporting.display_retry_attempt(
-                            attempt=attempt,
-                            max_attempts=max_attempts,
-                            failure_mode_name=root_cause.name,
+                        # Display improvement for this iteration
+                        reporting.display_iteration_improvement(
+                            improvement=improvement,
+                            current_score=improved_score,
+                            best_score=best_score,
                             verbose=self.verbose,
                         )
+
+                        # Update best
+                        best_score = improved_score
+                        best_prompt = improved_chat_prompt
+                        best_messages = improved_chat_prompt.get_messages()
+                        experiment_result = improved_experiment_result
+                        logger.info(
+                            f"Updated best prompt after addressing '{root_cause.name}'"
+                        )
                     else:
                         logger.debug(
-                            f"No improvement after {attempt} attempts for '{root_cause.name}'"
+                            f"Keeping previous best prompt, no improvement from '{root_cause.name}'"
                         )
 
-                # Check if final result is an improvement
-                if (
-                    improved_score is not None
-                    and improved_chat_prompt is not None
-                    and improved_score > best_score
-                ):
-                    improvement = self._calculate_improvement(
-                        improved_score, best_score
-                    )
-
-                    # Display improvement for this iteration
-                    reporting.display_iteration_improvement(
-                        improvement=improvement,
-                        current_score=improved_score,
-                        best_score=best_score,
-                        verbose=self.verbose,
-                    )
+                # Mark iteration complete
+                improved_since_start = best_score > initial_score
+                iteration_reporter.iteration_complete(
+                    best_score=best_score, improved=improved_since_start
+                )
 
-                    # Update best
-                    best_score = improved_score
-                    best_prompt = improved_chat_prompt
-                    best_messages = improved_chat_prompt.get_messages()
-                    logger.info(
-                        f"Updated best prompt after addressing '{root_cause.name}'"
-                    )
-                else:
-                    logger.debug(
-                        f"Keeping previous best prompt, no improvement from '{root_cause.name}'"
-                    )
+                # Check for convergence after iteration
+                iteration_improvement = self._calculate_improvement(
+                    best_score, previous_iteration_score
+                )
 
-            # Mark iteration complete
-            improved_since_start = best_score > initial_score
-            iteration_reporter.iteration_complete(
-                best_score=best_score, improved=improved_since_start
+            logger.info(
+                f"Iteration {iteration} complete. Score: {best_score:.4f}, "
+                f"Improvement: {iteration_improvement:.2%}"
            )
 
+            # Stop if improvement is below convergence threshold
+            if abs(iteration_improvement) < self.convergence_threshold:
+                logger.info(
+                    f"Convergence achieved: improvement ({iteration_improvement:.2%}) "
+                    f"below threshold ({self.convergence_threshold:.2%}). "
+                    f"Stopping after {iteration} iterations."
+                )
+                break
+
+            # Update previous score for next iteration
+            previous_iteration_score = best_score
+
         # Display final optimization result with diff
         reporting.display_optimized_prompt_diff(
@@ -682,25 +651,32 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             verbose=self.verbose,
         )
 
+        # Update optimization status to completed
+        try:
+            optimization.update(status="completed")
+            logger.info(f"Optimization {optimization.id} status updated to completed.")
+        except Exception as e:
+            logger.warning(f"Failed to update optimization status: {e}")
+
         # Prepare details for the result
         details = {
-            "reasoning_model": self.reasoning_model,
-            "num_threads": self.num_threads,
+            "model": self.model,
+            "temperature": (best_prompt.model_kwargs or {}).get("temperature")
+            or self.model_parameters.get("temperature"),
+            "n_threads": self.n_threads,
             "max_parallel_batches": self.max_parallel_batches,
             "max_retries": max_retries,
             "n_samples": n_samples,
             "auto_continue": auto_continue,
+            "max_trials": max_trials,
+            "convergence_threshold": self.convergence_threshold,
+            "iterations_completed": iteration,
+            "trials_used": trials_used,
         }
 
         # Extract tool prompts if tools exist
-        tool_prompts = None
-        if final_tools := getattr(best_prompt, "tools", None):
-            tool_prompts = {
-                tool.get("function", {}).get("name", f"tool_{idx}"): tool.get(
-                    "function", {}
-                ).get("description", "")
-                for idx, tool in enumerate(final_tools)
-            }
+        final_tools = getattr(best_prompt, "tools", None)
+        tool_prompts = self._extract_tool_prompts(final_tools)
 
         return OptimizationResult(
             optimizer=self.__class__.__name__,
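Taken together, a hedged end-to-end sketch of the 2.2.0 call surface. The leading `prompt`/`dataset`/`metric` parameters are not visible in this diff and are assumed to follow the usual opik_optimizer convention; the dataset name and metric here are placeholders:

```python
import opik
from opik_optimizer import ChatPrompt, HierarchicalReflectiveOptimizer

def exact_match(dataset_item: dict, llm_output: str) -> float:
    # Placeholder metric: 1.0 when the output equals the expected label.
    return float(llm_output.strip() == dataset_item["label"])

prompt = ChatPrompt(messages=[
    {"role": "system", "content": "You are a careful classifier."},
    {"role": "user", "content": "{text}"},
])

optimizer = HierarchicalReflectiveOptimizer(model="gpt-4o", n_threads=12)
result = optimizer.optimize_prompt(
    prompt=prompt,
    dataset=opik.Opik().get_dataset("my-dataset"),  # placeholder name
    metric=exact_match,
    project_name="Optimization",  # new in 2.2.0
    max_trials=5,                 # new in 2.2.0
    max_retries=2,
)
```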