opik-optimizer 2.1.1__tar.gz → 2.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/PKG-INFO +1 -1
  2. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/pyproject.toml +1 -1
  3. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/__init__.py +2 -0
  4. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/base_optimizer.py +2 -0
  5. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/gepa_optimizer/gepa_optimizer.py +3 -3
  6. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +146 -93
  7. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py +38 -14
  8. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/hierarchical_reflective_optimizer/reporting.py +127 -65
  9. opik_optimizer-2.1.3/src/opik_optimizer/multi_metric_objective.py +33 -0
  10. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/task_evaluator.py +26 -9
  11. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer.egg-info/PKG-INFO +1 -1
  12. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer.egg-info/SOURCES.txt +1 -0
  13. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/LICENSE +0 -0
  14. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/README.md +0 -0
  15. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/setup.cfg +0 -0
  16. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/setup.py +0 -0
  17. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/_throttle.py +0 -0
  18. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/cache_config.py +0 -0
  19. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/data/context7_eval.jsonl +0 -0
  20. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/data/hotpot-500.json +0 -0
  21. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/__init__.py +0 -0
  22. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/ai2_arc.py +0 -0
  23. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/cnn_dailymail.py +0 -0
  24. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/context7_eval.py +0 -0
  25. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/election_questions.py +0 -0
  26. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/gsm8k.py +0 -0
  27. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/halu_eval.py +0 -0
  28. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/hotpot_qa.py +0 -0
  29. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/medhallu.py +0 -0
  30. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/rag_hallucinations.py +0 -0
  31. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/ragbench.py +0 -0
  32. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/tiny_test.py +0 -0
  33. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/datasets/truthful_qa.py +0 -0
  34. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/demo/__init__.py +0 -0
  35. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/demo/cache.py +0 -0
  36. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/demo/datasets.py +0 -0
  37. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/__init__.py +0 -0
  38. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/crossover_ops.py +0 -0
  39. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/evaluation_ops.py +0 -0
  40. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +0 -0
  41. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/helpers.py +0 -0
  42. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/llm_support.py +0 -0
  43. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/mcp.py +0 -0
  44. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/mutation_ops.py +0 -0
  45. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/population_ops.py +0 -0
  46. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/prompts.py +0 -0
  47. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/reporting.py +0 -0
  48. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/evolutionary_optimizer/style_ops.py +0 -0
  49. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/few_shot_bayesian_optimizer/__init__.py +0 -0
  50. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +0 -0
  51. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/few_shot_bayesian_optimizer/reporting.py +0 -0
  52. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/gepa_optimizer/__init__.py +0 -0
  53. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/gepa_optimizer/adapter.py +0 -0
  54. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/gepa_optimizer/reporting.py +0 -0
  55. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/hierarchical_reflective_optimizer/__init__.py +0 -0
  56. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/hierarchical_reflective_optimizer/prompts.py +0 -0
  57. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/hierarchical_reflective_optimizer/types.py +0 -0
  58. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/logging_config.py +0 -0
  59. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mcp_utils/__init__.py +0 -0
  60. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mcp_utils/mcp.py +0 -0
  61. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mcp_utils/mcp_second_pass.py +0 -0
  62. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mcp_utils/mcp_simulator.py +0 -0
  63. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mcp_utils/mcp_workflow.py +0 -0
  64. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/meta_prompt_optimizer/__init__.py +0 -0
  65. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +0 -0
  66. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/meta_prompt_optimizer/reporting.py +0 -0
  67. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mipro_optimizer/__init__.py +0 -0
  68. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mipro_optimizer/_lm.py +0 -0
  69. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +0 -0
  70. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mipro_optimizer/mipro_optimizer.py +0 -0
  71. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/mipro_optimizer/utils.py +0 -0
  72. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/optimizable_agent.py +0 -0
  73. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/optimization_config/__init__.py +0 -0
  74. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/optimization_config/chat_prompt.py +0 -0
  75. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/optimization_config/configs.py +0 -0
  76. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/optimization_config/mappers.py +0 -0
  77. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/optimization_result.py +0 -0
  78. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/parameter_optimizer/__init__.py +0 -0
  79. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/parameter_optimizer/parameter_optimizer.py +0 -0
  80. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/parameter_optimizer/parameter_search_space.py +0 -0
  81. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/parameter_optimizer/parameter_spec.py +0 -0
  82. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/parameter_optimizer/search_space_types.py +0 -0
  83. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/parameter_optimizer/sensitivity_analysis.py +0 -0
  84. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/py.typed +0 -0
  85. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/reporting_utils.py +0 -0
  86. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/utils/__init__.py +0 -0
  87. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/utils/colbert.py +0 -0
  88. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/utils/core.py +0 -0
  89. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/utils/dataset_utils.py +0 -0
  90. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer/utils/prompt_segments.py +0 -0
  91. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer.egg-info/dependency_links.txt +0 -0
  92. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer.egg-info/requires.txt +0 -0
  93. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/src/opik_optimizer.egg-info/top_level.txt +0 -0
  94. {opik_optimizer-2.1.1 → opik_optimizer-2.1.3}/tests/test_setup.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: opik_optimizer
-Version: 2.1.1
+Version: 2.1.3
 Summary: Agent optimization with Opik
 Home-page: https://github.com/comet-ml/opik
 Author: Comet ML
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "opik_optimizer"
-version = "2.1.1"
+version = "2.1.3"
 description = "Agent optimization with Opik"
 authors = [
     {name = "Comet ML", email = "support@comet.com"}
src/opik_optimizer/__init__.py
@@ -19,6 +19,7 @@ from .mipro_optimizer import MiproOptimizer
 from .hierarchical_reflective_optimizer import HierarchicalReflectiveOptimizer
 from .optimization_config.configs import TaskConfig
 from .optimization_result import OptimizationResult
+from .multi_metric_objective import MultiMetricObjective
 from .parameter_optimizer import (
     ParameterOptimizer,
     ParameterSearchSpace,
@@ -48,6 +49,7 @@ __all__ = [
     "setup_logging",
     "datasets",
     "TaskConfig",
+    "MultiMetricObjective",
     "ParameterSearchSpace",
     "ParameterSpec",
     "ParameterType",
src/opik_optimizer/base_optimizer.py
@@ -280,6 +280,7 @@ class BaseOptimizer(ABC):
         agent_config["project_name"] = getattr(prompt, "project_name", None)
         agent_config["model"] = getattr(prompt, "model", None) or self.model
         agent_config["tools"] = self._serialize_tools(prompt)
+        agent_config["optimizer"] = self.__class__.__name__
         return self._drop_none(agent_config)

     def get_optimizer_metadata(self) -> dict[str, Any]:
@@ -341,6 +342,7 @@ class BaseOptimizer(ABC):
             "metric": getattr(metric, "__name__", str(metric)),
             "dataset": getattr(dataset, "name", None),
             "dataset_id": dataset_id,
+            "optimizer": self.__class__.__name__,
             "optimizer_metadata": self._build_optimizer_metadata(),
             "tool_signatures": self._summarize_tool_signatures(prompt),
             "configuration": {
src/opik_optimizer/gepa_optimizer/gepa_optimizer.py
@@ -253,7 +253,7 @@ class GepaOptimizer(BaseOptimizer):
             opt_id = None

         gepa_reporting.display_header(
-            algorithm="GEPA",
+            algorithm=self.__class__.__name__,
             optimization_id=opt_id,
             dataset_id=getattr(dataset, "id", None),
             verbose=self.verbose,
@@ -264,7 +264,7 @@ class GepaOptimizer(BaseOptimizer):
         _display_config(
             messages=prompt.get_messages(),
             optimizer_config={
-                "optimizer": "GEPA",
+                "optimizer": self.__class__.__name__,
                 "model": self.model,
                 "reflection_model": self.reflection_model,
                 "max_metric_calls": max_metric_calls,
@@ -422,7 +422,7 @@ class GepaOptimizer(BaseOptimizer):
                 "system_prompt": candidate_prompt,
                 "gepa_score": val_scores[idx] if idx < len(val_scores) else None,
                 "opik_score": score,
-                "source": "GEPA",
+                "source": self.__class__.__name__,
             }
         )
         history.append(
src/opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py
@@ -1,4 +1,3 @@
-from opik.environment import get_tqdm_for_current_environment
 import os
 import logging

@@ -29,8 +28,6 @@ from .types import (
 )
 from .prompts import IMPROVE_PROMPT_TEMPLATE

-tqdm = get_tqdm_for_current_environment()
-
 # Using disk cache for LLM calls
 disk_cache_dir = os.path.expanduser("~/.litellm_cache")
 litellm.cache = Cache(type=LiteLLMCacheType.DISK, disk_cache_dir=disk_cache_dir)
@@ -61,10 +58,14 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         max_parallel_batches: Maximum number of batches to process concurrently during
             hierarchical root cause analysis (default: 5)
         batch_size: Number of test cases per batch for root cause analysis (default: 25)
+        max_iterations: Maximum number of optimization iterations (default: 5)
+        convergence_threshold: Stop if relative improvement is below this threshold (default: 0.01)
         **model_kwargs: Additional arguments passed to the LLM model
     """

     DEFAULT_ROUNDS = 10
+    DEFAULT_MAX_ITERATIONS = 5
+    DEFAULT_CONVERGENCE_THRESHOLD = 0.01  # Stop if improvement is less than 1%

     def __init__(
         self,
@@ -74,6 +75,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         seed: int = 42,
         max_parallel_batches: int = 5,
         batch_size: int = 25,
+        max_iterations: int = DEFAULT_MAX_ITERATIONS,
+        convergence_threshold: float = DEFAULT_CONVERGENCE_THRESHOLD,
         **model_kwargs: Any,
     ):
         super().__init__(
@@ -83,6 +86,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         self.num_threads = num_threads
         self.max_parallel_batches = max_parallel_batches
         self.batch_size = batch_size
+        self.max_iterations = max_iterations
+        self.convergence_threshold = convergence_threshold

         # Initialize hierarchical analyzer
         self._hierarchical_analyzer = HierarchicalRootCauseAnalyzer(
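Taken together, the docstring, class-constant, and __init__ changes let the optimizer be configured for multiple analyze-and-improve rounds at construction time. A minimal sketch; only the keyword arguments visible in these hunks are confirmed, and the reasoning_model argument and its value are assumptions:

from opik_optimizer import HierarchicalReflectiveOptimizer

optimizer = HierarchicalReflectiveOptimizer(
    reasoning_model="openai/gpt-4o",  # assumed keyword; not shown in these hunks
    seed=42,
    max_parallel_batches=5,
    batch_size=25,
    max_iterations=5,            # new in 2.1.3: cap on analyze/improve iterations
    convergence_threshold=0.01,  # new in 2.1.3: stop once an iteration gains < 1%
)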
@@ -247,6 +252,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             "reasoning_model": self.reasoning_model,
             "num_threads": self.num_threads,
             "max_parallel_batches": self.max_parallel_batches,
+            "max_iterations": self.max_iterations,
+            "convergence_threshold": self.convergence_threshold,
             "seed": self.seed,
             "verbose": self.verbose,
         }
@@ -417,7 +424,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         n_samples: int | None,
         attempt: int,
         max_attempts: int,
-    ) -> tuple[chat_prompt.ChatPrompt, float]:
+    ) -> tuple[chat_prompt.ChatPrompt, float, EvaluationResult]:
         """
         Generate and evaluate a single improvement attempt for a failure mode.

@@ -434,7 +441,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             max_attempts: Total number of attempts

         Returns:
-            Tuple of (improved_prompt, improved_score)
+            Tuple of (improved_prompt, improved_score, improved_experiment_result)
         """
         # Generate improvement with progress indication
         with reporting.display_prompt_improvement(
@@ -485,7 +492,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             ) / len(improved_experiment_result.test_results)
             improved_reporter.set_score(improved_score)

-        return improved_chat_prompt, improved_score
+        return improved_chat_prompt, improved_score, improved_experiment_result

     def optimize_prompt(
         self,
@@ -528,6 +535,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                 "n_samples": n_samples,
                 "auto_continue": auto_continue,
                 "max_retries": max_retries,
+                "max_iterations": self.max_iterations,
+                "convergence_threshold": self.convergence_threshold,
             },
             verbose=self.verbose,
             tools=getattr(prompt, "tools", None),
@@ -557,53 +566,62 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             prompt.get_messages()
         )  # Store copy of initial messages for diff

-        # Iteration 1: Analyze and improve (structure ready for future multi-iteration support)
-        with reporting.display_optimization_iteration(
-            iteration=1, verbose=self.verbose
-        ) as iteration_reporter:
-            # Perform hierarchical root cause analysis
-            with reporting.display_root_cause_analysis(
-                verbose=self.verbose
-            ) as analysis_reporter:
-                hierarchical_analysis = self._hierarchical_root_cause_analysis(
-                    experiment_result
-                )
-                analysis_reporter.set_completed(
-                    total_test_cases=hierarchical_analysis.total_test_cases,
-                    num_batches=hierarchical_analysis.num_batches,
-                )
-
-            # Display hierarchical synthesis and failure modes
-            if self.verbose:
-                reporting.display_hierarchical_synthesis(
-                    total_test_cases=hierarchical_analysis.total_test_cases,
-                    num_batches=hierarchical_analysis.num_batches,
-                    synthesis_notes=hierarchical_analysis.synthesis_notes,
-                    verbose=self.verbose,
-                )
+        # Multi-iteration optimization loop
+        iteration = 0
+        previous_iteration_score = initial_score
+
+        for iteration in range(1, self.max_iterations + 1):
+            logger.info(f"Starting iteration {iteration}/{self.max_iterations}")
+
+            with reporting.display_optimization_iteration(
+                iteration=iteration, verbose=self.verbose
+            ) as iteration_reporter:
+                # Perform hierarchical root cause analysis
+                with reporting.display_root_cause_analysis(
+                    verbose=self.verbose
+                ) as analysis_reporter:
+                    hierarchical_analysis = self._hierarchical_root_cause_analysis(
+                        experiment_result
+                    )
+                    analysis_reporter.set_completed(
+                        total_test_cases=hierarchical_analysis.total_test_cases,
+                        num_batches=hierarchical_analysis.num_batches,
+                    )

-            reporting.display_failure_modes(
-                failure_modes=hierarchical_analysis.unified_failure_modes,
-                verbose=self.verbose,
-            )
+                # Display hierarchical synthesis and failure modes
+                if self.verbose:
+                    reporting.display_hierarchical_synthesis(
+                        total_test_cases=hierarchical_analysis.total_test_cases,
+                        num_batches=hierarchical_analysis.num_batches,
+                        synthesis_notes=hierarchical_analysis.synthesis_notes,
+                        verbose=self.verbose,
+                    )

-            # Generate improved prompt for each failure mode
-            for idx, root_cause in enumerate(
-                hierarchical_analysis.unified_failure_modes, 1
-            ):
-                logger.debug(
-                    f"Addressing failure mode {idx}/{len(hierarchical_analysis.unified_failure_modes)}: {root_cause.name}"
+                reporting.display_failure_modes(
+                    failure_modes=hierarchical_analysis.unified_failure_modes,
+                    verbose=self.verbose,
                 )

-                # Try multiple attempts if needed
-                max_attempts = max_retries + 1
-                improved_chat_prompt = None
-                improved_score = None
+                # Generate improved prompt for each failure mode
+                for idx, root_cause in enumerate(
+                    hierarchical_analysis.unified_failure_modes, 1
+                ):
+                    logger.debug(
+                        f"Addressing failure mode {idx}/{len(hierarchical_analysis.unified_failure_modes)}: {root_cause.name}"
+                    )

-                for attempt in range(1, max_attempts + 1):
-                    # Generate and evaluate improvement
-                    improved_chat_prompt, improved_score = (
-                        self._generate_and_evaluate_improvement(
+                    # Try multiple attempts if needed
+                    max_attempts = max_retries + 1
+                    improved_chat_prompt = None
+                    improved_score = None
+
+                    for attempt in range(1, max_attempts + 1):
+                        # Generate and evaluate improvement
+                        (
+                            improved_chat_prompt,
+                            improved_score,
+                            improved_experiment_result,
+                        ) = self._generate_and_evaluate_improvement(
                             root_cause=root_cause,
                             best_prompt=best_prompt,
                             best_score=best_score,
@@ -615,64 +633,86 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                             attempt=attempt,
                             max_attempts=max_attempts,
                         )
-                    )

-                    # Check if we got improvement
-                    if improved_score > best_score:
-                        logger.info(
-                            f"Improvement found for '{root_cause.name}' on attempt {attempt}"
+                        # Check if we got improvement
+                        if improved_score > best_score:
+                            logger.info(
+                                f"Improvement found for '{root_cause.name}' on attempt {attempt}"
+                            )
+                            break
+
+                        # No improvement - should we retry?
+                        if attempt < max_attempts:
+                            reporting.display_retry_attempt(
+                                attempt=attempt,
+                                max_attempts=max_attempts,
+                                failure_mode_name=root_cause.name,
+                                verbose=self.verbose,
+                            )
+                        else:
+                            logger.debug(
+                                f"No improvement after {attempt} attempts for '{root_cause.name}'"
+                            )
+
+                    # Check if final result is an improvement
+                    if (
+                        improved_score is not None
+                        and improved_chat_prompt is not None
+                        and improved_score > best_score
+                    ):
+                        improvement = self._calculate_improvement(
+                            improved_score, best_score
                         )
-                        break

-                    # No improvement - should we retry?
-                    if attempt < max_attempts:
-                        reporting.display_retry_attempt(
-                            attempt=attempt,
-                            max_attempts=max_attempts,
-                            failure_mode_name=root_cause.name,
+                        # Display improvement for this iteration
+                        reporting.display_iteration_improvement(
+                            improvement=improvement,
+                            current_score=improved_score,
+                            best_score=best_score,
                             verbose=self.verbose,
                         )
+
+                        # Update best
+                        best_score = improved_score
+                        best_prompt = improved_chat_prompt
+                        best_messages = improved_chat_prompt.get_messages()
+                        experiment_result = improved_experiment_result
+                        logger.info(
+                            f"Updated best prompt after addressing '{root_cause.name}'"
+                        )
                     else:
                         logger.debug(
-                            f"No improvement after {attempt} attempts for '{root_cause.name}'"
+                            f"Keeping previous best prompt, no improvement from '{root_cause.name}'"
                         )

-                # Check if final result is an improvement
-                if (
-                    improved_score is not None
-                    and improved_chat_prompt is not None
-                    and improved_score > best_score
-                ):
-                    improvement = self._calculate_improvement(
-                        improved_score, best_score
-                    )
-
-                    # Display improvement for this iteration
-                    reporting.display_iteration_improvement(
-                        improvement=improvement,
-                        current_score=improved_score,
-                        best_score=best_score,
-                        verbose=self.verbose,
-                    )
+                # Mark iteration complete
+                improved_since_start = best_score > initial_score
+                iteration_reporter.iteration_complete(
+                    best_score=best_score, improved=improved_since_start
+                )

-                    # Update best
-                    best_score = improved_score
-                    best_prompt = improved_chat_prompt
-                    best_messages = improved_chat_prompt.get_messages()
-                    logger.info(
-                        f"Updated best prompt after addressing '{root_cause.name}'"
-                    )
-                else:
-                    logger.debug(
-                        f"Keeping previous best prompt, no improvement from '{root_cause.name}'"
-                    )
+            # Check for convergence after iteration
+            iteration_improvement = self._calculate_improvement(
+                best_score, previous_iteration_score
+            )

-                # Mark iteration complete
-                improved_since_start = best_score > initial_score
-                iteration_reporter.iteration_complete(
-                    best_score=best_score, improved=improved_since_start
+            logger.info(
+                f"Iteration {iteration} complete. Score: {best_score:.4f}, "
+                f"Improvement: {iteration_improvement:.2%}"
             )

+            # Stop if improvement is below convergence threshold
+            if abs(iteration_improvement) < self.convergence_threshold:
+                logger.info(
+                    f"Convergence achieved: improvement ({iteration_improvement:.2%}) "
+                    f"below threshold ({self.convergence_threshold:.2%}). "
+                    f"Stopping after {iteration} iterations."
+                )
+                break
+
+            # Update previous score for next iteration
+            previous_iteration_score = best_score
+
         # Display final optimization result with diff
         reporting.display_optimized_prompt_diff(
             initial_messages=initial_messages,
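The loop stops early once the relative gain of a full iteration falls below convergence_threshold. The body of _calculate_improvement is not part of this diff; given that its result is rendered with :.2% and compared against a 0.01 default, the sketch below assumes it returns the relative change between scores:

# Assumed behaviour of _calculate_improvement (its implementation is not shown in this diff).
def calculate_improvement(current_score: float, previous_score: float) -> float:
    if previous_score == 0:
        return 0.0
    return (current_score - previous_score) / previous_score

previous_iteration_score = 0.62   # score after the previous iteration
best_score = 0.625                # score after the current iteration
iteration_improvement = calculate_improvement(best_score, previous_iteration_score)

# 0.005 / 0.62 is roughly 0.81%, below the 1% default threshold, so
# abs(iteration_improvement) < convergence_threshold and the loop would break here.
print(f"{iteration_improvement:.2%}")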
@@ -682,14 +722,27 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             verbose=self.verbose,
         )

+        # Update optimization status to completed
+        try:
+            optimization.update(status="completed")
+            logger.info(f"Optimization {optimization.id} status updated to completed.")
+        except Exception as e:
+            logger.warning(f"Failed to update optimization status: {e}")
+
         # Prepare details for the result
         details = {
+            "model": best_prompt.model or self.model,
+            "temperature": (best_prompt.model_kwargs or {}).get("temperature")
+            or self.model_kwargs.get("temperature"),
             "reasoning_model": self.reasoning_model,
             "num_threads": self.num_threads,
             "max_parallel_batches": self.max_parallel_batches,
             "max_retries": max_retries,
             "n_samples": n_samples,
             "auto_continue": auto_continue,
+            "max_iterations": self.max_iterations,
+            "convergence_threshold": self.convergence_threshold,
+            "iterations_completed": iteration,
         }

         # Extract tool prompts if tools exist
src/opik_optimizer/hierarchical_reflective_optimizer/hierarchical_root_cause_analyzer.py
@@ -1,8 +1,8 @@
 import logging
 import asyncio
 from typing import Any
-from tqdm import tqdm

+from rich.progress import Progress, TextColumn, BarColumn, TaskProgressColumn
 from opik.evaluation.evaluation_result import EvaluationResult
 from .types import (
     RootCauseAnalysis,
@@ -11,6 +11,7 @@ from .types import (
 )
 from . import reporting
 from .prompts import BATCH_ANALYSIS_PROMPT, SYNTHESIS_PROMPT
+from ..reporting_utils import get_console

 logger = logging.getLogger(__name__)

@@ -285,13 +286,11 @@ Scores:

         semaphore = asyncio.Semaphore(self.max_parallel_batches)

-        # Create progress bar for batch processing
-        pbar = tqdm(
-            total=len(batch_tasks), desc="Processing batches", unit="batch", leave=False
-        )
+        # Create progress bar for batch processing using Rich
+        console = get_console()

         async def run_with_semaphore(
-            batch_num: int, task: Any
+            batch_num: int, task: Any, progress: Progress | None, task_id: Any | None
         ) -> tuple[int, BatchAnalysis]:
             async with semaphore:
                 try:
@@ -300,19 +299,44 @@ Scores:
                         f"Completed batch {batch_num}: "
                         f"identified {len(result.failure_modes)} failure modes"
                     )
-                    pbar.update(1)  # Update progress bar
+                    if progress and task_id is not None:
+                        progress.update(task_id, advance=1)  # Update progress bar
                     return batch_num, result
                 except Exception as exc:
                     logger.error(f"Batch {batch_num} failed: {exc}")
-                    pbar.update(1)  # Update progress bar even on error
+                    if progress and task_id is not None:
+                        progress.update(
+                            task_id, advance=1
+                        )  # Update progress bar even on error
                     raise

-        # Run all tasks with semaphore control
-        results = await asyncio.gather(
-            *[run_with_semaphore(num, task) for num, task in batch_tasks]
-        )
-
-        pbar.close()  # Close progress bar
+        # Run all tasks with semaphore control and rich progress bar
+        if self.verbose >= 1:
+            with Progress(
+                TextColumn("│ "),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                console=console,
+                transient=True,
+            ) as progress:
+                task_id = progress.add_task(
+                    "Processing batches", total=len(batch_tasks)
+                )
+                results = await asyncio.gather(
+                    *[
+                        run_with_semaphore(num, task, progress, task_id)
+                        for num, task in batch_tasks
+                    ]
+                )
+        else:
+            # No progress bar in non-verbose mode
+            results = await asyncio.gather(
+                *[
+                    run_with_semaphore(num, task, None, None)
+                    for num, task in batch_tasks
+                ]
+            )

         # Sort by batch number to maintain order
         batch_analyses = [result for _, result in sorted(results)]
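The tqdm bar is replaced by a rich.progress.Progress bar that is only rendered when verbose >= 1 and is shared by all semaphore-limited batch workers. The following self-contained sketch reproduces the same pattern, with dummy batch coroutines standing in for the analyzer's LLM calls:

import asyncio
from rich.progress import Progress, TextColumn, BarColumn, TaskProgressColumn

async def process_batches(batch_tasks, max_parallel_batches=5):
    # Limit concurrency the same way the analyzer does.
    semaphore = asyncio.Semaphore(max_parallel_batches)

    async def run_with_semaphore(batch_num, task, progress, task_id):
        async with semaphore:
            result = await task
            if progress and task_id is not None:
                progress.update(task_id, advance=1)
            return batch_num, result

    # One shared progress bar for all workers; transient=True clears it when done.
    with Progress(
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        transient=True,
    ) as progress:
        task_id = progress.add_task("Processing batches", total=len(batch_tasks))
        results = await asyncio.gather(
            *[run_with_semaphore(num, task, progress, task_id) for num, task in batch_tasks]
        )
    # Restore submission order, mirroring the sorted(results) step above.
    return [result for _, result in sorted(results)]

async def fake_batch_analysis(batch_num):
    await asyncio.sleep(0.1)  # stand-in for an LLM call analysing one batch
    return f"analysis-{batch_num}"

if __name__ == "__main__":
    tasks = [(num, fake_batch_analysis(num)) for num in range(1, 6)]
    print(asyncio.run(process_batches(tasks)))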