opik-optimizer 2.1.3__py3-none-any.whl → 2.2.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.
Files changed (38)
  1. opik_optimizer/__init__.py +0 -2
  2. opik_optimizer/base_optimizer.py +313 -144
  3. opik_optimizer/evolutionary_optimizer/crossover_ops.py +31 -4
  4. opik_optimizer/evolutionary_optimizer/evaluation_ops.py +23 -3
  5. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +122 -95
  6. opik_optimizer/evolutionary_optimizer/mcp.py +11 -6
  7. opik_optimizer/evolutionary_optimizer/mutation_ops.py +25 -5
  8. opik_optimizer/evolutionary_optimizer/population_ops.py +26 -10
  9. opik_optimizer/evolutionary_optimizer/reporting.py +5 -5
  10. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +53 -99
  11. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +4 -4
  12. opik_optimizer/gepa_optimizer/gepa_optimizer.py +345 -201
  13. opik_optimizer/gepa_optimizer/reporting.py +291 -22
  14. opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py +90 -167
  15. opik_optimizer/hierarchical_reflective_optimizer/prompts.py +7 -1
  16. opik_optimizer/hierarchical_reflective_optimizer/reporting.py +168 -75
  17. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +185 -205
  18. opik_optimizer/meta_prompt_optimizer/reporting.py +4 -4
  19. opik_optimizer/mipro_optimizer/__init__.py +2 -2
  20. opik_optimizer/mipro_optimizer/_lm.py +4 -4
  21. opik_optimizer/mipro_optimizer/{_mipro_optimizer_v2.py → mipro_optimizer_v2.py} +1 -7
  22. opik_optimizer/mipro_optimizer/utils.py +1 -0
  23. opik_optimizer/optimizable_agent.py +7 -4
  24. opik_optimizer/optimization_config/chat_prompt.py +7 -10
  25. opik_optimizer/parameter_optimizer/parameter_optimizer.py +188 -40
  26. opik_optimizer/parameter_optimizer/reporting.py +148 -0
  27. opik_optimizer/reporting_utils.py +60 -15
  28. opik_optimizer/utils/__init__.py +3 -0
  29. opik_optimizer/utils/candidate_utils.py +52 -0
  30. opik_optimizer/utils/core.py +35 -2
  31. opik_optimizer/utils/prompt_segments.py +1 -2
  32. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/METADATA +2 -3
  33. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/RECORD +36 -36
  34. opik_optimizer/evolutionary_optimizer/llm_support.py +0 -136
  35. opik_optimizer/mipro_optimizer/mipro_optimizer.py +0 -680
  36. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/WHEEL +0 -0
  37. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/licenses/LICENSE +0 -0
  38. {opik_optimizer-2.1.3.dist-info → opik_optimizer-2.2.1.dist-info}/top_level.txt +0 -0
opik_optimizer/hierarchical_reflective_optimizer/hierarchical_reflective_optimizer.py (+90 -167)

@@ -5,8 +5,8 @@ import opik
 import litellm
 from litellm.caching import Cache
 from litellm.types.caching import LiteLLMCacheType
+from opik import opik_context
 from opik.evaluation.evaluation_result import EvaluationResult
-from opik.evaluation.models.litellm import opik_monitor as opik_litellm_monitor
 from opik.evaluation import evaluator as opik_evaluator
 
 from typing import Any, TypeVar
@@ -51,16 +51,17 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
     complex prompt that you want to systematically refine based on understanding why it fails.
 
     Args:
-        reasoning_model: LiteLLM model name for reasoning and analysis (default: "openai/gpt-4.1")
-        num_threads: Number of parallel threads for evaluation (default: 12)
-        verbose: Controls internal logging/progress bars (0=off, 1=on) (default: 1)
-        seed: Random seed for reproducibility (default: 42)
+        model: LiteLLM model name for the optimization algorithm (reasoning and analysis)
+        model_parameters: Optional dict of LiteLLM parameters for optimizer's internal LLM calls.
+            Common params: temperature, max_tokens, max_completion_tokens, top_p.
+            See: https://docs.litellm.ai/docs/completion/input
         max_parallel_batches: Maximum number of batches to process concurrently during
-            hierarchical root cause analysis (default: 5)
-        batch_size: Number of test cases per batch for root cause analysis (default: 25)
-        max_iterations: Maximum number of optimization iterations (default: 5)
-        convergence_threshold: Stop if relative improvement is below this threshold (default: 0.01)
-        **model_kwargs: Additional arguments passed to the LLM model
+            hierarchical root cause analysis
+        batch_size: Number of test cases per batch for root cause analysis
+        convergence_threshold: Stop if relative improvement is below this threshold
+        n_threads: Number of parallel threads for evaluation
+        verbose: Controls internal logging/progress bars (0=off, 1=on)
+        seed: Random seed for reproducibility
     """
 
     DEFAULT_ROUNDS = 10
@@ -69,139 +70,34 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
 
     def __init__(
         self,
-        reasoning_model: str = "openai/gpt-4.1",
-        num_threads: int = 12,
-        verbose: int = 1,
-        seed: int = 42,
+        model: str = "gpt-4o",
+        model_parameters: dict[str, Any] | None = None,
         max_parallel_batches: int = 5,
         batch_size: int = 25,
-        max_iterations: int = DEFAULT_MAX_ITERATIONS,
         convergence_threshold: float = DEFAULT_CONVERGENCE_THRESHOLD,
-        **model_kwargs: Any,
+        n_threads: int = 12,
+        verbose: int = 1,
+        seed: int = 42,
     ):
         super().__init__(
-            model=reasoning_model, verbose=verbose, seed=seed, **model_kwargs
+            model=model, verbose=verbose, seed=seed, model_parameters=model_parameters
         )
-        self.reasoning_model = reasoning_model
-        self.num_threads = num_threads
+        self.n_threads = n_threads
         self.max_parallel_batches = max_parallel_batches
         self.batch_size = batch_size
-        self.max_iterations = max_iterations
        self.convergence_threshold = convergence_threshold
+        self._should_stop_optimization = False  # Flag to exit all loops
 
         # Initialize hierarchical analyzer
         self._hierarchical_analyzer = HierarchicalRootCauseAnalyzer(
             call_model_fn=self._call_model_async,
-            reasoning_model=self.reasoning_model,
+            reasoning_model=self.model,
             seed=self.seed,
             max_parallel_batches=self.max_parallel_batches,
             batch_size=self.batch_size,
             verbose=self.verbose,
         )
 
-    def _prepare_model_params(
-        self,
-        model_kwargs: dict[str, Any],
-        response_model: type[T] | None = None,
-    ) -> dict[str, Any]:
-        """
-        Prepare parameters for LiteLLM call by filtering and adding monitoring.
-
-        Args:
-            model_kwargs: Additional model parameters
-            response_model: Optional Pydantic model for structured output
-
-        Returns:
-            Dictionary of parameters ready for litellm.completion/acompletion
-        """
-        current_model_kwargs = self.model_kwargs.copy()
-        current_model_kwargs.update(model_kwargs)
-
-        # Filter out optimizer-specific kwargs that shouldn't be passed to LiteLLM
-        filtered_call_kwargs = current_model_kwargs.copy()
-        filtered_call_kwargs.pop("n_trials", None)
-        filtered_call_kwargs.pop("n_samples", None)
-        filtered_call_kwargs.pop("n_iterations", None)
-        filtered_call_kwargs.pop("min_examples", None)
-        filtered_call_kwargs.pop("max_examples", None)
-        filtered_call_kwargs.pop("project_name", None)
-
-        final_params_for_litellm = (
-            opik_litellm_monitor.try_add_opik_monitoring_to_params(filtered_call_kwargs)
-        )
-
-        # Add structured output support if response_model is provided
-        # According to LiteLLM docs: https://docs.litellm.ai/docs/completion/json_mode
-        # Pass the Pydantic model directly to response_format
-        if response_model is not None:
-            final_params_for_litellm["response_format"] = response_model
-
-        return final_params_for_litellm
-
-    def _parse_response(
-        self,
-        response: Any,
-        response_model: type[T] | None = None,
-    ) -> T | str:
-        """
-        Parse LiteLLM response, with optional structured output parsing.
-
-        Args:
-            response: The response from litellm.completion/acompletion
-            response_model: Optional Pydantic model for structured output
-
-        Returns:
-            If response_model is provided, returns an instance of that model.
-            Otherwise, returns the raw string response.
-        """
-        content = response.choices[0].message.content
-
-        # When using structured outputs with Pydantic models, LiteLLM automatically
-        # parses the response. Parse the JSON string into the Pydantic model
-        if response_model is not None:
-            return response_model.model_validate_json(content)
-
-        return content
-
-    @_throttle.rate_limited(_rate_limiter)
-    def _call_model(
-        self,
-        model: str,
-        messages: list[dict[str, str]],
-        seed: int,
-        model_kwargs: dict[str, Any],
-        response_model: type[T] | None = None,
-    ) -> T | str:
-        """
-        Call the LLM model with optional structured output.
-
-        Args:
-            model: The model to use for the call
-            messages: List of message dictionaries with 'role' and 'content' keys
-            seed: Random seed for reproducibility
-            model_kwargs: Additional model parameters
-            response_model: Optional Pydantic model for structured output
-
-        Returns:
-            If response_model is provided, returns an instance of that model.
-            Otherwise, returns the raw string response.
-        """
-        self.increment_llm_counter()
-
-        final_params_for_litellm = self._prepare_model_params(
-            model_kwargs, response_model
-        )
-
-        response = litellm.completion(
-            model=model,
-            messages=messages,
-            seed=seed,
-            num_retries=6,
-            **final_params_for_litellm,
-        )
-
-        return self._parse_response(response, response_model)
-
     @_throttle.rate_limited(_rate_limiter)
     async def _call_model_async(
         self,
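Note: the hunks above rename the constructor parameters (reasoning_model → model, num_threads → n_threads) and replace **model_kwargs with an explicit model_parameters dict. A minimal usage sketch under the new 2.2.1 signature; the top-level import path and the temperature value are assumptions, while the keyword names and defaults come from the diff:

    # Hypothetical example of constructing the optimizer with the 2.2.1 signature.
    # Import path is assumed; keyword names are taken from the __init__ shown above.
    from opik_optimizer import HierarchicalReflectiveOptimizer

    optimizer = HierarchicalReflectiveOptimizer(
        model="gpt-4o",                         # was: reasoning_model
        model_parameters={"temperature": 0.0},  # was: **model_kwargs (value illustrative)
        max_parallel_batches=5,
        batch_size=25,
        n_threads=12,                           # was: num_threads
        verbose=1,
        seed=42,
    )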
@@ -212,7 +108,10 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         response_model: type[T] | None = None,
     ) -> T | str:
         """
-        Async version of _call_model using litellm.acompletion.
+        Adapter for async LLM calls with HierarchicalRootCauseAnalyzer signature.
+
+        This adapter translates the analyzer's expected signature to the base class
+        _call_model_async signature, ensuring project_name and tags are properly set.
 
         Args:
             model: The model to use for the call
@@ -225,22 +124,16 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             If response_model is provided, returns an instance of that model.
             Otherwise, returns the raw string response.
         """
-        self.increment_llm_counter()
-
-        final_params_for_litellm = self._prepare_model_params(
-            model_kwargs, response_model
-        )
-
-        response = await litellm.acompletion(
-            model=model,
+        # Call the base class async method which properly handles project_name and tags
+        return await super()._call_model_async(
             messages=messages,
+            model=model,
             seed=seed,
-            num_retries=6,
-            **final_params_for_litellm,
+            response_model=response_model,
+            is_reasoning=True,
+            **model_kwargs,
         )
 
-        return self._parse_response(response, response_model)
-
     def get_optimizer_metadata(self) -> dict[str, Any]:
         """
         Get metadata about the optimizer configuration.
@@ -249,10 +142,9 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             Dictionary containing optimizer-specific configuration
         """
         return {
-            "reasoning_model": self.reasoning_model,
-            "num_threads": self.num_threads,
+            "model": self.model,
+            "n_threads": self.n_threads,
             "max_parallel_batches": self.max_parallel_batches,
-            "max_iterations": self.max_iterations,
             "convergence_threshold": self.convergence_threshold,
             "seed": self.seed,
             "verbose": self.verbose,
@@ -330,6 +222,12 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
 
             cleaned_model_output = raw_model_output.strip()
 
+            # Add tags to trace for optimization tracking
+            if self.current_optimization_id:
+                opik_context.update_current_trace(
+                    tags=[self.current_optimization_id, "Evaluation"]
+                )
+
             result = {
                 mappers.EVALUATED_LLM_TASK_OUTPUT: cleaned_model_output,
             }
@@ -344,10 +242,11 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
             dataset=dataset,
             task=llm_task,
             scoring_metrics=[_create_metric_class(metric)],
-            task_threads=self.num_threads,
+            task_threads=self.n_threads,
             nb_samples=n_samples,
             experiment_config=experiment_config,
             verbose=self.verbose,
+            project_name=self.project_name,
         )
 
         return result
@@ -403,10 +302,9 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         )
 
         improve_prompt_response = self._call_model(
-            model=self.reasoning_model,
             messages=[{"role": "user", "content": improve_prompt_prompt}],
+            model=self.model,
             seed=attempt_seed,
-            model_kwargs={},
             response_model=ImprovedPrompt,
         )
 
@@ -461,7 +359,8 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         improved_chat_prompt = chat_prompt.ChatPrompt(
             name=prompt.name,
             messages=messages_as_dicts,
-            tools=prompt.tools,
+            tools=best_prompt.tools,
+            function_map=best_prompt.function_map,
         )
 
         # Evaluate improved prompt
@@ -503,23 +402,28 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         n_samples: int | None = None,
         auto_continue: bool = False,
         agent_class: type[OptimizableAgent] | None = None,
+        project_name: str = "Optimization",
+        max_trials: int = DEFAULT_MAX_ITERATIONS,
         max_retries: int = 2,
+        *args: Any,
         **kwargs: Any,
     ) -> OptimizationResult:
         # Reset counters at the start of optimization
-        self.reset_counters()
-
-        # Configure prompt model if not set
-        self.configure_prompt_model(prompt)
+        self._reset_counters()
+        self._should_stop_optimization = False  # Reset stop flag
 
         # Setup agent class
-        self.agent_class = self.setup_agent_class(prompt, agent_class)
+        self.agent_class = self._setup_agent_class(prompt, agent_class)
+
+        # Set project name from parameter
+        self.project_name = project_name
 
         optimization = self.opik_client.create_optimization(
             dataset_name=dataset.name,
             objective_name=getattr(metric, "__name__", str(metric)),
             metadata={"optimizer": self.__class__.__name__},
         )
+        self.current_optimization_id = optimization.id
         logger.debug(f"Created optimization with ID: {optimization.id}")
 
         reporting.display_header(
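Note: with the signature change above, optimize_prompt now accepts project_name and a max_trials budget (replacing the constructor-level max_iterations) and records the optimization id for trace tagging. A hedged call sketch; prompt, dataset, and metric are assumed to be prepared elsewhere, and only the keyword names and defaults are taken from the diff:

    # Hypothetical call using the new keywords from the signature shown above.
    # `prompt`, `dataset`, and `metric` are assumed to exist; values are illustrative.
    result = optimizer.optimize_prompt(
        prompt=prompt,
        dataset=dataset,
        metric=metric,
        n_samples=50,                  # illustrative
        project_name="Optimization",   # new in 2.2.1 (default shown above)
        max_trials=5,                  # new in 2.2.1; replaces max_iterations
        max_retries=2,
    )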
@@ -535,7 +439,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                 "n_samples": n_samples,
                 "auto_continue": auto_continue,
                 "max_retries": max_retries,
-                "max_iterations": self.max_iterations,
+                "max_trials": max_trials,
                 "convergence_threshold": self.convergence_threshold,
             },
             verbose=self.verbose,
@@ -569,9 +473,20 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
         # Multi-iteration optimization loop
         iteration = 0
         previous_iteration_score = initial_score
+        trials_used = 0
 
-        for iteration in range(1, self.max_iterations + 1):
-            logger.info(f"Starting iteration {iteration}/{self.max_iterations}")
+        while trials_used < max_trials:
+            iteration += 1
+            logger.info(
+                f"Starting iteration {iteration} (trials: {trials_used}/{max_trials})"
+            )
+
+            # Check if we should stop (flag set by inner loops)
+            if self._should_stop_optimization:
+                logger.info(
+                    f"Stopping optimization: reached max_trials limit ({max_trials})."
+                )
+                break
 
             with reporting.display_optimization_iteration(
                 iteration=iteration, verbose=self.verbose
@@ -616,7 +531,16 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                 improved_score = None
 
                 for attempt in range(1, max_attempts + 1):
-                    # Generate and evaluate improvement
+                    # Check if we've reached the trial limit before starting a new trial
+                    if trials_used >= max_trials:
+                        logger.info(
+                            f"Reached max_trials limit ({max_trials}) during failure mode '{root_cause.name}'. "
+                            f"Stopping optimization."
+                        )
+                        self._should_stop_optimization = True
+                        break
+
+                    # Generate and evaluate improvement (this is 1 trial)
                     (
                         improved_chat_prompt,
                         improved_score,
@@ -633,6 +557,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                         attempt=attempt,
                         max_attempts=max_attempts,
                     )
+                    trials_used += 1
 
                     # Check if we got improvement
                     if improved_score > best_score:
@@ -642,7 +567,7 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                         break
 
                     # No improvement - should we retry?
-                    if attempt < max_attempts:
+                    if attempt < max_attempts and trials_used < max_trials:
                         reporting.display_retry_attempt(
                             attempt=attempt,
                             max_attempts=max_attempts,
@@ -654,6 +579,10 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
                             f"No improvement after {attempt} attempts for '{root_cause.name}'"
                         )
 
+                # Break out of failure mode loop if flag is set
+                if self._should_stop_optimization:
+                    break
+
                 # Check if final result is an improvement
                 if (
                     improved_score is not None
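Note: taken together, the loop hunks above replace the fixed `for iteration in range(1, self.max_iterations + 1)` loop with a trial budget: each generate-and-evaluate attempt counts as one trial, and an instance-level stop flag lets the innermost loop unwind all enclosing loops once the budget is spent. A simplified, self-contained sketch of that control flow (illustrative only, not the library code; names and values are invented):

    # Illustrative stand-in for the while/flag structure introduced in this diff.
    def run_with_trial_budget(max_trials: int, failure_modes: list[str], max_attempts: int = 2) -> int:
        trials_used = 0
        should_stop = False              # mirrors self._should_stop_optimization
        iteration = 0
        while trials_used < max_trials and not should_stop:
            iteration += 1
            if not failure_modes:        # nothing to work on; avoid looping forever
                break
            for mode in failure_modes:   # analogous to iterating root causes
                for attempt in range(1, max_attempts + 1):
                    if trials_used >= max_trials:
                        should_stop = True   # budget exhausted: signal outer loops
                        break
                    trials_used += 1         # one generate-and-evaluate cycle = one trial
                if should_stop:
                    break
        return trials_used

    # e.g. run_with_trial_budget(5, ["formatting", "hallucination"]) uses exactly 5 trials.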
@@ -731,29 +660,23 @@ class HierarchicalReflectiveOptimizer(BaseOptimizer):
 
         # Prepare details for the result
         details = {
-            "model": best_prompt.model or self.model,
+            "model": self.model,
             "temperature": (best_prompt.model_kwargs or {}).get("temperature")
-            or self.model_kwargs.get("temperature"),
-            "reasoning_model": self.reasoning_model,
-            "num_threads": self.num_threads,
+            or self.model_parameters.get("temperature"),
+            "n_threads": self.n_threads,
             "max_parallel_batches": self.max_parallel_batches,
             "max_retries": max_retries,
             "n_samples": n_samples,
             "auto_continue": auto_continue,
-            "max_iterations": self.max_iterations,
+            "max_trials": max_trials,
             "convergence_threshold": self.convergence_threshold,
             "iterations_completed": iteration,
+            "trials_used": trials_used,
         }
 
         # Extract tool prompts if tools exist
-        tool_prompts = None
-        if final_tools := getattr(best_prompt, "tools", None):
-            tool_prompts = {
-                tool.get("function", {}).get("name", f"tool_{idx}"): tool.get(
-                    "function", {}
-                ).get("description", "")
-                for idx, tool in enumerate(final_tools)
-            }
+        final_tools = getattr(best_prompt, "tools", None)
+        tool_prompts = self._extract_tool_prompts(final_tools)
 
         return OptimizationResult(
             optimizer=self.__class__.__name__,
opik_optimizer/hierarchical_reflective_optimizer/prompts.py (+7 -1)

@@ -14,6 +14,8 @@ TEST RESULTS:
 {formatted_batch}
 ```
 
+Important constraint: Base your analysis exclusively on the TEST RESULTS shown above. Do not infer, speculate, or hypothesize failure modes that are not directly evidenced in the provided results.
+
 Think through the failures systematically:
 
 1. IDENTIFY: List all distinct types of failures you observe in the test results
@@ -86,6 +88,10 @@ INSTRUCTIONS FOR IMPROVING THE PROMPT:
 
 4. **Maintain Structure**: Keep the same message structure (role and content format). Only modify the content where necessary.
 
-5. **Be Specific**: Ensure your changes provide concrete, actionable guidance that directly addresses the identified failure mode.
+5. **Do NOT Add Messages**: Do not add new messages to the prompt. Only modify existing messages. The number of messages in the prompt must remain exactly the same.
+
+6. **Be Specific**: Ensure your changes provide concrete, actionable guidance that directly addresses the identified failure mode.
+
+Do not remove any variables or placeholders from any prompt message. You can reposition them within the same message content if needed but never remove them.
 
 Provide your reasoning for the changes you made, explaining WHY each change addresses the failure mode, and then provide the improved prompt."""