opik-optimizer 1.0.6__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (56)
  1. opik_optimizer/__init__.py +4 -0
  2. opik_optimizer/_throttle.py +2 -1
  3. opik_optimizer/base_optimizer.py +402 -28
  4. opik_optimizer/data/context7_eval.jsonl +3 -0
  5. opik_optimizer/datasets/context7_eval.py +90 -0
  6. opik_optimizer/datasets/tiny_test.py +33 -34
  7. opik_optimizer/datasets/truthful_qa.py +2 -2
  8. opik_optimizer/evolutionary_optimizer/crossover_ops.py +194 -0
  9. opik_optimizer/evolutionary_optimizer/evaluation_ops.py +136 -0
  10. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +289 -966
  11. opik_optimizer/evolutionary_optimizer/helpers.py +10 -0
  12. opik_optimizer/evolutionary_optimizer/llm_support.py +136 -0
  13. opik_optimizer/evolutionary_optimizer/mcp.py +249 -0
  14. opik_optimizer/evolutionary_optimizer/mutation_ops.py +306 -0
  15. opik_optimizer/evolutionary_optimizer/population_ops.py +228 -0
  16. opik_optimizer/evolutionary_optimizer/prompts.py +352 -0
  17. opik_optimizer/evolutionary_optimizer/reporting.py +28 -4
  18. opik_optimizer/evolutionary_optimizer/style_ops.py +86 -0
  19. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +90 -81
  20. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +12 -5
  21. opik_optimizer/gepa_optimizer/__init__.py +3 -0
  22. opik_optimizer/gepa_optimizer/adapter.py +154 -0
  23. opik_optimizer/gepa_optimizer/gepa_optimizer.py +653 -0
  24. opik_optimizer/gepa_optimizer/reporting.py +181 -0
  25. opik_optimizer/logging_config.py +42 -7
  26. opik_optimizer/mcp_utils/__init__.py +22 -0
  27. opik_optimizer/mcp_utils/mcp.py +541 -0
  28. opik_optimizer/mcp_utils/mcp_second_pass.py +152 -0
  29. opik_optimizer/mcp_utils/mcp_simulator.py +116 -0
  30. opik_optimizer/mcp_utils/mcp_workflow.py +547 -0
  31. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +470 -134
  32. opik_optimizer/meta_prompt_optimizer/reporting.py +16 -2
  33. opik_optimizer/mipro_optimizer/_lm.py +30 -23
  34. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +52 -51
  35. opik_optimizer/mipro_optimizer/mipro_optimizer.py +126 -46
  36. opik_optimizer/mipro_optimizer/utils.py +2 -4
  37. opik_optimizer/optimizable_agent.py +21 -16
  38. opik_optimizer/optimization_config/chat_prompt.py +44 -23
  39. opik_optimizer/optimization_config/configs.py +3 -3
  40. opik_optimizer/optimization_config/mappers.py +9 -8
  41. opik_optimizer/optimization_result.py +22 -14
  42. opik_optimizer/reporting_utils.py +61 -10
  43. opik_optimizer/task_evaluator.py +9 -8
  44. opik_optimizer/utils/__init__.py +15 -0
  45. opik_optimizer/utils/colbert.py +236 -0
  46. opik_optimizer/{utils.py → utils/core.py} +160 -33
  47. opik_optimizer/utils/dataset_utils.py +49 -0
  48. opik_optimizer/utils/prompt_segments.py +186 -0
  49. opik_optimizer-2.0.0.dist-info/METADATA +345 -0
  50. opik_optimizer-2.0.0.dist-info/RECORD +74 -0
  51. opik_optimizer-2.0.0.dist-info/licenses/LICENSE +203 -0
  52. opik_optimizer-1.0.6.dist-info/METADATA +0 -181
  53. opik_optimizer-1.0.6.dist-info/RECORD +0 -50
  54. opik_optimizer-1.0.6.dist-info/licenses/LICENSE +0 -21
  55. {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/WHEEL +0 -0
  56. {opik_optimizer-1.0.6.dist-info → opik_optimizer-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,8 @@
  from contextlib import contextmanager
- from typing import Any, Iterator
+ from typing import Any
+ from collections.abc import Iterator

+ from rich.panel import Panel
  from rich.text import Text

  from ..optimization_config import chat_prompt
@@ -136,6 +138,18 @@ class CandidateGenerationReporter:
  console.print(Text("│"))


+ def display_tool_description(description: str, label: str, color: str) -> None:
+ if not description.strip():
+ return
+ console.print(
+ Panel(
+ description.strip(),
+ title=label,
+ border_style=color,
+ )
+ )
+
+
  @contextmanager
  def display_candidate_generation_report(
  num_prompts: int, verbose: int = 1
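The new display_tool_description helper above wraps a tool description in a rich panel. A minimal, self-contained usage sketch follows; the Console instance and the description text are illustrative assumptions, not code from the package.

from rich.console import Console
from rich.panel import Panel

console = Console()


def display_tool_description(description: str, label: str, color: str) -> None:
    # Skip empty descriptions so no blank panel is rendered.
    if not description.strip():
        return
    console.print(
        Panel(
            description.strip(),
            title=label,
            border_style=color,
        )
    )


# Illustrative call; the description text is made up for this example.
display_tool_description(
    "Searches the documentation index and returns matching snippets.",
    label="Tool description",
    color="cyan",
)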
@@ -162,7 +176,7 @@ def display_prompt_candidate_scoring_report(verbose: int = 1) -> Any:
  ) -> None:
  if verbose >= 1:
  console.print(
- Text(f"│ Evaluating candidate prompt {candidate_count+1}:")
+ Text(f"│ Evaluating candidate prompt {candidate_count + 1}:")
  )
  display_messages(prompt.get_messages(), "│ ")

@@ -4,7 +4,7 @@ import os
  import re
  import threading
  from hashlib import sha256
- from typing import Any, Dict, List, Literal, Optional, cast
+ from typing import Any, Literal, cast

  import litellm
  import pydantic
@@ -42,12 +42,12 @@ class LM(BaseLM):
  max_tokens: int = 1000,
  cache: bool = True,
  cache_in_memory: bool = True,
- callbacks: Optional[List[BaseCallback]] = None,
+ callbacks: list[BaseCallback] | None = None,
  num_retries: int = 8,
  provider=None,
- finetuning_model: Optional[str] = None,
- launch_kwargs: Optional[dict[str, Any]] = None,
- train_kwargs: Optional[dict[str, Any]] = None,
+ finetuning_model: str | None = None,
+ launch_kwargs: dict[str, Any] | None = None,
+ train_kwargs: dict[str, Any] | None = None,
  **kwargs,
  ):
  """
@@ -93,9 +93,9 @@ class LM(BaseLM):

  if model_pattern:
  # Handle OpenAI reasoning models (o1, o3)
- assert (
- max_tokens >= 20_000 and temperature == 1.0
- ), "OpenAI's reasoning models require passing temperature=1.0 and max_tokens >= 20_000 to `dspy.LM(...)`"
+ assert max_tokens >= 20_000 and temperature == 1.0, (
+ "OpenAI's reasoning models require passing temperature=1.0 and max_tokens >= 20_000 to `dspy.LM(...)`"
+ )
  self.kwargs = dict(
  temperature=temperature, max_completion_tokens=max_tokens, **kwargs
  )
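The restyled assertion above enforces the constraint spelled out in its message: OpenAI reasoning models must be configured with temperature=1.0 and max_tokens >= 20_000. A minimal dspy.LM call that satisfies it, shown only as an illustration:

import dspy

# Reasoning models (o1/o3) require temperature=1.0 and max_tokens >= 20_000.
lm = dspy.LM("openai/o1", temperature=1.0, max_tokens=20_000)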
@@ -145,20 +145,27 @@ class LM(BaseLM):
  ):
  settings.usage_tracker.add_usage(self.model, dict(results.usage))

- self.llm_call_counter += 1
+ self.increment_llm_counter()
  return results

- def launch(self, launch_kwargs: Optional[Dict[str, Any]] = None):
+ def increment_llm_counter(self) -> None:
+ """Increment the LLM call counter."""
+ self.llm_call_counter += 1
+ parent = getattr(self, "parent_optimizer", None)
+ if parent is not None and hasattr(parent, "increment_llm_counter"):
+ parent.increment_llm_counter()
+
+ def launch(self, launch_kwargs: dict[str, Any] | None = None):
  self.provider.launch(self, launch_kwargs)

- def kill(self, launch_kwargs: Optional[Dict[str, Any]] = None):
+ def kill(self, launch_kwargs: dict[str, Any] | None = None):
  self.provider.kill(self, launch_kwargs)

  def finetune(
  self,
- train_data: List[Dict[str, Any]],
- train_data_format: Optional[TrainDataFormat],
- train_kwargs: Optional[Dict[str, Any]] = None,
+ train_data: list[dict[str, Any]],
+ train_data_format: TrainDataFormat | None,
+ train_kwargs: dict[str, Any] | None = None,
  ) -> TrainingJob:
  from dspy import settings as settings

@@ -222,7 +229,7 @@ class LM(BaseLM):
  return {key: getattr(self, key) for key in state_keys} | self.kwargs


- def request_cache(maxsize: Optional[int] = None):
+ def request_cache(maxsize: int | None = None):
  """
  A threadsafe decorator to create an in-memory LRU cache for LM inference functions that accept
  a dictionary-like LM request. An in-memory cache for LM calls is critical for ensuring
@@ -235,7 +242,7 @@ def request_cache(maxsize: Optional[int] = None):
  A decorator that wraps the target function with caching.
  """

- def cache_key(request: Dict[str, Any]) -> str:
+ def cache_key(request: dict[str, Any]) -> str:
  """
  Obtain a unique cache key for the given request dictionary by hashing its JSON
  representation. For request fields having types that are known to be JSON-incompatible,
@@ -278,7 +285,7 @@ def request_cache(maxsize: Optional[int] = None):
  # concurrently, e.g. during optimization and evaluation
  lock=threading.RLock(),
  )
- def func_cached(key: str, request: Dict[str, Any], *args, **kwargs):
+ def func_cached(key: str, request: dict[str, Any], *args, **kwargs):
  return func(request, *args, **kwargs)

  @functools.wraps(func)
@@ -302,8 +309,8 @@ def request_cache(maxsize: Optional[int] = None):
  return decorator


- @request_cache(maxsize=None)
- def cached_litellm_completion(request: Dict[str, Any], num_retries: int):
+ @request_cache(maxsize=2000)
+ def cached_litellm_completion(request: dict[str, Any], num_retries: int):
  return litellm_completion(
  request,
  cache={"no-cache": False, "no-store": False},
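These hunks also cap the previously unbounded request cache at maxsize=2000. Per the surrounding code, the decorator keys entries on a sha256 of the request's JSON form and guards the cache with an RLock. A self-contained sketch of that kind of bounded, thread-safe LRU request cache; cachetools is used here as an assumed stand-in for the package's own implementation.

import json
import threading
from hashlib import sha256
from typing import Any

from cachetools import LRUCache, cached


def _cache_key(request: dict[str, Any]) -> str:
    # Hash the JSON form of the request; sort keys so equivalent dicts collide.
    return sha256(json.dumps(request, sort_keys=True, default=str).encode()).hexdigest()


calls: list[str] = []


@cached(
    cache=LRUCache(maxsize=2000),        # bounded, evicts least recently used
    key=lambda request: _cache_key(request),
    lock=threading.RLock(),               # cache may be shared across threads
)
def fake_completion(request: dict[str, Any]) -> str:
    # Stand-in for an LLM call; only runs on a cache miss.
    calls.append(request["model"])
    return f"echo: {request['messages'][-1]['content']}"


fake_completion({"model": "some-model", "messages": [{"role": "user", "content": "hi"}]})
fake_completion({"model": "some-model", "messages": [{"role": "user", "content": "hi"}]})
assert len(calls) == 1  # the second identical request was served from the cache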
@@ -312,7 +319,7 @@ def cached_litellm_completion(request: Dict[str, Any], num_retries: int):


  def litellm_completion(
- request: Dict[str, Any],
+ request: dict[str, Any],
  num_retries: int,
  cache={"no-cache": True, "no-store": True},
  ):
@@ -361,8 +368,8 @@ def litellm_completion(
  return stream_completion()


- @request_cache(maxsize=None)
- def cached_litellm_text_completion(request: Dict[str, Any], num_retries: int):
+ @request_cache(maxsize=2000)
+ def cached_litellm_text_completion(request: dict[str, Any], num_retries: int):
  return litellm_text_completion(
  request,
  num_retries=num_retries,
@@ -371,7 +378,7 @@ def cached_litellm_text_completion(request: Dict[str, Any], num_retries: int):


  def litellm_text_completion(
- request: Dict[str, Any],
+ request: dict[str, Any],
  num_retries: int,
  cache={"no-cache": True, "no-store": True},
  ):
@@ -1,7 +1,8 @@
  import random
  import textwrap
  from collections import defaultdict
- from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
+ from typing import Any, Literal
+ from collections.abc import Callable

  import dspy
  import numpy as np
@@ -69,27 +70,27 @@ class MIPROv2(Teleprompter):
  def __init__(
  self,
  metric: Callable,
- prompt_model: Optional[Any] = None,
- task_model: Optional[Any] = None,
- teacher_settings: Dict = {},
+ prompt_model: Any | None = None,
+ task_model: Any | None = None,
+ teacher_settings: dict = {},
  max_bootstrapped_demos: int = 4,
  max_labeled_demos: int = 4,
- auto: Optional[Literal["light", "medium", "heavy"]] = "medium",
+ auto: Literal["light", "medium", "heavy"] | None = "medium",
  num_candidates: int = 10,
- num_threads: Optional[int] = None,
+ num_threads: int | None = None,
  max_errors: int = 10,
- seed: int = 9,
+ seed: int = 42,
  init_temperature: float = 0.5,
  verbose: bool = False,
  track_stats: bool = True,
- log_dir: Optional[str] = None,
- metric_threshold: Optional[float] = None,
- opik_dataset: Optional[opik.Dataset] = None,
- opik_metric: Optional[Callable] = None,
- opik_prompt_task_config: Optional[TaskConfig] = None,
- opik_project_name: Optional[str] = None,
- opik_optimization_id: Optional[str] = None,
- experiment_config: Optional[Dict[str, Any]] = None,
+ log_dir: str | None = None,
+ metric_threshold: float | None = None,
+ opik_dataset: opik.Dataset | None = None,
+ opik_metric: Callable | None = None,
+ opik_prompt_task_config: TaskConfig | None = None,
+ opik_project_name: str | None = None,
+ opik_optimization_id: str | None = None,
+ experiment_config: dict[str, Any] | None = None,
  ):
  # Validate 'auto' parameter
  allowed_modes = {None, "light", "medium", "heavy"}
@@ -129,13 +130,13 @@ class MIPROv2(Teleprompter):
  self,
  student: Any,
  *,
- trainset: List,
+ trainset: list,
  teacher: Any = None,
- valset: Optional[List] = None,
+ valset: list | None = None,
  num_trials: int = 30,
- max_bootstrapped_demos: Optional[int] = None,
- max_labeled_demos: Optional[int] = None,
- seed: Optional[int] = None,
+ max_bootstrapped_demos: int | None = None,
+ max_labeled_demos: int | None = None,
+ seed: int | None = None,
  minibatch: bool = True,
  minibatch_size: int = 35,
  minibatch_full_eval_steps: int = 5,
@@ -145,7 +146,7 @@ class MIPROv2(Teleprompter):
  tip_aware_proposer: bool = True,
  fewshot_aware_proposer: bool = True,
  requires_permission_to_run: bool = True,
- provide_traceback: Optional[bool] = None,
+ provide_traceback: bool | None = None,
  ) -> Any:
  # Set random seeds
  seed = seed or self.seed
@@ -252,8 +253,8 @@ class MIPROv2(Teleprompter):
  num_trials: int,
  minibatch: bool,
  zeroshot_opt: bool,
- valset: List,
- ) -> Tuple[int, List, bool]:
+ valset: list,
+ ) -> tuple[int, list, bool]:
  if self.auto is None:
  return num_trials, valset, minibatch

@@ -273,7 +274,7 @@ class MIPROv2(Teleprompter):

  return num_trials, valset, minibatch

- def _set_and_validate_datasets(self, trainset: List, valset: Optional[List]):
+ def _set_and_validate_datasets(self, trainset: list, valset: list | None):
  if not trainset:
  raise ValueError("Trainset cannot be empty.")

@@ -292,7 +293,7 @@ class MIPROv2(Teleprompter):

  return trainset, valset

- def _print_auto_run_settings(self, num_trials: int, minibatch: bool, valset: List):
+ def _print_auto_run_settings(self, num_trials: int, minibatch: bool, valset: list):
  logger.info(
  f"\nRUNNING WITH THE FOLLOWING {self.auto.upper()} AUTO RUN SETTINGS:"
  f"\nnum_trials: {num_trials}"
@@ -308,9 +309,9 @@ class MIPROv2(Teleprompter):
  minibatch: bool,
  minibatch_size: int,
  minibatch_full_eval_steps: int,
- valset: List,
+ valset: list,
  program_aware_proposer: bool,
- ) -> Tuple[str, str]:
+ ) -> tuple[str, str]:
  num_predictors = len(program.predictors())

  # Estimate prompt model calls
@@ -359,7 +360,7 @@ class MIPROv2(Teleprompter):
  minibatch: bool,
  minibatch_size: int,
  minibatch_full_eval_steps: int,
- valset: List,
+ valset: list,
  program_aware_proposer: bool,
  ) -> bool:
  prompt_model_line, task_model_line = self._estimate_lm_calls(
@@ -414,8 +415,8 @@ class MIPROv2(Teleprompter):
  return user_input == "y"

  def _bootstrap_fewshot_examples(
- self, program: Any, trainset: List, seed: int, teacher: Any
- ) -> Optional[List]:
+ self, program: Any, trainset: list, seed: int, teacher: Any
+ ) -> list | None:
  logger.info("\n==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==")
  if self.max_bootstrapped_demos > 0:
  logger.info(
@@ -461,14 +462,14 @@ class MIPROv2(Teleprompter):
  def _propose_instructions(
  self,
  program: Any,
- trainset: List,
- demo_candidates: Optional[List],
+ trainset: list,
+ demo_candidates: list | None,
  view_data_batch_size: int,
  program_aware_proposer: bool,
  data_aware_proposer: bool,
  tip_aware_proposer: bool,
  fewshot_aware_proposer: bool,
- ) -> Dict[int, List[str]]:
+ ) -> dict[int, list[str]]:
  logger.info("\n==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==")
  logger.info(
  "We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions."
@@ -513,16 +514,16 @@ class MIPROv2(Teleprompter):
  def _optimize_prompt_parameters(
  self,
  program: Any,
- instruction_candidates: Dict[int, List[str]],
- demo_candidates: Optional[List],
+ instruction_candidates: dict[int, list[str]],
+ demo_candidates: list | None,
  evaluate: Evaluate,
- valset: List,
+ valset: list,
  num_trials: int,
  minibatch: bool,
  minibatch_size: int,
  minibatch_full_eval_steps: int,
  seed: int,
- ) -> Optional[Any]:
+ ) -> Any | None:
  # Run optimization
  optuna.logging.set_verbosity(optuna.logging.WARNING)
  logger.info("==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==")
@@ -848,12 +849,12 @@ class MIPROv2(Teleprompter):
  def _select_and_insert_instructions_and_demos(
  self,
  candidate_program: Any,
- instruction_candidates: Dict[int, List[str]],
- demo_candidates: Optional[List],
+ instruction_candidates: dict[int, list[str]],
+ demo_candidates: list | None,
  trial: optuna.trial.Trial,
- trial_logs: Dict,
+ trial_logs: dict,
  trial_num: int,
- ) -> List[str]:
+ ) -> list[str]:
  chosen_params = []
  raw_chosen_params = {}

@@ -902,18 +903,18 @@ class MIPROv2(Teleprompter):
  self,
  trial_num: int,
  adjusted_num_trials: int,
- param_score_dict: Dict,
- fully_evaled_param_combos: Dict,
+ param_score_dict: dict,
+ fully_evaled_param_combos: dict,
  evaluate: Evaluate,
- valset: List,
- trial_logs: Dict,
+ valset: list,
+ trial_logs: dict,
  total_eval_calls: int,
  score_data,
  best_score: float,
  best_program: Any,
  study: optuna.Study,
- instruction_candidates: List,
- demo_candidates: List,
+ instruction_candidates: list,
+ demo_candidates: list,
  ):
  logger.info(
  f"===== Trial {trial_num + 1} / {adjusted_num_trials} - Full Evaluation ====="
@@ -1026,19 +1027,19 @@ class MIPROv2(Teleprompter):

  def eval_candidate_program_with_opik(
  opik_dataset: opik.Dataset,
- trainset: List,
+ trainset: list,
  candidate_program: Any,
  project_name: str,
  metric: Callable,
  prompt_task_config: TaskConfig,
  num_threads: int,
- experiment_config: Optional[Dict[str, Any]] = None,
- optimization_id: Optional[str] = None,
+ experiment_config: dict[str, Any] | None = None,
+ optimization_id: str | None = None,
  ):
  """Evaluate a candidate program on the trainset, using the specified batch size."""
  dataset_item_ids = [example["id"] for example in trainset]

- def program_task(dataset_item: Dict[str, Any]) -> Dict[str, Any]:
+ def program_task(dataset_item: dict[str, Any]) -> dict[str, Any]:
  program_inputs = {
  input_key: dataset_item[input_key]
  for input_key in prompt_task_config.input_dataset_fields