opik-optimizer 0.9.2__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. opik_optimizer/__init__.py +7 -5
  2. opik_optimizer/_throttle.py +8 -8
  3. opik_optimizer/base_optimizer.py +98 -45
  4. opik_optimizer/cache_config.py +5 -3
  5. opik_optimizer/datasets/ai2_arc.py +15 -13
  6. opik_optimizer/datasets/cnn_dailymail.py +19 -15
  7. opik_optimizer/datasets/election_questions.py +10 -11
  8. opik_optimizer/datasets/gsm8k.py +16 -11
  9. opik_optimizer/datasets/halu_eval.py +6 -5
  10. opik_optimizer/datasets/hotpot_qa.py +17 -16
  11. opik_optimizer/datasets/medhallu.py +10 -7
  12. opik_optimizer/datasets/rag_hallucinations.py +11 -8
  13. opik_optimizer/datasets/ragbench.py +17 -9
  14. opik_optimizer/datasets/tiny_test.py +33 -37
  15. opik_optimizer/datasets/truthful_qa.py +18 -12
  16. opik_optimizer/demo/cache.py +6 -6
  17. opik_optimizer/demo/datasets.py +3 -7
  18. opik_optimizer/evolutionary_optimizer/__init__.py +3 -1
  19. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +722 -429
  20. opik_optimizer/evolutionary_optimizer/reporting.py +155 -74
  21. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +271 -188
  22. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +79 -28
  23. opik_optimizer/logging_config.py +19 -15
  24. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +209 -129
  25. opik_optimizer/meta_prompt_optimizer/reporting.py +121 -46
  26. opik_optimizer/mipro_optimizer/__init__.py +2 -0
  27. opik_optimizer/mipro_optimizer/_lm.py +38 -9
  28. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +37 -26
  29. opik_optimizer/mipro_optimizer/mipro_optimizer.py +132 -63
  30. opik_optimizer/mipro_optimizer/utils.py +5 -2
  31. opik_optimizer/optimizable_agent.py +179 -0
  32. opik_optimizer/optimization_config/chat_prompt.py +143 -73
  33. opik_optimizer/optimization_config/configs.py +4 -3
  34. opik_optimizer/optimization_config/mappers.py +18 -6
  35. opik_optimizer/optimization_result.py +22 -13
  36. opik_optimizer/py.typed +0 -0
  37. opik_optimizer/reporting_utils.py +89 -58
  38. opik_optimizer/task_evaluator.py +12 -14
  39. opik_optimizer/utils.py +117 -14
  40. {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/METADATA +8 -8
  41. opik_optimizer-1.0.1.dist-info/RECORD +50 -0
  42. opik_optimizer-0.9.2.dist-info/RECORD +0 -48
  43. {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/WHEEL +0 -0
  44. {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/licenses/LICENSE +0 -0
  45. {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,9 @@
  from contextlib import contextmanager
+ from typing import Any, Iterator

  from rich.text import Text

+ from ..optimization_config import chat_prompt
  from ..reporting_utils import (
      convert_tqdm_to_rich,
      display_configuration, # noqa: F401
@@ -17,33 +19,66 @@ console = get_console()


  @contextmanager
- def display_round_progress(max_rounds: int, verbose: int = 1):
+ def display_round_progress(max_rounds: int, verbose: int = 1) -> Any:
      """Context manager to display messages during an evaluation phase."""
-
+
      # Create a simple object with a method to set the score
      class Reporter:
-         def failed_to_generate(self, num_prompts, error):
+         def failed_to_generate(self, num_prompts: int, error: str) -> None:
              if verbose >= 1:
-                 console.print(Text(f"│ Failed to generate {num_prompts} candidate prompt{'' if num_prompts == 1 else 's'}: {error}", style="red"))
+                 console.print(
+                     Text(
+                         f"│ Failed to generate {num_prompts} candidate prompt{'' if num_prompts == 1 else 's'}: {error}",
+                         style="red",
+                     )
+                 )
                  console.print(Text("│"))
-
-         def round_start(self, round_number):
+
+         def round_start(self, round_number: int) -> None:
              if verbose >= 1:
-                 console.print(Text(f"│ - Starting optimization round {round_number + 1} of {max_rounds}"))
+                 console.print(
+                     Text(
+                         f"│ - Starting optimization round {round_number + 1} of {max_rounds}"
+                     )
+                 )

-         def round_end(self, round_number, score, best_score, best_prompt):
+         def round_end(self, round_number: int, score: float, best_score: float) -> None:
              if verbose >= 1:
-                 console.print(Text(f"│ Completed optimization round {round_number + 1} of {max_rounds}"))
+                 console.print(
+                     Text(
+                         f"│ Completed optimization round {round_number + 1} of {max_rounds}"
+                     )
+                 )
                  if best_score == 0 and score == 0:
-                     console.print(Text("│ No improvement in this optimization round - score is 0", style="yellow"))
+                     console.print(
+                         Text(
+                             "│ No improvement in this optimization round - score is 0",
+                             style="yellow",
+                         )
+                     )
                  elif best_score == 0:
-                     console.print(Text(f"│ Found a new best performing prompt: {score:.4f}", style="green"))
+                     console.print(
+                         Text(
+                             f"│ Found a new best performing prompt: {score:.4f}",
+                             style="green",
+                         )
+                     )
                  elif score > best_score:
                      perc_change = (score - best_score) / best_score
-                     console.print(Text(f"│ Found a new best performing prompt: {score:.4f} ({perc_change:.2%})", style="green"))
+                     console.print(
+                         Text(
+                             f"│ Found a new best performing prompt: {score:.4f} ({perc_change:.2%})",
+                             style="green",
+                         )
+                     )
                  elif score <= best_score:
-                     console.print(Text("│ No improvement in this optimization round", style="red"))
-
+                     console.print(
+                         Text(
+                             "│ No improvement in this optimization round",
+                             style="red",
+                         )
+                     )
+
                  console.print(Text("│"))

      # Use our log suppression context manager and yield the reporter
@@ -56,20 +91,22 @@ def display_round_progress(max_rounds: int, verbose: int = 1):


  @contextmanager
- def display_evaluation(message: str = "First we will establish the baseline performance:", verbose: int = 1):
+ def display_evaluation(
+     message: str = "First we will establish the baseline performance:", verbose: int = 1
+ ) -> Any:
      """Context manager to display messages during an evaluation phase."""
-     score = None
-
      # Entry point
      if verbose >= 1:
          console.print(Text(f"> {message}"))
-
+
      # Create a simple object with a method to set the score
      class Reporter:
-         def set_score(self, s):
+         def set_score(self, s: float) -> None:
              if verbose >= 1:
-                 console.print(Text(f"\r Baseline score was: {s:.4f}.\n", style="green"))
-
+                 console.print(
+                     Text(f"\r Baseline score was: {s:.4f}.\n", style="green")
+                 )
+
      # Use our log suppression context manager and yield the reporter
      with suppress_opik_logs():
          with convert_tqdm_to_rich(" Evaluation", verbose=verbose):
@@ -78,62 +115,100 @@ def display_evaluation(message: str = "First we will establish the baseline perf
              finally:
                  pass

- def display_optimization_start_message(verbose: int = 1):
+
+ def display_optimization_start_message(verbose: int = 1) -> None:
      if verbose >= 1:
          console.print(Text("> Starting the optimization run"))
          console.print(Text("│"))


+ class CandidateGenerationReporter:
+     def __init__(self, num_prompts: int):
+         self.num_prompts = num_prompts
+
+     def set_generated_prompts(self) -> None:
+         console.print(
+             Text(
+                 f"│ Successfully generated {self.num_prompts} candidate prompt{'' if self.num_prompts == 1 else 's'}",
+                 style="dim",
+             )
+         )
+         console.print(Text("│"))
+
+
  @contextmanager
- def display_candidate_generation_report(num_prompts: int, verbose: int = 1):
-     """Context manager to display messages during an evaluation phase."""
-     # Entry point
+ def display_candidate_generation_report(
+     num_prompts: int, verbose: int = 1
+ ) -> Iterator[CandidateGenerationReporter]:
      if verbose >= 1:
-         console.print(Text(f"│ Generating candidate prompt{'' if num_prompts == 1 else 's'}:"))
-
-     # Create a simple object with a method to set the score
-     class Reporter:
-         def set_generated_prompts(self, prompts):
-             console.print(Text(f"│ Successfully generated {num_prompts} candidate prompt{'' if num_prompts == 1 else 's'}", style="dim"))
-             console.print(Text("│"))
+         console.print(
+             Text(f"│ Generating candidate prompt{'' if num_prompts == 1 else 's'}:")
+         )

      try:
-         yield Reporter()
+         yield CandidateGenerationReporter(num_prompts)
      finally:
          pass


  @contextmanager
- def display_prompt_candidate_scoring_report(candidate_count, prompt, verbose: int = 1):
+ def display_prompt_candidate_scoring_report(verbose: int = 1) -> Any:
      """Context manager to display messages during an evaluation phase."""
+
      # Create a simple object with a method to set the score
      class Reporter:
-         def set_generated_prompts(self, candidate_count, prompt):
+         def set_generated_prompts(
+             self, candidate_count: int, prompt: chat_prompt.ChatPrompt
+         ) -> None:
              if verbose >= 1:
-                 console.print(Text(f"│ Evaluating candidate prompt {candidate_count+1}:"))
-                 display_messages(prompt, "│ ")
-
-         def set_final_score(self, best_score, score):
+                 console.print(
+                     Text(f"│ Evaluating candidate prompt {candidate_count+1}:")
+                 )
+                 display_messages(prompt.get_messages(), "│ ")
+
+         def set_final_score(self, best_score: float, score: float) -> None:
              if verbose >= 1:
                  if best_score == 0 and score > 0:
-                     console.print(Text(f"│ Evaluation score: {score:.4f}", style="green"))
+                     console.print(
+                         Text(f"│ Evaluation score: {score:.4f}", style="green")
+                     )
                  elif best_score == 0 and score == 0:
-                     console.print(Text(f"│ Evaluation score: {score:.4f}", style="dim yellow"))
+                     console.print(
+                         Text(
+                             f"│ Evaluation score: {score:.4f}",
+                             style="dim yellow",
+                         )
+                     )
                  elif score > best_score:
                      perc_change = (score - best_score) / best_score
-                     console.print(Text(f"│ Evaluation score: {score:.4f} ({perc_change:.2%})", style="green"))
+                     console.print(
+                         Text(
+                             f"│ Evaluation score: {score:.4f} ({perc_change:.2%})",
+                             style="green",
+                         )
+                     )
                  elif score < best_score:
                      perc_change = (score - best_score) / best_score
-                     console.print(Text(f"│ Evaluation score: {score:.4f} ({perc_change:.2%})", style="red"))
+                     console.print(
+                         Text(
+                             f"│ Evaluation score: {score:.4f} ({perc_change:.2%})",
+                             style="red",
+                         )
+                     )
                  else:
-                     console.print(Text(f"│ Evaluation score: {score:.4f}", style="dim yellow"))
-
+                     console.print(
+                         Text(
+                             f"│ Evaluation score: {score:.4f}",
+                             style="dim yellow",
+                         )
+                     )
+
                  console.print(Text("│"))
                  console.print(Text("│"))
+
      try:
          with suppress_opik_logs():
              with convert_tqdm_to_rich("│ Evaluation", verbose=verbose):
                  yield Reporter()
      finally:
          pass
-
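For orientation, the sketch below strings the 1.0.1 reporting helpers from this file together the way an optimizer might drive them. It is a minimal illustration only: the `meta_prompt_optimizer.reporting` import path, the loop structure, and every score value are assumptions, not code taken from the wheel.

# Hypothetical driver for the context managers shown in the hunks above.
# Import path, round count, and scores are assumptions for illustration.
from opik_optimizer.meta_prompt_optimizer import reporting

best_score = 0.0
with reporting.display_evaluation(verbose=1) as baseline:
    baseline.set_score(0.42)  # in real use, the score comes from an evaluation run

reporting.display_optimization_start_message(verbose=1)

with reporting.display_round_progress(max_rounds=3, verbose=1) as round_reporter:
    for round_number in range(3):
        round_reporter.round_start(round_number)
        with reporting.display_candidate_generation_report(4, verbose=1) as generation:
            generation.set_generated_prompts()  # 1.0.1 signature takes no arguments
        score = 0.55  # placeholder for the best candidate score of this round
        round_reporter.round_end(round_number, score, best_score)
        best_score = max(best_score, score)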
@@ -1 +1,3 @@
  from .mipro_optimizer import MiproOptimizer, MIPROv2
+
+ __all__ = ["MiproOptimizer", "MIPROv2"]
@@ -22,12 +22,13 @@ from dspy.dsp.utils.settings import settings
  from dspy.utils.callback import BaseCallback, with_callbacks
  from dspy.clients.base_lm import BaseLM

- from .._throttle import RateLimiter, rate_limited, get_rate_limiter_for_current_opik_installation
+ from .._throttle import rate_limited, get_rate_limiter_for_current_opik_installation

  logger = logging.getLogger(__name__)
  # Limit how fast an LLM can be called:
  limiter = get_rate_limiter_for_current_opik_installation()

+
  class LM(BaseLM):
      """
      A language model supporting chat or text completion requests for use with DSPy modules.
@@ -95,7 +96,9 @@ class LM(BaseLM):
              assert (
                  max_tokens >= 20_000 and temperature == 1.0
              ), "OpenAI's reasoning models require passing temperature=1.0 and max_tokens >= 20_000 to `dspy.LM(...)`"
-             self.kwargs = dict(temperature=temperature, max_completion_tokens=max_tokens, **kwargs)
+             self.kwargs = dict(
+                 temperature=temperature, max_completion_tokens=max_tokens, **kwargs
+             )
          else:
              self.kwargs = dict(temperature=temperature, max_tokens=max_tokens, **kwargs)

@@ -111,14 +114,22 @@ class LM(BaseLM):

          # Make the request and handle LRU & disk caching.
          if cache_in_memory:
-             completion = cached_litellm_completion if self.model_type == "chat" else cached_litellm_text_completion
+             completion = (
+                 cached_litellm_completion
+                 if self.model_type == "chat"
+                 else cached_litellm_text_completion
+             )

              results = completion(
                  request=dict(model=self.model, messages=messages, **kwargs),
                  num_retries=self.num_retries,
              )
          else:
-             completion = litellm_completion if self.model_type == "chat" else litellm_text_completion
+             completion = (
+                 litellm_completion
+                 if self.model_type == "chat"
+                 else litellm_text_completion
+             )

              results = completion(
                  request=dict(model=self.model, messages=messages, **kwargs),
@@ -127,7 +138,11 @@ class LM(BaseLM):
                  cache={"no-cache": not cache, "no-store": not cache},
              )

-         if not getattr(results, "cache_hit", False) and dspy.settings.usage_tracker and hasattr(results, "usage"):
+         if (
+             not getattr(results, "cache_hit", False)
+             and dspy.settings.usage_tracker
+             and hasattr(results, "usage")
+         ):
              settings.usage_tracker.add_usage(self.model, dict(results.usage))

          self.llm_call_counter += 1
@@ -239,7 +254,11 @@ def request_cache(maxsize: Optional[int] = None):
                  return value.model_json_schema()
              elif isinstance(value, pydantic.BaseModel):
                  return value.model_dump()
-             elif callable(value) and hasattr(value, "__code__") and hasattr(value.__code__, "co_code"):
+             elif (
+                 callable(value)
+                 and hasattr(value, "__code__")
+                 and hasattr(value.__code__, "co_code")
+             ):
                  return value.__code__.co_code.decode("utf-8")
              else:
                  # Note: We don't attempt to compute a hash of the value, since the default
@@ -292,7 +311,11 @@ def cached_litellm_completion(request: Dict[str, Any], num_retries: int):
      )


- def litellm_completion(request: Dict[str, Any], num_retries: int, cache={"no-cache": True, "no-store": True}):
+ def litellm_completion(
+     request: Dict[str, Any],
+     num_retries: int,
+     cache={"no-cache": True, "no-store": True},
+ ):
      retry_kwargs = dict(
          retry_policy=_get_litellm_retry_policy(num_retries),
          retry_strategy="exponential_backoff_retry",
@@ -347,7 +370,11 @@ def cached_litellm_text_completion(request: Dict[str, Any], num_retries: int):
      )


- def litellm_text_completion(request: Dict[str, Any], num_retries: int, cache={"no-cache": True, "no-store": True}):
+ def litellm_text_completion(
+     request: Dict[str, Any],
+     num_retries: int,
+     cache={"no-cache": True, "no-store": True},
+ ):
      # Extract the provider and model from the model string.
      # TODO: Not all the models are in the format of "provider/model"
      model = request.pop("model").split("/", 1)
@@ -358,7 +385,9 @@ def litellm_text_completion(request: Dict[str, Any], num_retries: int, cache={"n
      api_base = request.pop("api_base", None) or os.getenv(f"{provider}_API_BASE")

      # Build the prompt from the messages.
-     prompt = "\n\n".join([x["content"] for x in request.pop("messages")] + ["BEGIN RESPONSE:"])
+     prompt = "\n\n".join(
+         [x["content"] for x in request.pop("messages")] + ["BEGIN RESPONSE:"]
+     )

      return litellm.text_completion(
          cache=cache,
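To make the reformatted prompt-building step concrete, here is a self-contained sketch of what `litellm_text_completion` computes before calling `litellm.text_completion`. The request payload is invented for this example, and, as the TODO in the hunk notes, not every model string follows the "provider/model" form.

# Standalone illustration of the model split and prompt join shown above.
# The request dict is a made-up example.
request = {
    "model": "openai/gpt-4o-mini",
    "messages": [
        {"role": "system", "content": "You are terse."},
        {"role": "user", "content": "Say hi."},
    ],
}

# "provider/model" is split once; the provider prefix drives env-var lookups
# such as f"{provider}_API_BASE" in the surrounding code.
provider, model = request.pop("model").split("/", 1)

# Chat messages are flattened into a single text-completion prompt.
prompt = "\n\n".join(
    [x["content"] for x in request.pop("messages")] + ["BEGIN RESPONSE:"]
)
assert prompt == "You are terse.\n\nSay hi.\n\nBEGIN RESPONSE:"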
@@ -23,12 +23,15 @@ from dspy.teleprompt.utils import (
  from optuna.distributions import CategoricalDistribution

  from ..optimization_config.configs import TaskConfig
+ from opik_optimizer import task_evaluator
+ from opik_optimizer.optimization_config import mappers


- class Logger():
+ class Logger:
      def info(self, *args, **kwargs):
          print(*args)

+
  logger = Logger()

  # Constants
@@ -49,10 +52,6 @@ BLUE = "\033[94m"
  BOLD = "\033[1m"
  ENDC = "\033[0m" # Resets the color to default

- import opik
- from opik_optimizer import task_evaluator
- from opik_optimizer.optimization_config.configs import TaskConfig
- from opik_optimizer.optimization_config import mappers

  def get_prompt(program):
      """
@@ -65,6 +64,7 @@ def get_prompt(program):

      return instructions

+
  class MIPROv2(Teleprompter):
      def __init__(
          self,
@@ -554,18 +554,19 @@ class MIPROv2(Teleprompter):

          examples = []
          for demo in demo_candidates.values():
-             for l in demo:
-                 for example in l:
+             for l_ in demo:
+                 for example in l_:
                      examples.append(example.toDict())
          prompt = get_prompt(program)
          experiment_config = {
              **self.experiment_config,
-             **{"configuration": {
-                 "prompt": prompt,
-                 "examples": examples,
+             **{
+                 "configuration": {
+                     "prompt": prompt,
+                     "examples": examples,
+                 },
+                 "evaluation": "initial",
              },
-             "evaluation": "initial",
-             }
          }

          default_score = eval_candidate_program_with_opik(
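The experiment_config change above is purely presentational: both the 0.9.2 and the 1.0.1 spelling merge to the same flat dictionary. A small sketch of the resulting shape, with invented stand-in values:

# Stand-in values; only the resulting dictionary shape matters here.
self_experiment_config = {"optimizer": "MiproOptimizer", "dataset": "hotpot_qa"}
prompt = "Answer the question concisely."
examples = [{"question": "2 + 2?", "answer": "4"}]

experiment_config = {
    **self_experiment_config,
    **{
        "configuration": {
            "prompt": prompt,
            "examples": examples,
        },
        "evaluation": "initial",
    },
}
# experiment_config == {
#     "optimizer": "MiproOptimizer",
#     "dataset": "hotpot_qa",
#     "configuration": {"prompt": ..., "examples": [...]},
#     "evaluation": "initial",
# }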
@@ -579,7 +580,7 @@ class MIPROv2(Teleprompter):
              experiment_config=experiment_config,
              optimization_id=self.opik_optimization_id,
          )
-
+
          logger.info(f"Default program score: {default_score}\n")

          trial_logs = {}
@@ -606,7 +607,13 @@ class MIPROv2(Teleprompter):

          # Define the objective function
          def objective(trial):
-             nonlocal program, best_program, best_score, trial_logs, total_eval_calls, score_data
+             nonlocal \
+                 program, \
+                 best_program, \
+                 best_score, \
+                 trial_logs, \
+                 total_eval_calls, \
+                 score_data

              trial_num = trial.number + 1
              if minibatch:
@@ -927,18 +934,19 @@ class MIPROv2(Teleprompter):

          examples = []
          for demo in demo_candidates.values():
-             for l in demo:
-                 for example in l:
+             for l_ in demo:
+                 for example in l_:
                      examples.append(example.toDict())
          prompt = get_prompt(highest_mean_program)
          experiment_config = {
              **self.experiment_config,
-             **{"configuration": {
-                 "prompt": prompt,
-                 "examples": examples,
-             },
-             "evaluation": "full",
-             }
+             **{
+                 "configuration": {
+                     "prompt": prompt,
+                     "examples": examples,
+                 },
+                 "evaluation": "full",
+             },
          }

          full_eval_score = eval_candidate_program_with_opik(
@@ -988,7 +996,7 @@ class MIPROv2(Teleprompter):
              trial_logs[trial_num + 1]["full_eval_score"] = full_eval_score

              if full_eval_score == 1.0:
-                 return self.early_stop(default_score, program)
+                 return self.early_stop(default_score, program) # noqa

              # Update best score and program if necessary
              if full_eval_score > best_score:
@@ -1042,9 +1050,12 @@ def eval_candidate_program_with_opik(
              candidate_program._assert_failures += dspy.settings.get("assert_failures")
          if hasattr(candidate_program, "_suggest_failures"):
              candidate_program._suggest_failures += dspy.settings.get("suggest_failures")
-
-         return {mappers.from_llm_response_text(): prediction[prompt_task_config.output_dataset_field]}

+         return {
+             mappers.from_llm_response_text(): prediction[
+                 prompt_task_config.output_dataset_field
+             ]
+         }

      score = task_evaluator.evaluate(
          dataset=opik_dataset,
@@ -1056,5 +1067,5 @@ def eval_candidate_program_with_opik(
          experiment_config=experiment_config,
          optimization_id=optimization_id,
      )
-
+
      return score