opik-optimizer 0.9.0rc0__tar.gz → 0.9.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {opik_optimizer-0.9.0rc0/src/opik_optimizer.egg-info → opik_optimizer-0.9.2}/PKG-INFO +1 -1
  2. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/setup.py +1 -1
  3. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +37 -19
  4. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/evolutionary_optimizer/reporting.py +0 -2
  5. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +50 -21
  6. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +42 -26
  7. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/meta_prompt_optimizer/reporting.py +0 -1
  8. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/mipro_optimizer/_lm.py +3 -0
  9. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/mipro_optimizer/mipro_optimizer.py +7 -8
  10. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/optimization_config/chat_prompt.py +2 -2
  11. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/optimization_result.py +8 -9
  12. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/reporting_utils.py +27 -8
  13. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/task_evaluator.py +11 -4
  14. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/utils.py +10 -28
  15. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2/src/opik_optimizer.egg-info}/PKG-INFO +1 -1
  16. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/LICENSE +0 -0
  17. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/README.md +0 -0
  18. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/pyproject.toml +0 -0
  19. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/setup.cfg +0 -0
  20. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/__init__.py +1 -1
  21. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/_throttle.py +0 -0
  22. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/base_optimizer.py +0 -0
  23. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/cache_config.py +0 -0
  24. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/data/hotpot-500.json +0 -0
  25. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/datasets/__init__.py +0 -0
  26. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/datasets/ai2_arc.py +0 -0
  27. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/datasets/cnn_dailymail.py +0 -0
  28. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/datasets/election_questions.py +0 -0
  29. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/datasets/gsm8k.py +0 -0
  30. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/datasets/halu_eval.py +0 -0
  31. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/datasets/hotpot_qa.py +0 -0
  32. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/datasets/medhallu.py +0 -0
  33. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/datasets/rag_hallucinations.py +0 -0
  34. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/datasets/ragbench.py +0 -0
  35. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/datasets/tiny_test.py +0 -0
  36. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/datasets/truthful_qa.py +0 -0
  37. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/demo/__init__.py +0 -0
  38. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/demo/cache.py +0 -0
  39. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/demo/datasets.py +0 -0
  40. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/evolutionary_optimizer/__init__.py +0 -0
  41. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/few_shot_bayesian_optimizer/__init__.py +0 -0
  42. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/few_shot_bayesian_optimizer/reporting.py +0 -0
  43. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/logging_config.py +0 -0
  44. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/meta_prompt_optimizer/__init__.py +0 -0
  45. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/mipro_optimizer/__init__.py +0 -0
  46. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +0 -0
  47. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/mipro_optimizer/utils.py +0 -0
  48. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/optimization_config/__init__.py +0 -0
  49. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/optimization_config/configs.py +0 -0
  50. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer/optimization_config/mappers.py +0 -0
  51. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer.egg-info/SOURCES.txt +0 -0
  52. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer.egg-info/dependency_links.txt +0 -0
  53. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer.egg-info/requires.txt +0 -0
  54. {opik_optimizer-0.9.0rc0 → opik_optimizer-0.9.2}/src/opik_optimizer.egg-info/top_level.txt +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: opik_optimizer
- Version: 0.9.0rc0
+ Version: 0.9.2
  Summary: Agent optimization with Opik
  Home-page: https://github.com/comet-ml/opik
  Author: Comet ML

setup.py
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup

  setup(
  name="opik_optimizer",
- version="0.9.0rc0",
+ version="0.9.2",
  description="Agent optimization with Opik",
  author="Comet ML",
  author_email="support@comet.com",

src/opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py
@@ -828,7 +828,35 @@ Return only the new prompt list object.
  auto_continue: Whether to automatically continue optimization
  **kwargs: Additional keyword arguments
  """
- reporting.display_header(self.__class__.__name__, verbose=self.verbose)
+ if not isinstance(prompt, chat_prompt.ChatPrompt):
+ raise ValueError("Prompt must be a ChatPrompt object")
+
+ if not isinstance(dataset, opik.Dataset):
+ raise ValueError("Dataset must be a Dataset object")
+
+ if not isinstance(metric, Callable):
+ raise ValueError("Metric must be a function that takes `dataset_item` and `llm_output` as arguments.")
+
+ # Step 0. Start Opik optimization run
+ opik_optimization_run: Optional[optimization.Optimization] = None
+ try:
+ opik_optimization_run: optimization.Optimization = self._opik_client.create_optimization(
+ dataset_name=dataset.name,
+ objective_name=metric.__name__,
+ metadata={"optimizer": self.__class__.__name__},
+ )
+ self._current_optimization_id = opik_optimization_run.id
+ except Exception as e:
+ logger.warning(f"Opik server error: {e}. Continuing without Opik tracking.")
+ self._current_optimization_id = None
+
+ reporting.display_header(
+ algorithm=self.__class__.__name__,
+ optimization_id=self._current_optimization_id,
+ dataset_id=dataset.id,
+ verbose=self.verbose
+ )
+
  reporting.display_configuration(
  prompt.formatted_messages,
  {
@@ -841,9 +869,9 @@ Return only the new prompt list object.
  verbose=self.verbose
  )

+ # Step 1. Step variables and define fitness function
  self.llm_call_counter = 0
  self._history = []
- self._current_optimization_id = None
  self._current_generation = 0
  self._best_fitness_history = []
  self._generations_without_improvement = 0
@@ -851,7 +879,6 @@ Return only the new prompt list object.
  self._current_population = []
  self._generations_without_overall_improvement = 0

- # Step 0. Define fitness function
  if self.enable_moo:
  def _deap_evaluate_individual_fitness(
  messages: List[Dict[str, str]]
@@ -884,19 +911,6 @@ Return only the new prompt list object.
  return (fitness_score,)
  self.toolbox.register("evaluate", _deap_evaluate_individual_fitness)

- # Step 1. Start Opik optimization run
- opik_optimization_run: Optional[optimization.Optimization] = None
- try:
- opik_optimization_run: optimization.Optimization = self._opik_client.create_optimization(
- dataset_name=dataset.name,
- objective_name=metric.__name__,
- metadata={"optimizer": self.__class__.__name__},
- )
- self._current_optimization_id = opik_optimization_run.id
- logger.info(f"Created Opik Optimization run with ID: {self._current_optimization_id}")
- except Exception as e:
- logger.warning(f"Opik server error: {e}. Continuing without Opik tracking.")
-
  # Step 2. Compute the initial performance of the prompt
  with reporting.baseline_performance(verbose=self.verbose) as report_baseline_performance:
  initial_eval_result: Tuple[float, float] | Tuple[float, ] = _deap_evaluate_individual_fitness(prompt.formatted_messages)
@@ -976,7 +990,7 @@ Return only the new prompt list object.
  best_prompt=best_prompt_overall,
  best_score=best_primary_score_overall,
  improvement=0.0
- ).dict()
+ ).model_dump()
  self._add_to_history(initial_round_data)

  with reporting.start_evolutionary_algo(verbose=self.verbose) as report_evolutionary_algo:
@@ -1035,7 +1049,7 @@ Return only the new prompt list object.
  best_prompt=best_prompt_overall,
  best_score=best_primary_score_overall,
  improvement=(best_primary_score_overall - initial_primary_score) / abs(initial_primary_score) if initial_primary_score and initial_primary_score != 0 else (1.0 if best_primary_score_overall > 0 else 0.0)
- ).dict()
+ ).model_dump()
  self._add_to_history(gen_round_data)

  stopped_early_flag = self._generations_without_overall_improvement >= self.DEFAULT_EARLY_STOPPING_GENERATIONS
@@ -1101,6 +1115,7 @@ Return only the new prompt list object.
  # Add final details
  final_details.update({
  "total_generations_run": generation_idx + 1,
+ "num_generations": self.num_generations,
  "population_size": self.population_size,
  "mutation_probability": self.mutation_rate,
  "crossover_probability": self.crossover_rate,
@@ -1132,7 +1147,9 @@ Return only the new prompt list object.
  return OptimizationResult(
  optimizer=self.__class__.__name__,
  prompt=final_best_prompt.formatted_messages,
- score=final_primary_score,
+ score=final_primary_score,
+ initial_prompt=prompt.formatted_messages,
+ initial_score=initial_primary_score,
  metric_name=metric.__name__,
  details=final_details,
  history=self.get_history(),
@@ -1186,6 +1203,7 @@ Return only the new prompt list object.
  response = litellm.completion(
  model=self.model, messages=messages, **final_call_params
  )
+ self.llm_call_counter += 1

  logger.debug(f"Response: {response}")
  return response.choices[0].message.content
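
The `.dict()` → `.model_dump()` changes above follow the Pydantic v2 rename of the serialization API. A minimal sketch of the difference, using an illustrative `RoundData` model rather than the package's own round class:

import pydantic

class RoundData(pydantic.BaseModel):
    best_score: float
    improvement: float = 0.0

round_data = RoundData(best_score=0.82)

# Pydantic v1 spelling; under Pydantic v2 it still works but emits a deprecation warning.
# round_data.dict()

# Pydantic v2 spelling, as used in 0.9.2.
print(round_data.model_dump())  # {'best_score': 0.82, 'improvement': 0.0}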

src/opik_optimizer/evolutionary_optimizer/reporting.py
@@ -2,8 +2,6 @@ from contextlib import contextmanager
  from io import StringIO
  from typing import List

- import rich
- from rich.console import Console
  from rich.panel import Panel
  from rich.text import Text


src/opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py
@@ -1,3 +1,4 @@
+ import copy
  import json
  import logging
  import random
@@ -39,8 +40,9 @@ Your task:
  - Add a section title in XML or markdown format. The examples will be provided as `example_1\nexample_2\n...` with each example following the example template.
  - Analyze the examples to infer a consistent structure, and create a single string few_shot_example_template using the Python .format() style. Make sure to follow the following instructions:
  - Unless absolutely relevant, do not return an object but instead a string that can be inserted as part of {FEW_SHOT_EXAMPLE_PLACEHOLDER}
- - Make sure to include the variables as part of this string so we can before string formatting with actual examples. Only variables available in the examples can be used. Do not use anything else, do not apply any transformations to the variables either.
- - The few shot examples should include the expected response as the goal is to provide examples of the expected output format.
+ - Make sure to include the variables as part of this string so we can before string formatting with actual examples. Only variables available in the examples can be used.
+ - Do not apply any transformations to the variables either, only the variable name should be included in the format `{{<variable_name>}}`
+ - The few shot examples should include the expected response as the goal is to provide examples of the response.
  - Ensure the format of the few shot examples are consistent with how the model will be called

  Return your output as a JSON object with:
@@ -193,6 +195,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):

  def _run_optimization(
  self,
+ initial_prompt: chat_prompt.ChatPrompt,
  fewshot_prompt_template: FewShotPromptTemplate,
  dataset: Dataset,
  metric: Callable,
@@ -248,13 +251,14 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  for key, value in example.items():
  processed_example[key] = str(value)

- try:
- processed_demo_examples.append(
- fewshot_prompt_template.example_template.format(**processed_example)
- )
- except Exception as e:
- logger.error(f"Failed to format fewshot prompt template {fewshot_prompt_template} with example: {processed_example} ")
- raise
+ processed_demo_example=fewshot_prompt_template.example_template
+ for key, value in processed_example.items():
+ try:
+ processed_demo_example=processed_demo_example.replace(f"{{{key}}}", str(value))
+ except Exception:
+ logger.error(f"Failed to format fewshot prompt template {fewshot_prompt_template} with example: {processed_example} ")
+ raise
+ processed_demo_examples.append(processed_demo_example)
  few_shot_examples = "\n\n".join(processed_demo_examples)

  llm_task = self._build_task_from_messages(
@@ -301,7 +305,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  trial_config = {
  "demo_examples": demo_examples,
  "message_list_with_placeholder": fewshot_prompt_template.message_list_with_placeholder,
- "message_list": messages
+ "message_list": messages_for_reporting
  }
  trial.set_user_attr("score", score)
  trial.set_user_attr("config", trial_config)
@@ -363,6 +367,12 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  best_score = best_trial.value
  best_example_indices = best_trial.user_attrs.get("example_indices", [])

+ if best_score <= baseline_score:
+ best_score = baseline_score
+ best_prompt = initial_prompt.formatted_messages
+ else:
+ best_prompt = best_trial.user_attrs["config"]["message_list"]
+
  reporting.display_result(
  initial_score=baseline_score,
  best_score=best_score,
@@ -373,9 +383,12 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  return optimization_result.OptimizationResult(
  optimizer=self.__class__.__name__,
  prompt=best_trial.user_attrs["config"]["message_list"],
+ initial_prompt=initial_prompt.formatted_messages,
+ initial_score=baseline_score,
  score=best_score,
  metric_name=metric.__name__,
  details={
+ "initial_score": baseline_score,
  "chat_messages": best_trial.user_attrs["config"]["message_list"],
  "prompt_parameter": best_trial.user_attrs["config"],
  #"n_examples": best_n_examples,
@@ -413,6 +426,16 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  Returns:
  OptimizationResult: Result of the optimization
  """
+ if not isinstance(prompt, chat_prompt.ChatPrompt):
+ raise ValueError("Prompt must be a ChatPrompt object")
+
+ if not isinstance(dataset, Dataset):
+ raise ValueError("Dataset must be a Dataset object")
+
+ if not isinstance(metric, Callable):
+ raise ValueError("Metric must be a function that takes `dataset_item` and `llm_output` as arguments.")
+
+
  optimization = None
  try:
  optimization = self._opik_client.create_optimization(
@@ -420,15 +443,22 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  objective_name=metric.__name__,
  metadata={"optimizer": self.__class__.__name__},
  )
+ optimization_run_id = optimization.id
  except Exception:
  logger.warning(
  "Opik server does not support optimizations. Please upgrade opik."
  )
  optimization = None
+ optimization_run_id = None

  try:
  # Start experiment reporting
- reporting.display_header("Few-Shot Bayesian Optimizer", verbose=self.verbose)
+ reporting.display_header(
+ algorithm=self.__class__.__name__,
+ optimization_id=optimization_run_id,
+ dataset_id=dataset.id,
+ verbose=self.verbose
+ )
  reporting.display_configuration(
  prompt.formatted_messages,
  optimizer_config={
@@ -467,6 +497,7 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):

  # Step 3. Start the optimization process
  result = self._run_optimization(
+ initial_prompt=prompt,
  fewshot_prompt_template=fewshot_template,
  dataset=dataset,
  metric=metric,
@@ -562,17 +593,15 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  self, messages: List[Dict[str, str]], few_shot_examples: Optional[str] = None
  ):
  def llm_task(dataset_item: Dict[str, Any]) -> Dict[str, Any]:
- prompt_ = [{
- "role": item["role"],
- "content": item["content"].format(**dataset_item)
- } for item in messages]
+ prompt_ = copy.deepcopy(messages)
+ for key, value in dataset_item.items():
+ for item in prompt_:
+ item["content"] = item["content"].replace("{" + key + "}", str(value))

  if few_shot_examples:
- prompt_ = [{
- "role": item["role"],
- "content": item["content"].replace(FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples)
- } for item in prompt_]
-
+ for item in prompt_:
+ item["content"] = item["content"].replace(FEW_SHOT_EXAMPLE_PLACEHOLDER, few_shot_examples)
+
  response = self._call_model(
  model=self.model,
  messages=prompt_,
@@ -584,4 +613,4 @@ class FewShotBayesianOptimizer(base_optimizer.BaseOptimizer):
  mappers.EVALUATED_LLM_TASK_OUTPUT: response.choices[0].message.content
  }

- return llm_task, messages
+ return llm_task
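
The replacement of `example_template.format(**processed_example)` with a per-variable `str.replace` loop (and the matching change inside `llm_task`) sidesteps `str.format`'s treatment of every brace pair as a placeholder. A small illustration with a hypothetical template that contains literal JSON braces:

template = 'Question: {question}\nAnswer as JSON: {"answer": "..."}'
example = {"question": "What is 2 + 2?"}

# str.format() tries to resolve {"answer": "..."} as a field and fails.
try:
    template.format(**example)
except (KeyError, ValueError) as exc:
    print(f"format() failed: {exc!r}")

# Replacing each known variable individually leaves unrelated braces untouched,
# mirroring the approach introduced in 0.9.2.
rendered = template
for key, value in example.items():
    rendered = rendered.replace(f"{{{key}}}", str(value))
print(rendered)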

src/opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py
@@ -1,4 +1,5 @@
  import json
+ import copy
  import logging
  import os
  from typing import Any, Callable, Dict, List, Optional, overload
@@ -255,7 +256,7 @@ class MetaPromptOptimizer(BaseOptimizer):
  **experiment_config,
  **{
  "optimizer": self.__class__.__name__,
- "metric": metric.__name__,
+ "metric": getattr(metric, '__name__', str(metric)),
  "dataset": dataset.name,
  "configuration": {
  "prompt": prompt.formatted_messages,
@@ -300,7 +301,7 @@ class MetaPromptOptimizer(BaseOptimizer):

  # Use dataset's get_items with limit for sampling
  logger.debug(
- f"Starting evaluation with {subset_size if subset_size else 'all'} samples for metric: {metric.__name__}"
+ f"Starting evaluation with {subset_size if subset_size else 'all'} samples for metric: {getattr(metric, '__name__', str(metric))}"
  )
  score = task_evaluator.evaluate(
  dataset=dataset,
@@ -341,8 +342,15 @@ class MetaPromptOptimizer(BaseOptimizer):
  Returns:
  OptimizationResult: Structured result containing optimization details
  """
- reporting.display_header(self.__class__.__name__, verbose=self.verbose)
+ if not isinstance(prompt, chat_prompt.ChatPrompt):
+ raise ValueError("Prompt must be a ChatPrompt object")

+ if not isinstance(dataset, Dataset):
+ raise ValueError("Dataset must be a Dataset object")
+
+ if not isinstance(metric, Callable):
+ raise ValueError("Metric must be a function that takes `dataset_item` and `llm_output` as arguments.")
+
  total_items = len(dataset.get_items())
  if n_samples is not None and n_samples > total_items:
  logger.warning(
@@ -350,21 +358,12 @@ class MetaPromptOptimizer(BaseOptimizer):
  )
  n_samples = None

- reporting.display_configuration(
- messages=prompt.formatted_messages,
- optimizer_config={
- "optimizer": self.__class__.__name__,
- "n_samples": n_samples,
- "auto_continue": auto_continue
- },
- verbose=self.verbose
- )
-
+
  optimization = None
  try:
  optimization = self._opik_client.create_optimization(
  dataset_name=dataset.name,
- objective_name=metric.__name__,
+ objective_name=getattr(metric, '__name__', str(metric)),
  metadata={"optimizer": self.__class__.__name__},
  )
  logger.debug(f"Created optimization with ID: {optimization.id}")
@@ -374,6 +373,22 @@ class MetaPromptOptimizer(BaseOptimizer):
  )
  optimization = None

+ reporting.display_header(
+ algorithm=self.__class__.__name__,
+ optimization_id=optimization.id if optimization is not None else None,
+ dataset_id=dataset.id,
+ verbose=self.verbose
+ )
+ reporting.display_configuration(
+ messages=prompt.formatted_messages,
+ optimizer_config={
+ "optimizer": self.__class__.__name__,
+ "n_samples": n_samples,
+ "auto_continue": auto_continue
+ },
+ verbose=self.verbose
+ )
+
  try:
  result = self._optimize_prompt(
  optimization_id=optimization.id if optimization is not None else None,
@@ -411,6 +426,7 @@ class MetaPromptOptimizer(BaseOptimizer):
  self.dataset = dataset
  self.prompt = prompt
  self.llm_call_counter = 0 # Reset counter for run
+ initial_prompt: List[Dict[str, str]] = prompt.formatted_messages

  current_prompt = prompt.formatted_messages
  experiment_config = experiment_config or {}
@@ -418,7 +434,7 @@ class MetaPromptOptimizer(BaseOptimizer):
  **experiment_config,
  **{
  "optimizer": self.__class__.__name__,
- "metric": metric.__name__,
+ "metric": getattr(metric, '__name__', str(metric)),
  "dataset": self.dataset.name,
  "configuration": {
  "prompt": current_prompt,
@@ -527,11 +543,11 @@ class MetaPromptOptimizer(BaseOptimizer):

  return self._create_result(
  metric,
- prompt,
- best_prompt,
- best_score,
- initial_score,
- rounds,
+ initial_prompt=initial_prompt,
+ best_prompt=best_prompt,
+ best_score=best_score,
+ initial_score=initial_score,
+ rounds=rounds,
  )

  def _calculate_improvement(
@@ -581,21 +597,19 @@ class MetaPromptOptimizer(BaseOptimizer):
  def _create_result(
  self,
  metric: Callable,
- prompt: chat_prompt.ChatPrompt,
- best_prompt: str,
+ initial_prompt: List[Dict[str, str]],
+ best_prompt: List[Dict[str, str]],
  best_score: float,
  initial_score: float,
  rounds: List[OptimizationRound],
  ) -> OptimizationResult:
  """Create the final OptimizationResult object."""
  details = {
- "initial_prompt": prompt,
- "initial_score": initial_score,
  "final_prompt": best_prompt,
  "final_score": best_score,
  "rounds": rounds,
  "total_rounds": len(rounds),
- "metric_name": metric.__name__,
+ "metric_name": getattr(metric, '__name__', str(metric)),
  "model": self.model,
  "temperature": self.model_kwargs.get("temperature"),
  }
@@ -604,7 +618,9 @@ class MetaPromptOptimizer(BaseOptimizer):
  optimizer=self.__class__.__name__,
  prompt=best_prompt,
  score=best_score,
- metric_name=metric.__name__,
+ initial_prompt=initial_prompt,
+ initial_score=initial_score,
+ metric_name=getattr(metric, '__name__', str(metric)),
  details=details,
  llm_calls=self.llm_call_counter
  )
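
Several hunks above swap `metric.__name__` for `getattr(metric, '__name__', str(metric))`. The point is that not every callable carries a `__name__` attribute; `functools.partial` objects are a common example. A short sketch with a hypothetical metric:

import functools

def exact_match(dataset_item, llm_output, case_sensitive=True):
    expected = dataset_item["answer"]
    return float(llm_output == expected if case_sensitive else llm_output.lower() == expected.lower())

metric = functools.partial(exact_match, case_sensitive=False)

# metric.__name__ would raise AttributeError for the partial object;
# the getattr fallback reports its repr instead.
print(getattr(metric, "__name__", str(metric)))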

src/opik_optimizer/meta_prompt_optimizer/reporting.py
@@ -1,6 +1,5 @@
  from contextlib import contextmanager

- import rich
  from rich.text import Text

  from ..reporting_utils import (

src/opik_optimizer/mipro_optimizer/_lm.py
@@ -82,6 +82,7 @@ class LM(BaseLM):
  self.finetuning_model = finetuning_model
  self.launch_kwargs = launch_kwargs or {}
  self.train_kwargs = train_kwargs or {}
+ self.llm_call_counter = 0

  # Handle model-specific configuration for different model families
  model_family = model.split("/")[-1].lower() if "/" in model else model.lower()
@@ -129,6 +130,7 @@ class LM(BaseLM):
  if not getattr(results, "cache_hit", False) and dspy.settings.usage_tracker and hasattr(results, "usage"):
  settings.usage_tracker.add_usage(self.model, dict(results.usage))

+ self.llm_call_counter += 1
  return results

  def launch(self, launch_kwargs: Optional[Dict[str, Any]] = None):
@@ -323,6 +325,7 @@ def litellm_completion(request: Dict[str, Any], num_retries: int, cache={"no-cac
  **retry_kwargs,
  **request,
  )
+
  chunks = []
  async for chunk in response:
  if caller_predict_id:

src/opik_optimizer/mipro_optimizer/mipro_optimizer.py
@@ -42,11 +42,10 @@ class MiproOptimizer(BaseOptimizer):
  self.tools = []
  self.num_threads = self.model_kwargs.pop("num_threads", 6)
  self.model_kwargs["model"] = self.model
- self.llm_call_counter = 0
  # FIXME: add mipro_optimizer=True - It does not count the LLM calls made internally by DSPy during MiproOptimizer.optimizer.compile().
- lm = LM(**self.model_kwargs)
+ self.lm = LM(**self.model_kwargs)
  opik_callback = OpikCallback(project_name=self.project_name, log_graph=True)
- dspy.configure(lm=lm, callbacks=[opik_callback])
+ dspy.configure(lm=self.lm, callbacks=[opik_callback])
  logger.debug(f"Initialized MiproOptimizer with model: {model}")

  def evaluate_prompt(
@@ -54,7 +53,7 @@ class MiproOptimizer(BaseOptimizer):
  dataset: Union[str, Dataset],
  metric: Callable,
  task_config: TaskConfig,
- prompt: Union[str, dspy.Module, OptimizationResult] = None,
+ prompt: Optional[Union[str, dspy.Module, OptimizationResult]] = None,
  n_samples: int = 10,
  dataset_item_ids: Optional[List[str]] = None,
  experiment_config: Optional[Dict] = None,
@@ -463,7 +462,7 @@ class MiproOptimizer(BaseOptimizer):
  metric_name=self.opik_metric.__name__ if hasattr(self, 'opik_metric') else "unknown_metric",
  details={"error": "No candidate programs generated by MIPRO"},
  history=mipro_history_processed,
- llm_calls=self.llm_call_counter
+ llm_calls=self.lm.llm_call_counter
  )

  self.module = self.get_best().details["program"]
@@ -488,7 +487,7 @@ class MiproOptimizer(BaseOptimizer):
  demonstrations=best_program_details.demonstrations,
  details=best_program_details.details,
  history=mipro_history_processed,
- llm_calls=self.llm_call_counter
+ llm_calls=self.lm.llm_call_counter
  )

  def get_best(self, position: int = 0) -> OptimizationResult:
@@ -501,7 +500,7 @@ class MiproOptimizer(BaseOptimizer):
  metric_name=getattr(self, 'opik_metric', None).name if hasattr(self, 'opik_metric') and self.opik_metric else "unknown_metric",
  details={"error": "No programs generated or compile failed"},
  history=[],
- llm_calls=self.llm_call_counter
+ llm_calls=self.lm.llm_call_counter
  )

  score = self.best_programs[position]["score"]
@@ -528,5 +527,5 @@ class MiproOptimizer(BaseOptimizer):
  metric_name=self.opik_metric.__name__,
  demonstrations=demos,
  details={"program": program_module},
- llm_calls=self.llm_call_counter
+ llm_calls=self.lm.llm_call_counter
  )

src/opik_optimizer/optimization_config/chat_prompt.py
@@ -94,8 +94,8 @@ class ChatPrompt:
  }

  @classmethod
- def model_validate(cls, obj: Any, *, strict: bool | None = None, from_attributes: bool | None = None,
- context: Any | None = None, by_alias: bool | None = None, by_name: bool | None = None) -> 'ChatPrompt':
+ def model_validate(cls, obj: Any, *, strict: Optional[bool] = None, from_attributes: Optional[bool] = None,
+ context: Optional[Any] = None, by_alias: Optional[bool] = None, by_name: Optional[bool] = None) -> 'ChatPrompt':
  """Custom validation method to handle nested objects during deserialization."""
  return ChatPrompt(
  system=obj.get('system', None),

src/opik_optimizer/optimization_result.py
@@ -17,6 +17,10 @@ class OptimizationResult(pydantic.BaseModel):
  score: float
  metric_name: str

+ # Initial score
+ initial_prompt: Optional[List[Dict[Literal["role", "content"], str]]] = None
+ initial_score: Optional[float] = None
+
  details: Dict[str, Any] = pydantic.Field(default_factory=dict)
  history: List[Dict[str, Any]] = []
  llm_calls: Optional[int] = None
@@ -33,7 +37,7 @@ class OptimizationResult(pydantic.BaseModel):

  def _calculate_improvement_str(self) -> str:
  """Helper to calculate improvement percentage string."""
- initial_s = self.details.get("initial_score")
+ initial_s = self.initial_score
  final_s = self.score

  # Check if initial score exists and is a number
@@ -60,7 +64,7 @@ class OptimizationResult(pydantic.BaseModel):
  """Provides a clean, well-formatted plain-text summary."""
  separator = "=" * 80
  rounds_ran = len(self.details.get("rounds", []))
- initial_score = self.details.get("initial_score")
+ initial_score = self.initial_score
  initial_score_str = (
  f"{initial_score:.4f}" if isinstance(initial_score, (int, float)) else "N/A"
  )
@@ -74,7 +78,6 @@ class OptimizationResult(pydantic.BaseModel):
  .replace("[dim]", "")
  .replace("[/dim]", "")
  )
- stopped_early = self.details.get("stopped_early", "N/A")

  model_name = self.details.get("model", "N/A")
  temp = self.details.get("temperature")
@@ -101,7 +104,6 @@ class OptimizationResult(pydantic.BaseModel):
  f"Final Best Score: {final_score_str}",
  f"Total Improvement:{improvement_str.rjust(max(0, 18 - len('Total Improvement:')))}",
  f"Rounds Completed: {rounds_ran}",
- f"Stopped Early: {stopped_early}",
  "\nFINAL OPTIMIZED PROMPT / STRUCTURE:",
  "--------------------------------------------------------------------------------",
  f"{final_prompt_display}",
@@ -114,7 +116,7 @@ class OptimizationResult(pydantic.BaseModel):
  """Provides a rich, formatted output for terminals supporting Rich."""
  improvement_str = self._calculate_improvement_str()
  rounds_ran = len(self.details.get("rounds", []))
- initial_score = self.details.get("initial_score")
+ initial_score = self.initial_score
  initial_score_str = (
  f"{initial_score:.4f}"
  if isinstance(initial_score, (int, float))
@@ -124,8 +126,6 @@ class OptimizationResult(pydantic.BaseModel):
  stopped_early = self.details.get("stopped_early", "N/A")

  model_name = self.details.get("model", "[dim]N/A[/dim]")
- temp = self.details.get("temperature")
- temp_str = f"{temp:.1f}" if isinstance(temp, (int, float)) else "[dim]N/A[/dim]"

  table = rich.table.Table.grid(padding=(0, 1))
  table.add_column(style="dim")
@@ -135,13 +135,12 @@ class OptimizationResult(pydantic.BaseModel):
  "Optimizer:",
  f"[bold]{self.optimizer}[/bold]",
  )
- table.add_row("Model Used:", f"{model_name} ([dim]Temp:[/dim] {temp_str})")
+ table.add_row("Model Used:", f"{model_name}")
  table.add_row("Metric Evaluated:", f"[bold]{self.metric_name}[/bold]")
  table.add_row("Initial Score:", initial_score_str)
  table.add_row("Final Best Score:", f"[bold cyan]{final_score_str}[/bold cyan]")
  table.add_row("Total Improvement:", improvement_str)
  table.add_row("Rounds Completed:", str(rounds_ran))
- table.add_row("Stopped Early:", str(stopped_early))

  # Display Chat Structure if available
  panel_title = "[bold]Final Optimized Prompt[/bold]"
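
The changes above promote `initial_prompt` and `initial_score` from entries buried in the free-form `details` dict to typed, optional fields on `OptimizationResult`, which is what the improvement calculation now reads. A minimal sketch of that pattern, using an illustrative stand-in model rather than the real class:

from typing import Any, Dict, Optional
import pydantic

class ResultSketch(pydantic.BaseModel):
    score: float
    initial_score: Optional[float] = None
    details: Dict[str, Any] = pydantic.Field(default_factory=dict)

    def improvement_str(self) -> str:
        if self.initial_score is None or self.initial_score == 0:
            return "N/A"
        return f"{(self.score - self.initial_score) / abs(self.initial_score):.2%}"

print(ResultSketch(score=0.9, initial_score=0.75).improvement_str())  # 20.00%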

src/opik_optimizer/reporting_utils.py
@@ -2,13 +2,14 @@ import logging
  from contextlib import contextmanager
  from typing import Dict, List, Optional

- import rich
  from rich import box
  from rich.console import Console, Group
  from rich.panel import Panel
  from rich.progress import track
  from rich.text import Text

+ from .utils import get_optimization_run_url_by_id
+
  PANEL_WIDTH = 70

  def get_console(*args, **kwargs):
@@ -21,10 +22,8 @@ def convert_tqdm_to_rich(description: Optional[str] = None, verbose: int = 1):
  """Context manager to convert tqdm to rich."""
  import opik.evaluation.engine.evaluation_tasks_executor

- optimizer_logger = logging.getLogger('opik_optimizer')
-
  def _tqdm_to_track(iterable, desc, disable, total):
- disable = verbose == 0 or optimizer_logger.level > logging.INFO
+ disable = verbose == 0
  return track(
  iterable,
  description=description or desc,
@@ -91,16 +90,36 @@ def display_messages(messages: List[Dict[str, str]], prefix: str = ""):
  for line in rendered_panel.splitlines():
  console.print(Text(prefix) + Text.from_ansi(line))

- def display_header(algorithm: str, verbose: int = 1):
+ def display_header(
+ algorithm: str,
+ optimization_id: Optional[str]=None,
+ dataset_id: Optional[str]=None,
+ verbose: int = 1
+ ):
  if verbose < 1:
  return

+ if optimization_id is not None and dataset_id is not None:
+ optimization_url = get_optimization_run_url_by_id(
+ optimization_id=optimization_id,
+ dataset_id=dataset_id
+ )
+
+ # Create a visually appealing panel with an icon and ensure link doesn't wrap
+
+ link_text = Text("-> View optimization details in your Opik dashboard")
+ link_text.stylize(f"link {optimization_url}", 28, len(link_text))
+ else:
+ link_text = Text("No optimization run link available", style="dim")
+
  content = Text.assemble(
  ("● ", "green"),
  "Running Opik Evaluation - ",
- (algorithm, "blue")
- )
+ (algorithm, "blue"),
+ "\n\n"
+ ).append(link_text)

+
  panel = Panel(
  content,
  box=box.ROUNDED,
@@ -126,7 +145,7 @@ def display_result(initial_score, best_score, best_prompt, verbose: int = 1):
  perc_change = (best_score - initial_score) / initial_score
  content = [Text(f"Prompt was optimized and improved from {initial_score:.4f} to {best_score:.4f} ({perc_change:.2%})", style="bold green")]
  else:
- content = [Text("Optimization trial did not find a better prompt than the initial one.", style="bold red")]
+ content = [Text(f"Optimization run did not find a better prompt than the initial one.\nScore: {best_score:.4f}", style="dim bold red")]

  content.append(Text("\nOptimized prompt:"))
  for i, msg in enumerate(best_prompt):

src/opik_optimizer/task_evaluator.py
@@ -11,16 +11,23 @@ def _create_metric_class(metric: Callable):
  class MetricClass(base_metric.BaseMetric):
  def __init__(self):
  self.name = metric.__name__
-
+
  def score(self, llm_output, **kwargs) -> score_result.ScoreResult:
  try:
  metric_val = metric(dataset_item=kwargs, llm_output=llm_output)
  if isinstance(metric_val , score_result.ScoreResult):
- return metric_val
+ return score_result.ScoreResult(
+ name = self.name,
+ value = metric_val.value,
+ scoring_failed=metric_val.scoring_failed,
+ metadata=metric_val.metadata,
+ reason=metric_val.reason
+ )
  else:
  return score_result.ScoreResult(
  name = self.name,
- value = metric_val
+ value = metric_val,
+ scoring_failed=False
  )
  except Exception:
  return score_result.ScoreResult(
@@ -71,7 +78,7 @@ def evaluate(
  items = [item for item in items if item.get("id") in dataset_item_ids]

  eval_metrics = [_create_metric_class(metric)]
-
+
  if optimization_id is not None:
  result = opik_evaluator.evaluate_optimization_trial(
  optimization_id=optimization_id,
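
The `_create_metric_class` change above re-wraps whatever the user metric returns so the reported name is always the metric's own `__name__` and `scoring_failed` is set explicitly. A rough sketch of that normalization, using a plain dataclass as a stand-in for opik's `ScoreResult`:

from dataclasses import dataclass
from typing import Union

@dataclass
class ScoreSketch:
    name: str
    value: float
    scoring_failed: bool = False

def normalize(metric_name: str, metric_val: Union[float, ScoreSketch]) -> ScoreSketch:
    if isinstance(metric_val, ScoreSketch):
        # Keep the value but report it under the metric's own name.
        return ScoreSketch(name=metric_name, value=metric_val.value,
                           scoring_failed=metric_val.scoring_failed)
    return ScoreSketch(name=metric_name, value=float(metric_val), scoring_failed=False)

print(normalize("exact_match", 1.0))
print(normalize("exact_match", ScoreSketch(name="custom", value=0.5)))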

src/opik_optimizer/utils.py
@@ -1,23 +1,17 @@
  """Utility functions and constants for the optimizer package."""

- from typing import Dict, Any, Optional, TYPE_CHECKING, Type, Literal, Final
- from types import TracebackType
-
- import opik
- from opik.api_objects.opik_client import Opik
- from opik.api_objects.optimization import Optimization
-
+ import base64
  import json
  import logging
  import random
  import string
- import base64
  import urllib.parse
- from rich import console
+ from types import TracebackType
+ from typing import Any, Dict, Final, Literal, Optional, Type

- # Type hint for OptimizationResult without circular import
- if TYPE_CHECKING:
- from .optimization_result import OptimizationResult
+ import opik
+ from opik.api_objects.opik_client import Opik
+ from opik.api_objects.optimization import Optimization

  ALLOWED_URL_CHARACTERS: Final[str] = ":/&?="
  logger = logging.getLogger(__name__)
@@ -63,6 +57,7 @@ class OptimizationContextManager:
  name=self.name,
  metadata=self.metadata,
  )
+
  if self.optimization:
  return self.optimization
  else:
@@ -238,8 +233,10 @@ def ensure_ending_slash(url: str) -> str:


  def get_optimization_run_url_by_id(
- dataset_id: str, optimization_id: str, url_override: str
+ dataset_id: str, optimization_id: str
  ) -> str:
+ opik_config = opik.config.get_from_user_inputs()
+ url_override = opik_config.url_override
  encoded_opik_url = base64.b64encode(url_override.encode("utf-8")).decode("utf-8")

  run_path = urllib.parse.quote(
@@ -247,18 +244,3 @@
  safe=ALLOWED_URL_CHARACTERS,
  )
  return urllib.parse.urljoin(ensure_ending_slash(url_override), run_path)
-
-
- def display_optimization_run_link(
- optimization_id: str, dataset_id: str, url_override: str
- ) -> None:
- console_container = console.Console()
-
- optimization_url = get_optimization_run_url_by_id(
- optimization_id=optimization_id,
- dataset_id=dataset_id,
- url_override=url_override,
- )
- console_container.print(
- f"View the optimization run [link={optimization_url}]in your Opik dashboard[/link]."
- )
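
With 0.9.2, `get_optimization_run_url_by_id` no longer takes `url_override` as a parameter; it reads it from the Opik configuration, and the link rendering that used to live in `display_optimization_run_link` now happens in `reporting_utils.display_header`. The diff elides the exact run path, so the sketch below only illustrates the encoding pattern with a placeholder path:

import base64
import urllib.parse

ALLOWED_URL_CHARACTERS = ":/&?="

def ensure_ending_slash(url: str) -> str:
    return url if url.endswith("/") else url + "/"

def build_run_url(url_override: str, path_template: str) -> str:
    # The dashboard link embeds the base64-encoded API URL as a query parameter.
    encoded_opik_url = base64.b64encode(url_override.encode("utf-8")).decode("utf-8")
    run_path = urllib.parse.quote(path_template.format(opik=encoded_opik_url),
                                  safe=ALLOWED_URL_CHARACTERS)
    return urllib.parse.urljoin(ensure_ending_slash(url_override), run_path)

# Placeholder path; the real template lives in opik_optimizer.utils.
print(build_run_url("https://www.comet.com/opik/api", "optimizations/compare?opik={opik}"))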

src/opik_optimizer.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: opik_optimizer
- Version: 0.9.0rc0
+ Version: 0.9.2
  Summary: Agent optimization with Opik
  Home-page: https://github.com/comet-ml/opik
  Author: Comet ML

src/opik_optimizer/__init__.py
@@ -12,8 +12,8 @@ from .logging_config import setup_logging
  from .meta_prompt_optimizer import MetaPromptOptimizer
  from .mipro_optimizer import MiproOptimizer
  from .optimization_config.chat_prompt import ChatPrompt
- from .optimization_result import OptimizationResult
  from .optimization_config.configs import TaskConfig
+ from .optimization_result import OptimizationResult

  __version__ = importlib.metadata.version("opik_optimizer")