opik-optimizer 0.7.8__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. opik_optimizer/__init__.py +2 -0
  2. opik_optimizer/base_optimizer.py +6 -4
  3. opik_optimizer/data/hotpot-500.json +501 -1001
  4. opik_optimizer/datasets/__init__.py +27 -0
  5. opik_optimizer/datasets/ai2_arc.py +44 -0
  6. opik_optimizer/datasets/cnn_dailymail.py +40 -0
  7. opik_optimizer/datasets/election_questions.py +36 -0
  8. opik_optimizer/datasets/gsm8k.py +40 -0
  9. opik_optimizer/datasets/halu_eval.py +43 -0
  10. opik_optimizer/datasets/hotpot_qa.py +68 -0
  11. opik_optimizer/datasets/medhallu.py +39 -0
  12. opik_optimizer/datasets/rag_hallucinations.py +41 -0
  13. opik_optimizer/datasets/ragbench.py +40 -0
  14. opik_optimizer/datasets/tiny_test.py +57 -0
  15. opik_optimizer/datasets/truthful_qa.py +107 -0
  16. opik_optimizer/demo/datasets.py +53 -607
  17. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +3 -1
  18. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +90 -19
  19. opik_optimizer/logging_config.py +1 -1
  20. opik_optimizer/meta_prompt_optimizer.py +60 -14
  21. opik_optimizer/mipro_optimizer/mipro_optimizer.py +151 -13
  22. opik_optimizer/optimization_result.py +11 -0
  23. opik_optimizer/task_evaluator.py +6 -1
  24. opik_optimizer/utils.py +0 -52
  25. opik_optimizer-0.8.1.dist-info/METADATA +196 -0
  26. opik_optimizer-0.8.1.dist-info/RECORD +45 -0
  27. opik_optimizer-0.7.8.dist-info/METADATA +0 -174
  28. opik_optimizer-0.7.8.dist-info/RECORD +0 -33
  29. {opik_optimizer-0.7.8.dist-info → opik_optimizer-0.8.1.dist-info}/WHEEL +0 -0
  30. {opik_optimizer-0.7.8.dist-info → opik_optimizer-0.8.1.dist-info}/licenses/LICENSE +0 -0
  31. {opik_optimizer-0.7.8.dist-info → opik_optimizer-0.8.1.dist-info}/top_level.txt +0 -0
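The most visible addition in this release is the new `opik_optimizer/datasets/` subpackage, which takes over most of `opik_optimizer/demo/datasets.py` (note the +53/-607 counts above). A minimal consumption sketch follows; the loader name is an assumption inferred from the module names in the list above, not confirmed by this diff:

```python
# Hypothetical usage of the new datasets subpackage. `tiny_test` is an
# assumed loader name inferred from datasets/tiny_test.py in the list above.
from opik_optimizer import datasets

dataset = datasets.tiny_test()  # assumed to return an opik.Dataset
print(len(dataset.get_items()))
```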
opik_optimizer/mipro_optimizer/mipro_optimizer.py CHANGED
@@ -1,6 +1,7 @@
 from typing import Any, Dict, List, Tuple, Union, Optional, Literal
 import os
 import random
+from datetime import datetime
 
 import opik
 
@@ -37,11 +38,13 @@ logger = logging.getLogger(__name__)  # Inherits config from setup_logging
 
 
 class MiproOptimizer(BaseOptimizer):
-    def __init__(self, model, project_name: Optional[str] = None, **model_kwargs):
-        super().__init__(model, project_name, **model_kwargs)
+    def __init__(self, model, project_name: Optional[str] = None, verbose: int = 1, **model_kwargs):
+        super().__init__(model, project_name, verbose=verbose, **model_kwargs)
         self.tools = []
         self.num_threads = self.model_kwargs.pop("num_threads", 6)
         self.model_kwargs["model"] = self.model
+        self.llm_call_counter = 0
+        # FIXME: add mipro_optimizer=True - It does not count the LLM calls made internally by DSPy during MiproOptimizer.optimizer.compile().
         lm = LM(**self.model_kwargs)
         opik_callback = OpikCallback(project_name=self.project_name, log_graph=True)
         dspy.configure(lm=lm, callbacks=[opik_callback])
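Two behavioral notes on this hunk: `verbose` is now forwarded to `BaseOptimizer`, and the new `llm_call_counter` starts tracking calls (with the caveat in the FIXME that DSPy-internal calls during `compile()` are not counted). A minimal construction sketch, assuming the top-level export mirrors `FewShotBayesianOptimizer` in the README below; model and project names are illustrative:

```python
from opik_optimizer import MiproOptimizer

optimizer = MiproOptimizer(
    model="openai/gpt-4o-mini",  # illustrative LiteLLM model name
    project_name="my-project",   # illustrative project name
    verbose=1,                   # new in 0.8.1, forwarded to BaseOptimizer
)
print(optimizer.llm_call_counter)  # 0 until evaluation/optimization runs
```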
@@ -56,6 +59,7 @@ class MiproOptimizer(BaseOptimizer):
         n_samples: int = 10,
         dataset_item_ids: Optional[List[str]] = None,
         experiment_config: Optional[Dict] = None,
+        verbose: int = 1,
         **kwargs,
     ) -> float:
         """
@@ -69,6 +73,7 @@ class MiproOptimizer(BaseOptimizer):
             n_samples: number of items to test in the dataset
             dataset_item_ids: Optional list of dataset item IDs to evaluate
             experiment_config: Optional configuration for the experiment
+            verbose: Verbosity level
             **kwargs: Additional arguments for evaluation
 
         Returns:
@@ -76,10 +81,14 @@
         """
         # FIMXE: call super when it is ready
         # FIXME: Intermediate values:
+        self.llm_call_counter += 1
         metric = metric_config.metric
         input_key = task_config.input_dataset_fields[0]  # FIXME: allow all inputs
         output_key = task_config.output_dataset_field
 
+        # Kwargs might contain n_samples, passed from run_benchmark.py
+        n_samples = kwargs.pop("n_samples", None)  # Get n_samples from kwargs if present
+
         if isinstance(dataset, str):
             opik_client = opik.Opik(project_name=self.project_name)
             dataset = opik_client.get_dataset(dataset)
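Worth noting from this hunk: `evaluate_prompt` resolves a dataset passed by name through the Opik client, so callers can hand it either an `opik.Dataset` or a string. The resolution idiom, isolated (the project and dataset names are illustrative):

```python
import opik

# What the hunk above does when `dataset` is a string rather than a Dataset:
client = opik.Opik(project_name="my-project")  # illustrative project name
dataset = client.get_dataset("hotpot-300")     # sample dataset named in the README below
```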
@@ -144,12 +153,32 @@
 
             return result
 
-        if n_samples is not None:
-            if dataset_item_ids is not None:
-                raise Exception("Can't use n_samples and dataset_item_ids")
+        # Robust n_samples handling for selecting dataset_item_ids
+        dataset_items_for_eval = dataset.get_items()
+        num_total_items = len(dataset_items_for_eval)
+        dataset_item_ids_to_use = dataset_item_ids  # Use provided IDs if any
 
-            all_ids = [dataset_item["id"] for dataset_item in dataset.get_items()]
-            dataset_item_ids = random.sample(all_ids, n_samples)
+        if n_samples is not None:  # If n_samples is specified by the caller (run_benchmark.py)
+            if dataset_item_ids is not None:
+                # This case should ideally be an error or a clear precedence rule.
+                # For now, let's assume if dataset_item_ids is provided, it takes precedence over n_samples.
+                logger.warning("MiproOptimizer.evaluate_prompt: Both n_samples and dataset_item_ids provided. Using provided dataset_item_ids.")
+                # dataset_item_ids_to_use is already dataset_item_ids
+            elif n_samples > num_total_items:
+                logger.warning(f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) > total items ({num_total_items}). Using all {num_total_items} items.")
+                dataset_item_ids_to_use = None  # opik.evaluation.evaluate handles None as all items
+            elif n_samples <= 0:
+                logger.warning(f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) is <= 0. Using all {num_total_items} items.")
+                dataset_item_ids_to_use = None
+            else:
+                # n_samples is valid and dataset_item_ids was not provided, so sample now.
+                all_ids = [item["id"] for item in dataset_items_for_eval]
+                dataset_item_ids_to_use = random.sample(all_ids, n_samples)
+                logger.info(f"MiproOptimizer.evaluate_prompt: Sampled {n_samples} items for evaluation.")
+        else:  # n_samples is None
+            if dataset_item_ids is None:
+                logger.info(f"MiproOptimizer.evaluate_prompt: n_samples is None and dataset_item_ids is None. Using all {num_total_items} items.")
+            # dataset_item_ids_to_use is already dataset_item_ids (which could be None)
 
         experiment_config = experiment_config or {}
         experiment_config = {
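The new branching reduces to one precedence rule: explicit `dataset_item_ids` always win; otherwise a valid `n_samples` triggers random sampling; absent or out-of-range values fall back to the full dataset. A condensed, self-contained restatement of that rule (a sketch, not the shipped code):

```python
import random
from typing import List, Optional

def select_item_ids(
    all_ids: List[str],
    n_samples: Optional[int],
    dataset_item_ids: Optional[List[str]],
) -> Optional[List[str]]:
    """None means 'evaluate every item', matching how the hunk above
    passes None through to opik.evaluation.evaluate."""
    if dataset_item_ids is not None:  # explicit IDs take precedence over n_samples
        return dataset_item_ids
    if n_samples is None or n_samples <= 0 or n_samples > len(all_ids):
        return None  # invalid or absent n_samples: use the full dataset
    return random.sample(all_ids, n_samples)
```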
@@ -171,9 +200,10 @@ class MiproOptimizer(BaseOptimizer):
             # "reference" needs to match metric
             scoring_key_mapping={"reference": output_key},
             task_threads=self.num_threads,
-            dataset_item_ids=dataset_item_ids,
+            dataset_item_ids=dataset_item_ids_to_use,
             project_name=self.project_name,
             experiment_config=experiment_config,
+            verbose=verbose,
         )
 
         # Calculate average score across all metrics
@@ -207,6 +237,7 @@ class MiproOptimizer(BaseOptimizer):
             optimization = self._opik_client.create_optimization(
                 dataset_name=dataset.name,
                 objective_name=metric_config.metric.name,
+                metadata={"optimizer": self.__class__.__name__},
             )
         except Exception:
             logger.warning(
@@ -284,13 +315,14 @@ class MiproOptimizer(BaseOptimizer):
         **kwargs,
     ) -> None:
         # FIXME: Intermediate values:
+        self.llm_call_counter = 0
         metric = metric_config.metric
         prompt = task_config.instruction_prompt
         input_key = task_config.input_dataset_fields[0]  # FIXME: allow all
         output_key = task_config.output_dataset_field
         self.tools = task_config.tools
         self.num_candidates = num_candidates
-        self.seed = 9
+        self.seed = 42
         self.input_key = input_key
         self.output_key = output_key
         self.prompt = prompt
@@ -347,7 +379,7 @@ class MiproOptimizer(BaseOptimizer):
             metric=self.metric_function,
             auto=self.auto,
             num_threads=self.num_threads,
-            verbose=False,
+            verbose=(self.verbose == 1),
             num_candidates=self.num_candidates,
             seed=self.seed,
             opik_prompt_task_config=task_config,
@@ -373,6 +405,9 @@ class MiproOptimizer(BaseOptimizer):
         """
         Continue to look for optimizations
         """
+        if not hasattr(self, 'optimizer') or not self.optimizer:
+            raise RuntimeError("MiproOptimizer not prepared. Call prepare_optimize_prompt first.")
+
         self.results = self.optimizer.compile(
             student=self.module,
             trainset=self.trainset,
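This guard turns a confusing `AttributeError` into an explicit failure when the compile step runs before preparation. A hedged sketch of the failure path; the continue-step method name is inferred from the error message and does not appear in this hunk:

```python
# Assumed flow: `continue_optimize_prompt` is the method guarded above
# (name inferred; only its docstring and body appear in this hunk).
optimizer = MiproOptimizer(model="openai/gpt-4o-mini")  # illustrative model
try:
    optimizer.continue_optimize_prompt()
except RuntimeError as err:
    print(err)  # MiproOptimizer not prepared. Call prepare_optimize_prompt first.
```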
@@ -385,12 +420,114 @@
             key=lambda item: item["score"],
             reverse=True,
         )
+
+        mipro_history_processed = []
+        # self.num_candidates is set in prepare_optimize_prompt, defaults to 10
+        # If self.num_candidates is 0 or None, this logic might break or be odd.
+        # Add a safeguard for num_candidates_per_round if self.num_candidates is not usable.
+        num_candidates_per_round = self.num_candidates if hasattr(self, 'num_candidates') and self.num_candidates and self.num_candidates > 0 else 1
+
+        for i, candidate_data in enumerate(self.results.candidate_programs):
+            program_module = candidate_data.get("program")
+            instruction = "N/A"
+            if hasattr(program_module, 'signature') and hasattr(program_module.signature, 'instructions'):
+                instruction = program_module.signature.instructions
+            elif hasattr(program_module, 'extended_signature') and hasattr(program_module.extended_signature, 'instructions'):
+                instruction = program_module.extended_signature.instructions
+            elif hasattr(program_module, 'predictor') and hasattr(program_module.predictor, 'signature') and hasattr(program_module.predictor.signature, 'instructions'):
+                instruction = program_module.predictor.signature.instructions
+
+            # Remove R and C calculation for Mipro as its history is flat
+            # current_round_number = (i // num_candidates_per_round) + 1
+            # current_candidate_in_round = (i % num_candidates_per_round) + 1
+
+            iter_detail = {
+                "iteration": i + 1,
+                # "round_number": current_round_number,  # Remove round_number
+                # "candidate_in_round": current_candidate_in_round,  # Remove candidate_in_round
+                "timestamp": datetime.now().isoformat(),
+                "prompt_candidate": instruction,
+                "parameters_used": {
+                    "program_summary": str(program_module)[:500]
+                },
+                "scores": [],  # Initialize scores list
+                "tokens_used": None,  # TODO: add tokens_used
+                "cost": None,  # TODO: add cost
+                "duration_seconds": None,  # TODO: add duration_seconds
+            }
+
+            current_score = candidate_data.get("score")
+            metric_name_for_history = self.opik_metric.name if hasattr(self, 'opik_metric') and self.opik_metric else "unknown_metric"
+
+            # Unscale if it's a known 0-1 metric that MIPRO might scale to 0-100
+            # For now, specifically targeting Levenshtein-like metrics
+            if isinstance(current_score, (float, int)) and \
+               ("levenshtein" in metric_name_for_history.lower() or "similarity" in metric_name_for_history.lower()):
+                # Assuming scores like 32.4 are 0-1 scores scaled by 100
+                if abs(current_score) > 1.0:  # A simple check to see if it looks scaled
+                    logger.debug(f"Mipro history: Unscaling score {current_score} for metric {metric_name_for_history} by dividing by 100.")
+                    current_score /= 100.0
+
+            iter_detail["scores"].append({
+                "metric_name": metric_name_for_history,
+                "score": current_score,
+                "opik_evaluation_id": None  # TODO: add opik_evaluation_id
+            })
+            mipro_history_processed.append(iter_detail)
+
+        if not self.best_programs:
+            logger.warning("MIPRO compile returned no candidate programs.")
+            return OptimizationResult(
+                optimizer="MiproOptimizer",
+                prompt=self.prompt,
+                score=0.0,
+                metric_name=self.opik_metric.name if hasattr(self, 'opik_metric') else "unknown_metric",
+                details={"error": "No candidate programs generated by MIPRO"},
+                history=mipro_history_processed,
+                llm_calls=self.llm_call_counter
+            )
+
         self.module = self.get_best().details["program"]
-        return self.get_best()
+        best_program_details = self.get_best()
+
+        # Unscale the main score if necessary, similar to history scores
+        final_best_score = best_program_details.score
+        final_metric_name = best_program_details.metric_name
+        if isinstance(final_best_score, (float, int)) and \
+           final_metric_name and \
+           ("levenshtein" in final_metric_name.lower() or "similarity" in final_metric_name.lower()):
+            if abs(final_best_score) > 1.0:  # A simple check to see if it looks scaled
+                logger.debug(f"Mipro main result: Unscaling score {final_best_score} for metric {final_metric_name} by dividing by 100.")
+                final_best_score /= 100.0
+
+        return OptimizationResult(
+            optimizer="MiproOptimizer",
+            prompt=best_program_details.prompt,
+            tool_prompts=best_program_details.tool_prompts,
+            score=final_best_score,  # Use the potentially unscaled score
+            metric_name=final_metric_name,
+            demonstrations=best_program_details.demonstrations,
+            details=best_program_details.details,
+            history=mipro_history_processed,
+            llm_calls=self.llm_call_counter
+        )
 
     def get_best(self, position: int = 0) -> OptimizationResult:
+        if not hasattr(self, 'best_programs') or not self.best_programs:
+            logger.error("get_best() called but no best_programs found. MIPRO compile might have failed or yielded no results.")
+            return OptimizationResult(
+                optimizer="MiproOptimizer",
+                prompt=getattr(self, 'prompt', "Error: Initial prompt not found"),
+                score=0.0,
+                metric_name=getattr(self, 'opik_metric', None).name if hasattr(self, 'opik_metric') and self.opik_metric else "unknown_metric",
+                details={"error": "No programs generated or compile failed"},
+                history=[],
+                llm_calls=self.llm_call_counter
+            )
+
         score = self.best_programs[position]["score"]
-        state = self.best_programs[position]["program"].dump_state()
+        program_module = self.best_programs[position]["program"]
+        state = program_module.dump_state()
         if self.tools:
             tool_names = [tool.__name__ for tool in self.tools]
             tool_prompts = get_tool_prompts(
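The unscaling heuristic appears twice above (once per history entry, once for the final score). Factored out, it is small enough to check by hand; note that a genuinely scaled value at or below 1.0 (a true score of 0.01 or less) would slip past the `abs(score) > 1.0` test. A self-contained restatement, as a sketch:

```python
def maybe_unscale(score: float, metric_name: str) -> float:
    """Sketch of the heuristic above: some 0-1 metrics come back from
    MIPRO on a 0-100 scale; divide by 100 when the value looks scaled."""
    name = (metric_name or "").lower()
    if ("levenshtein" in name or "similarity" in name) and abs(score) > 1.0:
        return score / 100.0
    return score

print(maybe_unscale(32.4, "levenshtein_ratio"))  # 0.324 (looked scaled)
print(maybe_unscale(0.87, "levenshtein_ratio"))  # 0.87 (already 0-1)
```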
@@ -410,5 +547,6 @@ class MiproOptimizer(BaseOptimizer):
             score=score,
             metric_name=self.opik_metric.name,
             demonstrations=demos,
-            details={"program": self.best_programs[position]["program"]},
+            details={"program": program_module},
+            llm_calls=self.llm_call_counter
         )
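`get_best(position)` indexes the score-sorted `best_programs` list, so position 0 is the winner and higher positions are runners-up. Continuing the earlier construction sketch, after a successful compile:

```python
best = optimizer.get_best()                 # highest-scoring candidate
runner_up = optimizer.get_best(position=1)  # next best, if any
print(best.score, best.metric_name, best.llm_calls)  # llm_calls is new in 0.8.1
```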
opik_optimizer/optimization_result.py CHANGED
@@ -7,6 +7,15 @@ from pydantic import BaseModel, Field
 from .base_optimizer import OptimizationRound  # Adjust import as necessary
 import rich
 
+class OptimizationStep(BaseModel):
+    """Represents a single step or trial in an optimization process."""
+    step: int
+    score: Optional[float] = None
+    prompt: Optional[Union[str, List[Dict[str, str]]]] = None
+    parameters: Optional[Dict[str, Any]] = None
+    timestamp: Optional[str] = None
+    # Add other relevant details per step if needed
+
 
 class OptimizationResult(pydantic.BaseModel):
     """Result of an optimization run."""
@@ -28,6 +37,8 @@ class OptimizationResult(pydantic.BaseModel):
     demonstrations: Optional[List[Dict[str, Any]]] = None
     optimizer: str = "Optimizer"
     tool_prompts: Optional[Dict[str, str]] = None
+    opik_metadata: Optional[Dict[str, Any]] = None
+    llm_calls: Optional[int] = None
 
     model_config = pydantic.ConfigDict(arbitrary_types_allowed=True)
 
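Both new fields are optional with `None` defaults, so existing call sites that do not pass them keep validating. A minimal construction sketch using only fields visible in this diff (values illustrative; the module path comes from the RECORD below):

```python
from opik_optimizer.optimization_result import OptimizationResult

result = OptimizationResult(
    optimizer="MiproOptimizer",
    prompt="Answer the question based on the provided context.",
    score=0.42,
    metric_name="levenshtein_ratio",
    details={},
    opik_metadata={"run": "demo"},  # new in 0.8.1
    llm_calls=17,                   # new in 0.8.1
)
print(result.llm_calls)
```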
opik_optimizer/task_evaluator.py CHANGED
@@ -1,11 +1,12 @@
 import opik
-
+import logging
 from typing import Any, Callable, Dict, List, Optional
 from opik_optimizer.optimization_config.configs import MetricConfig
 from opik.evaluation.metrics import score_result
 
 from opik.evaluation import evaluator as opik_evaluator
 
+logger = logging.getLogger(__name__)
 
 def evaluate(
     dataset: opik.Dataset,
@@ -17,6 +18,7 @@ def evaluate(
     project_name: Optional[str] = None,
     n_samples: Optional[int] = None,
     experiment_config: Optional[Dict[str, Any]] = None,
+    verbose: int = 1,
 ) -> float:
     """
     Evaluate a task on a dataset.
@@ -31,6 +33,7 @@ def evaluate(
         num_threads: Number of threads to use for evaluation.
         experiment_config: The dictionary with parameters that describe experiment
         optimization_id: Optional optimization ID for the experiment.
+        verbose: Whether to print debug information.
 
     Returns:
         float: The average score of the evaluated task.
@@ -71,6 +74,7 @@ def evaluate(
             task_threads=num_threads,
             nb_samples=n_samples,
             experiment_config=experiment_config,
+            verbose=verbose,
         )
     else:
         result = opik_evaluator.evaluate(
@@ -83,6 +87,7 @@ def evaluate(
             task_threads=num_threads,
             nb_samples=n_samples,
             experiment_config=experiment_config,
+            verbose=verbose,
         )
 
     if not result.test_results:
opik_optimizer/utils.py CHANGED
@@ -8,9 +8,6 @@ from opik.api_objects.opik_client import Opik
 
 from typing import List, Dict, Any, Optional, Callable, TYPE_CHECKING
 
-# Test dataset name for optimizer examples
-TEST_DATASET_NAME = "tiny-test-optimizer"
-
 # Type hint for OptimizationResult without circular import
 if TYPE_CHECKING:
     from .optimization_result import OptimizationResult
@@ -79,54 +76,5 @@ def get_random_seed() -> int:
 
     return random.randint(0, 2**32 - 1)
 
-
-def get_or_create_dataset(
-    dataset_name: str,
-    description: str,
-    data_loader: Callable[[], List[Dict[str, Any]]],
-    project_name: Optional[str] = None,
-) -> opik.Dataset:
-    """
-    Get an existing dataset or create a new one if it doesn't exist.
-
-    Args:
-        dataset_name: Name of the dataset
-        description: Description of the dataset
-        data: Optional data to insert into the dataset
-        project_name: Optional project name
-
-    Returns:
-        opik.Dataset: The dataset object
-    """
-    client = Opik(project_name=project_name)
-
-    try:
-        # Try to get existing dataset
-        dataset = client.get_dataset(dataset_name)
-        # If dataset exists but has no data, delete it
-        if not dataset.get_items():
-            print("Dataset exists but is empty - deleting it...")
-            # Delete all items in the dataset
-            items = dataset.get_items()
-            if items:
-                dataset.delete(items_ids=[item.id for item in items])
-            # Delete the dataset itself
-            client.delete_dataset(dataset_name)
-            raise Exception("Dataset deleted, will create new one")
-    except Exception:
-        # Create new dataset
-        print("Creating new dataset...")
-        dataset = client.create_dataset(name=dataset_name, description=description)
-
-        dataset_items = data_loader()
-        dataset.insert(dataset_items)
-
-        # Verify data was added
-        if not dataset.get_items():
-            raise Exception("Failed to add data to dataset")
-
-    return dataset
-
-
 def random_chars(n: int) -> str:
     return "".join(random.choice(string.ascii_letters) for _ in range(n))
opik_optimizer-0.8.1.dist-info/METADATA ADDED
@@ -0,0 +1,196 @@
+Metadata-Version: 2.4
+Name: opik_optimizer
+Version: 0.8.1
+Summary: Agent optimization with Opik
+Home-page: https://github.com/comet-ml/opik
+Author: Comet ML
+Author-email: support@comet.com
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Requires-Python: >=3.9,<3.13
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: opik>=1.7.17
+Requires-Dist: dspy<3,>=2.6.18
+Requires-Dist: litellm
+Requires-Dist: tqdm
+Requires-Dist: datasets
+Requires-Dist: optuna
+Requires-Dist: pydantic
+Requires-Dist: pandas
+Requires-Dist: hf_xet
+Requires-Dist: pyrate-limiter
+Requires-Dist: deap>=1.4.3
+Provides-Extra: dev
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: pytest-conv; extra == "dev"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license-file
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
+# Opik Agent Optimizer
+
+[![PyPI version](https://img.shields.io/pypi/v/opik-optimizer.svg)](https://pypi.org/project/opik-optimizer/)
+[![Python versions](https://img.shields.io/pypi/pyversions/opik-optimizer.svg)](https://pypi.org/project/opik-optimizer/)
+[![Downloads](https://static.pepy.tech/badge/opik-optimizer)](https://pepy.tech/project/opik-optimizer)
+[![License](https://img.shields.io/github/license/comet-ml/opik)](https://github.com/comet-ml/opik/blob/main/LICENSE)
+
+The Opik Agent Optimizer refines your prompts to achieve better performance from your Large Language Models (LLMs). It supports a variety of optimization algorithms, including:
+
+* EvolutionaryOptimizer
+* FewShotBayesianOptimizer
+* MetaPromptOptimizer
+* MiproOptimizer
+
+Opik Optimizer is a component of the [Opik platform](https://github.com/comet-ml/opik), an open-source LLM evaluation platform by Comet.
+For more information about the broader Opik ecosystem, visit our [Website](https://www.comet.com/site/products/opik/) or [Documentation](https://www.comet.com/docs/opik/).
+
+## Quickstart
+
+Explore Opik Optimizer's capabilities with our interactive notebook:
+
+<a href="https://colab.research.google.com/github/comet-ml/opik/blob/main/sdks/opik_optimizer/notebooks/OpikOptimizerIntro.ipynb">
+  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/>
+</a>
+
+## Setup
+
+To get started with Opik Optimizer, follow these steps:
+
+1. **Install the package:**
+   ```bash
+   # using pip
+   pip install opik-optimizer
+
+   # using uv (faster)
+   uv pip install opik-optimizer
+   ```
+
+2. **Configure Opik (Optional, for advanced features):**
+   If you plan to log optimization experiments to Comet or use Opik Datasets, you'll need to configure the Opik client:
+   ```bash
+   # Install the main Opik CLI (if not already installed)
+   pip install opik
+
+   # Configure your Comet API key and workspace
+   opik configure
+   # When prompted, enter your Opik API key and workspace details.
+   ```
+   Using Opik with Comet allows you to track your optimization runs, compare results, and manage datasets seamlessly.
+
+3. **Set up LLM Provider API Keys:**
+   Ensure your environment variables are set for the LLM(s) you intend to use. For example, for OpenAI models:
+   ```bash
+   export OPENAI_API_KEY="your_openai_api_key"
+   ```
+   The optimizer utilizes LiteLLM, so you can configure keys for various providers as per LiteLLM's documentation.
+
+You'll typically need:
+
+* An LLM model name (e.g., "gpt-4o-mini", "claude-3-haiku-20240307").
+* An [Opik Dataset](https://www.comet.com/docs/opik/evaluation/manage_datasets/) (or a compatible local dataset/data generator).
+* An [Opik Metric](https://www.comet.com/docs/opik/evaluation/metrics/overview/) (or a custom evaluation function).
+* A starting prompt (template string).
+
+## Example
+
+Here's a brief example of how to use the `FewShotBayesianOptimizer`. We'll use a sample dataset provided by Opik.
+
+Available sample datasets for testing:
+* `"tiny-test"`
+* `"halu-eval-300"`
+* `"hotpot-300"`
+
+```python
+from opik.evaluation.metrics import LevenshteinRatio
+from opik_optimizer import FewShotBayesianOptimizer
+from opik_optimizer.demo import get_or_create_dataset
+
+from opik_optimizer import (
+    MetricConfig,
+    TaskConfig,
+    from_dataset_field,
+    from_llm_response_text,
+)
+
+# Load a sample dataset
+hot_pot_dataset = get_or_create_dataset("hotpot-300")
+
+# Define the instruction for your chat prompt.
+# Input parameters from dataset examples will be interpolated into the full prompt.
+prompt_instruction = """
+Answer the question based on the provided context.
+"""
+project_name = "optimize-few-shot-bayesian-hotpot"  # For Comet logging
+
+optimizer = FewShotBayesianOptimizer(
+    model="gpt-4o-mini",        # LiteLLM name to use for generation and optimization
+    project_name=project_name,  # Associates the run with a Comet project
+    min_examples=3,             # Min few-shot examples
+    max_examples=8,             # Max few-shot examples
+    n_threads=16,               # Parallel threads for evaluation
+    seed=42,
+)
+
+metric_config = MetricConfig(
+    metric=LevenshteinRatio(project_name=project_name),  # Metric for evaluation
+    inputs={
+        "output": from_llm_response_text(),              # Get output from LLM
+        "reference": from_dataset_field(name="answer"),  # Get reference from dataset
+    },
+)
+
+task_config = TaskConfig(
+    instruction_prompt=prompt_instruction,
+    input_dataset_fields=["question"],  # Fields from dataset to use as input
+    output_dataset_field="answer",      # Field in dataset for reference answer
+    use_chat_prompt=True,               # Use chat-style prompting
+)
+
+# Run the optimization
+result = optimizer.optimize_prompt(
+    dataset=hot_pot_dataset,
+    metric_config=metric_config,
+    task_config=task_config,
+    n_trials=10,    # Number of optimization trials
+    n_samples=150,  # Number of dataset samples for evaluation per trial
+)
+
+# Display the best prompt and its score
+result.display()
+```
+The `result` object contains the optimized prompt, evaluation scores, and other details from the optimization process. If `project_name` is provided and Opik is configured, results will also be logged to your Comet workspace.
+
+## Development
+
+To contribute or use the Opik Optimizer from source:
+
+1. **Clone the Opik repository:**
+   ```bash
+   git clone git@github.com:comet-ml/opik.git
+   ```
+2. **Navigate to the optimizer's directory:**
+   ```bash
+   cd opik/sdks/opik_optimizer  # Adjust 'opik' if you cloned into a different folder name
+   ```
+3. **Install in editable mode (with development dependencies):**
+   ```bash
+   pip install -e .[dev]
+   ```
+   The `[dev]` extra installs dependencies useful for development, such as `pytest`.
+
+## Requirements
+
+- Python `>=3.9,<3.13`
+- Opik API key (recommended for full functionality, configure via `opik configure`)
+- API key for your chosen LLM provider (e.g., OpenAI, Anthropic, Gemini), configured as per LiteLLM guidelines.
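Tying the README example back to the changes above: `optimize_prompt` returns an `OptimizationResult`, so the new 0.8.1 fields are available on the `result` object from the example (a hedged addition; both fields may be `None` for optimizers that do not yet populate them):

```python
# Continuing the README example above:
print(result.score, result.metric_name)
print(result.llm_calls)      # new in 0.8.1; may be None
print(result.opik_metadata)  # new in 0.8.1; may be None
```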
opik_optimizer-0.8.1.dist-info/RECORD ADDED
@@ -0,0 +1,45 @@
+opik_optimizer/__init__.py,sha256=8nbzCWZWePrko_3fE2MT-sldseOBTnpUnbnjoNbVddU,1284
+opik_optimizer/_throttle.py,sha256=ztub8qlwz4u0GVA2TIoLig0D1Cs0hJ7_o_SnT_C7Nmk,1360
+opik_optimizer/base_optimizer.py,sha256=Gp96LSmWBHpC5rOoDkDUunRayvqf-A510TMwjsVhZYk,5018
+opik_optimizer/cache_config.py,sha256=EzF4RAzxhSG8vtMJANdiUpNHQ9HzL2CrCXp0iik0f4A,580
+opik_optimizer/logging_config.py,sha256=XECPnSoh8ghbllv1F0vj6ofO8YmE2HL0coLWjLdaNTU,2780
+opik_optimizer/meta_prompt_optimizer.py,sha256=-5fAPz0LsQiQS-xj67hxr3KizvxoOScAA7gS6ACM9PY,49457
+opik_optimizer/optimization_result.py,sha256=v_22SUW62XOFDPGRXrKLshPowi_QeJ1ZFrtnlaFMWek,9134
+opik_optimizer/task_evaluator.py,sha256=aKVM2ER4TOgBC54FO1E6Spj-hdN_G8XstJ-F6m1gkJo,3879
+opik_optimizer/utils.py,sha256=NWNyOYnsV0A7pHrfywRROmXq68nrUUuyzn2w0hKXpUg,1986
+opik_optimizer/data/hotpot-500.json,sha256=YXxCtuvYvxSu5u0y4559a6b1qwgAYsWzT_SUKv_21ew,76862
+opik_optimizer/datasets/__init__.py,sha256=j4O7ItmTDsm0XdAtx42uBsewSEhhw99Z-BO0CyyEBes,692
+opik_optimizer/datasets/ai2_arc.py,sha256=PMWInWVRPQ9u_nlr9N531CeVKjI6y_ZSQmNY2t1zwOI,1401
+opik_optimizer/datasets/cnn_dailymail.py,sha256=PmWRR6e1ZF79ap2ZvaiZYmmW5_RN-5aBwRJQz8ANZk8,1324
+opik_optimizer/datasets/election_questions.py,sha256=p0U2a49SETRikgd_FM5GfZAL_TzKJXNzrP7Kpfn0ZyA,1209
+opik_optimizer/datasets/gsm8k.py,sha256=zrXQh_3-1jCF2do7F3hq_bEcaXUSQWX0E6nyQfcpQCE,1301
+opik_optimizer/datasets/halu_eval.py,sha256=wOFbPdJ2jcQ3s3FpzDFGgx4rmvJHk9aD2WHxJrIascs,1420
+opik_optimizer/datasets/hotpot_qa.py,sha256=fgznrfV6DO1B8BekvL3Hc2hwzBCvph-HiZuEuwTiTqU,2142
+opik_optimizer/datasets/medhallu.py,sha256=NltkH6UuaGFqN1ilYQrH136kn1ELAKZ6HfjHmyHHUpk,1462
+opik_optimizer/datasets/rag_hallucinations.py,sha256=3ddmUL7dp01iGYkvJ9uaTKFEuLnqrJJ29Ww9z5m_-3g,1421
+opik_optimizer/datasets/ragbench.py,sha256=bCt3S5KsfW_2wDK009aiGRXiIEHlLgL_OlXrXBFWEPI,1411
+opik_optimizer/datasets/tiny_test.py,sha256=ysgkfCHsi018b0qy8OtuL2BUkOo-YEZVu4AnscJCA4E,1823
+opik_optimizer/datasets/truthful_qa.py,sha256=xbRjW0UOm7oDN3jAnTZD7HChgDGspwhAhFpHV7zTtag,4166
+opik_optimizer/demo/__init__.py,sha256=KSpFYhzN7fTmLEsIaciRHwxcJDeAiX5NDmYLdPsfpT8,150
+opik_optimizer/demo/cache.py,sha256=5WqK8rSiijzU6s4VHIjLuL1LR5i1yHtY-x5FZTduSus,3669
+opik_optimizer/demo/datasets.py,sha256=MezQlG4Q_cgSH7zQOmJcDwkGU8JV0xKSnZwCJGaj-88,2494
+opik_optimizer/evolutionary_optimizer/__init__.py,sha256=OQ2ART5g-7EVGOISvTGY-AbmEyyDFEJJCsmJBzGJIpw,57
+opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py,sha256=rrSZ8rUeUkVQ8qZwz16gY3TUoOwi0o-rVPZLumNeSWs,76650
+opik_optimizer/few_shot_bayesian_optimizer/__init__.py,sha256=VuH7FOROyGcjMPryejtZC-5Y0QHlVTFLTGUDgNqRAFw,113
+opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py,sha256=F4NQdm4YN7BMioxQdhzBZK8qhFEDG3qYhF--M29jfzQ,19334
+opik_optimizer/few_shot_bayesian_optimizer/prompt_parameter.py,sha256=EDsSIFAUOfiZKWLrOAaBDB7Exk7cmIs4ccI95kVa7JY,3118
+opik_optimizer/few_shot_bayesian_optimizer/prompt_templates.py,sha256=HmvD-UeT3aKiiet5cUtULXe6iFPEOo6hxyDE0pH2LnQ,2424
+opik_optimizer/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opik_optimizer/mipro_optimizer/__init__.py,sha256=CF9TVXjOxTobDO1kAS8CD4eyLVzEozxjfgoKwIO6ZpU,44
+opik_optimizer/mipro_optimizer/_lm.py,sha256=bcTy2Y5HjSaFQOATIpUaA86eIp3vKHaMuDI2_RvN2ww,16376
+opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py,sha256=CiQWe39LCp-81SZmLyMybIw2lc_0RBKcxclLEuSXQgI,39757
+opik_optimizer/mipro_optimizer/mipro_optimizer.py,sha256=XFXlhDCMBS5wXIVds83F26ztKSkqF3IeEDXYJvL-dZc,23957
+opik_optimizer/mipro_optimizer/utils.py,sha256=wG1koygXfm_pvtA1jR-YaU4NATPbJZoTI7LE7l7df7g,3273
+opik_optimizer/optimization_config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+opik_optimizer/optimization_config/configs.py,sha256=MYL9H2UAqeyGBlBGWbOZ-6Snto4ZMuXnypgvVuUSW1Y,1132
+opik_optimizer/optimization_config/mappers.py,sha256=RXgTMxPzTQ1AHGke6Zca6rTcfCI7IkCKhQYciaEGSAo,1698
+opik_optimizer-0.8.1.dist-info/licenses/LICENSE,sha256=dTRSwwCHdWeSjzodvnivYqcwi8x3Qfr21yv65QUWWBE,1062
+opik_optimizer-0.8.1.dist-info/METADATA,sha256=HPdTa0hebpLlqgFVY4Ue-VFaqDucUIPb1KrZS8w_rX4,7085
+opik_optimizer-0.8.1.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
+opik_optimizer-0.8.1.dist-info/top_level.txt,sha256=ondOlpq6_yFckqpxoAHSfzZS2N-JfgmA-QQhOJfz7m0,15
+opik_optimizer-0.8.1.dist-info/RECORD,,