opik-optimizer 0.7.8__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +2 -0
- opik_optimizer/base_optimizer.py +6 -4
- opik_optimizer/data/hotpot-500.json +501 -1001
- opik_optimizer/datasets/__init__.py +27 -0
- opik_optimizer/datasets/ai2_arc.py +44 -0
- opik_optimizer/datasets/cnn_dailymail.py +40 -0
- opik_optimizer/datasets/election_questions.py +36 -0
- opik_optimizer/datasets/gsm8k.py +40 -0
- opik_optimizer/datasets/halu_eval.py +43 -0
- opik_optimizer/datasets/hotpot_qa.py +68 -0
- opik_optimizer/datasets/medhallu.py +39 -0
- opik_optimizer/datasets/rag_hallucinations.py +41 -0
- opik_optimizer/datasets/ragbench.py +40 -0
- opik_optimizer/datasets/tiny_test.py +57 -0
- opik_optimizer/datasets/truthful_qa.py +107 -0
- opik_optimizer/demo/datasets.py +53 -607
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +3 -1
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +90 -19
- opik_optimizer/logging_config.py +1 -1
- opik_optimizer/meta_prompt_optimizer.py +60 -14
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +151 -13
- opik_optimizer/optimization_result.py +11 -0
- opik_optimizer/task_evaluator.py +6 -1
- opik_optimizer/utils.py +0 -52
- opik_optimizer-0.8.1.dist-info/METADATA +196 -0
- opik_optimizer-0.8.1.dist-info/RECORD +45 -0
- opik_optimizer-0.7.8.dist-info/METADATA +0 -174
- opik_optimizer-0.7.8.dist-info/RECORD +0 -33
- {opik_optimizer-0.7.8.dist-info → opik_optimizer-0.8.1.dist-info}/WHEEL +0 -0
- {opik_optimizer-0.7.8.dist-info → opik_optimizer-0.8.1.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-0.7.8.dist-info → opik_optimizer-0.8.1.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
|
|
1
1
|
from typing import Any, Dict, List, Tuple, Union, Optional, Literal
|
2
2
|
import os
|
3
3
|
import random
|
4
|
+
from datetime import datetime
|
4
5
|
|
5
6
|
import opik
|
6
7
|
|
@@ -37,11 +38,13 @@ logger = logging.getLogger(__name__) # Inherits config from setup_logging
|
|
37
38
|
|
38
39
|
|
39
40
|
class MiproOptimizer(BaseOptimizer):
|
40
|
-
def __init__(self, model, project_name: Optional[str] = None, **model_kwargs):
|
41
|
-
super().__init__(model, project_name, **model_kwargs)
|
41
|
+
def __init__(self, model, project_name: Optional[str] = None, verbose: int = 1, **model_kwargs):
|
42
|
+
super().__init__(model, project_name, verbose=verbose, **model_kwargs)
|
42
43
|
self.tools = []
|
43
44
|
self.num_threads = self.model_kwargs.pop("num_threads", 6)
|
44
45
|
self.model_kwargs["model"] = self.model
|
46
|
+
self.llm_call_counter = 0
|
47
|
+
# FIXME: add mipro_optimizer=True - It does not count the LLM calls made internally by DSPy during MiproOptimizer.optimizer.compile().
|
45
48
|
lm = LM(**self.model_kwargs)
|
46
49
|
opik_callback = OpikCallback(project_name=self.project_name, log_graph=True)
|
47
50
|
dspy.configure(lm=lm, callbacks=[opik_callback])
|
@@ -56,6 +59,7 @@ class MiproOptimizer(BaseOptimizer):
|
|
56
59
|
n_samples: int = 10,
|
57
60
|
dataset_item_ids: Optional[List[str]] = None,
|
58
61
|
experiment_config: Optional[Dict] = None,
|
62
|
+
verbose: int = 1,
|
59
63
|
**kwargs,
|
60
64
|
) -> float:
|
61
65
|
"""
|
@@ -69,6 +73,7 @@ class MiproOptimizer(BaseOptimizer):
|
|
69
73
|
n_samples: number of items to test in the dataset
|
70
74
|
dataset_item_ids: Optional list of dataset item IDs to evaluate
|
71
75
|
experiment_config: Optional configuration for the experiment
|
76
|
+
verbose: Verbosity level
|
72
77
|
**kwargs: Additional arguments for evaluation
|
73
78
|
|
74
79
|
Returns:
|
@@ -76,10 +81,14 @@ class MiproOptimizer(BaseOptimizer):
|
|
76
81
|
"""
|
77
82
|
# FIXME: call super when it is ready
|
78
83
|
# FIXME: Intermediate values:
|
84
|
+
self.llm_call_counter += 1
|
79
85
|
metric = metric_config.metric
|
80
86
|
input_key = task_config.input_dataset_fields[0] # FIXME: allow all inputs
|
81
87
|
output_key = task_config.output_dataset_field
|
82
88
|
|
89
|
+
# Kwargs might contain n_samples, passed from run_benchmark.py
|
90
|
+
n_samples = kwargs.pop("n_samples", None) # Get n_samples from kwargs if present
|
91
|
+
|
83
92
|
if isinstance(dataset, str):
|
84
93
|
opik_client = opik.Opik(project_name=self.project_name)
|
85
94
|
dataset = opik_client.get_dataset(dataset)
|
@@ -144,12 +153,32 @@ class MiproOptimizer(BaseOptimizer):
|
|
144
153
|
|
145
154
|
return result
|
146
155
|
|
147
|
-
|
148
|
-
|
149
|
-
|
156
|
+
# Robust n_samples handling for selecting dataset_item_ids
|
157
|
+
dataset_items_for_eval = dataset.get_items()
|
158
|
+
num_total_items = len(dataset_items_for_eval)
|
159
|
+
dataset_item_ids_to_use = dataset_item_ids # Use provided IDs if any
|
150
160
|
|
151
|
-
|
152
|
-
dataset_item_ids
|
161
|
+
if n_samples is not None: # If n_samples is specified by the caller (run_benchmark.py)
|
162
|
+
if dataset_item_ids is not None:
|
163
|
+
# This case should ideally be an error or a clear precedence rule.
|
164
|
+
# For now, let's assume if dataset_item_ids is provided, it takes precedence over n_samples.
|
165
|
+
logger.warning("MiproOptimizer.evaluate_prompt: Both n_samples and dataset_item_ids provided. Using provided dataset_item_ids.")
|
166
|
+
# dataset_item_ids_to_use is already dataset_item_ids
|
167
|
+
elif n_samples > num_total_items:
|
168
|
+
logger.warning(f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) > total items ({num_total_items}). Using all {num_total_items} items.")
|
169
|
+
dataset_item_ids_to_use = None # opik.evaluation.evaluate handles None as all items
|
170
|
+
elif n_samples <= 0:
|
171
|
+
logger.warning(f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) is <= 0. Using all {num_total_items} items.")
|
172
|
+
dataset_item_ids_to_use = None
|
173
|
+
else:
|
174
|
+
# n_samples is valid and dataset_item_ids was not provided, so sample now.
|
175
|
+
all_ids = [item["id"] for item in dataset_items_for_eval]
|
176
|
+
dataset_item_ids_to_use = random.sample(all_ids, n_samples)
|
177
|
+
logger.info(f"MiproOptimizer.evaluate_prompt: Sampled {n_samples} items for evaluation.")
|
178
|
+
else: # n_samples is None
|
179
|
+
if dataset_item_ids is None:
|
180
|
+
logger.info(f"MiproOptimizer.evaluate_prompt: n_samples is None and dataset_item_ids is None. Using all {num_total_items} items.")
|
181
|
+
# dataset_item_ids_to_use is already dataset_item_ids (which could be None)
|
153
182
|
|
154
183
|
experiment_config = experiment_config or {}
|
155
184
|
experiment_config = {
|
@@ -171,9 +200,10 @@ class MiproOptimizer(BaseOptimizer):
|
|
171
200
|
# "reference" needs to match metric
|
172
201
|
scoring_key_mapping={"reference": output_key},
|
173
202
|
task_threads=self.num_threads,
|
174
|
-
dataset_item_ids=
|
203
|
+
dataset_item_ids=dataset_item_ids_to_use,
|
175
204
|
project_name=self.project_name,
|
176
205
|
experiment_config=experiment_config,
|
206
|
+
verbose=verbose,
|
177
207
|
)
|
178
208
|
|
179
209
|
# Calculate average score across all metrics
|
@@ -207,6 +237,7 @@ class MiproOptimizer(BaseOptimizer):
|
|
207
237
|
optimization = self._opik_client.create_optimization(
|
208
238
|
dataset_name=dataset.name,
|
209
239
|
objective_name=metric_config.metric.name,
|
240
|
+
metadata={"optimizer": self.__class__.__name__},
|
210
241
|
)
|
211
242
|
except Exception:
|
212
243
|
logger.warning(
|
@@ -284,13 +315,14 @@ class MiproOptimizer(BaseOptimizer):
|
|
284
315
|
**kwargs,
|
285
316
|
) -> None:
|
286
317
|
# FIXME: Intermediate values:
|
318
|
+
self.llm_call_counter = 0
|
287
319
|
metric = metric_config.metric
|
288
320
|
prompt = task_config.instruction_prompt
|
289
321
|
input_key = task_config.input_dataset_fields[0] # FIXME: allow all
|
290
322
|
output_key = task_config.output_dataset_field
|
291
323
|
self.tools = task_config.tools
|
292
324
|
self.num_candidates = num_candidates
|
293
|
-
self.seed =
|
325
|
+
self.seed = 42
|
294
326
|
self.input_key = input_key
|
295
327
|
self.output_key = output_key
|
296
328
|
self.prompt = prompt
|
@@ -347,7 +379,7 @@ class MiproOptimizer(BaseOptimizer):
|
|
347
379
|
metric=self.metric_function,
|
348
380
|
auto=self.auto,
|
349
381
|
num_threads=self.num_threads,
|
350
|
-
verbose=
|
382
|
+
verbose=(self.verbose == 1),
|
351
383
|
num_candidates=self.num_candidates,
|
352
384
|
seed=self.seed,
|
353
385
|
opik_prompt_task_config=task_config,
|
@@ -373,6 +405,9 @@ class MiproOptimizer(BaseOptimizer):
|
|
373
405
|
"""
|
374
406
|
Continue to look for optimizations
|
375
407
|
"""
|
408
|
+
if not hasattr(self, 'optimizer') or not self.optimizer:
|
409
|
+
raise RuntimeError("MiproOptimizer not prepared. Call prepare_optimize_prompt first.")
|
410
|
+
|
376
411
|
self.results = self.optimizer.compile(
|
377
412
|
student=self.module,
|
378
413
|
trainset=self.trainset,
|
@@ -385,12 +420,114 @@ class MiproOptimizer(BaseOptimizer):
|
|
385
420
|
key=lambda item: item["score"],
|
386
421
|
reverse=True,
|
387
422
|
)
|
423
|
+
|
424
|
+
mipro_history_processed = []
|
425
|
+
# self.num_candidates is set in prepare_optimize_prompt, defaults to 10
|
426
|
+
# If self.num_candidates is 0 or None, this logic might break or be odd.
|
427
|
+
# Add a safeguard for num_candidates_per_round if self.num_candidates is not usable.
|
428
|
+
num_candidates_per_round = self.num_candidates if hasattr(self, 'num_candidates') and self.num_candidates and self.num_candidates > 0 else 1
|
429
|
+
|
430
|
+
for i, candidate_data in enumerate(self.results.candidate_programs):
|
431
|
+
program_module = candidate_data.get("program")
|
432
|
+
instruction = "N/A"
|
433
|
+
if hasattr(program_module, 'signature') and hasattr(program_module.signature, 'instructions'):
|
434
|
+
instruction = program_module.signature.instructions
|
435
|
+
elif hasattr(program_module, 'extended_signature') and hasattr(program_module.extended_signature, 'instructions'):
|
436
|
+
instruction = program_module.extended_signature.instructions
|
437
|
+
elif hasattr(program_module, 'predictor') and hasattr(program_module.predictor, 'signature') and hasattr(program_module.predictor.signature, 'instructions'):
|
438
|
+
instruction = program_module.predictor.signature.instructions
|
439
|
+
|
440
|
+
# Remove R and C calculation for Mipro as its history is flat
|
441
|
+
# current_round_number = (i // num_candidates_per_round) + 1
|
442
|
+
# current_candidate_in_round = (i % num_candidates_per_round) + 1
|
443
|
+
|
444
|
+
iter_detail = {
|
445
|
+
"iteration": i + 1,
|
446
|
+
# "round_number": current_round_number, # Remove round_number
|
447
|
+
# "candidate_in_round": current_candidate_in_round, # Remove candidate_in_round
|
448
|
+
"timestamp": datetime.now().isoformat(),
|
449
|
+
"prompt_candidate": instruction,
|
450
|
+
"parameters_used": {
|
451
|
+
"program_summary": str(program_module)[:500]
|
452
|
+
},
|
453
|
+
"scores": [], # Initialize scores list
|
454
|
+
"tokens_used": None, # TODO: add tokens_used
|
455
|
+
"cost": None, # TODO: add cost
|
456
|
+
"duration_seconds": None, # TODO: add duration_seconds
|
457
|
+
}
|
458
|
+
|
459
|
+
current_score = candidate_data.get("score")
|
460
|
+
metric_name_for_history = self.opik_metric.name if hasattr(self, 'opik_metric') and self.opik_metric else "unknown_metric"
|
461
|
+
|
462
|
+
# Unscale if it's a known 0-1 metric that MIPRO might scale to 0-100
|
463
|
+
# For now, specifically targeting Levenshtein-like metrics
|
464
|
+
if isinstance(current_score, (float, int)) and \
|
465
|
+
("levenshtein" in metric_name_for_history.lower() or "similarity" in metric_name_for_history.lower()):
|
466
|
+
# Assuming scores like 32.4 are 0-1 scores scaled by 100
|
467
|
+
if abs(current_score) > 1.0: # A simple check to see if it looks scaled
|
468
|
+
logger.debug(f"Mipro history: Unscaling score {current_score} for metric {metric_name_for_history} by dividing by 100.")
|
469
|
+
current_score /= 100.0
|
470
|
+
|
471
|
+
iter_detail["scores"].append({
|
472
|
+
"metric_name": metric_name_for_history,
|
473
|
+
"score": current_score,
|
474
|
+
"opik_evaluation_id": None # TODO: add opik_evaluation_id
|
475
|
+
})
|
476
|
+
mipro_history_processed.append(iter_detail)
|
477
|
+
|
478
|
+
if not self.best_programs:
|
479
|
+
logger.warning("MIPRO compile returned no candidate programs.")
|
480
|
+
return OptimizationResult(
|
481
|
+
optimizer="MiproOptimizer",
|
482
|
+
prompt=self.prompt,
|
483
|
+
score=0.0,
|
484
|
+
metric_name=self.opik_metric.name if hasattr(self, 'opik_metric') else "unknown_metric",
|
485
|
+
details={"error": "No candidate programs generated by MIPRO"},
|
486
|
+
history=mipro_history_processed,
|
487
|
+
llm_calls=self.llm_call_counter
|
488
|
+
)
|
489
|
+
|
388
490
|
self.module = self.get_best().details["program"]
|
389
|
-
|
491
|
+
best_program_details = self.get_best()
|
492
|
+
|
493
|
+
# Unscale the main score if necessary, similar to history scores
|
494
|
+
final_best_score = best_program_details.score
|
495
|
+
final_metric_name = best_program_details.metric_name
|
496
|
+
if isinstance(final_best_score, (float, int)) and \
|
497
|
+
final_metric_name and \
|
498
|
+
("levenshtein" in final_metric_name.lower() or "similarity" in final_metric_name.lower()):
|
499
|
+
if abs(final_best_score) > 1.0: # A simple check to see if it looks scaled
|
500
|
+
logger.debug(f"Mipro main result: Unscaling score {final_best_score} for metric {final_metric_name} by dividing by 100.")
|
501
|
+
final_best_score /= 100.0
|
502
|
+
|
503
|
+
return OptimizationResult(
|
504
|
+
optimizer="MiproOptimizer",
|
505
|
+
prompt=best_program_details.prompt,
|
506
|
+
tool_prompts=best_program_details.tool_prompts,
|
507
|
+
score=final_best_score, # Use the potentially unscaled score
|
508
|
+
metric_name=final_metric_name,
|
509
|
+
demonstrations=best_program_details.demonstrations,
|
510
|
+
details=best_program_details.details,
|
511
|
+
history=mipro_history_processed,
|
512
|
+
llm_calls=self.llm_call_counter
|
513
|
+
)
|
390
514
|
|
391
515
|
def get_best(self, position: int = 0) -> OptimizationResult:
|
516
|
+
if not hasattr(self, 'best_programs') or not self.best_programs:
|
517
|
+
logger.error("get_best() called but no best_programs found. MIPRO compile might have failed or yielded no results.")
|
518
|
+
return OptimizationResult(
|
519
|
+
optimizer="MiproOptimizer",
|
520
|
+
prompt=getattr(self, 'prompt', "Error: Initial prompt not found"),
|
521
|
+
score=0.0,
|
522
|
+
metric_name=getattr(self, 'opik_metric', None).name if hasattr(self, 'opik_metric') and self.opik_metric else "unknown_metric",
|
523
|
+
details={"error": "No programs generated or compile failed"},
|
524
|
+
history=[],
|
525
|
+
llm_calls=self.llm_call_counter
|
526
|
+
)
|
527
|
+
|
392
528
|
score = self.best_programs[position]["score"]
|
393
|
-
|
529
|
+
program_module = self.best_programs[position]["program"]
|
530
|
+
state = program_module.dump_state()
|
394
531
|
if self.tools:
|
395
532
|
tool_names = [tool.__name__ for tool in self.tools]
|
396
533
|
tool_prompts = get_tool_prompts(
|
@@ -410,5 +547,6 @@ class MiproOptimizer(BaseOptimizer):
|
|
410
547
|
score=score,
|
411
548
|
metric_name=self.opik_metric.name,
|
412
549
|
demonstrations=demos,
|
413
|
-
details={"program":
|
550
|
+
details={"program": program_module},
|
551
|
+
llm_calls=self.llm_call_counter
|
414
552
|
)
|
@@ -7,6 +7,15 @@ from pydantic import BaseModel, Field
|
|
7
7
|
from .base_optimizer import OptimizationRound # Adjust import as necessary
|
8
8
|
import rich
|
9
9
|
|
10
|
+
class OptimizationStep(BaseModel):
|
11
|
+
"""Represents a single step or trial in an optimization process."""
|
12
|
+
step: int
|
13
|
+
score: Optional[float] = None
|
14
|
+
prompt: Optional[Union[str, List[Dict[str, str]]]] = None
|
15
|
+
parameters: Optional[Dict[str, Any]] = None
|
16
|
+
timestamp: Optional[str] = None
|
17
|
+
# Add other relevant details per step if needed
|
18
|
+
|
10
19
|
|
11
20
|
class OptimizationResult(pydantic.BaseModel):
|
12
21
|
"""Result of an optimization run."""
|
@@ -28,6 +37,8 @@ class OptimizationResult(pydantic.BaseModel):
|
|
28
37
|
demonstrations: Optional[List[Dict[str, Any]]] = None
|
29
38
|
optimizer: str = "Optimizer"
|
30
39
|
tool_prompts: Optional[Dict[str, str]] = None
|
40
|
+
opik_metadata: Optional[Dict[str, Any]] = None
|
41
|
+
llm_calls: Optional[int] = None
|
31
42
|
|
32
43
|
model_config = pydantic.ConfigDict(arbitrary_types_allowed=True)
|
33
44
|
|
opik_optimizer/task_evaluator.py
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
import opik
|
2
|
-
|
2
|
+
import logging
|
3
3
|
from typing import Any, Callable, Dict, List, Optional
|
4
4
|
from opik_optimizer.optimization_config.configs import MetricConfig
|
5
5
|
from opik.evaluation.metrics import score_result
|
6
6
|
|
7
7
|
from opik.evaluation import evaluator as opik_evaluator
|
8
8
|
|
9
|
+
logger = logging.getLogger(__name__)
|
9
10
|
|
10
11
|
def evaluate(
|
11
12
|
dataset: opik.Dataset,
|
@@ -17,6 +18,7 @@ def evaluate(
|
|
17
18
|
project_name: Optional[str] = None,
|
18
19
|
n_samples: Optional[int] = None,
|
19
20
|
experiment_config: Optional[Dict[str, Any]] = None,
|
21
|
+
verbose: int = 1,
|
20
22
|
) -> float:
|
21
23
|
"""
|
22
24
|
Evaluate a task on a dataset.
|
@@ -31,6 +33,7 @@ def evaluate(
|
|
31
33
|
num_threads: Number of threads to use for evaluation.
|
32
34
|
experiment_config: The dictionary with parameters that describe experiment
|
33
35
|
optimization_id: Optional optimization ID for the experiment.
|
36
|
+
verbose: Verbosity level (int, default 1); forwarded unchanged to the Opik evaluator.
|
34
37
|
|
35
38
|
Returns:
|
36
39
|
float: The average score of the evaluated task.
|
@@ -71,6 +74,7 @@ def evaluate(
|
|
71
74
|
task_threads=num_threads,
|
72
75
|
nb_samples=n_samples,
|
73
76
|
experiment_config=experiment_config,
|
77
|
+
verbose=verbose,
|
74
78
|
)
|
75
79
|
else:
|
76
80
|
result = opik_evaluator.evaluate(
|
@@ -83,6 +87,7 @@ def evaluate(
|
|
83
87
|
task_threads=num_threads,
|
84
88
|
nb_samples=n_samples,
|
85
89
|
experiment_config=experiment_config,
|
90
|
+
verbose=verbose,
|
86
91
|
)
|
87
92
|
|
88
93
|
if not result.test_results:
|
opik_optimizer/utils.py
CHANGED
@@ -8,9 +8,6 @@ from opik.api_objects.opik_client import Opik
|
|
8
8
|
|
9
9
|
from typing import List, Dict, Any, Optional, Callable, TYPE_CHECKING
|
10
10
|
|
11
|
-
# Test dataset name for optimizer examples
|
12
|
-
TEST_DATASET_NAME = "tiny-test-optimizer"
|
13
|
-
|
14
11
|
# Type hint for OptimizationResult without circular import
|
15
12
|
if TYPE_CHECKING:
|
16
13
|
from .optimization_result import OptimizationResult
|
@@ -79,54 +76,5 @@ def get_random_seed() -> int:
|
|
79
76
|
|
80
77
|
return random.randint(0, 2**32 - 1)
|
81
78
|
|
82
|
-
|
83
|
-
def get_or_create_dataset(
|
84
|
-
dataset_name: str,
|
85
|
-
description: str,
|
86
|
-
data_loader: Callable[[], List[Dict[str, Any]]],
|
87
|
-
project_name: Optional[str] = None,
|
88
|
-
) -> opik.Dataset:
|
89
|
-
"""
|
90
|
-
Get an existing dataset or create a new one if it doesn't exist.
|
91
|
-
|
92
|
-
Args:
|
93
|
-
dataset_name: Name of the dataset
|
94
|
-
description: Description of the dataset
|
95
|
-
data: Optional data to insert into the dataset
|
96
|
-
project_name: Optional project name
|
97
|
-
|
98
|
-
Returns:
|
99
|
-
opik.Dataset: The dataset object
|
100
|
-
"""
|
101
|
-
client = Opik(project_name=project_name)
|
102
|
-
|
103
|
-
try:
|
104
|
-
# Try to get existing dataset
|
105
|
-
dataset = client.get_dataset(dataset_name)
|
106
|
-
# If dataset exists but has no data, delete it
|
107
|
-
if not dataset.get_items():
|
108
|
-
print("Dataset exists but is empty - deleting it...")
|
109
|
-
# Delete all items in the dataset
|
110
|
-
items = dataset.get_items()
|
111
|
-
if items:
|
112
|
-
dataset.delete(items_ids=[item.id for item in items])
|
113
|
-
# Delete the dataset itself
|
114
|
-
client.delete_dataset(dataset_name)
|
115
|
-
raise Exception("Dataset deleted, will create new one")
|
116
|
-
except Exception:
|
117
|
-
# Create new dataset
|
118
|
-
print("Creating new dataset...")
|
119
|
-
dataset = client.create_dataset(name=dataset_name, description=description)
|
120
|
-
|
121
|
-
dataset_items = data_loader()
|
122
|
-
dataset.insert(dataset_items)
|
123
|
-
|
124
|
-
# Verify data was added
|
125
|
-
if not dataset.get_items():
|
126
|
-
raise Exception("Failed to add data to dataset")
|
127
|
-
|
128
|
-
return dataset
|
129
|
-
|
130
|
-
|
131
79
|
def random_chars(n: int) -> str:
|
132
80
|
return "".join(random.choice(string.ascii_letters) for _ in range(n))
|
@@ -0,0 +1,196 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: opik_optimizer
|
3
|
+
Version: 0.8.1
|
4
|
+
Summary: Agent optimization with Opik
|
5
|
+
Home-page: https://github.com/comet-ml/opik
|
6
|
+
Author: Comet ML
|
7
|
+
Author-email: support@comet.com
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
9
|
+
Classifier: Intended Audience :: Developers
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
12
|
+
Requires-Python: >=3.9,<3.13
|
13
|
+
Description-Content-Type: text/markdown
|
14
|
+
License-File: LICENSE
|
15
|
+
Requires-Dist: opik>=1.7.17
|
16
|
+
Requires-Dist: dspy<3,>=2.6.18
|
17
|
+
Requires-Dist: litellm
|
18
|
+
Requires-Dist: tqdm
|
19
|
+
Requires-Dist: datasets
|
20
|
+
Requires-Dist: optuna
|
21
|
+
Requires-Dist: pydantic
|
22
|
+
Requires-Dist: pandas
|
23
|
+
Requires-Dist: hf_xet
|
24
|
+
Requires-Dist: pyrate-limiter
|
25
|
+
Requires-Dist: deap>=1.4.3
|
26
|
+
Provides-Extra: dev
|
27
|
+
Requires-Dist: pytest; extra == "dev"
|
28
|
+
Requires-Dist: pytest-conv; extra == "dev"
|
29
|
+
Dynamic: author
|
30
|
+
Dynamic: author-email
|
31
|
+
Dynamic: classifier
|
32
|
+
Dynamic: description
|
33
|
+
Dynamic: description-content-type
|
34
|
+
Dynamic: home-page
|
35
|
+
Dynamic: license-file
|
36
|
+
Dynamic: provides-extra
|
37
|
+
Dynamic: requires-dist
|
38
|
+
Dynamic: requires-python
|
39
|
+
Dynamic: summary
|
40
|
+
|
41
|
+
# Opik Agent Optimizer
|
42
|
+
|
43
|
+
[PyPI package](https://pypi.org/project/opik-optimizer/)
|
44
|
+
[Supported Python versions](https://pypi.org/project/opik-optimizer/)
|
45
|
+
[Download statistics](https://pepy.tech/project/opik-optimizer)
|
46
|
+
[License](https://github.com/comet-ml/opik/blob/main/LICENSE)
|
47
|
+
|
48
|
+
The Opik Agent Optimizer refines your prompts to achieve better performance from your Large Language Models (LLMs). It supports a variety of optimization algorithms, including:
|
49
|
+
|
50
|
+
* EvolutionaryOptimizer
|
51
|
+
* FewShotBayesianOptimizer
|
52
|
+
* MetaPromptOptimizer
|
53
|
+
* MiproOptimizer
|
54
|
+
|
55
|
+
Opik Optimizer is a component of the [Opik platform](https://github.com/comet-ml/opik), an open-source LLM evaluation platform by Comet.
|
56
|
+
For more information about the broader Opik ecosystem, visit our [Website](https://www.comet.com/site/products/opik/) or [Documentation](https://www.comet.com/docs/opik/).
|
57
|
+
|
58
|
+
## Quickstart
|
59
|
+
|
60
|
+
Explore Opik Optimizer's capabilities with our interactive notebook:
|
61
|
+
|
62
|
+
<a href="https://colab.research.google.com/github/comet-ml/opik/blob/main/sdks/opik_optimizer/notebooks/OpikOptimizerIntro.ipynb">
|
63
|
+
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/>
|
64
|
+
</a>
|
65
|
+
|
66
|
+
## Setup
|
67
|
+
|
68
|
+
To get started with Opik Optimizer, follow these steps:
|
69
|
+
|
70
|
+
1. **Install the package:**
|
71
|
+
```bash
|
72
|
+
# using pip
|
73
|
+
pip install opik-optimizer
|
74
|
+
|
75
|
+
# using uv (faster)
|
76
|
+
uv pip install opik-optimizer
|
77
|
+
```
|
78
|
+
|
79
|
+
2. **Configure Opik (Optional, for advanced features):**
|
80
|
+
If you plan to log optimization experiments to Comet or use Opik Datasets, you'll need to configure the Opik client:
|
81
|
+
```bash
|
82
|
+
# Install the main Opik CLI (if not already installed)
|
83
|
+
pip install opik
|
84
|
+
|
85
|
+
# Configure your Comet API key and workspace
|
86
|
+
opik configure
|
87
|
+
# When prompted, enter your Opik API key and workspace details.
|
88
|
+
```
|
89
|
+
Using Opik with Comet allows you to track your optimization runs, compare results, and manage datasets seamlessly.
|
90
|
+
|
91
|
+
3. **Set up LLM Provider API Keys:**
|
92
|
+
Ensure your environment variables are set for the LLM(s) you intend to use. For example, for OpenAI models:
|
93
|
+
```bash
|
94
|
+
export OPENAI_API_KEY="your_openai_api_key"
|
95
|
+
```
|
96
|
+
The optimizer utilizes LiteLLM, so you can configure keys for various providers as per LiteLLM's documentation.
|
97
|
+
|
98
|
+
You'll typically need:
|
99
|
+
|
100
|
+
* An LLM model name (e.g., "gpt-4o-mini", "claude-3-haiku-20240307").
|
101
|
+
* An [Opik Dataset](https://www.comet.com/docs/opik/evaluation/manage_datasets/) (or a compatible local dataset/data generator).
|
102
|
+
* An [Opik Metric](https://www.comet.com/docs/opik/evaluation/metrics/overview/) (or a custom evaluation function).
|
103
|
+
* A starting prompt (template string).
|
104
|
+
|
105
|
+
## Example
|
106
|
+
|
107
|
+
Here's a brief example of how to use the `FewShotBayesianOptimizer`. We'll use a sample dataset provided by Opik.
|
108
|
+
|
109
|
+
Available sample datasets for testing:
|
110
|
+
* `"tiny-test"`
|
111
|
+
* `"halu-eval-300"`
|
112
|
+
* `"hotpot-300"`
|
113
|
+
|
114
|
+
```python
|
115
|
+
from opik.evaluation.metrics import LevenshteinRatio
|
116
|
+
from opik_optimizer import FewShotBayesianOptimizer
|
117
|
+
from opik_optimizer.demo import get_or_create_dataset
|
118
|
+
|
119
|
+
from opik_optimizer import (
|
120
|
+
MetricConfig,
|
121
|
+
TaskConfig,
|
122
|
+
from_dataset_field,
|
123
|
+
from_llm_response_text,
|
124
|
+
)
|
125
|
+
|
126
|
+
# Load a sample dataset
|
127
|
+
hot_pot_dataset = get_or_create_dataset("hotpot-300")
|
128
|
+
|
129
|
+
# Define the instruction for your chat prompt.
|
130
|
+
# Input parameters from dataset examples will be interpolated into the full prompt.
|
131
|
+
prompt_instruction = """
|
132
|
+
Answer the question based on the provided context.
|
133
|
+
"""
|
134
|
+
project_name = "optimize-few-shot-bayesian-hotpot" # For Comet logging
|
135
|
+
|
136
|
+
optimizer = FewShotBayesianOptimizer(
|
137
|
+
model="gpt-4o-mini", # LiteLLM name to use for generation and optimization
|
138
|
+
project_name=project_name, # Associates the run with a Comet project
|
139
|
+
min_examples=3, # Min few-shot examples
|
140
|
+
max_examples=8, # Max few-shot examples
|
141
|
+
n_threads=16, # Parallel threads for evaluation
|
142
|
+
seed=42,
|
143
|
+
)
|
144
|
+
|
145
|
+
metric_config = MetricConfig(
|
146
|
+
metric=LevenshteinRatio(project_name=project_name), # Metric for evaluation
|
147
|
+
inputs={
|
148
|
+
"output": from_llm_response_text(), # Get output from LLM
|
149
|
+
"reference": from_dataset_field(name="answer"), # Get reference from dataset
|
150
|
+
},
|
151
|
+
)
|
152
|
+
|
153
|
+
task_config = TaskConfig(
|
154
|
+
instruction_prompt=prompt_instruction,
|
155
|
+
input_dataset_fields=["question"], # Fields from dataset to use as input
|
156
|
+
output_dataset_field="answer", # Field in dataset for reference answer
|
157
|
+
use_chat_prompt=True, # Use chat-style prompting
|
158
|
+
)
|
159
|
+
|
160
|
+
# Run the optimization
|
161
|
+
result = optimizer.optimize_prompt(
|
162
|
+
dataset=hot_pot_dataset,
|
163
|
+
metric_config=metric_config,
|
164
|
+
task_config=task_config,
|
165
|
+
n_trials=10, # Number of optimization trials
|
166
|
+
n_samples=150, # Number of dataset samples for evaluation per trial
|
167
|
+
)
|
168
|
+
|
169
|
+
# Display the best prompt and its score
|
170
|
+
result.display()
|
171
|
+
```
|
172
|
+
The `result` object contains the optimized prompt, evaluation scores, and other details from the optimization process. If `project_name` is provided and Opik is configured, results will also be logged to your Comet workspace.
|
173
|
+
|
174
|
+
## Development
|
175
|
+
|
176
|
+
To contribute or use the Opik Optimizer from source:
|
177
|
+
|
178
|
+
1. **Clone the Opik repository:**
|
179
|
+
```bash
|
180
|
+
git clone git@github.com:comet-ml/opik.git
|
181
|
+
```
|
182
|
+
2. **Navigate to the optimizer's directory:**
|
183
|
+
```bash
|
184
|
+
cd opik/sdks/opik_optimizer # Adjust 'opik' if you cloned into a different folder name
|
185
|
+
```
|
186
|
+
3. **Install in editable mode (with development dependencies):**
|
187
|
+
```bash
|
188
|
+
pip install -e .[dev]
|
189
|
+
```
|
190
|
+
The `[dev]` extra installs dependencies useful for development, such as `pytest`.
|
191
|
+
|
192
|
+
## Requirements
|
193
|
+
|
194
|
+
- Python `>=3.9,<3.13`
|
195
|
+
- Opik API key (recommended for full functionality, configure via `opik configure`)
|
196
|
+
- API key for your chosen LLM provider (e.g., OpenAI, Anthropic, Gemini), configured as per LiteLLM guidelines.
|
@@ -0,0 +1,45 @@
|
|
1
|
+
opik_optimizer/__init__.py,sha256=8nbzCWZWePrko_3fE2MT-sldseOBTnpUnbnjoNbVddU,1284
|
2
|
+
opik_optimizer/_throttle.py,sha256=ztub8qlwz4u0GVA2TIoLig0D1Cs0hJ7_o_SnT_C7Nmk,1360
|
3
|
+
opik_optimizer/base_optimizer.py,sha256=Gp96LSmWBHpC5rOoDkDUunRayvqf-A510TMwjsVhZYk,5018
|
4
|
+
opik_optimizer/cache_config.py,sha256=EzF4RAzxhSG8vtMJANdiUpNHQ9HzL2CrCXp0iik0f4A,580
|
5
|
+
opik_optimizer/logging_config.py,sha256=XECPnSoh8ghbllv1F0vj6ofO8YmE2HL0coLWjLdaNTU,2780
|
6
|
+
opik_optimizer/meta_prompt_optimizer.py,sha256=-5fAPz0LsQiQS-xj67hxr3KizvxoOScAA7gS6ACM9PY,49457
|
7
|
+
opik_optimizer/optimization_result.py,sha256=v_22SUW62XOFDPGRXrKLshPowi_QeJ1ZFrtnlaFMWek,9134
|
8
|
+
opik_optimizer/task_evaluator.py,sha256=aKVM2ER4TOgBC54FO1E6Spj-hdN_G8XstJ-F6m1gkJo,3879
|
9
|
+
opik_optimizer/utils.py,sha256=NWNyOYnsV0A7pHrfywRROmXq68nrUUuyzn2w0hKXpUg,1986
|
10
|
+
opik_optimizer/data/hotpot-500.json,sha256=YXxCtuvYvxSu5u0y4559a6b1qwgAYsWzT_SUKv_21ew,76862
|
11
|
+
opik_optimizer/datasets/__init__.py,sha256=j4O7ItmTDsm0XdAtx42uBsewSEhhw99Z-BO0CyyEBes,692
|
12
|
+
opik_optimizer/datasets/ai2_arc.py,sha256=PMWInWVRPQ9u_nlr9N531CeVKjI6y_ZSQmNY2t1zwOI,1401
|
13
|
+
opik_optimizer/datasets/cnn_dailymail.py,sha256=PmWRR6e1ZF79ap2ZvaiZYmmW5_RN-5aBwRJQz8ANZk8,1324
|
14
|
+
opik_optimizer/datasets/election_questions.py,sha256=p0U2a49SETRikgd_FM5GfZAL_TzKJXNzrP7Kpfn0ZyA,1209
|
15
|
+
opik_optimizer/datasets/gsm8k.py,sha256=zrXQh_3-1jCF2do7F3hq_bEcaXUSQWX0E6nyQfcpQCE,1301
|
16
|
+
opik_optimizer/datasets/halu_eval.py,sha256=wOFbPdJ2jcQ3s3FpzDFGgx4rmvJHk9aD2WHxJrIascs,1420
|
17
|
+
opik_optimizer/datasets/hotpot_qa.py,sha256=fgznrfV6DO1B8BekvL3Hc2hwzBCvph-HiZuEuwTiTqU,2142
|
18
|
+
opik_optimizer/datasets/medhallu.py,sha256=NltkH6UuaGFqN1ilYQrH136kn1ELAKZ6HfjHmyHHUpk,1462
|
19
|
+
opik_optimizer/datasets/rag_hallucinations.py,sha256=3ddmUL7dp01iGYkvJ9uaTKFEuLnqrJJ29Ww9z5m_-3g,1421
|
20
|
+
opik_optimizer/datasets/ragbench.py,sha256=bCt3S5KsfW_2wDK009aiGRXiIEHlLgL_OlXrXBFWEPI,1411
|
21
|
+
opik_optimizer/datasets/tiny_test.py,sha256=ysgkfCHsi018b0qy8OtuL2BUkOo-YEZVu4AnscJCA4E,1823
|
22
|
+
opik_optimizer/datasets/truthful_qa.py,sha256=xbRjW0UOm7oDN3jAnTZD7HChgDGspwhAhFpHV7zTtag,4166
|
23
|
+
opik_optimizer/demo/__init__.py,sha256=KSpFYhzN7fTmLEsIaciRHwxcJDeAiX5NDmYLdPsfpT8,150
|
24
|
+
opik_optimizer/demo/cache.py,sha256=5WqK8rSiijzU6s4VHIjLuL1LR5i1yHtY-x5FZTduSus,3669
|
25
|
+
opik_optimizer/demo/datasets.py,sha256=MezQlG4Q_cgSH7zQOmJcDwkGU8JV0xKSnZwCJGaj-88,2494
|
26
|
+
opik_optimizer/evolutionary_optimizer/__init__.py,sha256=OQ2ART5g-7EVGOISvTGY-AbmEyyDFEJJCsmJBzGJIpw,57
|
27
|
+
opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py,sha256=rrSZ8rUeUkVQ8qZwz16gY3TUoOwi0o-rVPZLumNeSWs,76650
|
28
|
+
opik_optimizer/few_shot_bayesian_optimizer/__init__.py,sha256=VuH7FOROyGcjMPryejtZC-5Y0QHlVTFLTGUDgNqRAFw,113
|
29
|
+
opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py,sha256=F4NQdm4YN7BMioxQdhzBZK8qhFEDG3qYhF--M29jfzQ,19334
|
30
|
+
opik_optimizer/few_shot_bayesian_optimizer/prompt_parameter.py,sha256=EDsSIFAUOfiZKWLrOAaBDB7Exk7cmIs4ccI95kVa7JY,3118
|
31
|
+
opik_optimizer/few_shot_bayesian_optimizer/prompt_templates.py,sha256=HmvD-UeT3aKiiet5cUtULXe6iFPEOo6hxyDE0pH2LnQ,2424
|
32
|
+
opik_optimizer/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
33
|
+
opik_optimizer/mipro_optimizer/__init__.py,sha256=CF9TVXjOxTobDO1kAS8CD4eyLVzEozxjfgoKwIO6ZpU,44
|
34
|
+
opik_optimizer/mipro_optimizer/_lm.py,sha256=bcTy2Y5HjSaFQOATIpUaA86eIp3vKHaMuDI2_RvN2ww,16376
|
35
|
+
opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py,sha256=CiQWe39LCp-81SZmLyMybIw2lc_0RBKcxclLEuSXQgI,39757
|
36
|
+
opik_optimizer/mipro_optimizer/mipro_optimizer.py,sha256=XFXlhDCMBS5wXIVds83F26ztKSkqF3IeEDXYJvL-dZc,23957
|
37
|
+
opik_optimizer/mipro_optimizer/utils.py,sha256=wG1koygXfm_pvtA1jR-YaU4NATPbJZoTI7LE7l7df7g,3273
|
38
|
+
opik_optimizer/optimization_config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
39
|
+
opik_optimizer/optimization_config/configs.py,sha256=MYL9H2UAqeyGBlBGWbOZ-6Snto4ZMuXnypgvVuUSW1Y,1132
|
40
|
+
opik_optimizer/optimization_config/mappers.py,sha256=RXgTMxPzTQ1AHGke6Zca6rTcfCI7IkCKhQYciaEGSAo,1698
|
41
|
+
opik_optimizer-0.8.1.dist-info/licenses/LICENSE,sha256=dTRSwwCHdWeSjzodvnivYqcwi8x3Qfr21yv65QUWWBE,1062
|
42
|
+
opik_optimizer-0.8.1.dist-info/METADATA,sha256=HPdTa0hebpLlqgFVY4Ue-VFaqDucUIPb1KrZS8w_rX4,7085
|
43
|
+
opik_optimizer-0.8.1.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
|
44
|
+
opik_optimizer-0.8.1.dist-info/top_level.txt,sha256=ondOlpq6_yFckqpxoAHSfzZS2N-JfgmA-QQhOJfz7m0,15
|
45
|
+
opik_optimizer-0.8.1.dist-info/RECORD,,
|