opik-optimizer 0.7.7__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- opik_optimizer/__init__.py +2 -0
- opik_optimizer/base_optimizer.py +6 -4
- opik_optimizer/datasets/__init__.py +27 -0
- opik_optimizer/datasets/ai2_arc.py +44 -0
- opik_optimizer/datasets/cnn_dailymail.py +40 -0
- opik_optimizer/datasets/election_questions.py +36 -0
- opik_optimizer/datasets/gsm8k.py +40 -0
- opik_optimizer/datasets/halu_eval.py +43 -0
- opik_optimizer/datasets/hotpot_qa.py +67 -0
- opik_optimizer/datasets/medhallu.py +39 -0
- opik_optimizer/datasets/rag_hallucinations.py +41 -0
- opik_optimizer/datasets/ragbench.py +40 -0
- opik_optimizer/datasets/tiny_test.py +57 -0
- opik_optimizer/datasets/truthful_qa.py +107 -0
- opik_optimizer/demo/datasets.py +53 -607
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +3 -1
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +88 -17
- opik_optimizer/logging_config.py +1 -1
- opik_optimizer/meta_prompt_optimizer.py +57 -11
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +164 -16
- opik_optimizer/mipro_optimizer/utils.py +8 -1
- opik_optimizer/optimization_result.py +11 -0
- opik_optimizer/task_evaluator.py +6 -1
- opik_optimizer/utils.py +0 -52
- opik_optimizer-0.8.0.dist-info/METADATA +196 -0
- opik_optimizer-0.8.0.dist-info/RECORD +45 -0
- opik_optimizer-0.7.7.dist-info/METADATA +0 -174
- opik_optimizer-0.7.7.dist-info/RECORD +0 -33
- {opik_optimizer-0.7.7.dist-info → opik_optimizer-0.8.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-0.7.7.dist-info → opik_optimizer-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-0.7.7.dist-info → opik_optimizer-0.8.0.dist-info}/top_level.txt +0 -0
opik_optimizer/mipro_optimizer/mipro_optimizer.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import Any, Dict, List, Tuple, Union, Optional, Literal
 import os
 import random
+from datetime import datetime
 
 import opik
 
@@ -37,11 +38,13 @@ logger = logging.getLogger(__name__) # Inherits config from setup_logging
 
 
 class MiproOptimizer(BaseOptimizer):
-    def __init__(self, model, project_name: Optional[str] = None, **model_kwargs):
-        super().__init__(model, project_name, **model_kwargs)
+    def __init__(self, model, project_name: Optional[str] = None, verbose: int = 1, **model_kwargs):
+        super().__init__(model, project_name, verbose=verbose, **model_kwargs)
         self.tools = []
         self.num_threads = self.model_kwargs.pop("num_threads", 6)
         self.model_kwargs["model"] = self.model
+        self.llm_call_counter = 0
+        # FIXME: add mipro_optimizer=True - It does not count the LLM calls made internally by DSPy during MiproOptimizer.optimizer.compile().
         lm = LM(**self.model_kwargs)
         opik_callback = OpikCallback(project_name=self.project_name, log_graph=True)
         dspy.configure(lm=lm, callbacks=[opik_callback])
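
For orientation, the widened constructor can be exercised as in the sketch below. It is a minimal, hypothetical example: the model identifier, project name, and extra keyword arguments are placeholders, and the import assumes `MiproOptimizer` is still re-exported from the package root as in 0.7.7.

```python
# Minimal sketch of the 0.8.0 constructor surface (placeholder values throughout).
from opik_optimizer import MiproOptimizer  # assumes the package-root re-export

optimizer = MiproOptimizer(
    model="openai/gpt-4o-mini",   # placeholder model name
    project_name="demo-project",  # placeholder Opik project
    verbose=1,                    # new in 0.8.0: forwarded to BaseOptimizer.__init__
    temperature=0.1,              # remaining kwargs end up in model_kwargs
)
print(optimizer.llm_call_counter)  # new counter, starts at 0
```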
@@ -56,6 +59,7 @@
         n_samples: int = 10,
         dataset_item_ids: Optional[List[str]] = None,
         experiment_config: Optional[Dict] = None,
+        verbose: int = 1,
         **kwargs,
     ) -> float:
         """
@@ -69,6 +73,7 @@
             n_samples: number of items to test in the dataset
             dataset_item_ids: Optional list of dataset item IDs to evaluate
             experiment_config: Optional configuration for the experiment
+            verbose: Verbosity level
             **kwargs: Additional arguments for evaluation
 
         Returns:
@@ -76,10 +81,14 @@
         """
         # FIMXE: call super when it is ready
         # FIXME: Intermediate values:
+        self.llm_call_counter += 1
         metric = metric_config.metric
         input_key = task_config.input_dataset_fields[0] # FIXME: allow all inputs
         output_key = task_config.output_dataset_field
 
+        # Kwargs might contain n_samples, passed from run_benchmark.py
+        n_samples = kwargs.pop("n_samples", None) # Get n_samples from kwargs if present
+
         if isinstance(dataset, str):
             opik_client = opik.Opik(project_name=self.project_name)
             dataset = opik_client.get_dataset(dataset)
@@ -144,12 +153,32 @@
 
             return result
 
-
-
-
+        # Robust n_samples handling for selecting dataset_item_ids
+        dataset_items_for_eval = dataset.get_items()
+        num_total_items = len(dataset_items_for_eval)
+        dataset_item_ids_to_use = dataset_item_ids # Use provided IDs if any
 
-
-        dataset_item_ids
+        if n_samples is not None: # If n_samples is specified by the caller (run_benchmark.py)
+            if dataset_item_ids is not None:
+                # This case should ideally be an error or a clear precedence rule.
+                # For now, let's assume if dataset_item_ids is provided, it takes precedence over n_samples.
+                logger.warning("MiproOptimizer.evaluate_prompt: Both n_samples and dataset_item_ids provided. Using provided dataset_item_ids.")
+                # dataset_item_ids_to_use is already dataset_item_ids
+            elif n_samples > num_total_items:
+                logger.warning(f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) > total items ({num_total_items}). Using all {num_total_items} items.")
+                dataset_item_ids_to_use = None # opik.evaluation.evaluate handles None as all items
+            elif n_samples <= 0:
+                logger.warning(f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) is <= 0. Using all {num_total_items} items.")
+                dataset_item_ids_to_use = None
+            else:
+                # n_samples is valid and dataset_item_ids was not provided, so sample now.
+                all_ids = [item["id"] for item in dataset_items_for_eval]
+                dataset_item_ids_to_use = random.sample(all_ids, n_samples)
+                logger.info(f"MiproOptimizer.evaluate_prompt: Sampled {n_samples} items for evaluation.")
+        else: # n_samples is None
+            if dataset_item_ids is None:
+                logger.info(f"MiproOptimizer.evaluate_prompt: n_samples is None and dataset_item_ids is None. Using all {num_total_items} items.")
+            # dataset_item_ids_to_use is already dataset_item_ids (which could be None)
 
         experiment_config = experiment_config or {}
         experiment_config = {
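
The branching above is the substantive change in `evaluate_prompt`. Its precedence rules can be restated in isolation; the helper below is illustrative only (it is not part of the package) and mirrors the same decision table:

```python
import random
from typing import List, Optional

def select_item_ids(
    all_ids: List[str],
    n_samples: Optional[int],
    dataset_item_ids: Optional[List[str]],
) -> Optional[List[str]]:
    """Mirror of the logic above: explicit IDs win; a valid n_samples triggers
    random sampling; None (or an out-of-range n_samples) means 'use all items'."""
    if dataset_item_ids is not None:
        return dataset_item_ids
    if n_samples is None or n_samples <= 0 or n_samples > len(all_ids):
        return None
    return random.sample(all_ids, n_samples)
```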
@@ -171,9 +200,10 @@
             # "reference" needs to match metric
             scoring_key_mapping={"reference": output_key},
             task_threads=self.num_threads,
-            dataset_item_ids=
+            dataset_item_ids=dataset_item_ids_to_use,
             project_name=self.project_name,
             experiment_config=experiment_config,
+            verbose=verbose,
         )
 
         # Calculate average score across all metrics
@@ -197,6 +227,8 @@
         num_candidates: int = 10,
         experiment_config: Optional[Dict] = None,
         num_trials: Optional[int] = 3,
+        n_samples: Optional[int] = 10,
+        auto: Optional[Literal["light", "medium", "heavy"]] = "light",
         **kwargs,
     ) -> OptimizationResult:
         self._opik_client = opik.Opik()
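
`optimize_prompt` gains `n_samples` (how many dataset items end up in the DSPy training set) and `auto` (the MIPRO `auto` preset). A hypothetical call, reusing the `optimizer` instance from the constructor sketch above and assuming the `dataset`, `metric_config`, and `task_config` arguments are built exactly as in 0.7.7:

```python
# Hypothetical invocation; only the two new keyword arguments are new here.
result = optimizer.optimize_prompt(
    dataset=dataset,              # assumed unchanged from 0.7.7
    metric_config=metric_config,  # assumed unchanged from 0.7.7
    task_config=task_config,      # assumed unchanged from 0.7.7
    num_trials=3,
    n_samples=50,                 # new: items sampled into the DSPy trainset
    auto="medium",                # new: "light" (default), "medium", or "heavy"
)
print(result.llm_calls)           # new OptimizationResult field (may be None)
```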
@@ -205,6 +237,7 @@
             optimization = self._opik_client.create_optimization(
                 dataset_name=dataset.name,
                 objective_name=metric_config.metric.name,
+                metadata={"optimizer": self.__class__.__name__},
             )
         except Exception:
             logger.warning(
@@ -224,6 +257,8 @@
             experiment_config=experiment_config,
             optimization_id=optimization.id if optimization is not None else None,
             num_trials=num_trials,
+            n_samples=n_samples,
+            auto=auto,
             **kwargs,
         )
         if optimization:
@@ -244,6 +279,8 @@
         experiment_config: Optional[Dict] = None,
         optimization_id: Optional[str] = None,
         num_trials: Optional[int] = 3,
+        n_samples: Optional[int] = 10,
+        auto: Optional[Literal["light", "medium", "heavy"]] = "light",
         **kwargs,
     ) -> OptimizationResult:
         logger.info("Preparing MIPRO optimization...")
@@ -255,6 +292,8 @@
             experiment_config=experiment_config,
             optimization_id=optimization_id,
             num_trials=num_trials,
+            n_samples=n_samples,
+            auto=auto,
             **kwargs,
         )
         logger.info("Starting MIPRO compilation...")
@@ -271,21 +310,24 @@
         experiment_config: Optional[Dict] = None,
         optimization_id: Optional[str] = None,
         num_trials: Optional[int] = 3,
-
+        n_samples: Optional[int] = 10,
+        auto: Optional[Literal["light", "medium", "heavy"]] = "light",
         **kwargs,
     ) -> None:
         # FIXME: Intermediate values:
+        self.llm_call_counter = 0
         metric = metric_config.metric
         prompt = task_config.instruction_prompt
         input_key = task_config.input_dataset_fields[0] # FIXME: allow all
         output_key = task_config.output_dataset_field
         self.tools = task_config.tools
         self.num_candidates = num_candidates
-        self.seed =
+        self.seed = 42
         self.input_key = input_key
         self.output_key = output_key
         self.prompt = prompt
         self.num_trials = num_trials
+        self.n_samples = n_samples
         self.auto = auto
 
         # Convert to values for MIPRO:
@@ -302,7 +344,7 @@
             if self.output_key not in row:
                 raise Exception("row does not contain output_key: %r" % self.output_key)
 
-        self.trainset = create_dspy_training_set(self.dataset, self.input_key)
+        self.trainset = create_dspy_training_set(self.dataset, self.input_key, self.n_samples)
         self.data_signature = create_dspy_signature(
             self.input_key, self.output_key, self.prompt
         )
@@ -327,7 +369,7 @@
                 "metric": metric.name,
                 "num_threads": self.num_threads,
                 "num_candidates": self.num_candidates,
-                "
+                "num_trials": self.num_trials,
                 "dataset": dataset.name,
             },
         }
@@ -337,7 +379,7 @@
             metric=self.metric_function,
             auto=self.auto,
             num_threads=self.num_threads,
-            verbose=
+            verbose=(self.verbose == 1),
             num_candidates=self.num_candidates,
             seed=self.seed,
             opik_prompt_task_config=task_config,
@@ -363,6 +405,9 @@
         """
         Continue to look for optimizations
         """
+        if not hasattr(self, 'optimizer') or not self.optimizer:
+            raise RuntimeError("MiproOptimizer not prepared. Call prepare_optimize_prompt first.")
+
         self.results = self.optimizer.compile(
             student=self.module,
             trainset=self.trainset,
@@ -375,12 +420,114 @@
             key=lambda item: item["score"],
             reverse=True,
         )
+
+        mipro_history_processed = []
+        # self.num_candidates is set in prepare_optimize_prompt, defaults to 10
+        # If self.num_candidates is 0 or None, this logic might break or be odd.
+        # Add a safeguard for num_candidates_per_round if self.num_candidates is not usable.
+        num_candidates_per_round = self.num_candidates if hasattr(self, 'num_candidates') and self.num_candidates and self.num_candidates > 0 else 1
+
+        for i, candidate_data in enumerate(self.results.candidate_programs):
+            program_module = candidate_data.get("program")
+            instruction = "N/A"
+            if hasattr(program_module, 'signature') and hasattr(program_module.signature, 'instructions'):
+                instruction = program_module.signature.instructions
+            elif hasattr(program_module, 'extended_signature') and hasattr(program_module.extended_signature, 'instructions'):
+                instruction = program_module.extended_signature.instructions
+            elif hasattr(program_module, 'predictor') and hasattr(program_module.predictor, 'signature') and hasattr(program_module.predictor.signature, 'instructions'):
+                instruction = program_module.predictor.signature.instructions
+
+            # Remove R and C calculation for Mipro as its history is flat
+            # current_round_number = (i // num_candidates_per_round) + 1
+            # current_candidate_in_round = (i % num_candidates_per_round) + 1
+
+            iter_detail = {
+                "iteration": i + 1,
+                # "round_number": current_round_number, # Remove round_number
+                # "candidate_in_round": current_candidate_in_round, # Remove candidate_in_round
+                "timestamp": datetime.now().isoformat(),
+                "prompt_candidate": instruction,
+                "parameters_used": {
+                    "program_summary": str(program_module)[:500]
+                },
+                "scores": [], # Initialize scores list
+                "tokens_used": None, # TODO: add tokens_used
+                "cost": None, # TODO: add cost
+                "duration_seconds": None, # TODO: add duration_seconds
+            }
+
+            current_score = candidate_data.get("score")
+            metric_name_for_history = self.opik_metric.name if hasattr(self, 'opik_metric') and self.opik_metric else "unknown_metric"
+
+            # Unscale if it's a known 0-1 metric that MIPRO might scale to 0-100
+            # For now, specifically targeting Levenshtein-like metrics
+            if isinstance(current_score, (float, int)) and \
+               ("levenshtein" in metric_name_for_history.lower() or "similarity" in metric_name_for_history.lower()):
+                # Assuming scores like 32.4 are 0-1 scores scaled by 100
+                if abs(current_score) > 1.0: # A simple check to see if it looks scaled
+                    logger.debug(f"Mipro history: Unscaling score {current_score} for metric {metric_name_for_history} by dividing by 100.")
+                    current_score /= 100.0
+
+            iter_detail["scores"].append({
+                "metric_name": metric_name_for_history,
+                "score": current_score,
+                "opik_evaluation_id": None # TODO: add opik_evaluation_id
+            })
+            mipro_history_processed.append(iter_detail)
+
+        if not self.best_programs:
+            logger.warning("MIPRO compile returned no candidate programs.")
+            return OptimizationResult(
+                optimizer="MiproOptimizer",
+                prompt=self.prompt,
+                score=0.0,
+                metric_name=self.opik_metric.name if hasattr(self, 'opik_metric') else "unknown_metric",
+                details={"error": "No candidate programs generated by MIPRO"},
+                history=mipro_history_processed,
+                llm_calls=self.llm_call_counter
+            )
+
         self.module = self.get_best().details["program"]
-
+        best_program_details = self.get_best()
+
+        # Unscale the main score if necessary, similar to history scores
+        final_best_score = best_program_details.score
+        final_metric_name = best_program_details.metric_name
+        if isinstance(final_best_score, (float, int)) and \
+           final_metric_name and \
+           ("levenshtein" in final_metric_name.lower() or "similarity" in final_metric_name.lower()):
+            if abs(final_best_score) > 1.0: # A simple check to see if it looks scaled
+                logger.debug(f"Mipro main result: Unscaling score {final_best_score} for metric {final_metric_name} by dividing by 100.")
+                final_best_score /= 100.0
+
+        return OptimizationResult(
+            optimizer="MiproOptimizer",
+            prompt=best_program_details.prompt,
+            tool_prompts=best_program_details.tool_prompts,
+            score=final_best_score, # Use the potentially unscaled score
+            metric_name=final_metric_name,
+            demonstrations=best_program_details.demonstrations,
+            details=best_program_details.details,
+            history=mipro_history_processed,
+            llm_calls=self.llm_call_counter
+        )
 
     def get_best(self, position: int = 0) -> OptimizationResult:
+        if not hasattr(self, 'best_programs') or not self.best_programs:
+            logger.error("get_best() called but no best_programs found. MIPRO compile might have failed or yielded no results.")
+            return OptimizationResult(
+                optimizer="MiproOptimizer",
+                prompt=getattr(self, 'prompt', "Error: Initial prompt not found"),
+                score=0.0,
+                metric_name=getattr(self, 'opik_metric', None).name if hasattr(self, 'opik_metric') and self.opik_metric else "unknown_metric",
+                details={"error": "No programs generated or compile failed"},
+                history=[],
+                llm_calls=self.llm_call_counter
+            )
+
         score = self.best_programs[position]["score"]
-
+        program_module = self.best_programs[position]["program"]
+        state = program_module.dump_state()
         if self.tools:
             tool_names = [tool.__name__ for tool in self.tools]
             tool_prompts = get_tool_prompts(
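
For readers of the new `history` field, each entry produced by the loop above has roughly the following shape (all values below are invented for illustration):

```python
# Illustrative shape of one mipro_history_processed entry; values are made up.
example_entry = {
    "iteration": 1,
    "timestamp": "2025-01-01T12:00:00",
    "prompt_candidate": "Answer the question as concisely as possible.",
    "parameters_used": {"program_summary": "Predict(StringSignature(question -> answer))"},
    "scores": [
        {"metric_name": "levenshtein_ratio", "score": 0.42, "opik_evaluation_id": None},
    ],
    "tokens_used": None,       # not populated yet (TODO in the source)
    "cost": None,              # not populated yet
    "duration_seconds": None,  # not populated yet
}
```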
@@ -400,5 +547,6 @@
             score=score,
             metric_name=self.opik_metric.name,
             demonstrations=demos,
-            details={"program":
+            details={"program": program_module},
+            llm_calls=self.llm_call_counter
         )
opik_optimizer/mipro_optimizer/utils.py
CHANGED
@@ -3,6 +3,7 @@ from typing import Any, Dict, List, Tuple, Union, Optional
 import uuid
 import dspy
 import re
+import random
 
 from dspy.signatures.signature import make_signature
 
@@ -78,11 +79,17 @@ def opik_metric_to_dspy(metric, output):
    return opik_metric_score_wrapper
 
 
-def create_dspy_training_set(
+def create_dspy_training_set(
+    data: list[dict], input: str, n_samples: Optional[int] = None
+) -> list[dspy.Example]:
     """
     Turn a list of dicts into a list of dspy Examples
     """
     output = []
+
+    if n_samples is not None:
+        data = random.sample(data, n_samples)
+
     for example in data:
         example_obj = dspy.Example(
             **example, dspy_uuid=str(uuid.uuid4()), dspy_split="train"
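
A small sketch of the updated helper's behaviour with the optional sampling argument; the rows and field names below are illustrative, and the import path simply follows the file location shown in this diff:

```python
from opik_optimizer.mipro_optimizer.utils import create_dspy_training_set

rows = [
    {"question": "2 + 2?", "answer": "4"},
    {"question": "Capital of France?", "answer": "Paris"},
    {"question": "Largest planet?", "answer": "Jupiter"},
]

# n_samples=None converts every row; n_samples=2 first draws a random subset
# of two rows via random.sample, then converts them to dspy.Example objects.
examples = create_dspy_training_set(rows, input="question", n_samples=2)
assert len(examples) == 2
```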
opik_optimizer/optimization_result.py
CHANGED
@@ -7,6 +7,15 @@ from pydantic import BaseModel, Field
 from .base_optimizer import OptimizationRound # Adjust import as necessary
 import rich
 
+class OptimizationStep(BaseModel):
+    """Represents a single step or trial in an optimization process."""
+    step: int
+    score: Optional[float] = None
+    prompt: Optional[Union[str, List[Dict[str, str]]]] = None
+    parameters: Optional[Dict[str, Any]] = None
+    timestamp: Optional[str] = None
+    # Add other relevant details per step if needed
+
 
 class OptimizationResult(pydantic.BaseModel):
     """Result of an optimization run."""
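
The new `OptimizationStep` model is a plain Pydantic container; a sketch with made-up values (pydantic v2 is assumed, matching the `pydantic.ConfigDict` usage in this file):

```python
step = OptimizationStep(
    step=1,
    score=0.73,
    prompt="Answer the question in one sentence.",
    parameters={"temperature": 0.1},
    timestamp="2025-01-01T12:00:00",
)
print(step.model_dump())  # pydantic v2 serialization
```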
@@ -28,6 +37,8 @@
     demonstrations: Optional[List[Dict[str, Any]]] = None
     optimizer: str = "Optimizer"
     tool_prompts: Optional[Dict[str, str]] = None
+    opik_metadata: Optional[Dict[str, Any]] = None
+    llm_calls: Optional[int] = None
 
     model_config = pydantic.ConfigDict(arbitrary_types_allowed=True)
 
opik_optimizer/task_evaluator.py
CHANGED
@@ -1,11 +1,12 @@
 import opik
-
+import logging
 from typing import Any, Callable, Dict, List, Optional
 from opik_optimizer.optimization_config.configs import MetricConfig
 from opik.evaluation.metrics import score_result
 
 from opik.evaluation import evaluator as opik_evaluator
 
+logger = logging.getLogger(__name__)
 
 def evaluate(
     dataset: opik.Dataset,
@@ -17,6 +18,7 @@
     project_name: Optional[str] = None,
     n_samples: Optional[int] = None,
     experiment_config: Optional[Dict[str, Any]] = None,
+    verbose: int = 1,
 ) -> float:
     """
     Evaluate a task on a dataset.
@@ -31,6 +33,7 @@
         num_threads: Number of threads to use for evaluation.
         experiment_config: The dictionary with parameters that describe experiment
         optimization_id: Optional optimization ID for the experiment.
+        verbose: Whether to print debug information.
 
     Returns:
         float: The average score of the evaluated task.
@@ -71,6 +74,7 @@
             task_threads=num_threads,
             nb_samples=n_samples,
             experiment_config=experiment_config,
+            verbose=verbose,
         )
     else:
         result = opik_evaluator.evaluate(
@@ -83,6 +87,7 @@
             task_threads=num_threads,
             nb_samples=n_samples,
             experiment_config=experiment_config,
+            verbose=verbose,
         )
 
     if not result.test_results:
opik_optimizer/utils.py
CHANGED
@@ -8,9 +8,6 @@ from opik.api_objects.opik_client import Opik
 
 from typing import List, Dict, Any, Optional, Callable, TYPE_CHECKING
 
-# Test dataset name for optimizer examples
-TEST_DATASET_NAME = "tiny-test-optimizer"
-
 # Type hint for OptimizationResult without circular import
 if TYPE_CHECKING:
     from .optimization_result import OptimizationResult
@@ -79,54 +76,5 @@ def get_random_seed() -> int:
 
     return random.randint(0, 2**32 - 1)
 
-
-def get_or_create_dataset(
-    dataset_name: str,
-    description: str,
-    data_loader: Callable[[], List[Dict[str, Any]]],
-    project_name: Optional[str] = None,
-) -> opik.Dataset:
-    """
-    Get an existing dataset or create a new one if it doesn't exist.
-
-    Args:
-        dataset_name: Name of the dataset
-        description: Description of the dataset
-        data: Optional data to insert into the dataset
-        project_name: Optional project name
-
-    Returns:
-        opik.Dataset: The dataset object
-    """
-    client = Opik(project_name=project_name)
-
-    try:
-        # Try to get existing dataset
-        dataset = client.get_dataset(dataset_name)
-        # If dataset exists but has no data, delete it
-        if not dataset.get_items():
-            print("Dataset exists but is empty - deleting it...")
-            # Delete all items in the dataset
-            items = dataset.get_items()
-            if items:
-                dataset.delete(items_ids=[item.id for item in items])
-            # Delete the dataset itself
-            client.delete_dataset(dataset_name)
-            raise Exception("Dataset deleted, will create new one")
-    except Exception:
-        # Create new dataset
-        print("Creating new dataset...")
-        dataset = client.create_dataset(name=dataset_name, description=description)
-
-    dataset_items = data_loader()
-    dataset.insert(dataset_items)
-
-    # Verify data was added
-    if not dataset.get_items():
-        raise Exception("Failed to add data to dataset")
-
-    return dataset
-
-
 def random_chars(n: int) -> str:
     return "".join(random.choice(string.ascii_letters) for _ in range(n))