opik-optimizer 0.9.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- opik_optimizer/__init__.py +7 -3
- opik_optimizer/_throttle.py +8 -8
- opik_optimizer/base_optimizer.py +98 -45
- opik_optimizer/cache_config.py +5 -3
- opik_optimizer/datasets/ai2_arc.py +15 -13
- opik_optimizer/datasets/cnn_dailymail.py +19 -15
- opik_optimizer/datasets/election_questions.py +10 -11
- opik_optimizer/datasets/gsm8k.py +16 -11
- opik_optimizer/datasets/halu_eval.py +6 -5
- opik_optimizer/datasets/hotpot_qa.py +17 -16
- opik_optimizer/datasets/medhallu.py +10 -7
- opik_optimizer/datasets/rag_hallucinations.py +11 -8
- opik_optimizer/datasets/ragbench.py +17 -9
- opik_optimizer/datasets/tiny_test.py +33 -37
- opik_optimizer/datasets/truthful_qa.py +18 -12
- opik_optimizer/demo/cache.py +6 -6
- opik_optimizer/demo/datasets.py +3 -7
- opik_optimizer/evolutionary_optimizer/__init__.py +3 -1
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +748 -437
- opik_optimizer/evolutionary_optimizer/reporting.py +155 -76
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +291 -181
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +79 -28
- opik_optimizer/logging_config.py +19 -15
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +234 -138
- opik_optimizer/meta_prompt_optimizer/reporting.py +121 -47
- opik_optimizer/mipro_optimizer/__init__.py +2 -0
- opik_optimizer/mipro_optimizer/_lm.py +41 -9
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +37 -26
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +135 -67
- opik_optimizer/mipro_optimizer/utils.py +5 -2
- opik_optimizer/optimizable_agent.py +179 -0
- opik_optimizer/optimization_config/chat_prompt.py +143 -73
- opik_optimizer/optimization_config/configs.py +4 -3
- opik_optimizer/optimization_config/mappers.py +18 -6
- opik_optimizer/optimization_result.py +28 -20
- opik_optimizer/py.typed +0 -0
- opik_optimizer/reporting_utils.py +96 -46
- opik_optimizer/task_evaluator.py +12 -14
- opik_optimizer/utils.py +122 -37
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/METADATA +8 -8
- opik_optimizer-1.0.0.dist-info/RECORD +50 -0
- opik_optimizer-0.9.1.dist-info/RECORD +0 -48
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/WHEEL +0 -0
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/top_level.txt +0 -0

opik_optimizer/mipro_optimizer/mipro_optimizer.py
@@ -2,6 +2,7 @@ import os
 import random
 from datetime import datetime
 from typing import Callable, Dict, List, Literal, Optional, Union
+import logging

 import dspy
 import litellm
@@ -16,7 +17,6 @@ from ..optimization_result import OptimizationResult
 from ..utils import optimization_context
 from ..base_optimizer import BaseOptimizer
 from ..optimization_config.configs import TaskConfig
-from ..optimization_result import OptimizationResult
 from ._lm import LM
 from ._mipro_optimizer_v2 import MIPROv2
 from .utils import (
@@ -30,23 +30,26 @@ from .utils import (
 disk_cache_dir = os.path.expanduser("~/.litellm_cache")
 litellm.cache = Cache(type="disk", disk_cache_dir=disk_cache_dir)

-# Set up logging
-import logging
-
 logger = logging.getLogger(__name__)  # Inherits config from setup_logging


 class MiproOptimizer(BaseOptimizer):
-    def __init__(
-
+    def __init__(
+        self,
+        model,
+        project_name: Optional[str] = None,
+        verbose: int = 1,
+        **model_kwargs,
+    ):
+        super().__init__(model=model, verbose=verbose, **model_kwargs)
         self.tools = []
+        self.project_name = project_name
         self.num_threads = self.model_kwargs.pop("num_threads", 6)
         self.model_kwargs["model"] = self.model
-        self.llm_call_counter = 0
         # FIXME: add mipro_optimizer=True - It does not count the LLM calls made internally by DSPy during MiproOptimizer.optimizer.compile().
-        lm = LM(**self.model_kwargs)
+        self.lm = LM(**self.model_kwargs)
         opik_callback = OpikCallback(project_name=self.project_name, log_graph=True)
-        dspy.configure(lm=lm, callbacks=[opik_callback])
+        dspy.configure(lm=self.lm, callbacks=[opik_callback])
         logger.debug(f"Initialized MiproOptimizer with model: {model}")

     def evaluate_prompt(
@@ -54,7 +57,7 @@ class MiproOptimizer(BaseOptimizer):
         dataset: Union[str, Dataset],
         metric: Callable,
         task_config: TaskConfig,
-        prompt: Union[str, dspy.Module, OptimizationResult] = None,
+        prompt: Optional[Union[str, dspy.Module, OptimizationResult]] = None,
         n_samples: int = 10,
         dataset_item_ids: Optional[List[str]] = None,
         experiment_config: Optional[Dict] = None,
@@ -85,7 +88,9 @@ class MiproOptimizer(BaseOptimizer):
         output_key = task_config.output_dataset_field

         # Kwargs might contain n_samples, passed from run_benchmark.py
-        n_samples = kwargs.pop(
+        n_samples = kwargs.pop(
+            "n_samples", None
+        )  # Get n_samples from kwargs if present

         if isinstance(dataset, str):
             opik_client = opik.Opik(project_name=self.project_name)
@@ -154,28 +159,42 @@ class MiproOptimizer(BaseOptimizer):
         # Robust n_samples handling for selecting dataset_item_ids
         dataset_items_for_eval = dataset.get_items()
         num_total_items = len(dataset_items_for_eval)
-        dataset_item_ids_to_use = dataset_item_ids
+        dataset_item_ids_to_use = dataset_item_ids  # Use provided IDs if any

-        if
+        if (
+            n_samples is not None
+        ):  # If n_samples is specified by the caller (run_benchmark.py)
             if dataset_item_ids is not None:
                 # This case should ideally be an error or a clear precedence rule.
                 # For now, let's assume if dataset_item_ids is provided, it takes precedence over n_samples.
-                logger.warning(
+                logger.warning(
+                    "MiproOptimizer.evaluate_prompt: Both n_samples and dataset_item_ids provided. Using provided dataset_item_ids."
+                )
                 # dataset_item_ids_to_use is already dataset_item_ids
             elif n_samples > num_total_items:
-                logger.warning(
-
+                logger.warning(
+                    f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) > total items ({num_total_items}). Using all {num_total_items} items."
+                )
+                dataset_item_ids_to_use = (
+                    None  # opik.evaluation.evaluate handles None as all items
+                )
             elif n_samples <= 0:
-                logger.warning(
+                logger.warning(
+                    f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) is <= 0. Using all {num_total_items} items."
+                )
                 dataset_item_ids_to_use = None
             else:
                 # n_samples is valid and dataset_item_ids was not provided, so sample now.
                 all_ids = [item["id"] for item in dataset_items_for_eval]
                 dataset_item_ids_to_use = random.sample(all_ids, n_samples)
-                logger.info(
-
+                logger.info(
+                    f"MiproOptimizer.evaluate_prompt: Sampled {n_samples} items for evaluation."
+                )
+        else:  # n_samples is None
             if dataset_item_ids is None:
-                logger.info(
+                logger.info(
+                    f"MiproOptimizer.evaluate_prompt: n_samples is None and dataset_item_ids is None. Using all {num_total_items} items."
+                )
             # dataset_item_ids_to_use is already dataset_item_ids (which could be None)

         experiment_config = experiment_config or {}
@@ -231,10 +250,10 @@ class MiproOptimizer(BaseOptimizer):
     ) -> OptimizationResult:
         self._opik_client = opik.Opik()
         with optimization_context(
-
-
-
-
+            client=self._opik_client,
+            dataset_name=dataset.name,
+            objective_name=metric.__name__,
+            metadata={"optimizer": self.__class__.__name__},
         ) as optimization:
             result = self._optimize_prompt(
                 dataset=dataset,
@@ -323,7 +342,9 @@ class MiproOptimizer(BaseOptimizer):
         if self.output_key not in row:
             raise Exception("row does not contain output_key: %r" % self.output_key)

-        self.trainset = create_dspy_training_set(
+        self.trainset = create_dspy_training_set(
+            self.dataset, self.input_key, self.n_samples
+        )
         self.data_signature = create_dspy_signature(
             self.input_key, self.output_key, self.prompt
         )
@@ -384,8 +405,10 @@ class MiproOptimizer(BaseOptimizer):
         """
         Continue to look for optimizations
         """
-        if not hasattr(self,
-            raise RuntimeError(
+        if not hasattr(self, "optimizer") or not self.optimizer:
+            raise RuntimeError(
+                "MiproOptimizer not prepared. Call prepare_optimize_prompt first."
+            )

         self.results = self.optimizer.compile(
             student=self.module,
@@ -404,16 +427,30 @@ class MiproOptimizer(BaseOptimizer):
         # self.num_candidates is set in prepare_optimize_prompt, defaults to 10
         # If self.num_candidates is 0 or None, this logic might break or be odd.
         # Add a safeguard for num_candidates_per_round if self.num_candidates is not usable.
-        num_candidates_per_round =
+        num_candidates_per_round = (  # noqa
+            self.num_candidates
+            if hasattr(self, "num_candidates")
+            and self.num_candidates
+            and self.num_candidates > 0
+            else 1
+        )

         for i, candidate_data in enumerate(self.results.candidate_programs):
             program_module = candidate_data.get("program")
             instruction = "N/A"
-            if hasattr(program_module,
+            if hasattr(program_module, "signature") and hasattr(
+                program_module.signature, "instructions"
+            ):
                 instruction = program_module.signature.instructions
-            elif hasattr(program_module,
+            elif hasattr(program_module, "extended_signature") and hasattr(
+                program_module.extended_signature, "instructions"
+            ):
                 instruction = program_module.extended_signature.instructions
-            elif
+            elif (
+                hasattr(program_module, "predictor")
+                and hasattr(program_module.predictor, "signature")
+                and hasattr(program_module.predictor.signature, "instructions")
+            ):
                 instruction = program_module.predictor.signature.instructions

             # Remove R and C calculation for Mipro as its history is flat
@@ -426,13 +463,11 @@ class MiproOptimizer(BaseOptimizer):
                 # "candidate_in_round": current_candidate_in_round, # Remove candidate_in_round
                 "timestamp": datetime.now().isoformat(),
                 "prompt_candidate": instruction,
-                "parameters_used": {
-
-
-                "
-                "
-                "cost": None,  # TODO: add cost
-                "duration_seconds": None,  # TODO: add duration_seconds
+                "parameters_used": {"program_summary": str(program_module)[:500]},
+                "scores": [],  # Initialize scores list
+                "tokens_used": None,  # TODO: add tokens_used
+                "cost": None,  # TODO: add cost
+                "duration_seconds": None,  # TODO: add duration_seconds
             }

             current_score = candidate_data.get("score")
@@ -440,70 +475,103 @@ class MiproOptimizer(BaseOptimizer):

             # Unscale if it's a known 0-1 metric that MIPRO might scale to 0-100
             # For now, specifically targeting Levenshtein-like metrics
-            if isinstance(current_score, (float, int)) and
-
+            if isinstance(current_score, (float, int)) and (
+                "levenshtein" in metric_name_for_history.lower()
+                or "similarity" in metric_name_for_history.lower()
+            ):
                 # Assuming scores like 32.4 are 0-1 scores scaled by 100
-                if abs(current_score) > 1.0:
-                    logger.debug(
+                if abs(current_score) > 1.0:  # A simple check to see if it looks scaled
+                    logger.debug(
+                        f"Mipro history: Unscaling score {current_score} for metric {metric_name_for_history} by dividing by 100."
+                    )
                     current_score /= 100.0
-
-            iter_detail["scores"].append(
-
-
-
-
+
+            iter_detail["scores"].append(
+                {
+                    "metric_name": metric_name_for_history,
+                    "score": current_score,
+                    "opik_evaluation_id": None,  # TODO: add opik_evaluation_id
+                }
+            )
             mipro_history_processed.append(iter_detail)

         if not self.best_programs:
             logger.warning("MIPRO compile returned no candidate programs.")
             return OptimizationResult(
                 optimizer="MiproOptimizer",
-                prompt=[
+                prompt=[
+                    {
+                        "role": "user",
+                        "content": getattr(
+                            self, "prompt", "Error: Initial prompt not found"
+                        ),
+                    }
+                ],
                 score=0.0,
-                metric_name=self.opik_metric.__name__
+                metric_name=self.opik_metric.__name__
+                if hasattr(self, "opik_metric")
+                else "unknown_metric",
                 details={"error": "No candidate programs generated by MIPRO"},
                 history=mipro_history_processed,
-                llm_calls=self.llm_call_counter
+                llm_calls=self.lm.llm_call_counter,
             )

         self.module = self.get_best().details["program"]
         best_program_details = self.get_best()
-
+
         # Unscale the main score if necessary, similar to history scores
         final_best_score = best_program_details.score
         final_metric_name = best_program_details.metric_name
-        if
-
-
-
-
+        if (
+            isinstance(final_best_score, (float, int))
+            and final_metric_name
+            and (
+                "levenshtein" in final_metric_name.lower()
+                or "similarity" in final_metric_name.lower()
+            )
+        ):
+            if abs(final_best_score) > 1.0:  # A simple check to see if it looks scaled
+                logger.debug(
+                    f"Mipro main result: Unscaling score {final_best_score} for metric {final_metric_name} by dividing by 100."
+                )
                 final_best_score /= 100.0

         return OptimizationResult(
             optimizer="MiproOptimizer",
             prompt=best_program_details.prompt,
             tool_prompts=best_program_details.tool_prompts,
-            score=final_best_score,
+            score=final_best_score,  # Use the potentially unscaled score
             metric_name=final_metric_name,
             demonstrations=best_program_details.demonstrations,
             details=best_program_details.details,
             history=mipro_history_processed,
-            llm_calls=self.llm_call_counter
+            llm_calls=self.lm.llm_call_counter,
         )

     def get_best(self, position: int = 0) -> OptimizationResult:
-        if not hasattr(self,
-            logger.error(
+        if not hasattr(self, "best_programs") or not self.best_programs:
+            logger.error(
+                "get_best() called but no best_programs found. MIPRO compile might have failed or yielded no results."
+            )
             return OptimizationResult(
                 optimizer="MiproOptimizer",
-                prompt=[
-
-
+                prompt=[
+                    {
+                        "role": "user",
+                        "content": getattr(
+                            self, "prompt", "Error: Initial prompt not found"
+                        ),
+                    }
+                ],
+                score=0.0,
+                metric_name=getattr(self, "opik_metric", None).name
+                if hasattr(self, "opik_metric") and self.opik_metric
+                else "unknown_metric",
                 details={"error": "No programs generated or compile failed"},
                 history=[],
-                llm_calls=self.llm_call_counter
+                llm_calls=self.lm.llm_call_counter,
             )
-
+
         score = self.best_programs[position]["score"]
         program_module = self.best_programs[position]["program"]
         state = program_module.dump_state()
@@ -528,5 +596,5 @@ class MiproOptimizer(BaseOptimizer):
             metric_name=self.opik_metric.__name__,
             demonstrations=demos,
             details={"program": program_module},
-            llm_calls=self.llm_call_counter
+            llm_calls=self.lm.llm_call_counter,
         )
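
Note: in 1.0.0 the constructor takes `project_name` and `verbose` as explicit keyword arguments, forwards any remaining kwargs as model kwargs, and keeps the LLM call count on the dspy `LM` wrapper (`self.lm.llm_call_counter`) rather than on the optimizer. A minimal usage sketch, assuming the top-level `MiproOptimizer` export; the model string, project name, and `temperature` value are placeholders:

```python
from opik_optimizer import MiproOptimizer  # assumed top-level export

optimizer = MiproOptimizer(
    model="openai/gpt-4o-mini",  # placeholder LiteLLM model string
    project_name="my-project",   # explicit keyword argument in 1.0.0
    verbose=1,                   # new in 1.0.0
    temperature=0.1,             # extra kwargs are forwarded as model_kwargs
)

# LLM calls are now counted on the dspy LM wrapper, not on the optimizer itself:
print(optimizer.lm.llm_call_counter)
```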

opik_optimizer/mipro_optimizer/utils.py
@@ -1,4 +1,4 @@
-from typing import
+from typing import Dict, Optional

 import uuid
 import dspy
@@ -46,7 +46,10 @@ def opik_metric_to_dspy(metric, output):
     def opik_metric_score_wrapper(example, prediction, trace=None):
         try:
             # Calculate the score using the metric
-            score_result = metric(
+            score_result = metric(
+                dataset_item=example.toDict(),
+                llm_output=getattr(prediction, answer_field, ""),
+            )
             return (
                 score_result.value if hasattr(score_result, "value") else score_result
            )
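
The wrapper now calls the metric with keyword arguments, `metric(dataset_item=..., llm_output=...)`, and accepts either a plain number or an object exposing `.value`. A minimal metric sketch matching that calling convention; the `answer` field is a placeholder, as real datasets may use other keys:

```python
from typing import Any, Dict


def exact_match(dataset_item: Dict[str, Any], llm_output: str) -> float:
    """Toy metric using the keyword signature passed by opik_metric_score_wrapper."""
    # "answer" is a placeholder dataset field name.
    expected = str(dataset_item.get("answer", "")).strip().lower()
    return 1.0 if expected and expected == llm_output.strip().lower() else 0.0
```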

opik_optimizer/optimizable_agent.py (new file)
@@ -0,0 +1,179 @@
+from typing import Dict, Any, List, Optional, TYPE_CHECKING
+import json
+import os
+
+
+from opik.opik_context import get_current_span_data
+
+import litellm
+from litellm.integrations.opik.opik import OpikLogger
+
+from . import _throttle
+
+_limiter = _throttle.get_rate_limiter_for_current_opik_installation()
+
+if TYPE_CHECKING:
+    from .optimization_config.chat_prompt import ChatPrompt
+
+
+def tools_to_dict(tools: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
+    retval = {}
+    for name in tools:
+        parts = {}
+        for part in tools[name]:
+            if isinstance(tools[name][part], (int, float, str)):
+                parts[part] = tools[name][part]
+        if parts:
+            retval[name] = parts
+    return retval
+
+
+class OptimizableAgent:
+    """
+    An agent class to subclass to make an Optimizable Agent.
+
+    Attributes:
+        model (Optional[str]): The model to use for the agent
+        model_kwargs (Dict[str, Any]): Additional keyword arguments for the model
+        project_name (Optional[str]): The project name for tracking
+    """
+
+    model: Optional[str] = None
+    model_kwargs: Dict[str, Any] = {}
+    project_name: Optional[str] = "Default Project"
+    input_dataset_field: Optional[str] = None
+    prompts: Dict[str, "ChatPrompt"]
+    prompt: "ChatPrompt"
+
+    def __init__(self, prompt: "ChatPrompt") -> None:
+        """
+        Initialize the OptimizableAgent.
+
+        Args:
+            prompt: a chat prompt
+        """
+        self.init_llm()
+        self.init_agent(prompt)
+
+    def init_llm(self) -> None:
+        """Initialize the LLM with the appropriate callbacks."""
+        # Litellm bug requires this (maybe problematic if multi-threaded)
+        os.environ["OPIK_PROJECT_NAME"] = str(self.project_name)
+        self.opik_logger = OpikLogger()
+        litellm.callbacks = [self.opik_logger]
+
+    def init_agent(self, prompt: "ChatPrompt") -> None:
+        """Initialize the agent with the provided configuration."""
+        # Register the tools, if any, for default LiteLLM Agent use:
+        self.prompt = prompt
+
+    @_throttle.rate_limited(_limiter)
+    def _llm_complete(
+        self,
+        messages: List[Dict[str, str]],
+        tools: Optional[List[Dict[str, str]]],
+        seed: int,
+    ) -> Any:
+        response = litellm.completion(
+            model=self.model,
+            messages=messages,
+            seed=seed,
+            tools=tools,
+            metadata={
+                "opik": {
+                    "current_span_data": get_current_span_data(),
+                },
+            },
+            **self.model_kwargs,
+        )
+        return response
+
+    def llm_invoke(
+        self,
+        query: Optional[str] = None,
+        messages: Optional[List[Dict[str, str]]] = None,
+        seed: Optional[int] = None,
+        allow_tool_use: Optional[bool] = False,
+    ) -> str:
+        """
+        NOTE: this is the default LiteLLM API. It is used
+        internally for the LiteLLM Agent.
+
+        Invoke the LLM with the provided query or messages.
+
+        Args:
+            query (Optional[str]): The query to send to the LLM
+            messages (Optional[List[Dict[str, str]]]): Messages to send to the LLM
+            seed (Optional[int]): Seed for reproducibility
+            allow_tool_use: If True, allow LLM to use tools
+
+        Returns:
+            str: The LLM's response
+        """
+        all_messages = []
+        if messages is not None:
+            all_messages.extend(messages)
+
+        if query is not None:
+            all_messages.append({"role": "user", "content": query})
+
+        if allow_tool_use and self.prompt.tools:
+            # Tool-calling loop
+            final_response = "I was unable to find the desired information."
+            count = 0
+            while count < 20:
+                count += 1
+                response = self._llm_complete(all_messages, self.prompt.tools, seed)
+                msg = response.choices[0].message
+                all_messages.append(msg.to_dict())
+                if msg.tool_calls:
+                    for tool_call in msg["tool_calls"]:
+                        tool_name = tool_call["function"]["name"]
+                        arguments = json.loads(tool_call["function"]["arguments"])
+                        tool_func = self.prompt.function_map.get(tool_name)
+                        try:
+                            tool_result = (
+                                tool_func(**arguments)
+                                if tool_func is not None
+                                else "Unknown tool"
+                            )
+                        except Exception:
+                            tool_result = f"Error in calling tool `{tool_name}`"
+                        all_messages.append(
+                            {
+                                "role": "tool",
+                                "tool_call_id": tool_call["id"],
+                                "content": str(tool_result),
+                            }
+                        )
+                else:
+                    final_response = msg["content"]
+                    break
+            result = final_response
+        else:
+            response = self._llm_complete(all_messages, None, seed)
+            result = response.choices[0].message.content
+        return result
+
+    def invoke_dataset_item(self, dataset_item: Dict[str, str]) -> str:
+        messages = self.prompt.get_messages(dataset_item)
+        return self.invoke(messages)
+
+    def invoke(
+        self,
+        messages: List[Dict[str, str]],
+        seed: Optional[int] = None,
+    ) -> str:
+        """
+        Invoke the agent with a dataset item.
+
+        Args:
+            dataset_item (Dict[str, Any]): The dataset item to process
+            seed (Optional[int]): Seed for reproducibility
+
+        Returns:
+            Dict[str, Any]: The agent's response
+        """
+        # Replace with agent invocation:
+        result = self.llm_invoke(messages=messages, seed=seed, allow_tool_use=True)
+        return result