opik-optimizer 0.9.2__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik_optimizer/__init__.py +7 -5
- opik_optimizer/_throttle.py +8 -8
- opik_optimizer/base_optimizer.py +98 -45
- opik_optimizer/cache_config.py +5 -3
- opik_optimizer/datasets/ai2_arc.py +15 -13
- opik_optimizer/datasets/cnn_dailymail.py +19 -15
- opik_optimizer/datasets/election_questions.py +10 -11
- opik_optimizer/datasets/gsm8k.py +16 -11
- opik_optimizer/datasets/halu_eval.py +6 -5
- opik_optimizer/datasets/hotpot_qa.py +17 -16
- opik_optimizer/datasets/medhallu.py +10 -7
- opik_optimizer/datasets/rag_hallucinations.py +11 -8
- opik_optimizer/datasets/ragbench.py +17 -9
- opik_optimizer/datasets/tiny_test.py +33 -37
- opik_optimizer/datasets/truthful_qa.py +18 -12
- opik_optimizer/demo/cache.py +6 -6
- opik_optimizer/demo/datasets.py +3 -7
- opik_optimizer/evolutionary_optimizer/__init__.py +3 -1
- opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +722 -429
- opik_optimizer/evolutionary_optimizer/reporting.py +155 -74
- opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +271 -188
- opik_optimizer/few_shot_bayesian_optimizer/reporting.py +79 -28
- opik_optimizer/logging_config.py +19 -15
- opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +209 -129
- opik_optimizer/meta_prompt_optimizer/reporting.py +121 -46
- opik_optimizer/mipro_optimizer/__init__.py +2 -0
- opik_optimizer/mipro_optimizer/_lm.py +38 -9
- opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +37 -26
- opik_optimizer/mipro_optimizer/mipro_optimizer.py +132 -63
- opik_optimizer/mipro_optimizer/utils.py +5 -2
- opik_optimizer/optimizable_agent.py +179 -0
- opik_optimizer/optimization_config/chat_prompt.py +143 -73
- opik_optimizer/optimization_config/configs.py +4 -3
- opik_optimizer/optimization_config/mappers.py +18 -6
- opik_optimizer/optimization_result.py +22 -13
- opik_optimizer/py.typed +0 -0
- opik_optimizer/reporting_utils.py +89 -58
- opik_optimizer/task_evaluator.py +12 -14
- opik_optimizer/utils.py +117 -14
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/METADATA +8 -8
- opik_optimizer-1.0.1.dist-info/RECORD +50 -0
- opik_optimizer-0.9.2.dist-info/RECORD +0 -48
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/WHEEL +0 -0
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/top_level.txt +0 -0
opik_optimizer/mipro_optimizer/mipro_optimizer.py

@@ -2,6 +2,7 @@ import os
 import random
 from datetime import datetime
 from typing import Callable, Dict, List, Literal, Optional, Union
+import logging
 
 import dspy
 import litellm
@@ -16,7 +17,6 @@ from ..optimization_result import OptimizationResult
 from ..utils import optimization_context
 from ..base_optimizer import BaseOptimizer
 from ..optimization_config.configs import TaskConfig
-from ..optimization_result import OptimizationResult
 from ._lm import LM
 from ._mipro_optimizer_v2 import MIPROv2
 from .utils import (
@@ -30,16 +30,20 @@ from .utils import (
 disk_cache_dir = os.path.expanduser("~/.litellm_cache")
 litellm.cache = Cache(type="disk", disk_cache_dir=disk_cache_dir)
 
-# Set up logging
-import logging
-
 logger = logging.getLogger(__name__)  # Inherits config from setup_logging
 
 
 class MiproOptimizer(BaseOptimizer):
-    def __init__(
-
+    def __init__(
+        self,
+        model,
+        project_name: Optional[str] = None,
+        verbose: int = 1,
+        **model_kwargs,
+    ):
+        super().__init__(model=model, verbose=verbose, **model_kwargs)
         self.tools = []
+        self.project_name = project_name
         self.num_threads = self.model_kwargs.pop("num_threads", 6)
         self.model_kwargs["model"] = self.model
         # FIXME: add mipro_optimizer=True - It does not count the LLM calls made internally by DSPy during MiproOptimizer.optimizer.compile().
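In 1.0.1 the `MiproOptimizer` constructor gains a `verbose` parameter, forwards `model`, `verbose`, and any remaining model kwargs to `BaseOptimizer`, and stores `project_name` on the instance. A minimal construction sketch under the new signature (the import path, model id, and `temperature` value are illustrative assumptions, not taken from this diff):

from opik_optimizer import MiproOptimizer  # assumed export path

optimizer = MiproOptimizer(
    model="openai/gpt-4o-mini",   # assumed LiteLLM-style model id
    project_name="my-project",    # stored as self.project_name
    verbose=1,                    # new in 1.0.1, passed through to BaseOptimizer
    temperature=0.1,              # example model kwarg, forwarded via **model_kwargs
)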
@@ -84,7 +88,9 @@ class MiproOptimizer(BaseOptimizer):
         output_key = task_config.output_dataset_field
 
         # Kwargs might contain n_samples, passed from run_benchmark.py
-        n_samples = kwargs.pop(
+        n_samples = kwargs.pop(
+            "n_samples", None
+        )  # Get n_samples from kwargs if present
 
         if isinstance(dataset, str):
             opik_client = opik.Opik(project_name=self.project_name)
@@ -153,28 +159,42 @@ class MiproOptimizer(BaseOptimizer):
         # Robust n_samples handling for selecting dataset_item_ids
         dataset_items_for_eval = dataset.get_items()
         num_total_items = len(dataset_items_for_eval)
-        dataset_item_ids_to_use = dataset_item_ids
+        dataset_item_ids_to_use = dataset_item_ids  # Use provided IDs if any
 
-        if
+        if (
+            n_samples is not None
+        ):  # If n_samples is specified by the caller (run_benchmark.py)
             if dataset_item_ids is not None:
                 # This case should ideally be an error or a clear precedence rule.
                 # For now, let's assume if dataset_item_ids is provided, it takes precedence over n_samples.
-                logger.warning(
+                logger.warning(
+                    "MiproOptimizer.evaluate_prompt: Both n_samples and dataset_item_ids provided. Using provided dataset_item_ids."
+                )
                 # dataset_item_ids_to_use is already dataset_item_ids
             elif n_samples > num_total_items:
-                logger.warning(
-
+                logger.warning(
+                    f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) > total items ({num_total_items}). Using all {num_total_items} items."
+                )
+                dataset_item_ids_to_use = (
+                    None  # opik.evaluation.evaluate handles None as all items
+                )
             elif n_samples <= 0:
-                logger.warning(
+                logger.warning(
+                    f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) is <= 0. Using all {num_total_items} items."
+                )
                 dataset_item_ids_to_use = None
             else:
                 # n_samples is valid and dataset_item_ids was not provided, so sample now.
                 all_ids = [item["id"] for item in dataset_items_for_eval]
                 dataset_item_ids_to_use = random.sample(all_ids, n_samples)
-                logger.info(
-
+                logger.info(
+                    f"MiproOptimizer.evaluate_prompt: Sampled {n_samples} items for evaluation."
+                )
+        else:  # n_samples is None
             if dataset_item_ids is None:
-                logger.info(
+                logger.info(
+                    f"MiproOptimizer.evaluate_prompt: n_samples is None and dataset_item_ids is None. Using all {num_total_items} items."
+                )
             # dataset_item_ids_to_use is already dataset_item_ids (which could be None)
 
         experiment_config = experiment_config or {}
@@ -230,10 +250,10 @@ class MiproOptimizer(BaseOptimizer):
     ) -> OptimizationResult:
         self._opik_client = opik.Opik()
         with optimization_context(
-
-
-
-
+            client=self._opik_client,
+            dataset_name=dataset.name,
+            objective_name=metric.__name__,
+            metadata={"optimizer": self.__class__.__name__},
         ) as optimization:
             result = self._optimize_prompt(
                 dataset=dataset,
@@ -322,7 +342,9 @@ class MiproOptimizer(BaseOptimizer):
         if self.output_key not in row:
             raise Exception("row does not contain output_key: %r" % self.output_key)
 
-        self.trainset = create_dspy_training_set(
+        self.trainset = create_dspy_training_set(
+            self.dataset, self.input_key, self.n_samples
+        )
         self.data_signature = create_dspy_signature(
             self.input_key, self.output_key, self.prompt
         )
@@ -383,8 +405,10 @@ class MiproOptimizer(BaseOptimizer):
         """
         Continue to look for optimizations
         """
-        if not hasattr(self,
-            raise RuntimeError(
+        if not hasattr(self, "optimizer") or not self.optimizer:
+            raise RuntimeError(
+                "MiproOptimizer not prepared. Call prepare_optimize_prompt first."
+            )
 
         self.results = self.optimizer.compile(
             student=self.module,
@@ -403,16 +427,30 @@ class MiproOptimizer(BaseOptimizer):
         # self.num_candidates is set in prepare_optimize_prompt, defaults to 10
         # If self.num_candidates is 0 or None, this logic might break or be odd.
         # Add a safeguard for num_candidates_per_round if self.num_candidates is not usable.
-        num_candidates_per_round =
+        num_candidates_per_round = (  # noqa
+            self.num_candidates
+            if hasattr(self, "num_candidates")
+            and self.num_candidates
+            and self.num_candidates > 0
+            else 1
+        )
 
         for i, candidate_data in enumerate(self.results.candidate_programs):
             program_module = candidate_data.get("program")
             instruction = "N/A"
-            if hasattr(program_module,
+            if hasattr(program_module, "signature") and hasattr(
+                program_module.signature, "instructions"
+            ):
                 instruction = program_module.signature.instructions
-            elif hasattr(program_module,
+            elif hasattr(program_module, "extended_signature") and hasattr(
+                program_module.extended_signature, "instructions"
+            ):
                 instruction = program_module.extended_signature.instructions
-            elif
+            elif (
+                hasattr(program_module, "predictor")
+                and hasattr(program_module.predictor, "signature")
+                and hasattr(program_module.predictor.signature, "instructions")
+            ):
                 instruction = program_module.predictor.signature.instructions
 
             # Remove R and C calculation for Mipro as its history is flat
@@ -425,13 +463,11 @@ class MiproOptimizer(BaseOptimizer):
                 # "candidate_in_round": current_candidate_in_round,  # Remove candidate_in_round
                 "timestamp": datetime.now().isoformat(),
                 "prompt_candidate": instruction,
-                "parameters_used": {
-
-
-                "
-                "
-                "cost": None,  # TODO: add cost
-                "duration_seconds": None,  # TODO: add duration_seconds
+                "parameters_used": {"program_summary": str(program_module)[:500]},
+                "scores": [],  # Initialize scores list
+                "tokens_used": None,  # TODO: add tokens_used
+                "cost": None,  # TODO: add cost
+                "duration_seconds": None,  # TODO: add duration_seconds
             }
 
             current_score = candidate_data.get("score")
@@ -439,70 +475,103 @@ class MiproOptimizer(BaseOptimizer):
 
             # Unscale if it's a known 0-1 metric that MIPRO might scale to 0-100
             # For now, specifically targeting Levenshtein-like metrics
-            if isinstance(current_score, (float, int)) and
-
+            if isinstance(current_score, (float, int)) and (
+                "levenshtein" in metric_name_for_history.lower()
+                or "similarity" in metric_name_for_history.lower()
+            ):
                 # Assuming scores like 32.4 are 0-1 scores scaled by 100
-                if abs(current_score) > 1.0:
-                    logger.debug(
+                if abs(current_score) > 1.0:  # A simple check to see if it looks scaled
+                    logger.debug(
+                        f"Mipro history: Unscaling score {current_score} for metric {metric_name_for_history} by dividing by 100."
+                    )
                     current_score /= 100.0
-
-            iter_detail["scores"].append(
-
-
-
-
+
+            iter_detail["scores"].append(
+                {
+                    "metric_name": metric_name_for_history,
+                    "score": current_score,
+                    "opik_evaluation_id": None,  # TODO: add opik_evaluation_id
+                }
+            )
             mipro_history_processed.append(iter_detail)
 
         if not self.best_programs:
             logger.warning("MIPRO compile returned no candidate programs.")
             return OptimizationResult(
                 optimizer="MiproOptimizer",
-                prompt=[
+                prompt=[
+                    {
+                        "role": "user",
+                        "content": getattr(
+                            self, "prompt", "Error: Initial prompt not found"
+                        ),
+                    }
+                ],
                 score=0.0,
-                metric_name=self.opik_metric.__name__
+                metric_name=self.opik_metric.__name__
+                if hasattr(self, "opik_metric")
+                else "unknown_metric",
                 details={"error": "No candidate programs generated by MIPRO"},
                 history=mipro_history_processed,
-                llm_calls=self.lm.llm_call_counter
+                llm_calls=self.lm.llm_call_counter,
             )
 
         self.module = self.get_best().details["program"]
         best_program_details = self.get_best()
-
+
         # Unscale the main score if necessary, similar to history scores
         final_best_score = best_program_details.score
         final_metric_name = best_program_details.metric_name
-        if
-
-
-
-
+        if (
+            isinstance(final_best_score, (float, int))
+            and final_metric_name
+            and (
+                "levenshtein" in final_metric_name.lower()
+                or "similarity" in final_metric_name.lower()
+            )
+        ):
+            if abs(final_best_score) > 1.0:  # A simple check to see if it looks scaled
+                logger.debug(
+                    f"Mipro main result: Unscaling score {final_best_score} for metric {final_metric_name} by dividing by 100."
+                )
                 final_best_score /= 100.0
 
         return OptimizationResult(
             optimizer="MiproOptimizer",
             prompt=best_program_details.prompt,
             tool_prompts=best_program_details.tool_prompts,
-            score=final_best_score,
+            score=final_best_score,  # Use the potentially unscaled score
             metric_name=final_metric_name,
             demonstrations=best_program_details.demonstrations,
             details=best_program_details.details,
             history=mipro_history_processed,
-            llm_calls=self.lm.llm_call_counter
+            llm_calls=self.lm.llm_call_counter,
         )
 
     def get_best(self, position: int = 0) -> OptimizationResult:
-        if not hasattr(self,
-            logger.error(
+        if not hasattr(self, "best_programs") or not self.best_programs:
+            logger.error(
+                "get_best() called but no best_programs found. MIPRO compile might have failed or yielded no results."
+            )
             return OptimizationResult(
                 optimizer="MiproOptimizer",
-                prompt=[
-
-
+                prompt=[
+                    {
+                        "role": "user",
+                        "content": getattr(
+                            self, "prompt", "Error: Initial prompt not found"
+                        ),
+                    }
+                ],
+                score=0.0,
+                metric_name=getattr(self, "opik_metric", None).name
+                if hasattr(self, "opik_metric") and self.opik_metric
+                else "unknown_metric",
                 details={"error": "No programs generated or compile failed"},
                 history=[],
-                llm_calls=self.lm.llm_call_counter
+                llm_calls=self.lm.llm_call_counter,
             )
-
+
         score = self.best_programs[position]["score"]
         program_module = self.best_programs[position]["program"]
         state = program_module.dump_state()
@@ -527,5 +596,5 @@ class MiproOptimizer(BaseOptimizer):
             metric_name=self.opik_metric.__name__,
             demonstrations=demos,
             details={"program": program_module},
-            llm_calls=self.lm.llm_call_counter
+            llm_calls=self.lm.llm_call_counter,
         )
opik_optimizer/mipro_optimizer/utils.py

@@ -1,4 +1,4 @@
-from typing import
+from typing import Dict, Optional
 
 import uuid
 import dspy
@@ -46,7 +46,10 @@ def opik_metric_to_dspy(metric, output):
     def opik_metric_score_wrapper(example, prediction, trace=None):
         try:
             # Calculate the score using the metric
-            score_result = metric(
+            score_result = metric(
+                dataset_item=example.toDict(),
+                llm_output=getattr(prediction, answer_field, ""),
+            )
             return (
                 score_result.value if hasattr(score_result, "value") else score_result
             )
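The wrapper now calls the metric with keyword arguments, `metric(dataset_item=..., llm_output=...)`, and accepts either a raw number or an object exposing a `.value` attribute. A hedged sketch of a metric compatible with this call shape (the function name and the "answer" dataset field are illustrative assumptions, not taken from this diff):

def exact_match(dataset_item, llm_output):
    # Score 1.0 when the model output equals the reference answer, else 0.0.
    # "answer" is an assumed dataset field name; use whatever your dataset defines.
    reference = str(dataset_item.get("answer", "")).strip()
    return 1.0 if llm_output.strip() == reference else 0.0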
opik_optimizer/optimizable_agent.py (new file)

@@ -0,0 +1,179 @@
+from typing import Dict, Any, List, Optional, TYPE_CHECKING
+import json
+import os
+
+
+from opik.opik_context import get_current_span_data
+
+import litellm
+from litellm.integrations.opik.opik import OpikLogger
+
+from . import _throttle
+
+_limiter = _throttle.get_rate_limiter_for_current_opik_installation()
+
+if TYPE_CHECKING:
+    from .optimization_config.chat_prompt import ChatPrompt
+
+
+def tools_to_dict(tools: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
+    retval = {}
+    for name in tools:
+        parts = {}
+        for part in tools[name]:
+            if isinstance(tools[name][part], (int, float, str)):
+                parts[part] = tools[name][part]
+        if parts:
+            retval[name] = parts
+    return retval
+
+
+class OptimizableAgent:
+    """
+    An agent class to subclass to make an Optimizable Agent.
+
+    Attributes:
+        model (Optional[str]): The model to use for the agent
+        model_kwargs (Dict[str, Any]): Additional keyword arguments for the model
+        project_name (Optional[str]): The project name for tracking
+    """
+
+    model: Optional[str] = None
+    model_kwargs: Dict[str, Any] = {}
+    project_name: Optional[str] = "Default Project"
+    input_dataset_field: Optional[str] = None
+    prompts: Dict[str, "ChatPrompt"]
+    prompt: "ChatPrompt"
+
+    def __init__(self, prompt: "ChatPrompt") -> None:
+        """
+        Initialize the OptimizableAgent.
+
+        Args:
+            prompt: a chat prompt
+        """
+        self.init_llm()
+        self.init_agent(prompt)
+
+    def init_llm(self) -> None:
+        """Initialize the LLM with the appropriate callbacks."""
+        # Litellm bug requires this (maybe problematic if multi-threaded)
+        os.environ["OPIK_PROJECT_NAME"] = str(self.project_name)
+        self.opik_logger = OpikLogger()
+        litellm.callbacks = [self.opik_logger]
+
+    def init_agent(self, prompt: "ChatPrompt") -> None:
+        """Initialize the agent with the provided configuration."""
+        # Register the tools, if any, for default LiteLLM Agent use:
+        self.prompt = prompt
+
+    @_throttle.rate_limited(_limiter)
+    def _llm_complete(
+        self,
+        messages: List[Dict[str, str]],
+        tools: Optional[List[Dict[str, str]]],
+        seed: int,
+    ) -> Any:
+        response = litellm.completion(
+            model=self.model,
+            messages=messages,
+            seed=seed,
+            tools=tools,
+            metadata={
+                "opik": {
+                    "current_span_data": get_current_span_data(),
+                },
+            },
+            **self.model_kwargs,
+        )
+        return response
+
+    def llm_invoke(
+        self,
+        query: Optional[str] = None,
+        messages: Optional[List[Dict[str, str]]] = None,
+        seed: Optional[int] = None,
+        allow_tool_use: Optional[bool] = False,
+    ) -> str:
+        """
+        NOTE: this is the default LiteLLM API. It is used
+        internally for the LiteLLM Agent.
+
+        Invoke the LLM with the provided query or messages.
+
+        Args:
+            query (Optional[str]): The query to send to the LLM
+            messages (Optional[List[Dict[str, str]]]): Messages to send to the LLM
+            seed (Optional[int]): Seed for reproducibility
+            allow_tool_use: If True, allow LLM to use tools
+
+        Returns:
+            str: The LLM's response
+        """
+        all_messages = []
+        if messages is not None:
+            all_messages.extend(messages)
+
+        if query is not None:
+            all_messages.append({"role": "user", "content": query})
+
+        if allow_tool_use and self.prompt.tools:
+            # Tool-calling loop
+            final_response = "I was unable to find the desired information."
+            count = 0
+            while count < 20:
+                count += 1
+                response = self._llm_complete(all_messages, self.prompt.tools, seed)
+                msg = response.choices[0].message
+                all_messages.append(msg.to_dict())
+                if msg.tool_calls:
+                    for tool_call in msg["tool_calls"]:
+                        tool_name = tool_call["function"]["name"]
+                        arguments = json.loads(tool_call["function"]["arguments"])
+                        tool_func = self.prompt.function_map.get(tool_name)
+                        try:
+                            tool_result = (
+                                tool_func(**arguments)
+                                if tool_func is not None
+                                else "Unknown tool"
+                            )
+                        except Exception:
+                            tool_result = f"Error in calling tool `{tool_name}`"
+                        all_messages.append(
+                            {
+                                "role": "tool",
+                                "tool_call_id": tool_call["id"],
+                                "content": str(tool_result),
+                            }
+                        )
+                else:
+                    final_response = msg["content"]
+                    break
+            result = final_response
+        else:
+            response = self._llm_complete(all_messages, None, seed)
+            result = response.choices[0].message.content
+        return result
+
+    def invoke_dataset_item(self, dataset_item: Dict[str, str]) -> str:
+        messages = self.prompt.get_messages(dataset_item)
+        return self.invoke(messages)
+
+    def invoke(
+        self,
+        messages: List[Dict[str, str]],
+        seed: Optional[int] = None,
+    ) -> str:
+        """
+        Invoke the agent with a dataset item.
+
+        Args:
+            dataset_item (Dict[str, Any]): The dataset item to process
+            seed (Optional[int]): Seed for reproducibility
+
+        Returns:
+            Dict[str, Any]: The agent's response
+        """
+        # Replace with agent invocation:
+        result = self.llm_invoke(messages=messages, seed=seed, allow_tool_use=True)
+        return result