opik-optimizer 0.9.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. opik_optimizer/__init__.py +7 -3
  2. opik_optimizer/_throttle.py +8 -8
  3. opik_optimizer/base_optimizer.py +98 -45
  4. opik_optimizer/cache_config.py +5 -3
  5. opik_optimizer/datasets/ai2_arc.py +15 -13
  6. opik_optimizer/datasets/cnn_dailymail.py +19 -15
  7. opik_optimizer/datasets/election_questions.py +10 -11
  8. opik_optimizer/datasets/gsm8k.py +16 -11
  9. opik_optimizer/datasets/halu_eval.py +6 -5
  10. opik_optimizer/datasets/hotpot_qa.py +17 -16
  11. opik_optimizer/datasets/medhallu.py +10 -7
  12. opik_optimizer/datasets/rag_hallucinations.py +11 -8
  13. opik_optimizer/datasets/ragbench.py +17 -9
  14. opik_optimizer/datasets/tiny_test.py +33 -37
  15. opik_optimizer/datasets/truthful_qa.py +18 -12
  16. opik_optimizer/demo/cache.py +6 -6
  17. opik_optimizer/demo/datasets.py +3 -7
  18. opik_optimizer/evolutionary_optimizer/__init__.py +3 -1
  19. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +748 -437
  20. opik_optimizer/evolutionary_optimizer/reporting.py +155 -76
  21. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +291 -181
  22. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +79 -28
  23. opik_optimizer/logging_config.py +19 -15
  24. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +234 -138
  25. opik_optimizer/meta_prompt_optimizer/reporting.py +121 -47
  26. opik_optimizer/mipro_optimizer/__init__.py +2 -0
  27. opik_optimizer/mipro_optimizer/_lm.py +41 -9
  28. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +37 -26
  29. opik_optimizer/mipro_optimizer/mipro_optimizer.py +135 -67
  30. opik_optimizer/mipro_optimizer/utils.py +5 -2
  31. opik_optimizer/optimizable_agent.py +179 -0
  32. opik_optimizer/optimization_config/chat_prompt.py +143 -73
  33. opik_optimizer/optimization_config/configs.py +4 -3
  34. opik_optimizer/optimization_config/mappers.py +18 -6
  35. opik_optimizer/optimization_result.py +28 -20
  36. opik_optimizer/py.typed +0 -0
  37. opik_optimizer/reporting_utils.py +96 -46
  38. opik_optimizer/task_evaluator.py +12 -14
  39. opik_optimizer/utils.py +122 -37
  40. {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/METADATA +8 -8
  41. opik_optimizer-1.0.0.dist-info/RECORD +50 -0
  42. opik_optimizer-0.9.1.dist-info/RECORD +0 -48
  43. {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/WHEEL +0 -0
  44. {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/licenses/LICENSE +0 -0
  45. {opik_optimizer-0.9.1.dist-info → opik_optimizer-1.0.0.dist-info}/top_level.txt +0 -0
--- a/opik_optimizer/mipro_optimizer/mipro_optimizer.py
+++ b/opik_optimizer/mipro_optimizer/mipro_optimizer.py
@@ -2,6 +2,7 @@ import os
 import random
 from datetime import datetime
 from typing import Callable, Dict, List, Literal, Optional, Union
+import logging
 
 import dspy
 import litellm
@@ -16,7 +17,6 @@ from ..optimization_result import OptimizationResult
 from ..utils import optimization_context
 from ..base_optimizer import BaseOptimizer
 from ..optimization_config.configs import TaskConfig
-from ..optimization_result import OptimizationResult
 from ._lm import LM
 from ._mipro_optimizer_v2 import MIPROv2
 from .utils import (
@@ -30,23 +30,26 @@ from .utils import (
 disk_cache_dir = os.path.expanduser("~/.litellm_cache")
 litellm.cache = Cache(type="disk", disk_cache_dir=disk_cache_dir)
 
-# Set up logging
-import logging
-
 logger = logging.getLogger(__name__)  # Inherits config from setup_logging
 
 
 class MiproOptimizer(BaseOptimizer):
-    def __init__(self, model, project_name: Optional[str] = None, verbose: int = 1, **model_kwargs):
-        super().__init__(model, project_name, verbose=verbose, **model_kwargs)
+    def __init__(
+        self,
+        model,
+        project_name: Optional[str] = None,
+        verbose: int = 1,
+        **model_kwargs,
+    ):
+        super().__init__(model=model, verbose=verbose, **model_kwargs)
         self.tools = []
+        self.project_name = project_name
         self.num_threads = self.model_kwargs.pop("num_threads", 6)
         self.model_kwargs["model"] = self.model
-        self.llm_call_counter = 0
         # FIXME: add mipro_optimizer=True - It does not count the LLM calls made internally by DSPy during MiproOptimizer.optimizer.compile().
-        lm = LM(**self.model_kwargs)
+        self.lm = LM(**self.model_kwargs)
         opik_callback = OpikCallback(project_name=self.project_name, log_graph=True)
-        dspy.configure(lm=lm, callbacks=[opik_callback])
+        dspy.configure(lm=self.lm, callbacks=[opik_callback])
         logger.debug(f"Initialized MiproOptimizer with model: {model}")
 
     def evaluate_prompt(
@@ -54,7 +57,7 @@ class MiproOptimizer(BaseOptimizer):
         dataset: Union[str, Dataset],
         metric: Callable,
         task_config: TaskConfig,
-        prompt: Union[str, dspy.Module, OptimizationResult] = None,
+        prompt: Optional[Union[str, dspy.Module, OptimizationResult]] = None,
         n_samples: int = 10,
         dataset_item_ids: Optional[List[str]] = None,
         experiment_config: Optional[Dict] = None,
@@ -85,7 +88,9 @@
         output_key = task_config.output_dataset_field
 
         # Kwargs might contain n_samples, passed from run_benchmark.py
-        n_samples = kwargs.pop("n_samples", None) # Get n_samples from kwargs if present
+        n_samples = kwargs.pop(
+            "n_samples", None
+        )  # Get n_samples from kwargs if present
 
         if isinstance(dataset, str):
             opik_client = opik.Opik(project_name=self.project_name)
@@ -154,28 +159,42 @@
         # Robust n_samples handling for selecting dataset_item_ids
         dataset_items_for_eval = dataset.get_items()
         num_total_items = len(dataset_items_for_eval)
-        dataset_item_ids_to_use = dataset_item_ids # Use provided IDs if any
+        dataset_item_ids_to_use = dataset_item_ids  # Use provided IDs if any
 
-        if n_samples is not None: # If n_samples is specified by the caller (run_benchmark.py)
+        if (
+            n_samples is not None
+        ):  # If n_samples is specified by the caller (run_benchmark.py)
             if dataset_item_ids is not None:
                 # This case should ideally be an error or a clear precedence rule.
                 # For now, let's assume if dataset_item_ids is provided, it takes precedence over n_samples.
-                logger.warning("MiproOptimizer.evaluate_prompt: Both n_samples and dataset_item_ids provided. Using provided dataset_item_ids.")
+                logger.warning(
+                    "MiproOptimizer.evaluate_prompt: Both n_samples and dataset_item_ids provided. Using provided dataset_item_ids."
+                )
                 # dataset_item_ids_to_use is already dataset_item_ids
             elif n_samples > num_total_items:
-                logger.warning(f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) > total items ({num_total_items}). Using all {num_total_items} items.")
-                dataset_item_ids_to_use = None # opik.evaluation.evaluate handles None as all items
+                logger.warning(
+                    f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) > total items ({num_total_items}). Using all {num_total_items} items."
+                )
+                dataset_item_ids_to_use = (
+                    None  # opik.evaluation.evaluate handles None as all items
+                )
             elif n_samples <= 0:
-                logger.warning(f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) is <= 0. Using all {num_total_items} items.")
+                logger.warning(
+                    f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) is <= 0. Using all {num_total_items} items."
+                )
                 dataset_item_ids_to_use = None
             else:
                 # n_samples is valid and dataset_item_ids was not provided, so sample now.
                 all_ids = [item["id"] for item in dataset_items_for_eval]
                 dataset_item_ids_to_use = random.sample(all_ids, n_samples)
-                logger.info(f"MiproOptimizer.evaluate_prompt: Sampled {n_samples} items for evaluation.")
-        else: # n_samples is None
+                logger.info(
+                    f"MiproOptimizer.evaluate_prompt: Sampled {n_samples} items for evaluation."
+                )
+        else:  # n_samples is None
             if dataset_item_ids is None:
-                logger.info(f"MiproOptimizer.evaluate_prompt: n_samples is None and dataset_item_ids is None. Using all {num_total_items} items.")
+                logger.info(
+                    f"MiproOptimizer.evaluate_prompt: n_samples is None and dataset_item_ids is None. Using all {num_total_items} items."
+                )
             # dataset_item_ids_to_use is already dataset_item_ids (which could be None)
 
         experiment_config = experiment_config or {}
@@ -231,10 +250,10 @@
     ) -> OptimizationResult:
         self._opik_client = opik.Opik()
         with optimization_context(
-                client=self._opik_client,
-                dataset_name=dataset.name,
-                objective_name=metric.__name__,
-                metadata={"optimizer": self.__class__.__name__},
+            client=self._opik_client,
+            dataset_name=dataset.name,
+            objective_name=metric.__name__,
+            metadata={"optimizer": self.__class__.__name__},
         ) as optimization:
             result = self._optimize_prompt(
                 dataset=dataset,
@@ -323,7 +342,9 @@
             if self.output_key not in row:
                 raise Exception("row does not contain output_key: %r" % self.output_key)
 
-        self.trainset = create_dspy_training_set(self.dataset, self.input_key, self.n_samples)
+        self.trainset = create_dspy_training_set(
+            self.dataset, self.input_key, self.n_samples
+        )
         self.data_signature = create_dspy_signature(
             self.input_key, self.output_key, self.prompt
         )
@@ -384,8 +405,10 @@
         """
         Continue to look for optimizations
         """
-        if not hasattr(self, 'optimizer') or not self.optimizer:
-            raise RuntimeError("MiproOptimizer not prepared. Call prepare_optimize_prompt first.")
+        if not hasattr(self, "optimizer") or not self.optimizer:
+            raise RuntimeError(
+                "MiproOptimizer not prepared. Call prepare_optimize_prompt first."
+            )
 
         self.results = self.optimizer.compile(
             student=self.module,
@@ -404,16 +427,30 @@
         # self.num_candidates is set in prepare_optimize_prompt, defaults to 10
         # If self.num_candidates is 0 or None, this logic might break or be odd.
         # Add a safeguard for num_candidates_per_round if self.num_candidates is not usable.
-        num_candidates_per_round = self.num_candidates if hasattr(self, 'num_candidates') and self.num_candidates and self.num_candidates > 0 else 1
+        num_candidates_per_round = (  # noqa
+            self.num_candidates
+            if hasattr(self, "num_candidates")
+            and self.num_candidates
+            and self.num_candidates > 0
+            else 1
+        )
 
         for i, candidate_data in enumerate(self.results.candidate_programs):
             program_module = candidate_data.get("program")
             instruction = "N/A"
-            if hasattr(program_module, 'signature') and hasattr(program_module.signature, 'instructions'):
+            if hasattr(program_module, "signature") and hasattr(
+                program_module.signature, "instructions"
+            ):
                 instruction = program_module.signature.instructions
-            elif hasattr(program_module, 'extended_signature') and hasattr(program_module.extended_signature, 'instructions'):
+            elif hasattr(program_module, "extended_signature") and hasattr(
+                program_module.extended_signature, "instructions"
+            ):
                 instruction = program_module.extended_signature.instructions
-            elif hasattr(program_module, 'predictor') and hasattr(program_module.predictor, 'signature') and hasattr(program_module.predictor.signature, 'instructions'):
+            elif (
+                hasattr(program_module, "predictor")
+                and hasattr(program_module.predictor, "signature")
+                and hasattr(program_module.predictor.signature, "instructions")
+            ):
                 instruction = program_module.predictor.signature.instructions
 
             # Remove R and C calculation for Mipro as its history is flat
@@ -426,13 +463,11 @@
                 # "candidate_in_round": current_candidate_in_round, # Remove candidate_in_round
                 "timestamp": datetime.now().isoformat(),
                 "prompt_candidate": instruction,
-                "parameters_used": {
-                    "program_summary": str(program_module)[:500]
-                },
-                "scores": [], # Initialize scores list
-                "tokens_used": None, # TODO: add tokens_used
-                "cost": None, # TODO: add cost
-                "duration_seconds": None, # TODO: add duration_seconds
+                "parameters_used": {"program_summary": str(program_module)[:500]},
+                "scores": [],  # Initialize scores list
+                "tokens_used": None,  # TODO: add tokens_used
+                "cost": None,  # TODO: add cost
+                "duration_seconds": None,  # TODO: add duration_seconds
             }
 
             current_score = candidate_data.get("score")
@@ -440,70 +475,103 @@
 
             # Unscale if it's a known 0-1 metric that MIPRO might scale to 0-100
             # For now, specifically targeting Levenshtein-like metrics
-            if isinstance(current_score, (float, int)) and \
-                ("levenshtein" in metric_name_for_history.lower() or "similarity" in metric_name_for_history.lower()):
+            if isinstance(current_score, (float, int)) and (
+                "levenshtein" in metric_name_for_history.lower()
+                or "similarity" in metric_name_for_history.lower()
+            ):
                 # Assuming scores like 32.4 are 0-1 scores scaled by 100
-                if abs(current_score) > 1.0: # A simple check to see if it looks scaled
-                    logger.debug(f"Mipro history: Unscaling score {current_score} for metric {metric_name_for_history} by dividing by 100.")
+                if abs(current_score) > 1.0:  # A simple check to see if it looks scaled
+                    logger.debug(
+                        f"Mipro history: Unscaling score {current_score} for metric {metric_name_for_history} by dividing by 100."
+                    )
                     current_score /= 100.0
-
-            iter_detail["scores"].append({
-                "metric_name": metric_name_for_history,
-                "score": current_score,
-                "opik_evaluation_id": None # TODO: add opik_evaluation_id
-            })
+
+            iter_detail["scores"].append(
+                {
+                    "metric_name": metric_name_for_history,
+                    "score": current_score,
+                    "opik_evaluation_id": None,  # TODO: add opik_evaluation_id
+                }
+            )
             mipro_history_processed.append(iter_detail)
 
         if not self.best_programs:
             logger.warning("MIPRO compile returned no candidate programs.")
             return OptimizationResult(
                 optimizer="MiproOptimizer",
-                prompt=[{"role": "user", "content": getattr(self, 'prompt', "Error: Initial prompt not found")}],
+                prompt=[
+                    {
+                        "role": "user",
+                        "content": getattr(
+                            self, "prompt", "Error: Initial prompt not found"
+                        ),
+                    }
+                ],
                 score=0.0,
-                metric_name=self.opik_metric.__name__ if hasattr(self, 'opik_metric') else "unknown_metric",
+                metric_name=self.opik_metric.__name__
+                if hasattr(self, "opik_metric")
+                else "unknown_metric",
                 details={"error": "No candidate programs generated by MIPRO"},
                 history=mipro_history_processed,
-                llm_calls=self.llm_call_counter
+                llm_calls=self.lm.llm_call_counter,
            )
 
         self.module = self.get_best().details["program"]
         best_program_details = self.get_best()
-
+
         # Unscale the main score if necessary, similar to history scores
         final_best_score = best_program_details.score
         final_metric_name = best_program_details.metric_name
-        if isinstance(final_best_score, (float, int)) and \
-           final_metric_name and \
-           ("levenshtein" in final_metric_name.lower() or "similarity" in final_metric_name.lower()):
-            if abs(final_best_score) > 1.0: # A simple check to see if it looks scaled
-                logger.debug(f"Mipro main result: Unscaling score {final_best_score} for metric {final_metric_name} by dividing by 100.")
+        if (
+            isinstance(final_best_score, (float, int))
+            and final_metric_name
+            and (
+                "levenshtein" in final_metric_name.lower()
+                or "similarity" in final_metric_name.lower()
+            )
+        ):
+            if abs(final_best_score) > 1.0:  # A simple check to see if it looks scaled
+                logger.debug(
+                    f"Mipro main result: Unscaling score {final_best_score} for metric {final_metric_name} by dividing by 100."
+                )
                 final_best_score /= 100.0
 
         return OptimizationResult(
             optimizer="MiproOptimizer",
             prompt=best_program_details.prompt,
             tool_prompts=best_program_details.tool_prompts,
-            score=final_best_score, # Use the potentially unscaled score
+            score=final_best_score,  # Use the potentially unscaled score
             metric_name=final_metric_name,
             demonstrations=best_program_details.demonstrations,
             details=best_program_details.details,
             history=mipro_history_processed,
-            llm_calls=self.llm_call_counter
+            llm_calls=self.lm.llm_call_counter,
         )
 
     def get_best(self, position: int = 0) -> OptimizationResult:
-        if not hasattr(self, 'best_programs') or not self.best_programs:
-            logger.error("get_best() called but no best_programs found. MIPRO compile might have failed or yielded no results.")
+        if not hasattr(self, "best_programs") or not self.best_programs:
+            logger.error(
+                "get_best() called but no best_programs found. MIPRO compile might have failed or yielded no results."
+            )
            return OptimizationResult(
                 optimizer="MiproOptimizer",
-                prompt=[{"role": "user", "content": getattr(self, 'prompt', "Error: Initial prompt not found")}],
-                score=0.0,
-                metric_name=getattr(self, 'opik_metric', None).name if hasattr(self, 'opik_metric') and self.opik_metric else "unknown_metric",
+                prompt=[
+                    {
+                        "role": "user",
+                        "content": getattr(
+                            self, "prompt", "Error: Initial prompt not found"
+                        ),
+                    }
+                ],
+                score=0.0,
+                metric_name=getattr(self, "opik_metric", None).name
+                if hasattr(self, "opik_metric") and self.opik_metric
+                else "unknown_metric",
                 details={"error": "No programs generated or compile failed"},
                 history=[],
-                llm_calls=self.llm_call_counter
+                llm_calls=self.lm.llm_call_counter,
             )
-
+
         score = self.best_programs[position]["score"]
         program_module = self.best_programs[position]["program"]
         state = program_module.dump_state()
@@ -528,5 +596,5 @@ class MiproOptimizer(BaseOptimizer):
             metric_name=self.opik_metric.__name__,
             demonstrations=demos,
             details={"program": program_module},
-            llm_calls=self.llm_call_counter
+            llm_calls=self.lm.llm_call_counter,
         )
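
For orientation, here is a minimal usage sketch of the 1.0.0 constructor shown in the diff above: `project_name` is now stored on the optimizer itself instead of being forwarded to `BaseOptimizer`, and the DSPy `LM` wrapper is kept as `self.lm`, so LLM call counts are read from `optimizer.lm.llm_call_counter`. The model id, project name, and extra kwargs below are illustrative placeholders, not values from the package.

```python
# Minimal sketch, assuming opik, dspy, and litellm are installed and configured;
# the model id, project name, and extra kwargs are illustrative placeholders.
from opik_optimizer.mipro_optimizer.mipro_optimizer import MiproOptimizer

optimizer = MiproOptimizer(
    model="openai/gpt-4o-mini",  # hypothetical model id
    project_name="my-project",   # kept on the optimizer, no longer passed to BaseOptimizer
    verbose=1,
    num_threads=6,               # popped from model_kwargs (default 6), per the diff
    temperature=0.1,             # remaining kwargs are forwarded to the LM wrapper
)

# 1.0.0 keeps the DSPy LM wrapper on the instance, so call counts come from
# the wrapper rather than a counter on the optimizer itself.
print(optimizer.lm.llm_call_counter)
```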
--- a/opik_optimizer/mipro_optimizer/utils.py
+++ b/opik_optimizer/mipro_optimizer/utils.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Tuple, Union, Optional
+from typing import Dict, Optional
 
 import uuid
 import dspy
@@ -46,7 +46,10 @@ def opik_metric_to_dspy(metric, output):
     def opik_metric_score_wrapper(example, prediction, trace=None):
         try:
             # Calculate the score using the metric
-            score_result = metric(dataset_item=example.toDict(), llm_output=getattr(prediction, answer_field, ""))
+            score_result = metric(
+                dataset_item=example.toDict(),
+                llm_output=getattr(prediction, answer_field, ""),
+            )
             return (
                 score_result.value if hasattr(score_result, "value") else score_result
            )
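
The wrapper above also documents the metric calling convention in 1.0.0: a plain function taking `dataset_item` and `llm_output` keyword arguments and returning either a number or an object with a `.value`. A hedged sketch of such a metric follows; the `"answer"` dataset field name is an assumption, not taken from the diff.

```python
# Minimal metric sketch matching the metric(dataset_item=..., llm_output=...)
# call shown above. The "answer" dataset field is an assumed example.
from opik.evaluation.metrics import LevenshteinRatio


def levenshtein_ratio(dataset_item, llm_output):
    # Returns a ScoreResult; the wrapper reads .value when present.
    return LevenshteinRatio().score(reference=dataset_item["answer"], output=llm_output)
```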
--- /dev/null
+++ b/opik_optimizer/optimizable_agent.py
@@ -0,0 +1,179 @@
+from typing import Dict, Any, List, Optional, TYPE_CHECKING
+import json
+import os
+
+
+from opik.opik_context import get_current_span_data
+
+import litellm
+from litellm.integrations.opik.opik import OpikLogger
+
+from . import _throttle
+
+_limiter = _throttle.get_rate_limiter_for_current_opik_installation()
+
+if TYPE_CHECKING:
+    from .optimization_config.chat_prompt import ChatPrompt
+
+
+def tools_to_dict(tools: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
+    retval = {}
+    for name in tools:
+        parts = {}
+        for part in tools[name]:
+            if isinstance(tools[name][part], (int, float, str)):
+                parts[part] = tools[name][part]
+        if parts:
+            retval[name] = parts
+    return retval
+
+
+class OptimizableAgent:
+    """
+    An agent class to subclass to make an Optimizable Agent.
+
+    Attributes:
+        model (Optional[str]): The model to use for the agent
+        model_kwargs (Dict[str, Any]): Additional keyword arguments for the model
+        project_name (Optional[str]): The project name for tracking
+    """
+
+    model: Optional[str] = None
+    model_kwargs: Dict[str, Any] = {}
+    project_name: Optional[str] = "Default Project"
+    input_dataset_field: Optional[str] = None
+    prompts: Dict[str, "ChatPrompt"]
+    prompt: "ChatPrompt"
+
+    def __init__(self, prompt: "ChatPrompt") -> None:
+        """
+        Initialize the OptimizableAgent.
+
+        Args:
+            prompt: a chat prompt
+        """
+        self.init_llm()
+        self.init_agent(prompt)
+
+    def init_llm(self) -> None:
+        """Initialize the LLM with the appropriate callbacks."""
+        # Litellm bug requires this (maybe problematic if multi-threaded)
+        os.environ["OPIK_PROJECT_NAME"] = str(self.project_name)
+        self.opik_logger = OpikLogger()
+        litellm.callbacks = [self.opik_logger]
+
+    def init_agent(self, prompt: "ChatPrompt") -> None:
+        """Initialize the agent with the provided configuration."""
+        # Register the tools, if any, for default LiteLLM Agent use:
+        self.prompt = prompt
+
+    @_throttle.rate_limited(_limiter)
+    def _llm_complete(
+        self,
+        messages: List[Dict[str, str]],
+        tools: Optional[List[Dict[str, str]]],
+        seed: int,
+    ) -> Any:
+        response = litellm.completion(
+            model=self.model,
+            messages=messages,
+            seed=seed,
+            tools=tools,
+            metadata={
+                "opik": {
+                    "current_span_data": get_current_span_data(),
+                },
+            },
+            **self.model_kwargs,
+        )
+        return response
+
+    def llm_invoke(
+        self,
+        query: Optional[str] = None,
+        messages: Optional[List[Dict[str, str]]] = None,
+        seed: Optional[int] = None,
+        allow_tool_use: Optional[bool] = False,
+    ) -> str:
+        """
+        NOTE: this is the default LiteLLM API. It is used
+        internally for the LiteLLM Agent.
+
+        Invoke the LLM with the provided query or messages.
+
+        Args:
+            query (Optional[str]): The query to send to the LLM
+            messages (Optional[List[Dict[str, str]]]): Messages to send to the LLM
+            seed (Optional[int]): Seed for reproducibility
+            allow_tool_use: If True, allow LLM to use tools
+
+        Returns:
+            str: The LLM's response
+        """
+        all_messages = []
+        if messages is not None:
+            all_messages.extend(messages)
+
+        if query is not None:
+            all_messages.append({"role": "user", "content": query})
+
+        if allow_tool_use and self.prompt.tools:
+            # Tool-calling loop
+            final_response = "I was unable to find the desired information."
+            count = 0
+            while count < 20:
+                count += 1
+                response = self._llm_complete(all_messages, self.prompt.tools, seed)
+                msg = response.choices[0].message
+                all_messages.append(msg.to_dict())
+                if msg.tool_calls:
+                    for tool_call in msg["tool_calls"]:
+                        tool_name = tool_call["function"]["name"]
+                        arguments = json.loads(tool_call["function"]["arguments"])
+                        tool_func = self.prompt.function_map.get(tool_name)
+                        try:
+                            tool_result = (
+                                tool_func(**arguments)
+                                if tool_func is not None
+                                else "Unknown tool"
+                            )
+                        except Exception:
+                            tool_result = f"Error in calling tool `{tool_name}`"
+                        all_messages.append(
+                            {
+                                "role": "tool",
+                                "tool_call_id": tool_call["id"],
+                                "content": str(tool_result),
+                            }
+                        )
+                else:
+                    final_response = msg["content"]
+                    break
+            result = final_response
+        else:
+            response = self._llm_complete(all_messages, None, seed)
+            result = response.choices[0].message.content
+        return result
+
+    def invoke_dataset_item(self, dataset_item: Dict[str, str]) -> str:
+        messages = self.prompt.get_messages(dataset_item)
+        return self.invoke(messages)
+
+    def invoke(
+        self,
+        messages: List[Dict[str, str]],
+        seed: Optional[int] = None,
+    ) -> str:
+        """
+        Invoke the agent with a dataset item.
+
+        Args:
+            dataset_item (Dict[str, Any]): The dataset item to process
+            seed (Optional[int]): Seed for reproducibility
+
+        Returns:
+            Dict[str, Any]: The agent's response
+        """
+        # Replace with agent invocation:
+        result = self.llm_invoke(messages=messages, seed=seed, allow_tool_use=True)
+        return result
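
To show how the new `OptimizableAgent` base class introduced in 1.0.0 is meant to be used, here is a hedged subclass sketch. Only the class attributes and methods come from the file above; the `ChatPrompt` constructor arguments, the `{question}` placeholder substitution performed by `get_messages()`, and the model id are assumptions for illustration.

```python
# Minimal subclass sketch; the model id, ChatPrompt arguments, and the dataset
# field name are assumptions, not taken from the diff.
from opik_optimizer.optimizable_agent import OptimizableAgent
from opik_optimizer.optimization_config.chat_prompt import ChatPrompt


class QAAgent(OptimizableAgent):
    """Relies on the default LiteLLM-backed llm_invoke() from the base class."""

    model = "openai/gpt-4o-mini"       # hypothetical model id
    model_kwargs = {"temperature": 0}
    project_name = "agent-demo"


prompt = ChatPrompt(                    # assumed constructor signature
    messages=[
        {"role": "system", "content": "Answer concisely."},
        {"role": "user", "content": "{question}"},
    ]
)
agent = QAAgent(prompt)

# invoke_dataset_item() builds messages via prompt.get_messages(dataset_item)
# and falls back to a plain completion when no tools are registered.
print(agent.invoke_dataset_item({"question": "What is 2 + 2?"}))
```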