opik-optimizer 0.9.2__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. opik_optimizer/__init__.py +7 -5
  2. opik_optimizer/_throttle.py +8 -8
  3. opik_optimizer/base_optimizer.py +98 -45
  4. opik_optimizer/cache_config.py +5 -3
  5. opik_optimizer/datasets/ai2_arc.py +15 -13
  6. opik_optimizer/datasets/cnn_dailymail.py +19 -15
  7. opik_optimizer/datasets/election_questions.py +10 -11
  8. opik_optimizer/datasets/gsm8k.py +16 -11
  9. opik_optimizer/datasets/halu_eval.py +6 -5
  10. opik_optimizer/datasets/hotpot_qa.py +17 -16
  11. opik_optimizer/datasets/medhallu.py +10 -7
  12. opik_optimizer/datasets/rag_hallucinations.py +11 -8
  13. opik_optimizer/datasets/ragbench.py +17 -9
  14. opik_optimizer/datasets/tiny_test.py +33 -37
  15. opik_optimizer/datasets/truthful_qa.py +18 -12
  16. opik_optimizer/demo/cache.py +6 -6
  17. opik_optimizer/demo/datasets.py +3 -7
  18. opik_optimizer/evolutionary_optimizer/__init__.py +3 -1
  19. opik_optimizer/evolutionary_optimizer/evolutionary_optimizer.py +722 -429
  20. opik_optimizer/evolutionary_optimizer/reporting.py +155 -74
  21. opik_optimizer/few_shot_bayesian_optimizer/few_shot_bayesian_optimizer.py +271 -188
  22. opik_optimizer/few_shot_bayesian_optimizer/reporting.py +79 -28
  23. opik_optimizer/logging_config.py +19 -15
  24. opik_optimizer/meta_prompt_optimizer/meta_prompt_optimizer.py +209 -129
  25. opik_optimizer/meta_prompt_optimizer/reporting.py +121 -46
  26. opik_optimizer/mipro_optimizer/__init__.py +2 -0
  27. opik_optimizer/mipro_optimizer/_lm.py +38 -9
  28. opik_optimizer/mipro_optimizer/_mipro_optimizer_v2.py +37 -26
  29. opik_optimizer/mipro_optimizer/mipro_optimizer.py +132 -63
  30. opik_optimizer/mipro_optimizer/utils.py +5 -2
  31. opik_optimizer/optimizable_agent.py +179 -0
  32. opik_optimizer/optimization_config/chat_prompt.py +143 -73
  33. opik_optimizer/optimization_config/configs.py +4 -3
  34. opik_optimizer/optimization_config/mappers.py +18 -6
  35. opik_optimizer/optimization_result.py +22 -13
  36. opik_optimizer/py.typed +0 -0
  37. opik_optimizer/reporting_utils.py +89 -58
  38. opik_optimizer/task_evaluator.py +12 -14
  39. opik_optimizer/utils.py +117 -14
  40. {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/METADATA +8 -8
  41. opik_optimizer-1.0.1.dist-info/RECORD +50 -0
  42. opik_optimizer-0.9.2.dist-info/RECORD +0 -48
  43. {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/WHEEL +0 -0
  44. {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/licenses/LICENSE +0 -0
  45. {opik_optimizer-0.9.2.dist-info → opik_optimizer-1.0.1.dist-info}/top_level.txt +0 -0

opik_optimizer/mipro_optimizer/mipro_optimizer.py

@@ -2,6 +2,7 @@ import os
 import random
 from datetime import datetime
 from typing import Callable, Dict, List, Literal, Optional, Union
+import logging
 
 import dspy
 import litellm
@@ -16,7 +17,6 @@ from ..optimization_result import OptimizationResult
 from ..utils import optimization_context
 from ..base_optimizer import BaseOptimizer
 from ..optimization_config.configs import TaskConfig
-from ..optimization_result import OptimizationResult
 from ._lm import LM
 from ._mipro_optimizer_v2 import MIPROv2
 from .utils import (
@@ -30,16 +30,20 @@ from .utils import (
 disk_cache_dir = os.path.expanduser("~/.litellm_cache")
 litellm.cache = Cache(type="disk", disk_cache_dir=disk_cache_dir)
 
-# Set up logging
-import logging
-
 logger = logging.getLogger(__name__)  # Inherits config from setup_logging
 
 
 class MiproOptimizer(BaseOptimizer):
-    def __init__(self, model, project_name: Optional[str] = None, verbose: int = 1, **model_kwargs):
-        super().__init__(model, project_name, verbose=verbose, **model_kwargs)
+    def __init__(
+        self,
+        model,
+        project_name: Optional[str] = None,
+        verbose: int = 1,
+        **model_kwargs,
+    ):
+        super().__init__(model=model, verbose=verbose, **model_kwargs)
         self.tools = []
+        self.project_name = project_name
         self.num_threads = self.model_kwargs.pop("num_threads", 6)
         self.model_kwargs["model"] = self.model
         # FIXME: add mipro_optimizer=True - It does not count the LLM calls made internally by DSPy during MiproOptimizer.optimizer.compile().
@@ -84,7 +88,9 @@ class MiproOptimizer(BaseOptimizer):
         output_key = task_config.output_dataset_field
 
         # Kwargs might contain n_samples, passed from run_benchmark.py
-        n_samples = kwargs.pop("n_samples", None) # Get n_samples from kwargs if present
+        n_samples = kwargs.pop(
+            "n_samples", None
+        )  # Get n_samples from kwargs if present
 
         if isinstance(dataset, str):
             opik_client = opik.Opik(project_name=self.project_name)
@@ -153,28 +159,42 @@ class MiproOptimizer(BaseOptimizer):
         # Robust n_samples handling for selecting dataset_item_ids
         dataset_items_for_eval = dataset.get_items()
         num_total_items = len(dataset_items_for_eval)
-        dataset_item_ids_to_use = dataset_item_ids # Use provided IDs if any
+        dataset_item_ids_to_use = dataset_item_ids  # Use provided IDs if any
 
-        if n_samples is not None: # If n_samples is specified by the caller (run_benchmark.py)
+        if (
+            n_samples is not None
+        ):  # If n_samples is specified by the caller (run_benchmark.py)
             if dataset_item_ids is not None:
                 # This case should ideally be an error or a clear precedence rule.
                 # For now, let's assume if dataset_item_ids is provided, it takes precedence over n_samples.
-                logger.warning("MiproOptimizer.evaluate_prompt: Both n_samples and dataset_item_ids provided. Using provided dataset_item_ids.")
+                logger.warning(
+                    "MiproOptimizer.evaluate_prompt: Both n_samples and dataset_item_ids provided. Using provided dataset_item_ids."
+                )
                 # dataset_item_ids_to_use is already dataset_item_ids
             elif n_samples > num_total_items:
-                logger.warning(f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) > total items ({num_total_items}). Using all {num_total_items} items.")
-                dataset_item_ids_to_use = None # opik.evaluation.evaluate handles None as all items
+                logger.warning(
+                    f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) > total items ({num_total_items}). Using all {num_total_items} items."
+                )
+                dataset_item_ids_to_use = (
+                    None  # opik.evaluation.evaluate handles None as all items
+                )
             elif n_samples <= 0:
-                logger.warning(f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) is <= 0. Using all {num_total_items} items.")
+                logger.warning(
+                    f"MiproOptimizer.evaluate_prompt: n_samples ({n_samples}) is <= 0. Using all {num_total_items} items."
+                )
                 dataset_item_ids_to_use = None
            else:
                 # n_samples is valid and dataset_item_ids was not provided, so sample now.
                 all_ids = [item["id"] for item in dataset_items_for_eval]
                 dataset_item_ids_to_use = random.sample(all_ids, n_samples)
-                logger.info(f"MiproOptimizer.evaluate_prompt: Sampled {n_samples} items for evaluation.")
-        else: # n_samples is None
+                logger.info(
+                    f"MiproOptimizer.evaluate_prompt: Sampled {n_samples} items for evaluation."
+                )
+        else:  # n_samples is None
             if dataset_item_ids is None:
-                logger.info(f"MiproOptimizer.evaluate_prompt: n_samples is None and dataset_item_ids is None. Using all {num_total_items} items.")
+                logger.info(
+                    f"MiproOptimizer.evaluate_prompt: n_samples is None and dataset_item_ids is None. Using all {num_total_items} items."
+                )
             # dataset_item_ids_to_use is already dataset_item_ids (which could be None)
 
         experiment_config = experiment_config or {}
@@ -230,10 +250,10 @@ class MiproOptimizer(BaseOptimizer):
     ) -> OptimizationResult:
         self._opik_client = opik.Opik()
         with optimization_context(
-                client=self._opik_client,
-                dataset_name=dataset.name,
-                objective_name=metric.__name__,
-                metadata={"optimizer": self.__class__.__name__},
+            client=self._opik_client,
+            dataset_name=dataset.name,
+            objective_name=metric.__name__,
+            metadata={"optimizer": self.__class__.__name__},
         ) as optimization:
             result = self._optimize_prompt(
                 dataset=dataset,
@@ -322,7 +342,9 @@ class MiproOptimizer(BaseOptimizer):
         if self.output_key not in row:
             raise Exception("row does not contain output_key: %r" % self.output_key)
 
-        self.trainset = create_dspy_training_set(self.dataset, self.input_key, self.n_samples)
+        self.trainset = create_dspy_training_set(
+            self.dataset, self.input_key, self.n_samples
+        )
         self.data_signature = create_dspy_signature(
             self.input_key, self.output_key, self.prompt
         )
@@ -383,8 +405,10 @@ class MiproOptimizer(BaseOptimizer):
         """
         Continue to look for optimizations
         """
-        if not hasattr(self, 'optimizer') or not self.optimizer:
-            raise RuntimeError("MiproOptimizer not prepared. Call prepare_optimize_prompt first.")
+        if not hasattr(self, "optimizer") or not self.optimizer:
+            raise RuntimeError(
+                "MiproOptimizer not prepared. Call prepare_optimize_prompt first."
+            )
 
         self.results = self.optimizer.compile(
             student=self.module,
@@ -403,16 +427,30 @@ class MiproOptimizer(BaseOptimizer):
         # self.num_candidates is set in prepare_optimize_prompt, defaults to 10
         # If self.num_candidates is 0 or None, this logic might break or be odd.
         # Add a safeguard for num_candidates_per_round if self.num_candidates is not usable.
-        num_candidates_per_round = self.num_candidates if hasattr(self, 'num_candidates') and self.num_candidates and self.num_candidates > 0 else 1
+        num_candidates_per_round = (  # noqa
+            self.num_candidates
+            if hasattr(self, "num_candidates")
+            and self.num_candidates
+            and self.num_candidates > 0
+            else 1
+        )
 
         for i, candidate_data in enumerate(self.results.candidate_programs):
             program_module = candidate_data.get("program")
             instruction = "N/A"
-            if hasattr(program_module, 'signature') and hasattr(program_module.signature, 'instructions'):
+            if hasattr(program_module, "signature") and hasattr(
+                program_module.signature, "instructions"
+            ):
                 instruction = program_module.signature.instructions
-            elif hasattr(program_module, 'extended_signature') and hasattr(program_module.extended_signature, 'instructions'):
+            elif hasattr(program_module, "extended_signature") and hasattr(
+                program_module.extended_signature, "instructions"
+            ):
                 instruction = program_module.extended_signature.instructions
-            elif hasattr(program_module, 'predictor') and hasattr(program_module.predictor, 'signature') and hasattr(program_module.predictor.signature, 'instructions'):
+            elif (
+                hasattr(program_module, "predictor")
+                and hasattr(program_module.predictor, "signature")
+                and hasattr(program_module.predictor.signature, "instructions")
+            ):
                 instruction = program_module.predictor.signature.instructions
 
             # Remove R and C calculation for Mipro as its history is flat
@@ -425,13 +463,11 @@ class MiproOptimizer(BaseOptimizer):
                 # "candidate_in_round": current_candidate_in_round, # Remove candidate_in_round
                 "timestamp": datetime.now().isoformat(),
                 "prompt_candidate": instruction,
-                "parameters_used": {
-                    "program_summary": str(program_module)[:500]
-                },
-                "scores": [], # Initialize scores list
-                "tokens_used": None, # TODO: add tokens_used
-                "cost": None, # TODO: add cost
-                "duration_seconds": None, # TODO: add duration_seconds
+                "parameters_used": {"program_summary": str(program_module)[:500]},
+                "scores": [],  # Initialize scores list
+                "tokens_used": None,  # TODO: add tokens_used
+                "cost": None,  # TODO: add cost
+                "duration_seconds": None,  # TODO: add duration_seconds
             }
 
             current_score = candidate_data.get("score")
@@ -439,70 +475,103 @@ class MiproOptimizer(BaseOptimizer):
 
             # Unscale if it's a known 0-1 metric that MIPRO might scale to 0-100
             # For now, specifically targeting Levenshtein-like metrics
-            if isinstance(current_score, (float, int)) and \
-               ("levenshtein" in metric_name_for_history.lower() or "similarity" in metric_name_for_history.lower()):
+            if isinstance(current_score, (float, int)) and (
+                "levenshtein" in metric_name_for_history.lower()
+                or "similarity" in metric_name_for_history.lower()
+            ):
                 # Assuming scores like 32.4 are 0-1 scores scaled by 100
-                if abs(current_score) > 1.0: # A simple check to see if it looks scaled
-                    logger.debug(f"Mipro history: Unscaling score {current_score} for metric {metric_name_for_history} by dividing by 100.")
+                if abs(current_score) > 1.0:  # A simple check to see if it looks scaled
+                    logger.debug(
+                        f"Mipro history: Unscaling score {current_score} for metric {metric_name_for_history} by dividing by 100."
+                    )
                     current_score /= 100.0
-
-            iter_detail["scores"].append({
-                "metric_name": metric_name_for_history,
-                "score": current_score,
-                "opik_evaluation_id": None # TODO: add opik_evaluation_id
-            })
+
+            iter_detail["scores"].append(
+                {
+                    "metric_name": metric_name_for_history,
+                    "score": current_score,
+                    "opik_evaluation_id": None,  # TODO: add opik_evaluation_id
+                }
+            )
             mipro_history_processed.append(iter_detail)
 
         if not self.best_programs:
             logger.warning("MIPRO compile returned no candidate programs.")
             return OptimizationResult(
                 optimizer="MiproOptimizer",
-                prompt=[{"role": "user", "content": getattr(self, 'prompt', "Error: Initial prompt not found")}],
+                prompt=[
+                    {
+                        "role": "user",
+                        "content": getattr(
+                            self, "prompt", "Error: Initial prompt not found"
+                        ),
+                    }
+                ],
                 score=0.0,
-                metric_name=self.opik_metric.__name__ if hasattr(self, 'opik_metric') else "unknown_metric",
+                metric_name=self.opik_metric.__name__
+                if hasattr(self, "opik_metric")
+                else "unknown_metric",
                 details={"error": "No candidate programs generated by MIPRO"},
                 history=mipro_history_processed,
-                llm_calls=self.lm.llm_call_counter
+                llm_calls=self.lm.llm_call_counter,
             )
 
         self.module = self.get_best().details["program"]
         best_program_details = self.get_best()
-
+
         # Unscale the main score if necessary, similar to history scores
         final_best_score = best_program_details.score
         final_metric_name = best_program_details.metric_name
-        if isinstance(final_best_score, (float, int)) and \
-           final_metric_name and \
-           ("levenshtein" in final_metric_name.lower() or "similarity" in final_metric_name.lower()):
-            if abs(final_best_score) > 1.0: # A simple check to see if it looks scaled
-                logger.debug(f"Mipro main result: Unscaling score {final_best_score} for metric {final_metric_name} by dividing by 100.")
+        if (
+            isinstance(final_best_score, (float, int))
+            and final_metric_name
+            and (
+                "levenshtein" in final_metric_name.lower()
+                or "similarity" in final_metric_name.lower()
+            )
+        ):
+            if abs(final_best_score) > 1.0:  # A simple check to see if it looks scaled
+                logger.debug(
+                    f"Mipro main result: Unscaling score {final_best_score} for metric {final_metric_name} by dividing by 100."
+                )
                final_best_score /= 100.0
 
         return OptimizationResult(
             optimizer="MiproOptimizer",
             prompt=best_program_details.prompt,
             tool_prompts=best_program_details.tool_prompts,
-            score=final_best_score, # Use the potentially unscaled score
+            score=final_best_score,  # Use the potentially unscaled score
             metric_name=final_metric_name,
             demonstrations=best_program_details.demonstrations,
             details=best_program_details.details,
             history=mipro_history_processed,
-            llm_calls=self.lm.llm_call_counter
+            llm_calls=self.lm.llm_call_counter,
         )
 
     def get_best(self, position: int = 0) -> OptimizationResult:
-        if not hasattr(self, 'best_programs') or not self.best_programs:
-            logger.error("get_best() called but no best_programs found. MIPRO compile might have failed or yielded no results.")
+        if not hasattr(self, "best_programs") or not self.best_programs:
+            logger.error(
+                "get_best() called but no best_programs found. MIPRO compile might have failed or yielded no results."
+            )
            return OptimizationResult(
                 optimizer="MiproOptimizer",
-                prompt=[{"role": "user", "content": getattr(self, 'prompt', "Error: Initial prompt not found")}],
-                score=0.0,
-                metric_name=getattr(self, 'opik_metric', None).name if hasattr(self, 'opik_metric') and self.opik_metric else "unknown_metric",
+                prompt=[
+                    {
+                        "role": "user",
+                        "content": getattr(
+                            self, "prompt", "Error: Initial prompt not found"
+                        ),
+                    }
+                ],
+                score=0.0,
+                metric_name=getattr(self, "opik_metric", None).name
+                if hasattr(self, "opik_metric") and self.opik_metric
+                else "unknown_metric",
                 details={"error": "No programs generated or compile failed"},
                 history=[],
-                llm_calls=self.lm.llm_call_counter
+                llm_calls=self.lm.llm_call_counter,
             )
-
+
         score = self.best_programs[position]["score"]
         program_module = self.best_programs[position]["program"]
         state = program_module.dump_state()
@@ -527,5 +596,5 @@ class MiproOptimizer(BaseOptimizer):
             metric_name=self.opik_metric.__name__,
             demonstrations=demos,
             details={"program": program_module},
-            llm_calls=self.lm.llm_call_counter
+            llm_calls=self.lm.llm_call_counter,
         )
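
The constructor change above is the main API-visible edit in this file: project_name is no longer forwarded to BaseOptimizer.__init__ and is instead stored on the MiproOptimizer instance, while the remaining keyword arguments are still collected into model_kwargs. A minimal construction sketch, assuming the class is importable from the package root as in earlier releases; the model id, project name, and extra keyword argument below are illustrative placeholders, not values from the diff:

    from opik_optimizer import MiproOptimizer  # import path assumed from the package layout above

    optimizer = MiproOptimizer(
        model="openai/gpt-4o-mini",   # illustrative LiteLLM-style model id
        project_name="my-project",    # now kept on the instance rather than passed to BaseOptimizer
        verbose=1,
        temperature=0.1,              # remaining kwargs end up in model_kwargs
    )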

opik_optimizer/mipro_optimizer/utils.py

@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Tuple, Union, Optional
+from typing import Dict, Optional
 
 import uuid
 import dspy
@@ -46,7 +46,10 @@ def opik_metric_to_dspy(metric, output):
     def opik_metric_score_wrapper(example, prediction, trace=None):
         try:
             # Calculate the score using the metric
-            score_result = metric(dataset_item=example.toDict(), llm_output=getattr(prediction, answer_field, ""))
+            score_result = metric(
+                dataset_item=example.toDict(),
+                llm_output=getattr(prediction, answer_field, ""),
+            )
             return (
                 score_result.value if hasattr(score_result, "value") else score_result
             )
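
For reference, the reformatted call above sits inside opik_metric_to_dspy, which adapts an Opik-style metric (called with dataset_item and llm_output) into a DSPy-compatible scoring callback. A hedged sketch of the metric-side calling convention the wrapper relies on; the my_exact_match function and its field name are illustrative, not part of the package:

    def my_exact_match(dataset_item: dict, llm_output: str) -> float:
        # The wrapper accepts either a plain number or an object exposing `.value`
        # (see the `hasattr(score_result, "value")` check in the context lines above).
        return float(dataset_item["answer"].strip() == llm_output.strip())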

opik_optimizer/optimizable_agent.py (new file)

@@ -0,0 +1,179 @@
+from typing import Dict, Any, List, Optional, TYPE_CHECKING
+import json
+import os
+
+
+from opik.opik_context import get_current_span_data
+
+import litellm
+from litellm.integrations.opik.opik import OpikLogger
+
+from . import _throttle
+
+_limiter = _throttle.get_rate_limiter_for_current_opik_installation()
+
+if TYPE_CHECKING:
+    from .optimization_config.chat_prompt import ChatPrompt
+
+
+def tools_to_dict(tools: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
+    retval = {}
+    for name in tools:
+        parts = {}
+        for part in tools[name]:
+            if isinstance(tools[name][part], (int, float, str)):
+                parts[part] = tools[name][part]
+        if parts:
+            retval[name] = parts
+    return retval
+
+
+class OptimizableAgent:
+    """
+    An agent class to subclass to make an Optimizable Agent.
+
+    Attributes:
+        model (Optional[str]): The model to use for the agent
+        model_kwargs (Dict[str, Any]): Additional keyword arguments for the model
+        project_name (Optional[str]): The project name for tracking
+    """
+
+    model: Optional[str] = None
+    model_kwargs: Dict[str, Any] = {}
+    project_name: Optional[str] = "Default Project"
+    input_dataset_field: Optional[str] = None
+    prompts: Dict[str, "ChatPrompt"]
+    prompt: "ChatPrompt"
+
+    def __init__(self, prompt: "ChatPrompt") -> None:
+        """
+        Initialize the OptimizableAgent.
+
+        Args:
+            prompt: a chat prompt
+        """
+        self.init_llm()
+        self.init_agent(prompt)
+
+    def init_llm(self) -> None:
+        """Initialize the LLM with the appropriate callbacks."""
+        # Litellm bug requires this (maybe problematic if multi-threaded)
+        os.environ["OPIK_PROJECT_NAME"] = str(self.project_name)
+        self.opik_logger = OpikLogger()
+        litellm.callbacks = [self.opik_logger]
+
+    def init_agent(self, prompt: "ChatPrompt") -> None:
+        """Initialize the agent with the provided configuration."""
+        # Register the tools, if any, for default LiteLLM Agent use:
+        self.prompt = prompt
+
+    @_throttle.rate_limited(_limiter)
+    def _llm_complete(
+        self,
+        messages: List[Dict[str, str]],
+        tools: Optional[List[Dict[str, str]]],
+        seed: int,
+    ) -> Any:
+        response = litellm.completion(
+            model=self.model,
+            messages=messages,
+            seed=seed,
+            tools=tools,
+            metadata={
+                "opik": {
+                    "current_span_data": get_current_span_data(),
+                },
+            },
+            **self.model_kwargs,
+        )
+        return response
+
+    def llm_invoke(
+        self,
+        query: Optional[str] = None,
+        messages: Optional[List[Dict[str, str]]] = None,
+        seed: Optional[int] = None,
+        allow_tool_use: Optional[bool] = False,
+    ) -> str:
+        """
+        NOTE: this is the default LiteLLM API. It is used
+        internally for the LiteLLM Agent.
+
+        Invoke the LLM with the provided query or messages.
+
+        Args:
+            query (Optional[str]): The query to send to the LLM
+            messages (Optional[List[Dict[str, str]]]): Messages to send to the LLM
+            seed (Optional[int]): Seed for reproducibility
+            allow_tool_use: If True, allow LLM to use tools
+
+        Returns:
+            str: The LLM's response
+        """
+        all_messages = []
+        if messages is not None:
+            all_messages.extend(messages)
+
+        if query is not None:
+            all_messages.append({"role": "user", "content": query})
+
+        if allow_tool_use and self.prompt.tools:
+            # Tool-calling loop
+            final_response = "I was unable to find the desired information."
+            count = 0
+            while count < 20:
+                count += 1
+                response = self._llm_complete(all_messages, self.prompt.tools, seed)
+                msg = response.choices[0].message
+                all_messages.append(msg.to_dict())
+                if msg.tool_calls:
+                    for tool_call in msg["tool_calls"]:
+                        tool_name = tool_call["function"]["name"]
+                        arguments = json.loads(tool_call["function"]["arguments"])
+                        tool_func = self.prompt.function_map.get(tool_name)
+                        try:
+                            tool_result = (
+                                tool_func(**arguments)
+                                if tool_func is not None
+                                else "Unknown tool"
+                            )
+                        except Exception:
+                            tool_result = f"Error in calling tool `{tool_name}`"
+                        all_messages.append(
+                            {
+                                "role": "tool",
+                                "tool_call_id": tool_call["id"],
+                                "content": str(tool_result),
+                            }
+                        )
+                else:
+                    final_response = msg["content"]
+                    break
+            result = final_response
+        else:
+            response = self._llm_complete(all_messages, None, seed)
+            result = response.choices[0].message.content
+        return result
+
+    def invoke_dataset_item(self, dataset_item: Dict[str, str]) -> str:
+        messages = self.prompt.get_messages(dataset_item)
+        return self.invoke(messages)
+
+    def invoke(
+        self,
+        messages: List[Dict[str, str]],
+        seed: Optional[int] = None,
+    ) -> str:
+        """
+        Invoke the agent with a dataset item.
+
+        Args:
+            dataset_item (Dict[str, Any]): The dataset item to process
+            seed (Optional[int]): Seed for reproducibility
+
+        Returns:
+            Dict[str, Any]: The agent's response
+        """
+        # Replace with agent invocation:
+        result = self.llm_invoke(messages=messages, seed=seed, allow_tool_use=True)
+        return result
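
The new optimizable_agent.py module above defines OptimizableAgent, the base class that users can subclass to plug their own agent into the 1.0.x optimizers. A minimal subclass sketch based only on the attributes and methods shown in the diff; the ChatPrompt constructor arguments are assumptions to be checked against optimization_config/chat_prompt.py, which also changed in this release:

    from opik_optimizer.optimizable_agent import OptimizableAgent
    from opik_optimizer.optimization_config.chat_prompt import ChatPrompt


    class MyAgent(OptimizableAgent):
        # Class-level defaults defined by OptimizableAgent above.
        model = "openai/gpt-4o-mini"          # illustrative LiteLLM-style model id
        model_kwargs = {"temperature": 0.0}
        project_name = "my-project"


    # Hypothetical prompt; verify the real ChatPrompt signature in chat_prompt.py.
    prompt = ChatPrompt(system="Answer concisely.", user="{question}")
    agent = MyAgent(prompt)

    # invoke() takes a list of chat messages and returns the model's reply (see above).
    answer = agent.invoke([{"role": "user", "content": "What does the optimizer do?"}])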