swarms 7.8.9-py3-none-any.whl → 7.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
swarms/structs/agent.py CHANGED
@@ -56,7 +56,6 @@ from swarms.tools.base_tool import BaseTool
 from swarms.tools.py_func_to_openai_func_str import (
     convert_multiple_functions_to_openai_function_schema,
 )
-from swarms.utils.any_to_str import any_to_str
 from swarms.utils.data_to_text import data_to_text
 from swarms.utils.file_processing import create_file_in_folder
 from swarms.utils.formatter import formatter
@@ -288,6 +287,11 @@ class Agent:
     >>> print(response)
     >>> # Generate a report on the financials.
 
+    >>> # Real-time streaming example
+    >>> agent = Agent(llm=llm, max_loops=1, streaming_on=True)
+    >>> response = agent.run("Tell me a long story.") # Will stream in real-time
+    >>> print(response) # Final complete response
+
     """
 
     def __init__(
@@ -404,7 +408,7 @@ class Agent:
         llm_args: dict = None,
         load_state_path: str = None,
         role: agent_roles = "worker",
-        no_print: bool = False,
+        print_on: bool = True,
         tools_list_dictionary: Optional[List[Dict[str, Any]]] = None,
         mcp_url: Optional[Union[str, MCPConnection]] = None,
         mcp_urls: List[str] = None,
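
Note on the renamed flag: `no_print: bool = False` becomes `print_on: bool = True`, so the polarity is inverted and printing is now the default. A minimal migration sketch (the model name and remaining arguments are illustrative, not part of the diff):

```python
from swarms import Agent

# swarms 7.8.9: suppress panel output
# agent = Agent(model_name="gpt-4o-mini", max_loops=1, no_print=True)

# swarms 7.9.0: the equivalent call with the inverted flag
agent = Agent(model_name="gpt-4o-mini", max_loops=1, print_on=False)
```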
@@ -420,6 +424,7 @@ class Agent:
         rag_config: Optional[RAGConfig] = None,
         tool_call_summary: bool = True,
         output_raw_json_from_tool_call: bool = False,
+        summarize_multiple_images: bool = False,
         *args,
         **kwargs,
     ):
@@ -540,7 +545,7 @@ class Agent:
         self.llm_args = llm_args
         self.load_state_path = load_state_path
         self.role = role
-        self.no_print = no_print
+        self.print_on = print_on
         self.tools_list_dictionary = tools_list_dictionary
         self.mcp_url = mcp_url
         self.mcp_urls = mcp_urls
@@ -558,6 +563,7 @@ class Agent:
         self.output_raw_json_from_tool_call = (
             output_raw_json_from_tool_call
         )
+        self.summarize_multiple_images = summarize_multiple_images
 
         # self.short_memory = self.short_memory_init()
 
@@ -630,16 +636,20 @@ class Agent:
             )
 
         self.short_memory.add(
-            role=f"{self.agent_name}",
+            role=self.agent_name,
             content=self.tools_list_dictionary,
         )
 
     def short_memory_init(self):
-        if (
-            self.agent_name is not None
-            or self.agent_description is not None
-        ):
-            prompt = f"\n Your Name: {self.agent_name} \n\n Your Description: {self.agent_description} \n\n {self.system_prompt}"
+        prompt = ""
+
+        # Add agent name, description, and instructions to the prompt
+        if self.agent_name is not None:
+            prompt += f"\n Name: {self.agent_name}"
+        elif self.agent_description is not None:
+            prompt += f"\n Description: {self.agent_description}"
+        elif self.system_prompt is not None:
+            prompt += f"\n Instructions: {self.system_prompt}"
         else:
             prompt = self.system_prompt
 
@@ -810,6 +820,29 @@ class Agent:
 
         return json.loads(self.tools_list_dictionary)
 
+    def check_model_supports_utilities(self, img: str = None) -> bool:
+        """
+        Check if the current model supports vision capabilities.
+
+        Args:
+            img (str, optional): Image input to check vision support for. Defaults to None.
+
+        Returns:
+            bool: True if model supports vision and image is provided, False otherwise.
+        """
+        from litellm.utils import supports_vision
+
+        # Only check vision support if an image is provided
+        if img is not None:
+            out = supports_vision(self.model_name)
+            if not out:
+                raise ValueError(
+                    f"Model {self.model_name} does not support vision capabilities. Please use a vision-enabled model."
+                )
+            return out
+
+        return False
+
     def check_if_no_prompt_then_autogenerate(self, task: str = None):
         """
         Checks if auto_generate_prompt is enabled and generates a prompt by combining agent name, description and system prompt if available.
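
The new `check_model_supports_utilities` helper defers to litellm's `supports_vision` capability map and raises when an image is supplied to a non-vision model. A hedged sketch of the resulting behavior (model names are illustrative):

```python
from swarms import Agent

vision_agent = Agent(model_name="gpt-4o", max_loops=1)
vision_agent.check_model_supports_utilities(img="chart.png")  # returns True

text_agent = Agent(model_name="gpt-3.5-turbo", max_loops=1)
try:
    text_agent.check_model_supports_utilities(img="chart.png")
except ValueError as e:
    print(e)  # asks for a vision-enabled model

text_agent.check_model_supports_utilities()  # no image -> returns False
```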
@@ -931,12 +964,7 @@ class Agent:
         self,
         task: Optional[Union[str, Any]] = None,
         img: Optional[str] = None,
-        speech: Optional[str] = None,
-        video: Optional[str] = None,
-        is_last: Optional[bool] = False,
         print_task: Optional[bool] = False,
-        generate_speech: Optional[bool] = False,
-        correct_answer: Optional[str] = None,
         *args,
         **kwargs,
     ) -> Any:
@@ -961,9 +989,12 @@ class Agent:
 
        self.check_if_no_prompt_then_autogenerate(task)
 
+        if img is not None:
+            self.check_model_supports_utilities(img=img)
+
        self.short_memory.add(role=self.user_name, content=task)
 
-        if self.plan_enabled or self.planning_prompt is not None:
+        if self.plan_enabled is True:
            self.plan(task)
 
        # Set the loop count
@@ -1030,12 +1061,23 @@ class Agent:
                        )
                        self.memory_query(task_prompt)
 
-                    response = self.call_llm(
-                        task=task_prompt, img=img, *args, **kwargs
-                    )
-
-                    print(f"Response: {response}")
+                    if img is not None:
+                        response = self.call_llm(
+                            task=task_prompt,
+                            img=img,
+                            current_loop=loop_count,
+                            *args,
+                            **kwargs,
+                        )
+                    else:
+                        response = self.call_llm(
+                            task=task_prompt,
+                            current_loop=loop_count,
+                            *args,
+                            **kwargs,
+                        )
 
+                    # Parse the response from the agent with the output type
                    if exists(self.tools_list_dictionary):
                        if isinstance(response, BaseModel):
                            response = response.model_dump()
@@ -1053,18 +1095,22 @@ class Agent:
 
                    # Check and execute callable tools
                    if exists(self.tools):
-
                        if (
                            self.output_raw_json_from_tool_call
                            is True
                        ):
-                            print(type(response))
                            response = response
                        else:
-                            self.execute_tools(
-                                response=response,
-                                loop_count=loop_count,
-                            )
+                            # Only execute tools if response is not None
+                            if response is not None:
+                                self.execute_tools(
+                                    response=response,
+                                    loop_count=loop_count,
+                                )
+                            else:
+                                logger.warning(
+                                    f"LLM returned None response in loop {loop_count}, skipping tool execution"
+                                )
 
                    # Handle MCP tools
                    if (
@@ -1072,10 +1118,16 @@ class Agent:
                        or exists(self.mcp_config)
                        or exists(self.mcp_urls)
                    ):
-                        self.mcp_tool_handling(
-                            response=response,
-                            current_loop=loop_count,
-                        )
+                        # Only handle MCP tools if response is not None
+                        if response is not None:
+                            self.mcp_tool_handling(
+                                response=response,
+                                current_loop=loop_count,
+                            )
+                        else:
+                            logger.warning(
+                                f"LLM returned None response in loop {loop_count}, skipping MCP tool handling"
+                            )
 
                    self.sentiment_and_evaluator(response)
 
@@ -1130,7 +1182,10 @@ class Agent:
                            user_input.lower()
                            == self.custom_exit_command.lower()
                        ):
-                            print("Exiting as per user request.")
+                            self.pretty_print(
+                                "Exiting as per user request.",
+                                loop_count=loop_count,
+                            )
                            break
 
                        self.short_memory.add(
@@ -1231,12 +1286,6 @@ class Agent:
        self,
        task: Optional[str] = None,
        img: Optional[str] = None,
-        is_last: bool = False,
-        device: str = "cpu",  # gpu
-        device_id: int = 1,
-        all_cores: bool = True,
-        do_not_use_cluster_ops: bool = True,
-        all_gpus: bool = False,
        *args,
        **kwargs,
    ) -> Any:
@@ -1245,10 +1294,6 @@ class Agent:
        Args:
            task (Optional[str]): The task to be performed. Defaults to None.
            img (Optional[str]): The image to be processed. Defaults to None.
-            is_last (bool): Indicates if this is the last task. Defaults to False.
-            device (str): The device to use for execution. Defaults to "cpu".
-            device_id (int): The ID of the GPU to use if device is set to "gpu". Defaults to 0.
-            all_cores (bool): If True, uses all available CPU cores. Defaults to True.
        """
        try:
            return self.run(
@@ -1339,10 +1384,15 @@ class Agent:
            # Get the current conversation history
            history = self.short_memory.get_str()
 
+            plan_prompt = f"Create a comprehensive step-by-step plan to complete the following task: \n\n {task}"
+
            # Construct the planning prompt by combining history, planning prompt, and task
-            planning_prompt = (
-                f"{history}\n\n{self.planning_prompt}\n\nTask: {task}"
-            )
+            if exists(self.planning_prompt):
+                planning_prompt = f"{history}\n\n{self.planning_prompt}\n\nTask: {task}"
+            else:
+                planning_prompt = (
+                    f"{history}\n\n{plan_prompt}\n\nTask: {task}"
+                )
 
            # Generate the plan using the LLM
            plan = self.llm.run(task=planning_prompt, *args, **kwargs)
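
With this fallback, `plan_enabled=True` no longer requires a custom `planning_prompt`; absent one, the generic step-by-step instruction above is used. A sketch (model name and task assumed):

```python
from swarms import Agent

# With no planning_prompt set, plan() now falls back to the built-in
# "Create a comprehensive step-by-step plan..." instruction before the main loop.
agent = Agent(model_name="gpt-4o-mini", max_loops=1, plan_enabled=True)
agent.run("Produce a three-section market overview for electric bikes.")
```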
@@ -1350,9 +1400,6 @@ class Agent:
            # Store the generated plan in short-term memory
            self.short_memory.add(role=self.agent_name, content=plan)
 
-            logger.info(
-                f"Successfully created plan for task: {task[:50]}..."
-            )
            return None
 
        except Exception as error:
@@ -1477,10 +1524,13 @@ class Agent:
                f"The model '{self.model_name}' does not support function calling. Please use a model that supports function calling."
            )
 
-        if self.max_tokens > get_max_tokens(self.model_name):
-            raise AgentInitializationError(
-                f"Max tokens is set to {self.max_tokens}, but the model '{self.model_name}' only supports {get_max_tokens(self.model_name)} tokens. Please set max tokens to {get_max_tokens(self.model_name)} or less."
-            )
+        try:
+            if self.max_tokens > get_max_tokens(self.model_name):
+                raise AgentInitializationError(
+                    f"Max tokens is set to {self.max_tokens}, but the model '{self.model_name}' only supports {get_max_tokens(self.model_name)} tokens. Please set max tokens to {get_max_tokens(self.model_name)} or less."
+                )
+        except Exception:
+            pass
 
        if self.model_name not in model_list:
            logger.warning(
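
Wrapping the comparison in try/except keeps initialization from failing for model names that litellm cannot look up. A sketch of the litellm behavior this guards against (the custom model name is hypothetical, and litellm is assumed to raise for unmapped names):

```python
from litellm import get_max_tokens

print(get_max_tokens("gpt-4o-mini"))  # mapped models return their token limit

try:
    get_max_tokens("my-custom-finetune")  # unmapped name; assumed to raise
except Exception as e:
    print(f"lookup failed, max-token check skipped: {e}")
```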
@@ -2424,7 +2474,12 @@ class Agent:
        return None
 
    def call_llm(
-        self, task: str, img: Optional[str] = None, *args, **kwargs
+        self,
+        task: str,
+        img: Optional[str] = None,
+        current_loop: int = 0,
+        *args,
+        **kwargs,
    ) -> str:
        """
        Calls the appropriate method on the `llm` object based on the given task.
@@ -2446,14 +2501,81 @@ class Agent:
        """
 
        try:
-            if img is not None:
-                out = self.llm.run(
-                    task=task, img=img, *args, **kwargs
-                )
+            # Set streaming parameter in LLM if streaming is enabled
+            if self.streaming_on and hasattr(self.llm, "stream"):
+                original_stream = self.llm.stream
+                self.llm.stream = True
+
+                if img is not None:
+                    streaming_response = self.llm.run(
+                        task=task, img=img, *args, **kwargs
+                    )
+                else:
+                    streaming_response = self.llm.run(
+                        task=task, *args, **kwargs
+                    )
+
+                # If we get a streaming response, handle it with the new streaming panel
+                if hasattr(
+                    streaming_response, "__iter__"
+                ) and not isinstance(streaming_response, str):
+                    # Check print_on parameter for different streaming behaviors
+                    if self.print_on is False:
+                        # Silent streaming - no printing, just collect chunks
+                        chunks = []
+                        for chunk in streaming_response:
+                            if (
+                                hasattr(chunk, "choices")
+                                and chunk.choices[0].delta.content
+                            ):
+                                content = chunk.choices[
+                                    0
+                                ].delta.content
+                                chunks.append(content)
+                        complete_response = "".join(chunks)
+                    else:
+                        # Collect chunks for conversation saving
+                        collected_chunks = []
+
+                        def on_chunk_received(chunk: str):
+                            """Callback to collect chunks as they arrive"""
+                            collected_chunks.append(chunk)
+                            # Optional: Save each chunk to conversation in real-time
+                            # This creates a more detailed conversation history
+                            if self.verbose:
+                                logger.debug(
+                                    f"Streaming chunk received: {chunk[:50]}..."
+                                )
+
+                        # Use the streaming panel to display and collect the response
+                        complete_response = formatter.print_streaming_panel(
+                            streaming_response,
+                            title=f"🤖 Agent: {self.agent_name} Loops: {current_loop}",
+                            style=None,  # Use random color like non-streaming approach
+                            collect_chunks=True,
+                            on_chunk_callback=on_chunk_received,
+                        )
+
+                    # Restore original stream setting
+                    self.llm.stream = original_stream
+
+                    # Return the complete response for further processing
+                    return complete_response
+                else:
+                    # Restore original stream setting
+                    self.llm.stream = original_stream
+                    return streaming_response
            else:
-                out = self.llm.run(task=task, *args, **kwargs)
+                # Non-streaming call
+                if img is not None:
+                    out = self.llm.run(
+                        task=task, img=img, *args, **kwargs
+                    )
+                else:
+                    out = self.llm.run(task=task, *args, **kwargs)
+
+                return out
 
-            return out
        except AgentLLMError as e:
            logger.error(
                f"Error calling LLM: {e}. Task: {task}, Args: {args}, Kwargs: {kwargs}"
@@ -2479,7 +2601,8 @@ class Agent:
        self,
        task: Optional[Union[str, Any]] = None,
        img: Optional[str] = None,
-        scheduled_run_date: Optional[datetime] = None,
+        imgs: Optional[List[str]] = None,
+        correct_answer: Optional[str] = None,
        *args,
        **kwargs,
    ) -> Any:
@@ -2493,11 +2616,7 @@ class Agent:
        Args:
            task (Optional[str], optional): The task to be executed. Defaults to None.
            img (Optional[str], optional): The image to be processed. Defaults to None.
-            device (str, optional): The device to use for execution. Defaults to "cpu".
-            device_id (int, optional): The ID of the GPU to use if device is set to "gpu". Defaults to 0.
-            all_cores (bool, optional): If True, uses all available CPU cores. Defaults to True.
-            scheduled_run_date (Optional[datetime], optional): The date and time to schedule the task. Defaults to None.
-            do_not_use_cluster_ops (bool, optional): If True, does not use cluster ops. Defaults to False.
+            imgs (Optional[List[str]], optional): The list of images to be processed. Defaults to None.
            *args: Additional positional arguments to be passed to the execution method.
            **kwargs: Additional keyword arguments to be passed to the execution method.
 
@@ -2510,21 +2629,28 @@ class Agent:
        """
 
        if not isinstance(task, str):
-            task = any_to_str(task)
-
-        if scheduled_run_date:
-            while datetime.now() < scheduled_run_date:
-                time.sleep(
-                    1
-                )  # Sleep for a short period to avoid busy waiting
+            task = format_data_structure(task)
 
        try:
-            output = self._run(
-                task=task,
-                img=img,
-                *args,
-                **kwargs,
-            )
+            if exists(imgs):
+                output = self.run_multiple_images(
+                    task=task, imgs=imgs, *args, **kwargs
+                )
+            elif exists(correct_answer):
+                output = self.continuous_run_with_answer(
+                    task=task,
+                    img=img,
+                    correct_answer=correct_answer,
+                    *args,
+                    **kwargs,
+                )
+            else:
+                output = self._run(
+                    task=task,
+                    img=img,
+                    *args,
+                    **kwargs,
+                )
 
            return output
 
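
`run` now dispatches on its new keyword arguments: `imgs` routes to `run_multiple_images`, `correct_answer` routes to `continuous_run_with_answer`, and everything else falls through to `_run` (the `scheduled_run_date` busy-wait is removed). Illustrative calls; the file names and expected answer are placeholders:

```python
from swarms import Agent

agent = Agent(model_name="gpt-4o", max_loops=1)

# Fan one task out across several images (processed concurrently).
captions = agent.run(
    task="Describe what you see in this image",
    imgs=["slide1.png", "slide2.png"],
)

# Re-run until the response contains the expected substring (up to 10 attempts).
answer = agent.run(
    task="What is the capital of France?",
    correct_answer="Paris",
)
```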
@@ -2664,14 +2790,12 @@ class Agent:
        return self.role
 
    def pretty_print(self, response: str, loop_count: int):
-        if self.no_print is False:
+        if self.print_on is False:
            if self.streaming_on is True:
-                # self.stream_response(response)
-                formatter.print_panel_token_by_token(
-                    f"{self.agent_name}: {response}",
-                    title=f"Agent Name: {self.agent_name} [Max Loops: {loop_count}]",
-                )
-            elif self.no_print is True:
+                # Skip printing here since real streaming is handled in call_llm
+                # This avoids double printing when streaming_on=True
+                pass
+            elif self.print_on is False:
                pass
            else:
                # logger.info(f"Response: {response}")
@@ -2781,7 +2905,7 @@ class Agent:
            )
            # tool_response = format_data_structure(tool_response)
 
-            print(f"Multiple MCP Tool Response: {tool_response}")
+            # print(f"Multiple MCP Tool Response: {tool_response}")
        else:
            raise AgentMCPConnectionError(
                "mcp_url must be either a string URL or MCPConnection object"
@@ -2791,7 +2915,7 @@ class Agent:
        # execute_tool_call_simple returns a string directly, not an object with content attribute
        text_content = f"MCP Tool Response: \n\n {json.dumps(tool_response, indent=2)}"
 
-        if self.no_print is False:
+        if self.print_on is False:
            formatter.print_panel(
                text_content,
                "MCP Tool Response: 🛠️",
@@ -2834,7 +2958,7 @@ class Agent:
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            system_prompt=self.system_prompt,
-            stream=self.streaming_on,
+            stream=False,  # Always disable streaming for tool summaries
            tools_list_dictionary=None,
            parallel_tool_calls=False,
            base_url=self.llm_base_url,
@@ -2842,6 +2966,13 @@ class Agent:
        )
 
    def execute_tools(self, response: any, loop_count: int):
+        # Handle None response gracefully
+        if response is None:
+            logger.warning(
+                f"Cannot execute tools with None response in loop {loop_count}. "
+                "This may indicate the LLM did not return a valid response."
+            )
+            return
 
        output = (
            self.tool_struct.execute_function_calls_from_api_response(
@@ -2888,3 +3019,134 @@ class Agent:
 
    def list_output_types(self):
        return OutputType
+
+    def run_multiple_images(
+        self, task: str, imgs: List[str], *args, **kwargs
+    ):
+        """
+        Run the agent with multiple images using concurrent processing.
+
+        Args:
+            task (str): The task to be performed on each image.
+            imgs (List[str]): List of image paths or URLs to process.
+            *args: Additional positional arguments to pass to the agent's run method.
+            **kwargs: Additional keyword arguments to pass to the agent's run method.
+
+        Returns:
+            List[Any]: A list of outputs generated for each image in the same order as the input images.
+
+        Examples:
+            >>> agent = Agent()
+            >>> outputs = agent.run_multiple_images(
+            ...     task="Describe what you see in this image",
+            ...     imgs=["image1.jpg", "image2.png", "image3.jpeg"]
+            ... )
+            >>> print(f"Processed {len(outputs)} images")
+            Processed 3 images
+
+        Raises:
+            Exception: If an error occurs while processing any of the images.
+        """
+        # Calculate number of workers as 95% of available CPU cores
+        cpu_count = os.cpu_count()
+        max_workers = max(1, int(cpu_count * 0.95))
+
+        # Use ThreadPoolExecutor for concurrent processing
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            # Submit all image processing tasks
+            future_to_img = {
+                executor.submit(
+                    self.run, task=task, img=img, *args, **kwargs
+                ): img
+                for img in imgs
+            }
+
+            # Collect results in order
+            outputs = []
+            for future in future_to_img:
+                try:
+                    output = future.result()
+                    outputs.append(output)
+                except Exception as e:
+                    logger.error(f"Error processing image: {e}")
+                    outputs.append(
+                        None
+                    )  # or raise the exception based on your preference
+
+        # Combine the outputs into a single string if summarization is enabled
+        if self.summarize_multiple_images is True:
+            output = "\n".join(outputs)
+
+            prompt = f"""
+            You have already analyzed {len(outputs)} images and provided detailed descriptions for each one.
+            Now, based on your previous analysis of these images, create a comprehensive report that:
+
+            1. Synthesizes the key findings across all images
+            2. Identifies common themes, patterns, or relationships between the images
+            3. Provides an overall summary that captures the most important insights
+            4. Highlights any notable differences or contrasts between the images
+
+            Here are your previous analyses of the images:
+            {output}
+
+            Please create a well-structured report that brings together your insights from all {len(outputs)} images.
+            """
+
+            outputs = self.run(task=prompt, *args, **kwargs)
+
+        return outputs
+
+    def continuous_run_with_answer(
+        self,
+        task: str,
+        img: Optional[str] = None,
+        correct_answer: str = None,
+        max_attempts: int = 10,
+    ):
+        """
+        Run the agent with the task until the correct answer is provided.
+
+        Args:
+            task (str): The task to be performed
+            correct_answer (str): The correct answer that must be found in the response
+            max_attempts (int): Maximum number of attempts before giving up (default: 10)
+
+        Returns:
+            str: The response containing the correct answer
+
+        Raises:
+            Exception: If max_attempts is reached without finding the correct answer
+        """
+        attempts = 0
+
+        while attempts < max_attempts:
+            attempts += 1
+
+            if self.verbose:
+                logger.info(
+                    f"Attempt {attempts}/{max_attempts} to find correct answer"
+                )
+
+            response = self._run(task=task, img=img)
+
+            # Check if the correct answer is in the response (case-insensitive)
+            if correct_answer.lower() in response.lower():
+                if self.verbose:
+                    logger.info(
+                        f"Correct answer found on attempt {attempts}"
+                    )
+                return response
+            else:
+                # Add feedback to help guide the agent
+                feedback = "Your previous response was incorrect. Think carefully about the question and ensure your response directly addresses what was asked."
+                self.short_memory.add(role="User", content=feedback)
+
+                if self.verbose:
+                    logger.info(
+                        f"Correct answer not found. Expected: '{correct_answer}'"
+                    )
+
+        # If we reach here, we've exceeded max_attempts
+        raise Exception(
+            f"Failed to find correct answer '{correct_answer}' after {max_attempts} attempts"
+        )