vision-agent 0.2.230__tar.gz → 0.2.232__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. {vision_agent-0.2.230 → vision_agent-0.2.232}/PKG-INFO +3 -11
  2. {vision_agent-0.2.230 → vision_agent-0.2.232}/README.md +2 -10
  3. {vision_agent-0.2.230 → vision_agent-0.2.232}/pyproject.toml +1 -1
  4. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/.sim_tools/df.csv +12 -10
  5. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/agent_utils.py +4 -2
  6. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/types.py +1 -0
  7. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder_prompts_v2.py +1 -1
  8. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder_v2.py +15 -1
  9. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner_prompts_v2.py +12 -8
  10. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner_v2.py +2 -0
  11. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_prompts_v2.py +4 -1
  12. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_v2.py +5 -2
  13. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/configs/anthropic_config.py +2 -2
  14. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/configs/openai_config.py +2 -2
  15. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/planner_tools.py +47 -21
  16. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/tool_utils.py +3 -0
  17. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/tools.py +76 -34
  18. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/utils/sim.py +33 -12
  19. {vision_agent-0.2.230 → vision_agent-0.2.232}/LICENSE +0 -0
  20. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/.sim_tools/embs.npy +0 -0
  21. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/__init__.py +0 -0
  22. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/README.md +0 -0
  23. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/__init__.py +0 -0
  24. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/agent.py +0 -0
  25. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent.py +0 -0
  26. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder.py +0 -0
  27. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
  28. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner.py +0 -0
  29. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
  30. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_prompts.py +0 -0
  31. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/clients/__init__.py +0 -0
  32. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/clients/http.py +0 -0
  33. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/clients/landing_public_api.py +0 -0
  34. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/configs/__init__.py +0 -0
  35. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/configs/anthropic_openai_config.py +0 -0
  36. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/configs/config.py +0 -0
  37. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/fonts/__init__.py +0 -0
  38. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  39. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/lmm/__init__.py +0 -0
  40. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/lmm/lmm.py +0 -0
  41. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/lmm/types.py +0 -0
  42. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/__init__.py +4 -4
  43. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/meta_tools.py +0 -0
  44. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/prompts.py +0 -0
  45. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/tools_types.py +0 -0
  46. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/utils/__init__.py +0 -0
  47. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/utils/exceptions.py +0 -0
  48. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/utils/execute.py +0 -0
  49. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/utils/image_utils.py +0 -0
  50. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/utils/type_defs.py +0 -0
  51. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/utils/video.py +0 -0
  52. {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/utils/video_tracking.py +0 -0
{vision_agent-0.2.230 → vision_agent-0.2.232}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.230
+ Version: 0.2.232
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -71,12 +71,7 @@ export ANTHROPIC_API_KEY="your-api-key"
  export OPENAI_API_KEY="your-api-key"
  ```

- ---
- **NOTE**
- We found using both Anthropic Claude-3.5 and OpenAI o1 to be provide the best performance
- for VisionAgent. If you want to use a different LLM provider or only one, see
- 'Using Other LLM Providers' below.
- ---
+ > **_NOTE:_** We found using both Anthropic Claude-3.5 and OpenAI o1 to be provide the best performance for VisionAgent. If you want to use a different LLM provider or only one, see 'Using Other LLM Providers' below.

  ## Documentation

@@ -149,8 +144,5 @@ directory. For example to change to Anthropic simply just run:
  cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
  ```

- **NOTE**
- VisionAgent moves fast and we are constantly updating and changing the library. If you
- have any questions or need help, please reach out to us on our discord channel.
- ---
+ > **_NOTE:_** VisionAgent moves fast and we are constantly updating and changing the library. If you have any questions or need help, please reach out to us on our discord channel.

{vision_agent-0.2.230 → vision_agent-0.2.232}/README.md
@@ -26,12 +26,7 @@ export ANTHROPIC_API_KEY="your-api-key"
  export OPENAI_API_KEY="your-api-key"
  ```

- ---
- **NOTE**
- We found using both Anthropic Claude-3.5 and OpenAI o1 to be provide the best performance
- for VisionAgent. If you want to use a different LLM provider or only one, see
- 'Using Other LLM Providers' below.
- ---
+ > **_NOTE:_** We found using both Anthropic Claude-3.5 and OpenAI o1 to be provide the best performance for VisionAgent. If you want to use a different LLM provider or only one, see 'Using Other LLM Providers' below.

  ## Documentation

@@ -104,7 +99,4 @@ directory. For example to change to Anthropic simply just run:
  cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
  ```

- **NOTE**
- VisionAgent moves fast and we are constantly updating and changing the library. If you
- have any questions or need help, please reach out to us on our discord channel.
- ---
+ > **_NOTE:_** VisionAgent moves fast and we are constantly updating and changing the library. If you have any questions or need help, please reach out to us on our discord channel.
{vision_agent-0.2.230 → vision_agent-0.2.232}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

  [tool.poetry]
  name = "vision-agent"
- version = "0.2.230"
+ version = "0.2.232"
  description = "Toolset for Vision Agent"
  authors = ["Landing AI <dev@landing.ai>"]
  readme = "README.md"
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/.sim_tools/df.csv
@@ -514,7 +514,7 @@ desc,doc,name
  >>> vit_nsfw_classification(image)
  {""label"": ""normal"", ""scores"": 0.68},
  ",vit_nsfw_classification
- 'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: Optional[int] = 2) -> List[float]:
+ 'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: int = 2) -> List[float]:
  'video_temporal_localization' will run qwen2vl on each chunk_length_frames
  value selected for the video. It can detect multiple objects independently per
  chunk_length_frames given a text prompt such as a referring expression
@@ -527,7 +527,7 @@ desc,doc,name
  frames (List[np.ndarray]): The reference frames used for the question
  model (str): The model to use for the inference. Valid values are
  'qwen2vl', 'gpt4o'.
- chunk_length_frames (Optional[int]): length of each chunk in frames
+ chunk_length_frames (int): length of each chunk in frames

  Returns:
  List[float]: A list of floats with a value of 1.0 if the objects to be found
@@ -540,16 +540,18 @@ desc,doc,name
  ",video_temporal_localization
  "'flux_image_inpainting' performs image inpainting to fill the masked regions, given by mask, in the image, given image based on the text prompt and surrounding image context. It can be used to edit regions of an image according to the prompt given.","flux_image_inpainting(prompt: str, image: numpy.ndarray, mask: numpy.ndarray) -> numpy.ndarray:
  'flux_image_inpainting' performs image inpainting to fill the masked regions,
- given by mask, in the image, given image based on the text prompt and surrounding image context.
- It can be used to edit regions of an image according to the prompt given.
+ given by mask, in the image, given image based on the text prompt and surrounding
+ image context. It can be used to edit regions of an image according to the prompt
+ given.

  Parameters:
  prompt (str): A detailed text description guiding what should be generated
- in the masked area. More detailed and specific prompts typically yield better results.
- image (np.ndarray): The source image to be inpainted.
- The image will serve as the base context for the inpainting process.
- mask (np.ndarray): A binary mask image with 0's and 1's,
- where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.
+ in the masked area. More detailed and specific prompts typically yield
+ better results.
+ image (np.ndarray): The source image to be inpainted. The image will serve as
+ the base context for the inpainting process.
+ mask (np.ndarray): A binary mask image with 0's and 1's, where 1 indicates
+ areas to be inpainted and 0 indicates areas to be preserved.

  Returns:
  np.ndarray: The generated image(s) as a numpy array in RGB format with values
@@ -658,7 +660,7 @@ desc,doc,name
  -------
  >>> save_image(image)
  ",save_image
- 'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.,"save_video(frames: List[numpy.ndarray], output_video_path: Optional[str] = None, fps: float = 1) -> str:
+ 'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.,"save_video(frames: List[numpy.ndarray], output_video_path: Optional[str] = None, fps: float = 5) -> str:
  'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.

  Parameters:
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/agent_utils.py
@@ -148,8 +148,10 @@ def format_plan_v2(plan: PlanContext) -> str:
  plan_str += "Instructions:\n"
  for v in plan.instructions:
  plan_str += f" - {v}\n"
- plan_str += "Code:\n"
- plan_str += plan.code
+
+ if plan.code:
+ plan_str += "Code:\n"
+ plan_str += plan.code
  return plan_str


{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/types.py
@@ -33,6 +33,7 @@ class AgentMessage(BaseModel):
  Literal["interaction_response"],
  Literal["conversation"],
  Literal["planner"],
+ Literal["planner_update"],
  Literal["coder"],
  ]
  content: str
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder_prompts_v2.py
@@ -6,7 +6,7 @@ FEEDBACK = """


  CODE = """
- **Role**: You are an expoert software programmer.
+ **Role**: You are an expert software programmer.

  **Task**: You are given a plan by a planning agent that solves a vision problem posed by the user. You are also given code snippets that the planning agent used to solve the task. Your job is to organize the code so that it can be easily called by the user to solve the task.

{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder_v2.py
@@ -425,6 +425,8 @@ class VisionAgentCoderV2(AgentCoder):
  chat (List[AgentMessage]): The input to the agent. This should be a list of
  AgentMessage objects.
  plan_context (PlanContext): The plan context that was previously generated.
+ If plan_context.code is not provided, then the code will be generated
+ from the chat messages.
  code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.

  Returns:
@@ -455,12 +457,24 @@ class VisionAgentCoderV2(AgentCoder):
  int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
  tool_docs = retrieve_tools(plan_context.instructions, self.tool_recommender)

+ # If code is not provided from the plan_context then generate it, else use
+ # the provided code and start with testing
+ if not plan_context.code.strip():
+ code = write_code(
+ coder=self.coder,
+ chat=int_chat,
+ tool_docs=tool_docs,
+ plan=format_plan_v2(plan_context),
+ )
+ else:
+ code = plan_context.code
+
  code_context = test_code(
  tester=self.tester,
  debugger=self.debugger,
  chat=int_chat,
  plan=format_plan_v2(plan_context),
- code=plan_context.code,
+ code=code,
  tool_docs=tool_docs,
  code_interpreter=code_interpreter,
  media_list=media_list,
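The new branch above means `generate_code_from_plan` can now be called with a `PlanContext` whose `code` field is empty, in which case the coder writes fresh code from the plan before testing it. A minimal sketch of that call, assuming the import paths of the modules touched in this diff and default constructors; the chat content and plan text are illustrative only:

```python
# Sketch only: field names come from the hunks above; the agent construction,
# role value, and plan text are assumptions, not taken from this diff.
from vision_agent.agent.vision_agent_coder_v2 import VisionAgentCoderV2
from vision_agent.agent.types import AgentMessage, PlanContext

coder = VisionAgentCoderV2()  # assumes default models from the active config

chat = [AgentMessage(role="user", content="Count the cars in cars.jpg")]
plan = PlanContext(
    plan="Detect cars with an object detection tool and count them.",
    instructions=["Load cars.jpg", "Run object detection", "Return the count"],
    code="",  # empty -> write_code() runs; non-empty -> goes straight to test_code()
)

code_context = coder.generate_code_from_plan(chat, plan)
print(code_context)
```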
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner_prompts_v2.py
@@ -458,6 +458,8 @@ You are given a task: "{task}" from the user. You must extract the type of categ
  - "DocQA" - answering questions about a document or extracting information from a document.
  - "video object tracking" - tracking objects in a video.
  - "depth and pose estimation" - estimating the depth or pose of objects in an image.
+ - "temporal localization" - localizing the time period an event occurs in a video.
+ - "inpainting" - filling in masked parts of an image.

  Return the category or categories (comma separated) inside tags <category># your categories here</category>. If you are unsure about a task, it is better to include more categories than less.
  """
@@ -651,22 +653,24 @@ PICK_TOOL = """
  """

  FINALIZE_PLAN = """
- **Role**: You are an expert AI model that can understand the user request and construct plans to accomplish it.
+ **Task**: You are given a chain of thoughts, python executions and observations from a planning agent as it tries to construct a plan to solve a user request. Your task is to summarize the plan it found so that another programming agent to write a program to accomplish the user request.

- **Task**: You are given a chain of thoughts, python executions and observations from a planning agent as it tries to construct a plan to solve a user request. Your task is to summarize the plan it found so that another programming agnet to write a program to accomplish the user request.
+ **Documentation**: You can use these tools to help you visualize or save the output:
+ {tool_desc}

  **Planning**: Here is chain of thoughts, executions and observations from the planning agent:
  {planning}

  **Instructions**:
  1. Summarize the plan that the planning agent found.
- 2. Write a single function that solves the problem based on what the planner found.
- 3. Specifically call out the tools used and the order in which they were used. Only include tools obtained from calling `get_tool_for_task`.
+ 2. Write a single function that solves the problem based on what the planner found and only returns the final solution.
+ 3. Only use tools obtained from calling `get_tool_for_task`.
  4. Do not include {excluded_tools} tools in your instructions.
- 5. Add final instructions for visualizing the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and saving it to a file with `save_image` or `save_video`.
- 6. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
- 7. Include the expected answer in your 'plan' so that the programming agent can properly test if it has the correct answer.
- 8. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
+ 5. Ensure the function is well documented and easy to understand.
+ 6. Ensure you visualize the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and save it to a file with `save_image` or `save_video`.
+ 7. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
+ 8. Include the expected answer in your 'plan' so that the programming agent can properly test if it has the correct answer.
+ 9. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:

  <json>
  {{
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner_v2.py
@@ -326,6 +326,7 @@ def create_finalize_plan(
  return [], PlanContext(plan="", instructions=[], code="")

  prompt = FINALIZE_PLAN.format(
+ tool_desc=UTIL_DOCSTRING,
  planning=get_planning(chat),
  excluded_tools=str([t.__name__ for t in pt.PLANNER_TOOLS]),
  )
@@ -513,6 +514,7 @@ class VisionAgentPlannerV2(AgentPlanner):
  code = extract_tag(response, "execute_python")
  finalize_plan = extract_tag(response, "finalize_plan")
  finished = finalize_plan is not None
+ self.update_callback({"role": "planner_update", "content": response})

  if self.verbose:
  _CONSOLE.print(
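Because each planner step now emits a `planner_update` message through `update_callback`, a consumer can stream the planner's raw responses. A hedged sketch of such a callback; how the callback is registered is an assumption implied by the `self.update_callback(...)` call above:

```python
# Sketch only: the planner constructor argument is assumed from the
# self.update_callback(...) call added in this diff.
from typing import Any, Dict

def on_update(message: Dict[str, Any]) -> None:
    # "planner_update" carries the planner's full response for each step.
    if message.get("role") == "planner_update":
        print("[planner]", message["content"][:200])

# from vision_agent.agent import VisionAgentPlannerV2
# planner = VisionAgentPlannerV2(update_callback=on_update)
```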
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_prompts_v2.py
@@ -42,6 +42,8 @@ AGENT: <response>I am VisionAgent, an agent built by LandingAI, to help users wr
  - Understanding documents
  - Pose estimation
  - Visual question answering for both images and videos
+ - Action recognition in videos
+ - Image inpainting

  How can I help you?</response>
  --- END EXAMPLE2 ---
@@ -54,7 +56,8 @@ Here is the current conversation so far:

  **Instructions**:
  1. Only respond with a single <response> tag and a single <action> tag.
- 2. Respond in the following format, the <action> tag is optional and can be excluded if you do not want to take any action:
+ 2. You can only take one action at a time in response to the user's message. Do not offer to fix code on the user's behalf, only if they have directly asked you to.
+ 3. Respond in the following format, the <action> tag is optional and can be excluded if you do not want to take any action:

  <response>Your response to the user's message</response>
  <action>The action you want to take from **Actions**</action>
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_v2.py
@@ -112,14 +112,17 @@ def maybe_run_action(
  )
  ]
  elif action == "edit_code":
+ # We don't want to pass code in plan_context.code so the coder will generate
+ # new code from plan_context.plan
  plan_context = PlanContext(
- plan="Edit the latest code observed in the fewest steps possible according to the user's feedback.",
+ plan="Edit the latest code observed in the fewest steps possible according to the user's feedback."
+ + ("<code>\n" + final_code + "\n</code>" if final_code is not None else ""),
  instructions=[
  chat_i.content
  for chat_i in extracted_chat
  if chat_i.role == "user" and "<final_code>" not in chat_i.content
  ],
- code=final_code if final_code is not None else "",
+ code="",
  )
  context = coder.generate_code_from_plan(
  extracted_chat, plan_context, code_interpreter=code_interpreter
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/configs/anthropic_config.py
@@ -81,7 +81,7 @@ class Config(BaseModel):
  tool_tester_kwargs: dict = Field(
  default_factory=lambda: {
  "model_name": "claude-3-5-sonnet-20241022",
- "temperature": 1.0,
+ "temperature": 0.0,
  "image_size": 768,
  }
  )
@@ -111,7 +111,7 @@ class Config(BaseModel):
  vqa_kwargs: dict = Field(
  default_factory=lambda: {
  "model_name": "claude-3-5-sonnet-20241022",
- "temperature": 1.0,
+ "temperature": 0.0,
  "image_size": 768,
  }
  )
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/configs/openai_config.py
@@ -98,7 +98,7 @@ class Config(BaseModel):
  tool_chooser_kwargs: dict = Field(
  default_factory=lambda: {
  "model_name": "gpt-4o-2024-08-06",
- "temperature": 0.0,
+ "temperature": 1.0,
  "image_size": 768,
  "image_detail": "low",
  }
@@ -109,7 +109,7 @@ class Config(BaseModel):
  suggester_kwargs: dict = Field(
  default_factory=lambda: {
  "model_name": "gpt-4o-2024-08-06",
- "temperature": 0.0,
+ "temperature": 1.0,
  "image_size": 768,
  "image_detail": "low",
  }
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/planner_tools.py
@@ -2,7 +2,7 @@ import inspect
  import logging
  import tempfile
  from concurrent.futures import ThreadPoolExecutor, as_completed
- from typing import Any, Callable, Dict, List, Optional, Tuple, cast
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast

  import libcst as cst
  import numpy as np
@@ -10,12 +10,7 @@ from IPython.display import display
  from PIL import Image

  import vision_agent.tools as T
- from vision_agent.agent.agent_utils import (
- DefaultImports,
- extract_code,
- extract_json,
- extract_tag,
- )
+ from vision_agent.agent.agent_utils import DefaultImports, extract_json, extract_tag
  from vision_agent.agent.vision_agent_planner_prompts_v2 import (
  CATEGORIZE_TOOL_REQUEST,
  FINALIZE_PLAN,
@@ -36,6 +31,9 @@ from vision_agent.utils.image_utils import convert_to_b64
  from vision_agent.utils.sim import get_tool_recommender

  TOOL_FUNCTIONS = {tool.__name__: tool for tool in T.TOOLS}
+ LOAD_TOOLS_DOCSTRING = T.get_tool_documentation(
+ [T.load_image, T.extract_frames_and_timestamps]
+ )

  CONFIG = Config()
  _LOGGER = logging.getLogger(__name__)
@@ -179,6 +177,7 @@ def run_tool_testing(
  cleaned_tool_docs.append(tool_doc)
  tool_docs = cleaned_tool_docs
  tool_docs_str = "\n".join([e["doc"] for e in tool_docs])
+ tool_docs_str += "\n" + LOAD_TOOLS_DOCSTRING

  prompt = TEST_TOOLS.format(
  tool_docs=tool_docs_str,
@@ -217,8 +216,15 @@ def run_tool_testing(
  examples=EXAMPLES,
  media=str(image_paths),
  )
- code = extract_code(lmm.generate(prompt, media=image_paths)) # type: ignore
- code = process_code(code)
+ response = cast(str, lmm.generate(prompt, media=image_paths))
+ code = extract_tag(response, "code")
+ if code is None:
+ code = response
+
+ try:
+ code = process_code(code)
+ except Exception as e:
+ _LOGGER.error(f"Error processing code: {e}")
  tool_output = code_interpreter.exec_isolation(
  DefaultImports.prepend_imports(code)
  )
@@ -229,7 +235,9 @@ def run_tool_testing(


  def get_tool_for_task(
- task: str, images: List[np.ndarray], exclude_tools: Optional[List[str]] = None
+ task: str,
+ images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]],
+ exclude_tools: Optional[List[str]] = None,
  ) -> None:
  """Given a task and one or more images this function will find a tool to accomplish
  the jobs. It prints the tool documentation and thoughts on why it chose the tool.
@@ -242,6 +250,8 @@ def get_tool_for_task(
  - VQA
  - Depth and pose estimation
  - Video object tracking
+ - Video temporal localization (action recognition)
+ - Image inpainting

  Only ask for one type of task at a time, for example a task needing to identify
  text is one OCR task while needing to identify non-text objects is an OD task. Wait
@@ -250,7 +260,8 @@ def get_tool_for_task(

  Parameters:
  task: str: The task to accomplish.
- images: List[np.ndarray]: The images to use for the task.
+ images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]]: The images to use
+ for the task. If a key is provided, it is used as the file name.
  exclude_tools: Optional[List[str]]: A list of tool names to exclude from the
  recommendations. This is helpful if you are calling get_tool_for_task twice
  and do not want the same tool recommended.
@@ -260,20 +271,29 @@ def get_tool_for_task(

  Examples
  --------
- >>> get_tool_for_task("Give me an OCR model that can find 'hot chocolate' in the image", [image])
+ >>> get_tool_for_task(
+ >>> "Give me an OCR model that can find 'hot chocolate' in the image",
+ >>> {"image": [image]})
+ >>> get_tool_for_taks(
+ >>> "I need a tool that can paint a background for this image and maks",
+ >>> {"image": [image], "mask": [mask]})
  """
  tool_tester = CONFIG.create_tool_tester()
  tool_chooser = CONFIG.create_tool_chooser()

+ if isinstance(images, list):
+ images = {"image": images}
+
  with (
  tempfile.TemporaryDirectory() as tmpdirname,
  CodeInterpreterFactory.new_instance() as code_interpreter,
  ):
  image_paths = []
- for i, image in enumerate(images[:3]):
- image_path = f"{tmpdirname}/image_{i}.png"
- Image.fromarray(image).save(image_path)
- image_paths.append(image_path)
+ for k in images.keys():
+ for i, image in enumerate(images[k]):
+ image_path = f"{tmpdirname}/{k}_{i}.png"
+ Image.fromarray(image).save(image_path)
+ image_paths.append(image_path)

  code, tool_docs_str, tool_output = run_tool_testing(
  task, image_paths, tool_tester, exclude_tools, code_interpreter
@@ -294,20 +314,26 @@ def get_tool_documentation(tool_name: str) -> str:


  def get_tool_for_task_human_reviewer(
- task: str, images: List[np.ndarray], exclude_tools: Optional[List[str]] = None
+ task: str,
+ images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]],
+ exclude_tools: Optional[List[str]] = None,
  ) -> None:
  # NOTE: this will have the same documentation as get_tool_for_task
  tool_tester = CONFIG.create_tool_tester()

+ if isinstance(images, list):
+ images = {"image": images}
+
  with (
  tempfile.TemporaryDirectory() as tmpdirname,
  CodeInterpreterFactory.new_instance() as code_interpreter,
  ):
  image_paths = []
- for i, image in enumerate(images[:3]):
- image_path = f"{tmpdirname}/image_{i}.png"
- Image.fromarray(image).save(image_path)
- image_paths.append(image_path)
+ for k in images.keys():
+ for i, image in enumerate(images[k]):
+ image_path = f"{tmpdirname}/{k}_{i}.png"
+ Image.fromarray(image).save(image_path)
+ image_paths.append(image_path)

  tools = [
  t.__name__
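Per the updated docstring above, `get_tool_for_task` now also accepts a dict mapping a name to a list of images, with the key used as the saved file-name prefix. A usage sketch with placeholder files; running it requires the package's configured models and API access:

```python
# Sketch only: the image files are placeholders.
import numpy as np
from PIL import Image
from vision_agent.tools.planner_tools import get_tool_for_task

image = np.array(Image.open("scene.png"))
mask = np.array(Image.open("scene_mask.png"))

# The old list form still works; it is wrapped as {"image": [...]} internally.
get_tool_for_task("Find all text in the image", [image])

# The new dict form lets the tool tester see named inputs such as a mask.
get_tool_for_task(
    "I need a tool that can paint a new background for this image and mask",
    {"image": [image], "mask": [mask]},
)
```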
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/tool_utils.py
@@ -318,6 +318,9 @@ def single_nms(
  def nms(
  all_preds: List[List[Dict[str, Any]]], iou_threshold: float
  ) -> List[List[Dict[str, Any]]]:
+ if not isinstance(all_preds[0], List):
+ all_preds = [all_preds]
+
  return_preds = []
  for frame_preds in all_preds:
  frame_preds = single_nms(frame_preds, iou_threshold)
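The added guard means `nms` now tolerates a flat list of detections for a single frame as well as the usual list of per-frame lists. A standalone illustration of the two accepted shapes; the detection dict layout mirrors other tools in this diff and is not quoted from `tool_utils`:

```python
# Standalone illustration of the input normalization added above.
from typing import Any, Dict, List

single_frame: List[Dict[str, Any]] = [
    {"label": "car", "score": 0.91, "bbox": [0.10, 0.10, 0.40, 0.50]},
    {"label": "car", "score": 0.88, "bbox": [0.11, 0.09, 0.41, 0.52]},
]
per_frame: List[List[Dict[str, Any]]] = [single_frame, single_frame]

def normalize(all_preds: Any) -> List[List[Dict[str, Any]]]:
    # Same check as the diff: wrap a flat single-frame list into a list of frames.
    if not isinstance(all_preds[0], list):
        all_preds = [all_preds]
    return all_preds

assert len(normalize(single_frame)) == 1  # one frame after wrapping
assert len(normalize(per_frame)) == 2     # already per-frame, left as-is
```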
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/tools.py
@@ -222,7 +222,7 @@ def sam2(
  ret = _sam2(image, detections, image_size)
  _display_tool_trace(
  sam2.__name__,
- {},
+ {"detections": detections},
  ret["display_data"],
  ret["files"],
  )
@@ -314,18 +314,29 @@ def od_sam2_video_tracking(

  # Process each segment and collect detections
  detections_per_segment: List[Any] = []
- for segment_index, segment in enumerate(segments):
- segment_detections = process_segment(
- segment_frames=segment,
- od_model=od_model,
- prompt=prompt,
- fine_tune_id=fine_tune_id,
- chunk_length=chunk_length,
- image_size=image_size,
- segment_index=segment_index,
- object_detection_tool=_apply_object_detection,
- )
- detections_per_segment.append(segment_detections)
+ with ThreadPoolExecutor() as executor:
+ futures = {
+ executor.submit(
+ process_segment,
+ segment_frames=segment,
+ od_model=od_model,
+ prompt=prompt,
+ fine_tune_id=fine_tune_id,
+ chunk_length=chunk_length,
+ image_size=image_size,
+ segment_index=segment_index,
+ object_detection_tool=_apply_object_detection,
+ ): segment_index
+ for segment_index, segment in enumerate(segments)
+ }
+
+ for future in as_completed(futures):
+ segment_index = futures[future]
+ detections_per_segment.append((segment_index, future.result()))
+
+ detections_per_segment = [
+ x[1] for x in sorted(detections_per_segment, key=lambda x: x[0])
+ ]

  merged_detections = merge_segments(detections_per_segment)
  post_processed = post_process(merged_detections, image_size)
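The hunk above swaps the sequential per-segment loop for a thread pool while keeping segment order stable. A self-contained sketch of the same submit / `as_completed` / re-sort pattern with a stand-in work function:

```python
# Generic sketch of the concurrency pattern used above; process_segment_stub
# stands in for the real process_segment call.
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Tuple

def process_segment_stub(segment: List[int]) -> int:
    return sum(segment)  # stand-in for per-segment detection

segments = [[1, 2], [3, 4], [5, 6]]
results: List[Tuple[int, int]] = []

with ThreadPoolExecutor() as executor:
    futures = {
        executor.submit(process_segment_stub, segment): segment_index
        for segment_index, segment in enumerate(segments)
    }
    for future in as_completed(futures):
        results.append((futures[future], future.result()))

# as_completed yields in completion order, so re-sort by segment index
# before merging, exactly as the tracking code does.
ordered = [value for _, value in sorted(results, key=lambda x: x[0])]
assert ordered == [3, 7, 11]
```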
@@ -390,7 +401,7 @@ def _owlv2_object_detection(
  {
  "label": bbox["label"],
  "bbox": normalize_bbox(bbox["bounding_box"], image_size),
- "score": bbox["score"],
+ "score": round(bbox["score"], 2),
  }
  for bbox in bboxes
  ]
@@ -398,7 +409,7 @@ def _owlv2_object_detection(
  {
  "label": bbox["label"],
  "bbox": bbox["bounding_box"],
- "score": bbox["score"],
+ "score": round(bbox["score"], 2),
  }
  for bbox in bboxes
  ]
@@ -582,7 +593,7 @@ def owlv2_sam2_video_tracking(
  )
  _display_tool_trace(
  owlv2_sam2_video_tracking.__name__,
- {},
+ {"prompt": prompt, "chunk_length": chunk_length},
  ret["display_data"],
  ret["files"],
  )
@@ -1681,7 +1692,7 @@ def video_temporal_localization(
  prompt: str,
  frames: List[np.ndarray],
  model: str = "qwen2vl",
- chunk_length_frames: Optional[int] = 2,
+ chunk_length_frames: int = 2,
  ) -> List[float]:
  """'video_temporal_localization' will run qwen2vl on each chunk_length_frames
  value selected for the video. It can detect multiple objects independently per
@@ -1695,7 +1706,7 @@ def video_temporal_localization(
  frames (List[np.ndarray]): The reference frames used for the question
  model (str): The model to use for the inference. Valid values are
  'qwen2vl', 'gpt4o'.
- chunk_length_frames (Optional[int]): length of each chunk in frames
+ chunk_length_frames (int): length of each chunk in frames

  Returns:
  List[float]: A list of floats with a value of 1.0 if the objects to be found
@@ -1714,19 +1725,48 @@ def video_temporal_localization(
  "model": model,
  "function_name": "video_temporal_localization",
  }
- if chunk_length_frames is not None:
- payload["chunk_length_frames"] = chunk_length_frames
+ payload["chunk_length_frames"] = chunk_length_frames
+
+ segments = split_frames_into_segments(frames, segment_size=50, overlap=0)
+
+ def _apply_temporal_localization(
+ segment: List[np.ndarray],
+ ) -> List[float]:
+ segment_buffer_bytes = [("video", frames_to_bytes(segment))]
+ data = send_inference_request(
+ payload, "video-temporal-localization", files=segment_buffer_bytes, v2=True
+ )
+ chunked_data = [cast(float, value) for value in data]
+
+ full_data = []
+ for value in chunked_data:
+ full_data.extend([value] * chunk_length_frames)
+
+ return full_data[: len(segment)]
+
+ with ThreadPoolExecutor() as executor:
+ futures = {
+ executor.submit(_apply_temporal_localization, segment): segment_index
+ for segment_index, segment in enumerate(segments)
+ }
+
+ localization_per_segment = []
+ for future in as_completed(futures):
+ segment_index = futures[future]
+ localization_per_segment.append((segment_index, future.result()))
+
+ localization_per_segment = [
+ x[1] for x in sorted(localization_per_segment, key=lambda x: x[0])  # type: ignore
+ ]
+ localizations = cast(List[float], [e for o in localization_per_segment for e in o])

- data = send_inference_request(
- payload, "video-temporal-localization", files=files, v2=True
- )
  _display_tool_trace(
  video_temporal_localization.__name__,
  payload,
- data,
+ localization_per_segment,
  files,
  )
- return [cast(float, value) for value in data]
+ return localizations


  def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
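The rewritten `video_temporal_localization` above splits the video into 50-frame segments, localizes each segment in parallel, and expands every per-chunk score back to per-frame values before flattening. A standalone sketch of that expansion step with made-up scores:

```python
# Standalone illustration of the chunk-to-frame expansion above; the scores
# are invented, not real model output.
from typing import List

def expand_chunk_scores(
    chunk_scores: List[float], chunk_length_frames: int, num_frames: int
) -> List[float]:
    # Repeat each chunk score for every frame in its chunk, then trim to the
    # segment length, mirroring full_data[: len(segment)] in the diff.
    full: List[float] = []
    for value in chunk_scores:
        full.extend([value] * chunk_length_frames)
    return full[:num_frames]

# Two chunks of two frames covering a three-frame segment:
assert expand_chunk_scores([1.0, 0.0], chunk_length_frames=2, num_frames=3) == [1.0, 1.0, 0.0]
```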
@@ -2012,16 +2052,18 @@ def flux_image_inpainting(
  mask: np.ndarray,
  ) -> np.ndarray:
  """'flux_image_inpainting' performs image inpainting to fill the masked regions,
- given by mask, in the image, given image based on the text prompt and surrounding image context.
- It can be used to edit regions of an image according to the prompt given.
+ given by mask, in the image, given image based on the text prompt and surrounding
+ image context. It can be used to edit regions of an image according to the prompt
+ given.

  Parameters:
  prompt (str): A detailed text description guiding what should be generated
- in the masked area. More detailed and specific prompts typically yield better results.
- image (np.ndarray): The source image to be inpainted.
- The image will serve as the base context for the inpainting process.
- mask (np.ndarray): A binary mask image with 0's and 1's,
- where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.
+ in the masked area. More detailed and specific prompts typically yield
+ better results.
+ image (np.ndarray): The source image to be inpainted. The image will serve as
+ the base context for the inpainting process.
+ mask (np.ndarray): A binary mask image with 0's and 1's, where 1 indicates
+ areas to be inpainted and 0 indicates areas to be preserved.

  Returns:
  np.ndarray: The generated image(s) as a numpy array in RGB format with values
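A usage sketch of `flux_image_inpainting` as described by the docstring above; the arrays are synthetic placeholders and the call goes to the hosted inference endpoint, so it assumes a configured API key:

```python
# Sketch only: synthetic image and mask, requires network access and credentials.
import numpy as np
from vision_agent.tools import flux_image_inpainting

image = np.zeros((512, 512, 3), dtype=np.uint8)
mask = np.zeros((512, 512), dtype=np.uint8)
mask[128:384, 128:384] = 1  # 1 = region to inpaint, 0 = region to preserve

result = flux_image_inpainting("a bouquet of flowers on a wooden table", image, mask)
print(result.shape)  # generated RGB image as a numpy array
```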
@@ -2150,7 +2192,7 @@ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any
  return response


- # agentic od tools
+ # Agentic OD Tools


  def _agentic_object_detection(
@@ -2646,7 +2688,7 @@ def save_image(image: np.ndarray, file_path: str) -> None:


  def save_video(
- frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 1
+ frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 5
  ) -> str:
  """'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.

{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/utils/sim.py
@@ -98,10 +98,12 @@ class Sim:
  raise ValueError("key is required if no column 'embs' is present.")

  if sim_key is not None:
- self.df["embs"] = self.df[sim_key].apply(
- lambda x: get_embedding(
- self.emb_call,
- x,
+ self.df = self.df.assign(
+ embs=self.df[sim_key].apply(
+ lambda x: get_embedding(
+ self.emb_call,
+ x,
+ )
  )
  )

@@ -141,7 +143,9 @@ class Sim:

  df_load = pd.read_csv(load_dir / "df.csv")
  if platform.system() == "Windows":
- df_load["doc"] = df_load["doc"].apply(lambda x: x.replace("\r", ""))
+ df_load = df_load.assign(
+ doc=df_load.doc.apply(lambda x: x.replace("\r", ""))
+ )
  return df.equals(df_load) # type: ignore

  @lru_cache(maxsize=256)
@@ -166,7 +170,9 @@ class Sim:
  self.emb_call,
  query,
  )
- self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
+ self.df = self.df.assign(
+ sim=self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
+ )
  res = self.df.sort_values("sim", ascending=False).head(k)
  if thresh is not None:
  res = res[res.sim > thresh]
@@ -214,8 +220,13 @@ class AzureSim(Sim):
  raise ValueError("key is required if no column 'embs' is present.")

  if sim_key is not None:
- self.df["embs"] = self.df[sim_key].apply(
- lambda x: get_embedding(self.emb_call, x)
+ self.df = self.df.assign(
+ embs=self.df[sim_key].apply(
+ lambda x: get_embedding(
+ self.emb_call,
+ x,
+ )
+ )
  )


@@ -245,8 +256,13 @@ class OllamaSim(Sim):
  raise ValueError("key is required if no column 'embs' is present.")

  if sim_key is not None:
- self.df["embs"] = self.df[sim_key].apply(
- lambda x: get_embedding(emb_call, x)
+ self.df = self.df.assign(
+ embs=self.df[sim_key].apply(
+ lambda x: get_embedding(
+ self.emb_call,
+ x,
+ )
+ )
  )


@@ -267,8 +283,13 @@ class StellaSim(Sim):
  raise ValueError("key is required if no column 'embs' is present.")

  if sim_key is not None:
- self.df["embs"] = self.df[sim_key].apply(
- lambda x: get_embedding(emb_call, x)
+ self.df = self.df.assign(
+ embs=self.df[sim_key].apply(
+ lambda x: get_embedding(
+ self.emb_call,
+ x,
+ )
+ )
  )

  @staticmethod
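The `Sim` changes above consistently replace in-place column assignment with `pandas.DataFrame.assign`, which returns a new frame rather than mutating the existing one (presumably to avoid chained-assignment warnings and keep the update in a single rebind). A minimal illustration of the pattern with a dummy embedding function:

```python
# Minimal pandas illustration of the assign() pattern adopted above;
# fake_embedding is a stand-in for get_embedding.
from typing import List
import pandas as pd

df = pd.DataFrame({"doc": ["tool a", "tool b"]})

def fake_embedding(text: str) -> List[float]:
    return [float(len(text))]

# Old style mutated the frame in place:
#   df["embs"] = df["doc"].apply(fake_embedding)
# assign() returns a new DataFrame, so the result is rebound explicitly:
df = df.assign(embs=df["doc"].apply(fake_embedding))
print(df)
```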
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/__init__.py
@@ -23,6 +23,9 @@ from .tools import (
  TOOLS_INFO,
  UTIL_TOOLS,
  UTILITIES_DOCSTRING,
+ agentic_object_detection,
+ agentic_sam2_instance_segmentation,
+ agentic_sam2_video_tracking,
  claude35_text_extraction,
  closest_box_distance,
  closest_mask_distance,
@@ -30,6 +33,7 @@ from .tools import (
  countgd_sam2_instance_segmentation,
  countgd_sam2_video_tracking,
  countgd_visual_prompt_object_detection,
+ custom_object_detection,
  depth_anything_v2,
  detr_segmentation,
  document_extraction,
@@ -63,10 +67,6 @@ from .tools import (
  video_temporal_localization,
  vit_image_classification,
  vit_nsfw_classification,
- custom_object_detection,
- agentic_object_detection,
- agentic_sam2_instance_segmentation,
- agentic_sam2_video_tracking,
  )

  __new_tools__ = [