vision-agent 0.2.230__tar.gz → 0.2.232__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vision_agent-0.2.230 → vision_agent-0.2.232}/PKG-INFO +3 -11
- {vision_agent-0.2.230 → vision_agent-0.2.232}/README.md +2 -10
- {vision_agent-0.2.230 → vision_agent-0.2.232}/pyproject.toml +1 -1
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/.sim_tools/df.csv +12 -10
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/agent_utils.py +4 -2
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/types.py +1 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder_prompts_v2.py +1 -1
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder_v2.py +15 -1
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner_prompts_v2.py +12 -8
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner_v2.py +2 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_prompts_v2.py +4 -1
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_v2.py +5 -2
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/configs/anthropic_config.py +2 -2
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/configs/openai_config.py +2 -2
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/planner_tools.py +47 -21
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/tool_utils.py +3 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/tools.py +76 -34
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/utils/sim.py +33 -12
- {vision_agent-0.2.230 → vision_agent-0.2.232}/LICENSE +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/.sim_tools/embs.npy +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/README.md +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner_prompts.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/configs/__init__.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/configs/anthropic_openai_config.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/configs/config.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/__init__.py +4 -4
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/utils/video.py +0 -0
- {vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/utils/video_tracking.py +0 -0
{vision_agent-0.2.230 → vision_agent-0.2.232}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.230
+Version: 0.2.232
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -71,12 +71,7 @@ export ANTHROPIC_API_KEY="your-api-key"
 export OPENAI_API_KEY="your-api-key"
 ```
 
-
-**NOTE**
-We found using both Anthropic Claude-3.5 and OpenAI o1 to be provide the best performance
-for VisionAgent. If you want to use a different LLM provider or only one, see
-'Using Other LLM Providers' below.
----
+> **_NOTE:_** We found using both Anthropic Claude-3.5 and OpenAI o1 to be provide the best performance for VisionAgent. If you want to use a different LLM provider or only one, see 'Using Other LLM Providers' below.
 
 ## Documentation
 
@@ -149,8 +144,5 @@ directory. For example to change to Anthropic simply just run:
 cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
 ```
 
-**NOTE**
-VisionAgent moves fast and we are constantly updating and changing the library. If you
-have any questions or need help, please reach out to us on our discord channel.
----
+> **_NOTE:_** VisionAgent moves fast and we are constantly updating and changing the library. If you have any questions or need help, please reach out to us on our discord channel.
 
{vision_agent-0.2.230 → vision_agent-0.2.232}/README.md
@@ -26,12 +26,7 @@ export ANTHROPIC_API_KEY="your-api-key"
 export OPENAI_API_KEY="your-api-key"
 ```
 
-
-**NOTE**
-We found using both Anthropic Claude-3.5 and OpenAI o1 to be provide the best performance
-for VisionAgent. If you want to use a different LLM provider or only one, see
-'Using Other LLM Providers' below.
----
+> **_NOTE:_** We found using both Anthropic Claude-3.5 and OpenAI o1 to be provide the best performance for VisionAgent. If you want to use a different LLM provider or only one, see 'Using Other LLM Providers' below.
 
 ## Documentation
 
@@ -104,7 +99,4 @@ directory. For example to change to Anthropic simply just run:
 cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
 ```
 
-**NOTE**
-VisionAgent moves fast and we are constantly updating and changing the library. If you
-have any questions or need help, please reach out to us on our discord channel.
----
+> **_NOTE:_** VisionAgent moves fast and we are constantly updating and changing the library. If you have any questions or need help, please reach out to us on our discord channel.
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/.sim_tools/df.csv
@@ -514,7 +514,7 @@ desc,doc,name
 >>> vit_nsfw_classification(image)
 {""label"": ""normal"", ""scores"": 0.68},
 ",vit_nsfw_classification
-'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames:
+'video_temporal_localization' will run qwen2vl on each chunk_length_frames value selected for the video. It can detect multiple objects independently per chunk_length_frames given a text prompt such as a referring expression but does not track objects across frames. It returns a list of floats with a value of 1.0 if the objects are found in a given chunk_length_frames of the video.,"video_temporal_localization(prompt: str, frames: List[numpy.ndarray], model: str = 'qwen2vl', chunk_length_frames: int = 2) -> List[float]:
 'video_temporal_localization' will run qwen2vl on each chunk_length_frames
 value selected for the video. It can detect multiple objects independently per
 chunk_length_frames given a text prompt such as a referring expression
@@ -527,7 +527,7 @@ desc,doc,name
 frames (List[np.ndarray]): The reference frames used for the question
 model (str): The model to use for the inference. Valid values are
 'qwen2vl', 'gpt4o'.
-chunk_length_frames (
+chunk_length_frames (int): length of each chunk in frames
 
 Returns:
 List[float]: A list of floats with a value of 1.0 if the objects to be found
@@ -540,16 +540,18 @@ desc,doc,name
 ",video_temporal_localization
 "'flux_image_inpainting' performs image inpainting to fill the masked regions, given by mask, in the image, given image based on the text prompt and surrounding image context. It can be used to edit regions of an image according to the prompt given.","flux_image_inpainting(prompt: str, image: numpy.ndarray, mask: numpy.ndarray) -> numpy.ndarray:
 'flux_image_inpainting' performs image inpainting to fill the masked regions,
-given by mask, in the image, given image based on the text prompt and surrounding
-It can be used to edit regions of an image according to the prompt
+given by mask, in the image, given image based on the text prompt and surrounding
+image context. It can be used to edit regions of an image according to the prompt
+given.
 
 Parameters:
 prompt (str): A detailed text description guiding what should be generated
-in the masked area. More detailed and specific prompts typically yield
-
-
-
-
+in the masked area. More detailed and specific prompts typically yield
+better results.
+image (np.ndarray): The source image to be inpainted. The image will serve as
+the base context for the inpainting process.
+mask (np.ndarray): A binary mask image with 0's and 1's, where 1 indicates
+areas to be inpainted and 0 indicates areas to be preserved.
 
 Returns:
 np.ndarray: The generated image(s) as a numpy array in RGB format with values
@@ -658,7 +660,7 @@ desc,doc,name
 -------
 >>> save_image(image)
 ",save_image
-'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.,"save_video(frames: List[numpy.ndarray], output_video_path: Optional[str] = None, fps: float =
+'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.,"save_video(frames: List[numpy.ndarray], output_video_path: Optional[str] = None, fps: float = 5) -> str:
 'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.
 
 Parameters:
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/agent_utils.py
@@ -148,8 +148,10 @@ def format_plan_v2(plan: PlanContext) -> str:
     plan_str += "Instructions:\n"
     for v in plan.instructions:
         plan_str += f" - {v}\n"
-
-
+
+    if plan.code:
+        plan_str += "Code:\n"
+        plan_str += plan.code
     return plan_str
 
 
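For orientation (not part of the diff), a hedged sketch of the effect of this change: when a `PlanContext` carries non-empty `code`, the formatted plan now ends with a `Code:` section. The import locations for `format_plan_v2` and `PlanContext` are assumptions inferred from the file paths above.

```python
# Hedged sketch: import paths inferred from this diff, not confirmed.
from vision_agent.agent.agent_utils import format_plan_v2
from vision_agent.agent.types import PlanContext

ctx = PlanContext(
    plan="Count the dogs in the image.",
    instructions=["Run an object detector for 'dog'", "Return the number of boxes"],
    # Non-empty code is now appended to the formatted plan under a "Code:" header.
    code="dets = countgd_object_detection('dog', image)\nprint(len(dets))",
)
print(format_plan_v2(ctx))
```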
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder_prompts_v2.py
RENAMED
@@ -6,7 +6,7 @@ FEEDBACK = """
 
 
 CODE = """
-**Role**: You are an
+**Role**: You are an expert software programmer.
 
 **Task**: You are given a plan by a planning agent that solves a vision problem posed by the user. You are also given code snippets that the planning agent used to solve the task. Your job is to organize the code so that it can be easily called by the user to solve the task.
 
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_coder_v2.py
@@ -425,6 +425,8 @@ class VisionAgentCoderV2(AgentCoder):
             chat (List[AgentMessage]): The input to the agent. This should be a list of
                 AgentMessage objects.
             plan_context (PlanContext): The plan context that was previously generated.
+                If plan_context.code is not provided, then the code will be generated
+                from the chat messages.
             code_interpreter (Optional[CodeInterpreter]): The code interpreter to use.
 
         Returns:
@@ -455,12 +457,24 @@ class VisionAgentCoderV2(AgentCoder):
         int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
         tool_docs = retrieve_tools(plan_context.instructions, self.tool_recommender)
 
+        # If code is not provided from the plan_context then generate it, else use
+        # the provided code and start with testing
+        if not plan_context.code.strip():
+            code = write_code(
+                coder=self.coder,
+                chat=int_chat,
+                tool_docs=tool_docs,
+                plan=format_plan_v2(plan_context),
+            )
+        else:
+            code = plan_context.code
+
         code_context = test_code(
             tester=self.tester,
             debugger=self.debugger,
             chat=int_chat,
             plan=format_plan_v2(plan_context),
-            code=
+            code=code,
             tool_docs=tool_docs,
             code_interpreter=code_interpreter,
             media_list=media_list,
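A hedged sketch of what the new branch enables from the caller's side: supplying `plan_context.code` makes `generate_code_from_plan` skip code generation and go straight to testing and debugging the provided code. Class, method, and field names come from this diff; the constructor arguments and message shape below are illustrative assumptions.

```python
# Hedged sketch; constructor details are assumptions, not confirmed by this diff.
from vision_agent.agent.types import AgentMessage, PlanContext
from vision_agent.agent.vision_agent_coder_v2 import VisionAgentCoderV2

coder = VisionAgentCoderV2()
plan_context = PlanContext(
    plan="Count the dogs in dogs.jpg and print the count.",
    instructions=["Detect dogs with an object detection tool", "Print len(detections)"],
    # Because this is non-empty, the coder tests/debugs it instead of regenerating it.
    code="import vision_agent.tools as T\n# ... existing solution code ...",
)
code_context = coder.generate_code_from_plan(
    [AgentMessage(role="user", content="Count the dogs in dogs.jpg", media=["dogs.jpg"])],
    plan_context,
)
```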
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner_prompts_v2.py
RENAMED
@@ -458,6 +458,8 @@ You are given a task: "{task}" from the user. You must extract the type of categ
 - "DocQA" - answering questions about a document or extracting information from a document.
 - "video object tracking" - tracking objects in a video.
 - "depth and pose estimation" - estimating the depth or pose of objects in an image.
+- "temporal localization" - localizing the time period an event occurs in a video.
+- "inpainting" - filling in masked parts of an image.
 
 Return the category or categories (comma separated) inside tags <category># your categories here</category>. If you are unsure about a task, it is better to include more categories than less.
 """
@@ -651,22 +653,24 @@ PICK_TOOL = """
 """
 
 FINALIZE_PLAN = """
-**
+**Task**: You are given a chain of thoughts, python executions and observations from a planning agent as it tries to construct a plan to solve a user request. Your task is to summarize the plan it found so that another programming agent to write a program to accomplish the user request.
 
-**
+**Documentation**: You can use these tools to help you visualize or save the output:
+{tool_desc}
 
 **Planning**: Here is chain of thoughts, executions and observations from the planning agent:
 {planning}
 
 **Instructions**:
 1. Summarize the plan that the planning agent found.
-2. Write a single function that solves the problem based on what the planner found.
-3.
+2. Write a single function that solves the problem based on what the planner found and only returns the final solution.
+3. Only use tools obtained from calling `get_tool_for_task`.
 4. Do not include {excluded_tools} tools in your instructions.
-5.
-6.
-7.
-8.
+5. Ensure the function is well documented and easy to understand.
+6. Ensure you visualize the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and save it to a file with `save_image` or `save_video`.
+7. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
+8. Include the expected answer in your 'plan' so that the programming agent can properly test if it has the correct answer.
+9. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
 
 <json>
 {{
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_planner_v2.py
@@ -326,6 +326,7 @@ def create_finalize_plan(
         return [], PlanContext(plan="", instructions=[], code="")
 
     prompt = FINALIZE_PLAN.format(
+        tool_desc=UTIL_DOCSTRING,
         planning=get_planning(chat),
         excluded_tools=str([t.__name__ for t in pt.PLANNER_TOOLS]),
     )
@@ -513,6 +514,7 @@ class VisionAgentPlannerV2(AgentPlanner):
             code = extract_tag(response, "execute_python")
             finalize_plan = extract_tag(response, "finalize_plan")
             finished = finalize_plan is not None
+            self.update_callback({"role": "planner_update", "content": response})
 
             if self.verbose:
                 _CONSOLE.print(
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_prompts_v2.py
@@ -42,6 +42,8 @@ AGENT: <response>I am VisionAgent, an agent built by LandingAI, to help users wr
 - Understanding documents
 - Pose estimation
 - Visual question answering for both images and videos
+- Action recognition in videos
+- Image inpainting
 
 How can I help you?</response>
 --- END EXAMPLE2 ---
@@ -54,7 +56,8 @@ Here is the current conversation so far:
 
 **Instructions**:
 1. Only respond with a single <response> tag and a single <action> tag.
-2.
+2. You can only take one action at a time in response to the user's message. Do not offer to fix code on the user's behalf, only if they have directly asked you to.
+3. Respond in the following format, the <action> tag is optional and can be excluded if you do not want to take any action:
 
 <response>Your response to the user's message</response>
 <action>The action you want to take from **Actions**</action>
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/agent/vision_agent_v2.py
@@ -112,14 +112,17 @@ def maybe_run_action(
             )
         ]
     elif action == "edit_code":
+        # We don't want to pass code in plan_context.code so the coder will generate
+        # new code from plan_context.plan
         plan_context = PlanContext(
-            plan="Edit the latest code observed in the fewest steps possible according to the user's feedback."
+            plan="Edit the latest code observed in the fewest steps possible according to the user's feedback."
+            + ("<code>\n" + final_code + "\n</code>" if final_code is not None else ""),
            instructions=[
                 chat_i.content
                 for chat_i in extracted_chat
                 if chat_i.role == "user" and "<final_code>" not in chat_i.content
             ],
-            code=
+            code="",
         )
         context = coder.generate_code_from_plan(
             extracted_chat, plan_context, code_interpreter=code_interpreter
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/configs/anthropic_config.py
@@ -81,7 +81,7 @@ class Config(BaseModel):
     tool_tester_kwargs: dict = Field(
         default_factory=lambda: {
             "model_name": "claude-3-5-sonnet-20241022",
-            "temperature":
+            "temperature": 0.0,
             "image_size": 768,
         }
     )
@@ -111,7 +111,7 @@ class Config(BaseModel):
     vqa_kwargs: dict = Field(
         default_factory=lambda: {
             "model_name": "claude-3-5-sonnet-20241022",
-            "temperature":
+            "temperature": 0.0,
             "image_size": 768,
         }
     )
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/configs/openai_config.py
@@ -98,7 +98,7 @@ class Config(BaseModel):
     tool_chooser_kwargs: dict = Field(
         default_factory=lambda: {
             "model_name": "gpt-4o-2024-08-06",
-            "temperature":
+            "temperature": 1.0,
             "image_size": 768,
             "image_detail": "low",
         }
@@ -109,7 +109,7 @@ class Config(BaseModel):
     suggester_kwargs: dict = Field(
         default_factory=lambda: {
             "model_name": "gpt-4o-2024-08-06",
-            "temperature":
+            "temperature": 1.0,
             "image_size": 768,
             "image_detail": "low",
         }
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/planner_tools.py
@@ -2,7 +2,7 @@ import inspect
 import logging
 import tempfile
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any, Callable, Dict, List, Optional, Tuple, cast
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
 
 import libcst as cst
 import numpy as np
@@ -10,12 +10,7 @@ from IPython.display import display
 from PIL import Image
 
 import vision_agent.tools as T
-from vision_agent.agent.agent_utils import (
-    DefaultImports,
-    extract_code,
-    extract_json,
-    extract_tag,
-)
+from vision_agent.agent.agent_utils import DefaultImports, extract_json, extract_tag
 from vision_agent.agent.vision_agent_planner_prompts_v2 import (
     CATEGORIZE_TOOL_REQUEST,
     FINALIZE_PLAN,
@@ -36,6 +31,9 @@ from vision_agent.utils.image_utils import convert_to_b64
 from vision_agent.utils.sim import get_tool_recommender
 
 TOOL_FUNCTIONS = {tool.__name__: tool for tool in T.TOOLS}
+LOAD_TOOLS_DOCSTRING = T.get_tool_documentation(
+    [T.load_image, T.extract_frames_and_timestamps]
+)
 
 CONFIG = Config()
 _LOGGER = logging.getLogger(__name__)
@@ -179,6 +177,7 @@ def run_tool_testing(
         cleaned_tool_docs.append(tool_doc)
     tool_docs = cleaned_tool_docs
     tool_docs_str = "\n".join([e["doc"] for e in tool_docs])
+    tool_docs_str += "\n" + LOAD_TOOLS_DOCSTRING
 
     prompt = TEST_TOOLS.format(
         tool_docs=tool_docs_str,
@@ -217,8 +216,15 @@ def run_tool_testing(
         examples=EXAMPLES,
         media=str(image_paths),
     )
-
-    code =
+    response = cast(str, lmm.generate(prompt, media=image_paths))
+    code = extract_tag(response, "code")
+    if code is None:
+        code = response
+
+    try:
+        code = process_code(code)
+    except Exception as e:
+        _LOGGER.error(f"Error processing code: {e}")
     tool_output = code_interpreter.exec_isolation(
         DefaultImports.prepend_imports(code)
     )
@@ -229,7 +235,9 @@ def run_tool_testing(
 
 
 def get_tool_for_task(
-    task: str,
+    task: str,
+    images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]],
+    exclude_tools: Optional[List[str]] = None,
 ) -> None:
     """Given a task and one or more images this function will find a tool to accomplish
     the jobs. It prints the tool documentation and thoughts on why it chose the tool.
@@ -242,6 +250,8 @@ def get_tool_for_task(
     - VQA
     - Depth and pose estimation
     - Video object tracking
+    - Video temporal localization (action recognition)
+    - Image inpainting
 
     Only ask for one type of task at a time, for example a task needing to identify
     text is one OCR task while needing to identify non-text objects is an OD task. Wait
@@ -250,7 +260,8 @@ def get_tool_for_task(
 
     Parameters:
         task: str: The task to accomplish.
-        images: List[np.ndarray]: The images to use
+        images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]]: The images to use
+            for the task. If a key is provided, it is used as the file name.
         exclude_tools: Optional[List[str]]: A list of tool names to exclude from the
             recommendations. This is helpful if you are calling get_tool_for_task twice
             and do not want the same tool recommended.
@@ -260,20 +271,29 @@ def get_tool_for_task(
 
     Examples
     --------
-        >>> get_tool_for_task(
+        >>> get_tool_for_task(
+        >>>     "Give me an OCR model that can find 'hot chocolate' in the image",
+        >>>     {"image": [image]})
+        >>> get_tool_for_taks(
+        >>>     "I need a tool that can paint a background for this image and maks",
+        >>>     {"image": [image], "mask": [mask]})
     """
     tool_tester = CONFIG.create_tool_tester()
     tool_chooser = CONFIG.create_tool_chooser()
 
+    if isinstance(images, list):
+        images = {"image": images}
+
     with (
         tempfile.TemporaryDirectory() as tmpdirname,
         CodeInterpreterFactory.new_instance() as code_interpreter,
     ):
         image_paths = []
-        for
-
-
-
+        for k in images.keys():
+            for i, image in enumerate(images[k]):
+                image_path = f"{tmpdirname}/{k}_{i}.png"
+                Image.fromarray(image).save(image_path)
+                image_paths.append(image_path)
 
         code, tool_docs_str, tool_output = run_tool_testing(
             task, image_paths, tool_tester, exclude_tools, code_interpreter
@@ -294,20 +314,26 @@ def get_tool_documentation(tool_name: str) -> str:
 
 
 def get_tool_for_task_human_reviewer(
-    task: str,
+    task: str,
+    images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]],
+    exclude_tools: Optional[List[str]] = None,
 ) -> None:
     # NOTE: this will have the same documentation as get_tool_for_task
     tool_tester = CONFIG.create_tool_tester()
 
+    if isinstance(images, list):
+        images = {"image": images}
+
     with (
         tempfile.TemporaryDirectory() as tmpdirname,
         CodeInterpreterFactory.new_instance() as code_interpreter,
    ):
         image_paths = []
-        for
-
-
-
+        for k in images.keys():
+            for i, image in enumerate(images[k]):
+                image_path = f"{tmpdirname}/{k}_{i}.png"
+                Image.fromarray(image).save(image_path)
+                image_paths.append(image_path)
 
         tools = [
             t.__name__
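For context, a hedged usage sketch of the widened `images` parameter: a flat list is still accepted (it is wrapped as `{"image": [...]}` internally), while a dict lets you pass named groups such as an image plus its mask, with the key used in the temporary file names. The placeholder arrays below are illustrative.

```python
import numpy as np
from vision_agent.tools.planner_tools import get_tool_for_task

image = np.zeros((512, 512, 3), dtype=np.uint8)  # placeholder image
mask = np.zeros((512, 512), dtype=np.uint8)      # placeholder binary mask

# Flat list, as before 0.2.232; wrapped internally as {"image": [image]}.
get_tool_for_task("Find the text 'hot chocolate' in the image", [image])

# New dict form: named image groups, e.g. for an inpainting task.
get_tool_for_task(
    "I need a tool that can paint a new background for this image and mask",
    {"image": [image], "mask": [mask]},
)
```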
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/tool_utils.py
@@ -318,6 +318,9 @@ def single_nms(
 def nms(
     all_preds: List[List[Dict[str, Any]]], iou_threshold: float
 ) -> List[List[Dict[str, Any]]]:
+    if not isinstance(all_preds[0], List):
+        all_preds = [all_preds]
+
     return_preds = []
     for frame_preds in all_preds:
         frame_preds = single_nms(frame_preds, iou_threshold)
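A small hedged sketch of the added guard: `nms` previously required per-frame nested lists and now also accepts a flat list of detections for a single image. The detection dict layout follows the label/score/bbox convention visible elsewhere in this diff.

```python
from vision_agent.tools.tool_utils import nms  # module path per this diff

single_frame_preds = [
    {"label": "dog", "score": 0.92, "bbox": [0.10, 0.10, 0.40, 0.40]},
    {"label": "dog", "score": 0.88, "bbox": [0.11, 0.12, 0.41, 0.42]},  # heavy overlap
]

# Before 0.2.232 this had to be passed as [single_frame_preds]; the new
# isinstance check wraps a flat list automatically.
kept = nms(single_frame_preds, 0.5)
```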
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/tools.py
@@ -222,7 +222,7 @@ def sam2(
     ret = _sam2(image, detections, image_size)
     _display_tool_trace(
         sam2.__name__,
-        {},
+        {"detections": detections},
         ret["display_data"],
         ret["files"],
     )
@@ -314,18 +314,29 @@ def od_sam2_video_tracking(
 
     # Process each segment and collect detections
     detections_per_segment: List[Any] = []
-
-
-
-
-
-
-
-
-
-
-
-
+    with ThreadPoolExecutor() as executor:
+        futures = {
+            executor.submit(
+                process_segment,
+                segment_frames=segment,
+                od_model=od_model,
+                prompt=prompt,
+                fine_tune_id=fine_tune_id,
+                chunk_length=chunk_length,
+                image_size=image_size,
+                segment_index=segment_index,
+                object_detection_tool=_apply_object_detection,
+            ): segment_index
+            for segment_index, segment in enumerate(segments)
+        }
+
+        for future in as_completed(futures):
+            segment_index = futures[future]
+            detections_per_segment.append((segment_index, future.result()))
+
+    detections_per_segment = [
+        x[1] for x in sorted(detections_per_segment, key=lambda x: x[0])
+    ]
 
     merged_detections = merge_segments(detections_per_segment)
     post_processed = post_process(merged_detections, image_size)
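The change above swaps a sequential loop for a thread pool; because `as_completed` yields futures out of order, each future is keyed by its segment index and results are re-sorted before merging. A self-contained sketch of that ordering pattern, independent of the vision_agent internals:

```python
# Standalone illustration of the ordering pattern used in od_sam2_video_tracking:
# results arrive out of order from as_completed, so each future is keyed by its
# segment index and the collected list is re-sorted before further processing.
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List


def process_segment(segment: List[int]) -> int:
    # Stand-in for the real per-segment detection call.
    return sum(segment)


segments = [[1, 2], [3, 4], [5, 6]]
results = []

with ThreadPoolExecutor() as executor:
    futures = {
        executor.submit(process_segment, segment): segment_index
        for segment_index, segment in enumerate(segments)
    }
    for future in as_completed(futures):
        results.append((futures[future], future.result()))

ordered = [r for _, r in sorted(results, key=lambda x: x[0])]
print(ordered)  # [3, 7, 11] regardless of completion order
```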
@@ -390,7 +401,7 @@ def _owlv2_object_detection(
         {
             "label": bbox["label"],
             "bbox": normalize_bbox(bbox["bounding_box"], image_size),
-            "score": bbox["score"],
+            "score": round(bbox["score"], 2),
         }
         for bbox in bboxes
     ]
@@ -398,7 +409,7 @@ def _owlv2_object_detection(
         {
             "label": bbox["label"],
             "bbox": bbox["bounding_box"],
-            "score": bbox["score"],
+            "score": round(bbox["score"], 2),
         }
         for bbox in bboxes
     ]
@@ -582,7 +593,7 @@ def owlv2_sam2_video_tracking(
     )
     _display_tool_trace(
         owlv2_sam2_video_tracking.__name__,
-        {},
+        {"prompt": prompt, "chunk_length": chunk_length},
         ret["display_data"],
         ret["files"],
     )
@@ -1681,7 +1692,7 @@ def video_temporal_localization(
     prompt: str,
     frames: List[np.ndarray],
     model: str = "qwen2vl",
-    chunk_length_frames:
+    chunk_length_frames: int = 2,
 ) -> List[float]:
     """'video_temporal_localization' will run qwen2vl on each chunk_length_frames
     value selected for the video. It can detect multiple objects independently per
@@ -1695,7 +1706,7 @@ def video_temporal_localization(
         frames (List[np.ndarray]): The reference frames used for the question
         model (str): The model to use for the inference. Valid values are
             'qwen2vl', 'gpt4o'.
-        chunk_length_frames (
+        chunk_length_frames (int): length of each chunk in frames
 
     Returns:
         List[float]: A list of floats with a value of 1.0 if the objects to be found
@@ -1714,19 +1725,48 @@ def video_temporal_localization(
         "model": model,
         "function_name": "video_temporal_localization",
     }
-
-
+    payload["chunk_length_frames"] = chunk_length_frames
+
+    segments = split_frames_into_segments(frames, segment_size=50, overlap=0)
+
+    def _apply_temporal_localization(
+        segment: List[np.ndarray],
+    ) -> List[float]:
+        segment_buffer_bytes = [("video", frames_to_bytes(segment))]
+        data = send_inference_request(
+            payload, "video-temporal-localization", files=segment_buffer_bytes, v2=True
+        )
+        chunked_data = [cast(float, value) for value in data]
+
+        full_data = []
+        for value in chunked_data:
+            full_data.extend([value] * chunk_length_frames)
+
+        return full_data[: len(segment)]
+
+    with ThreadPoolExecutor() as executor:
+        futures = {
+            executor.submit(_apply_temporal_localization, segment): segment_index
+            for segment_index, segment in enumerate(segments)
+        }
+
+        localization_per_segment = []
+        for future in as_completed(futures):
+            segment_index = futures[future]
+            localization_per_segment.append((segment_index, future.result()))
+
+    localization_per_segment = [
+        x[1] for x in sorted(localization_per_segment, key=lambda x: x[0])  # type: ignore
+    ]
+    localizations = cast(List[float], [e for o in localization_per_segment for e in o])
 
-    data = send_inference_request(
-        payload, "video-temporal-localization", files=files, v2=True
-    )
     _display_tool_trace(
         video_temporal_localization.__name__,
         payload,
-
+        localization_per_segment,
         files,
     )
-    return
+    return localizations
 
 
 def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
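A standalone, hedged sketch of the expansion step added above: the service returns one score per chunk, which is repeated `chunk_length_frames` times and truncated to the segment length to give per-frame values (the inference call itself is not reproduced here):

```python
# Mocked chunk scores for a 7-frame segment with chunk_length_frames=2:
# the service returns one score per chunk, which is expanded to per-frame values.
chunk_length_frames = 2
segment_length = 7
chunked_scores = [0.0, 1.0, 1.0, 0.0]  # one score per chunk of 2 frames

full_scores = []
for value in chunked_scores:
    full_scores.extend([value] * chunk_length_frames)

per_frame = full_scores[:segment_length]
print(per_frame)  # [0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0]
```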
@@ -2012,16 +2052,18 @@ def flux_image_inpainting(
     mask: np.ndarray,
 ) -> np.ndarray:
     """'flux_image_inpainting' performs image inpainting to fill the masked regions,
-    given by mask, in the image, given image based on the text prompt and surrounding
-    It can be used to edit regions of an image according to the prompt
+    given by mask, in the image, given image based on the text prompt and surrounding
+    image context. It can be used to edit regions of an image according to the prompt
+    given.
 
     Parameters:
         prompt (str): A detailed text description guiding what should be generated
-            in the masked area. More detailed and specific prompts typically yield
-
-
-
-
+            in the masked area. More detailed and specific prompts typically yield
+            better results.
+        image (np.ndarray): The source image to be inpainted. The image will serve as
+            the base context for the inpainting process.
+        mask (np.ndarray): A binary mask image with 0's and 1's, where 1 indicates
+            areas to be inpainted and 0 indicates areas to be preserved.
 
     Returns:
         np.ndarray: The generated image(s) as a numpy array in RGB format with values
@@ -2150,7 +2192,7 @@ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any
     return response
 
 
-#
+# Agentic OD Tools
 
 
 def _agentic_object_detection(
@@ -2646,7 +2688,7 @@ def save_image(image: np.ndarray, file_path: str) -> None:
 
 
 def save_video(
-    frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float =
+    frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 5
 ) -> str:
     """'save_video' is a utility function that saves a list of frames as a mp4 video file on disk.
 
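Given the clarified `flux_image_inpainting` docstring (mask value 1 marks pixels to repaint, 0 marks pixels to keep), a hedged usage sketch; the placeholder arrays and the assumption that both tools are importable from `vision_agent.tools` are illustrative, not taken from the diff.

```python
import numpy as np
from vision_agent.tools import flux_image_inpainting, save_image  # assumed exports

image = np.zeros((512, 512, 3), dtype=np.uint8)  # placeholder source image
mask = np.zeros((512, 512), dtype=np.uint8)
mask[100:200, 100:200] = 1  # 1 = inpaint this region, 0 = preserve (per the new docstring)

result = flux_image_inpainting("a red balloon", image, mask)
save_image(result, "inpainted.png")
```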
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/utils/sim.py
@@ -98,10 +98,12 @@ class Sim:
             raise ValueError("key is required if no column 'embs' is present.")
 
         if sim_key is not None:
-            self.df
-
-
-
+            self.df = self.df.assign(
+                embs=self.df[sim_key].apply(
+                    lambda x: get_embedding(
+                        self.emb_call,
+                        x,
+                    )
                 )
             )
 
@@ -141,7 +143,9 @@ class Sim:
 
         df_load = pd.read_csv(load_dir / "df.csv")
         if platform.system() == "Windows":
-            df_load
+            df_load = df_load.assign(
+                doc=df_load.doc.apply(lambda x: x.replace("\r", ""))
+            )
         return df.equals(df_load)  # type: ignore
 
     @lru_cache(maxsize=256)
@@ -166,7 +170,9 @@ class Sim:
             self.emb_call,
             query,
         )
-        self.df
+        self.df = self.df.assign(
+            sim=self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
+        )
         res = self.df.sort_values("sim", ascending=False).head(k)
         if thresh is not None:
             res = res[res.sim > thresh]
@@ -214,8 +220,13 @@ class AzureSim(Sim):
             raise ValueError("key is required if no column 'embs' is present.")
 
         if sim_key is not None:
-            self.df
-
+            self.df = self.df.assign(
+                embs=self.df[sim_key].apply(
+                    lambda x: get_embedding(
+                        self.emb_call,
+                        x,
+                    )
+                )
             )
 
 
@@ -245,8 +256,13 @@ class OllamaSim(Sim):
             raise ValueError("key is required if no column 'embs' is present.")
 
         if sim_key is not None:
-            self.df
-
+            self.df = self.df.assign(
+                embs=self.df[sim_key].apply(
+                    lambda x: get_embedding(
+                        self.emb_call,
+                        x,
+                    )
+                )
             )
 
 
@@ -267,8 +283,13 @@ class StellaSim(Sim):
             raise ValueError("key is required if no column 'embs' is present.")
 
         if sim_key is not None:
-            self.df
-
+            self.df = self.df.assign(
+                embs=self.df[sim_key].apply(
+                    lambda x: get_embedding(
+                        self.emb_call,
+                        x,
+                    )
+                )
             )
 
     @staticmethod
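All of the sim.py hunks above replace in-place column mutation with `DataFrame.assign`, which returns a new frame; a minimal standalone pandas sketch of the same pattern (the embedding call is a stand-in for `Sim.emb_call`):

```python
import pandas as pd


def get_embedding(text: str) -> list:
    # Stand-in for the real embedding call used by Sim in vision_agent.
    return [float(len(text)), 0.0]


df = pd.DataFrame({"doc": ["detect dogs", "segment cats"]})

# assign() returns a new DataFrame instead of mutating in place, which is the
# pattern the 0.2.232 Sim/AzureSim/OllamaSim/StellaSim constructors now use.
df = df.assign(embs=df["doc"].apply(get_embedding))
print(df)
```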
{vision_agent-0.2.230 → vision_agent-0.2.232}/vision_agent/tools/__init__.py
@@ -23,6 +23,9 @@ from .tools import (
     TOOLS_INFO,
     UTIL_TOOLS,
     UTILITIES_DOCSTRING,
+    agentic_object_detection,
+    agentic_sam2_instance_segmentation,
+    agentic_sam2_video_tracking,
     claude35_text_extraction,
     closest_box_distance,
     closest_mask_distance,
@@ -30,6 +33,7 @@ from .tools import (
     countgd_sam2_instance_segmentation,
     countgd_sam2_video_tracking,
     countgd_visual_prompt_object_detection,
+    custom_object_detection,
     depth_anything_v2,
     detr_segmentation,
     document_extraction,
@@ -63,10 +67,6 @@ from .tools import (
     video_temporal_localization,
     vit_image_classification,
     vit_nsfw_classification,
-    custom_object_detection,
-    agentic_object_detection,
-    agentic_sam2_instance_segmentation,
-    agentic_sam2_video_tracking,
 )
 
 __new_tools__ = [