vision-agent 0.2.229__py3-none-any.whl → 0.2.230__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/.sim_tools/df.csv +10 -8
- vision_agent/agent/agent_utils.py +10 -9
- vision_agent/agent/vision_agent.py +3 -4
- vision_agent/agent/vision_agent_coder_prompts.py +6 -6
- vision_agent/agent/vision_agent_coder_v2.py +41 -26
- vision_agent/agent/vision_agent_planner_prompts.py +6 -6
- vision_agent/agent/vision_agent_planner_prompts_v2.py +16 -50
- vision_agent/agent/vision_agent_planner_v2.py +10 -12
- vision_agent/agent/vision_agent_prompts.py +11 -11
- vision_agent/agent/vision_agent_prompts_v2.py +18 -3
- vision_agent/agent/vision_agent_v2.py +29 -30
- vision_agent/configs/__init__.py +1 -0
- vision_agent/configs/anthropic_config.py +150 -0
- vision_agent/configs/anthropic_openai_config.py +150 -0
- vision_agent/configs/config.py +150 -0
- vision_agent/configs/openai_config.py +160 -0
- vision_agent/lmm/__init__.py +1 -1
- vision_agent/lmm/lmm.py +63 -9
- vision_agent/tools/planner_tools.py +60 -40
- vision_agent/tools/tools.py +10 -8
- vision_agent-0.2.230.dist-info/METADATA +156 -0
- {vision_agent-0.2.229.dist-info → vision_agent-0.2.230.dist-info}/RECORD +24 -19
- vision_agent-0.2.229.dist-info/METADATA +0 -562
- {vision_agent-0.2.229.dist-info → vision_agent-0.2.230.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.229.dist-info → vision_agent-0.2.230.dist-info}/WHEEL +0 -0
vision_agent/.sim_tools/df.csv
CHANGED
@@ -244,7 +244,8 @@ desc,doc,name
|
|
244
244
|
1.0.
|
245
245
|
|
246
246
|
Parameters:
|
247
|
-
prompt (str): The prompt to ground to the image.
|
247
|
+
prompt (str): The prompt to ground to the image. Use exclusive categories that
|
248
|
+
do not overlap such as 'person, car' and NOT 'person, athlete'.
|
248
249
|
image (np.ndarray): The image to ground the prompt to.
|
249
250
|
fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
|
250
251
|
fine-tuned model ID here to use it.
|
@@ -281,7 +282,8 @@ desc,doc,name
|
|
281
282
|
is useful for tracking and counting without duplicating counts.
|
282
283
|
|
283
284
|
Parameters:
|
284
|
-
prompt (str): The prompt to ground to the
|
285
|
+
prompt (str): The prompt to ground to the image. Use exclusive categories that
|
286
|
+
do not overlap such as 'person, car' and NOT 'person, athlete'.
|
285
287
|
frames (List[np.ndarray]): The list of frames to ground the prompt to.
|
286
288
|
chunk_length (Optional[int]): The number of frames to re-run florence2 to find
|
287
289
|
new objects.
|
@@ -317,14 +319,14 @@ desc,doc,name
|
|
317
319
|
]
|
318
320
|
",florence2_sam2_video_tracking
|
319
321
|
"'florence2_object_detection' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated confidence scores of 1.0.","florence2_object_detection(prompt: str, image: numpy.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
|
320
|
-
'florence2_object_detection' is a tool that can detect multiple
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
confidence scores of 1.0.
|
322
|
+
'florence2_object_detection' is a tool that can detect multiple objects given a
|
323
|
+
text prompt which can be object names or caption. You can optionally separate the
|
324
|
+
object names in the text with commas. It returns a list of bounding boxes with
|
325
|
+
normalized coordinates, label names and associated confidence scores of 1.0.
|
325
326
|
|
326
327
|
Parameters:
|
327
|
-
prompt (str): The prompt to ground to the image.
|
328
|
+
prompt (str): The prompt to ground to the image. Use exclusive categories that
|
329
|
+
do not overlap such as 'person, car' and NOT 'person, athlete'.
|
328
330
|
image (np.ndarray): The image to used to detect objects
|
329
331
|
fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
|
330
332
|
fine-tuned model ID here to use it.
|
@@ -157,10 +157,11 @@ def format_conversation(chat: List[AgentMessage]) -> str:
|
|
157
157
|
chat = copy.deepcopy(chat)
|
158
158
|
prompt = ""
|
159
159
|
for chat_i in chat:
|
160
|
-
if chat_i.role == "user":
|
161
|
-
|
162
|
-
|
163
|
-
|
160
|
+
if chat_i.role == "user" or chat_i.role == "coder":
|
161
|
+
if "<final_code>" in chat_i.role:
|
162
|
+
prompt += f"OBSERVATION: {chat_i.content}\n\n"
|
163
|
+
elif chat_i.role == "user":
|
164
|
+
prompt += f"USER: {chat_i.content}\n\n"
|
164
165
|
elif chat_i.role == "conversation":
|
165
166
|
prompt += f"AGENT: {chat_i.content}\n\n"
|
166
167
|
return prompt
|
@@ -332,26 +333,26 @@ def strip_function_calls( # noqa: C901
|
|
332
333
|
def __init__(self, exclusions: List[str]):
|
333
334
|
# Store exclusions to skip removing certain function calls
|
334
335
|
self.exclusions = exclusions
|
335
|
-
self.in_function_or_class =
|
336
|
+
self.in_function_or_class: List[bool] = []
|
336
337
|
|
337
338
|
def visit_FunctionDef(self, node: cst.FunctionDef) -> Optional[bool]:
|
338
|
-
self.in_function_or_class
|
339
|
+
self.in_function_or_class.append(True)
|
339
340
|
return True
|
340
341
|
|
341
342
|
def leave_FunctionDef(
|
342
343
|
self, original_node: cst.FunctionDef, updated_node: cst.FunctionDef
|
343
344
|
) -> cst.BaseStatement:
|
344
|
-
self.in_function_or_class
|
345
|
+
self.in_function_or_class.pop()
|
345
346
|
return updated_node
|
346
347
|
|
347
348
|
def visit_ClassDef(self, node: cst.ClassDef) -> Optional[bool]:
|
348
|
-
self.in_function_or_class
|
349
|
+
self.in_function_or_class.append(True)
|
349
350
|
return True
|
350
351
|
|
351
352
|
def leave_ClassDef(
|
352
353
|
self, node: cst.ClassDef, updated_node: cst.ClassDef
|
353
354
|
) -> cst.BaseStatement:
|
354
|
-
self.in_function_or_class
|
355
|
+
self.in_function_or_class.pop()
|
355
356
|
return updated_node
|
356
357
|
|
357
358
|
def leave_Expr(
|
@@ -291,10 +291,9 @@ class VisionAgent(Agent):
|
|
291
291
|
verbosity (int): The verbosity level of the agent.
|
292
292
|
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
|
293
293
|
function to send intermediate update messages.
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
object is provided it will use that.
|
294
|
+
code_sandbox_runtime (Optional[str]): For string values it can be one of:
|
295
|
+
None, "local" or "e2b". If None, it will read from the environment
|
296
|
+
variable "CODE_SANDBOX_RUNTIME".
|
298
297
|
"""
|
299
298
|
|
300
299
|
self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
|
@@ -44,22 +44,22 @@ Can you write a program to check if each person is wearing a helmet? First detec
|
|
44
44
|
|
45
45
|
## Subtasks
|
46
46
|
|
47
|
-
This plan uses the
|
48
|
-
-Use
|
47
|
+
This plan uses the owlv2_object_detection tool to detect both people and helmets in a single pass, which should be efficient and accurate. We can then compare the detections to determine if each person is wearing a helmet.
|
48
|
+
-Use owlv2_object_detection with prompt 'person, helmet' to detect both people and helmets in the image
|
49
49
|
-Process the detections to match helmets with people based on bounding box proximity
|
50
50
|
-Count people with and without helmets based on the matching results
|
51
51
|
-Return a dictionary with the counts
|
52
52
|
|
53
53
|
|
54
54
|
**Tool Tests and Outputs**:
|
55
|
-
After examining the image, I can see 4 workers in total, with 3 wearing yellow safety helmets and 1 not wearing a helmet. Plan 1 using
|
55
|
+
After examining the image, I can see 4 workers in total, with 3 wearing yellow safety helmets and 1 not wearing a helmet. Plan 1 using owlv2_object_detection seems to be the most accurate in detecting both people and helmets. However, it needs some modifications to improve accuracy. We should increase the confidence threshold to 0.15 to filter out the lowest confidence box, and implement logic to associate helmets with people based on their bounding box positions. Plan 2 and Plan 3 seem less reliable given the tool outputs, as they either failed to distinguish between people with and without helmets or misclassified all workers as not wearing helmets.
|
56
56
|
|
57
57
|
**Tool Output Thoughts**:
|
58
58
|
```python
|
59
59
|
...
|
60
60
|
```
|
61
61
|
----- stdout -----
|
62
|
-
Plan 1 -
|
62
|
+
Plan 1 - owlv2_object_detection:
|
63
63
|
|
64
64
|
[{{'label': 'helmet', 'score': 0.15, 'bbox': [0.85, 0.41, 0.87, 0.45]}}, {{'label': 'helmet', 'score': 0.3, 'bbox': [0.8, 0.43, 0.81, 0.46]}}, {{'label': 'helmet', 'score': 0.31, 'bbox': [0.85, 0.45, 0.86, 0.46]}}, {{'label': 'person', 'score': 0.31, 'bbox': [0.84, 0.45, 0.88, 0.58]}}, {{'label': 'person', 'score': 0.31, 'bbox': [0.78, 0.43, 0.82, 0.57]}}, {{'label': 'helmet', 'score': 0.33, 'bbox': [0.3, 0.65, 0.32, 0.67]}}, {{'label': 'person', 'score': 0.29, 'bbox': [0.28, 0.65, 0.36, 0.84]}}, {{'label': 'helmet', 'score': 0.29, 'bbox': [0.13, 0.82, 0.15, 0.85]}}, {{'label': 'person', 'score': 0.3, 'bbox': [0.1, 0.82, 0.24, 1.0]}}]
|
65
65
|
|
@@ -67,12 +67,12 @@ Plan 1 - owl_v2_image:
|
|
67
67
|
|
68
68
|
**Input Code Snippet**:
|
69
69
|
```python
|
70
|
-
from vision_agent.tools import load_image,
|
70
|
+
from vision_agent.tools import load_image, owlv2_object_detection
|
71
71
|
|
72
72
|
def check_helmets(image_path):
|
73
73
|
image = load_image(image_path)
|
74
74
|
# Detect people and helmets, filter out the lowest confidence helmet score of 0.15
|
75
|
-
detections =
|
75
|
+
detections = owlv2_object_detection("person, helmet", image, box_threshold=0.15)
|
76
76
|
height, width = image.shape[:2]
|
77
77
|
|
78
78
|
# Separate people and helmets
|
@@ -26,7 +26,8 @@ from vision_agent.agent.types import (
|
|
26
26
|
)
|
27
27
|
from vision_agent.agent.vision_agent_coder_prompts_v2 import CODE, FIX_BUG, TEST
|
28
28
|
from vision_agent.agent.vision_agent_planner_v2 import VisionAgentPlannerV2
|
29
|
-
from vision_agent.
|
29
|
+
from vision_agent.configs import Config
|
30
|
+
from vision_agent.lmm import LMM
|
30
31
|
from vision_agent.lmm.types import Message
|
31
32
|
from vision_agent.tools.meta_tools import get_diff
|
32
33
|
from vision_agent.utils.execute import (
|
@@ -36,6 +37,7 @@ from vision_agent.utils.execute import (
|
|
36
37
|
)
|
37
38
|
from vision_agent.utils.sim import Sim, get_tool_recommender
|
38
39
|
|
40
|
+
CONFIG = Config()
|
39
41
|
_CONSOLE = Console()
|
40
42
|
|
41
43
|
|
@@ -185,23 +187,17 @@ def debug_code(
|
|
185
187
|
return code, test, debug_info
|
186
188
|
|
187
189
|
|
188
|
-
def
|
189
|
-
coder: LMM,
|
190
|
+
def test_code(
|
190
191
|
tester: LMM,
|
191
192
|
debugger: LMM,
|
192
193
|
chat: List[AgentMessage],
|
193
194
|
plan: str,
|
195
|
+
code: str,
|
194
196
|
tool_docs: str,
|
195
197
|
code_interpreter: CodeInterpreter,
|
196
198
|
media_list: List[Union[str, Path]],
|
197
199
|
verbose: bool,
|
198
200
|
) -> CodeContext:
|
199
|
-
code = write_code(
|
200
|
-
coder=coder,
|
201
|
-
chat=chat,
|
202
|
-
tool_docs=tool_docs,
|
203
|
-
plan=plan,
|
204
|
-
)
|
205
201
|
try:
|
206
202
|
code = strip_function_calls(code)
|
207
203
|
except Exception:
|
@@ -257,6 +253,36 @@ def write_and_test_code(
|
|
257
253
|
)
|
258
254
|
|
259
255
|
|
256
|
+
def write_and_test_code(
|
257
|
+
coder: LMM,
|
258
|
+
tester: LMM,
|
259
|
+
debugger: LMM,
|
260
|
+
chat: List[AgentMessage],
|
261
|
+
plan: str,
|
262
|
+
tool_docs: str,
|
263
|
+
code_interpreter: CodeInterpreter,
|
264
|
+
media_list: List[Union[str, Path]],
|
265
|
+
verbose: bool,
|
266
|
+
) -> CodeContext:
|
267
|
+
code = write_code(
|
268
|
+
coder=coder,
|
269
|
+
chat=chat,
|
270
|
+
tool_docs=tool_docs,
|
271
|
+
plan=plan,
|
272
|
+
)
|
273
|
+
return test_code(
|
274
|
+
tester,
|
275
|
+
debugger,
|
276
|
+
chat,
|
277
|
+
plan,
|
278
|
+
code,
|
279
|
+
tool_docs,
|
280
|
+
code_interpreter,
|
281
|
+
media_list,
|
282
|
+
verbose,
|
283
|
+
)
|
284
|
+
|
285
|
+
|
260
286
|
class VisionAgentCoderV2(AgentCoder):
|
261
287
|
"""VisionAgentCoderV2 is an agent that will write vision code for you."""
|
262
288
|
|
@@ -300,21 +326,9 @@ class VisionAgentCoderV2(AgentCoder):
|
|
300
326
|
)
|
301
327
|
)
|
302
328
|
|
303
|
-
self.coder = (
|
304
|
-
|
305
|
-
|
306
|
-
else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
|
307
|
-
)
|
308
|
-
self.tester = (
|
309
|
-
tester
|
310
|
-
if tester is not None
|
311
|
-
else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
|
312
|
-
)
|
313
|
-
self.debugger = (
|
314
|
-
debugger
|
315
|
-
if debugger is not None
|
316
|
-
else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
|
317
|
-
)
|
329
|
+
self.coder = coder if coder is not None else CONFIG.create_coder()
|
330
|
+
self.tester = tester if tester is not None else CONFIG.create_tester()
|
331
|
+
self.debugger = debugger if debugger is not None else CONFIG.create_debugger()
|
318
332
|
if tool_recommender is not None:
|
319
333
|
if isinstance(tool_recommender, str):
|
320
334
|
self.tool_recommender = Sim.load(tool_recommender)
|
@@ -440,12 +454,13 @@ class VisionAgentCoderV2(AgentCoder):
|
|
440
454
|
) as code_interpreter:
|
441
455
|
int_chat, _, media_list = add_media_to_chat(chat, code_interpreter)
|
442
456
|
tool_docs = retrieve_tools(plan_context.instructions, self.tool_recommender)
|
443
|
-
|
444
|
-
|
457
|
+
|
458
|
+
code_context = test_code(
|
445
459
|
tester=self.tester,
|
446
460
|
debugger=self.debugger,
|
447
461
|
chat=int_chat,
|
448
462
|
plan=format_plan_v2(plan_context),
|
463
|
+
code=plan_context.code,
|
449
464
|
tool_docs=tool_docs,
|
450
465
|
code_interpreter=code_interpreter,
|
451
466
|
media_list=media_list,
|
@@ -55,27 +55,27 @@ This is the documentation for the functions you have access to. You may call any
|
|
55
55
|
--- EXAMPLE1 ---
|
56
56
|
plan1:
|
57
57
|
- Load the image from the provided file path 'image.jpg'.
|
58
|
-
- Use the '
|
58
|
+
- Use the 'owlv2_object_detection' tool with the prompt 'person' to detect and count the number of people in the image.
|
59
59
|
plan2:
|
60
60
|
- Load the image from the provided file path 'image.jpg'.
|
61
|
-
- Use the '
|
61
|
+
- Use the 'florence2_sam2_instance_segmentation' tool with the prompt 'person' to detect and count the number of people in the image.
|
62
62
|
- Count the number of detected objects labeled as 'person'.
|
63
63
|
plan3:
|
64
64
|
- Load the image from the provided file path 'image.jpg'.
|
65
65
|
- Use the 'countgd_object_detection' tool to count the dominant foreground object, which in this case is people.
|
66
66
|
|
67
67
|
```python
|
68
|
-
from vision_agent.tools import load_image,
|
68
|
+
from vision_agent.tools import load_image, owlv2_object_detection, florence2_sam2_instance_segmentation, countgd_object_detection
|
69
69
|
image = load_image("image.jpg")
|
70
|
-
owl_v2_out =
|
70
|
+
owl_v2_out = owlv2_object_detection("person", image)
|
71
71
|
|
72
|
-
f2s2_out =
|
72
|
+
f2s2_out = florence2_sam2_instance_segmentation("person", image)
|
73
73
|
# strip out the masks from the output becuase they don't provide useful information when printed
|
74
74
|
f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
|
75
75
|
|
76
76
|
cgd_out = countgd_object_detection("person", image)
|
77
77
|
|
78
|
-
final_out = {{"
|
78
|
+
final_out = {{"owlv2_object_detection": owl_v2_out, "florence2_sam2_instance_segmentation": f2s2, "countgd_object_detection": cgd_out}}
|
79
79
|
print(final_out)
|
80
80
|
--- END EXAMPLE1 ---
|
81
81
|
|
@@ -130,7 +130,7 @@ In these aerial images, I can see approximately 5-6 pedestrians walking in vario
|
|
130
130
|
[end of claude35_vqa_output]
|
131
131
|
|
132
132
|
[get_tool_for_task output]
|
133
|
-
After examining the image, I can see it's an aerial view of a busy urban intersection with multiple lanes of traffic. There are numerous cars visible, each likely containing at least one person (the driver). While it's difficult to count exact individuals, I can estimate based on the number of vehicles. The countgd_object_detection tool is the best choice for this task as it can detect and count multiple instances of an object given a text prompt. I will use this tool to count the number of pedestrians in the image.
|
133
|
+
After examining the image, I can see it's an aerial view of a busy urban intersection with multiple lanes of traffic. There are numerous cars visible, each likely containing at least one person (the driver). While it's difficult to count exact individuals, I can estimate based on the number of vehicles. The countgd_object_detection tool is the best choice for this task as it can detect and count multiple instances of an object given a text prompt. I will use this tool with the prompt 'pedistrian' to count the number of pedestrians in the image.
|
134
134
|
|
135
135
|
Tool Documentation:
|
136
136
|
countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float = 0.23) -> List[Dict[str, Any]]:
|
@@ -162,7 +162,7 @@ countgd_object_detection(prompt: str, image: numpy.ndarray, box_threshold: float
|
|
162
162
|
[end of get_tool_for_task output]
|
163
163
|
<count>8</count>
|
164
164
|
|
165
|
-
AGENT: <thinking>I need to now merge the boxes from all
|
165
|
+
AGENT: <thinking>I need to now merge the boxes from all region and use the countgd_object_detection tool with the prompt 'pedestrian' as suggested by get_tool_for_task.</thinking>
|
166
166
|
<execute_python>
|
167
167
|
def translate_ofset(bbox, offset_x, offset_y):
|
168
168
|
return (bbox[0] + offset_x, bbox[1] + offset_y, bbox[2] + offset_x, bbox[3] + offset_y)
|
@@ -271,7 +271,7 @@ get_tool_for_task('Identify and track the boxes in the video', frames[:5])
|
|
271
271
|
|
272
272
|
OBSERVATION:
|
273
273
|
[get_tool_for_task output]
|
274
|
-
For tracking boxes moving on a conveyor belt, we need a tool that can consistently track the same box across frames without losing it or double counting. Looking at the outputs: florence2_sam2_video_tracking successfully tracks the single box across all 5 frames, maintaining consistent tracking IDs and showing the box's movement along the conveyor.
|
274
|
+
For tracking boxes moving on a conveyor belt, we need a tool that can consistently track the same box across frames without losing it or double counting. Looking at the outputs: florence2_sam2_video_tracking successfully tracks the single box across all 5 frames, maintaining consistent tracking IDs and showing the box's movement along the conveyor and using the prompt 'box'.
|
275
275
|
|
276
276
|
Tool Documentation:
|
277
277
|
def florence2_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = 10) -> List[List[Dict[str, Any]]]:
|
@@ -317,7 +317,7 @@ def florence2_sam2_video_tracking(prompt: str, frames: List[np.ndarray], chunk_l
|
|
317
317
|
[end of get_tool_for_task output]
|
318
318
|
<count>8</count>
|
319
319
|
|
320
|
-
AGENT: <thinking>Now that I have the tool for tracking I need to find a tool to identify if there is tape on each of the boxes.</thinking>
|
320
|
+
AGENT: <thinking>Now that I have the tool and prompt for tracking I need to find a tool to identify if there is tape on each of the boxes.</thinking>
|
321
321
|
<execute_python>
|
322
322
|
track_predictions = florence2_sam2_video_tracking("box", frames)
|
323
323
|
|
@@ -338,7 +338,7 @@ get_tool_for_task('Identify if there is tape on the boxes', crops[:3])
|
|
338
338
|
|
339
339
|
OBSERVATION:
|
340
340
|
[get_tool_for_task output]
|
341
|
-
owlv2_object_detection performed best as it specifically detected multiple instances of tape with localized bounding boxes, which matches what's visible in the images.
|
341
|
+
owlv2_object_detection performed best with the prompt 'tape', as it specifically detected multiple instances of tape with localized bounding boxes, which matches what's visible in the images.
|
342
342
|
|
343
343
|
'owlv2_object_detection' is a tool that can detect and count multiple objects given a
|
344
344
|
text prompt such as category names or referring expressions on images. The categories
|
@@ -370,7 +370,7 @@ Example
|
|
370
370
|
[end of get_tool_for_task output]
|
371
371
|
<count>7</count>
|
372
372
|
|
373
|
-
AGENT: <thinking>Now that I have the tools for tracking and identifying tape on the boxes, I will use them to accomplish the user's request.</thinking>
|
373
|
+
AGENT: <thinking>Now that I have the tools for tracking and identifying 'tape' on the boxes, I will use them to accomplish the user's request.</thinking>
|
374
374
|
<execute_python>
|
375
375
|
obj_to_info = {{}}
|
376
376
|
for frame, frame_predictions in zip(frames, track_predictions):
|
@@ -487,8 +487,9 @@ TEST_TOOLS = """
|
|
487
487
|
2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
|
488
488
|
3. Your test case MUST run only on the given images which are {media}
|
489
489
|
4. For video tracking, use chunk_length=1 and at least 3 frames to ensure the best results when evaluating the tool.
|
490
|
-
5.
|
491
|
-
6.
|
490
|
+
5. Use mutually exclusive categories for prompts such as 'person, car' and not 'person, athlete' to avoid over counting.
|
491
|
+
6. Print this final dictionary.
|
492
|
+
7. Output your code in the following format wrapped in <code> tags:
|
492
493
|
<code>
|
493
494
|
# Your code here
|
494
495
|
</code>
|
@@ -649,41 +650,6 @@ PICK_TOOL = """
|
|
649
650
|
</json>
|
650
651
|
"""
|
651
652
|
|
652
|
-
PICK_TOOL_V2 = """
|
653
|
-
**Role**: You are an expert evaluator that can understand user requests and evaluate the output of different tools.
|
654
|
-
|
655
|
-
**Task**: You are given the output of different tools for a user request along with the image. You must evaluate the output and determine the best tool for the user request.
|
656
|
-
|
657
|
-
**User Request**:
|
658
|
-
{user_request}
|
659
|
-
|
660
|
-
**Tools**: This is the documentation of all the functions that were tested.
|
661
|
-
{tool_docs}
|
662
|
-
|
663
|
-
**Testing Code and Tool Output**:
|
664
|
-
{context}
|
665
|
-
|
666
|
-
**Previous Attempt**: This is the code and output of the previous attempt, if it is empty then there was no previous attempt.
|
667
|
-
{previous_attempts}
|
668
|
-
|
669
|
-
**Instructions**:
|
670
|
-
1. Re-read the user request, plans, tool outputs and examine the image.
|
671
|
-
2. Given the user request, try to solve the problem yourself.
|
672
|
-
3. Pick which tool output best matches your solution first and the user request, then consider other factors like box size, etc. DO NOT worry about low confidence scores if the output is correct.
|
673
|
-
4. DO NOT modify confidence thresholds unless the tool output is completely wrong.
|
674
|
-
5. Remember for videos that in order to count objects a video some sort of tracking is needed, or else you will overcount the objects.
|
675
|
-
6. Assign each tool a score from 0 to 10 based on how well it solves the user request. A score of 8+ means the tool output matches your solution and the tool is the best choice, 5-7 means the tool output is okay but needs some modifications, less than 5 means the tool output is bad and the tool should not be used. Return the the following JSON format inside <json> tags using the exact tool name as the key and the score as the value:
|
676
|
-
<json>
|
677
|
-
{{
|
678
|
-
"predicted_answer": str # the answer you would expect from the best plan
|
679
|
-
"thoughts": str # your thought process for choosing the best tool over other tools and any modifications you madeas well as the prompt you used with the tool.
|
680
|
-
"first tool": int # the score for the first tool
|
681
|
-
"second tool": int # the score for the second tool
|
682
|
-
...
|
683
|
-
}}
|
684
|
-
</json>
|
685
|
-
"""
|
686
|
-
|
687
653
|
FINALIZE_PLAN = """
|
688
654
|
**Role**: You are an expert AI model that can understand the user request and construct plans to accomplish it.
|
689
655
|
|
@@ -693,13 +659,13 @@ FINALIZE_PLAN = """
|
|
693
659
|
{planning}
|
694
660
|
|
695
661
|
**Instructions**:
|
696
|
-
1.
|
697
|
-
2.
|
698
|
-
3.
|
699
|
-
4.
|
700
|
-
5.
|
701
|
-
6.
|
702
|
-
7.
|
662
|
+
1. Summarize the plan that the planning agent found.
|
663
|
+
2. Write a single function that solves the problem based on what the planner found.
|
664
|
+
3. Specifically call out the tools used and the order in which they were used. Only include tools obtained from calling `get_tool_for_task`.
|
665
|
+
4. Do not include {excluded_tools} tools in your instructions.
|
666
|
+
5. Add final instructions for visualizing the output with `overlay_bounding_boxes` or `overlay_segmentation_masks` and saving it to a file with `save_image` or `save_video`.
|
667
|
+
6. Use the default FPS for extracting frames from videos unless otherwise specified by the user.
|
668
|
+
7. Include the expected answer in your 'plan' so that the programming agent can properly test if it has the correct answer.
|
703
669
|
8. Respond in the following format with JSON surrounded by <json> tags and code surrounded by <code> tags:
|
704
670
|
|
705
671
|
<json>
|
@@ -32,7 +32,8 @@ from vision_agent.agent.vision_agent_planner_prompts_v2 import (
|
|
32
32
|
PICK_PLAN,
|
33
33
|
PLAN,
|
34
34
|
)
|
35
|
-
from vision_agent.
|
35
|
+
from vision_agent.configs import Config
|
36
|
+
from vision_agent.lmm import LMM, Message
|
36
37
|
from vision_agent.tools.planner_tools import check_function_call, get_tool_documentation
|
37
38
|
from vision_agent.utils.execute import (
|
38
39
|
CodeInterpreter,
|
@@ -41,6 +42,7 @@ from vision_agent.utils.execute import (
|
|
41
42
|
)
|
42
43
|
|
43
44
|
logging.basicConfig(level=logging.INFO)
|
45
|
+
CONFIG = Config()
|
44
46
|
UTIL_DOCSTRING = T.get_tool_documentation(
|
45
47
|
[
|
46
48
|
T.load_image,
|
@@ -315,8 +317,8 @@ def maybe_run_code(
|
|
315
317
|
|
316
318
|
|
317
319
|
def create_finalize_plan(
|
318
|
-
chat: List[AgentMessage],
|
319
320
|
model: LMM,
|
321
|
+
chat: List[AgentMessage],
|
320
322
|
verbose: bool = False,
|
321
323
|
) -> Tuple[List[AgentMessage], PlanContext]:
|
322
324
|
# if we're in the middle of an interaction, don't finalize the plan
|
@@ -385,6 +387,7 @@ class VisionAgentPlannerV2(AgentPlanner):
|
|
385
387
|
def __init__(
|
386
388
|
self,
|
387
389
|
planner: Optional[LMM] = None,
|
390
|
+
summarizer: Optional[LMM] = None,
|
388
391
|
critic: Optional[LMM] = None,
|
389
392
|
max_steps: int = 10,
|
390
393
|
use_multi_trial_planning: bool = False,
|
@@ -414,16 +417,11 @@ class VisionAgentPlannerV2(AgentPlanner):
|
|
414
417
|
that will send back intermediate conversation messages.
|
415
418
|
"""
|
416
419
|
|
417
|
-
self.planner = (
|
418
|
-
|
419
|
-
if
|
420
|
-
else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
|
421
|
-
)
|
422
|
-
self.critic = (
|
423
|
-
critic
|
424
|
-
if critic is not None
|
425
|
-
else AnthropicLMM(model_name="claude-3-5-sonnet-20241022", temperature=0.0)
|
420
|
+
self.planner = planner if planner is not None else CONFIG.create_planner()
|
421
|
+
self.summarizer = (
|
422
|
+
summarizer if summarizer is not None else CONFIG.create_summarizer()
|
426
423
|
)
|
424
|
+
self.critic = critic if critic is not None else CONFIG.create_critic()
|
427
425
|
self.max_steps = max_steps
|
428
426
|
self.use_multi_trial_planning = use_multi_trial_planning
|
429
427
|
self.critique_steps = critique_steps
|
@@ -561,7 +559,7 @@ class VisionAgentPlannerV2(AgentPlanner):
|
|
561
559
|
context = InteractionContext(chat=int_chat)
|
562
560
|
else:
|
563
561
|
updated_chat, context = create_finalize_plan(
|
564
|
-
|
562
|
+
self.summarizer, int_chat, self.verbose
|
565
563
|
)
|
566
564
|
int_chat.extend(updated_chat)
|
567
565
|
for chat_elt in updated_chat:
|
@@ -55,10 +55,10 @@ generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect
|
|
55
55
|
|
56
56
|
OBSERVATION:
|
57
57
|
[Artifact dog_detector.py (5 lines total)]
|
58
|
-
0|from vision_agent.tools import load_image,
|
58
|
+
0|from vision_agent.tools import load_image, owlv2_object_detection
|
59
59
|
1|def detect_dogs(image_path: str):
|
60
60
|
2| image = load_image(image_path)
|
61
|
-
3| dogs =
|
61
|
+
3| dogs = owlv2_object_detection("dog", image)
|
62
62
|
4| return dogs
|
63
63
|
[End of artifact]
|
64
64
|
|
@@ -96,10 +96,10 @@ edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect do
|
|
96
96
|
|
97
97
|
OBSERVATION:
|
98
98
|
[Artifact dog_detector.py (5 lines total)]
|
99
|
-
0|from vision_agent.tools import load_image,
|
99
|
+
0|from vision_agent.tools import load_image, owlv2_object_detection
|
100
100
|
1|def detect_dogs(image_path: str):
|
101
101
|
2| image = load_image(image_path)
|
102
|
-
3| dogs =
|
102
|
+
3| dogs = owlv2_object_detection("dog", image, threshold=0.24)
|
103
103
|
4| return dogs
|
104
104
|
[End of artifact]
|
105
105
|
|
@@ -168,10 +168,10 @@ generate_vision_code(artifacts, 'code.py', 'Can you create a function to count w
|
|
168
168
|
|
169
169
|
OBSERVATION:
|
170
170
|
[Artifact code.py (17 lines total)]
|
171
|
-
0|from vision_agent.tools import load_image,
|
171
|
+
0|from vision_agent.tools import load_image, florence2_object_detection, closest_box_distance, overlay_bounding_boxes, save_image
|
172
172
|
1|def count_workers_with_helmets(image_path: str):
|
173
173
|
2| image = load_image(image_path)
|
174
|
-
3| detections =
|
174
|
+
3| detections = florence2_object_detection("worker, helmet", image)
|
175
175
|
4| workers = [d for d in detections if d['label'] == 'worker']
|
176
176
|
5| helmets = [d for d in detections if d['label'] == 'helmet']
|
177
177
|
6| count = 0
|
@@ -217,7 +217,7 @@ object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker',
|
|
217
217
|
OBSERVATION:
|
218
218
|
[Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
|
219
219
|
|
220
|
-
AGENT: <thinking>The model has finished fine tuning, I will now replace the original
|
220
|
+
AGENT: <thinking>The model has finished fine tuning, I will now replace the original florence2_object_detection call with the fine tuning id.</thinking>
|
221
221
|
<execute_python>
|
222
222
|
use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
|
223
223
|
</execute_python>
|
@@ -228,11 +228,11 @@ OBSERVATION:
|
|
228
228
|
---
|
229
229
|
+++
|
230
230
|
@@ -1,7 +1,7 @@
|
231
|
-
from vision_agent.tools import load_image,
|
231
|
+
from vision_agent.tools import load_image, florence2_object_detection, closest_box_distance, overlay_bounding_boxes, save_image
|
232
232
|
def count_workers_with_helmets(image_path: str, output_path: str):
|
233
233
|
image = load_image(image_path)
|
234
|
-
- detections =
|
235
|
-
+ detections =
|
234
|
+
- detections = florence2_object_detection("worker, helmet", image)
|
235
|
+
+ detections = florence2_object_detection("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
|
236
236
|
workers = [d for d in detections if d['label'] == 'worker']
|
237
237
|
helmets = [d for d in detections if d['label'] == 'helmet']
|
238
238
|
count = 0
|
@@ -250,7 +250,7 @@ OBSERVATION:
|
|
250
250
|
3
|
251
251
|
|
252
252
|
AGENT: <thinking>Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
|
253
|
-
<response>I have updated the code using the fine tuned
|
253
|
+
<response>I have updated the code using the fine tuned florence2_object_detection model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
|
254
254
|
<let_user_respond>true</let_user_respond>
|
255
255
|
"""
|
256
256
|
|
@@ -1,5 +1,5 @@
|
|
1
1
|
CONVERSATION = """
|
2
|
-
**Role**: You are a help agent that assists users
|
2
|
+
**Role**: You are a help agent that called VisionAent, built by LandingAI, that assists users write code to solve vision tasks.
|
3
3
|
|
4
4
|
**Actions**:
|
5
5
|
`generate_or_edit_vision_code` - This action will generate code for the user to solve a vision task. It will also edit vision code for the user, this is useful if the user wants to modify vision-related aspects of the code such as changing the vision model or the image pre-processing steps.
|
@@ -8,7 +8,7 @@ CONVERSATION = """
|
|
8
8
|
|
9
9
|
**Exampels**:
|
10
10
|
Here is an example of how you can interact with a user and Actions:
|
11
|
-
---
|
11
|
+
--- EXAMPLE1 ---
|
12
12
|
USER: Can you help me write code to detect dogs in this image? Media name images/dogs.jpg
|
13
13
|
|
14
14
|
AGENT: <response>Yes, I can help you with that. I will write the code to detect dogs in the image.</response><action>generate_or_edit_vision_code</action>
|
@@ -28,8 +28,23 @@ def test_detect_dogs():
|
|
28
28
|
</final_test>
|
29
29
|
|
30
30
|
AGENT: <response>Here is the code to detect dogs in the image.</response>
|
31
|
+
--- END EXAMPLE1 ---
|
31
32
|
|
32
|
-
---
|
33
|
+
--- EXAMPLE2 ---
|
34
|
+
USER: Who are you and what can you do?
|
35
|
+
|
36
|
+
AGENT: <response>I am VisionAgent, an agent built by LandingAI, to help users write code to solve vision tasks. I can write code to do the following tasks:
|
37
|
+
|
38
|
+
- Detecting objects in an image with bounding boxes
|
39
|
+
- Detecting objects in an image with masks
|
40
|
+
- Tracking object in a video
|
41
|
+
- Classifying images
|
42
|
+
- Understanding documents
|
43
|
+
- Pose estimation
|
44
|
+
- Visual question answering for both images and videos
|
45
|
+
|
46
|
+
How can I help you?</response>
|
47
|
+
--- END EXAMPLE2 ---
|
33
48
|
|
34
49
|
**Conversation**:
|
35
50
|
Here is the current conversation so far:
|