vision-agent 0.2.129__tar.gz → 0.2.131__tar.gz
- {vision_agent-0.2.129 → vision_agent-0.2.131}/PKG-INFO +1 -1
- {vision_agent-0.2.129 → vision_agent-0.2.131}/pyproject.toml +1 -1
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/agent/vision_agent.py +20 -3
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/agent/vision_agent_prompts.py +49 -14
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/tools/meta_tools.py +79 -10
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/tools/tools.py +1 -12
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/utils/execute.py +3 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/LICENSE +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/README.md +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/agent/agent_utils.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/agent/vision_agent_coder.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/tools/__init__.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/agent/vision_agent.py
@@ -14,7 +14,7 @@ from vision_agent.agent.vision_agent_prompts import (
 )
 from vision_agent.lmm import LMM, Message, OpenAILMM
 from vision_agent.tools import META_TOOL_DOCSTRING
-from vision_agent.tools.meta_tools import Artifacts
+from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args
 from vision_agent.utils import CodeInterpreterFactory
 from vision_agent.utils.execute import CodeInterpreter, Execution
 
@@ -87,11 +87,18 @@ def run_code_action(
     return result, obs
 
 
-def parse_execution(response: str) -> Optional[str]:
+def parse_execution(
+    response: str,
+    test_multi_plan: bool = True,
+    customed_tool_names: Optional[List[str]] = None,
+) -> Optional[str]:
     code = None
     if "<execute_python>" in response:
         code = response[response.find("<execute_python>") + len("<execute_python>") :]
         code = code[: code.find("</execute_python>")]
+
+    if code is not None:
+        code = use_extra_vision_agent_args(code, test_multi_plan, customed_tool_names)
     return code
 
 
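The reworked parse_execution still slices code out of <execute_python> tags and now also routes it through use_extra_vision_agent_args (added in meta_tools.py below) so the agent-level flags reach nested generate_vision_code/edit_vision_code calls. A standalone sketch of just the tag-extraction step, for illustration only:

# Standalone approximation of the tag-extraction logic above; the
# use_extra_vision_agent_args rewriting step is intentionally omitted.
from typing import Optional

def extract_execute_python(response: str) -> Optional[str]:
    if "<execute_python>" not in response:
        return None
    code = response[response.find("<execute_python>") + len("<execute_python>") :]
    return code[: code.find("</execute_python>")]

print(extract_execute_python("<execute_python>print(1 + 1)</execute_python>"))
# -> print(1 + 1)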
@@ -174,6 +181,8 @@ class VisionAgent(Agent):
         self,
         chat: List[Message],
         artifacts: Optional[Artifacts] = None,
+        test_multi_plan: bool = True,
+        customized_tool_names: Optional[List[str]] = None,
     ) -> Tuple[List[Message], Artifacts]:
         """Chat with VisionAgent, it will use code to execute actions to accomplish
         its tasks.
@@ -184,6 +193,12 @@ class VisionAgent(Agent):
                 or if it contains media files, it should be in the format of:
                 [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
             artifacts (Optional[Artifacts]): The artifacts to use in the task.
+            test_multi_plan (bool): If True, it will test tools for multiple plans and
+                pick the best one based off of the tool results. If False, it will go
+                with the first plan.
+            customized_tool_names (List[str]): A list of customized tools for agent to
+                pick and use. If not provided, default to full tool set from
+                vision_agent.tools.
 
         Returns:
             List[Message]: The conversation response.
@@ -262,7 +277,9 @@ class VisionAgent(Agent):
             if response["let_user_respond"]:
                 break
 
-            code_action = parse_execution(response["response"])
+            code_action = parse_execution(
+                response["response"], test_multi_plan, customized_tool_names
+            )
 
             if code_action is not None:
                 result, obs = run_code_action(
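Taken together, the vision_agent.py hunks thread two new knobs from the chat entry point down to parse_execution. A hedged usage sketch; the method name chat_with_code and the tool name owl_v2 are assumptions, since the hunks show only the parameter list:

# Hypothetical usage; chat_with_code and owl_v2 are assumed names, not
# confirmed by the hunks above.
from vision_agent.agent import VisionAgent

agent = VisionAgent()
chat, artifacts = agent.chat_with_code(
    [{"role": "user", "content": "Count the dogs in this image", "media": ["dogs.jpg"]}],
    test_multi_plan=False,  # skip multi-plan tool testing, use the first plan
    customized_tool_names=["owl_v2"],  # restrict the tool set the coder may use
)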
{vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/agent/vision_agent_prompts.py
@@ -1,5 +1,5 @@
 VA_CODE = """
-**Role**: You are a helpful
+**Role**: You are a helpful agent that assists users with writing code.
 
 **Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code.
 
@@ -56,7 +56,9 @@ OBSERVATION:
 
 
 AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect dogs and shown the output, do the results look good to you?", "let_user_respond": true}
+"""
 
+EXAMPLES_CODE1_EXTRA = """
 USER: The the image only has one dog, can you fix this?
 
 [Artifacts loaded]
@@ -105,25 +107,24 @@ AGENT: {"thoughts": "I will use the generate_vision_code to count the workers wi
 
 OBSERVATION:
 [Artifact code.py]
-0|from vision_agent.tools import load_image,
+0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
 1|def count_workers_with_helmets(image_path: str, output_path: str):
 2|    image = load_image(image_path)
-3|    detections =
+3|    detections = florence2_phrase_grounding("worker, helmet", image)
 4|    workers = [d for d in detections if d['label'] == 'worker']
 5|    helmets = [d for d in detections if d['label'] == 'helmet']
 6|    count = 0
 6|    for worker in workers:
-7|
-8|
-9|
-10|
-11|
-
-13|
-14|
-15|
-16|
-15|    return count
+7|        person_has_helmet = False
+8|        for helmet in helmets:
+9|            if closest_box_distance(worker['box'], helmet['box']) < 0.01:
+10|                person_has_helmet = True
+11|                break
+11|    if person_has_helmet:
+13|        count += 1
+14|    overlay_bounding_boxes(image, detections)
+15|    save_image(output_path, image)
+16|    return count
 [End of artifact]
 
 AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
@@ -133,4 +134,38 @@ OBSERVATION:
 2
 
 AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+
+USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
+
+AGENT: {"thoughts": "Because the user has supplied me with labels I can call florence2_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided <execute_python>florence2_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}], "phrase_grounding")</execute_python>", "let_user_respond": false}
+
+OBSERVATION:
+[Florence2 fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
+
+
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
+
+OBSERVATION:
+[Artifact code.py edits]
+---
++++
+@@ -1,7 +1,7 @@
+ from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
+ def count_workers_with_helmets(image_path: str, output_path: str):
+     image = load_image(image_path)
+-    detections = florence2_phrase_grounding("worker, helmet", image)
++    detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
+     workers = [d for d in detections if d['label'] == 'worker']
+     helmets = [d for d in detections if d['label'] == 'helmet']
+     count = 0
+[End of edits]
+
+
+AGENT: {"thoughts": "I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
+
+OBSERVATION:
+----- stdout -----
+3
+
+AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """
{vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/tools/meta_tools.py
@@ -297,7 +297,12 @@ def edit_code_artifact(
 
 
 def generate_vision_code(
-    artifacts: Artifacts, name: str, chat: str, media: List[str]
+    artifacts: Artifacts,
+    name: str,
+    chat: str,
+    media: List[str],
+    test_multi_plan: bool = True,
+    customized_tool_names: Optional[List[str]] = None,
 ) -> str:
     """Generates python code to solve vision based tasks.
 
@@ -306,6 +311,8 @@ def generate_vision_code(
         name (str): The name of the artifact to save the code to.
         chat (str): The chat message from the user.
         media (List[str]): The media files to use.
+        test_multi_plan (bool): Do not change this parameter.
+        customized_tool_names (Optional[List[str]]): Do not change this parameter.
 
     Returns:
         str: The generated code.
@@ -330,7 +337,11 @@ def generate_vision_code(
     agent = va.agent.VisionAgentCoder()
 
     fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
-    response = agent.chat_with_workflow(
+    response = agent.chat_with_workflow(
+        fixed_chat,
+        test_multi_plan=test_multi_plan,
+        customized_tool_names=customized_tool_names,
+    )
     redisplay_results(response["test_result"])
     code = response["code"]
     artifacts[name] = code
@@ -342,7 +353,11 @@ def generate_vision_code(
 
 
 def edit_vision_code(
-    artifacts: Artifacts, name: str, chat_history: List[str], media: List[str]
+    artifacts: Artifacts,
+    name: str,
+    chat_history: List[str],
+    media: List[str],
+    customized_tool_names: Optional[List[str]] = None,
 ) -> str:
     """Edits python code to solve a vision based task.
 
@@ -350,6 +365,7 @@ def edit_vision_code(
         artifacts (Artifacts): The artifacts object to save the code to.
         name (str): The file path to the code.
         chat_history (List[str]): The chat history to used to generate the code.
+        customized_tool_names (Optional[List[str]]): Do not change this parameter.
 
     Returns:
         str: The edited code.
@@ -386,7 +402,11 @@ def edit_vision_code(
     fixed_chat_history.append({"role": "assistant", "content": code})
     fixed_chat_history.append({"role": "user", "content": chat})
 
-    response = agent.chat_with_workflow(
+    response = agent.chat_with_workflow(
+        fixed_chat_history,
+        test_multi_plan=False,
+        customized_tool_names=customized_tool_names,
+    )
     redisplay_results(response["test_result"])
     code = response["code"]
     artifacts[name] = code
@@ -425,18 +445,19 @@ def get_tool_descriptions() -> str:
 
 
 def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
-    """'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect
+    """DO NOT use this function unless the user has supplied you with bboxes.
+    'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect
     objects in an image based on a given dataset. It returns the fine tuning job id.
 
     Parameters:
-        bboxes (List[BboxInput]): A list of BboxInput containing the
-
+        bboxes (List[BboxInput]): A list of BboxInput containing the image path, labels
+            and bounding boxes. The coordinates are unnormalized.
         task (str): The florencev2 fine-tuning task. The options are
             'phrase_grounding'.
 
     Returns:
-
-
+        str: The fine tuning job id, this id will used to retrieve the fine tuned
+            model.
 
     Example
     -------
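A hedged sketch of a call matching the revised docstring; passing BboxInput as plain dicts mirrors the chat example in vision_agent_prompts.py and is an assumption (the accepted type is defined in vision_agent.tools.tools_types):

# Sketch only: bboxes given as dicts with unnormalized pixel coordinates,
# following the shape used in the prompt example above.
from vision_agent.tools.meta_tools import florence2_fine_tuning

job_id = florence2_fine_tuning(
    bboxes=[
        {
            "image_path": "image1.png",
            "labels": ["worker", "helmet"],
            "bboxes": [[235, 118, 294, 241], [232, 118, 294, 128]],
        },
    ],
    task="phrase_grounding",  # the only documented task option
)
print(job_id)  # later handed to use_florence2_fine_tuning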
@@ -473,6 +494,54 @@ def get_diff(before: str, after: str) -> str:
     )
 
 
+def get_diff_with_prompts(name: str, before: str, after: str) -> str:
+    diff = get_diff(before, after)
+    return f"[Artifact {name} edits]\n{diff}\n[End of edits]"
+
+
+def use_extra_vision_agent_args(
+    code: str,
+    test_multi_plan: bool = True,
+    customized_tool_names: Optional[List[str]] = None,
+) -> str:
+    """This is for forcing arguments passed by the user to VisionAgent into the
+    VisionAgentCoder call.
+
+    Parameters:
+        code (str): The code to edit.
+        test_multi_plan (bool): Do not change this parameter.
+        customized_tool_names (Optional[List[str]]): Do not change this parameter.
+
+    Returns:
+        str: The edited code.
+    """
+    generate_pattern = r"generate_vision_code\(\s*([^\)]+)\)"
+
+    def generate_replacer(match: re.Match) -> str:
+        arg = match.group(1)
+        out_str = f"generate_vision_code({arg}, test_multi_plan={test_multi_plan}"
+        if customized_tool_names is not None:
+            out_str += f", customized_tool_names={customized_tool_names})"
+        else:
+            out_str += ")"
+        return out_str
+
+    edit_pattern = r"edit_vision_code\(\s*([^\)]+)\)"
+
+    def edit_replacer(match: re.Match) -> str:
+        arg = match.group(1)
+        out_str = f"edit_vision_code({arg}"
+        if customized_tool_names is not None:
+            out_str += f", customized_tool_names={customized_tool_names})"
+        else:
+            out_str += ")"
+        return out_str
+
+    new_code = re.sub(generate_pattern, generate_replacer, code)
+    new_code = re.sub(edit_pattern, edit_replacer, new_code)
+    return new_code
+
+
 def use_florence2_fine_tuning(
     artifacts: Artifacts, name: str, task: str, fine_tune_id: str
 ) -> str:
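To make the regex rewriting concrete, here is what use_extra_vision_agent_args produces on a sample agent-emitted string; the tool name owl_v2 is illustrative. Note the asymmetry above: generate_replacer always injects test_multi_plan, while edit_replacer only ever appends customized_tool_names:

# Usage sketch; the output shown follows directly from the replacer logic above.
from vision_agent.tools.meta_tools import use_extra_vision_agent_args

code = "generate_vision_code(artifacts, 'code.py', 'count dogs', ['dogs.jpg'])"
print(use_extra_vision_agent_args(code, test_multi_plan=False, customized_tool_names=["owl_v2"]))
# -> generate_vision_code(artifacts, 'code.py', 'count dogs', ['dogs.jpg'],
#    test_multi_plan=False, customized_tool_names=['owl_v2'])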
@@ -521,7 +590,7 @@ def use_florence2_fine_tuning(
 
     artifacts[name] = new_code
 
-    diff = get_diff(code, new_code)
+    diff = get_diff_with_prompts(name, code, new_code)
     print(diff)
     return diff
 
{vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/tools/tools.py
@@ -1945,15 +1945,4 @@ TOOLS_DF = get_tools_df(TOOLS)  # type: ignore
 TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore
 TOOL_DOCSTRING = get_tool_documentation(TOOLS)  # type: ignore
 TOOLS_INFO = get_tools_info(FUNCTION_TOOLS)  # type: ignore
-UTILITIES_DOCSTRING = get_tool_documentation(
-    [
-        save_json,
-        load_image,
-        save_image,
-        save_video,
-        overlay_bounding_boxes,
-        overlay_segmentation_masks,
-        overlay_heat_map,
-        overlay_counting_results,
-    ]
-)
+UTILITIES_DOCSTRING = get_tool_documentation(UTIL_TOOLS)  # type: ignore
{vision_agent-0.2.129 → vision_agent-0.2.131}/vision_agent/utils/execute.py
@@ -243,6 +243,9 @@ class Logs(BaseModel):
             f"----- stdout -----\n{stdout_str}\n----- stderr -----\n{stderr_str}"
         )
 
+    def to_json(self) -> dict[str, list[str]]:
+        return {"stdout": self.stdout, "stderr": self.stderr}
+
 
 class Error(BaseModel):
     """Represents an error that occurred during the execution of a cell. The error
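A small sketch of the new helper, assuming Logs stores stdout and stderr as lists of strings (consistent with the dict[str, list[str]] return annotation above):

# Assumes Logs is a pydantic model with stdout/stderr list-of-str fields.
from vision_agent.utils.execute import Logs

logs = Logs(stdout=["2"], stderr=[])
print(logs.to_json())  # {'stdout': ['2'], 'stderr': []}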