vision-agent 0.2.80__tar.gz → 0.2.81__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vision_agent-0.2.80 → vision_agent-0.2.81}/PKG-INFO +1 -1
- {vision_agent-0.2.80 → vision_agent-0.2.81}/pyproject.toml +1 -1
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/agent/vision_agent.py +163 -86
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/agent/vision_agent_prompts.py +95 -7
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/lmm/lmm.py +43 -20
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/utils/execute.py +9 -4
- {vision_agent-0.2.80 → vision_agent-0.2.81}/LICENSE +0 -0
- {vision_agent-0.2.80 → vision_agent-0.2.81}/README.md +0 -0
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/tools/__init__.py +6 -6
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/tools/tools.py +0 -0
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/utils/video.py +0 -0
{vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/agent/vision_agent.py

@@ -5,7 +5,7 @@ import logging
 import sys
 import tempfile
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast

 from langsmith import traceable
 from PIL import Image
@@ -20,9 +20,11 @@ from vision_agent.agent.vision_agent_prompts import (
     CODE,
     FIX_BUG,
     FULL_TASK,
+    PICK_PLAN,
     PLAN,
-    REFLECT,
+    PREVIOUS_FAILED,
     SIMPLE_TEST,
+    TEST_PLANS,
     USER_REQ,
 )
 from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OpenAILMM
@@ -80,6 +82,15 @@ def format_memory(memory: List[Dict[str, str]]) -> str:
     return output_str


+def format_plans(plans: Dict[str, Any]) -> str:
+    plan_str = ""
+    for k, v in plans.items():
+        plan_str += f"{k}:\n"
+        plan_str += "-" + "\n-".join([e["instructions"] for e in v])
+
+    return plan_str
+
+
 def extract_code(code: str) -> str:
     if "\n```python" in code:
         start = "\n```python"
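For context, the new `format_plans` helper renders the multi-plan dict into a prompt-ready string. A standalone sketch, with the function body copied from the diff above and a made-up `plans` dict:

```python
from typing import Any, Dict

# Copied from the diff above; renders each plan's instructions as a dashed list.
def format_plans(plans: Dict[str, Any]) -> str:
    plan_str = ""
    for k, v in plans.items():
        plan_str += f"{k}:\n"
        plan_str += "-" + "\n-".join([e["instructions"] for e in v])
    return plan_str

plans = {
    "plan1": [{"instructions": "Load image.jpg"}, {"instructions": "Run owl_v2"}],
    "plan2": [{"instructions": "Load image.jpg"}, {"instructions": "Run grounding_sam"}],
}
print(format_plans(plans))
# plan1:
# -Load image.jpg
# -Run owl_v2plan2:   <- note: no newline is appended after a plan's last instruction
# -Load image.jpg
# -Run grounding_sam
```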
@@ -140,12 +151,12 @@ def extract_image(


 @traceable
-def write_plan(
+def write_plans(
     chat: List[Message],
     tool_desc: str,
     working_memory: str,
     model: LMM,
-) -> List[Dict[str, str]]:
+) -> Dict[str, Any]:
     chat = copy.deepcopy(chat)
     if chat[-1]["role"] != "user":
         raise ValueError("Last chat message must be from the user.")
@@ -154,14 +165,84 @@ def write_plan(
     context = USER_REQ.format(user_request=user_request)
     prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
     chat[-1]["content"] = prompt
-    return extract_json(model.chat(chat))
+    return extract_json(model.chat(chat))
+
+
+@traceable
+def pick_plan(
+    chat: List[Message],
+    plans: Dict[str, Any],
+    tool_info: str,
+    model: LMM,
+    code_interpreter: CodeInterpreter,
+    verbosity: int = 0,
+) -> Tuple[str, str]:
+    chat = copy.deepcopy(chat)
+    if chat[-1]["role"] != "user":
+        raise ValueError("Last chat message must be from the user.")
+
+    plan_str = format_plans(plans)
+    prompt = TEST_PLANS.format(
+        docstring=tool_info, plans=plan_str, previous_attempts=""
+    )
+
+    code = extract_code(model(prompt))
+    tool_output = code_interpreter.exec_isolation(DefaultImports.prepend_imports(code))
+    tool_output_str = ""
+    if len(tool_output.logs.stdout) > 0:
+        tool_output_str = tool_output.logs.stdout[0]
+
+    if verbosity >= 1:
+        _print_code("Initial code and tests:", code)
+        _LOGGER.info(f"Initial code execution result:\n{tool_output.text()}")
+
+    # retry if the tool output is empty or code fails
+    count = 1
+    while (not tool_output.success or tool_output_str == "") and count < 3:
+        prompt = TEST_PLANS.format(
+            docstring=tool_info,
+            plans=plan_str,
+            previous_attempts=PREVIOUS_FAILED.format(
+                code=code, error=tool_output.text()
+            ),
+        )
+        code = extract_code(model(prompt))
+        tool_output = code_interpreter.exec_isolation(
+            DefaultImports.prepend_imports(code)
+        )
+        tool_output_str = ""
+        if len(tool_output.logs.stdout) > 0:
+            tool_output_str = tool_output.logs.stdout[0]
+
+        if verbosity == 1:
+            _print_code("Code and test after attempted fix:", code)
+            _LOGGER.info(f"Code execution result after attempt {count}")
+
+        count += 1
+
+    user_req = chat[-1]["content"]
+    context = USER_REQ.format(user_request=user_req)
+    # because the tool picker model gets the image as well, we have to be careful with
+    # how much text we send it, so we truncate the tool output to 20,000 characters
+    prompt = PICK_PLAN.format(
+        context=context,
+        plans=format_plans(plans),
+        tool_output=tool_output_str[:20_000],
+    )
+    chat[-1]["content"] = prompt
+    best_plan = extract_json(model(chat))
+    if verbosity >= 1:
+        _LOGGER.info(f"Best plan:\n{best_plan}")
+    return best_plan["best_plan"], tool_output_str


 @traceable
 def write_code(
     coder: LMM,
     chat: List[Message],
+    plan: str,
     tool_info: str,
+    tool_output: str,
     feedback: str,
 ) -> str:
     chat = copy.deepcopy(chat)
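The new `pick_plan` follows a generate-execute-repair loop before asking the model to choose. A minimal sketch of that control flow, with hypothetical `generate`/`execute` stand-ins for the LMM call and `code_interpreter.exec_isolation` (not vision-agent APIs):

```python
from typing import Callable, Tuple

def run_with_retries(
    generate: Callable[[str], str],              # hypothetical: prompt -> code
    execute: Callable[[str], Tuple[bool, str]],  # hypothetical: code -> (success, stdout)
    base_prompt: str,
    max_attempts: int = 3,
) -> str:
    code = generate(base_prompt)
    success, out = execute(code)
    attempt = 1
    # Mirrors the diff: retry while execution failed or produced no stdout.
    while (not success or out == "") and attempt < max_attempts:
        repair = base_prompt + f"\nPrevious code:\n{code}\nError or empty output:\n{out}"
        code = generate(repair)
        success, out = execute(code)
        attempt += 1
    return out
```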
@@ -171,7 +252,8 @@ def write_code(
     user_request = chat[-1]["content"]
     prompt = CODE.format(
         docstring=tool_info,
-        question=user_request,
+        question=FULL_TASK.format(user_request=user_request, subtasks=plan),
+        tool_output=tool_output,
         feedback=feedback,
     )
     chat[-1]["content"] = prompt
@@ -203,27 +285,11 @@ def write_test(
     return extract_code(tester(chat))


-@traceable
-def reflect(
-    chat: List[Message],
-    plan: str,
-    code: str,
-    model: LMM,
-) -> Dict[str, Union[str, bool]]:
-    chat = copy.deepcopy(chat)
-    if chat[-1]["role"] != "user":
-        raise ValueError("Last chat message must be from the user.")
-
-    user_request = chat[-1]["content"]
-    context = USER_REQ.format(user_request=user_request)
-    prompt = REFLECT.format(context=context, plan=plan, code=code)
-    chat[-1]["content"] = prompt
-    return extract_json(model(chat))
-
-
 def write_and_test_code(
     chat: List[Message],
+    plan: str,
     tool_info: str,
+    tool_output: str,
     tool_utils: str,
     working_memory: List[Dict[str, str]],
     coder: LMM,
@@ -241,7 +307,14 @@ def write_and_test_code(
             "status": "started",
         }
     )
-    code = write_code(coder, chat, tool_info, format_memory(working_memory))
+    code = write_code(
+        coder,
+        chat,
+        plan,
+        tool_info,
+        tool_output,
+        format_memory(working_memory),
+    )
     test = write_test(
         tester, chat, tool_utils, code, format_memory(working_memory), media
     )
@@ -412,11 +485,11 @@ def _print_code(title: str, code: str, test: Optional[str] = None) -> None:


 def retrieve_tools(
-    plan: List[Dict[str, str]],
+    plans: Dict[str, List[Dict[str, str]]],
     tool_recommender: Sim,
     log_progress: Callable[[Dict[str, Any]], None],
     verbosity: int = 0,
-) -> str:
+) -> Dict[str, str]:
     log_progress(
         {
             "type": "tools",
@@ -425,27 +498,29 @@ def retrieve_tools(
     )
     tool_info = []
     tool_desc = []
-    tool_list = []
-    for task in plan:
-        tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3)
-        tool_info.extend([e["doc"] for e in tools])
-        tool_desc.extend([e["desc"] for e in tools])
-        tool_list.extend(
-            {"description": e["desc"], "documentation": e["doc"]} for e in tools
-        )
-    log_progress(
-        {
-            "type": "tools",
-            "status": "completed",
-            "payload": list({v["description"]: v for v in tool_list}.values()),
-        }
-    )
+    tool_lists: Dict[str, List[Dict[str, str]]] = {}
+    for k, plan in plans.items():
+        tool_lists[k] = []
+        for task in plan:
+            tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3)
+            tool_info.extend([e["doc"] for e in tools])
+            tool_desc.extend([e["desc"] for e in tools])
+            tool_lists[k].extend(
+                {"description": e["desc"], "documentation": e["doc"]} for e in tools
+            )

     if verbosity == 2:
         tool_desc_str = "\n".join(set(tool_desc))
         _LOGGER.info(f"Tools Description:\n{tool_desc_str}")
-    tool_info_set = set(tool_info)
-    return "\n\n".join(tool_info_set)
+
+    tool_lists_unique = {}
+    for k in tool_lists:
+        tool_lists_unique[k] = "\n\n".join(
+            set(e["documentation"] for e in tool_lists[k])
+        )
+    all_tools = "\n\n".join(set(tool_info))
+    tool_lists_unique["all"] = all_tools
+    return tool_lists_unique


 class VisionAgent(Agent):
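`retrieve_tools` now returns a dict of deduplicated tool docstrings, one entry per plan plus an "all" key that `pick_plan` consumes. A sketch of the return shape, with a stub in place of the `Sim` recommender:

```python
from typing import Dict, List

def stub_top_k(query: str) -> List[Dict[str, str]]:
    # Hypothetical stand-in for tool_recommender.top_k(query, k=2, thresh=0.3).
    return [{"desc": "detects objects", "doc": f"tool docs relevant to: {query}"}]

plans = {"plan1": [{"instructions": "detect people"}],
         "plan2": [{"instructions": "count people"}]}

tool_infos: Dict[str, str] = {}
all_docs: List[str] = []
for name, plan in plans.items():
    docs = [e["doc"] for task in plan for e in stub_top_k(task["instructions"])]
    all_docs.extend(docs)
    tool_infos[name] = "\n\n".join(set(docs))   # dedupe per plan
tool_infos["all"] = "\n\n".join(set(all_docs))  # union across plans
# -> {"plan1": "...", "plan2": "...", "all": "..."}
```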
@@ -543,7 +618,6 @@ class VisionAgent(Agent):
     def chat_with_workflow(
         self,
         chat: List[Message],
-        self_reflection: bool = False,
         display_visualization: bool = False,
     ) -> Dict[str, Any]:
         """Chat with Vision Agent and return intermediate information regarding the task.
@@ -554,7 +628,6 @@ class VisionAgent(Agent):
                 [{"role": "user", "content": "describe your task here..."}]
             or if it contains media files, it should be in the format of:
                 [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
-            self_reflection (bool): Whether to reflect on the task and debug the code.
            display_visualization (bool): If True, it opens a new window locally to
                 show the image(s) created by visualization code (if there is any).

@@ -581,7 +654,10 @@ class VisionAgent(Agent):

         int_chat = cast(
             List[Message],
-            [{"role": c["role"], "content": c["content"]} for c in chat],
+            [
+                {"role": c["role"], "content": c["content"], "media": c["media"]}
+                for c in chat
+            ],
         )

         code = ""
@@ -599,13 +675,45 @@ class VisionAgent(Agent):
                     "status": "started",
                 }
             )
-            plan_i = write_plan(
+            plans = write_plans(
                 int_chat,
                 T.TOOL_DESCRIPTIONS,
                 format_memory(working_memory),
                 self.planner,
             )
-            plan_i_str = "\n-".join([e["instructions"] for e in plan_i])
+
+            if self.verbosity >= 1:
+                for p in plans:
+                    _LOGGER.info(
+                        f"\n{tabulate(tabular_data=plans[p], headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
+                    )
+
+            tool_infos = retrieve_tools(
+                plans,
+                self.tool_recommender,
+                self.log_progress,
+                self.verbosity,
+            )
+            best_plan, tool_output_str = pick_plan(
+                int_chat,
+                plans,
+                tool_infos["all"],
+                self.coder,
+                code_interpreter,
+                verbosity=self.verbosity,
+            )
+
+            if best_plan in plans and best_plan in tool_infos:
+                plan_i = plans[best_plan]
+                tool_info = tool_infos[best_plan]
+            else:
+                if self.verbosity >= 1:
+                    _LOGGER.warning(
+                        f"Best plan {best_plan} not found in plans or tool_infos. Using the first plan and tool info."
+                    )
+                k = list(plans.keys())[0]
+                plan_i = plans[k]
+                tool_info = tool_infos[k]

             self.log_progress(
                 {
@@ -616,18 +724,16 @@ class VisionAgent(Agent):
             )
             if self.verbosity >= 1:
                 _LOGGER.info(
-                    f"\n{tabulate(tabular_data=plan_i, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
+                    f"Picked best plan:\n{tabulate(tabular_data=plan_i, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
                 )

-            tool_info = retrieve_tools(
-                plan_i,
-                self.tool_recommender,
-                self.log_progress,
-                self.verbosity,
-            )
             results = write_and_test_code(
-                chat=int_chat,
+                chat=[
+                    {"role": c["role"], "content": c["content"]} for c in int_chat
+                ],
+                plan="\n-" + "\n-".join([e["instructions"] for e in plan_i]),
                 tool_info=tool_info,
+                tool_output=tool_output_str,
                 tool_utils=T.UTILITIES_DOCSTRING,
                 working_memory=working_memory,
                 coder=self.coder,
@@ -644,35 +750,6 @@ class VisionAgent(Agent):
             working_memory.extend(results["working_memory"])  # type: ignore
             plan.append({"code": code, "test": test, "plan": plan_i})

-            if not self_reflection:
-                break
-
-            self.log_progress(
-                {
-                    "type": "self_reflection",
-                    "status": "started",
-                }
-            )
-            reflection = reflect(
-                int_chat,
-                FULL_TASK.format(
-                    user_request=chat[0]["content"], subtasks=plan_i_str
-                ),
-                code,
-                self.planner,
-            )
-            if self.verbosity > 0:
-                _LOGGER.info(f"Reflection: {reflection}")
-            feedback = cast(str, reflection["feedback"])
-            success = cast(bool, reflection["success"])
-            self.log_progress(
-                {
-                    "type": "self_reflection",
-                    "status": "completed" if success else "failed",
-                    "payload": reflection,
-                }
-            )
-            working_memory.append({"code": f"{code}\n{test}", "feedback": feedback})
             retries += 1

         execution_result = cast(Execution, results["test_result"])
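Net effect of the vision_agent.py changes: the old plan-then-self-reflect loop becomes plan (x3) -> test tools -> pick best -> code. A high-level sketch with trivial stubs in place of the real helpers (the stubs are illustrative only):

```python
def write_plans_stub(chat):
    return {"plan1": [{"instructions": "use owl_v2"}],
            "plan2": [{"instructions": "use grounding_sam"}],
            "plan3": [{"instructions": "use loca_zero_shot_counting"}]}

def retrieve_tools_stub(plans):
    infos = {k: f"docs for {k}" for k in plans}
    infos["all"] = "union of all docs"
    return infos

def pick_plan_stub(chat, plans, all_docs):
    return "plan2", "{'grounding_sam': [...]}"  # (best_plan, tool_output_str)

def workflow_sketch(chat):
    plans = write_plans_stub(chat)
    tool_infos = retrieve_tools_stub(plans)
    best, tool_output = pick_plan_stub(chat, plans, tool_infos["all"])
    # Fall back to the first plan if the picker returns an unknown key.
    key = best if best in plans and best in tool_infos else next(iter(plans))
    return plans[key], tool_infos[key], tool_output  # fed into write_and_test_code

print(workflow_sketch([{"role": "user", "content": "count people"}]))
```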
{vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/agent/vision_agent_prompts.py

@@ -19,7 +19,7 @@ FEEDBACK = """


 PLAN = """
-**Context**
+**Context**:
 {context}

 **Tools Available**:
@@ -29,23 +29,110 @@ PLAN = """
 {feedback}

 **Instructions**:
-1. Based on the context and tools you have available,
-2.
+1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
+2. Output three different plans, each utilizing a different strategy or tool.

 Output a list of jsons in the following format

 ```json
 {{
-    "plan":
+    "plan1":
         [
             {{
                 "instructions": str # what you should do in this task associated with a tool
             }}
-        ]
+        ],
+    "plan2": ...,
+    "plan3": ...
 }}
 ```
 """

+
+TEST_PLANS = """
+**Role**: You are a software programmer responsible for testing different tools.
+
+**Task**: Your responsibility is to take a set of several plans and test the different tools for each plan.
+
+**Documentation**:
+This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
+
+{docstring}
+
+**Plans**:
+{plans}
+
+{previous_attempts}
+
+**Instructions**:
+1. Write a program to load the media, call each tool, and save its output.
+2. Create a dictionary where the keys are the tool names and the values are the tool outputs. Remove any array types from the printed dictionary.
+3. Print this final dictionary.
+
+**Example**:
+plan1:
+- Load the image from the provided file path 'image.jpg'.
+- Use the 'owl_v2' tool with the prompt 'person' to detect and count the number of people in the image.
+plan2:
+- Load the image from the provided file path 'image.jpg'.
+- Use the 'grounding_sam' tool with the prompt 'person' to detect and count the number of people in the image.
+- Count the number of detected objects labeled as 'person'.
+plan3:
+- Load the image from the provided file path 'image.jpg'.
+- Use the 'loca_zero_shot_counting' tool to count the dominant foreground object, which in this case is people.
+
+```python
+from vision_agent.tools import load_image, owl_v2, grounding_sam, loca_zero_shot_counting
+image = load_image("image.jpg")
+owl_v2_out = owl_v2("person", image)
+
+gsam_out = grounding_sam("person", image)
+gsam_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in gsam_out]
+
+loca_out = loca_zero_shot_counting(image)
+loca_out = loca_out["count"]
+
+final_out = {{"owl_v2": owl_v2_out, "grounding_sam": gsam_out, "loca_zero_shot_counting": loca_out}}
+print(final_out)
+```
+"""
+
+
+PREVIOUS_FAILED = """
+**Previous Failed Attempts**:
+You previously ran this code:
+```python
+{code}
+```
+
+But got the following error or no stdout:
+{error}
+"""
+
+
+PICK_PLAN = """
+**Role**: You are a software programmer.
+
+**Task**: Your responsibility is to pick the best plan from the three plans provided.
+
+**Context**:
+{context}
+
+**Plans**:
+{plans}
+
+**Tool Output**:
+{tool_output}
+
+**Instructions**:
+1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
+2. Output a JSON object with the following format:
+{{
+    "thoughts": str # your thought process for choosing the best plan
+    "best_plan": str # the best plan you have chosen
+}}
+"""
+
 CODE = """
 **Role**: You are a software programmer.

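The planner's response under the new PLAN format is parsed back into a dict keyed plan1..plan3. A sketch with plain `json.loads` standing in for vision-agent's `extract_json` helper:

```python
import json

# A well-formed planner response under the new three-plan schema.
response = """{
    "plan1": [{"instructions": "Load 'image.jpg'"}, {"instructions": "Use owl_v2 with 'person'"}],
    "plan2": [{"instructions": "Load 'image.jpg'"}, {"instructions": "Use grounding_sam with 'person'"}],
    "plan3": [{"instructions": "Load 'image.jpg'"}, {"instructions": "Use loca_zero_shot_counting"}]
}"""
plans = json.loads(response)  # extract_json would additionally strip a json code fence
assert set(plans) == {"plan1", "plan2", "plan3"}
```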
@@ -64,6 +151,9 @@ This is the documentation for the functions you have access to. You may call any
 **User Instructions**:
 {question}

+**Tool Output**:
+{tool_output}
+
 **Previous Feedback**:
 {feedback}

@@ -72,7 +162,6 @@ This is the documentation for the functions you have access to. You may call any
 2. **Algorithm/Method Selection**: Decide on the most efficient way.
 3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode.
 4. **Code Generation**: Translate your pseudocode into executable Python code. Ensure you use correct arguments, remember coordinates are always returned normalized from `vision_agent.tools`. All images from `vision_agent.tools` are in RGB format, red is (255, 0, 0) and blue is (0, 0, 255).
-5. **Logging**: Log the output of the custom functions that were provided to you from `from vision_agent.tools import *`. Use a debug flag in the function parameters to toggle logging on and off.
 """

 TEST = """
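With the new `{tool_output}` slot, the coder prompt is assembled roughly as below. The template here is abbreviated; only the field names are taken from the diff:

```python
# Abbreviated sketch of the revised CODE template; field names match the diff.
CODE_SKETCH = """**Documentation**:
{docstring}

**User Instructions**:
{question}

**Tool Output**:
{tool_output}

**Previous Feedback**:
{feedback}
"""

prompt = CODE_SKETCH.format(
    docstring="owl_v2(prompt, image) -> list of detections ...",
    question="Count the people in image.jpg",
    tool_output="{'owl_v2': [{'label': 'person', 'score': 0.9}]}",  # from pick_plan
    feedback="",
)
```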
@@ -147,7 +236,6 @@ print(found_text)
 ```
 """

-
 SIMPLE_TEST = """
 **Role**: As a tester, your task is to create a simple test case for the provided code. This test case should verify the fundamental functionality under normal conditions.

{vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/lmm/lmm.py

@@ -1,4 +1,5 @@
 import base64
+import io
 import json
 import logging
 import os
@@ -8,6 +9,7 @@ from typing import Any, Callable, Dict, List, Optional, Union, cast

 import requests
 from openai import AzureOpenAI, OpenAI
+from PIL import Image

 import vision_agent.tools as T
 from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
@@ -15,12 +17,40 @@ from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 _LOGGER = logging.getLogger(__name__)


-def encode_image(image: Union[str, Path]) -> str:
-    with open(image, "rb") as f:
-        encoded_image = base64.b64encode(f.read()).decode("utf-8")
+def encode_image_bytes(image: bytes) -> str:
+    image = Image.open(io.BytesIO(image)).convert("RGB")  # type: ignore
+    buffer = io.BytesIO()
+    image.save(buffer, format="PNG")  # type: ignore
+    encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
     return encoded_image


+def encode_media(media: Union[str, Path]) -> str:
+    extension = "png"
+    extension = Path(media).suffix
+    if extension.lower() not in {
+        ".jpg",
+        ".jpeg",
+        ".png",
+        ".webp",
+        ".bmp",
+        ".mp4",
+        ".mov",
+    }:
+        raise ValueError(f"Unsupported image extension: {extension}")
+
+    image_bytes = b""
+    if extension.lower() in {".mp4", ".mov"}:
+        frames = T.extract_frames(media)
+        image = frames[len(frames) // 2]
+        buffer = io.BytesIO()
+        Image.fromarray(image[0]).convert("RGB").save(buffer, format="PNG")
+        image_bytes = buffer.getvalue()
+    else:
+        image_bytes = open(media, "rb").read()
+    return encode_image_bytes(image_bytes)
+
+
 TextOrImage = Union[str, List[Union[str, Path]]]
 Message = Dict[str, TextOrImage]

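A quick sanity check of the new encoder: the function below is copied from the diff, and the test image is generated in memory rather than read from disk:

```python
import base64
import io

from PIL import Image

# Copied from the diff above: normalize any image bytes to base64-encoded PNG.
def encode_image_bytes(image: bytes) -> str:
    img = Image.open(io.BytesIO(image)).convert("RGB")
    buffer = io.BytesIO()
    img.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")

buf = io.BytesIO()
Image.new("RGB", (4, 4), (255, 0, 0)).save(buf, format="PNG")  # tiny in-memory image
encoded = encode_image_bytes(buf.getvalue())
assert base64.b64decode(encoded)[:8] == b"\x89PNG\r\n\x1a\n"  # PNG magic bytes
```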
@@ -54,7 +84,7 @@ class OpenAILMM(LMM):
         self,
         model_name: str = "gpt-4o",
         api_key: Optional[str] = None,
-        max_tokens: int = 1024,
+        max_tokens: int = 4096,
         json_mode: bool = False,
         **kwargs: Any,
     ):
@@ -97,20 +127,14 @@ class OpenAILMM(LMM):
             fixed_c = {"role": c["role"]}
             fixed_c["content"] = [{"type": "text", "text": c["content"]}]  # type: ignore
             if "media" in c:
-                for image in c["media"]:
-                    extension = Path(image).suffix
-                    if extension.lower() in {".jpg", ".jpeg"}:
-                        extension = "jpg"
-                    elif extension.lower() == ".png":
-                        extension = "png"
-                    else:
-                        raise ValueError(f"Unsupported image extension: {extension}")
-                    encoded_image = encode_image(image)
+                for media in c["media"]:
+                    encoded_media = encode_media(media)
+
                     fixed_c["content"].append(  # type: ignore
                         {
                             "type": "image_url",
                             "image_url": {
-                                "url": f"data:image/{extension};base64,{encoded_image}",  # type: ignore
+                                "url": f"data:image/png;base64,{encoded_media}",  # type: ignore
                                 "detail": "low",
                             },
                         },
@@ -138,13 +162,12 @@ class OpenAILMM(LMM):
         ]
         if media and len(media) > 0:
             for m in media:
-                extension = Path(m).suffix
-                encoded_image = encode_image(m)
+                encoded_media = encode_media(m)
                 message[0]["content"].append(
                     {
                         "type": "image_url",
                         "image_url": {
-                            "url": f"data:image/{extension};base64,{encoded_image}",
+                            "url": f"data:image/png;base64,{encoded_media}",
                             "detail": "low",
                         },
                     },
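Since every media item is now re-encoded to PNG, the data URL's MIME type is fixed. The content entry appended per image has this shape (structure from the diff; the base64 payload below is a placeholder):

```python
encoded_media = "iVBORw0KGgo..."  # placeholder, not a real encoded image

entry = {
    "type": "image_url",
    "image_url": {
        "url": f"data:image/png;base64,{encoded_media}",
        "detail": "low",
    },
}
message = [{
    "role": "user",
    "content": [{"type": "text", "text": "What is in this image?"}, entry],
}]
```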
@@ -241,7 +264,7 @@ class AzureOpenAILMM(OpenAILMM):
         api_key: Optional[str] = None,
         api_version: str = "2024-02-01",
         azure_endpoint: Optional[str] = None,
-        max_tokens: int = 1024,
+        max_tokens: int = 4096,
         json_mode: bool = False,
         **kwargs: Any,
     ):
@@ -312,7 +335,7 @@ class OllamaLMM(LMM):
         fixed_chat = []
         for message in chat:
             if "media" in message:
-                message["images"] = [encode_image(m) for m in message["media"]]
+                message["images"] = [encode_media(m) for m in message["media"]]
                 del message["media"]
             fixed_chat.append(message)
         url = f"{self.url}/chat"
@@ -343,7 +366,7 @@ class OllamaLMM(LMM):
         json_data = json.dumps(data)
         if media and len(media) > 0:
             for m in media:
-                data["images"].append(encode_image(m))  # type: ignore
+                data["images"].append(encode_media(m))  # type: ignore

         response = requests.post(url, data=json_data)

{vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/utils/execute.py

@@ -362,8 +362,10 @@ class Execution(BaseModel):
         return Execution(
             error=Error(
                 name=exec.__class__.__name__,
-                value=str(exec),
-                traceback_raw=traceback_raw,
+                value=_remove_escape_and_color_codes(str(exec)),
+                traceback_raw=[
+                    _remove_escape_and_color_codes(line) for line in traceback_raw
+                ],
             )
         )

@@ -378,8 +380,11 @@ class Execution(BaseModel):
             error=(
                 Error(
                     name=exec.error.name,
-                    value=exec.error.value,
-                    traceback_raw=exec.error.traceback_raw,
+                    value=_remove_escape_and_color_codes(exec.error.value),
+                    traceback_raw=[
+                        _remove_escape_and_color_codes(line)
+                        for line in exec.error.traceback_raw
+                    ],
                 )
                 if exec.error
                 else None
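The diff routes error values through `_remove_escape_and_color_codes` but does not show its body. A typical implementation of such a helper strips ANSI SGR color sequences; this is an assumption, not the package's actual code:

```python
import re

# Assumed implementation; vision_agent's _remove_escape_and_color_codes may differ.
_ANSI_SGR = re.compile(r"\x1b\[[0-9;]*m")

def remove_escape_and_color_codes(s: str) -> str:
    return _ANSI_SGR.sub("", s)

colored = "\x1b[31mValueError\x1b[0m: bad input"
print(remove_escape_and_color_codes(colored))  # ValueError: bad input
```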
{vision_agent-0.2.80 → vision_agent-0.2.81}/vision_agent/tools/__init__.py

@@ -11,19 +11,19 @@ from .tools import (
     clip,
     closest_box_distance,
     closest_mask_distance,
+    depth_anything_v2,
+    detr_segmentation,
+    dpt_hybrid_midas,
     extract_frames,
     florencev2_image_caption,
-    get_tool_documentation,
     florencev2_object_detection,
-    detr_segmentation,
-    depth_anything_v2,
-    generate_soft_edge_image,
-    dpt_hybrid_midas,
+    florencev2_roberta_vqa,
     generate_pose_image,
+    generate_soft_edge_image,
+    get_tool_documentation,
     git_vqa_v2,
     grounding_dino,
     grounding_sam,
-    florencev2_roberta_vqa,
     load_image,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,