vision-agent 0.2.44__py3-none-any.whl → 0.2.46__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +75 -20
- vision_agent/agent/vision_agent_prompts.py +5 -4
- vision_agent/tools/tools.py +6 -3
- vision_agent/utils/video.py +1 -1
- {vision_agent-0.2.44.dist-info → vision_agent-0.2.46.dist-info}/METADATA +1 -1
- {vision_agent-0.2.44.dist-info → vision_agent-0.2.46.dist-info}/RECORD +8 -8
- {vision_agent-0.2.44.dist-info → vision_agent-0.2.46.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.44.dist-info → vision_agent-0.2.46.dist-info}/WHEEL +0 -0
@@ -1,10 +1,13 @@
|
|
1
1
|
import copy
|
2
|
+
import difflib
|
2
3
|
import json
|
3
4
|
import logging
|
4
5
|
import sys
|
6
|
+
import tempfile
|
5
7
|
from pathlib import Path
|
6
|
-
from typing import Any, Callable, Dict, List, Optional, Union, cast
|
8
|
+
from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
|
7
9
|
|
10
|
+
from PIL import Image
|
8
11
|
from rich.console import Console
|
9
12
|
from rich.style import Style
|
10
13
|
from rich.syntax import Syntax
|
@@ -14,7 +17,6 @@ import vision_agent.tools as T
|
|
14
17
|
from vision_agent.agent import Agent
|
15
18
|
from vision_agent.agent.vision_agent_prompts import (
|
16
19
|
CODE,
|
17
|
-
FEEDBACK,
|
18
20
|
FIX_BUG,
|
19
21
|
FULL_TASK,
|
20
22
|
PLAN,
|
@@ -37,17 +39,27 @@ _CONSOLE = Console()
|
|
37
39
|
_DEFAULT_IMPORT = "\n".join(T.__new_tools__)
|
38
40
|
|
39
41
|
|
40
|
-
def
|
41
|
-
return
|
42
|
-
|
43
|
-
|
44
|
-
f"### Feedback {i}:\nCode: ```python\n{m['code']}\n```\nFeedback: {m['feedback']}\n"
|
45
|
-
for i, m in enumerate(memory)
|
46
|
-
]
|
42
|
+
def get_diff(before: str, after: str) -> str:
|
43
|
+
return "".join(
|
44
|
+
difflib.unified_diff(
|
45
|
+
before.splitlines(keepends=True), after.splitlines(keepends=True)
|
47
46
|
)
|
48
47
|
)
|
49
48
|
|
50
49
|
|
50
|
+
def format_memory(memory: List[Dict[str, str]]) -> str:
|
51
|
+
output_str = ""
|
52
|
+
for i, m in enumerate(memory):
|
53
|
+
output_str += f"### Feedback {i}:\n"
|
54
|
+
output_str += f"Code {i}:\n```python\n{m['code']}```\n\n"
|
55
|
+
output_str += f"Feedback {i}: {m['feedback']}\n\n"
|
56
|
+
if "edits" in m:
|
57
|
+
output_str += f"Edits {i}:\n{m['edits']}\n"
|
58
|
+
output_str += "\n"
|
59
|
+
|
60
|
+
return output_str
|
61
|
+
|
62
|
+
|
51
63
|
def extract_code(code: str) -> str:
|
52
64
|
if "\n```python" in code:
|
53
65
|
start = "\n```python"
|
@@ -78,12 +90,35 @@ def extract_json(json_str: str) -> Dict[str, Any]:
|
|
78
90
|
return json_dict # type: ignore
|
79
91
|
|
80
92
|
|
93
|
+
def extract_image(
|
94
|
+
media: Optional[Sequence[Union[str, Path]]]
|
95
|
+
) -> Optional[Sequence[Union[str, Path]]]:
|
96
|
+
if media is None:
|
97
|
+
return None
|
98
|
+
|
99
|
+
new_media = []
|
100
|
+
for m in media:
|
101
|
+
m = Path(m)
|
102
|
+
extension = m.suffix
|
103
|
+
if extension in [".jpg", ".jpeg", ".png", ".bmp"]:
|
104
|
+
new_media.append(m)
|
105
|
+
elif extension in [".mp4", ".mov"]:
|
106
|
+
frames = T.extract_frames(m)
|
107
|
+
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
|
108
|
+
if len(frames) > 0:
|
109
|
+
Image.fromarray(frames[0][0]).save(tmp.name)
|
110
|
+
new_media.append(Path(tmp.name))
|
111
|
+
if len(new_media) == 0:
|
112
|
+
return None
|
113
|
+
return new_media
|
114
|
+
|
115
|
+
|
81
116
|
def write_plan(
|
82
117
|
chat: List[Dict[str, str]],
|
83
118
|
tool_desc: str,
|
84
119
|
working_memory: str,
|
85
120
|
model: Union[LLM, LMM],
|
86
|
-
media: Optional[
|
121
|
+
media: Optional[Sequence[Union[str, Path]]] = None,
|
87
122
|
) -> List[Dict[str, str]]:
|
88
123
|
chat = copy.deepcopy(chat)
|
89
124
|
if chat[-1]["role"] != "user":
|
@@ -94,6 +129,7 @@ def write_plan(
|
|
94
129
|
prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
|
95
130
|
chat[-1]["content"] = prompt
|
96
131
|
if isinstance(model, OpenAILMM):
|
132
|
+
media = extract_image(media)
|
97
133
|
return extract_json(model.chat(chat, images=media))["plan"] # type: ignore
|
98
134
|
else:
|
99
135
|
return extract_json(model.chat(chat))["plan"] # type: ignore
|
@@ -103,7 +139,7 @@ def reflect(
|
|
103
139
|
chat: List[Dict[str, str]],
|
104
140
|
plan: str,
|
105
141
|
code: str,
|
106
|
-
model: LLM,
|
142
|
+
model: Union[LLM, LMM],
|
107
143
|
) -> Dict[str, Union[str, bool]]:
|
108
144
|
chat = copy.deepcopy(chat)
|
109
145
|
if chat[-1]["role"] != "user":
|
@@ -120,7 +156,7 @@ def write_and_test_code(
|
|
120
156
|
task: str,
|
121
157
|
tool_info: str,
|
122
158
|
tool_utils: str,
|
123
|
-
working_memory: str,
|
159
|
+
working_memory: List[Dict[str, str]],
|
124
160
|
coder: LLM,
|
125
161
|
tester: LLM,
|
126
162
|
debugger: LLM,
|
@@ -137,7 +173,13 @@ def write_and_test_code(
|
|
137
173
|
}
|
138
174
|
)
|
139
175
|
code = extract_code(
|
140
|
-
coder(
|
176
|
+
coder(
|
177
|
+
CODE.format(
|
178
|
+
docstring=tool_info,
|
179
|
+
question=task,
|
180
|
+
feedback=format_memory(working_memory),
|
181
|
+
)
|
182
|
+
)
|
141
183
|
)
|
142
184
|
test = extract_code(
|
143
185
|
tester(
|
@@ -180,7 +222,7 @@ def write_and_test_code(
|
|
180
222
|
)
|
181
223
|
|
182
224
|
count = 0
|
183
|
-
new_working_memory = []
|
225
|
+
new_working_memory: List[Dict[str, str]] = []
|
184
226
|
while not result.success and count < max_retries:
|
185
227
|
log_progress(
|
186
228
|
{
|
@@ -191,14 +233,28 @@ def write_and_test_code(
|
|
191
233
|
fixed_code_and_test = extract_json(
|
192
234
|
debugger(
|
193
235
|
FIX_BUG.format(
|
194
|
-
code=code,
|
236
|
+
code=code,
|
237
|
+
tests=test,
|
238
|
+
result="\n".join(result.text().splitlines()[-50:]),
|
239
|
+
feedback=format_memory(working_memory + new_working_memory),
|
195
240
|
)
|
196
241
|
)
|
197
242
|
)
|
243
|
+
old_code = code
|
244
|
+
old_test = test
|
245
|
+
|
198
246
|
if fixed_code_and_test["code"].strip() != "":
|
199
247
|
code = extract_code(fixed_code_and_test["code"])
|
200
248
|
if fixed_code_and_test["test"].strip() != "":
|
201
249
|
test = extract_code(fixed_code_and_test["test"])
|
250
|
+
|
251
|
+
new_working_memory.append(
|
252
|
+
{
|
253
|
+
"code": f"{code}\n{test}",
|
254
|
+
"feedback": fixed_code_and_test["reflections"],
|
255
|
+
"edits": get_diff(f"{old_code}\n{old_test}", f"{code}\n{test}"),
|
256
|
+
}
|
257
|
+
)
|
202
258
|
log_progress(
|
203
259
|
{
|
204
260
|
"type": "code",
|
@@ -209,9 +265,6 @@ def write_and_test_code(
|
|
209
265
|
},
|
210
266
|
}
|
211
267
|
)
|
212
|
-
new_working_memory.append(
|
213
|
-
{"code": f"{code}\n{test}", "feedback": fixed_code_and_test["reflections"]}
|
214
|
-
)
|
215
268
|
|
216
269
|
result = code_interpreter.exec_isolation(f"{_DEFAULT_IMPORT}\n{code}\n{test}")
|
217
270
|
log_progress(
|
@@ -309,7 +362,7 @@ class VisionAgent(Agent):
|
|
309
362
|
|
310
363
|
def __init__(
|
311
364
|
self,
|
312
|
-
planner: Optional[LLM] = None,
|
365
|
+
planner: Optional[Union[LLM, LMM]] = None,
|
313
366
|
coder: Optional[LLM] = None,
|
314
367
|
tester: Optional[LLM] = None,
|
315
368
|
debugger: Optional[LLM] = None,
|
@@ -459,7 +512,7 @@ class VisionAgent(Agent):
|
|
459
512
|
),
|
460
513
|
tool_info=tool_info,
|
461
514
|
tool_utils=T.UTILITIES_DOCSTRING,
|
462
|
-
working_memory=
|
515
|
+
working_memory=working_memory,
|
463
516
|
coder=self.coder,
|
464
517
|
tester=self.tester,
|
465
518
|
debugger=self.debugger,
|
@@ -503,6 +556,8 @@ class VisionAgent(Agent):
|
|
503
556
|
working_memory.append(
|
504
557
|
{"code": f"{code}\n{test}", "feedback": feedback}
|
505
558
|
)
|
559
|
+
else:
|
560
|
+
break
|
506
561
|
|
507
562
|
retries += 1
|
508
563
|
|
@@ -29,14 +29,17 @@ PLAN = """
|
|
29
29
|
{feedback}
|
30
30
|
|
31
31
|
**Instructions**:
|
32
|
-
Based on the context and tools you have available, write a plan of subtasks to achieve the user request
|
32
|
+
1. Based on the context and tools you have available, write a plan of subtasks to achieve the user request.
|
33
|
+
2. Go over the users request step by step and ensure each step is represented as a clear subtask in your plan.
|
34
|
+
|
35
|
+
Output a list of jsons in the following format
|
33
36
|
|
34
37
|
```json
|
35
38
|
{{
|
36
39
|
"plan":
|
37
40
|
[
|
38
41
|
{{
|
39
|
-
"instructions": str # what you should do in this task
|
42
|
+
"instructions": str # what you should do in this task associated with a tool
|
40
43
|
}}
|
41
44
|
]
|
42
45
|
}}
|
@@ -194,9 +197,7 @@ When we run this test code:
|
|
194
197
|
```
|
195
198
|
|
196
199
|
It raises this error:
|
197
|
-
```python
|
198
200
|
{result}
|
199
|
-
```
|
200
201
|
|
201
202
|
This is previous feedback provided on the code:
|
202
203
|
{feedback}
|
vision_agent/tools/tools.py
CHANGED
@@ -199,14 +199,15 @@ def extract_frames(
|
|
199
199
|
|
200
200
|
def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
|
201
201
|
"""'ocr' extracts text from an image. It returns a list of detected text, bounding
|
202
|
-
boxes, and confidence scores. The results are sorted
|
202
|
+
boxes with normalized coordinates, and confidence scores. The results are sorted
|
203
|
+
from top-left to bottom-right.
|
203
204
|
|
204
205
|
Parameters:
|
205
206
|
image (np.ndarray): The image to extract text from.
|
206
207
|
|
207
208
|
Returns:
|
208
|
-
List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
|
209
|
-
and confidence score.
|
209
|
+
List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
|
210
|
+
with normalized coordinates, and confidence score.
|
210
211
|
|
211
212
|
Example
|
212
213
|
-------
|
@@ -607,6 +608,7 @@ def overlay_bounding_boxes(
|
|
607
608
|
label: COLORS[i % len(COLORS)]
|
608
609
|
for i, label in enumerate(set([box["label"] for box in bboxes]))
|
609
610
|
}
|
611
|
+
bboxes = sorted(bboxes, key=lambda x: x["label"], reverse=True)
|
610
612
|
|
611
613
|
width, height = pil_image.size
|
612
614
|
fontsize = max(12, int(min(width, height) / 40))
|
@@ -679,6 +681,7 @@ def overlay_segmentation_masks(
|
|
679
681
|
label: COLORS[i % len(COLORS)]
|
680
682
|
for i, label in enumerate(set([mask["label"] for mask in masks]))
|
681
683
|
}
|
684
|
+
masks = sorted(masks, key=lambda x: x["label"], reverse=True)
|
682
685
|
|
683
686
|
for elt in masks:
|
684
687
|
mask = elt["mask"]
|
vision_agent/utils/video.py
CHANGED
@@ -11,8 +11,8 @@ vision_agent/agent/easytool_v2.py,sha256=CjY-sSj3abxnSq3ZHZMt-7YvRWDXEZsC6RN8FFI
|
|
11
11
|
vision_agent/agent/easytool_v2_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
|
12
12
|
vision_agent/agent/reflexion.py,sha256=AlM5AvBJvCslXlYQdZiadq4oVHsNBm3IF_03DglTxRo,10506
|
13
13
|
vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
|
14
|
-
vision_agent/agent/vision_agent.py,sha256=
|
15
|
-
vision_agent/agent/vision_agent_prompts.py,sha256=
|
14
|
+
vision_agent/agent/vision_agent.py,sha256=S0VJWsdr0NIYjikXvPrEX-njGMqOIA53r4Q4NYY0Lpo,20365
|
15
|
+
vision_agent/agent/vision_agent_prompts.py,sha256=hgnTlaYp2HMBHLi3e4faPb-DI5jQL9jfhKq9jyEUEgY,8370
|
16
16
|
vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
17
|
vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
|
18
18
|
vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
|
@@ -23,14 +23,14 @@ vision_agent/tools/__init__.py,sha256=K_7knxmyTIcSEGL8c9wF8RpVh3GrMYfybFaq-2SUM1
|
|
23
23
|
vision_agent/tools/easytool_tools.py,sha256=pZc5dQlYINlV4nYbbzsDi3-wauA-fCeD2iGmJUMoUfE,47373
|
24
24
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
25
25
|
vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
|
26
|
-
vision_agent/tools/tools.py,sha256=
|
26
|
+
vision_agent/tools/tools.py,sha256=SrNrIjyUKoTE3mCqGcy6nC-MeEzJ8uJCumlSkTvvPpg,26085
|
27
27
|
vision_agent/utils/__init__.py,sha256=Ce4yPhoWanRsnTy3X7YzZNBYYRJsrJeT7N59WUf8GZM,209
|
28
28
|
vision_agent/utils/execute.py,sha256=GlpUGe3pg5KdSvRHLFfVcn9ptXBIp-QRoHT3Wa6aIMs,20318
|
29
29
|
vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
|
30
30
|
vision_agent/utils/sim.py,sha256=oUZ-6eu8Io-UNt9GXJ0XRKtP-Wc0sPWVzYGVpB2yDFk,3001
|
31
31
|
vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
|
32
|
-
vision_agent/utils/video.py,sha256=
|
33
|
-
vision_agent-0.2.
|
34
|
-
vision_agent-0.2.
|
35
|
-
vision_agent-0.2.
|
36
|
-
vision_agent-0.2.
|
32
|
+
vision_agent/utils/video.py,sha256=_u3UrEpcJzbclKyJYxF7SiDQGhE2gUc598diYYiEv34,8885
|
33
|
+
vision_agent-0.2.46.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
34
|
+
vision_agent-0.2.46.dist-info/METADATA,sha256=FOlKABAkLUX8oqtjeE2q9EO6j8yeoiwyw3lWUpIe0ow,6817
|
35
|
+
vision_agent-0.2.46.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
36
|
+
vision_agent-0.2.46.dist-info/RECORD,,
|
File without changes
|
File without changes
|