vision-agent 0.2.15__py3-none-any.whl → 0.2.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +6 -4
- vision_agent/agent/vision_agent_v2.py +143 -47
- vision_agent/agent/vision_agent_v2_prompt.py +25 -10
- vision_agent/llm/llm.py +1 -1
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/tool_utils.py +3 -0
- vision_agent/tools/tools.py +58 -36
- vision_agent/utils/__init__.py +1 -1
- vision_agent/utils/sim.py +17 -2
- {vision_agent-0.2.15.dist-info → vision_agent-0.2.22.dist-info}/METADATA +1 -1
- {vision_agent-0.2.15.dist-info → vision_agent-0.2.22.dist-info}/RECORD +13 -13
- {vision_agent-0.2.15.dist-info → vision_agent-0.2.22.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.15.dist-info → vision_agent-0.2.22.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent.py CHANGED

@@ -308,7 +308,7 @@ def _handle_extract_frames(
     # any following processing
     for video_file_output in tool_result["call_results"]:
         # When the video tool is run with wrong parameters, exit the loop
-        if len(video_file_output) < 2:
+        if not isinstance(video_file_output, tuple) or len(video_file_output) < 2:
            break
        for frame, _ in video_file_output:
            image = frame
@@ -464,15 +464,17 @@ class VisionAgent(Agent):
             report_progress_callback: a callback to report the progress of the agent. This is useful for streaming logs in a web application where multiple VisionAgent instances are running in parallel. This callback ensures that the progress are not mixed up.
         """
         self.task_model = (
-            OpenAILLM(json_mode=True, temperature=0.
+            OpenAILLM(model_name="gpt-4-turbo", json_mode=True, temperature=0.0)
             if task_model is None
             else task_model
         )
         self.answer_model = (
-            OpenAILLM(temperature=0.
+            OpenAILLM(model_name="gpt-4-turbo", temperature=0.0)
+            if answer_model is None
+            else answer_model
         )
         self.reflect_model = (
-            OpenAILMM(json_mode=True, temperature=0.
+            OpenAILMM(model_name="gpt-4-turbo", json_mode=True, temperature=0.0)
             if reflect_model is None
             else reflect_model
         )
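The change above pins the three default models to "gpt-4-turbo" rather than relying on the client default. A rough sketch of what the new defaults amount to, assuming OpenAILMM is exported from vision_agent.lmm (only the OpenAILLM import path is confirmed by this diff, and the diff viewer truncated the removed temperature values, so those are unknown):

import os

from vision_agent.llm import OpenAILLM
from vision_agent.lmm import OpenAILMM  # assumed export location

os.environ.setdefault("OPENAI_API_KEY", "sk-...")  # placeholder key

# Equivalent explicit construction of the new defaults:
task_model = OpenAILLM(model_name="gpt-4-turbo", json_mode=True, temperature=0.0)
answer_model = OpenAILLM(model_name="gpt-4-turbo", temperature=0.0)
reflect_model = OpenAILMM(model_name="gpt-4-turbo", json_mode=True, temperature=0.0)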
vision_agent/agent/vision_agent_v2.py CHANGED

@@ -1,8 +1,9 @@
 import json
 import logging
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union
 
+import pandas as pd
 from rich.console import Console
 from rich.syntax import Syntax
 from tabulate import tabulate
@@ -20,6 +21,7 @@ from vision_agent.agent.vision_agent_v2_prompt import (
     TEST,
     USER_REQ_CONTEXT,
     USER_REQ_SUBTASK_CONTEXT,
+    USER_REQ_SUBTASK_WM_CONTEXT,
 )
 from vision_agent.llm import LLM, OpenAILLM
 from vision_agent.tools.tools_v2 import TOOL_DESCRIPTIONS, TOOLS_DF
@@ -31,28 +33,68 @@ _MAX_TABULATE_COL_WIDTH = 80
 _CONSOLE = Console()
 
 
+def build_working_memory(working_memory: Mapping[str, List[str]]) -> Sim:
+    data: Mapping[str, List[str]] = {"desc": [], "doc": []}
+    for key, value in working_memory.items():
+        data["desc"].append(key)
+        data["doc"].append("\n".join(value))
+    df = pd.DataFrame(data)  # type: ignore
+    return Sim(df, sim_key="desc")
+
+
 def extract_code(code: str) -> str:
     if "```python" in code:
         code = code[code.find("```python") + len("```python") :]
         code = code[: code.find("```")]
+    if code.startswith("python\n"):
+        code = code[len("python\n") :]
     return code
 
 
+def extract_json(json_str: str) -> Dict[str, Any]:
+    try:
+        json_dict = json.loads(json_str)
+    except json.JSONDecodeError:
+        if "```json" in json_str:
+            json_str = json_str[json_str.find("```json") + len("```json") :]
+            json_str = json_str[: json_str.find("```")]
+        elif "```" in json_str:
+            json_str = json_str[json_str.find("```") + len("```") :]
+            # get the last ``` not one from an intermediate string
+            json_str = json_str[: json_str.find("}```")]
+        json_dict = json.loads(json_str)
+    return json_dict  # type: ignore
+
+
 def write_plan(
-
-
+    chat: List[Dict[str, str]],
+    plan: Optional[List[Dict[str, Any]]],
+    tool_desc: str,
+    model: LLM,
+) -> Tuple[str, List[Dict[str, Any]]]:
+    # Get last user request
+    if chat[-1]["role"] != "user":
+        raise ValueError("Last chat message must be from the user.")
+    user_requirements = chat[-1]["content"]
+
     context = USER_REQ_CONTEXT.format(user_requirement=user_requirements)
-    prompt = PLAN.format(context=context, plan=
-
-
+    prompt = PLAN.format(context=context, plan=str(plan), tool_desc=tool_desc)
+    chat[-1]["content"] = prompt
+    new_plan = extract_json(model.chat(chat))
+    return new_plan["user_req"], new_plan["plan"]
 
 
 def write_code(
-    user_req: str,
+    user_req: str,
+    subtask: str,
+    working_memory: str,
+    tool_info: str,
+    code: str,
+    model: LLM,
 ) -> str:
     prompt = CODE.format(
-        context=
-            user_requirement=user_req, subtask=subtask
+        context=USER_REQ_SUBTASK_WM_CONTEXT.format(
+            user_requirement=user_req, working_memory=working_memory, subtask=subtask
         ),
         tool_info=tool_info,
         code=code,
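The new extract_json helper is what lets write_plan consume raw LLM output. A standalone check of its main fallback path, with the helper duplicated here so the snippet runs on its own (the fenced input string is invented):

import json
from typing import Any, Dict

def extract_json(json_str: str) -> Dict[str, Any]:
    # Same logic as the helper added above: try plain JSON first,
    # then fall back to stripping a ```json markdown fence.
    try:
        json_dict = json.loads(json_str)
    except json.JSONDecodeError:
        if "```json" in json_str:
            json_str = json_str[json_str.find("```json") + len("```json") :]
            json_str = json_str[: json_str.find("```")]
        elif "```" in json_str:
            json_str = json_str[json_str.find("```") + len("```") :]
            json_str = json_str[: json_str.find("}```")]
        json_dict = json.loads(json_str)
    return json_dict

raw = '```json\n{"user_req": "count the cans", "plan": []}\n```'
assert extract_json(raw) == {"user_req": "count the cans", "plan": []}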
@@ -66,7 +108,7 @@ def write_code(
 
 
 def write_test(
-    user_req: str, subtask: str, tool_info: str, code: str, model: LLM
+    user_req: str, subtask: str, tool_info: str, _: str, code: str, model: LLM
 ) -> str:
     prompt = TEST.format(
         context=USER_REQ_SUBTASK_CONTEXT.format(
@@ -83,20 +125,30 @@ def write_test(
     return extract_code(code)
 
 
-def debug_code(
+def debug_code(
+    user_req: str,
+    subtask: str,
+    retrieved_ltm: str,
+    working_memory: str,
+    model: LLM,
+) -> Tuple[str, str]:
     # Make debug model output JSON
     if hasattr(model, "kwargs"):
         model.kwargs["response_format"] = {"type": "json_object"}
     prompt = DEBUG.format(
         debug_example=DEBUG_EXAMPLE,
-        context=
-
+        context=USER_REQ_SUBTASK_WM_CONTEXT.format(
+            user_requirement=user_req,
+            subtask=subtask,
+            working_memory=retrieved_ltm,
+        ),
+        previous_impl=working_memory,
     )
     messages = [
         {"role": "system", "content": DEBUG_SYS_MSG},
         {"role": "user", "content": prompt},
     ]
-    code_and_ref =
+    code_and_ref = extract_json(model.chat(messages))
     if hasattr(model, "kwargs"):
         del model.kwargs["response_format"]
     return extract_code(code_and_ref["improved_impl"]), code_and_ref["reflection"]
@@ -106,20 +158,25 @@ def write_and_exec_code(
     user_req: str,
     subtask: str,
     orig_code: str,
-    code_writer_call: Callable,
+    code_writer_call: Callable[..., str],
     model: LLM,
     tool_info: str,
     exec: Execute,
+    retrieved_ltm: str,
     max_retry: int = 3,
-
+    verbosity: int = 0,
 ) -> Tuple[bool, str, str, Dict[str, List[str]]]:
     success = False
     counter = 0
     reflection = ""
 
-
-
+    code = code_writer_call(
+        user_req, subtask, retrieved_ltm, tool_info, orig_code, model
+    )
     success, result = exec.run_isolation(code)
+    if verbosity == 2:
+        _CONSOLE.print(Syntax(code, "python", theme="gruvbox-dark", line_numbers=True))
+        _LOGGER.info(f"\tCode success: {success}, result: {str(result)}")
     working_memory: Dict[str, List[str]] = {}
     while not success and counter < max_retry:
         if subtask not in working_memory:
@@ -136,19 +193,21 @@ def write_and_exec_code(
                 PREV_CODE_CONTEXT.format(code=code, result=result)
             )
 
-        code, reflection = debug_code(
+        code, reflection = debug_code(
+            user_req, subtask, retrieved_ltm, "\n".join(working_memory[subtask]), model
+        )
         success, result = exec.run_isolation(code)
         counter += 1
-        if
+        if verbosity == 2:
             _CONSOLE.print(
                 Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)
             )
-
+            _LOGGER.info(f"\tDebugging reflection: {reflection}, result: {result}")
 
     if success:
         working_memory[subtask].append(
             PREV_CODE_CONTEXT_WITH_REFLECTION.format(
-                code=code, result=result
+                reflection=reflection, code=code, result=result
             )
         )
 
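In miniature, write_and_exec_code now follows a write, run, debug-until-success loop. A self-contained sketch of that shape, with hypothetical stubs standing in for the LLM debugger and the isolated executor:

from typing import Tuple

def debug_stub(code: str) -> Tuple[str, str]:
    # Stand-in for debug_code: return "improved" code plus a reflection.
    return code + "\n# fixed", "added the missing fix"

def run_stub(code: str) -> Tuple[bool, str]:
    # Stand-in for exec.run_isolation: succeed once the fix is present.
    return "# fixed" in code, "ok"

code = "print('hello')"
success, result = run_stub(code)
counter, max_retry = 0, 3
while not success and counter < max_retry:
    code, reflection = debug_stub(code)
    success, result = run_stub(code)
    counter += 1
assert success and counter == 1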
@@ -162,12 +221,15 @@ def run_plan(
     exec: Execute,
     code: str,
     tool_recommender: Sim,
-
+    long_term_memory: Optional[Sim] = None,
+    verbosity: int = 0,
 ) -> Tuple[str, str, List[Dict[str, Any]], Dict[str, List[str]]]:
     active_plan = [e for e in plan if "success" not in e or not e["success"]]
-    working_memory: Dict[str, List[str]] = {}
     current_code = code
     current_test = ""
+    retrieved_ltm = ""
+    working_memory: Dict[str, List[str]] = {}
+
     for task in active_plan:
         _LOGGER.info(
             f"""
@@ -176,7 +238,13 @@
         tool_info = "\n".join(
             [e["doc"] for e in tool_recommender.top_k(task["instruction"])]
         )
-
+
+        if long_term_memory is not None:
+            retrieved_ltm = "\n".join(
+                [e["doc"] for e in long_term_memory.top_k(task["instruction"], 1)]
+            )
+
+        success, code, result, working_memory_i = write_and_exec_code(
             user_req,
             task["instruction"],
             current_code,
@@ -184,20 +252,21 @@
             coder,
             tool_info,
             exec,
-
+            retrieved_ltm,
+            verbosity=verbosity,
         )
         if task["type"] == "code":
             current_code = code
         else:
             current_test = code
 
-        working_memory.update(
+        working_memory.update(working_memory_i)
 
-        if
+        if verbosity == 1:
             _CONSOLE.print(
                 Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)
             )
-            _LOGGER.info(f"\tCode success
+            _LOGGER.info(f"\tCode success: {success} result: {str(result)}")
 
         task["success"] = success
         task["result"] = result
@@ -231,53 +300,71 @@ class VisionAgentV2(Agent):
         self,
         timeout: int = 600,
         tool_recommender: Optional[Sim] = None,
-
+        long_term_memory: Optional[Sim] = None,
+        verbosity: int = 0,
     ) -> None:
-        self.planner = OpenAILLM(temperature=0.
-        self.coder = OpenAILLM(temperature=0.
+        self.planner = OpenAILLM(temperature=0.0, json_mode=True)
+        self.coder = OpenAILLM(temperature=0.0)
         self.exec = Execute(timeout=timeout)
         if tool_recommender is None:
             self.tool_recommender = Sim(TOOLS_DF, sim_key="desc")
         else:
             self.tool_recommender = tool_recommender
-        self.
-
+        self.verbosity = verbosity
+        self._working_memory: Dict[str, List[str]] = {}
+        if long_term_memory is not None:
+            if "doc" not in long_term_memory.df.columns:
+                raise ValueError("Long term memory must have a 'doc' column.")
+        self.long_term_memory = long_term_memory
+        self.max_retries = 3
+        if self.verbosity:
             _LOGGER.setLevel(logging.INFO)
 
     def __call__(
         self,
         input: Union[List[Dict[str, str]], str],
         image: Optional[Union[str, Path]] = None,
+        plan: Optional[List[Dict[str, Any]]] = None,
     ) -> str:
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
-
-        return code
+        results = self.chat_with_workflow(input, image, plan)
+        return results["code"]  # type: ignore
 
-    def
+    def chat_with_workflow(
         self,
         chat: List[Dict[str, str]],
         image: Optional[Union[str, Path]] = None,
-
+        plan: Optional[List[Dict[str, Any]]] = None,
+    ) -> Dict[str, Any]:
         if len(chat) == 0:
             raise ValueError("Input cannot be empty.")
 
-        user_req = chat[0]["content"]
         if image is not None:
-
+            # append file names to all user messages
+            for chat_i in chat:
+                if chat_i["role"] == "user":
+                    chat_i["content"] += f" Image name {image}"
+
+        working_code = ""
+        if plan is not None:
+            # grab the latest working code from a previous plan
+            for task in plan:
+                if "success" in task and "code" in task and task["success"]:
+                    working_code = task["code"]
 
-        plan = write_plan(
+        user_req, plan = write_plan(chat, plan, TOOL_DESCRIPTIONS, self.planner)
         _LOGGER.info(
             f"""Plan:
 {tabulate(tabular_data=plan, headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
         )
-        working_memory: Dict[str, List[str]] = {}
 
-        working_code = ""
         working_test = ""
+        working_memory: Dict[str, List[str]] = {}
         success = False
+        retries = 0
 
-        while not success:
+        while not success and retries < self.max_retries:
             working_code, working_test, plan, working_memory_i = run_plan(
                 user_req,
                 plan,
@@ -285,16 +372,25 @@ class VisionAgentV2(Agent):
                 self.exec,
                 working_code,
                 self.tool_recommender,
-                self.
+                self.long_term_memory,
+                self.verbosity,
             )
             success = all(task["success"] for task in plan)
             working_memory.update(working_memory_i)
 
             if not success:
-                #
+                # return to user and request feedback
                 break
 
-
+            retries += 1
+
+        return {
+            "code": working_code,
+            "test": working_test,
+            "success": success,
+            "working_memory": build_working_memory(working_memory),
+            "plan": plan,
+        }
 
     def log_progress(self, description: str) -> None:
         pass
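Putting the new surface together: __call__ now delegates to chat_with_workflow, which returns a dict whose working_memory entry is itself a Sim index. A hedged usage sketch based only on the signatures above (the prompt and image path are invented):

from vision_agent.agent.vision_agent_v2 import VisionAgentV2

agent = VisionAgentV2(verbosity=1)
results = agent.chat_with_workflow(
    [{"role": "user", "content": "Count the cans in the image"}],
    image="cans.jpg",  # hypothetical image file
)

# On failure the plan records which subtasks succeeded, so a retry can
# resume from the old plan and reuse the working memory as long-term memory.
if not results["success"]:
    retry_agent = VisionAgentV2(long_term_memory=results["working_memory"])
    results = retry_agent.chat_with_workflow(
        [{"role": "user", "content": "Count the cans in the image"}],
        image="cans.jpg",
        plan=results["plan"],
    )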
vision_agent/agent/vision_agent_v2_prompt.py CHANGED

@@ -1,3 +1,8 @@
+USER_REQ_CONTEXT = """
+## User Requirement
+{user_requirement}
+"""
+
 USER_REQ_SUBTASK_CONTEXT = """
 ## User Requirement
 {user_requirement}
@@ -6,11 +11,16 @@ USER_REQ_SUBTASK_CONTEXT = """
|
|
6
11
|
{subtask}
|
7
12
|
"""
|
8
13
|
|
9
|
-
|
14
|
+
USER_REQ_SUBTASK_WM_CONTEXT = """
|
10
15
|
## User Requirement
|
11
16
|
{user_requirement}
|
12
|
-
"""
|
13
17
|
|
18
|
+
## Current Subtask
|
19
|
+
{subtask}
|
20
|
+
|
21
|
+
## Previous Task
|
22
|
+
{working_memory}
|
23
|
+
"""
|
14
24
|
|
15
25
|
PLAN = """
|
16
26
|
# Context
|
@@ -27,11 +37,13 @@ Based on the context and the tools you have available, write a plan of subtasks
 - For each subtask, you should provide a short instruction on what to do. Ensure the subtasks are large enough to be meaningful, encompassing multiple lines of code.
 - You do not need to have the agent rewrite any tool functionality you already have, you should instead instruct it to utilize one or more of those tools in each subtask.
 - You can have agents either write coding tasks, to code some functionality or testing tasks to test previous functionality.
+- If a current plan exists, examine each item in the plan to determine if it was successful. If there was an item that failed, i.e. 'success': False, then you should rewrite that item and all subsequent items to ensure that the rewritten plan is successful.
 
 Output a list of jsons in the following format:
 
 ```json
 {{
+    "user_req": str, # "a summarized version of the user requirement"
     "plan":
         [
             {{
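For reference, a plan object matching the schema this prompt requests might look like the following sketch; the field values are invented, and the item fields mirror how run_plan reads "instruction", "type", and "success" elsewhere in this diff:

plan_response = {
    "user_req": "Count the cans in the image",
    "plan": [
        {"instruction": "Detect the cans with grounding_dino_ and count them", "type": "code"},
        {"instruction": "Write a test that checks the returned count", "type": "test"},
    ],
}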
@@ -61,8 +73,9 @@ CODE = """
|
|
61
73
|
{code}
|
62
74
|
|
63
75
|
# Constraints
|
64
|
-
- Write a function that accomplishes the User Requirement. You are supplied code from a previous task, feel free to copy over that code into your own implementation if you need it.
|
65
|
-
- Always prioritize using pre-defined tools or code for the same functionality. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import.
|
76
|
+
- Write a function that accomplishes the 'User Requirement'. You are supplied code from a previous task under 'Previous Code', feel free to copy over that code into your own implementation if you need it.
|
77
|
+
- Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info for Current Subtask'. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import.
|
78
|
+
- You may recieve previous trials and errors under 'Previous Task', this is code, output and reflections from previous tasks. You can use these to avoid running in to the same issues when writing your code.
|
66
79
|
- Write clean, readable, and well-documented code.
|
67
80
|
|
68
81
|
# Output
|
@@ -102,6 +115,7 @@ def add(a: int, b: int) -> int:
 
 
 PREV_CODE_CONTEXT = """
+[previous impl]
 ```python
 {code}
 ```
@@ -112,18 +126,20 @@ PREV_CODE_CONTEXT = """
|
|
112
126
|
|
113
127
|
|
114
128
|
PREV_CODE_CONTEXT_WITH_REFLECTION = """
|
129
|
+
[reflection on previous impl]
|
130
|
+
{reflection}
|
131
|
+
|
132
|
+
[new impl]
|
115
133
|
```python
|
116
134
|
{code}
|
117
135
|
```
|
118
136
|
|
119
|
-
[
|
137
|
+
[new output]
|
120
138
|
{result}
|
121
139
|
|
122
|
-
[reflection on previous impl]
|
123
|
-
{reflection}
|
124
140
|
"""
|
125
141
|
|
126
|
-
|
142
|
+
# don't need [previous impl] because it will come from PREV_CODE_CONTEXT or PREV_CODE_CONTEXT_WITH_REFLECTION
|
127
143
|
DEBUG = """
|
128
144
|
[example]
|
129
145
|
Here is an example of debugging with reflection.
|
@@ -133,7 +149,6 @@ Here is an example of debugging with reflection.
 [context]
 {context}
 
-[previous impl]
 {previous_impl}
 
 [instruction]
@@ -158,7 +173,7 @@ TEST = """
|
|
158
173
|
{code}
|
159
174
|
|
160
175
|
# Constraints
|
161
|
-
- Write code to test the functionality of the provided code according to the Current Subtask. If you cannot test the code, then write code to visualize the result by calling the code.
|
176
|
+
- Write code to test the functionality of the provided code according to the 'Current Subtask'. If you cannot test the code, then write code to visualize the result by calling the code.
|
162
177
|
- Always prioritize using pre-defined tools for the same functionality.
|
163
178
|
- Write clean, readable, and well-documented code.
|
164
179
|
|
vision_agent/llm/llm.py CHANGED

vision_agent/tools/__init__.py CHANGED

vision_agent/tools/tool_utils.py CHANGED
@@ -1,4 +1,5 @@
 import logging
+import os
 from typing import Any, Dict
 
 import requests

@@ -13,6 +14,8 @@ _LND_API_URL = "https://api.dev.landing.ai/v1/agent"
 def _send_inference_request(
     payload: Dict[str, Any], endpoint_name: str
 ) -> Dict[str, Any]:
+    if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
+        payload["runtime_tag"] = runtime_tag
     res = requests.post(
         f"{_LND_API_URL}/model/{endpoint_name}",
         headers={
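The walrus-operator guard above only attaches the tag when the environment variable is set and non-empty. A small self-contained check of that behavior (the payload contents are invented):

import os

payload = {"tool": "grounding_dino_", "prompt": "red shirt"}  # invented payload
os.environ["RUNTIME_TAG"] = "staging"  # e.g. set by the deployment environment

if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
    payload["runtime_tag"] = runtime_tag

assert payload["runtime_tag"] == "staging"
# With RUNTIME_TAG unset or empty, the payload is left untouched.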
vision_agent/tools/tools.py CHANGED

@@ -174,15 +174,15 @@ class GroundingDINO(Tool):
     """
 
     name = "grounding_dino_"
-    description = "'grounding_dino_' is a tool that can detect and count objects given a text prompt such as category names or referring expressions. It returns a list and count of bounding boxes, label names and associated probability scores."
+    description = "'grounding_dino_' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions. It returns a list and count of bounding boxes, label names and associated probability scores."
     usage = {
         "required_parameters": [
             {"name": "prompt", "type": "str"},
             {"name": "image", "type": "str"},
         ],
         "optional_parameters": [
-            {"name": "box_threshold", "type": "float"},
-            {"name": "iou_threshold", "type": "float"},
+            {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5},
+            {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99},
         ],
         "examples": [
             {
@@ -209,7 +209,7 @@ class GroundingDINO(Tool):
                     "prompt": "red shirt. green shirt",
                     "image": "shirts.jpg",
                     "box_threshold": 0.20,
-                    "iou_threshold": 0.
+                    "iou_threshold": 0.20,
                 },
             },
         ],
@@ -221,7 +221,7 @@ class GroundingDINO(Tool):
         prompt: str,
         image: Union[str, Path, ImageType],
         box_threshold: float = 0.20,
-        iou_threshold: float = 0.
+        iou_threshold: float = 0.20,
     ) -> Dict:
         """Invoke the Grounding DINO model.
 
@@ -249,7 +249,7 @@ class GroundingDINO(Tool):
             data["scores"] = [round(score, 2) for score in data["scores"]]
         if "labels" in data:
             data["labels"] = list(data["labels"])
-        data["
+        data["image_size"] = image_size
         return data
 
 
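A hedged call sketch for the updated tool, using the signature and example values shown above; the image path is invented, and only the output keys visible in this diff are read:

from vision_agent.tools.tools import GroundingDINO

detector = GroundingDINO()
out = detector(
    prompt="red shirt. green shirt",
    image="shirts.jpg",      # invented local image path
    box_threshold=0.20,      # advertised range is 0.1 to 0.5
    iou_threshold=0.20,      # new default in this version
)
print(out["labels"], out["scores"], out["image_size"])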
@@ -277,15 +277,15 @@ class GroundingSAM(Tool):
     """
 
     name = "grounding_sam_"
-    description = "'grounding_sam_' is a tool that can detect and segment objects given a text prompt such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
+    description = "'grounding_sam_' is a tool that can detect and segment multiple objects given a text prompt such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
     usage = {
         "required_parameters": [
             {"name": "prompt", "type": "str"},
             {"name": "image", "type": "str"},
         ],
         "optional_parameters": [
-            {"name": "box_threshold", "type": "float"},
-            {"name": "iou_threshold", "type": "float"},
+            {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5},
+            {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99},
         ],
         "examples": [
             {
@@ -312,7 +312,7 @@ class GroundingSAM(Tool):
                     "prompt": "red shirt, green shirt",
                     "image": "shirts.jpg",
                     "box_threshold": 0.20,
-                    "iou_threshold": 0.
+                    "iou_threshold": 0.20,
                 },
             },
         ],
@@ -324,7 +324,7 @@ class GroundingSAM(Tool):
         prompt: str,
         image: Union[str, ImageType],
         box_threshold: float = 0.2,
-        iou_threshold: float = 0.
+        iou_threshold: float = 0.2,
     ) -> Dict:
         """Invoke the Grounding SAM model.
 
@@ -353,6 +353,7 @@ class GroundingSAM(Tool):
             rle_decode(mask_rle=mask, shape=data["mask_shape"])
             for mask in data["masks"]
         ]
+        data["image_size"] = image_size
         data.pop("mask_shape", None)
         return data
 
@@ -434,6 +435,8 @@ class DINOv(Tool):
             for mask in data["masks"]
         ]
         data["labels"] = ["visual prompt" for _ in range(len(data["masks"]))]
+        mask_shape = data.pop("mask_shape", None)
+        data["image_size"] = (mask_shape[0], mask_shape[1]) if mask_shape else None
         return data
 
 
@@ -789,33 +792,49 @@ class Crop(Tool):
         return {"image": tmp.name}
 
 
-class
-    r"""
+class BboxStats(Tool):
+    r"""BboxStats returns the height, width and area of the bounding box in pixels to 2 decimal places."""
 
-    name = "
-    description = "'
+    name = "bbox_stats_"
+    description = "'bbox_stats_' returns the height, width and area of the given bounding box in pixels to 2 decimal places."
     usage = {
-        "required_parameters": [
+        "required_parameters": [
+            {"name": "bboxes", "type": "List[int]"},
+            {"name": "image_size", "type": "Tuple[int]"},
+        ],
         "examples": [
             {
-                "scenario": "
-                "parameters": {
-
+                "scenario": "Calculate the width and height of the bounding box [0.2, 0.21, 0.34, 0.42]",
+                "parameters": {
+                    "bboxes": [[0.2, 0.21, 0.34, 0.42]],
+                    "image_size": (500, 1200),
+                },
+            },
+            {
+                "scenario": "Calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]",
+                "parameters": {
+                    "bboxes": [[0.2, 0.21, 0.34, 0.42]],
+                    "image_size": (640, 480),
+                },
+            },
         ],
     }
 
-    def __call__(
+    def __call__(
+        self, bboxes: List[List[int]], image_size: Tuple[int, int]
+    ) -> List[Dict]:
         areas = []
-
-
-
-
-
-
-
-
-
-
+        height, width = image_size
+        for bbox in bboxes:
+            x1, y1, x2, y2 = bbox
+            areas.append(
+                {
+                    "width": round((x2 - x1) * width, 2),
+                    "height": round((y2 - y1) * height, 2),
+                    "area": round((x2 - x1) * (y2 - y1) * width * height, 2),
+                }
+            )
+
         return areas
 
 
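The new tool converts normalized xyxy boxes into pixel units, with image_size given as (height, width). Working through the first usage example by hand:

bbox = [0.2, 0.21, 0.34, 0.42]   # normalized x1, y1, x2, y2
height, width = (500, 1200)      # image_size as (height, width)
x1, y1, x2, y2 = bbox

print(round((x2 - x1) * width, 2))                        # 168.0 pixels wide
print(round((y2 - y1) * height, 2))                       # 105.0 pixels tall
print(round((x2 - x1) * (y2 - y1) * width * height, 2))   # 17640.0 square pixels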
@@ -1054,22 +1073,25 @@ class ExtractFrames(Tool):
     r"""Extract frames from a video."""
 
     name = "extract_frames_"
-    description = "'extract_frames_' extracts frames from a video, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path."
+    description = "'extract_frames_' extracts frames from a video every 2 seconds, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path."
     usage = {
         "required_parameters": [{"name": "video_uri", "type": "str"}],
+        "optional_parameters": [{"name": "frames_every", "type": "float"}],
         "examples": [
             {
                 "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4",
                 "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"},
             },
             {
-                "scenario": "Can you extract the images from this video file? Video path: tests/data/test.mp4",
-                "parameters": {"video_uri": "tests/data/test.mp4"},
+                "scenario": "Can you extract the images from this video file at every 2 seconds ? Video path: tests/data/test.mp4",
+                "parameters": {"video_uri": "tests/data/test.mp4", "frames_every": 2},
             },
         ],
     }
 
-    def __call__(
+    def __call__(
+        self, video_uri: str, frames_every: float = 2
+    ) -> List[Tuple[str, float]]:
         """Extract frames from a video.
 
 
@@ -1079,7 +1101,7 @@ class ExtractFrames(Tool):
         Returns:
             a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order.
         """
-        frames = extract_frames_from_video(video_uri)
+        frames = extract_frames_from_video(video_uri, fps=round(1 / frames_every, 2))
         result = []
         _LOGGER.info(
             f"Extracted {len(frames)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks."
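The frames_every interval is converted into the fps value that extract_frames_from_video expects. A quick check of that arithmetic:

frames_every = 2                     # one frame every 2 seconds
fps = round(1 / frames_every, 2)
print(fps)                           # 0.5 frames per second
print(round(1 / 0.5, 2))             # and a 0.5 s interval would give 2.0 fps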
@@ -1182,7 +1204,7 @@ TOOLS = {
     AgentDINOv,
     ExtractFrames,
     Crop,
-
+    BboxStats,
     SegArea,
     ObjectDistance,
     BboxContains,
vision_agent/utils/__init__.py CHANGED

vision_agent/utils/sim.py CHANGED
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Dict, List, Optional, Sequence, Union
 
+import numpy as np
 import pandas as pd
 from openai import Client
 from scipy.spatial.distance import cosine  # type: ignore
@@ -46,7 +47,14 @@ class Sim:
         )
 
     def save(self, sim_file: Union[str, Path]) -> None:
-
+        sim_file = Path(sim_file)
+        sim_file.mkdir(parents=True, exist_ok=True)
+
+        df = self.df.copy()
+        embs = np.array(df.embs.tolist())
+        np.save(sim_file / "embs.npy", embs)
+        df = df.drop("embs", axis=1)
+        df.to_csv(sim_file / "df.csv", index=False)
 
     def top_k(self, query: str, k: int = 5) -> Sequence[Dict]:
         """Returns the top k most similar items to the query.
@@ -65,6 +73,13 @@ class Sim:
         return res[[c for c in res.columns if c != "embs"]].to_dict(orient="records")
 
 
+def merge_sim(sim1: Sim, sim2: Sim) -> Sim:
+    return Sim(pd.concat([sim1.df, sim2.df], ignore_index=True))
+
+
 def load_sim(sim_file: Union[str, Path]) -> Sim:
-
+    sim_file = Path(sim_file)
+    df = pd.read_csv(sim_file / "df.csv")
+    embs = np.load(sim_file / "embs.npy")
+    df["embs"] = list(embs)
     return Sim(df)
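Despite the sim_file name, Sim.save treats the path as a directory, writing the embeddings to embs.npy and the remaining columns to df.csv; load_sim reverses that. A minimal sketch of the same round trip without the Sim class (the directory name and data are invented):

from pathlib import Path

import numpy as np
import pandas as pd

sim_dir = Path("sim_index")  # invented directory name
sim_dir.mkdir(parents=True, exist_ok=True)

df = pd.DataFrame({"desc": ["tool a"], "doc": ["doc for tool a"], "embs": [np.zeros(3)]})
np.save(sim_dir / "embs.npy", np.array(df.embs.tolist()))
df.drop("embs", axis=1).to_csv(sim_dir / "df.csv", index=False)

loaded = pd.read_csv(sim_dir / "df.csv")
loaded["embs"] = list(np.load(sim_dir / "embs.npy"))
assert loaded.embs[0].shape == (3,)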
{vision_agent-0.2.15.dist-info → vision_agent-0.2.22.dist-info}/RECORD CHANGED

@@ -7,28 +7,28 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
 vision_agent/agent/easytool_prompts.py,sha256=Bikw-PPLkm78dwywTlnv32Y1Tw6JMeC-R7oCnXWLcTk,4656
 vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=
+vision_agent/agent/vision_agent.py,sha256=pnx7gtTPazR7Dck5_kfZC3S3QWKu4e28YVigzOicOX0,27130
 vision_agent/agent/vision_agent_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
-vision_agent/agent/vision_agent_v2.py,sha256=
-vision_agent/agent/vision_agent_v2_prompt.py,sha256
+vision_agent/agent/vision_agent_v2.py,sha256=pAOYfNxBVZwnNxyYfv_Bk5dklFr4ougA52ib4q8O4Uo,12942
+vision_agent/agent/vision_agent_v2_prompt.py,sha256=dd9m9Vqp91r4dpsKMDwXr54jG_GTBdJNDzpgR115S8Q,5997
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
-vision_agent/llm/llm.py,sha256=
+vision_agent/llm/llm.py,sha256=A-gN0vMb79fSxhSK1qBs6PTu1fba9Gvy6pitOyjW2gM,5779
 vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
 vision_agent/lmm/lmm.py,sha256=gK90vMxh0OcGSuIZQikBkDXm4pfkdFk1R2y7rtWDl84,10539
-vision_agent/tools/__init__.py,sha256=
+vision_agent/tools/__init__.py,sha256=p5SM0YhThSVO_jRF9O-OjH2fYDPv-iMjexDX9xPPb7M,452
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tool_utils.py,sha256=mK6QfbYr6oo9ci979-_6R1DrxU2i8HGhwosADyvciI0,865
+vision_agent/tools/tools.py,sha256=sVxN7SpDkz_XTc_SKwkoRF4EwaMTuHvTsCHwtR942Fc,47373
 vision_agent/tools/tools_v2.py,sha256=1Y_ZbYJyuo2eZZkq7jY3YfuKWC82C-GFCZMLYH-I5ew,13800
-vision_agent/utils/__init__.py,sha256=
+vision_agent/utils/__init__.py,sha256=xsHFyJSDbLdonB9Dh74cwZnVTiT__2OQF3Brd3Nmglc,116
 vision_agent/utils/execute.py,sha256=RC_jKrm2kOWwzNe9xKuA2xJcbsNcD0Hb95_o3_Le0_E,3820
 vision_agent/utils/image_utils.py,sha256=1dggPBhW8_hUXDItCRLa23h-hdBwS50cjL4v1hsoUbg,7586
-vision_agent/utils/sim.py,sha256=
+vision_agent/utils/sim.py,sha256=SO4-pj2Fjs3yr-KT8S0nuUd66lf7m7XvMAp7_ecvKuQ,2813
 vision_agent/utils/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
 vision_agent/utils/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
-vision_agent-0.2.15.dist-info/LICENSE,sha256=
-vision_agent-0.2.15.dist-info/METADATA,sha256=
-vision_agent-0.2.15.dist-info/WHEEL,sha256=
-vision_agent-0.2.15.dist-info/RECORD,,
+vision_agent-0.2.22.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.22.dist-info/METADATA,sha256=hOVbcYSPue2CEdagEkIiX3dGtjip9p1GgKilmPYj-gU,9121
+vision_agent-0.2.22.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.22.dist-info/RECORD,,
{vision_agent-0.2.15.dist-info → vision_agent-0.2.22.dist-info}/LICENSE: File without changes

{vision_agent-0.2.15.dist-info → vision_agent-0.2.22.dist-info}/WHEEL: File without changes