vision-agent 0.2.21__py3-none-any.whl → 0.2.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent_v2.py +39 -19
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/tools_v2.py +52 -1
- {vision_agent-0.2.21.dist-info → vision_agent-0.2.23.dist-info}/METADATA +1 -1
- {vision_agent-0.2.21.dist-info → vision_agent-0.2.23.dist-info}/RECORD +7 -7
- {vision_agent-0.2.21.dist-info → vision_agent-0.2.23.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.21.dist-info → vision_agent-0.2.23.dist-info}/WHEEL +0 -0
@@ -51,6 +51,21 @@ def extract_code(code: str) -> str:
|
|
51
51
|
return code
|
52
52
|
|
53
53
|
|
54
|
+
def extract_json(json_str: str) -> Dict[str, Any]:
    """Parse JSON from a raw model response, tolerating Markdown code fences.

    The text is first parsed as-is. If that fails and the text contains a
    fenced block (```json ... ``` or a bare ``` ... ```), the content after the
    first opening fence is parsed instead. The closing fence is taken to be the
    LAST fence in the text, so fences embedded inside JSON string values (for
    example a code snippet) are not mistaken for the end of the block; if that
    candidate does not parse, the FIRST closing fence is tried as a fallback.

    Parameters:
        json_str (str): Raw text expected to contain a JSON document.

    Returns:
        Dict[str, Any]: The decoded JSON value (callers expect an object).

    Raises:
        json.JSONDecodeError: If neither the raw text nor any fenced candidate
            is valid JSON.
    """
    try:
        return json.loads(json_str)  # type: ignore
    except json.JSONDecodeError as err:
        last_error = err

    fence = "```"
    # Prefer the explicit ```json opener; fall back to a bare fence.
    opener = "```json" if "```json" in json_str else fence
    start = json_str.find(opener)
    if start == -1:
        # No fenced block to fall back on: surface the original parse error.
        raise last_error

    body = json_str[start + len(opener) :]
    first_end = body.find(fence)
    last_end = body.rfind(fence)

    if last_end == -1:
        # Opening fence without a closing one: use everything after it.
        candidates = [body]
    else:
        # Slice at the fence itself so the closing brace is kept.
        candidates = [body[:last_end]]
        if first_end != last_end:
            candidates.append(body[:first_end])

    for candidate in candidates:
        try:
            return json.loads(candidate)  # type: ignore
        except json.JSONDecodeError as err:
            last_error = err
    raise last_error
|
67
|
+
|
68
|
+
|
54
69
|
def write_plan(
|
55
70
|
chat: List[Dict[str, str]],
|
56
71
|
plan: Optional[List[Dict[str, Any]]],
|
@@ -65,8 +80,8 @@ def write_plan(
|
|
65
80
|
context = USER_REQ_CONTEXT.format(user_requirement=user_requirements)
|
66
81
|
prompt = PLAN.format(context=context, plan=str(plan), tool_desc=tool_desc)
|
67
82
|
chat[-1]["content"] = prompt
|
68
|
-
|
69
|
-
return
|
83
|
+
new_plan = extract_json(model.chat(chat))
|
84
|
+
return new_plan["user_req"], new_plan["plan"]
|
70
85
|
|
71
86
|
|
72
87
|
def write_code(
|
@@ -133,7 +148,7 @@ def debug_code(
|
|
133
148
|
{"role": "system", "content": DEBUG_SYS_MSG},
|
134
149
|
{"role": "user", "content": prompt},
|
135
150
|
]
|
136
|
-
code_and_ref =
|
151
|
+
code_and_ref = extract_json(model.chat(messages))
|
137
152
|
if hasattr(model, "kwargs"):
|
138
153
|
del model.kwargs["response_format"]
|
139
154
|
return extract_code(code_and_ref["improved_impl"]), code_and_ref["reflection"]
|
@@ -149,7 +164,7 @@ def write_and_exec_code(
|
|
149
164
|
exec: Execute,
|
150
165
|
retrieved_ltm: str,
|
151
166
|
max_retry: int = 3,
|
152
|
-
|
167
|
+
verbosity: int = 0,
|
153
168
|
) -> Tuple[bool, str, str, Dict[str, List[str]]]:
|
154
169
|
success = False
|
155
170
|
counter = 0
|
@@ -159,6 +174,9 @@ def write_and_exec_code(
|
|
159
174
|
user_req, subtask, retrieved_ltm, tool_info, orig_code, model
|
160
175
|
)
|
161
176
|
success, result = exec.run_isolation(code)
|
177
|
+
if verbosity == 2:
|
178
|
+
_CONSOLE.print(Syntax(code, "python", theme="gruvbox-dark", line_numbers=True))
|
179
|
+
_LOGGER.info(f"\tCode success: {success}, result: {str(result)}")
|
162
180
|
working_memory: Dict[str, List[str]] = {}
|
163
181
|
while not success and counter < max_retry:
|
164
182
|
if subtask not in working_memory:
|
@@ -180,11 +198,11 @@ def write_and_exec_code(
|
|
180
198
|
)
|
181
199
|
success, result = exec.run_isolation(code)
|
182
200
|
counter += 1
|
183
|
-
if
|
201
|
+
if verbosity == 2:
|
184
202
|
_CONSOLE.print(
|
185
203
|
Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)
|
186
204
|
)
|
187
|
-
|
205
|
+
_LOGGER.info(f"\tDebugging reflection: {reflection}, result: {result}")
|
188
206
|
|
189
207
|
if success:
|
190
208
|
working_memory[subtask].append(
|
@@ -204,7 +222,7 @@ def run_plan(
|
|
204
222
|
code: str,
|
205
223
|
tool_recommender: Sim,
|
206
224
|
long_term_memory: Optional[Sim] = None,
|
207
|
-
|
225
|
+
verbosity: int = 0,
|
208
226
|
) -> Tuple[str, str, List[Dict[str, Any]], Dict[str, List[str]]]:
|
209
227
|
active_plan = [e for e in plan if "success" not in e or not e["success"]]
|
210
228
|
current_code = code
|
@@ -217,9 +235,11 @@ def run_plan(
|
|
217
235
|
f"""
|
218
236
|
{tabulate(tabular_data=[task], headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
|
219
237
|
)
|
220
|
-
|
221
|
-
|
222
|
-
|
238
|
+
tools = tool_recommender.top_k(task["instruction"])
|
239
|
+
tool_info = "\n".join([e["doc"] for e in tools])
|
240
|
+
|
241
|
+
if verbosity == 2:
|
242
|
+
_LOGGER.info(f"Tools retrieved: {[e['desc'] for e in tools]}")
|
223
243
|
|
224
244
|
if long_term_memory is not None:
|
225
245
|
retrieved_ltm = "\n".join(
|
@@ -235,7 +255,7 @@ def run_plan(
|
|
235
255
|
tool_info,
|
236
256
|
exec,
|
237
257
|
retrieved_ltm,
|
238
|
-
|
258
|
+
verbosity=verbosity,
|
239
259
|
)
|
240
260
|
if task["type"] == "code":
|
241
261
|
current_code = code
|
@@ -244,11 +264,11 @@ def run_plan(
|
|
244
264
|
|
245
265
|
working_memory.update(working_memory_i)
|
246
266
|
|
247
|
-
if
|
267
|
+
if verbosity == 1:
|
248
268
|
_CONSOLE.print(
|
249
269
|
Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)
|
250
270
|
)
|
251
|
-
_LOGGER.info(f"\tCode success
|
271
|
+
_LOGGER.info(f"\tCode success: {success} result: {str(result)}")
|
252
272
|
|
253
273
|
task["success"] = success
|
254
274
|
task["result"] = result
|
@@ -283,23 +303,23 @@ class VisionAgentV2(Agent):
|
|
283
303
|
timeout: int = 600,
|
284
304
|
tool_recommender: Optional[Sim] = None,
|
285
305
|
long_term_memory: Optional[Sim] = None,
|
286
|
-
|
306
|
+
verbosity: int = 0,
|
287
307
|
) -> None:
|
288
|
-
self.planner = OpenAILLM(temperature=0.
|
289
|
-
self.coder = OpenAILLM(temperature=0.
|
308
|
+
self.planner = OpenAILLM(temperature=0.0, json_mode=True)
|
309
|
+
self.coder = OpenAILLM(temperature=0.0)
|
290
310
|
self.exec = Execute(timeout=timeout)
|
291
311
|
if tool_recommender is None:
|
292
312
|
self.tool_recommender = Sim(TOOLS_DF, sim_key="desc")
|
293
313
|
else:
|
294
314
|
self.tool_recommender = tool_recommender
|
295
|
-
self.
|
315
|
+
self.verbosity = verbosity
|
296
316
|
self._working_memory: Dict[str, List[str]] = {}
|
297
317
|
if long_term_memory is not None:
|
298
318
|
if "doc" not in long_term_memory.df.columns:
|
299
319
|
raise ValueError("Long term memory must have a 'doc' column.")
|
300
320
|
self.long_term_memory = long_term_memory
|
301
321
|
self.max_retries = 3
|
302
|
-
if self.
|
322
|
+
if self.verbosity:
|
303
323
|
_LOGGER.setLevel(logging.INFO)
|
304
324
|
|
305
325
|
def __call__(
|
@@ -355,7 +375,7 @@ class VisionAgentV2(Agent):
|
|
355
375
|
working_code,
|
356
376
|
self.tool_recommender,
|
357
377
|
self.long_term_memory,
|
358
|
-
self.
|
378
|
+
self.verbosity,
|
359
379
|
)
|
360
380
|
success = all(task["success"] for task in plan)
|
361
381
|
working_memory.update(working_memory_i)
|
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/tools_v2.py
CHANGED
@@ -4,12 +4,13 @@ import logging
|
|
4
4
|
import tempfile
|
5
5
|
from importlib import resources
|
6
6
|
from pathlib import Path
|
7
|
-
from typing import Any, Callable, Dict, List, Tuple, Union
|
7
|
+
from typing import Any, Callable, Dict, List, Tuple, Union, cast
|
8
8
|
|
9
9
|
import numpy as np
|
10
10
|
import pandas as pd
|
11
11
|
import requests
|
12
12
|
from PIL import Image, ImageDraw, ImageFont
|
13
|
+
from scipy.spatial import distance # type: ignore
|
13
14
|
|
14
15
|
from vision_agent.tools.tool_utils import _send_inference_request
|
15
16
|
from vision_agent.utils import extract_frames_from_video
|
@@ -233,6 +234,54 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
|
|
233
234
|
return output
|
234
235
|
|
235
236
|
|
237
|
+
def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
    """'closest_mask_distance' calculates the closest distance between two masks.

    Every value greater than zero counts as part of a mask. The distance is the
    smallest Euclidean distance between the array indices of any non-zero
    element of 'mask1' and any non-zero element of 'mask2', so it is expressed
    in array-index (pixel) units and is 0.0 when the masks overlap.

    Parameters:
        mask1 (np.ndarray): The first mask.
        mask2 (np.ndarray): The second mask.

    Returns:
        float: The closest distance between the two masks.

    Raises:
        ValueError: If either mask has no non-zero element.

    Example
    -------
    >>> closest_mask_distance(mask1, mask2)
    2.0
    """

    # Clamp to [0, 1] so negative values are treated as background.
    mask1 = np.clip(mask1, 0, 1)
    mask2 = np.clip(mask2, 0, 1)
    mask1_points = np.transpose(np.nonzero(mask1))
    mask2_points = np.transpose(np.nonzero(mask2))
    if mask1_points.size == 0 or mask2_points.size == 0:
        # Without this guard np.min fails with an opaque "zero-size array" error.
        raise ValueError("Both masks must contain at least one non-zero pixel.")
    # NOTE: builds the full pairwise matrix (len(mask1_points) x len(mask2_points)).
    dist_matrix = distance.cdist(mask1_points, mask2_points, "euclidean")
    # Convert the NumPy scalar so the declared return type holds at runtime.
    return float(np.min(dist_matrix))
|
259
|
+
|
260
|
+
|
261
|
+
def closest_box_distance(box1: List[float], box2: List[float]) -> float:
    """'closest_box_distance' calculates the closest distance between two bounding boxes.

    Each box is unpacked as four values [x1, y1, x2, y2]; the computation treats
    (x1, y1) as the minimum corner and (x2, y2) as the maximum corner. The
    result is the Euclidean length of the horizontal and vertical gaps between
    the boxes, and is 0.0 when they touch or overlap.

    Parameters:
        box1 (List[float]): The first bounding box.
        box2 (List[float]): The second bounding box.

    Returns:
        float: The closest distance between the two bounding boxes.

    Raises:
        ValueError: If a box does not contain exactly four values (raised by
            the unpacking).

    Example
    -------
    >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
    141.42
    """

    x11, y11, x12, y12 = box1
    x21, y21, x22, y22 = box2

    # The gap along an axis is positive only when the boxes are separated on
    # that axis; clamping at 0 handles overlap.
    horizontal_distance = max(0, x21 - x12, x11 - x22)
    vertical_distance = max(0, y21 - y12, y11 - y22)
    # Convert the NumPy scalar so the declared return type holds at runtime.
    return float(np.sqrt(horizontal_distance**2 + vertical_distance**2))
|
283
|
+
|
284
|
+
|
236
285
|
# Utility and visualization functions
|
237
286
|
|
238
287
|
|
@@ -429,6 +478,8 @@ TOOLS = [
|
|
429
478
|
grounding_sam,
|
430
479
|
extract_frames,
|
431
480
|
ocr,
|
481
|
+
closest_mask_distance,
|
482
|
+
closest_box_distance,
|
432
483
|
load_image,
|
433
484
|
save_image,
|
434
485
|
overlay_bounding_boxes,
|
@@ -9,7 +9,7 @@ vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6w
|
|
9
9
|
vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
|
10
10
|
vision_agent/agent/vision_agent.py,sha256=pnx7gtTPazR7Dck5_kfZC3S3QWKu4e28YVigzOicOX0,27130
|
11
11
|
vision_agent/agent/vision_agent_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
|
12
|
-
vision_agent/agent/vision_agent_v2.py,sha256=
|
12
|
+
vision_agent/agent/vision_agent_v2.py,sha256=0-bJH_KiYB9fdfN5rbutnyJgQr1XYeszNYqmR69IxZc,13045
|
13
13
|
vision_agent/agent/vision_agent_v2_prompt.py,sha256=dd9m9Vqp91r4dpsKMDwXr54jG_GTBdJNDzpgR115S8Q,5997
|
14
14
|
vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
15
|
vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
|
@@ -17,18 +17,18 @@ vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,
|
|
17
17
|
vision_agent/llm/llm.py,sha256=A-gN0vMb79fSxhSK1qBs6PTu1fba9Gvy6pitOyjW2gM,5779
|
18
18
|
vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
|
19
19
|
vision_agent/lmm/lmm.py,sha256=gK90vMxh0OcGSuIZQikBkDXm4pfkdFk1R2y7rtWDl84,10539
|
20
|
-
vision_agent/tools/__init__.py,sha256=
|
20
|
+
vision_agent/tools/__init__.py,sha256=dRHXGpjhItXZRQs0r_l3Z3bQIreaZaYP0CJrl8mOJxM,452
|
21
21
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
22
22
|
vision_agent/tools/tool_utils.py,sha256=mK6QfbYr6oo9ci979-_6R1DrxU2i8HGhwosADyvciI0,865
|
23
23
|
vision_agent/tools/tools.py,sha256=sVxN7SpDkz_XTc_SKwkoRF4EwaMTuHvTsCHwtR942Fc,47373
|
24
|
-
vision_agent/tools/tools_v2.py,sha256=
|
24
|
+
vision_agent/tools/tools_v2.py,sha256=Dh5Rs1iaEs5ijRDwVI3Na9ylC7eOjtrIqtYOZSredH8,15364
|
25
25
|
vision_agent/utils/__init__.py,sha256=xsHFyJSDbLdonB9Dh74cwZnVTiT__2OQF3Brd3Nmglc,116
|
26
26
|
vision_agent/utils/execute.py,sha256=RC_jKrm2kOWwzNe9xKuA2xJcbsNcD0Hb95_o3_Le0_E,3820
|
27
27
|
vision_agent/utils/image_utils.py,sha256=1dggPBhW8_hUXDItCRLa23h-hdBwS50cjL4v1hsoUbg,7586
|
28
28
|
vision_agent/utils/sim.py,sha256=SO4-pj2Fjs3yr-KT8S0nuUd66lf7m7XvMAp7_ecvKuQ,2813
|
29
29
|
vision_agent/utils/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
|
30
30
|
vision_agent/utils/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
|
31
|
-
vision_agent-0.2.
|
32
|
-
vision_agent-0.2.
|
33
|
-
vision_agent-0.2.
|
34
|
-
vision_agent-0.2.
|
31
|
+
vision_agent-0.2.23.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
32
|
+
vision_agent-0.2.23.dist-info/METADATA,sha256=r3JWwYu2mKPjViXrm50ZS_9juGciOrYfEyz2YhPeczQ,9121
|
33
|
+
vision_agent-0.2.23.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
34
|
+
vision_agent-0.2.23.dist-info/RECORD,,
|
File without changes
|
File without changes
|