vision-agent 0.2.21__py3-none-any.whl → 0.2.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -51,6 +51,21 @@ def extract_code(code: str) -> str:
51
51
  return code
52
52
 
53
53
 
54
def extract_json(json_str: str) -> Dict[str, Any]:
    """Parse a JSON document from raw model output, tolerating Markdown fences.

    The string is first parsed as-is. If that fails, the text inside a
    Markdown code fence (```json ... ``` or a bare ``` ... ```) is extracted
    and parsed instead.

    Parameters:
        json_str (str): Raw text that is either plain JSON or JSON wrapped in
            a Markdown code fence, possibly surrounded by other text.

    Returns:
        Dict[str, Any]: The decoded JSON value.

    Raises:
        json.JSONDecodeError: If neither the raw string nor the fenced
            payload is valid JSON.
    """
    try:
        return json.loads(json_str)  # type: ignore
    except json.JSONDecodeError:
        pass

    fence = "```"
    json_fence = "```json"
    payload = json_str

    if json_fence in json_str:
        payload = json_str[json_str.find(json_fence) + len(json_fence) :]
        # The first fence after the opening ```json closes the block.
        end = payload.find(fence)
    elif fence in json_str:
        payload = json_str[json_str.find(fence) + len(fence) :]
        # Use the last fence, not one from an intermediate string value.
        end = payload.rfind(fence)
    else:
        end = -1

    # Only trim when a closing fence exists; slicing with -1 would silently
    # drop the final character (typically the closing brace).
    if end != -1:
        payload = payload[:end]

    return json.loads(payload)  # type: ignore
67
+
68
+
54
69
  def write_plan(
55
70
  chat: List[Dict[str, str]],
56
71
  plan: Optional[List[Dict[str, Any]]],
@@ -65,8 +80,8 @@ def write_plan(
65
80
  context = USER_REQ_CONTEXT.format(user_requirement=user_requirements)
66
81
  prompt = PLAN.format(context=context, plan=str(plan), tool_desc=tool_desc)
67
82
  chat[-1]["content"] = prompt
68
- plan = json.loads(model.chat(chat).replace("```", "").strip())
69
- return plan["user_req"], plan["plan"] # type: ignore
83
+ new_plan = extract_json(model.chat(chat))
84
+ return new_plan["user_req"], new_plan["plan"]
70
85
 
71
86
 
72
87
  def write_code(
@@ -133,7 +148,7 @@ def debug_code(
133
148
  {"role": "system", "content": DEBUG_SYS_MSG},
134
149
  {"role": "user", "content": prompt},
135
150
  ]
136
- code_and_ref = json.loads(model.chat(messages).replace("```", "").strip())
151
+ code_and_ref = extract_json(model.chat(messages))
137
152
  if hasattr(model, "kwargs"):
138
153
  del model.kwargs["response_format"]
139
154
  return extract_code(code_and_ref["improved_impl"]), code_and_ref["reflection"]
@@ -149,7 +164,7 @@ def write_and_exec_code(
149
164
  exec: Execute,
150
165
  retrieved_ltm: str,
151
166
  max_retry: int = 3,
152
- verbose: bool = False,
167
+ verbosity: int = 0,
153
168
  ) -> Tuple[bool, str, str, Dict[str, List[str]]]:
154
169
  success = False
155
170
  counter = 0
@@ -159,6 +174,9 @@ def write_and_exec_code(
159
174
  user_req, subtask, retrieved_ltm, tool_info, orig_code, model
160
175
  )
161
176
  success, result = exec.run_isolation(code)
177
+ if verbosity == 2:
178
+ _CONSOLE.print(Syntax(code, "python", theme="gruvbox-dark", line_numbers=True))
179
+ _LOGGER.info(f"\tCode success: {success}, result: {str(result)}")
162
180
  working_memory: Dict[str, List[str]] = {}
163
181
  while not success and counter < max_retry:
164
182
  if subtask not in working_memory:
@@ -180,11 +198,11 @@ def write_and_exec_code(
180
198
  )
181
199
  success, result = exec.run_isolation(code)
182
200
  counter += 1
183
- if verbose:
201
+ if verbosity == 2:
184
202
  _CONSOLE.print(
185
203
  Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)
186
204
  )
187
- _LOGGER.info(f"\tDebugging reflection, result: {reflection}, {result}")
205
+ _LOGGER.info(f"\tDebugging reflection: {reflection}, result: {result}")
188
206
 
189
207
  if success:
190
208
  working_memory[subtask].append(
@@ -204,7 +222,7 @@ def run_plan(
204
222
  code: str,
205
223
  tool_recommender: Sim,
206
224
  long_term_memory: Optional[Sim] = None,
207
- verbose: bool = False,
225
+ verbosity: int = 0,
208
226
  ) -> Tuple[str, str, List[Dict[str, Any]], Dict[str, List[str]]]:
209
227
  active_plan = [e for e in plan if "success" not in e or not e["success"]]
210
228
  current_code = code
@@ -217,9 +235,11 @@ def run_plan(
217
235
  f"""
218
236
  {tabulate(tabular_data=[task], headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
219
237
  )
220
- tool_info = "\n".join(
221
- [e["doc"] for e in tool_recommender.top_k(task["instruction"])]
222
- )
238
+ tools = tool_recommender.top_k(task["instruction"])
239
+ tool_info = "\n".join([e["doc"] for e in tools])
240
+
241
+ if verbosity == 2:
242
+ _LOGGER.info(f"Tools retrieved: {[e['desc'] for e in tools]}")
223
243
 
224
244
  if long_term_memory is not None:
225
245
  retrieved_ltm = "\n".join(
@@ -235,7 +255,7 @@ def run_plan(
235
255
  tool_info,
236
256
  exec,
237
257
  retrieved_ltm,
238
- verbose=verbose,
258
+ verbosity=verbosity,
239
259
  )
240
260
  if task["type"] == "code":
241
261
  current_code = code
@@ -244,11 +264,11 @@ def run_plan(
244
264
 
245
265
  working_memory.update(working_memory_i)
246
266
 
247
- if verbose:
267
+ if verbosity == 1:
248
268
  _CONSOLE.print(
249
269
  Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)
250
270
  )
251
- _LOGGER.info(f"\tCode success, result: {success}, {str(result)}")
271
+ _LOGGER.info(f"\tCode success: {success} result: {str(result)}")
252
272
 
253
273
  task["success"] = success
254
274
  task["result"] = result
@@ -283,23 +303,23 @@ class VisionAgentV2(Agent):
283
303
  timeout: int = 600,
284
304
  tool_recommender: Optional[Sim] = None,
285
305
  long_term_memory: Optional[Sim] = None,
286
- verbose: bool = False,
306
+ verbosity: int = 0,
287
307
  ) -> None:
288
- self.planner = OpenAILLM(temperature=0.1, json_mode=True)
289
- self.coder = OpenAILLM(temperature=0.1)
308
+ self.planner = OpenAILLM(temperature=0.0, json_mode=True)
309
+ self.coder = OpenAILLM(temperature=0.0)
290
310
  self.exec = Execute(timeout=timeout)
291
311
  if tool_recommender is None:
292
312
  self.tool_recommender = Sim(TOOLS_DF, sim_key="desc")
293
313
  else:
294
314
  self.tool_recommender = tool_recommender
295
- self.verbose = verbose
315
+ self.verbosity = verbosity
296
316
  self._working_memory: Dict[str, List[str]] = {}
297
317
  if long_term_memory is not None:
298
318
  if "doc" not in long_term_memory.df.columns:
299
319
  raise ValueError("Long term memory must have a 'doc' column.")
300
320
  self.long_term_memory = long_term_memory
301
321
  self.max_retries = 3
302
- if self.verbose:
322
+ if self.verbosity:
303
323
  _LOGGER.setLevel(logging.INFO)
304
324
 
305
325
  def __call__(
@@ -355,7 +375,7 @@ class VisionAgentV2(Agent):
355
375
  working_code,
356
376
  self.tool_recommender,
357
377
  self.long_term_memory,
358
- self.verbose,
378
+ self.verbosity,
359
379
  )
360
380
  success = all(task["success"] for task in plan)
361
381
  working_memory.update(working_memory_i)
@@ -3,8 +3,8 @@ from .tools import ( # Counter,
3
3
  CLIP,
4
4
  OCR,
5
5
  TOOLS,
6
- BboxStats,
7
6
  BboxIoU,
7
+ BboxStats,
8
8
  BoxDistance,
9
9
  Crop,
10
10
  DINOv,
@@ -4,12 +4,13 @@ import logging
4
4
  import tempfile
5
5
  from importlib import resources
6
6
  from pathlib import Path
7
- from typing import Any, Callable, Dict, List, Tuple, Union
7
+ from typing import Any, Callable, Dict, List, Tuple, Union, cast
8
8
 
9
9
  import numpy as np
10
10
  import pandas as pd
11
11
  import requests
12
12
  from PIL import Image, ImageDraw, ImageFont
13
+ from scipy.spatial import distance # type: ignore
13
14
 
14
15
  from vision_agent.tools.tool_utils import _send_inference_request
15
16
  from vision_agent.utils import extract_frames_from_video
@@ -233,6 +234,54 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
233
234
  return output
234
235
 
235
236
 
237
def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
    """'closest_mask_distance' calculates the closest distance between two masks.

    The distance is the smallest Euclidean distance, measured in array index
    (pixel) units, between any positive element of the first mask and any
    positive element of the second mask. Values are clipped to [0, 1] first,
    so negative entries count as background.

    Parameters:
        mask1 (np.ndarray): The first mask.
        mask2 (np.ndarray): The second mask.

    Returns:
        float: The closest distance between the two masks. It is 0.0 when the
            masks share at least one positive element.

    Raises:
        ValueError: If either mask has no positive elements.

    Example
    -------
    >>> closest_mask_distance(mask1, mask2)
    5.0
    """

    mask1_points = np.transpose(np.nonzero(np.clip(mask1, 0, 1)))
    mask2_points = np.transpose(np.nonzero(np.clip(mask2, 0, 1)))

    # Without this check np.min fails on the empty distance matrix with an
    # unhelpful "zero-size array" error.
    if len(mask1_points) == 0 or len(mask2_points) == 0:
        raise ValueError(
            "closest_mask_distance requires both masks to contain at least "
            "one positive element."
        )

    # NOTE: the pairwise matrix has len(mask1_points) * len(mask2_points)
    # entries, so memory grows with the product of the two mask areas.
    dist_matrix = distance.cdist(mask1_points, mask2_points, "euclidean")
    # float() converts the NumPy scalar into a plain Python float.
    return float(np.min(dist_matrix))
259
+
260
+
261
def closest_box_distance(box1: List[float], box2: List[float]) -> float:
    """'closest_box_distance' calculates the closest distance between two bounding boxes.

    Each box is unpacked as four values that are treated as
    [x_min, y_min, x_max, y_max]. The result is the Euclidean length of the
    gap between the boxes, and is 0.0 when they overlap or touch.

    Parameters:
        box1 (List[float]): The first bounding box.
        box2 (List[float]): The second bounding box.

    Returns:
        float: The closest distance between the two bounding boxes.

    Raises:
        ValueError: If a box does not contain exactly four values.

    Example
    -------
    >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400])
    141.42
    """

    x11, y11, x12, y12 = box1
    x21, y21, x22, y22 = box2

    # Gap along each axis; 0 when the boxes overlap on that axis.
    horizontal_distance = max(0, x21 - x12, x11 - x22)
    vertical_distance = max(0, y21 - y12, y11 - y22)

    # float() converts the NumPy scalar into a plain Python float.
    return float(np.hypot(horizontal_distance, vertical_distance))
283
+
284
+
236
285
  # Utility and visualization functions
237
286
 
238
287
 
@@ -429,6 +478,8 @@ TOOLS = [
429
478
  grounding_sam,
430
479
  extract_frames,
431
480
  ocr,
481
+ closest_mask_distance,
482
+ closest_box_distance,
432
483
  load_image,
433
484
  save_image,
434
485
  overlay_bounding_boxes,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.21
3
+ Version: 0.2.23
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -9,7 +9,7 @@ vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6w
9
9
  vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
10
10
  vision_agent/agent/vision_agent.py,sha256=pnx7gtTPazR7Dck5_kfZC3S3QWKu4e28YVigzOicOX0,27130
11
11
  vision_agent/agent/vision_agent_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
12
- vision_agent/agent/vision_agent_v2.py,sha256=K-zJ0utlvgWpR0TlP9M3yO0O9t9L37FcBs31OfOC4C0,12185
12
+ vision_agent/agent/vision_agent_v2.py,sha256=0-bJH_KiYB9fdfN5rbutnyJgQr1XYeszNYqmR69IxZc,13045
13
13
  vision_agent/agent/vision_agent_v2_prompt.py,sha256=dd9m9Vqp91r4dpsKMDwXr54jG_GTBdJNDzpgR115S8Q,5997
14
14
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
@@ -17,18 +17,18 @@ vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,
17
17
  vision_agent/llm/llm.py,sha256=A-gN0vMb79fSxhSK1qBs6PTu1fba9Gvy6pitOyjW2gM,5779
18
18
  vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
19
19
  vision_agent/lmm/lmm.py,sha256=gK90vMxh0OcGSuIZQikBkDXm4pfkdFk1R2y7rtWDl84,10539
20
- vision_agent/tools/__init__.py,sha256=p5SM0YhThSVO_jRF9O-OjH2fYDPv-iMjexDX9xPPb7M,452
20
+ vision_agent/tools/__init__.py,sha256=dRHXGpjhItXZRQs0r_l3Z3bQIreaZaYP0CJrl8mOJxM,452
21
21
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
22
22
  vision_agent/tools/tool_utils.py,sha256=mK6QfbYr6oo9ci979-_6R1DrxU2i8HGhwosADyvciI0,865
23
23
  vision_agent/tools/tools.py,sha256=sVxN7SpDkz_XTc_SKwkoRF4EwaMTuHvTsCHwtR942Fc,47373
24
- vision_agent/tools/tools_v2.py,sha256=1Y_ZbYJyuo2eZZkq7jY3YfuKWC82C-GFCZMLYH-I5ew,13800
24
+ vision_agent/tools/tools_v2.py,sha256=Dh5Rs1iaEs5ijRDwVI3Na9ylC7eOjtrIqtYOZSredH8,15364
25
25
  vision_agent/utils/__init__.py,sha256=xsHFyJSDbLdonB9Dh74cwZnVTiT__2OQF3Brd3Nmglc,116
26
26
  vision_agent/utils/execute.py,sha256=RC_jKrm2kOWwzNe9xKuA2xJcbsNcD0Hb95_o3_Le0_E,3820
27
27
  vision_agent/utils/image_utils.py,sha256=1dggPBhW8_hUXDItCRLa23h-hdBwS50cjL4v1hsoUbg,7586
28
28
  vision_agent/utils/sim.py,sha256=SO4-pj2Fjs3yr-KT8S0nuUd66lf7m7XvMAp7_ecvKuQ,2813
29
29
  vision_agent/utils/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
30
30
  vision_agent/utils/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
31
- vision_agent-0.2.21.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
32
- vision_agent-0.2.21.dist-info/METADATA,sha256=zth6S82TWHpBoFToYIVQYRyOWBxovRci731LcIHb8Bw,9121
33
- vision_agent-0.2.21.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
34
- vision_agent-0.2.21.dist-info/RECORD,,
31
+ vision_agent-0.2.23.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
32
+ vision_agent-0.2.23.dist-info/METADATA,sha256=r3JWwYu2mKPjViXrm50ZS_9juGciOrYfEyz2YhPeczQ,9121
33
+ vision_agent-0.2.23.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
34
+ vision_agent-0.2.23.dist-info/RECORD,,