vision-agent 1.1.16__py3-none-any.whl → 1.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,206 @@
+ import base64
+ import copy
+ import io
+ from typing import Dict, List, Optional, Tuple, Union, cast
+
+ import cv2
+ import matplotlib.figure
+ import matplotlib.pyplot as plt
+ import numpy as np
+ from PIL import Image
+ from PIL.Image import Image as PILImageType
+
+ from vision_agent.utils.image_utils import (
+     denormalize_bbox,
+     normalize_bbox,
+     numpy_to_bytes,
+     rle_decode_array,
+ )
+ from vision_agent.utils.tools import send_inference_request
+
+
+ def maybe_denormalize_bbox(
+     bbox: List[Union[int, float]], image_size: Tuple[int, ...]
+ ) -> List[float]:
+     if all([0 <= c <= 1 for c in bbox]):
+         return denormalize_bbox(bbox, image_size)
+     return bbox
+
+
+ def maybe_normalize_bbox(
+     bbox: List[Union[int, float]], image_size: Tuple[int, ...]
+ ) -> List[float]:
+     if any([1 <= c for c in bbox]):
+         return normalize_bbox(bbox, image_size)
+     return bbox
+
+
+ def instance_segmentation(
+     prompt: str, image: np.ndarray, threshold: float = 0.23, nms_threshold: float = 0.5
+ ) -> List[Dict[str, Union[str, float, List[float], np.ndarray]]]:
+     image_bytes = numpy_to_bytes(image)
+     files = [("image", image_bytes)]
+     data = {"prompts": [prompt], "threshold": threshold, "nms_threshold": nms_threshold}
+     results = send_inference_request(
+         data,
+         "glee",
+         files=files,
+         v2=True,
+     )
+     results = results[0]
+     results_formatted = [
+         {
+             "label": elt["label"],
+             "score": elt["score"],
+             "bbox": normalize_bbox(elt["bounding_box"], image.shape[:2]),
+             "mask": np.array(rle_decode_array(elt["mask"])),
+         }
+         for elt in results
+     ]
+     return results_formatted
+
+
+ def ocr(image: np.ndarray) -> List[Dict[str, Union[str, float, List[float]]]]:
+     image_bytes = numpy_to_bytes(image)
+     files = [("image", image_bytes)]
+     results = send_inference_request(
+         {},
+         "paddle-ocr",
+         files=files,
+         v2=True,
+     )
+     results_formatted = [
+         {
+             "label": elt["label"],
+             "score": elt["score"],
+             "bbox": normalize_bbox(elt["bbox"], image.shape[:2]),
+         }
+         for elt in results
+     ]
+     return results_formatted
+
+
+ def depth_estimation(image: np.ndarray) -> np.ndarray:
+     shape = image.shape[:2]
+     image_bytes = numpy_to_bytes(image)
+     files = [("image", image_bytes)]
+     results = send_inference_request(
+         {},
+         "depth-pro",
+         files=files,
+         v2=True,
+     )
+     depth = np.frombuffer(base64.b64decode(results["depth"]), dtype=np.float32).reshape(
+         shape
+     )
+     return depth
+
+
+ def visualize_bounding_boxes(
+     image: np.ndarray, bounding_boxes: List[Dict[str, Union[str, float, List[float]]]]
+ ) -> np.ndarray:
+     image = image.copy()
+     image_size = image.shape[:2]
+     bounding_boxes = copy.deepcopy(bounding_boxes)
+
+     for bbox in bounding_boxes:
+         bbox["bbox"] = maybe_denormalize_bbox(
+             cast(List[float], bbox["bbox"]), image_size
+         )
+     for bbox in bounding_boxes:
+         x1, y1, x2, y2 = bbox["bbox"]  # type: ignore
+         cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (0, 0, 255), 2)
+     return image
+
+
+ def visualize_segmentation_masks(
+     image: np.ndarray,
+     segmentation_masks: List[Dict[str, Union[str, float, np.ndarray]]],
+ ) -> np.ndarray:
+     alpha = 0.5
+     overlay = image.copy()
+     color_mask = np.zeros_like(image)
+     color_mask[:, :] = (0, 100, 255)
+     for elt in segmentation_masks:
+         mask = cast(np.ndarray, elt["mask"])
+         overlay[mask == 1] = (1 - alpha) * overlay[mask == 1] + alpha * color_mask[
+             mask == 1
+         ]
+
+         # draw outline on the mask so it doesn't just think the color of the object changed
+         mask_uint8 = (mask * 255).astype(np.uint8)
+         contours, _ = cv2.findContours(
+             mask_uint8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+         )
+         cv2.drawContours(overlay, contours, -1, (0, 0, 255), 2, lineType=cv2.LINE_AA)
+     overlay = np.clip(overlay, 0, 255).astype(np.uint8)
+     return overlay
+
+
+ def get_crops(
+     image: np.ndarray, bounding_boxes: List[Dict[str, Union[str, float, List[float]]]]
+ ) -> List[np.ndarray]:
+     image = image.copy()
+     bounding_boxes = copy.deepcopy(bounding_boxes)
+
+     for bbox in bounding_boxes:
+         bbox["bbox"] = maybe_denormalize_bbox(
+             cast(List[float], bbox["bbox"]), image.shape[:2]
+         )
+     crops = []
+     for bbox in bounding_boxes:
+         x1, y1, x2, y2 = bbox["bbox"]  # type: ignore
+         crops.append(image[int(y1) : int(y2), int(x1) : int(x2)])
+     return crops
+
+
+ def rotate_90(image: np.ndarray, k: int = 1) -> np.ndarray:
+     return np.rot90(image, k=k, axes=(0, 1))
+
+
+ def iou(
+     pred1: Union[List[float], np.ndarray], pred2: Union[List[float], np.ndarray]
+ ) -> float:
+     if isinstance(pred1, list) and isinstance(pred2, list):
+         x1, y1, x2, y2 = pred1
+         x1_, y1_, x2_, y2_ = pred2
+         intersection = max(0, min(x2, x2_) - max(x1, x1_)) * max(
+             0, min(y2, y2_) - max(y1, y1_)
+         )
+         union = (x2 - x1) * (y2 - y1) + (x2_ - x1_) * (y2_ - y1_) - intersection
+         return intersection / union
+     elif isinstance(pred1, np.ndarray) and isinstance(pred2, np.ndarray):
+         pred1 = np.clip(pred1, 0, 1)
+         pred2 = np.clip(pred2, 0, 1)
+         intersection = np.sum(pred1 * pred2)
+         union = np.sum(pred1) + np.sum(pred2) - intersection
+         return intersection / union
+     raise ValueError("Unsupported input types for IoU calculation.")
+
+
+ def display_image(
+     image: Union[np.ndarray, PILImageType, matplotlib.figure.Figure, str],
+ ) -> None:
+     display_img: Optional[PILImageType] = None
+     if isinstance(image, np.ndarray):
+         display_img = Image.fromarray(image)
+     elif isinstance(image, matplotlib.figure.Figure):
+         # Render the figure to a BytesIO buffer
+         buf = io.BytesIO()
+         image.savefig(buf, format="png")
+         buf.seek(0)
+         # Load the buffer as a PIL Image
+         display_img = Image.open(buf)
+         plt.close(image)  # type: ignore
+     elif isinstance(image, PILImageType):
+         display_img = image  # Already a PIL Image
+     elif isinstance(image, str):
+         display_img = Image.open(image)
+
+     if display_img is not None:
+         plt.imshow(display_img)  # type: ignore
+         plt.axis("off")  # type: ignore
+         plt.show()
+     else:
+         # Handle cases where image type is not supported or conversion failed
+         print("Unsupported image type or conversion failed.")
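The file added above is a new module of planner helper tools (listed in the RECORD section at the end of this diff as vision_agent/tools/planner_v3_tools.py). Below is a minimal usage sketch, assuming that import path and reachable "glee", "paddle-ocr", and "depth-pro" inference endpoints; the image path is a placeholder.

```python
# Sketch only: the import path is inferred from the RECORD entry
# vision_agent/tools/planner_v3_tools.py, and the endpoints need valid credentials.
import numpy as np
from PIL import Image

from vision_agent.tools.planner_v3_tools import (
    depth_estimation,
    display_image,
    instance_segmentation,
    ocr,
    visualize_segmentation_masks,
)

image = np.array(Image.open("example.jpg").convert("RGB"))  # placeholder path

masks = instance_segmentation("person", image)  # [{label, score, bbox, mask}, ...]
words = ocr(image)                              # [{label, score, bbox}, ...]
depth = depth_estimation(image)                 # float32 metric depth, same H x W

display_image(visualize_segmentation_masks(image, masks))
```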
@@ -4,7 +4,7 @@ import logging
  import os
  import tempfile
  import urllib.request
- from base64 import b64encode
+ from base64 import b64encode, b64decode
  from concurrent.futures import ThreadPoolExecutor, as_completed
  from importlib import resources
  from pathlib import Path
@@ -15,7 +15,6 @@ import time
  import cv2
  import numpy as np
  import pandas as pd
- import requests
  from IPython.display import display
  from PIL import Image, ImageDraw, ImageFont
  from pillow_heif import register_heif_opener  # type: ignore
@@ -2034,8 +2033,8 @@ def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
      return cast(str, data)


- def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
-     """'ocr' extracts text from an image. It returns a list of detected text, bounding
+ def paddle_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+     """'paddle_ocr' extracts text from an image. It returns a list of detected text, bounding
      boxes with normalized coordinates, and confidence scores. The results are sorted
      from top-left to bottom right.

@@ -2048,51 +2047,33 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:

      Example
      -------
-     >>> ocr(image)
+     >>> paddle_ocr(image)
      [
          {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
      ]
      """

-     pil_image = Image.fromarray(image).convert("RGB")
-     image_size = pil_image.size[::-1]
+     image_size = image.shape[:2]
      if image_size[0] < 1 or image_size[1] < 1:
          return []
-     image_buffer = io.BytesIO()
-     pil_image.save(image_buffer, format="PNG")
-     buffer_bytes = image_buffer.getvalue()
-     image_buffer.close()
-
-     res = requests.post(
-         _OCR_URL,
-         files={"images": buffer_bytes},
-         data={"language": "en"},
-         headers={"contentType": "multipart/form-data", "apikey": _API_KEY},
-     )
-
-     if res.status_code != 200:
-         raise ValueError(f"OCR request failed with status code {res.status_code}")
-
-     data = res.json()
-     output = []
-     for det in data[0]:
-         label = det["text"]
-         box = [
-             det["location"][0]["x"],
-             det["location"][0]["y"],
-             det["location"][2]["x"],
-             det["location"][2]["y"],
-         ]
-         box = normalize_bbox(box, image_size)
-         output.append({"label": label, "bbox": box, "score": round(det["score"], 2)})
+     buffer_bytes = numpy_to_bytes(image)
+     files = [("image", buffer_bytes)]
+
+     res = send_inference_request(
+         payload={"function_name": "paddle-ocr"},
+         endpoint_name="paddle-ocr",
+         files=files,
+         v2=True,
+     )

      _display_tool_trace(
-         ocr.__name__,
+         paddle_ocr.__name__,
          {},
-         data,
-         cast(List[Tuple[str, bytes]], [("image", buffer_bytes)]),
+         res,
+         files,
      )
-     return sorted(output, key=lambda x: (x["bbox"][1], x["bbox"][0]))
+
+     return sorted(res, key=lambda x: (x["bbox"][1], x["bbox"][0]))


  def claude35_text_extraction(image: np.ndarray) -> str:
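For callers, the change above renames the `ocr` tool to `paddle_ocr` and routes it through `send_inference_request` instead of a direct `requests.post` call. A minimal sketch of the updated call, assuming `paddle_ocr` is re-exported from `vision_agent.tools` the way `ocr` was; the image path is a placeholder.

```python
# Hedged sketch: assumes vision_agent.tools re-exports paddle_ocr and that the
# paddle-ocr endpoint is reachable; "receipt.jpg" is a placeholder path.
import numpy as np
from PIL import Image

from vision_agent.tools import paddle_ocr

image = np.array(Image.open("receipt.jpg").convert("RGB"))
for det in paddle_ocr(image):
    # bbox is normalized [x1, y1, x2, y2]; results are sorted top-left to bottom-right
    print(det["label"], det["score"], det["bbox"])
```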
@@ -2370,7 +2351,12 @@ def agentic_activity_recognition(
      buffer_bytes = frames_to_bytes(frames, fps=fps)
      files = [("video", buffer_bytes)]

-     payload = {"prompt": prompt, "specificity": specificity, "with_audio": with_audio}
+     payload = {
+         "prompt": prompt,
+         "specificity": specificity,
+         "with_audio": with_audio,
+         "function_name": "agentic_activity_recognition",
+     }

      response = send_inference_request(
          payload=payload, endpoint_name="activity-recognition", files=files, v2=True
@@ -2529,48 +2515,53 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
      return return_data


- def depth_anything_v2(image: np.ndarray) -> np.ndarray:
-     """'depth_anything_v2' is a tool that runs depth anything v2 model to generate a
-     depth image from a given RGB image. The returned depth image is monochrome and
-     represents depth values as pixel intensities with pixel values ranging from 0 to 255.
+ def depth_pro(
+     image: np.ndarray,
+ ) -> np.ndarray:
+     """'depth_pro' is a tool that runs the Apple DepthPro model to generate a
+     depth map from a given RGB image. The returned depth map has the same dimensions
+     as the input image, with each pixel indicating the distance from the camera in meters.

      Parameters:
          image (np.ndarray): The image to used to generate depth image

      Returns:
-         np.ndarray: A grayscale depth image with pixel values ranging from 0 to 255
-             where high values represent closer objects and low values further.
+         np.ndarray: A depth map with float32 pixel values that represent
+             the distance from the camera in meters.

      Example
      -------
-     >>> depth_anything_v2(image)
+     >>> depth_pro(image)
      array([[0, 0, 0, ..., 0, 0, 0],
          [0, 20, 24, ..., 0, 100, 103],
          ...,
          [10, 11, 15, ..., 202, 202, 205],
-         [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
+         [10, 10, 10, ..., 200, 200, 200]], dtype=np.float32),
      """
-     if image.shape[0] < 1 or image.shape[1] < 1:
-         raise ValueError(f"Image is empty, image shape: {image.shape}")

-     image_b64 = convert_to_b64(image)
-     data = {
-         "image": image_b64,
-         "function_name": "depth_anything_v2",
-     }
+     image_size = image.shape[:2]
+     if image_size[0] < 1 or image_size[1] < 1:
+         return np.empty(0)
+     buffer_bytes = numpy_to_bytes(image)
+     files = [("image", buffer_bytes)]

-     depth_map = send_inference_request(data, "depth-anything-v2", v2=True)
-     depth_map_np = np.array(depth_map["map"])
-     depth_map_np = (depth_map_np - depth_map_np.min()) / (
-         depth_map_np.max() - depth_map_np.min()
+     detections = send_inference_request(
+         payload={"function_name": "depth-pro"},
+         endpoint_name="depth-pro",
+         files=files,
+         v2=True,
      )
-     depth_map_np = (255 * depth_map_np).astype(np.uint8)
+
+     depth_bytes = b64decode(detections["depth"])
+     depth_map_np = np.frombuffer(depth_bytes, dtype=np.float32).reshape(image_size)
+
      _display_tool_trace(
-         depth_anything_v2.__name__,
+         depth_pro.__name__,
          {},
-         depth_map,
-         image_b64,
+         response=detections,
+         files=files,
      )
+
      return depth_map_np


@@ -3564,12 +3555,12 @@ FUNCTION_TOOLS = [
      claude35_text_extraction,
      agentic_document_extraction,
      document_qa,
-     ocr,
+     paddle_ocr,
      gemini_image_generation,
      qwen25_vl_images_vqa,
      qwen25_vl_video_vqa,
      agentic_activity_recognition,
-     depth_anything_v2,
+     depth_pro,
      generate_pose_image,
      vit_nsfw_classification,
      siglip_classification,
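Because `depth_pro` replaces `depth_anything_v2` in FUNCTION_TOOLS and returns metric depth (float32, meters) rather than a normalized uint8 image, callers that displayed the old output directly now need to normalize it themselves. A hedged sketch, assuming `depth_pro` is re-exported from `vision_agent.tools` and the endpoint is reachable; the image path is a placeholder.

```python
# Hedged sketch: depth_pro returns float32 distances in meters (same H x W as the
# input), so the old 0-255 visualization has to be reproduced manually. Note the
# meaning also flips: depth_anything_v2 encoded closeness, depth_pro grows with distance.
import numpy as np
from PIL import Image

from vision_agent.tools import depth_pro

image = np.array(Image.open("room.jpg").convert("RGB"))  # placeholder path
depth_m = depth_pro(image)  # float32 metric depth map

# Scale to 0-255 for display
span = max(float(depth_m.max() - depth_m.min()), 1e-6)
depth_u8 = (255 * (depth_m - depth_m.min()) / span).astype(np.uint8)
```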
@@ -247,7 +247,9 @@ def print_table(title: str, columns: List[str], rows: List[List[str]]) -> None:


  def add_media_to_chat(
-     chat: List[AgentMessage], code_interpreter: Optional[CodeInterpreter] = None
+     chat: List[AgentMessage],
+     code_interpreter: Optional[CodeInterpreter] = None,
+     append_to_prompt: bool = True,
  ) -> Tuple[List[AgentMessage], List[AgentMessage], List[Union[str, Path]]]:
      orig_chat = copy.deepcopy(chat)
      int_chat = copy.deepcopy(chat)
@@ -278,6 +280,7 @@ def add_media_to_chat(
                  if (
                      not str(chat_i.content).endswith(f" Media name {media}")
                      and chat_i.role == "user"
+                     and append_to_prompt
                  ):
                      chat_i.content += f" Media name {media}"
              chat_i.media = media_list_i if len(media_list_i) > 0 else None
@@ -304,13 +307,26 @@ def add_media_to_chat(

  def capture_media_from_exec(execution: Execution) -> List[str]:
      images = []
      for result in execution.results:
-         for format in result.formats():
-             if format in ["png", "jpeg"]:
-                 # converts the image to png and then to base64
-                 images.append(
-                     "data:image/png;base64,"
-                     + convert_to_b64(b64_to_pil(result[format]))
-                 )
+         if hasattr(result, "formats"):
+             for format in result.formats():
+                 if format in ["png", "jpeg"]:
+                     # converts the image to png and then to base64
+                     images.append(
+                         "data:image/png;base64,"
+                         + convert_to_b64(b64_to_pil(result[format]))
+                     )
+         elif hasattr(result, "savefig"):
+             pass
+         elif hasattr(result, "_repr_png_") and result._repr_png_():
+             images.append(
+                 "data:image/png;base64,"
+                 + convert_to_b64(b64_to_pil(result._repr_png_()))  # type: ignore
+             )
+         elif hasattr(result, "_repr_jpeg_") and result._repr_jpeg_():
+             images.append(
+                 "data:image/jpeg;base64,"
+                 + convert_to_b64(b64_to_pil(result._repr_jpeg_()))  # type: ignore
+             )
      return images


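The rewritten `capture_media_from_exec` above switches to duck typing over the execution results: notebook-style results with `formats()` keep the old path, figure-like objects exposing `savefig` are skipped, and anything with a non-empty `_repr_png_`/`_repr_jpeg_` is captured. A small self-contained illustration of that dispatch order, with made-up result classes:

```python
# Illustration only; these classes are made up and stand in for objects
# that can appear in execution.results.
class NotebookResult:
    def formats(self) -> list:
        return ["png"]

class FigureLike:
    def savefig(self, *args, **kwargs) -> None:
        pass

class PngObject:
    def _repr_png_(self) -> str:
        return "iVBORw0KGgo="  # placeholder base64 PNG payload

for result in [NotebookResult(), FigureLike(), PngObject(), object()]:
    if hasattr(result, "formats"):
        kind = "notebook result (old path)"
    elif hasattr(result, "savefig"):
        kind = "figure-like (skipped)"
    elif hasattr(result, "_repr_png_") and result._repr_png_():
        kind = "captured via _repr_png_"
    else:
        kind = "ignored"
    print(type(result).__name__, "->", kind)
```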
@@ -106,7 +106,7 @@ def send_task_inference_request(
      if metadata is not None and "function_name" in metadata:
          function_name = metadata["function_name"]
      response = _call_post(url, payload, session, files, function_name, is_form)
-     return response["data"]
+     return response["data"] if "data" in response else response


  def _create_requests_session(
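The one-line change to `send_task_inference_request` means endpoints that return an unwrapped body (no "data" key) are passed through as-is instead of raising a KeyError. A trivial illustration of the fallback:

```python
# Illustration only: an unwrapped response is now returned unchanged.
wrapped = {"data": [{"label": "person", "score": 0.9}]}
unwrapped = {"depth": "aGVsbG8="}  # placeholder base64 payload, e.g. from depth-pro

for response in (wrapped, unwrapped):
    result = response["data"] if "data" in response else response
    print(result)
```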
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: vision-agent
- Version: 1.1.16
+ Version: 1.1.18
  Summary: Toolset for Vision Agent
  Project-URL: Homepage, https://landing.ai
  Project-URL: repository, https://github.com/landing-ai/vision-agent
@@ -8,7 +8,7 @@ Project-URL: documentation, https://github.com/landing-ai/vision-agent
  Author-email: Landing AI <dev@landing.ai>
  License-File: LICENSE
  Requires-Python: <4.0,>=3.9
- Requires-Dist: anthropic<0.32,>=0.31.0
+ Requires-Dist: anthropic>=0.54.0
  Requires-Dist: av<12,>=11.0.0
  Requires-Dist: dotenv<0.10,>=0.9.9
  Requires-Dist: flake8<8,>=7.0.0
@@ -20,7 +20,7 @@ Requires-Dist: matplotlib<4,>=3.9.2
  Requires-Dist: nbclient<0.11,>=0.10.0
  Requires-Dist: nbformat<6,>=5.10.4
  Requires-Dist: numpy<2.0.0,>=1.21.0
- Requires-Dist: openai==1.55.3
+ Requires-Dist: openai>=1.86.0
  Requires-Dist: opencv-python==4.*
  Requires-Dist: opentelemetry-api<2,>=1.29.0
  Requires-Dist: pandas==2.*
@@ -36,7 +36,7 @@ Requires-Dist: tabulate<0.10,>=0.9.0
  Requires-Dist: tenacity<9,>=8.3.0
  Requires-Dist: tqdm<5.0.0,>=4.64.0
  Requires-Dist: typing-extensions==4.*
- Requires-Dist: yt-dlp>=2025.3.31
+ Requires-Dist: yt-dlp>=2025.6.9
  Description-Content-Type: text/markdown

  <div align="center">
@@ -1,15 +1,17 @@
  vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
- vision_agent/.sim_tools/df.csv,sha256=i732_U1KQf55UNhT-9srtZXF91XvDnfWBDdc8EqDmpw,41215
- vision_agent/.sim_tools/embs.npy,sha256=XCu3LnLS10IS3npfPMqX2VHIbDPq9iY_NPDBwq5AEj0,245888
+ vision_agent/.sim_tools/df.csv,sha256=Hus29ljPZV15EmAd1qFTStPuVDi8JDS0ekUcyjCTJ9U,41187
+ vision_agent/.sim_tools/embs.npy,sha256=OLj2rt4aBFze2HIf9bQ3yn0-_3RVPecrHWxm2CWvgn0,245888
  vision_agent/agent/README.md,sha256=3XSPG_VO7-6y6P8COvcgSSonWj5uvfgvfmOkBpfKK8Q,5527
- vision_agent/agent/__init__.py,sha256=_-nGLHhRTLViXxBSb9D4OwLTqk9HXKPEkTBkvK8c7OU,206
+ vision_agent/agent/__init__.py,sha256=lhPV1JUJ_Ckp_NHpq9VcwqaBd0wh4-GtyT79aFOWvI0,249
  vision_agent/agent/agent.py,sha256=o1Zuhl6h2R7uVwvUur0Aj38kak8U08plfeFWPst_ErM,1576
  vision_agent/agent/vision_agent_coder_prompts_v2.py,sha256=53b_DhQtffX5wxLuCbNQ83AJhB0P_3wEnuKr-v5bx-o,4866
  vision_agent/agent/vision_agent_coder_v2.py,sha256=ELc_J8Q4NKPs7YETu3a9O0Vk1zN3k6QfHBgu0M0IWGk,17450
  vision_agent/agent/vision_agent_planner_prompts_v2.py,sha256=O24BpRhMRZx7D_WdaRv-a2K6fLpin0o7oWxlvL70WpM,35944
  vision_agent/agent/vision_agent_planner_v2.py,sha256=Aww_BJhTFKZ5XjYe8FW57z2Gwp2se0vg1t1DKLGRAyQ,22050
  vision_agent/agent/vision_agent_prompts_v2.py,sha256=NG1xnZvZGi4DcqdfqZCkPkS7oka3gr6h42ekUKUKcqY,4231
+ vision_agent/agent/vision_agent_prompts_v3.py,sha256=ABFdTe1TMnFBy_VH_AYDSE0IHFiPX0KOB-nNRfLurxM,16548
  vision_agent/agent/vision_agent_v2.py,sha256=iPW6DowH7wCFIA5vb1SdSLfZFWbn_oSC7Xa8uO8KIJI,11675
+ vision_agent/agent/vision_agent_v3.py,sha256=tFr9VYSG65R0PRypiNzoW6NzKV1yuBPXIzmE4HO-p0A,10228
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
  vision_agent/configs/__init__.py,sha256=Iu75-w9_nlPmnB_qKA7nYaaaHf7xtTrDmK8N4v2WV34,27
@@ -19,28 +21,29 @@ vision_agent/configs/openai_config.py,sha256=Bw7ElBYmBcaZttyRBoNpcy3uTkqg5qADk8L
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
  vision_agent/lmm/__init__.py,sha256=4qX2lmGnKWHeKftXueEi9xj_ieK2nQh_ipHf72nKGFk,84
- vision_agent/lmm/lmm.py,sha256=w23nWSmUiW1rxfRC-Td44-UR3-8k0ey80-0SVZraeqA,23681
+ vision_agent/lmm/lmm.py,sha256=gGUf621irXgQ18W497bMa1vQzbgUsZQsRwLHFNpBSJA,29982
  vision_agent/models/__init__.py,sha256=eIP0pD5dYog8zUA7uuTmUxCF6SIutbLRLRE0cmuCJgQ,326
  vision_agent/models/agent_types.py,sha256=vBZ9-ns5lHDdFMO7ulCGGeZ6OwRo3gK4O3vN0814IWc,3064
  vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1cXmw,305
  vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
  vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
  vision_agent/sim/sim.py,sha256=WQY_x9A4VT647qGDBScJ3R8_Iv0aoYLHTgwcQSCXwv4,10059
- vision_agent/tools/__init__.py,sha256=zf8HzjcMSgxKhtrxbqYe9hmvsfuweeDMrOc8eVA8Ya8,2477
+ vision_agent/tools/__init__.py,sha256=USlLNSJ1YZ3UQBAHYu6MXx8Scf639sfL10im1NUuI4k,2490
  vision_agent/tools/meta_tools.py,sha256=9iJilpGYEiXW0nYPTYAWHa7l23wGN8IM5KbE7mWDOT0,6798
  vision_agent/tools/planner_tools.py,sha256=iQWtTgXdomn0IWrbmvXXM-y8Q_RSEOxyP04HIRLrgWI,19576
+ vision_agent/tools/planner_v3_tools.py,sha256=9uLKDtdWdpiRm_lVgc2DdeLEo2D4cw2demFTUQ401Zo,6525
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
- vision_agent/tools/tools.py,sha256=i9GGGu8tvo2M6O5fF4UUBTpn_Ul2KEN9mG3ZlJ95qao,124929
+ vision_agent/tools/tools.py,sha256=lndSG8xrIWcs6Rpe1-Jq44niUDXQnWlYfGP2B1YjpI0,124216
  vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
- vision_agent/utils/agent.py,sha256=2ifTP5QElItnr4YHOJR6L5P1PUzV0GhChTTqVxuVyQg,15153
+ vision_agent/utils/agent.py,sha256=88axZswX7DibAkckc0mDJWLr0SoVPyam4mqO4zsLRNQ,15827
  vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
  vision_agent/utils/execute.py,sha256=QAql6KC2uEhX1o_44mMA77lCmMUs0itaaGMFSfJBki8,21520
  vision_agent/utils/image_utils.py,sha256=bJM2mEvB6E__M9pxi74yQYzAiZ7mu3KE2ptyVrp5vzQ,12533
- vision_agent/utils/tools.py,sha256=Days0dETPRQLSDamMKPnXFsc5g5IKX9QJcPPNmSHNdM,8111
+ vision_agent/utils/tools.py,sha256=gF5h1QuBCJaC2u_FRxPR32eYPRa78R_DPcmOiPcnb3A,8147
  vision_agent/utils/tools_doc.py,sha256=PKcXXbJktiuPi9q6Q1zXzFx24Dh229SNgWBDtZ2fQSQ,2730
  vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
  vision_agent/utils/video_tracking.py,sha256=DZLFpNCuzuPJQzbQoVNcp-m4dKxgiKdCNM5QTh_zURE,12245
- vision_agent-1.1.16.dist-info/METADATA,sha256=JMmL6rIdT1-WO6XTrjNHucAp4S_UlkjDW1dxznQJ994,12078
- vision_agent-1.1.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- vision_agent-1.1.16.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-1.1.16.dist-info/RECORD,,
+ vision_agent-1.1.18.dist-info/METADATA,sha256=S7WnsgYo0nBT-O4Ca6-rYLG3tjQ9np5Tk1Fv1Z-_0pU,12071
+ vision_agent-1.1.18.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ vision_agent-1.1.18.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-1.1.18.dist-info/RECORD,,