vision-agent 0.2.126__py3-none-any.whl → 0.2.128__py3-none-any.whl
- vision_agent/agent/vision_agent_coder_prompts.py +41 -7
- vision_agent/tools/__init__.py +3 -2
- vision_agent/tools/tools.py +128 -39
- vision_agent/utils/video.py +24 -5
- {vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/METADATA +2 -1
- {vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/RECORD +8 -8
- {vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent_coder_prompts.py CHANGED
@@ -70,30 +70,64 @@ This is the documentation for the functions you have access to. You may call any
 2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
 3. Your test case MUST run only on the given images which are {media}
 4. Print this final dictionary.
+5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
 
 **Example**:
+--- EXAMPLE1 ---
 plan1:
 - Load the image from the provided file path 'image.jpg'.
-- Use the '
+- Use the 'owl_v2_image' tool with the prompt 'person' to detect and count the number of people in the image.
 plan2:
 - Load the image from the provided file path 'image.jpg'.
-- Use the '
+- Use the 'florence2_sam2_image' tool with the prompt 'person' to detect and count the number of people in the image.
 - Count the number of detected objects labeled as 'person'.
 plan3:
 - Load the image from the provided file path 'image.jpg'.
 - Use the 'countgd_counting' tool to count the dominant foreground object, which in this case is people.
 
 ```python
-from vision_agent.tools import load_image,
+from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_counting
 image = load_image("image.jpg")
-owl_v2_out =
+owl_v2_out = owl_v2_image("person", image)
 
-
-
+f2s2_out = florence2_sam2_image("person", image)
+# strip out the masks from the output becuase they don't provide useful information when printed
+f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
 
 cgd_out = countgd_counting(image)
 
-final_out = {{"
+final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "countgd_counting": cgd_out}}
+print(final_out)
+
+--- EXAMPLE2 ---
+plan1:
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
+- Use the 'owl_v2_image' tool with the prompt 'person' to detect where the people are in the video.
+plan2:
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
+- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
+plan3:
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
+- Use the 'countgd_counting' tool with the prompt 'person' to detect where the people are in the video.
+
+
+```python
+from vision_agent.tools import extract_frames, owl_v2_image, florence2_phrase_grounding, countgd_counting
+
+# sample at 1 FPS and use the first 10 frames to reduce processing time
+frames = extract_frames("video.mp4", 1)
+frames = [f[0] for f in frames][:10]
+
+# plan1
+owl_v2_out = [owl_v2_image("person", f) for f in frames]
+
+# plan2
+florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
+
+# plan3
+countgd_out = [countgd_counting(f) for f in frames]
+
+final_out = {{"owl_v2_image": owl_v2_out, "florencev2_object_detection": florencev2_out, "countgd_counting": cgd_out}}
 print(final_out)
 ```
 """
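For orientation, the per-frame loop in EXAMPLE2 could also be expressed with the owl_v2_video tool added in this release, which accepts the frame list directly. A minimal sketch, assuming the same 'video.mp4' input and the new 1 FPS / first-10-frames sampling rule:

```python
from vision_agent.tools import extract_frames, owl_v2_video

# sample at 1 FPS and keep only the first 10 frames, per the new test-case rule
frames_with_ts = extract_frames("video.mp4", 1)
frames = [frame for frame, _ in frames_with_ts][:10]

# owl_v2_video runs detection over the whole frame list at once and
# returns one list of detections per frame
owl_v2_out = owl_v2_video("person", frames)
print({"owl_v2_video": owl_v2_out})
```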
vision_agent/tools/__init__.py CHANGED
@@ -27,7 +27,7 @@ from .tools import (
     florence2_phrase_grounding,
     florence2_roberta_vqa,
     florence2_sam2_image,
-
+    florence2_sam2_video_tracking,
     generate_pose_image,
     generate_soft_edge_image,
     get_tool_documentation,
@@ -46,7 +46,8 @@ from .tools import (
     overlay_counting_results,
     overlay_heat_map,
     overlay_segmentation_masks,
-
+    owl_v2_image,
+    owl_v2_video,
     save_image,
     save_json,
     save_video,
vision_agent/tools/tools.py CHANGED
@@ -145,15 +145,15 @@ def grounding_dino(
     return return_data
 
 
-def
+def owl_v2_image(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.10,
 ) -> List[Dict[str, Any]]:
-    """'
-    prompt such as category names or referring expressions. The categories in
-    prompt are separated by commas. It returns a list of bounding boxes with
-    coordinates, label names and associated probability scores.
+    """'owl_v2_image' is a tool that can detect and count multiple objects given a text
+    prompt such as category names or referring expressions on images. The categories in
+    text prompt are separated by commas. It returns a list of bounding boxes with
+    normalized coordinates, label names and associated probability scores.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -170,32 +170,103 @@ def owl_v2(
 
     Example
     -------
-        >>>
+        >>> owl_v2_image("car, dinosaur", image)
         [
             {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
             {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
         ]
     """
     image_size = image.shape[:2]
-
-
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+    payload = {
         "prompts": [s.strip() for s in prompt.split(",")],
-        "
-        "
-        "function_name": "owl_v2",
+        "model": "owlv2",
+        "function_name": "owl_v2_image",
     }
-
-
+    resp_data = send_inference_request(
+        payload, "text-to-object-detection", files=files, v2=True
+    )
+    bboxes = resp_data[0]
+    bboxes_formatted = [
+        ODResponseData(
+            label=bbox["label"],
+            bbox=normalize_bbox(bbox["bounding_box"], image_size),
+            score=round(bbox["score"], 2),
+        )
+        for bbox in bboxes
+    ]
+    filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
+    return [bbox.model_dump() for bbox in filtered_bboxes]
+
+
+def owl_v2_video(
+    prompt: str,
+    frames: List[np.ndarray],
+    box_threshold: float = 0.10,
+) -> List[List[Dict[str, Any]]]:
+    """'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple
+    objects per frame given a text prompt sucha s a category name or referring
+    expression. The categories in text prompt are separated by commas. It returns a list
+    of lists where each inner list contains the score, label, and bounding box of the
+    detections for that frame.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        box_threshold (float, optional): The threshold for the box detection. Defaults
+            to 0.30.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
+            score, label, and bounding box of the detected objects with normalized
+            coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the
+            coordinates of the top-left and xmax and ymax are the coordinates of the
+            bottom-right of the bounding box.
+
+    Example
+    -------
+        >>> owl_v2_video("car, dinosaur", frames)
+        [
+            [
+                {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+                {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
+            ],
+            ...
+        ]
+    """
+    if len(frames) == 0:
+        raise ValueError("No frames provided")
+
+    image_size = frames[0].shape[:2]
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "model": "owlv2",
+        "function_name": "owl_v2_video",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "text-to-object-detection", files=files, v2=True
+    )
+    bboxes_formatted = []
     if data is not None:
-        for
-
-
-
-
-
-
-
+        for frame_data in data:
+            bboxes_formated_frame = []
+            for elt in frame_data:
+                bboxes_formated_frame.append(
+                    ODResponseData(
+                        label=elt["label"],  # type: ignore
+                        bbox=normalize_bbox(elt["bounding_box"], image_size),  # type: ignore
+                        score=round(elt["score"], 2),  # type: ignore
+                    )
+                )
+            bboxes_formatted.append(bboxes_formated_frame)
+
+    filtered_bboxes = [
+        filter_bboxes_by_threshold(elt, box_threshold) for elt in bboxes_formatted
+    ]
+    return [[bbox.model_dump() for bbox in frame] for frame in filtered_bboxes]
 
 
 def grounding_sam(
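For reference, a minimal usage sketch of the renamed image tool and the new video tool introduced in this hunk, assuming an RGB numpy image loaded with load_image and frames produced by extract_frames:

```python
from vision_agent.tools import load_image, extract_frames, owl_v2_image, owl_v2_video

image = load_image("image.jpg")
# single image: one list of {"score", "label", "bbox"} dicts with normalized boxes
people = owl_v2_image("person", image, box_threshold=0.10)

frames = [frame for frame, _ in extract_frames("video.mp4", 1)][:10]
# video: one inner list of detections per frame
people_per_frame = owl_v2_video("person", frames, box_threshold=0.10)
print(len(people), [len(dets) for dets in people_per_frame])
```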
@@ -317,14 +388,14 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]
     return return_data
 
 
-def
+def florence2_sam2_video_tracking(
     prompt: str, frames: List[np.ndarray]
 ) -> List[List[Dict[str, Any]]]:
-    """'
-    in a video given a text prompt such as category names or referring
-    can optionally separate the categories in the text with commas. It
-    entities present in the first frame and only returns segmentation
-    useful for tracking and counting without duplicating counts.
+    """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
+    entities in a video given a text prompt such as category names or referring
+    expressions. You can optionally separate the categories in the text with commas. It
+    only tracks entities present in the first frame and only returns segmentation
+    masks. It is useful for tracking and counting without duplicating counts.
 
     Parameters:
         prompt (str): The prompt to ground to the video.
@@ -351,14 +422,15 @@ def florence2_sam2_video(
             [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
         },
         ],
+        ...
     ]
     """
 
     buffer_bytes = frames_to_bytes(frames)
     files = [("video", buffer_bytes)]
     payload = {
-        "prompts": prompt.split(","),
-        "function_name": "
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "function_name": "florence2_sam2_video_tracking",
     }
     data: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
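A small usage sketch of the renamed tracking tool, assuming frames come from extract_frames and that the per-entity dictionaries expose a "mask" key as florence2_sam2_image does; the masks are stripped before printing since they are large numpy arrays:

```python
from vision_agent.tools import extract_frames, florence2_sam2_video_tracking

frames = [frame for frame, _ in extract_frames("video.mp4", 1)][:10]
tracked = florence2_sam2_video_tracking("person", frames)

# each frame gets a list of tracked entities; drop the mask arrays before printing
summary = [
    [{k: v for k, v in obj.items() if k != "mask"} for obj in frame]
    for frame in tracked
]
print(summary)
```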
@@ -549,7 +621,14 @@ def countgd_counting(
         payload, "text-to-object-detection", files=files, metadata=metadata
     )
     bboxes_per_frame = resp_data[0]
-    bboxes_formatted = [
+    bboxes_formatted = [
+        ODResponseData(
+            label=bbox["label"],
+            bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
+            score=round(bbox["score"], 2),
+        )
+        for bbox in bboxes_per_frame
+    ]
     filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
     return [bbox.model_dump() for bbox in filtered_bboxes]
 
@@ -601,7 +680,14 @@ def countgd_example_based_counting(
         payload, "visual-prompts-to-object-detection", files=files, metadata=metadata
     )
     bboxes_per_frame = resp_data[0]
-    bboxes_formatted = [
+    bboxes_formatted = [
+        ODResponseData(
+            label=bbox["label"],
+            bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
+            score=round(bbox["score"], 2),
+        )
+        for bbox in bboxes_per_frame
+    ]
     filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
     return [bbox.model_dump() for bbox in filtered_bboxes]
 
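For reference, a sketch of consuming the reshaped countgd_counting output; the call with only an image follows the prompt examples earlier in this diff, and the two-decimal rounding of bbox and score comes from the ODResponseData formatting above:

```python
from vision_agent.tools import load_image, countgd_counting

image = load_image("image.jpg")
detections = countgd_counting(image)  # list of {"label", "bbox", "score"} dicts

# tally detections per label; low-score boxes are already dropped by box_threshold
counts = {}
for det in detections:
    counts[det["label"]] = counts.get(det["label"], 0) + 1
print(counts)
```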
@@ -1374,12 +1460,12 @@ def closest_box_distance(
 def extract_frames(
     video_uri: Union[str, Path], fps: float = 1
 ) -> List[Tuple[np.ndarray, float]]:
-    """'extract_frames' extracts frames from a video which can be a file path or
-    link, returns a list of tuples (frame, timestamp), where timestamp is the
-    time in seconds where the frame was captured. The frame is a numpy array.
+    """'extract_frames' extracts frames from a video which can be a file path, url or
+    youtube link, returns a list of tuples (frame, timestamp), where timestamp is the
+    relative time in seconds where the frame was captured. The frame is a numpy array.
 
     Parameters:
-        video_uri (Union[str, Path]): The path to the video file or youtube link
+        video_uri (Union[str, Path]): The path to the video file, url or youtube link
         fps (float, optional): The frame rate per second to extract the frames. Defaults
             to 10.
 
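A short usage sketch of extract_frames that also uses the timestamp half of each tuple, assuming a local file path:

```python
from vision_agent.tools import extract_frames

# returns (frame, timestamp-in-seconds) tuples, sampled at the requested fps
frames_with_ts = extract_frames("video.mp4", fps=1)
for frame, ts in frames_with_ts[:3]:
    print(f"t={ts:.2f}s shape={frame.shape}")
```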
@@ -1518,7 +1604,9 @@ def save_video(
         raise ValueError(f"fps must be greater than 0 got {fps}")
 
     if output_video_path is None:
-        output_video_path = tempfile.NamedTemporaryFile(
+        output_video_path = tempfile.NamedTemporaryFile(
+            delete=False, suffix=".mp4"
+        ).name
 
     output_video_path = video_writer(frames, fps, output_video_path)
     _save_video_to_result(output_video_path)
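The temp-file change matters because the path, not the open handle, is handed to the encoder. A minimal sketch of the same standard-library pattern:

```python
import tempfile

# delete=False keeps the file on disk after the handle closes, so the encoder
# can write to the path later; the .mp4 suffix lets the muxer pick the container format
output_video_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
print(output_video_path)
```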
@@ -1818,7 +1906,8 @@ def overlay_counting_results(
 
 
 FUNCTION_TOOLS = [
-
+    owl_v2_image,
+    owl_v2_video,
     ocr,
     clip,
     vit_image_classification,
@@ -1827,7 +1916,7 @@ FUNCTION_TOOLS = [
     florence2_image_caption,
     florence2_ocr,
     florence2_sam2_image,
-
+    florence2_sam2_video_tracking,
     florence2_phrase_grounding,
     ixc25_image_vqa,
     ixc25_video_vqa,
vision_agent/utils/video.py CHANGED
@@ -4,6 +4,7 @@ import tempfile
 from functools import lru_cache
 from typing import List, Optional, Tuple
 
+import av  # type: ignore
 import cv2
 import numpy as np
 from decord import VideoReader  # type: ignore
@@ -43,18 +44,36 @@ def play_video(video_base64: str) -> None:
     cv2.destroyAllWindows()
 
 
+def _resize_frame(frame: np.ndarray) -> np.ndarray:
+    height, width = frame.shape[:2]
+    new_width = width - (width % 2)
+    new_height = height - (height % 2)
+    return cv2.resize(frame, (new_width, new_height))
+
+
 def video_writer(
     frames: List[np.ndarray], fps: float = 1.0, filename: Optional[str] = None
 ) -> str:
     if filename is None:
         filename = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
-
-
+    container = av.open(filename, mode="w")
+    stream = container.add_stream("h264", rate=fps)
     height, width = frames[0].shape[:2]
-
+    stream.height = height - (height % 2)
+    stream.width = width - (width % 2)
+    stream.pix_fmt = "yuv420p"
     for frame in frames:
-
-
+        # Remove the alpha channel (convert RGBA to RGB)
+        frame_rgb = frame[:, :, :3]
+        # Resize the frame to make dimensions divisible by 2
+        frame_rgb = _resize_frame(frame_rgb)
+        av_frame = av.VideoFrame.from_ndarray(frame_rgb, format="rgb24")
+        for packet in stream.encode(av_frame):
+            container.mux(packet)
+
+    for packet in stream.encode():
+        container.mux(packet)
+    container.close()
     return filename
 
 
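A small usage sketch of the PyAV-backed writer, assuming uint8 RGB (or RGBA) frames; yuv420p chroma subsampling needs even frame dimensions, which is why both the stream size and _resize_frame round down to the nearest even number:

```python
import numpy as np
from vision_agent.utils.video import video_writer

# three synthetic 100x100 RGB frames (even dimensions, uint8)
frames = [np.full((100, 100, 3), fill_value=i * 80, dtype=np.uint8) for i in range(3)]
path = video_writer(frames, fps=1.0)
print("wrote", path)
```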
{vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.126
+Version: 0.2.128
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -10,6 +10,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Requires-Dist: anthropic (>=0.31.0,<0.32.0)
+Requires-Dist: av (>=11.0.0,<12.0.0)
 Requires-Dist: e2b (>=0.17.2a50,<0.18.0)
 Requires-Dist: e2b-code-interpreter (==0.0.11a37)
 Requires-Dist: eva-decord (>=0.6.1,<0.7.0)
{vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/RECORD CHANGED
@@ -4,7 +4,7 @@ vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,5
 vision_agent/agent/agent_utils.py,sha256=22LiPhkJlS5mVeo2dIi259pc2NgA7PGHRpcbnrtKo78,1930
 vision_agent/agent/vision_agent.py,sha256=WM1_o0VAQokAKlDr-0lpFxCRwUm_eFfFNWP-wSNjo7s,11180
 vision_agent/agent/vision_agent_coder.py,sha256=_2QQd_nTGojkk2ZOiMevVCY6-eUA9q1QdCWH7-Noq4w,34237
-vision_agent/agent/vision_agent_coder_prompts.py,sha256=
+vision_agent/agent/vision_agent_coder_prompts.py,sha256=nj4iRRSAWYHjKqyUSp12aTCV1D5iUVCHeezVXoozS4M,12687
 vision_agent/agent/vision_agent_prompts.py,sha256=K1nLo3XKQ-IqCom1TRwh3cMoGZNxNwEgZqf3uJ6eL18,7221
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
@@ -14,11 +14,11 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
 vision_agent/lmm/lmm.py,sha256=092oefI65_QSRvQm2znXkjTdzlZTh-Ni_38610kfbJg,16836
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=
+vision_agent/tools/__init__.py,sha256=nx60_hujcnLz3d2wQlCbcerUmT6R2vxRy66IsQjdB3M,2364
 vision_agent/tools/meta_tools.py,sha256=KeGiw2OtY8ARpGbtWjoNAoO1dwevt7LbCupaJX61MkE,18929
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=62NVlojPMf9MuJ-3yJEcrB3mzmOxN2HrNQzzjVa-FZg,7527
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=p6QUo7V03UZOKBAGfabVWdPm9vUT9tyP_utCv0yKfcY,68659
 vision_agent/tools/tools_types.py,sha256=rLpCUODPY0yI65SLOTJOxfHFfqWM3WjOq-AYX25Chjk,2356
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -26,8 +26,8 @@ vision_agent/utils/execute.py,sha256=gc4R_0BKUrZyhiKvIxOpYuzQPYVWQEqxr3ANy1lJAw4
 vision_agent/utils/image_utils.py,sha256=zTTOJFOieMzwIquTFnW7T6ssx9o6XfoZ0Unqyk7GJrg,10746
 vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
-vision_agent/utils/video.py,sha256=
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent/utils/video.py,sha256=GmJqu_3WhBMEwP4HToMMp8EwgftliHSpv5nd-QEDOcs,4528
+vision_agent-0.2.128.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.128.dist-info/METADATA,sha256=4E1im4aLvJnSR-tKxWUtKyJ0ZbkHxYMYxfqGz_0Layw,12295
+vision_agent-0.2.128.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.128.dist-info/RECORD,,
{vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/LICENSE
File without changes
{vision_agent-0.2.126.dist-info → vision_agent-0.2.128.dist-info}/WHEEL
File without changes