vision-agent 0.2.10__py3-none-any.whl → 0.2.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/__init__.py +2 -0
- vision_agent/agent/agent_coder.py +196 -0
- vision_agent/agent/agent_coder_prompts.py +135 -0
- vision_agent/agent/vision_agent.py +46 -30
- vision_agent/agent/vision_agent_prompts.py +3 -3
- vision_agent/agent/vision_agent_v2.py +396 -0
- vision_agent/agent/vision_agent_v2_prompt.py +185 -0
- vision_agent/llm/llm.py +12 -4
- vision_agent/tools/__init__.py +3 -1
- vision_agent/tools/tool_utils.py +30 -0
- vision_agent/tools/tools.py +157 -79
- vision_agent/tools/tools_v2.py +442 -0
- vision_agent/utils/__init__.py +3 -0
- vision_agent/utils/execute.py +104 -0
- vision_agent/utils/sim.py +85 -0
- {vision_agent-0.2.10.dist-info → vision_agent-0.2.22.dist-info}/METADATA +7 -3
- vision_agent-0.2.22.dist-info/RECORD +34 -0
- vision_agent-0.2.10.dist-info/RECORD +0 -25
- /vision_agent/{image_utils.py → utils/image_utils.py} +0 -0
- /vision_agent/{type_defs.py → utils/type_defs.py} +0 -0
- /vision_agent/{tools → utils}/video.py +0 -0
- {vision_agent-0.2.10.dist-info → vision_agent-0.2.22.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.10.dist-info → vision_agent-0.2.22.dist-info}/WHEEL +0 -0
vision_agent/tools/tools_v2.py (new file)
@@ -0,0 +1,442 @@
```python
import inspect
import io
import logging
import tempfile
from importlib import resources
from pathlib import Path
from typing import Any, Callable, Dict, List, Tuple, Union

import numpy as np
import pandas as pd
import requests
from PIL import Image, ImageDraw, ImageFont

from vision_agent.tools.tool_utils import _send_inference_request
from vision_agent.utils import extract_frames_from_video
from vision_agent.utils.image_utils import convert_to_b64, normalize_bbox, rle_decode

COLORS = [
    (158, 218, 229),
    (219, 219, 141),
    (23, 190, 207),
    (188, 189, 34),
    (199, 199, 199),
    (247, 182, 210),
    (127, 127, 127),
    (227, 119, 194),
    (196, 156, 148),
    (197, 176, 213),
    (140, 86, 75),
    (148, 103, 189),
    (255, 152, 150),
    (152, 223, 138),
    (214, 39, 40),
    (44, 160, 44),
    (255, 187, 120),
    (174, 199, 232),
    (255, 127, 14),
    (31, 119, 180),
]
_API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV"
_OCR_URL = "https://app.landing.ai/ocr/v1/detect-text"
logging.basicConfig(level=logging.INFO)
_LOGGER = logging.getLogger(__name__)


def grounding_dino(
    prompt: str,
    image: np.ndarray,
    box_threshold: float = 0.20,
    iou_threshold: float = 0.75,
) -> List[Dict[str, Any]]:
    """'grounding_dino' is a tool that can detect and count objects given a text prompt
    such as category names or referring expressions. It returns a list and count of
    bounding boxes, label names and associated probability scores.

    Parameters:
        prompt (str): The prompt to ground to the image.
        image (np.ndarray): The image to ground the prompt to.
        box_threshold (float, optional): The threshold for the box detection. Defaults
            to 0.20.
        iou_threshold (float, optional): The threshold for the Intersection over Union
            (IoU). Defaults to 0.75.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
            bounding box of the detected objects with normalized coordinates
            (x1, y1, x2, y2).

    Example
    -------
        >>> grounding_dino("car. dinosaur", image)
        [
            {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
            {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
        ]
    """
    image_size = image.shape[:2]
    image_b64 = convert_to_b64(Image.fromarray(image))
    request_data = {
        "prompt": prompt,
        "image": image_b64,
        "tool": "visual_grounding",
        "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
    }
    data: Dict[str, Any] = _send_inference_request(request_data, "tools")
    return_data = []
    for i in range(len(data["bboxes"])):
        return_data.append(
            {
                "score": round(data["scores"][i], 2),
                "label": data["labels"][i],
                "bbox": normalize_bbox(data["bboxes"][i], image_size),
            }
        )
    return return_data


def grounding_sam(
    prompt: str,
    image: np.ndarray,
    box_threshold: float = 0.20,
    iou_threshold: float = 0.75,
) -> List[Dict[str, Any]]:
    """'grounding_sam' is a tool that can detect and segment objects given a text
    prompt such as category names or referring expressions. It returns a list of
    bounding boxes, label names and masks file names and associated probability scores.

    Parameters:
        prompt (str): The prompt to ground to the image.
        image (np.ndarray): The image to ground the prompt to.
        box_threshold (float, optional): The threshold for the box detection. Defaults
            to 0.20.
        iou_threshold (float, optional): The threshold for the Intersection over Union
            (IoU). Defaults to 0.75.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
            bounding box, and mask of the detected objects with normalized coordinates
            (x1, y1, x2, y2).

    Example
    -------
        >>> grounding_sam("car. dinosaur", image)
        [
            {
                'score': 0.99,
                'label': 'dinosaur',
                'bbox': [0.1, 0.11, 0.35, 0.4],
                'mask': array([[0, 0, 0, ..., 0, 0, 0],
                    [0, 0, 0, ..., 0, 0, 0],
                    ...,
                    [0, 0, 0, ..., 0, 0, 0],
                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
            },
        ]
    """
    image_size = image.shape[:2]
    image_b64 = convert_to_b64(Image.fromarray(image))
    request_data = {
        "prompt": prompt,
        "image": image_b64,
        "tool": "visual_grounding_segment",
        "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
    }
    data: Dict[str, Any] = _send_inference_request(request_data, "tools")
    return_data = []
    for i in range(len(data["bboxes"])):
        return_data.append(
            {
                "score": round(data["scores"][i], 2),
                "label": data["labels"][i],
                "bbox": normalize_bbox(data["bboxes"][i], image_size),
                "mask": rle_decode(mask_rle=data["masks"][i], shape=data["mask_shape"]),
            }
        )
    return return_data


def extract_frames(
    video_uri: Union[str, Path], fps: float = 0.5
) -> List[Tuple[np.ndarray, float]]:
    """'extract_frames' extracts frames from a video, returns a list of tuples (frame,
    timestamp), where timestamp is the relative time in seconds where the frame was
    captured. The frame is a local image file path.

    Parameters:
        video_uri (Union[str, Path]): The path to the video file.
        fps (float, optional): The frame rate per second to extract the frames. Defaults
            to 0.5.

    Returns:
        List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
            and the timestamp in seconds.

    Example
    -------
        >>> extract_frames("path/to/video.mp4")
        [(frame1, 0.0), (frame2, 0.5), ...]
    """

    return extract_frames_from_video(str(video_uri), fps)


def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
    """'ocr' extracts text from an image. It returns a list of detected text, bounding
    boxes, and confidence scores.

    Parameters:
        image (np.ndarray): The image to extract text from.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox,
            and confidence score.

    Example
    -------
        >>> ocr(image)
        [
            {'label': 'some text', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
        ]
    """

    pil_image = Image.fromarray(image).convert("RGB")
    image_size = pil_image.size[::-1]
    image_buffer = io.BytesIO()
    pil_image.save(image_buffer, format="PNG")
    buffer_bytes = image_buffer.getvalue()
    image_buffer.close()

    res = requests.post(
        _OCR_URL,
        files={"images": buffer_bytes},
        data={"language": "en"},
        headers={"contentType": "multipart/form-data", "apikey": _API_KEY},
    )

    if res.status_code != 200:
        raise ValueError(f"OCR request failed with status code {res.status_code}")

    data = res.json()
    output = []
    for det in data[0]:
        label = det["text"]
        box = [
            det["location"][0]["x"],
            det["location"][0]["y"],
            det["location"][2]["x"],
            det["location"][2]["y"],
        ]
        box = normalize_bbox(box, image_size)
        output.append({"label": label, "bbox": box, "score": round(det["score"], 2)})

    return output


# Utility and visualization functions


def load_image(image_path: str) -> np.ndarray:
    """'load_image' is a utility function that loads an image from the given path.

    Parameters:
        image_path (str): The path to the image.

    Returns:
        np.ndarray: The image as a NumPy array.

    Example
    -------
        >>> load_image("path/to/image.jpg")
    """

    image = Image.open(image_path).convert("RGB")
    return np.array(image)


def save_image(image: np.ndarray) -> str:
    """'save_image' is a utility function that saves an image as a temporary file.

    Parameters:
        image (np.ndarray): The image to save.

    Returns:
        str: The path to the saved image.

    Example
    -------
        >>> save_image(image)
        "/tmp/tmpabc123.png"
    """

    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
        pil_image = Image.fromarray(image.astype(np.uint8))
        pil_image.save(f, "PNG")
        return f.name


def overlay_bounding_boxes(
    image: np.ndarray, bboxes: List[Dict[str, Any]]
) -> np.ndarray:
    """'display_bounding_boxes' is a utility function that displays bounding boxes on
    an image.

    Parameters:
        image (np.ndarray): The image to display the bounding boxes on.
        bboxes (List[Dict[str, Any]]): A list of dictionaries containing the bounding
            boxes.

    Returns:
        np.ndarray: The image with the bounding boxes, labels and scores displayed.

    Example
    -------
        >>> image_with_bboxes = display_bounding_boxes(
            image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
        )
    """
    pil_image = Image.fromarray(image.astype(np.uint8))

    if len(set([box["label"] for box in bboxes])) > len(COLORS):
        _LOGGER.warning(
            "Number of unique labels exceeds the number of available colors. Some labels may have the same color."
        )

    color = {
        label: COLORS[i % len(COLORS)]
        for i, label in enumerate(set([box["label"] for box in bboxes]))
    }

    width, height = pil_image.size
    fontsize = max(12, int(min(width, height) / 40))
    draw = ImageDraw.Draw(pil_image)
    font = ImageFont.truetype(
        str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
        fontsize,
    )

    for elt in bboxes:
        label = elt["label"]
        box = elt["bbox"]
        scores = elt["score"]

        box = [
            int(box[0] * width),
            int(box[1] * height),
            int(box[2] * width),
            int(box[3] * height),
        ]
        draw.rectangle(box, outline=color[label], width=4)
        text = f"{label}: {scores:.2f}"
        text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
        draw.rectangle((box[0], box[1], text_box[2], text_box[3]), fill=color[label])
        draw.text((box[0], box[1]), text, fill="black", font=font)
    return np.array(pil_image.convert("RGB"))


def overlay_segmentation_masks(
    image: np.ndarray, masks: List[Dict[str, Any]]
) -> np.ndarray:
    """'display_segmentation_masks' is a utility function that displays segmentation
    masks.

    Parameters:
        image (np.ndarray): The image to display the masks on.
        masks (List[Dict[str, Any]]): A list of dictionaries containing the masks.

    Returns:
        np.ndarray: The image with the masks displayed.

    Example
    -------
        >>> image_with_masks = display_segmentation_masks(
            image,
            [{
                'score': 0.99,
                'label': 'dinosaur',
                'mask': array([[0, 0, 0, ..., 0, 0, 0],
                    [0, 0, 0, ..., 0, 0, 0],
                    ...,
                    [0, 0, 0, ..., 0, 0, 0],
                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
            }],
        )
    """
    pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA")

    if len(set([mask["label"] for mask in masks])) > len(COLORS):
        _LOGGER.warning(
            "Number of unique labels exceeds the number of available colors. Some labels may have the same color."
        )

    color = {
        label: COLORS[i % len(COLORS)]
        for i, label in enumerate(set([mask["label"] for mask in masks]))
    }

    for elt in masks:
        mask = elt["mask"]
        label = elt["label"]
        np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
        np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
        mask_img = Image.fromarray(np_mask.astype(np.uint8))
        pil_image = Image.alpha_composite(pil_image, mask_img)
    return np.array(pil_image.convert("RGB"))


def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
    docstrings = ""
    for func in funcs:
        docstrings += f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}\n\n"

    return docstrings


def get_tool_descriptions(funcs: List[Callable[..., Any]]) -> str:
    descriptions = ""
    for func in funcs:
        description = func.__doc__
        if description is None:
            description = ""

        description = (
            description[: description.find("Parameters:")].replace("\n", " ").strip()
        )
        description = " ".join(description.split())
        descriptions += f"- {func.__name__}{inspect.signature(func)}: {description}\n"
    return descriptions


def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
    data: Dict[str, List[str]] = {"desc": [], "doc": []}

    for func in funcs:
        desc = func.__doc__
        if desc is None:
            desc = ""
        desc = desc[: desc.find("Parameters:")].replace("\n", " ").strip()
        desc = " ".join(desc.split())

        doc = f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}"
        data["desc"].append(desc)
        data["doc"].append(doc)

    return pd.DataFrame(data)  # type: ignore


TOOLS = [
    grounding_dino,
    grounding_sam,
    extract_frames,
    ocr,
    load_image,
    save_image,
    overlay_bounding_boxes,
    overlay_segmentation_masks,
]
TOOLS_DF = get_tools_df(TOOLS)  # type: ignore
TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore
TOOL_DOCSTRING = get_tool_documentation(TOOLS)  # type: ignore
UTILITIES_DOCSTRING = get_tool_documentation(
    [load_image, save_image, overlay_bounding_boxes]
)
```
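For orientation, here is a minimal usage sketch of the new functional tools (not part of the diff). It assumes a local image at `path/to/image.jpg` and network access to the inference endpoint that `_send_inference_request` calls; the import path follows the `vision_agent/tools/tools_v2.py` location shown above.

```python
# Hypothetical example, not part of the package diff.
from vision_agent.tools.tools_v2 import (
    grounding_dino,
    load_image,
    overlay_bounding_boxes,
    save_image,
)

image = load_image("path/to/image.jpg")            # RGB image as np.ndarray
detections = grounding_dino("car. person", image)  # [{'score', 'label', 'bbox'}, ...]
annotated = overlay_bounding_boxes(image, detections)
print(save_image(annotated))                       # path to a temporary PNG on disk
```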
vision_agent/utils/execute.py (new file)
@@ -0,0 +1,104 @@
```python
"""This code is adapted from MetaGPT's https://github.com/geekan/MetaGPT/blob/main/metagpt/actions/di/execute_nb_code.py
"""

import base64 as b64
import io
import re
from typing import Dict, List, Tuple

import nbformat
from nbclient import NotebookClient
from nbclient.exceptions import CellTimeoutError, DeadKernelError
from nbclient.util import run_sync
from nbformat import NotebookNode
from nbformat.v4 import new_code_cell
from PIL import Image


def remove_escape_and_color_codes(input_str: str) -> str:
    pattern = re.compile(r"\x1b\[[0-9;]*[mK]")
    result = pattern.sub("", input_str)
    return result


def parse_outputs(outputs: List[Dict]) -> Tuple[bool, str]:
    success, parsed_output = True, []
    for output in outputs:
        # TODO: add parse image data
        if output["output_type"] == "stream":
            parsed_output.append(output["text"])
        elif output["output_type"] == "text/plain":
            parsed_output.append(output["data"]["text/plain"])
        elif output["output_type"] == "display_data":
            if "image/png" in output["data"]:
                image_bytes = b64.b64decode(output["data"]["image/png"])
                Image.open(io.BytesIO(image_bytes)).show()
        elif output["output_type"] == "error":
            success = False
            output_text = remove_escape_and_color_codes("\n".join(output["traceback"]))
            parsed_output.append(output_text)

    return success, ",".join(parsed_output)


class Execute:
    def __init__(self, timeout: int = 600) -> None:
        self.nb = nbformat.v4.new_notebook()
        self.timeout = timeout
        self.nb_client = NotebookClient(self.nb, timeout=self.timeout)

    def build(self) -> None:
        if self.nb_client.kc is None or not run_sync(self.nb_client.kc.is_alive)():  # type: ignore
            self.nb_client.create_kernel_manager()
            self.nb_client.start_new_kernel()
            self.nb_client.start_new_kernel_client()

    def terminate(self) -> None:
        if self.nb_client.km is not None and run_sync(self.nb_client.km.is_alive)():  # type: ignore
            run_sync(self.nb_client.km.shutdown_kernel)(now=True)
            run_sync(self.nb_client.km.cleanup_resources)()

            channels = [
                self.nb_client.kc.stdin_channel,
                self.nb_client.kc.hb_channel,
                self.nb_client.kc.control_channel,
            ]

            for ch in channels:
                if ch.is_alive():
                    ch.stop()

            self.nb_client.kc = None
            self.nb_client.km = None

    def reset(self) -> None:
        self.terminate()
        self.nb = nbformat.v4.new_notebook()
        self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
        self.build()

    def run_cell(self, cell: NotebookNode, cell_index: int) -> Tuple[bool, str]:
        try:
            self.nb_client.execute_cell(cell, cell_index)
            return parse_outputs(self.nb.cells[-1].outputs)
        except CellTimeoutError:
            run_sync(self.nb_client.km.interrupt_kernel)()  # type: ignore
            return False, "Cell execution timed out."
        except DeadKernelError:
            self.reset()
            return False, "DeadKernelError"
        except Exception:
            return parse_outputs(self.nb.cells[-1].outputs)

    def add_code_cell(self, code: str) -> None:
        self.nb.cells.append(new_code_cell(code))

    def run_additional(self, code: str) -> Tuple[bool, str]:
        self.build()
        self.add_code_cell(code)
        return self.run_cell(self.nb.cells[-1], len(self.nb.cells) - 1)

    def run_isolation(self, code: str) -> Tuple[bool, str]:
        self.reset()
        self.add_code_cell(code)
        return self.run_cell(self.nb.cells[-1], len(self.nb.cells) - 1)
```
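A minimal sketch of how the new `Execute` wrapper might be used (not part of the diff; assumes a local Jupyter kernel can be started via `nbclient`, and that the module is importable as `vision_agent.utils.execute` per the file list above):

```python
# Hypothetical example, not part of the package diff.
from vision_agent.utils.execute import Execute

exe = Execute(timeout=60)
ok, out = exe.run_isolation("x = 21\nprint(x * 2)")  # fresh kernel, run one cell
print(ok, out)                                       # expected: True "42\n"
ok, out = exe.run_additional("print(x + 1)")         # same kernel, state persists
print(ok, out)                                       # expected: True "22\n"
exe.terminate()
```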
vision_agent/utils/sim.py (new file)
@@ -0,0 +1,85 @@
```python
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Union

import numpy as np
import pandas as pd
from openai import Client
from scipy.spatial.distance import cosine  # type: ignore


def get_embedding(
    client: Client, text: str, model: str = "text-embedding-3-small"
) -> List[float]:
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding


class Sim:
    def __init__(
        self,
        df: pd.DataFrame,
        sim_key: Optional[str] = None,
        api_key: Optional[str] = None,
        model: str = "text-embedding-3-small",
    ) -> None:
        """Creates a similarity object that can be used to find similar items in a
        dataframe.

        Parameters:
            df: pd.DataFrame: The dataframe to use for similarity.
            sim_key: Optional[str]: The column name that you want to use to construct
                the embeddings.
            model: str: The model to use for embeddings.
        """
        self.df = df
        if not api_key:
            self.client = Client()
        else:
            self.client = Client(api_key=api_key)

        self.model = model
        if "embs" not in df.columns and sim_key is None:
            raise ValueError("key is required if no column 'embs' is present.")

        if sim_key is not None:
            self.df["embs"] = self.df[sim_key].apply(
                lambda x: get_embedding(self.client, x, model=self.model)
            )

    def save(self, sim_file: Union[str, Path]) -> None:
        sim_file = Path(sim_file)
        sim_file.mkdir(parents=True, exist_ok=True)

        df = self.df.copy()
        embs = np.array(df.embs.tolist())
        np.save(sim_file / "embs.npy", embs)
        df = df.drop("embs", axis=1)
        df.to_csv(sim_file / "df.csv", index=False)

    def top_k(self, query: str, k: int = 5) -> Sequence[Dict]:
        """Returns the top k most similar items to the query.

        Parameters:
            query: str: The query to compare to.
            k: int: The number of items to return.

        Returns:
            Sequence[Dict]: The top k most similar items.
        """

        embedding = get_embedding(self.client, query, model=self.model)
        self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
        res = self.df.sort_values("sim", ascending=False).head(k)
        return res[[c for c in res.columns if c != "embs"]].to_dict(orient="records")


def merge_sim(sim1: Sim, sim2: Sim) -> Sim:
    return Sim(pd.concat([sim1.df, sim2.df], ignore_index=True))


def load_sim(sim_file: Union[str, Path]) -> Sim:
    sim_file = Path(sim_file)
    df = pd.read_csv(sim_file / "df.csv")
    embs = np.load(sim_file / "embs.npy")
    df["embs"] = list(embs)
    return Sim(df)
```
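A minimal sketch of how `Sim` could be paired with the `TOOLS_DF` dataframe from `tools_v2.py` for tool retrieval (not part of the diff; pairing the two is an assumption, and the embedding calls require an OpenAI API key via `OPENAI_API_KEY` or the `api_key` argument):

```python
# Hypothetical example, not part of the package diff.
from vision_agent.tools.tools_v2 import TOOLS_DF
from vision_agent.utils.sim import Sim, load_sim

sim = Sim(TOOLS_DF, sim_key="desc")     # embed the "desc" column once
for match in sim.top_k("detect objects in an image", k=2):
    print(match["desc"])

sim.save("tool_index")                  # writes df.csv and embs.npy to a directory
sim_reloaded = load_sim("tool_index")   # reload without re-embedding
```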
{vision_agent-0.2.10.dist-info → vision_agent-0.2.22.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.10
+Version: 0.2.22
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -10,6 +10,8 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Requires-Dist: moviepy (>=1.0.0,<2.0.0)
+Requires-Dist: nbclient (>=0.10.0,<0.11.0)
+Requires-Dist: nbformat (>=5.10.4,<6.0.0)
 Requires-Dist: numpy (>=1.21.0,<2.0.0)
 Requires-Dist: openai (>=1.0.0,<2.0.0)
 Requires-Dist: opencv-python-headless (>=4.0.0,<5.0.0)
@@ -17,6 +19,8 @@ Requires-Dist: pandas (>=2.0.0,<3.0.0)
 Requires-Dist: pillow (>=10.0.0,<11.0.0)
 Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
 Requires-Dist: requests (>=2.0.0,<3.0.0)
+Requires-Dist: rich (>=13.7.1,<14.0.0)
+Requires-Dist: scipy (>=1.13.0,<1.14.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
 Requires-Dist: tqdm (>=4.64.0,<5.0.0)
 Requires-Dist: typing_extensions (>=4.0.0,<5.0.0)
@@ -149,7 +153,7 @@ you. For example:
 
 #### Custom Tools
 You can also add your own custom tools for your vision agent to use:
-
+
 ```python
 from vision_agent.tools import Tool, register_tool
 @register_tool
@@ -180,13 +184,13 @@ find an example that creates a custom tool for template matching [here](examples
 | GroundingDINO | GroundingDINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. |
 | GroundingSAM | GroundingSAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. |
 | DINOv | DINOv is a tool that can detect arbitrary objects with using a referring mask. |
-| ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. |
 | BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. |
 | SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. |
 | BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
 | SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
 | BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
+| MaskDistance | MaskDistance returns the minimum distance between two segmentation masks in pixel units |
 | BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is good for check if one box is contained within another box. |
 | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image. |