vision-agent 0.2.10__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,442 @@
+ import inspect
+ import io
+ import logging
+ import tempfile
+ from importlib import resources
+ from pathlib import Path
+ from typing import Any, Callable, Dict, List, Tuple, Union
+
+ import numpy as np
+ import pandas as pd
+ import requests
+ from PIL import Image, ImageDraw, ImageFont
+
+ from vision_agent.tools.tool_utils import _send_inference_request
+ from vision_agent.utils import extract_frames_from_video
+ from vision_agent.utils.image_utils import convert_to_b64, normalize_bbox, rle_decode
+
+ COLORS = [
+     (158, 218, 229),
+     (219, 219, 141),
+     (23, 190, 207),
+     (188, 189, 34),
+     (199, 199, 199),
+     (247, 182, 210),
+     (127, 127, 127),
+     (227, 119, 194),
+     (196, 156, 148),
+     (197, 176, 213),
+     (140, 86, 75),
+     (148, 103, 189),
+     (255, 152, 150),
+     (152, 223, 138),
+     (214, 39, 40),
+     (44, 160, 44),
+     (255, 187, 120),
+     (174, 199, 232),
+     (255, 127, 14),
+     (31, 119, 180),
+ ]
+ _API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV"
+ _OCR_URL = "https://app.landing.ai/ocr/v1/detect-text"
+ logging.basicConfig(level=logging.INFO)
+ _LOGGER = logging.getLogger(__name__)
+
+
+ def grounding_dino(
+     prompt: str,
+     image: np.ndarray,
+     box_threshold: float = 0.20,
+     iou_threshold: float = 0.75,
+ ) -> List[Dict[str, Any]]:
+     """'grounding_dino' is a tool that can detect and count objects given a text prompt
+     such as category names or referring expressions. It returns a list and count of
+     bounding boxes, label names and associated probability scores.
+
+     Parameters:
+         prompt (str): The prompt to ground to the image.
+         image (np.ndarray): The image to ground the prompt to.
+         box_threshold (float, optional): The threshold for the box detection. Defaults
+             to 0.20.
+         iou_threshold (float, optional): The threshold for the Intersection over Union
+             (IoU). Defaults to 0.75.
+
+     Returns:
+         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+             bounding box of the detected objects with normalized coordinates
+             (x1, y1, x2, y2).
+
+     Example
+     -------
+     >>> grounding_dino("car. dinosaur", image)
+     [
+         {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+         {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
+     ]
+     """
+     image_size = image.shape[:2]
+     image_b64 = convert_to_b64(Image.fromarray(image))
+     request_data = {
+         "prompt": prompt,
+         "image": image_b64,
+         "tool": "visual_grounding",
+         "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+     }
+     data: Dict[str, Any] = _send_inference_request(request_data, "tools")
+     return_data = []
+     for i in range(len(data["bboxes"])):
+         return_data.append(
+             {
+                 "score": round(data["scores"][i], 2),
+                 "label": data["labels"][i],
+                 "bbox": normalize_bbox(data["bboxes"][i], image_size),
+             }
+         )
+     return return_data
+
+
+ def grounding_sam(
+     prompt: str,
+     image: np.ndarray,
+     box_threshold: float = 0.20,
+     iou_threshold: float = 0.75,
+ ) -> List[Dict[str, Any]]:
+     """'grounding_sam' is a tool that can detect and segment objects given a text
+     prompt such as category names or referring expressions. It returns a list of
+     bounding boxes, label names, masks, and associated probability scores.
+
+     Parameters:
+         prompt (str): The prompt to ground to the image.
+         image (np.ndarray): The image to ground the prompt to.
+         box_threshold (float, optional): The threshold for the box detection. Defaults
+             to 0.20.
+         iou_threshold (float, optional): The threshold for the Intersection over Union
+             (IoU). Defaults to 0.75.
+
+     Returns:
+         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+             bounding box, and mask of the detected objects with normalized coordinates
+             (x1, y1, x2, y2).
+
+     Example
+     -------
+     >>> grounding_sam("car. dinosaur", image)
+     [
+         {
+             'score': 0.99,
+             'label': 'dinosaur',
+             'bbox': [0.1, 0.11, 0.35, 0.4],
+             'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                 [0, 0, 0, ..., 0, 0, 0],
+                 ...,
+                 [0, 0, 0, ..., 0, 0, 0],
+                 [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+         },
+     ]
+     """
+     image_size = image.shape[:2]
+     image_b64 = convert_to_b64(Image.fromarray(image))
+     request_data = {
+         "prompt": prompt,
+         "image": image_b64,
+         "tool": "visual_grounding_segment",
+         "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+     }
+     data: Dict[str, Any] = _send_inference_request(request_data, "tools")
+     return_data = []
+     for i in range(len(data["bboxes"])):
+         return_data.append(
+             {
+                 "score": round(data["scores"][i], 2),
+                 "label": data["labels"][i],
+                 "bbox": normalize_bbox(data["bboxes"][i], image_size),
+                 "mask": rle_decode(mask_rle=data["masks"][i], shape=data["mask_shape"]),
+             }
+         )
+     return return_data
+
+
+ def extract_frames(
+     video_uri: Union[str, Path], fps: float = 0.5
+ ) -> List[Tuple[np.ndarray, float]]:
+     """'extract_frames' extracts frames from a video and returns a list of tuples
+     (frame, timestamp), where the frame is a numpy array and the timestamp is the
+     relative time in seconds at which the frame was captured.
+
+     Parameters:
+         video_uri (Union[str, Path]): The path to the video file.
+         fps (float, optional): The frame rate per second to extract the frames. Defaults
+             to 0.5.
+
+     Returns:
+         List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
+             and the timestamp in seconds.
+
+     Example
+     -------
+     >>> extract_frames("path/to/video.mp4")
+     [(frame1, 0.0), (frame2, 0.5), ...]
+     """
+
+     return extract_frames_from_video(str(video_uri), fps)
+
+
+ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+     """'ocr' extracts text from an image. It returns a list of detected text, bounding
+     boxes, and confidence scores.
+
+     Parameters:
+         image (np.ndarray): The image to extract text from.
+
+     Returns:
+         List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox,
+             and confidence score.
+
+     Example
+     -------
+     >>> ocr(image)
+     [
+         {'label': 'some text', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
+     ]
+     """
+
+     pil_image = Image.fromarray(image).convert("RGB")
+     image_size = pil_image.size[::-1]
+     image_buffer = io.BytesIO()
+     pil_image.save(image_buffer, format="PNG")
+     buffer_bytes = image_buffer.getvalue()
+     image_buffer.close()
+
+     res = requests.post(
+         _OCR_URL,
+         files={"images": buffer_bytes},
+         data={"language": "en"},
+         headers={"contentType": "multipart/form-data", "apikey": _API_KEY},
+     )
+
+     if res.status_code != 200:
+         raise ValueError(f"OCR request failed with status code {res.status_code}")
+
+     data = res.json()
+     output = []
+     for det in data[0]:
+         label = det["text"]
+         box = [
+             det["location"][0]["x"],
+             det["location"][0]["y"],
+             det["location"][2]["x"],
+             det["location"][2]["y"],
+         ]
+         box = normalize_bbox(box, image_size)
+         output.append({"label": label, "bbox": box, "score": round(det["score"], 2)})
+
+     return output
+
+
+ # Utility and visualization functions
+
+
+ def load_image(image_path: str) -> np.ndarray:
+     """'load_image' is a utility function that loads an image from the given path.
+
+     Parameters:
+         image_path (str): The path to the image.
+
+     Returns:
+         np.ndarray: The image as a NumPy array.
+
+     Example
+     -------
+     >>> load_image("path/to/image.jpg")
+     """
+
+     image = Image.open(image_path).convert("RGB")
+     return np.array(image)
+
+
+ def save_image(image: np.ndarray) -> str:
+     """'save_image' is a utility function that saves an image as a temporary file.
+
+     Parameters:
+         image (np.ndarray): The image to save.
+
+     Returns:
+         str: The path to the saved image.
+
+     Example
+     -------
+     >>> save_image(image)
+     "/tmp/tmpabc123.png"
+     """
+
+     with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
+         pil_image = Image.fromarray(image.astype(np.uint8))
+         pil_image.save(f, "PNG")
+         return f.name
+
+
+ def overlay_bounding_boxes(
+     image: np.ndarray, bboxes: List[Dict[str, Any]]
+ ) -> np.ndarray:
+     """'overlay_bounding_boxes' is a utility function that displays bounding boxes on
+     an image.
+
+     Parameters:
+         image (np.ndarray): The image to display the bounding boxes on.
+         bboxes (List[Dict[str, Any]]): A list of dictionaries containing the bounding
+             boxes.
+
+     Returns:
+         np.ndarray: The image with the bounding boxes, labels and scores displayed.
+
+     Example
+     -------
+     >>> image_with_bboxes = overlay_bounding_boxes(
+         image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
+     )
+     """
+     pil_image = Image.fromarray(image.astype(np.uint8))
+
+     if len(set([box["label"] for box in bboxes])) > len(COLORS):
+         _LOGGER.warning(
+             "Number of unique labels exceeds the number of available colors. Some labels may have the same color."
+         )
+
+     color = {
+         label: COLORS[i % len(COLORS)]
+         for i, label in enumerate(set([box["label"] for box in bboxes]))
+     }
+
+     width, height = pil_image.size
+     fontsize = max(12, int(min(width, height) / 40))
+     draw = ImageDraw.Draw(pil_image)
+     font = ImageFont.truetype(
+         str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
+         fontsize,
+     )
+
+     for elt in bboxes:
+         label = elt["label"]
+         box = elt["bbox"]
+         scores = elt["score"]
+
+         box = [
+             int(box[0] * width),
+             int(box[1] * height),
+             int(box[2] * width),
+             int(box[3] * height),
+         ]
+         draw.rectangle(box, outline=color[label], width=4)
+         text = f"{label}: {scores:.2f}"
+         text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
+         draw.rectangle((box[0], box[1], text_box[2], text_box[3]), fill=color[label])
+         draw.text((box[0], box[1]), text, fill="black", font=font)
+     return np.array(pil_image.convert("RGB"))
+
+
+ def overlay_segmentation_masks(
+     image: np.ndarray, masks: List[Dict[str, Any]]
+ ) -> np.ndarray:
+     """'overlay_segmentation_masks' is a utility function that displays segmentation
+     masks.
+
+     Parameters:
+         image (np.ndarray): The image to display the masks on.
+         masks (List[Dict[str, Any]]): A list of dictionaries containing the masks.
+
+     Returns:
+         np.ndarray: The image with the masks displayed.
+
+     Example
+     -------
+     >>> image_with_masks = overlay_segmentation_masks(
+         image,
+         [{
+             'score': 0.99,
+             'label': 'dinosaur',
+             'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                 [0, 0, 0, ..., 0, 0, 0],
+                 ...,
+                 [0, 0, 0, ..., 0, 0, 0],
+                 [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+         }],
+     )
+     """
+     pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA")
+
+     if len(set([mask["label"] for mask in masks])) > len(COLORS):
+         _LOGGER.warning(
+             "Number of unique labels exceeds the number of available colors. Some labels may have the same color."
+         )
+
+     color = {
+         label: COLORS[i % len(COLORS)]
+         for i, label in enumerate(set([mask["label"] for mask in masks]))
+     }
+
+     for elt in masks:
+         mask = elt["mask"]
+         label = elt["label"]
+         np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
+         np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
+         mask_img = Image.fromarray(np_mask.astype(np.uint8))
+         pil_image = Image.alpha_composite(pil_image, mask_img)
+     return np.array(pil_image.convert("RGB"))
+
+
+ def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
+     docstrings = ""
+     for func in funcs:
+         docstrings += f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}\n\n"
+
+     return docstrings
+
+
+ def get_tool_descriptions(funcs: List[Callable[..., Any]]) -> str:
+     descriptions = ""
+     for func in funcs:
+         description = func.__doc__
+         if description is None:
+             description = ""
+
+         description = (
+             description[: description.find("Parameters:")].replace("\n", " ").strip()
+         )
+         description = " ".join(description.split())
+         descriptions += f"- {func.__name__}{inspect.signature(func)}: {description}\n"
+     return descriptions
+
+
+ def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
+     data: Dict[str, List[str]] = {"desc": [], "doc": []}
+
+     for func in funcs:
+         desc = func.__doc__
+         if desc is None:
+             desc = ""
+         desc = desc[: desc.find("Parameters:")].replace("\n", " ").strip()
+         desc = " ".join(desc.split())
+
+         doc = f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}"
+         data["desc"].append(desc)
+         data["doc"].append(doc)
+
+     return pd.DataFrame(data)  # type: ignore
+
+
+ TOOLS = [
+     grounding_dino,
+     grounding_sam,
+     extract_frames,
+     ocr,
+     load_image,
+     save_image,
+     overlay_bounding_boxes,
+     overlay_segmentation_masks,
+ ]
+ TOOLS_DF = get_tools_df(TOOLS)  # type: ignore
+ TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore
+ TOOL_DOCSTRING = get_tool_documentation(TOOLS)  # type: ignore
+ UTILITIES_DOCSTRING = get_tool_documentation(
+     [load_image, save_image, overlay_bounding_boxes]
+ )
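
The file above replaces the old class-based tools with plain functions whose docstrings double as the tool documentation consumed by `get_tools_df` and `TOOL_DESCRIPTIONS`. A minimal usage sketch (not part of the diff), assuming these functions are re-exported from `vision_agent.tools` and using a hypothetical local image path:

```python
# Sketch only: "examples/dinosaur.jpg" is a hypothetical path, and grounding_dino
# calls the hosted inference endpoint behind _send_inference_request.
import vision_agent.tools as T

image = T.load_image("examples/dinosaur.jpg")   # RGB image as np.ndarray
dets = T.grounding_dino("dinosaur", image)      # [{'score', 'label', 'bbox'}, ...]
vis = T.overlay_bounding_boxes(image, dets)     # boxes, labels and scores drawn on the image
print(T.save_image(vis))                        # path to a temporary PNG
```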
@@ -0,0 +1,3 @@
+ from .execute import Execute
+ from .sim import Sim, load_sim, merge_sim
+ from .video import extract_frames_from_video
@@ -0,0 +1,104 @@
+ """This code is adapted from MetaGPT's https://github.com/geekan/MetaGPT/blob/main/metagpt/actions/di/execute_nb_code.py
+ """
+
+ import base64 as b64
+ import io
+ import re
+ from typing import Dict, List, Tuple
+
+ import nbformat
+ from nbclient import NotebookClient
+ from nbclient.exceptions import CellTimeoutError, DeadKernelError
+ from nbclient.util import run_sync
+ from nbformat import NotebookNode
+ from nbformat.v4 import new_code_cell
+ from PIL import Image
+
+
+ def remove_escape_and_color_codes(input_str: str) -> str:
+     pattern = re.compile(r"\x1b\[[0-9;]*[mK]")
+     result = pattern.sub("", input_str)
+     return result
+
+
+ def parse_outputs(outputs: List[Dict]) -> Tuple[bool, str]:
+     success, parsed_output = True, []
+     for output in outputs:
+         # TODO: add parse image data
+         if output["output_type"] == "stream":
+             parsed_output.append(output["text"])
+         elif output["output_type"] == "text/plain":
+             parsed_output.append(output["data"]["text/plain"])
+         elif output["output_type"] == "display_data":
+             if "image/png" in output["data"]:
+                 image_bytes = b64.b64decode(output["data"]["image/png"])
+                 Image.open(io.BytesIO(image_bytes)).show()
+         elif output["output_type"] == "error":
+             success = False
+             output_text = remove_escape_and_color_codes("\n".join(output["traceback"]))
+             parsed_output.append(output_text)
+
+     return success, ",".join(parsed_output)
+
+
+ class Execute:
+     def __init__(self, timeout: int = 600) -> None:
+         self.nb = nbformat.v4.new_notebook()
+         self.timeout = timeout
+         self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+
+     def build(self) -> None:
+         if self.nb_client.kc is None or not run_sync(self.nb_client.kc.is_alive)():  # type: ignore
+             self.nb_client.create_kernel_manager()
+             self.nb_client.start_new_kernel()
+             self.nb_client.start_new_kernel_client()
+
+     def terminate(self) -> None:
+         if self.nb_client.km is not None and run_sync(self.nb_client.km.is_alive)():  # type: ignore
+             run_sync(self.nb_client.km.shutdown_kernel)(now=True)
+             run_sync(self.nb_client.km.cleanup_resources)()
+
+             channels = [
+                 self.nb_client.kc.stdin_channel,
+                 self.nb_client.kc.hb_channel,
+                 self.nb_client.kc.control_channel,
+             ]
+
+             for ch in channels:
+                 if ch.is_alive():
+                     ch.stop()
+
+             self.nb_client.kc = None
+             self.nb_client.km = None
+
+     def reset(self) -> None:
+         self.terminate()
+         self.nb = nbformat.v4.new_notebook()
+         self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+         self.build()
+
+     def run_cell(self, cell: NotebookNode, cell_index: int) -> Tuple[bool, str]:
+         try:
+             self.nb_client.execute_cell(cell, cell_index)
+             return parse_outputs(self.nb.cells[-1].outputs)
+         except CellTimeoutError:
+             run_sync(self.nb_client.km.interrupt_kernel)()  # type: ignore
+             return False, "Cell execution timed out."
+         except DeadKernelError:
+             self.reset()
+             return False, "DeadKernelError"
+         except Exception:
+             return parse_outputs(self.nb.cells[-1].outputs)
+
+     def add_code_cell(self, code: str) -> None:
+         self.nb.cells.append(new_code_cell(code))
+
+     def run_additional(self, code: str) -> Tuple[bool, str]:
+         self.build()
+         self.add_code_cell(code)
+         return self.run_cell(self.nb.cells[-1], len(self.nb.cells) - 1)
+
+     def run_isolation(self, code: str) -> Tuple[bool, str]:
+         self.reset()
+         self.add_code_cell(code)
+         return self.run_cell(self.nb.cells[-1], len(self.nb.cells) - 1)
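
`Execute` wraps an `nbclient` notebook kernel so generated code can be run cell by cell: `run_additional` appends a cell to the live kernel, while `run_isolation` resets the notebook and kernel before running. A minimal sketch, assuming the `__init__` shown earlier is `vision_agent/utils/__init__.py` so the class is importable from `vision_agent.utils`:

```python
from vision_agent.utils import Execute  # import path assumed from the __init__ above

executor = Execute(timeout=60)
ok, out = executor.run_isolation("x = 21\nprint(x * 2)")  # fresh kernel, then run the cell
print(ok, out)                                            # True and the printed "42"
ok, out = executor.run_additional("print(x)")             # same kernel, so x is still defined
executor.terminate()                                      # shut the kernel down when done
```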
@@ -0,0 +1,85 @@
+ from pathlib import Path
+ from typing import Dict, List, Optional, Sequence, Union
+
+ import numpy as np
+ import pandas as pd
+ from openai import Client
+ from scipy.spatial.distance import cosine  # type: ignore
+
+
+ def get_embedding(
+     client: Client, text: str, model: str = "text-embedding-3-small"
+ ) -> List[float]:
+     text = text.replace("\n", " ")
+     return client.embeddings.create(input=[text], model=model).data[0].embedding
+
+
+ class Sim:
+     def __init__(
+         self,
+         df: pd.DataFrame,
+         sim_key: Optional[str] = None,
+         api_key: Optional[str] = None,
+         model: str = "text-embedding-3-small",
+     ) -> None:
+         """Creates a similarity object that can be used to find similar items in a
+         dataframe.
+
+         Parameters:
+             df: pd.DataFrame: The dataframe to use for similarity.
+             sim_key: Optional[str]: The column name that you want to use to construct
+                 the embeddings.
+             model: str: The model to use for embeddings.
+         """
+         self.df = df
+         if not api_key:
+             self.client = Client()
+         else:
+             self.client = Client(api_key=api_key)
+
+         self.model = model
+         if "embs" not in df.columns and sim_key is None:
+             raise ValueError("sim_key is required if no column 'embs' is present.")
+
+         if sim_key is not None:
+             self.df["embs"] = self.df[sim_key].apply(
+                 lambda x: get_embedding(self.client, x, model=self.model)
+             )
+
+     def save(self, sim_file: Union[str, Path]) -> None:
+         sim_file = Path(sim_file)
+         sim_file.mkdir(parents=True, exist_ok=True)
+
+         df = self.df.copy()
+         embs = np.array(df.embs.tolist())
+         np.save(sim_file / "embs.npy", embs)
+         df = df.drop("embs", axis=1)
+         df.to_csv(sim_file / "df.csv", index=False)
+
+     def top_k(self, query: str, k: int = 5) -> Sequence[Dict]:
+         """Returns the top k most similar items to the query.
+
+         Parameters:
+             query: str: The query to compare to.
+             k: int: The number of items to return.
+
+         Returns:
+             Sequence[Dict]: The top k most similar items.
+         """
+
+         embedding = get_embedding(self.client, query, model=self.model)
+         self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
+         res = self.df.sort_values("sim", ascending=False).head(k)
+         return res[[c for c in res.columns if c != "embs"]].to_dict(orient="records")
+
+
+ def merge_sim(sim1: Sim, sim2: Sim) -> Sim:
+     return Sim(pd.concat([sim1.df, sim2.df], ignore_index=True))
+
+
+ def load_sim(sim_file: Union[str, Path]) -> Sim:
+     sim_file = Path(sim_file)
+     df = pd.read_csv(sim_file / "df.csv")
+     embs = np.load(sim_file / "embs.npy")
+     df["embs"] = list(embs)
+     return Sim(df)
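
`Sim` embeds one dataframe column with OpenAI's `text-embedding-3-small` and ranks rows by cosine similarity to a query; together with `TOOLS_DF` this gives the agent embedding-based tool retrieval. A minimal sketch with made-up rows, assuming `Sim` and `load_sim` are importable from `vision_agent.utils` and that an OpenAI API key is set in the environment:

```python
import pandas as pd
from vision_agent.utils import Sim, load_sim  # import path assumed from the __init__ above

# Toy rows, not from the package; each "desc" value gets an embedding.
df = pd.DataFrame({"desc": ["detect objects in an image", "extract text from an image"]})
sim = Sim(df, sim_key="desc")                        # uses OPENAI_API_KEY unless api_key= is passed
print(sim.top_k("read the text on a receipt", k=1))  # top matching row plus its 'sim' score

sim.save("sim_index")         # writes df.csv and embs.npy into that directory
sim2 = load_sim("sim_index")  # rebuilds the object without re-embedding
```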
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.10
+ Version: 0.2.22
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -10,6 +10,8 @@ Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Requires-Dist: moviepy (>=1.0.0,<2.0.0)
+ Requires-Dist: nbclient (>=0.10.0,<0.11.0)
+ Requires-Dist: nbformat (>=5.10.4,<6.0.0)
  Requires-Dist: numpy (>=1.21.0,<2.0.0)
  Requires-Dist: openai (>=1.0.0,<2.0.0)
  Requires-Dist: opencv-python-headless (>=4.0.0,<5.0.0)
@@ -17,6 +19,8 @@ Requires-Dist: pandas (>=2.0.0,<3.0.0)
  Requires-Dist: pillow (>=10.0.0,<11.0.0)
  Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
  Requires-Dist: requests (>=2.0.0,<3.0.0)
+ Requires-Dist: rich (>=13.7.1,<14.0.0)
+ Requires-Dist: scipy (>=1.13.0,<1.14.0)
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
  Requires-Dist: tqdm (>=4.64.0,<5.0.0)
  Requires-Dist: typing_extensions (>=4.0.0,<5.0.0)
@@ -149,7 +153,7 @@ you. For example:

  #### Custom Tools
  You can also add your own custom tools for your vision agent to use:
-
+
  ```python
  from vision_agent.tools import Tool, register_tool
  @register_tool
@@ -180,13 +184,13 @@ find an example that creates a custom tool for template matching [here](examples
  | GroundingDINO | GroundingDINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. |
  | GroundingSAM | GroundingSAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. |
  | DINOv | DINOv is a tool that can detect arbitrary objects using a referring mask. |
- | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
  | Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. |
  | BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. |
  | SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. |
  | BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
  | SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
  | BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
+ | MaskDistance | MaskDistance returns the minimum distance between two segmentation masks in pixel units. |
  | BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is good for checking if one box is contained within another box. |
  | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
  | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image. |