vision-agent 0.2.13__py3-none-any.whl → 0.2.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@ from typing import Any, Dict
 
 import requests
 
-from vision_agent.type_defs import LandingaiAPIKey
+from vision_agent.utils.type_defs import LandingaiAPIKey
 
 _LOGGER = logging.getLogger(__name__)
 _LND_API_KEY = LandingaiAPIKey().api_key
@@ -11,7 +11,10 @@ from PIL import Image
 from PIL.Image import Image as ImageType
 from scipy.spatial import distance  # type: ignore
 
-from vision_agent.image_utils import (
+from vision_agent.lmm import OpenAILMM
+from vision_agent.tools.tool_utils import _send_inference_request
+from vision_agent.utils import extract_frames_from_video
+from vision_agent.utils.image_utils import (
     b64_to_pil,
     convert_to_b64,
     denormalize_bbox,
@@ -422,7 +422,6 @@ class DINOv(Tool):
         request_data = {
             "prompt": prompt,
             "image": image_b64,
-            "tool": "dinov",
         }
         data: Dict[str, Any] = _send_inference_request(request_data, "dinov")
         if "bboxes" in data:
@@ -1,13 +1,19 @@
 import inspect
+import io
+import logging
 import tempfile
 from importlib import resources
-from typing import Any, Callable, Dict, List
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Tuple, Union
 
 import numpy as np
+import pandas as pd
+import requests
 from PIL import Image, ImageDraw, ImageFont
 
-from vision_agent.image_utils import convert_to_b64, normalize_bbox
 from vision_agent.tools.tool_utils import _send_inference_request
+from vision_agent.utils import extract_frames_from_video
+from vision_agent.utils.image_utils import convert_to_b64, normalize_bbox, rle_decode
 
 COLORS = [
     (158, 218, 229),
@@ -31,6 +37,10 @@ COLORS = [
     (255, 127, 14),
     (31, 119, 180),
 ]
+_API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV"
+_OCR_URL = "https://app.landing.ai/ocr/v1/detect-text"
+logging.basicConfig(level=logging.INFO)
+_LOGGER = logging.getLogger(__name__)
 
 
 def grounding_dino(
@@ -39,23 +49,30 @@ def grounding_dino(
     box_threshold: float = 0.20,
     iou_threshold: float = 0.75,
 ) -> List[Dict[str, Any]]:
-    """'grounding_dino' is a tool that can detect arbitrary objects with inputs such as
-    category names or referring expressions.
+    """'grounding_dino' is a tool that can detect and count objects given a text prompt
+    such as category names or referring expressions. It returns a list and count of
+    bounding boxes, label names and associated probability scores.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
         image (np.ndarray): The image to ground the prompt to.
-        box_threshold (float, optional): The threshold for the box detection. Defaults to 0.20.
-        iou_threshold (float, optional): The threshold for the Intersection over Union (IoU). Defaults to 0.75.
+        box_threshold (float, optional): The threshold for the box detection. Defaults
+            to 0.20.
+        iou_threshold (float, optional): The threshold for the Intersection over Union
+            (IoU). Defaults to 0.75.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
-            bounding box of the detected objects with normalized coordinates.
+            bounding box of the detected objects with normalized coordinates
+            (x1, y1, x2, y2).
 
     Example
     -------
     >>> grounding_dino("car. dinosaur", image)
-    [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}]
+    [
+        {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+        {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
+    ]
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(Image.fromarray(image))
@@ -78,6 +95,147 @@ def grounding_dino(
     return return_data
 
 
+def grounding_sam(
+    prompt: str,
+    image: np.ndarray,
+    box_threshold: float = 0.20,
+    iou_threshold: float = 0.75,
+) -> List[Dict[str, Any]]:
+    """'grounding_sam' is a tool that can detect and segment objects given a text
+    prompt such as category names or referring expressions. It returns a list of
+    bounding boxes, label names and masks file names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+        box_threshold (float, optional): The threshold for the box detection. Defaults
+            to 0.20.
+        iou_threshold (float, optional): The threshold for the Intersection over Union
+            (IoU). Defaults to 0.75.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (x1, y1, x2, y2).
+
+    Example
+    -------
+    >>> grounding_sam("car. dinosaur", image)
+    [
+        {
+            'score': 0.99,
+            'label': 'dinosaur',
+            'bbox': [0.1, 0.11, 0.35, 0.4],
+            'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        },
+    ]
+    """
+    image_size = image.shape[:2]
+    image_b64 = convert_to_b64(Image.fromarray(image))
+    request_data = {
+        "prompt": prompt,
+        "image": image_b64,
+        "tool": "visual_grounding_segment",
+        "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+    }
+    data: Dict[str, Any] = _send_inference_request(request_data, "tools")
+    return_data = []
+    for i in range(len(data["bboxes"])):
+        return_data.append(
+            {
+                "score": round(data["scores"][i], 2),
+                "label": data["labels"][i],
+                "bbox": normalize_bbox(data["bboxes"][i], image_size),
+                "mask": rle_decode(mask_rle=data["masks"][i], shape=data["mask_shape"]),
+            }
+        )
+    return return_data
+
+
+def extract_frames(
+    video_uri: Union[str, Path], fps: float = 0.5
+) -> List[Tuple[np.ndarray, float]]:
+    """'extract_frames' extracts frames from a video, returns a list of tuples (frame,
+    timestamp), where timestamp is the relative time in seconds where the frame was
+    captured. The frame is a local image file path.
+
+    Parameters:
+        video_uri (Union[str, Path]): The path to the video file.
+        fps (float, optional): The frame rate per second to extract the frames. Defaults
+            to 0.5.
+
+    Returns:
+        List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
+            and the timestamp in seconds.
+
+    Example
+    -------
+    >>> extract_frames("path/to/video.mp4")
+    [(frame1, 0.0), (frame2, 0.5), ...]
+    """
+
+    return extract_frames_from_video(str(video_uri), fps)
+
+
+def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'ocr' extracts text from an image. It returns a list of detected text, bounding
+    boxes, and confidence scores.
+
+    Parameters:
+        image (np.ndarray): The image to extract text from.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox,
+            and confidence score.
+
+    Example
+    -------
+    >>> ocr(image)
+    [
+        {'label': 'some text', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
+    ]
+    """
+
+    pil_image = Image.fromarray(image).convert("RGB")
+    image_size = pil_image.size[::-1]
+    image_buffer = io.BytesIO()
+    pil_image.save(image_buffer, format="PNG")
+    buffer_bytes = image_buffer.getvalue()
+    image_buffer.close()
+
+    res = requests.post(
+        _OCR_URL,
+        files={"images": buffer_bytes},
+        data={"language": "en"},
+        headers={"contentType": "multipart/form-data", "apikey": _API_KEY},
+    )
+
+    if res.status_code != 200:
+        raise ValueError(f"OCR request failed with status code {res.status_code}")
+
+    data = res.json()
+    output = []
+    for det in data[0]:
+        label = det["text"]
+        box = [
+            det["location"][0]["x"],
+            det["location"][0]["y"],
+            det["location"][2]["x"],
+            det["location"][2]["y"],
+        ]
+        box = normalize_bbox(box, image_size)
+        output.append({"label": label, "bbox": box, "score": round(det["score"], 2)})
+
+    return output
+
+
+# Utility and visualization functions
+
+
 def load_image(image_path: str) -> np.ndarray:
     """'load_image' is a utility function that loads an image from the given path.
 
@@ -117,24 +275,33 @@ def save_image(image: np.ndarray) -> str:
     return f.name
 
 
-def display_bounding_boxes(
+def overlay_bounding_boxes(
     image: np.ndarray, bboxes: List[Dict[str, Any]]
 ) -> np.ndarray:
-    """'display_bounding_boxes' is a utility function that displays bounding boxes on an image.
+    """'display_bounding_boxes' is a utility function that displays bounding boxes on
+    an image.
 
     Parameters:
         image (np.ndarray): The image to display the bounding boxes on.
-        bboxes (List[Dict[str, Any]]): A list of dictionaries containing the bounding boxes.
+        bboxes (List[Dict[str, Any]]): A list of dictionaries containing the bounding
+            boxes.
 
     Returns:
-        np.ndarray: The image with the bounding boxes displayed.
+        np.ndarray: The image with the bounding boxes, labels and scores displayed.
 
     Example
     -------
-    >>> image_with_bboxes = display_bounding_boxes(image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}])
+    >>> image_with_bboxes = display_bounding_boxes(
+        image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
+    )
     """
     pil_image = Image.fromarray(image.astype(np.uint8))
 
+    if len(set([box["label"] for box in bboxes])) > len(COLORS):
+        _LOGGER.warning(
+            "Number of unique labels exceeds the number of available colors. Some labels may have the same color."
+        )
+
     color = {
         label: COLORS[i % len(COLORS)]
         for i, label in enumerate(set([box["label"] for box in bboxes]))
@@ -167,15 +334,109 @@ def display_bounding_boxes(
     return np.array(pil_image.convert("RGB"))
 
 
-def get_tool_documentation(funcs: List[Callable]) -> str:
+def overlay_segmentation_masks(
+    image: np.ndarray, masks: List[Dict[str, Any]]
+) -> np.ndarray:
+    """'display_segmentation_masks' is a utility function that displays segmentation
+    masks.
+
+    Parameters:
+        image (np.ndarray): The image to display the masks on.
+        masks (List[Dict[str, Any]]): A list of dictionaries containing the masks.
+
+    Returns:
+        np.ndarray: The image with the masks displayed.
+
+    Example
+    -------
+    >>> image_with_masks = display_segmentation_masks(
+        image,
+        [{
+            'score': 0.99,
+            'label': 'dinosaur',
+            'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        }],
+    )
+    """
+    pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA")
+
+    if len(set([mask["label"] for mask in masks])) > len(COLORS):
+        _LOGGER.warning(
+            "Number of unique labels exceeds the number of available colors. Some labels may have the same color."
+        )
+
+    color = {
+        label: COLORS[i % len(COLORS)]
+        for i, label in enumerate(set([mask["label"] for mask in masks]))
+    }
+
+    for elt in masks:
+        mask = elt["mask"]
+        label = elt["label"]
+        np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
+        np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
+        mask_img = Image.fromarray(np_mask.astype(np.uint8))
+        pil_image = Image.alpha_composite(pil_image, mask_img)
+    return np.array(pil_image.convert("RGB"))
+
+
+def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
     docstrings = ""
     for func in funcs:
-        docstrings += f"{func.__name__}: {inspect.signature(func)}\n{func.__doc__}\n\n"
+        docstrings += f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}\n\n"
 
     return docstrings
 
 
-TOOLS_DOCSTRING = get_tool_documentation([load_image, grounding_dino])
+def get_tool_descriptions(funcs: List[Callable[..., Any]]) -> str:
+    descriptions = ""
+    for func in funcs:
+        description = func.__doc__
+        if description is None:
+            description = ""
+
+        description = (
+            description[: description.find("Parameters:")].replace("\n", " ").strip()
+        )
+        description = " ".join(description.split())
+        descriptions += f"- {func.__name__}{inspect.signature(func)}: {description}\n"
+    return descriptions
+
+
+def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
+    data: Dict[str, List[str]] = {"desc": [], "doc": []}
+
+    for func in funcs:
+        desc = func.__doc__
+        if desc is None:
+            desc = ""
+        desc = desc[: desc.find("Parameters:")].replace("\n", " ").strip()
+        desc = " ".join(desc.split())
+
+        doc = f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}"
+        data["desc"].append(desc)
+        data["doc"].append(doc)
+
+    return pd.DataFrame(data)  # type: ignore
+
+
+TOOLS = [
+    grounding_dino,
+    grounding_sam,
+    extract_frames,
+    ocr,
+    load_image,
+    save_image,
+    overlay_bounding_boxes,
+    overlay_segmentation_masks,
+]
+TOOLS_DF = get_tools_df(TOOLS)  # type: ignore
+TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore
+TOOL_DOCSTRING = get_tool_documentation(TOOLS)  # type: ignore
 UTILITIES_DOCSTRING = get_tool_documentation(
-    [load_image, save_image, display_bounding_boxes]
+    [load_image, save_image, overlay_bounding_boxes]
 )
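The module now registers every tool in TOOLS and derives TOOLS_DF, TOOL_DESCRIPTIONS, and TOOL_DOCSTRING from the docstrings at import time, which is what lets an agent select tools by description. A rough sketch of inspecting those registries, with the import path assumed from the wheel RECORD:

```python
from vision_agent.tools.tools_v2 import (
    TOOL_DESCRIPTIONS,
    TOOLS_DF,
    get_tool_documentation,
    ocr,
)

# One "- name(signature): summary" line per registered tool.
print(TOOL_DESCRIPTIONS)

# TOOLS_DF holds a short "desc" and the full "doc" (signature plus docstring) per tool.
print(TOOLS_DF["desc"].head())

# Full documentation can be regenerated for any subset of tools.
print(get_tool_documentation([ocr]))
```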
@@ -0,0 +1,3 @@
+from .execute import Execute
+from .sim import Sim
+from .video import extract_frames_from_video
@@ -0,0 +1,104 @@
+"""This code is adapted from MetaGPT's https://github.com/geekan/MetaGPT/blob/main/metagpt/actions/di/execute_nb_code.py
+"""
+
+import base64 as b64
+import io
+import re
+from typing import Dict, List, Tuple
+
+import nbformat
+from nbclient import NotebookClient
+from nbclient.exceptions import CellTimeoutError, DeadKernelError
+from nbclient.util import run_sync
+from nbformat import NotebookNode
+from nbformat.v4 import new_code_cell
+from PIL import Image
+
+
+def remove_escape_and_color_codes(input_str: str) -> str:
+    pattern = re.compile(r"\x1b\[[0-9;]*[mK]")
+    result = pattern.sub("", input_str)
+    return result
+
+
+def parse_outputs(outputs: List[Dict]) -> Tuple[bool, str]:
+    success, parsed_output = True, []
+    for output in outputs:
+        # TODO: add parse image data
+        if output["output_type"] == "stream":
+            parsed_output.append(output["text"])
+        elif output["output_type"] == "text/plain":
+            parsed_output.append(output["data"]["text/plain"])
+        elif output["output_type"] == "display_data":
+            if "image/png" in output["data"]:
+                image_bytes = b64.b64decode(output["data"]["image/png"])
+                Image.open(io.BytesIO(image_bytes)).show()
+        elif output["output_type"] == "error":
+            success = False
+            output_text = remove_escape_and_color_codes("\n".join(output["traceback"]))
+            parsed_output.append(output_text)
+
+    return success, ",".join(parsed_output)
+
+
+class Execute:
+    def __init__(self, timeout: int = 600) -> None:
+        self.nb = nbformat.v4.new_notebook()
+        self.timeout = timeout
+        self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+
+    def build(self) -> None:
+        if self.nb_client.kc is None or not run_sync(self.nb_client.kc.is_alive)():  # type: ignore
+            self.nb_client.create_kernel_manager()
+            self.nb_client.start_new_kernel()
+            self.nb_client.start_new_kernel_client()
+
+    def terminate(self) -> None:
+        if self.nb_client.km is not None and run_sync(self.nb_client.km.is_alive)():  # type: ignore
+            run_sync(self.nb_client.km.shutdown_kernel)(now=True)
+            run_sync(self.nb_client.km.cleanup_resources)()
+
+            channels = [
+                self.nb_client.kc.stdin_channel,
+                self.nb_client.kc.hb_channel,
+                self.nb_client.kc.control_channel,
+            ]
+
+            for ch in channels:
+                if ch.is_alive():
+                    ch.stop()
+
+            self.nb_client.kc = None
+            self.nb_client.km = None
+
+    def reset(self) -> None:
+        self.terminate()
+        self.nb = nbformat.v4.new_notebook()
+        self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+        self.build()
+
+    def run_cell(self, cell: NotebookNode, cell_index: int) -> Tuple[bool, str]:
+        try:
+            self.nb_client.execute_cell(cell, cell_index)
+            return parse_outputs(self.nb.cells[-1].outputs)
+        except CellTimeoutError:
+            run_sync(self.nb_client.km.interrupt_kernel)()  # type: ignore
+            return False, "Cell execution timed out."
+        except DeadKernelError:
+            self.reset()
+            return False, "DeadKernelError"
+        except Exception:
+            return parse_outputs(self.nb.cells[-1].outputs)
+
+    def add_code_cell(self, code: str) -> None:
+        self.nb.cells.append(new_code_cell(code))
+
+    def run_additional(self, code: str) -> Tuple[bool, str]:
+        self.build()
+        self.add_code_cell(code)
+        return self.run_cell(self.nb.cells[-1], len(self.nb.cells) - 1)
+
+    def run_isolation(self, code: str) -> Tuple[bool, str]:
+        self.reset()
+        self.add_code_cell(code)
+        return self.run_cell(self.nb.cells[-1], len(self.nb.cells) - 1)
@@ -0,0 +1,70 @@
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence, Union
+
+import pandas as pd
+from openai import Client
+from scipy.spatial.distance import cosine  # type: ignore
+
+
+def get_embedding(
+    client: Client, text: str, model: str = "text-embedding-3-small"
+) -> List[float]:
+    text = text.replace("\n", " ")
+    return client.embeddings.create(input=[text], model=model).data[0].embedding
+
+
+class Sim:
+    def __init__(
+        self,
+        df: pd.DataFrame,
+        sim_key: Optional[str] = None,
+        api_key: Optional[str] = None,
+        model: str = "text-embedding-3-small",
+    ) -> None:
+        """Creates a similarity object that can be used to find similar items in a
+        dataframe.
+
+        Parameters:
+            df: pd.DataFrame: The dataframe to use for similarity.
+            sim_key: Optional[str]: The column name that you want to use to construct
+                the embeddings.
+            model: str: The model to use for embeddings.
+        """
+        self.df = df
+        if not api_key:
+            self.client = Client()
+        else:
+            self.client = Client(api_key=api_key)
+
+        self.model = model
+        if "embs" not in df.columns and sim_key is None:
+            raise ValueError("key is required if no column 'embs' is present.")
+
+        if sim_key is not None:
+            self.df["embs"] = self.df[sim_key].apply(
+                lambda x: get_embedding(self.client, x, model=self.model)
+            )
+
+    def save(self, sim_file: Union[str, Path]) -> None:
+        self.df.to_csv(sim_file, index=False)
+
+    def top_k(self, query: str, k: int = 5) -> Sequence[Dict]:
+        """Returns the top k most similar items to the query.
+
+        Parameters:
+            query: str: The query to compare to.
+            k: int: The number of items to return.
+
+        Returns:
+            Sequence[Dict]: The top k most similar items.
+        """
+
+        embedding = get_embedding(self.client, query, model=self.model)
+        self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
+        res = self.df.sort_values("sim", ascending=False).head(k)
+        return res[[c for c in res.columns if c != "embs"]].to_dict(orient="records")
+
+
+def load_sim(sim_file: Union[str, Path]) -> Sim:
+    df = pd.read_csv(sim_file)
+    return Sim(df)
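Sim embeds one column of a dataframe with OpenAI embeddings and ranks rows by cosine similarity, which pairs with the TOOLS_DF registry above for retrieving relevant tools. A minimal sketch; it requires an OpenAI API key, and the example rows are illustrative:

```python
import pandas as pd

from vision_agent.utils.sim import Sim

df = pd.DataFrame(
    {
        "doc": [
            "grounding_dino detects objects from a text prompt",
            "ocr extracts text from an image",
            "extract_frames samples frames from a video",
        ]
    }
)

# Embeds the "doc" column on construction (one embeddings call per row).
sim = Sim(df, sim_key="doc")

# Returns the k most similar rows as dicts, with the "embs" column dropped.
print(sim.top_k("read the text in this photo", k=1))
```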
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.13
+Version: 0.2.15
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -10,6 +10,8 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Requires-Dist: moviepy (>=1.0.0,<2.0.0)
+Requires-Dist: nbclient (>=0.10.0,<0.11.0)
+Requires-Dist: nbformat (>=5.10.4,<6.0.0)
 Requires-Dist: numpy (>=1.21.0,<2.0.0)
 Requires-Dist: openai (>=1.0.0,<2.0.0)
 Requires-Dist: opencv-python-headless (>=4.0.0,<5.0.0)
@@ -17,6 +19,7 @@ Requires-Dist: pandas (>=2.0.0,<3.0.0)
 Requires-Dist: pillow (>=10.0.0,<11.0.0)
 Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
 Requires-Dist: requests (>=2.0.0,<3.0.0)
+Requires-Dist: rich (>=13.7.1,<14.0.0)
 Requires-Dist: scipy (>=1.13.0,<1.14.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
 Requires-Dist: tqdm (>=4.64.0,<5.0.0)
@@ -181,7 +184,6 @@ find an example that creates a custom tool for template matching [here](examples
 | GroundingDINO | GroundingDINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. |
 | GroundingSAM | GroundingSAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. |
 | DINOv | DINOv is a tool that can detect arbitrary objects with using a referring mask. |
-| ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. |
 | BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. |
 | SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. |
@@ -0,0 +1,34 @@
+vision_agent/__init__.py,sha256=GVLHCeK_R-zgldpbcPmOzJat-BkadvkuRCMxDvTIcXs,108
+vision_agent/agent/__init__.py,sha256=Zv8lc91mPy0iDySId38_vc4mo56JQ9mCMvUWdAKQjh0,206
+vision_agent/agent/agent.py,sha256=X7kON-g9ePUKumCDaYfQNBX_MEFE-ax5PnRp7-Cc5Wo,529
+vision_agent/agent/agent_coder.py,sha256=e3mQn1xenahYk_uGflvuQ10s6dSHHM6p0jZN9UT1ZpE,6508
+vision_agent/agent/agent_coder_prompts.py,sha256=CJe3v7xvHQ32u3RQAXQga_Tk_4UgU64RBAMHZ3S70KY,5538
+vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMVg,11511
+vision_agent/agent/easytool_prompts.py,sha256=Bikw-PPLkm78dwywTlnv32Y1Tw6JMeC-R7oCnXWLcTk,4656
+vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
+vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
+vision_agent/agent/vision_agent.py,sha256=4-GjEX8ZmLhvLebqNRRTSSu1kSaFYVR_wFsrjXgKdYI,26984
+vision_agent/agent/vision_agent_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
+vision_agent/agent/vision_agent_v2.py,sha256=CDgGBSoa2LoMS0b4JhyDkoS3PJJNmCCPfxIGUc4RfQg,9658
+vision_agent/agent/vision_agent_v2_prompt.py,sha256=-90Hlbtqb5Fp7OVjGabpTdgr-yCr8AYKIfiMRfoL4SY,5141
+vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
+vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
+vision_agent/llm/llm.py,sha256=qWDBpJolGLWNwDjpEXu1NrjlJbo7Fj9efJYkSfVn6oE,5784
+vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
+vision_agent/lmm/lmm.py,sha256=gK90vMxh0OcGSuIZQikBkDXm4pfkdFk1R2y7rtWDl84,10539
+vision_agent/tools/__init__.py,sha256=WiEjXzXyaBq7IQMKOMbFAK3FKvLNfzZ3dd7CPN-d7B8,451
+vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
+vision_agent/tools/tool_utils.py,sha256=moR7X4hkLKQzC56axdojo_OcIuVOv45bKcHPUVZrPvk,753
+vision_agent/tools/tools.py,sha256=WrNu_L5n2cEpe7e1oy8S1o3dy4JJ4AUxTHcjAdX64_g,46398
+vision_agent/tools/tools_v2.py,sha256=1Y_ZbYJyuo2eZZkq7jY3YfuKWC82C-GFCZMLYH-I5ew,13800
+vision_agent/utils/__init__.py,sha256=AKXf1QVOpO6MnqU8RSaFLQ_4us4DcKf8ibgEbhuHjvI,95
+vision_agent/utils/execute.py,sha256=RC_jKrm2kOWwzNe9xKuA2xJcbsNcD0Hb95_o3_Le0_E,3820
+vision_agent/utils/image_utils.py,sha256=1dggPBhW8_hUXDItCRLa23h-hdBwS50cjL4v1hsoUbg,7586
+vision_agent/utils/sim.py,sha256=FaD16kKL1-JR2aSCmznF9KkJux9u3_Nr9tF4smBeoK0,2327
+vision_agent/utils/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
+vision_agent/utils/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
+vision_agent-0.2.15.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.15.dist-info/METADATA,sha256=qK9rIVOI_IL0dcUcIqtgoRCxuk5GZuQ5HHSrdsuVLKs,9121
+vision_agent-0.2.15.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.15.dist-info/RECORD,,