vision-agent 0.2.13__py3-none-any.whl → 0.2.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/__init__.py +1 -0
- vision_agent/agent/agent_coder.py +33 -7
- vision_agent/agent/vision_agent.py +15 -13
- vision_agent/agent/vision_agent_prompts.py +3 -3
- vision_agent/agent/vision_agent_v2.py +300 -0
- vision_agent/agent/vision_agent_v2_prompt.py +170 -0
- vision_agent/llm/llm.py +11 -3
- vision_agent/tools/__init__.py +2 -2
- vision_agent/tools/tool_utils.py +1 -1
- vision_agent/tools/tools.py +4 -5
- vision_agent/tools/tools_v2.py +278 -17
- vision_agent/utils/__init__.py +3 -0
- vision_agent/utils/execute.py +104 -0
- vision_agent/utils/sim.py +70 -0
- {vision_agent-0.2.13.dist-info → vision_agent-0.2.15.dist-info}/METADATA +4 -2
- vision_agent-0.2.15.dist-info/RECORD +34 -0
- vision_agent/agent/execution.py +0 -287
- vision_agent-0.2.13.dist-info/RECORD +0 -30
- /vision_agent/{image_utils.py → utils/image_utils.py} +0 -0
- /vision_agent/{type_defs.py → utils/type_defs.py} +0 -0
- /vision_agent/{tools → utils}/video.py +0 -0
- {vision_agent-0.2.13.dist-info → vision_agent-0.2.15.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.13.dist-info → vision_agent-0.2.15.dist-info}/WHEEL +0 -0
vision_agent/tools/tool_utils.py
CHANGED
vision_agent/tools/tools.py
CHANGED
@@ -11,7 +11,10 @@ from PIL import Image
 from PIL.Image import Image as ImageType
 from scipy.spatial import distance  # type: ignore
 
-from vision_agent.image_utils import (
+from vision_agent.lmm import OpenAILMM
+from vision_agent.tools.tool_utils import _send_inference_request
+from vision_agent.utils import extract_frames_from_video
+from vision_agent.utils.image_utils import (
     b64_to_pil,
     convert_to_b64,
     denormalize_bbox,
@@ -19,9 +22,6 @@ from vision_agent.image_utils import (
     normalize_bbox,
     rle_decode,
 )
-from vision_agent.lmm import OpenAILMM
-from vision_agent.tools.tool_utils import _send_inference_request
-from vision_agent.tools.video import extract_frames_from_video
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -422,7 +422,6 @@ class DINOv(Tool):
         request_data = {
             "prompt": prompt,
             "image": image_b64,
-            "tool": "dinov",
         }
         data: Dict[str, Any] = _send_inference_request(request_data, "dinov")
         if "bboxes" in data:
vision_agent/tools/tools_v2.py
CHANGED
@@ -1,13 +1,19 @@
 import inspect
+import io
+import logging
 import tempfile
 from importlib import resources
-from typing import Any, Callable, Dict, List, Union
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Tuple, Union
 
 import numpy as np
+import pandas as pd
+import requests
 from PIL import Image, ImageDraw, ImageFont
 
-from vision_agent.image_utils import convert_to_b64, normalize_bbox
 from vision_agent.tools.tool_utils import _send_inference_request
+from vision_agent.utils import extract_frames_from_video
+from vision_agent.utils.image_utils import convert_to_b64, normalize_bbox, rle_decode
 
 COLORS = [
     (158, 218, 229),
@@ -31,6 +37,10 @@ COLORS = [
     (255, 127, 14),
     (31, 119, 180),
 ]
+_API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV"
+_OCR_URL = "https://app.landing.ai/ocr/v1/detect-text"
+logging.basicConfig(level=logging.INFO)
+_LOGGER = logging.getLogger(__name__)
 
 
 def grounding_dino(
@@ -39,23 +49,30 @@ def grounding_dino(
     box_threshold: float = 0.20,
     iou_threshold: float = 0.75,
 ) -> List[Dict[str, Any]]:
-    """'grounding_dino' is a tool that can detect objects given a text prompt such as
-    category names or referring expressions.
+    """'grounding_dino' is a tool that can detect and count objects given a text prompt
+    such as category names or referring expressions. It returns a list and count of
+    bounding boxes, label names and associated probability scores.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
         image (np.ndarray): The image to ground the prompt to.
-        box_threshold (float, optional): The threshold for the box detection. Defaults
-            to 0.20.
+        box_threshold (float, optional): The threshold for the box detection. Defaults
+            to 0.20.
+        iou_threshold (float, optional): The threshold for the Intersection over Union
+            (IoU). Defaults to 0.75.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
-            bounding box of the detected objects with normalized coordinates
+            bounding box of the detected objects with normalized coordinates
+            (x1, y1, x2, y2).
 
     Example
     -------
     >>> grounding_dino("car. dinosaur", image)
-    [
+    [
+        {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+        {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
+    ]
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(Image.fromarray(image))
@@ -78,6 +95,147 @@ def grounding_dino(
     return return_data
 
 
+def grounding_sam(
+    prompt: str,
+    image: np.ndarray,
+    box_threshold: float = 0.20,
+    iou_threshold: float = 0.75,
+) -> List[Dict[str, Any]]:
+    """'grounding_sam' is a tool that can detect and segment objects given a text
+    prompt such as category names or referring expressions. It returns a list of
+    bounding boxes, label names and masks file names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+        box_threshold (float, optional): The threshold for the box detection. Defaults
+            to 0.20.
+        iou_threshold (float, optional): The threshold for the Intersection over Union
+            (IoU). Defaults to 0.75.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (x1, y1, x2, y2).
+
+    Example
+    -------
+    >>> grounding_sam("car. dinosaur", image)
+    [
+        {
+            'score': 0.99,
+            'label': 'dinosaur',
+            'bbox': [0.1, 0.11, 0.35, 0.4],
+            'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        },
+    ]
+    """
+    image_size = image.shape[:2]
+    image_b64 = convert_to_b64(Image.fromarray(image))
+    request_data = {
+        "prompt": prompt,
+        "image": image_b64,
+        "tool": "visual_grounding_segment",
+        "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+    }
+    data: Dict[str, Any] = _send_inference_request(request_data, "tools")
+    return_data = []
+    for i in range(len(data["bboxes"])):
+        return_data.append(
+            {
+                "score": round(data["scores"][i], 2),
+                "label": data["labels"][i],
+                "bbox": normalize_bbox(data["bboxes"][i], image_size),
+                "mask": rle_decode(mask_rle=data["masks"][i], shape=data["mask_shape"]),
+            }
+        )
+    return return_data
+
+
+def extract_frames(
+    video_uri: Union[str, Path], fps: float = 0.5
+) -> List[Tuple[np.ndarray, float]]:
+    """'extract_frames' extracts frames from a video, returns a list of tuples (frame,
+    timestamp), where timestamp is the relative time in seconds where the frame was
+    captured. The frame is a local image file path.
+
+    Parameters:
+        video_uri (Union[str, Path]): The path to the video file.
+        fps (float, optional): The frame rate per second to extract the frames. Defaults
+            to 0.5.
+
+    Returns:
+        List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame
+            and the timestamp in seconds.
+
+    Example
+    -------
+    >>> extract_frames("path/to/video.mp4")
+    [(frame1, 0.0), (frame2, 0.5), ...]
+    """
+
+    return extract_frames_from_video(str(video_uri), fps)
+
+
+def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'ocr' extracts text from an image. It returns a list of detected text, bounding
+    boxes, and confidence scores.
+
+    Parameters:
+        image (np.ndarray): The image to extract text from.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox,
+            and confidence score.
+
+    Example
+    -------
+    >>> ocr(image)
+    [
+        {'label': 'some text', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
+    ]
+    """
+
+    pil_image = Image.fromarray(image).convert("RGB")
+    image_size = pil_image.size[::-1]
+    image_buffer = io.BytesIO()
+    pil_image.save(image_buffer, format="PNG")
+    buffer_bytes = image_buffer.getvalue()
+    image_buffer.close()
+
+    res = requests.post(
+        _OCR_URL,
+        files={"images": buffer_bytes},
+        data={"language": "en"},
+        headers={"contentType": "multipart/form-data", "apikey": _API_KEY},
+    )
+
+    if res.status_code != 200:
+        raise ValueError(f"OCR request failed with status code {res.status_code}")
+
+    data = res.json()
+    output = []
+    for det in data[0]:
+        label = det["text"]
+        box = [
+            det["location"][0]["x"],
+            det["location"][0]["y"],
+            det["location"][2]["x"],
+            det["location"][2]["y"],
+        ]
+        box = normalize_bbox(box, image_size)
+        output.append({"label": label, "bbox": box, "score": round(det["score"], 2)})
+
+    return output
+
+
+# Utility and visualization functions
+
+
 def load_image(image_path: str) -> np.ndarray:
     """'load_image' is a utility function that loads an image from the given path.
 
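Taken together, the new functional tools compose into a simple detect-and-read pipeline. A usage sketch, assuming the package is installed, the image path is hypothetical, and network access to the inference endpoints behind `_send_inference_request` and `_OCR_URL` is available:

```python
from vision_agent.tools.tools_v2 import grounding_dino, grounding_sam, load_image, ocr

image = load_image("dinosaur.jpg")  # hypothetical local file

# Detection: returns dicts with normalized (x1, y1, x2, y2) boxes.
for det in grounding_dino("car. dinosaur", image, box_threshold=0.20):
    print(det["label"], det["score"], det["bbox"])

# Segmentation: same request shape, plus a decoded uint8 mask per detection.
masks = grounding_sam("dinosaur", image)

# OCR: POSTs the PNG-encoded image to _OCR_URL and normalizes the returned quads.
for word in ocr(image):
    print(word["label"], word["score"], word["bbox"])
```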
@@ -117,24 +275,33 @@ def save_image(image: np.ndarray) -> str:
     return f.name
 
 
-def display_bounding_boxes(
+def overlay_bounding_boxes(
     image: np.ndarray, bboxes: List[Dict[str, Any]]
 ) -> np.ndarray:
-    """'display_bounding_boxes' is a utility function that displays bounding boxes on an image.
+    """'display_bounding_boxes' is a utility function that displays bounding boxes on
+    an image.
 
     Parameters:
         image (np.ndarray): The image to display the bounding boxes on.
-        bboxes (List[Dict[str, Any]]): A list of dictionaries containing the bounding boxes.
+        bboxes (List[Dict[str, Any]]): A list of dictionaries containing the bounding
+            boxes.
 
     Returns:
-        np.ndarray: The image with the bounding boxes displayed.
+        np.ndarray: The image with the bounding boxes, labels and scores displayed.
 
     Example
     -------
-    >>> image_with_bboxes = display_bounding_boxes(
+    >>> image_with_bboxes = display_bounding_boxes(
+        image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
+    )
     """
     pil_image = Image.fromarray(image.astype(np.uint8))
 
+    if len(set([box["label"] for box in bboxes])) > len(COLORS):
+        _LOGGER.warning(
+            "Number of unique labels exceeds the number of available colors. Some labels may have the same color."
+        )
+
     color = {
         label: COLORS[i % len(COLORS)]
         for i, label in enumerate(set([box["label"] for box in bboxes]))
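The new warning above exists because labels are mapped onto the fixed palette with `COLORS[i % len(COLORS)]`, so colors repeat once there are more unique labels than palette entries. A short sketch of the intended call pattern (hypothetical image path, network required for the detector):

```python
from vision_agent.tools.tools_v2 import (
    grounding_dino,
    load_image,
    overlay_bounding_boxes,
    save_image,
)

image = load_image("dinosaur.jpg")  # hypothetical local file
bboxes = grounding_dino("car. dinosaur", image)

annotated = overlay_bounding_boxes(image, bboxes)  # draws boxes, labels and scores
print(save_image(annotated))  # returns the temporary file name (see `return f.name` above)
```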
@@ -167,15 +334,109 @@ def display_bounding_boxes(
     return np.array(pil_image.convert("RGB"))
 
 
-def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
+def overlay_segmentation_masks(
+    image: np.ndarray, masks: List[Dict[str, Any]]
+) -> np.ndarray:
+    """'display_segmentation_masks' is a utility function that displays segmentation
+    masks.
+
+    Parameters:
+        image (np.ndarray): The image to display the masks on.
+        masks (List[Dict[str, Any]]): A list of dictionaries containing the masks.
+
+    Returns:
+        np.ndarray: The image with the masks displayed.
+
+    Example
+    -------
+    >>> image_with_masks = display_segmentation_masks(
+        image,
+        [{
+            'score': 0.99,
+            'label': 'dinosaur',
+            'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        }],
+    )
+    """
+    pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA")
+
+    if len(set([mask["label"] for mask in masks])) > len(COLORS):
+        _LOGGER.warning(
+            "Number of unique labels exceeds the number of available colors. Some labels may have the same color."
+        )
+
+    color = {
+        label: COLORS[i % len(COLORS)]
+        for i, label in enumerate(set([mask["label"] for mask in masks]))
+    }
+
+    for elt in masks:
+        mask = elt["mask"]
+        label = elt["label"]
+        np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4))
+        np_mask[mask > 0, :] = color[label] + (255 * 0.5,)
+        mask_img = Image.fromarray(np_mask.astype(np.uint8))
+        pil_image = Image.alpha_composite(pil_image, mask_img)
+    return np.array(pil_image.convert("RGB"))
+
+
+def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
     docstrings = ""
     for func in funcs:
-        docstrings += f"{func.__name__}
+        docstrings += f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}\n\n"
 
     return docstrings
 
 
-
+def get_tool_descriptions(funcs: List[Callable[..., Any]]) -> str:
+    descriptions = ""
+    for func in funcs:
+        description = func.__doc__
+        if description is None:
+            description = ""
+
+        description = (
+            description[: description.find("Parameters:")].replace("\n", " ").strip()
+        )
+        description = " ".join(description.split())
+        descriptions += f"- {func.__name__}{inspect.signature(func)}: {description}\n"
+    return descriptions
+
+
+def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
+    data: Dict[str, List[str]] = {"desc": [], "doc": []}
+
+    for func in funcs:
+        desc = func.__doc__
+        if desc is None:
+            desc = ""
+        desc = desc[: desc.find("Parameters:")].replace("\n", " ").strip()
+        desc = " ".join(desc.split())
+
+        doc = f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}"
+        data["desc"].append(desc)
+        data["doc"].append(doc)
+
+    return pd.DataFrame(data)  # type: ignore
+
+
+TOOLS = [
+    grounding_dino,
+    grounding_sam,
+    extract_frames,
+    ocr,
+    load_image,
+    save_image,
+    overlay_bounding_boxes,
+    overlay_segmentation_masks,
+]
+TOOLS_DF = get_tools_df(TOOLS)  # type: ignore
+TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore
+TOOL_DOCSTRING = get_tool_documentation(TOOLS)  # type: ignore
 UTILITIES_DOCSTRING = get_tool_documentation(
-    [load_image, save_image, display_bounding_boxes]
+    [load_image, save_image, overlay_bounding_boxes]
 )
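The description extraction in `get_tool_descriptions` and `get_tools_df` is plain docstring slicing: everything before the literal `"Parameters:"` marker, with whitespace collapsed. A self-contained sketch of that logic applied to a toy function (the function here is illustrative, not part of the package):

```python
import inspect

import pandas as pd


def add(a: int, b: int) -> int:
    """'add' returns the sum of two integers.

    Parameters:
        a (int): The first operand.
        b (int): The second operand.
    """
    return a + b


doc = add.__doc__ or ""
# Everything before "Parameters:" becomes the one-line description. Note that
# str.find returns -1 when the marker is absent, which would silently drop the
# docstring's final character for tools without a Parameters section.
desc = " ".join(doc[: doc.find("Parameters:")].split())
full_doc = f"{add.__name__}{inspect.signature(add)}:\n{doc}"
df = pd.DataFrame({"desc": [desc], "doc": [full_doc]})
print(df.loc[0, "desc"])  # 'add' returns the sum of two integers.
```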
vision_agent/utils/execute.py
ADDED
@@ -0,0 +1,104 @@
+"""This code is adapted from MetaGPT's https://github.com/geekan/MetaGPT/blob/main/metagpt/actions/di/execute_nb_code.py
+"""
+
+import base64 as b64
+import io
+import re
+from typing import Dict, List, Tuple
+
+import nbformat
+from nbclient import NotebookClient
+from nbclient.exceptions import CellTimeoutError, DeadKernelError
+from nbclient.util import run_sync
+from nbformat import NotebookNode
+from nbformat.v4 import new_code_cell
+from PIL import Image
+
+
+def remove_escape_and_color_codes(input_str: str) -> str:
+    pattern = re.compile(r"\x1b\[[0-9;]*[mK]")
+    result = pattern.sub("", input_str)
+    return result
+
+
+def parse_outputs(outputs: List[Dict]) -> Tuple[bool, str]:
+    success, parsed_output = True, []
+    for output in outputs:
+        # TODO: add parse image data
+        if output["output_type"] == "stream":
+            parsed_output.append(output["text"])
+        elif output["output_type"] == "text/plain":
+            parsed_output.append(output["data"]["text/plain"])
+        elif output["output_type"] == "display_data":
+            if "image/png" in output["data"]:
+                image_bytes = b64.b64decode(output["data"]["image/png"])
+                Image.open(io.BytesIO(image_bytes)).show()
+        elif output["output_type"] == "error":
+            success = False
+            output_text = remove_escape_and_color_codes("\n".join(output["traceback"]))
+            parsed_output.append(output_text)
+
+    return success, ",".join(parsed_output)
+
+
+class Execute:
+    def __init__(self, timeout: int = 600) -> None:
+        self.nb = nbformat.v4.new_notebook()
+        self.timeout = timeout
+        self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+
+    def build(self) -> None:
+        if self.nb_client.kc is None or not run_sync(self.nb_client.kc.is_alive)():  # type: ignore
+            self.nb_client.create_kernel_manager()
+            self.nb_client.start_new_kernel()
+            self.nb_client.start_new_kernel_client()
+
+    def terminate(self) -> None:
+        if self.nb_client.km is not None and run_sync(self.nb_client.km.is_alive)():  # type: ignore
+            run_sync(self.nb_client.km.shutdown_kernel)(now=True)
+            run_sync(self.nb_client.km.cleanup_resources)()
+
+            channels = [
+                self.nb_client.kc.stdin_channel,
+                self.nb_client.kc.hb_channel,
+                self.nb_client.kc.control_channel,
+            ]
+
+            for ch in channels:
+                if ch.is_alive():
+                    ch.stop()
+
+            self.nb_client.kc = None
+            self.nb_client.km = None
+
+    def reset(self) -> None:
+        self.terminate()
+        self.nb = nbformat.v4.new_notebook()
+        self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+        self.build()
+
+    def run_cell(self, cell: NotebookNode, cell_index: int) -> Tuple[bool, str]:
+        try:
+            self.nb_client.execute_cell(cell, cell_index)
+            return parse_outputs(self.nb.cells[-1].outputs)
+        except CellTimeoutError:
+            run_sync(self.nb_client.km.interrupt_kernel)()  # type: ignore
+            return False, "Cell execution timed out."
+        except DeadKernelError:
+            self.reset()
+            return False, "DeadKernelError"
+        except Exception:
+            return parse_outputs(self.nb.cells[-1].outputs)
+
+    def add_code_cell(self, code: str) -> None:
+        self.nb.cells.append(new_code_cell(code))
+
+    def run_additional(self, code: str) -> Tuple[bool, str]:
+        self.build()
+        self.add_code_cell(code)
+        return self.run_cell(self.nb.cells[-1], len(self.nb.cells) - 1)
+
+    def run_isolation(self, code: str) -> Tuple[bool, str]:
+        self.reset()
+        self.add_code_cell(code)
+        return self.run_cell(self.nb.cells[-1], len(self.nb.cells) - 1)
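A minimal usage sketch for the new `Execute` class, assuming a local Jupyter kernel (`ipykernel`) is installed so `nbclient` can launch one. `run_isolation` resets to a fresh kernel per call, while `run_additional` reuses the live kernel, so state carries over between the two calls below:

```python
from vision_agent.utils.execute import Execute

exe = Execute(timeout=60)

success, output = exe.run_isolation("x = 21\nprint(x * 2)")
print(success, output)  # True, "42\n"

success, output = exe.run_additional("print(x + 1)")  # same kernel, x still defined
print(success, output)  # True, "22\n"

exe.terminate()
```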
vision_agent/utils/sim.py
ADDED
@@ -0,0 +1,70 @@
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence, Union
+
+import pandas as pd
+from openai import Client
+from scipy.spatial.distance import cosine  # type: ignore
+
+
+def get_embedding(
+    client: Client, text: str, model: str = "text-embedding-3-small"
+) -> List[float]:
+    text = text.replace("\n", " ")
+    return client.embeddings.create(input=[text], model=model).data[0].embedding
+
+
+class Sim:
+    def __init__(
+        self,
+        df: pd.DataFrame,
+        sim_key: Optional[str] = None,
+        api_key: Optional[str] = None,
+        model: str = "text-embedding-3-small",
+    ) -> None:
+        """Creates a similarity object that can be used to find similar items in a
+        dataframe.
+
+        Parameters:
+            df: pd.DataFrame: The dataframe to use for similarity.
+            sim_key: Optional[str]: The column name that you want to use to construct
+                the embeddings.
+            model: str: The model to use for embeddings.
+        """
+        self.df = df
+        if not api_key:
+            self.client = Client()
+        else:
+            self.client = Client(api_key=api_key)
+
+        self.model = model
+        if "embs" not in df.columns and sim_key is None:
+            raise ValueError("key is required if no column 'embs' is present.")
+
+        if sim_key is not None:
+            self.df["embs"] = self.df[sim_key].apply(
+                lambda x: get_embedding(self.client, x, model=self.model)
+            )
+
+    def save(self, sim_file: Union[str, Path]) -> None:
+        self.df.to_csv(sim_file, index=False)
+
+    def top_k(self, query: str, k: int = 5) -> Sequence[Dict]:
+        """Returns the top k most similar items to the query.
+
+        Parameters:
+            query: str: The query to compare to.
+            k: int: The number of items to return.
+
+        Returns:
+            Sequence[Dict]: The top k most similar items.
+        """
+
+        embedding = get_embedding(self.client, query, model=self.model)
+        self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding))
+        res = self.df.sort_values("sim", ascending=False).head(k)
+        return res[[c for c in res.columns if c != "embs"]].to_dict(orient="records")
+
+
+def load_sim(sim_file: Union[str, Path]) -> Sim:
+    df = pd.read_csv(sim_file)
+    return Sim(df)
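And a usage sketch for `Sim`, which pairs naturally with the kind of table `get_tools_df` produces in `tools_v2.py`: embed a text column once at construction, then rank rows against a query by cosine similarity. Assumes an OpenAI key is available via the environment (or the `api_key` argument):

```python
import pandas as pd

from vision_agent.utils.sim import Sim

df = pd.DataFrame(
    {"desc": ["detect objects in an image", "extract text from an image"]}
)
sim = Sim(df, sim_key="desc")  # embeds every row of 'desc' at construction time

print(sim.top_k("read the words in this photo", k=1))
# e.g. [{'desc': 'extract text from an image', 'sim': 0.6...}]

# Caveat: save() writes the 'embs' column through to_csv, so load_sim() would
# read the embeddings back as strings; they would need re-parsing before top_k.
sim.save("tool_sim.csv")
```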
{vision_agent-0.2.13.dist-info → vision_agent-0.2.15.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.13
+Version: 0.2.15
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -10,6 +10,8 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Requires-Dist: moviepy (>=1.0.0,<2.0.0)
+Requires-Dist: nbclient (>=0.10.0,<0.11.0)
+Requires-Dist: nbformat (>=5.10.4,<6.0.0)
 Requires-Dist: numpy (>=1.21.0,<2.0.0)
 Requires-Dist: openai (>=1.0.0,<2.0.0)
 Requires-Dist: opencv-python-headless (>=4.0.0,<5.0.0)
@@ -17,6 +19,7 @@ Requires-Dist: pandas (>=2.0.0,<3.0.0)
 Requires-Dist: pillow (>=10.0.0,<11.0.0)
 Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
 Requires-Dist: requests (>=2.0.0,<3.0.0)
+Requires-Dist: rich (>=13.7.1,<14.0.0)
 Requires-Dist: scipy (>=1.13.0,<1.14.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
 Requires-Dist: tqdm (>=4.64.0,<5.0.0)
@@ -181,7 +184,6 @@ find an example that creates a custom tool for template matching [here](examples
 | GroundingDINO | GroundingDINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. |
 | GroundingSAM | GroundingSAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. |
 | DINOv | DINOv is a tool that can detect arbitrary objects with using a referring mask. |
-| ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. |
 | BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. |
 | SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. |
vision_agent-0.2.15.dist-info/RECORD
ADDED
@@ -0,0 +1,34 @@
+vision_agent/__init__.py,sha256=GVLHCeK_R-zgldpbcPmOzJat-BkadvkuRCMxDvTIcXs,108
+vision_agent/agent/__init__.py,sha256=Zv8lc91mPy0iDySId38_vc4mo56JQ9mCMvUWdAKQjh0,206
+vision_agent/agent/agent.py,sha256=X7kON-g9ePUKumCDaYfQNBX_MEFE-ax5PnRp7-Cc5Wo,529
+vision_agent/agent/agent_coder.py,sha256=e3mQn1xenahYk_uGflvuQ10s6dSHHM6p0jZN9UT1ZpE,6508
+vision_agent/agent/agent_coder_prompts.py,sha256=CJe3v7xvHQ32u3RQAXQga_Tk_4UgU64RBAMHZ3S70KY,5538
+vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMVg,11511
+vision_agent/agent/easytool_prompts.py,sha256=Bikw-PPLkm78dwywTlnv32Y1Tw6JMeC-R7oCnXWLcTk,4656
+vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
+vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
+vision_agent/agent/vision_agent.py,sha256=4-GjEX8ZmLhvLebqNRRTSSu1kSaFYVR_wFsrjXgKdYI,26984
+vision_agent/agent/vision_agent_prompts.py,sha256=MZSIwovYgB-f-kdJ6btaNDVXptJn47bfOL3-Zn6NiC0,8573
+vision_agent/agent/vision_agent_v2.py,sha256=CDgGBSoa2LoMS0b4JhyDkoS3PJJNmCCPfxIGUc4RfQg,9658
+vision_agent/agent/vision_agent_v2_prompt.py,sha256=-90Hlbtqb5Fp7OVjGabpTdgr-yCr8AYKIfiMRfoL4SY,5141
+vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
+vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
+vision_agent/llm/llm.py,sha256=qWDBpJolGLWNwDjpEXu1NrjlJbo7Fj9efJYkSfVn6oE,5784
+vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
+vision_agent/lmm/lmm.py,sha256=gK90vMxh0OcGSuIZQikBkDXm4pfkdFk1R2y7rtWDl84,10539
+vision_agent/tools/__init__.py,sha256=WiEjXzXyaBq7IQMKOMbFAK3FKvLNfzZ3dd7CPN-d7B8,451
+vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
+vision_agent/tools/tool_utils.py,sha256=moR7X4hkLKQzC56axdojo_OcIuVOv45bKcHPUVZrPvk,753
+vision_agent/tools/tools.py,sha256=WrNu_L5n2cEpe7e1oy8S1o3dy4JJ4AUxTHcjAdX64_g,46398
+vision_agent/tools/tools_v2.py,sha256=1Y_ZbYJyuo2eZZkq7jY3YfuKWC82C-GFCZMLYH-I5ew,13800
+vision_agent/utils/__init__.py,sha256=AKXf1QVOpO6MnqU8RSaFLQ_4us4DcKf8ibgEbhuHjvI,95
+vision_agent/utils/execute.py,sha256=RC_jKrm2kOWwzNe9xKuA2xJcbsNcD0Hb95_o3_Le0_E,3820
+vision_agent/utils/image_utils.py,sha256=1dggPBhW8_hUXDItCRLa23h-hdBwS50cjL4v1hsoUbg,7586
+vision_agent/utils/sim.py,sha256=FaD16kKL1-JR2aSCmznF9KkJux9u3_Nr9tF4smBeoK0,2327
+vision_agent/utils/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
+vision_agent/utils/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
+vision_agent-0.2.15.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.15.dist-info/METADATA,sha256=qK9rIVOI_IL0dcUcIqtgoRCxuk5GZuQ5HHSrdsuVLKs,9121
+vision_agent-0.2.15.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.15.dist-info/RECORD,,