vision-agent 0.2.62__py3-none-any.whl → 0.2.64__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +9 -2
- vision_agent/lmm/lmm.py +2 -2
- vision_agent/tools/__init__.py +7 -4
- vision_agent/tools/tools.py +140 -19
- {vision_agent-0.2.62.dist-info → vision_agent-0.2.64.dist-info}/METADATA +2 -1
- {vision_agent-0.2.62.dist-info → vision_agent-0.2.64.dist-info}/RECORD +8 -8
- {vision_agent-0.2.62.dist-info → vision_agent-0.2.64.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.62.dist-info → vision_agent-0.2.64.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent.py CHANGED

@@ -7,8 +7,8 @@ import tempfile
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast
 
-from PIL import Image
 from langsmith import traceable
+from PIL import Image
 from rich.console import Console
 from rich.style import Style
 from rich.syntax import Syntax
@@ -43,6 +43,7 @@ class DefaultImports:
 
     common_imports = [
         "from typing import *",
+        "register_heif_opener()",
     ]
 
     @staticmethod
@@ -97,6 +98,7 @@ def extract_json(json_str: str) -> Dict[str, Any]:
     try:
         json_dict = json.loads(json_str)
     except json.JSONDecodeError:
+        input_json_str = json_str
         if "```json" in json_str:
            json_str = json_str[json_str.find("```json") + len("```json") :]
            json_str = json_str[: json_str.find("```")]
@@ -104,7 +106,12 @@ def extract_json(json_str: str) -> Dict[str, Any]:
            json_str = json_str[json_str.find("```") + len("```") :]
            # get the last ``` not one from an intermediate string
            json_str = json_str[: json_str.find("}```")]
-
+        try:
+            json_dict = json.loads(json_str)
+        except json.JSONDecodeError as e:
+            error_msg = f"Could not extract JSON from the given str: {json_str}.\nFunction input:\n{input_json_str}"
+            _LOGGER.exception(error_msg)
+            raise ValueError(error_msg) from e
     return json_dict # type: ignore
 
 
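
The extract_json change above retries json.loads after the markdown fences are stripped and, if parsing still fails, logs the failure and raises a ValueError that carries the original input. A minimal sketch of the resulting behavior, assuming extract_json is importable from the module shown above:

```python
from vision_agent.agent.vision_agent import extract_json

# A fenced model response parses once the ```json wrapper is stripped.
extract_json('```json\n{"plan": ["step 1"]}\n```')  # -> {"plan": ["step 1"]}

# Input that still cannot be parsed now raises ValueError; the message (and the
# log record) contains both the stripped string and the original input.
extract_json("not json at all")  # raises ValueError
```
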
vision_agent/lmm/lmm.py CHANGED

@@ -224,10 +224,10 @@ class OpenAILMM(LMM):
         return lambda x: T.grounding_sam(params["prompt"], x)
 
     def generate_zero_shot_counter(self, question: str) -> Callable:
-        return T.
+        return T.loca_zero_shot_counting
 
     def generate_image_qa_tool(self, question: str) -> Callable:
-        return lambda x: T.
+        return lambda x: T.git_vqa_v2(question, x)
 
 
 class AzureOpenAILMM(OpenAILMM):
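
With this change the two generator methods hand back the renamed tools. A small sketch, assuming an already constructed OpenAILMM instance `lmm` and an image array `image`:

```python
# `lmm` (an OpenAILMM) and `image` (np.ndarray) are assumed to exist already.
counter = lmm.generate_zero_shot_counter("how many cars are there?")
vqa = lmm.generate_image_qa_tool("what color is the car?")

counter(image)  # calls T.loca_zero_shot_counting(image) -> {"count": ...}
vqa(image)      # calls T.git_vqa_v2("what color is the car?", image) -> str
```
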
vision_agent/tools/__init__.py CHANGED

@@ -7,25 +7,28 @@ from .tools import (
     TOOLS,
     TOOLS_DF,
     UTILITIES_DOCSTRING,
+    blip_image_caption,
     clip,
     closest_box_distance,
     closest_mask_distance,
     extract_frames,
     get_tool_documentation,
+    git_vqa_v2,
     grounding_dino,
     grounding_sam,
-    image_caption,
-    image_question_answering,
     load_image,
+    loca_visual_prompt_counting,
+    loca_zero_shot_counting,
     ocr,
     overlay_bounding_boxes,
     overlay_heat_map,
     overlay_segmentation_masks,
+    owl_v2,
     save_image,
     save_json,
     save_video,
-
-
+    vit_image_classification,
+    vit_nsfw_classification,
 )
 
 __new_tools__ = [
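
Code importing from vision_agent.tools therefore needs the new names; image_caption and image_question_answering are no longer re-exported. A sketch of imports that match 0.2.64:

```python
from vision_agent.tools import (
    blip_image_caption,           # captioning, replaces image_caption
    git_vqa_v2,                   # VQA, replaces image_question_answering
    loca_visual_prompt_counting,
    loca_zero_shot_counting,
    owl_v2,                       # new open-vocabulary detector
    vit_image_classification,     # new
    vit_nsfw_classification,      # new
)
```
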
vision_agent/tools/tools.py CHANGED

@@ -13,6 +13,7 @@ import pandas as pd
 import requests
 from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
+from pillow_heif import register_heif_opener # type: ignore
 
 from vision_agent.tools.tool_utils import _send_inference_request
 from vision_agent.utils import extract_frames_from_video
@@ -26,6 +27,8 @@ from vision_agent.utils.image_utils import (
     rle_decode,
 )
 
+register_heif_opener()
+
 COLORS = [
     (158, 218, 229),
     (219, 219, 141),
@@ -59,6 +62,7 @@ def grounding_dino(
     image: np.ndarray,
     box_threshold: float = 0.20,
     iou_threshold: float = 0.20,
+    model_size: str = "large",
 ) -> List[Dict[str, Any]]:
     """'grounding_dino' is a tool that can detect and count multiple objects given a text
     prompt such as category names or referring expressions. The categories in text prompt
@@ -72,6 +76,7 @@ def grounding_dino(
             to 0.20.
         iou_threshold (float, optional): The threshold for the Intersection over Union
             (IoU). Defaults to 0.20.
+        model_size (str, optional): The size of the model to use.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -90,10 +95,14 @@ def grounding_dino(
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
+    if model_size not in ["large", "tiny"]:
+        raise ValueError("model_size must be either 'large' or 'tiny'")
     request_data = {
         "prompt": prompt,
         "image": image_b64,
-        "tool":
+        "tool": (
+            "visual_grounding" if model_size == "large" else "visual_grounding_tiny"
+        ),
         "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
     }
     data: Dict[str, Any] = _send_inference_request(request_data, "tools")
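
A hypothetical call showing the new parameter; apart from selecting the tiny backend, grounding_dino behaves as before:

```python
# `image` is an np.ndarray, as elsewhere in tools.py; values are illustrative.
dets = grounding_dino("car. person", image, model_size="tiny")  # visual_grounding_tiny
dets = grounding_dino("car. person", image)                     # default: "large"
grounding_dino("car", image, model_size="base")  # raises ValueError per the new check
```
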
@@ -109,6 +118,62 @@ def grounding_dino(
     return return_data
 
 
+def owl_v2(
+    prompt: str,
+    image: np.ndarray,
+    box_threshold: float = 0.10,
+    iou_threshold: float = 0.10,
+) -> List[Dict[str, Any]]:
+    """'owl_v2' is a tool that can detect and count multiple objects given a text
+    prompt such as category names or referring expressions. The categories in text prompt
+    are separated by commas or periods. It returns a list of bounding boxes with
+    normalized coordinates, label names and associated probability scores.
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+        box_threshold (float, optional): The threshold for the box detection. Defaults
+            to 0.10.
+        iou_threshold (float, optional): The threshold for the Intersection over Union
+            (IoU). Defaults to 0.10.
+        model_size (str, optional): The size of the model to use.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+        bounding box of the detected objects with normalized coordinates between 0
+        and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+        top-left and xmax and ymax are the coordinates of the bottom-right of the
+        bounding box.
+
+    Example
+    -------
+    >>> owl_v2("car. dinosaur", image)
+    [
+        {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+        {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
+    ]
+    """
+    image_size = image.shape[:2]
+    image_b64 = convert_to_b64(image)
+    request_data = {
+        "prompt": prompt,
+        "image": image_b64,
+        "tool": "open_vocab_detection",
+        "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+    }
+    data: Dict[str, Any] = _send_inference_request(request_data, "tools")
+    return_data = []
+    for i in range(len(data["bboxes"])):
+        return_data.append(
+            {
+                "score": round(data["scores"][i], 2),
+                "label": data["labels"][i].strip(),
+                "bbox": normalize_bbox(data["bboxes"][i], image_size),
+            }
+        )
+    return return_data
+
+
 def grounding_sam(
     prompt: str,
     image: np.ndarray,
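
A short end-to-end sketch combining the new detector with helpers the package already exports (file names are illustrative; load_image, overlay_bounding_boxes and save_image are assumed to keep their existing signatures):

```python
from vision_agent.tools import load_image, overlay_bounding_boxes, owl_v2, save_image

image = load_image("street.jpg")      # np.ndarray
dets = owl_v2("car. person", image)   # [{"score": ..., "label": ..., "bbox": [...]}, ...]
annotated = overlay_bounding_boxes(image, dets)
save_image(annotated, "street_annotated.jpg")
```
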
@@ -253,8 +318,8 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
     return ocr_results
 
 
-def zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
-    """'
+def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
+    """'loca_zero_shot_counting' is a tool that counts the dominant foreground object given
     an image and no other information about the content. It returns only the count of
     the objects in the image.
 
@@ -267,7 +332,7 @@ def zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
 
     Example
     -------
-    >>>
+    >>> loca_zero_shot_counting(image)
     {'count': 45},
     """
 
@@ -281,10 +346,10 @@ def zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
     return resp_data
 
 
-def visual_prompt_counting(
+def loca_visual_prompt_counting(
     image: np.ndarray, visual_prompt: Dict[str, List[float]]
 ) -> Dict[str, Any]:
-    """'
+    """'loca_visual_prompt_counting' is a tool that counts the dominant foreground object
     given an image and a visual prompt which is a bounding box describing the object.
     It returns only the count of the objects in the image.
 
@@ -297,7 +362,7 @@ def visual_prompt_counting(
 
     Example
     -------
-    >>>
+    >>> loca_visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})
     {'count': 45},
     """
 
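
Usage mirrors the docstring examples added above; the visual prompt is a dict with a normalized "bbox" (outputs shown are the docstrings' illustrative values):

```python
loca_zero_shot_counting(image)                                       # {'count': 45}
loca_visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})  # {'count': 45}
```
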
@@ -316,8 +381,8 @@ def visual_prompt_counting(
     return resp_data
 
 
-def image_question_answering(prompt: str, image: np.ndarray) -> str:
-    """'
+def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
+    """'git_vqa_v2' is a tool that can answer questions about the visual
     contents of an image given a question and an image. It returns an answer to the
     question
 
@@ -331,7 +396,7 @@ def image_question_answering(prompt: str, image: np.ndarray) -> str:
 
     Example
     -------
-    >>>
+    >>> git_vqa_v2('What is the cat doing ?', image)
     'drinking milk'
     """
 
@@ -376,8 +441,62 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
     return resp_data
 
 
-def image_caption(image: np.ndarray) -> str:
-    """'
+def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
+    """'vit_image_classification' is a tool that can classify an image. It returns a
+    list of classes and their probability scores based on image content.
+
+    Parameters:
+        image (np.ndarray): The image to classify or tag
+
+    Returns:
+        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
+        contains a list of labels and other a list of scores.
+
+    Example
+    -------
+    >>> vit_image_classification(image)
+    {"labels": ["leopard", "lemur, otter", "bird"], "scores": [0.68, 0.30, 0.02]},
+    """
+
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "image_classification",
+    }
+    resp_data = _send_inference_request(data, "tools")
+    resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
+    return resp_data
+
+
+def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
+    """'vit_nsfw_classification' is a tool that can classify an image as 'nsfw' or 'normal'.
+    It returns the predicted label and their probability scores based on image content.
+
+    Parameters:
+        image (np.ndarray): The image to classify or tag
+
+    Returns:
+        Dict[str, Any]: A dictionary containing the labels and scores. One dictionary
+        contains a list of labels and other a list of scores.
+
+    Example
+    -------
+    >>> vit_nsfw_classification(image)
+    {"labels": "normal", "scores": 0.68},
+    """
+
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "nsfw_image_classification",
+    }
+    resp_data = _send_inference_request(data, "tools")
+    resp_data["scores"] = round(resp_data["scores"], 4)
+    return resp_data
+
+
+def blip_image_caption(image: np.ndarray) -> str:
+    """'blip_image_caption' is a tool that can caption an image based on its contents. It
     returns a text describing the image.
 
     Parameters:
@@ -388,7 +507,7 @@ def image_caption(image: np.ndarray) -> str:
 
     Example
     -------
-    >>>
+    >>> blip_image_caption(image)
     'This image contains a cat sitting on a table with a bowl of milk.'
     """
 
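
Taken together, the renamed and new single-image tools look like this in use (outputs are the illustrative values from the docstrings above):

```python
vit_image_classification(image)   # {"labels": ["leopard", ...], "scores": [0.68, ...]}
vit_nsfw_classification(image)    # {"labels": "normal", "scores": 0.68}
blip_image_caption(image)         # 'This image contains a cat sitting on a table ...'
git_vqa_v2("What is the cat doing ?", image)  # 'drinking milk'
```
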
@@ -543,7 +662,7 @@ def save_image(image: np.ndarray, file_path: str) -> None:
     """
     from IPython.display import display
 
-    pil_image = Image.fromarray(image.astype(np.uint8))
+    pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
     display(pil_image)
     pil_image.save(file_path)
 
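
The added .convert("RGB") most likely exists because Pillow refuses to write images that carry an alpha channel (for example RGBA PNGs or HEIF frames) to JPEG; the diff itself does not state the motivation. A standalone illustration:

```python
from PIL import Image
import numpy as np

rgba = Image.fromarray(np.zeros((4, 4, 4), dtype=np.uint8))  # mode "RGBA"
# rgba.save("out.jpg")               # OSError: cannot write mode RGBA as JPEG
rgba.convert("RGB").save("out.jpg")  # works once the alpha channel is dropped
```
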
@@ -792,15 +911,17 @@ def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
 
 
 TOOLS = [
-
+    owl_v2,
     grounding_sam,
     extract_frames,
     ocr,
     clip,
-
-
-
-
+    vit_image_classification,
+    vit_nsfw_classification,
+    loca_zero_shot_counting,
+    loca_visual_prompt_counting,
+    git_vqa_v2,
+    blip_image_caption,
     closest_mask_distance,
     closest_box_distance,
     save_json,

{vision_agent-0.2.62.dist-info → vision_agent-0.2.64.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.62
+Version: 0.2.64
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -21,6 +21,7 @@ Requires-Dist: openai (>=1.0.0,<2.0.0)
 Requires-Dist: opencv-python (>=4.0.0,<5.0.0)
 Requires-Dist: pandas (>=2.0.0,<3.0.0)
 Requires-Dist: pillow (>=10.0.0,<11.0.0)
+Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
 Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
 Requires-Dist: requests (>=2.0.0,<3.0.0)
 Requires-Dist: rich (>=13.7.1,<14.0.0)
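
The new pillow-heif dependency backs the register_heif_opener() calls added in tools.py and in the generated code's common imports; once registered, Pillow can open HEIC/HEIF files. A sketch (file name illustrative):

```python
from pillow_heif import register_heif_opener
from PIL import Image

register_heif_opener()          # registers the HEIF/HEIC plugin with Pillow
img = Image.open("photo.heic")  # raises UnidentifiedImageError without the opener
```
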
{vision_agent-0.2.62.dist-info → vision_agent-0.2.64.dist-info}/RECORD CHANGED

@@ -1,23 +1,23 @@
 vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
 vision_agent/agent/__init__.py,sha256=IUwfbPMcT8X_rnXMLmI8gJ4ltsHy_XSs9eLiKURJxeY,81
 vision_agent/agent/agent.py,sha256=ZK-5lOtd9-eD9aWcXssJpnOyvZuO7_5hAmnb-6sWVe8,569
-vision_agent/agent/vision_agent.py,sha256=
+vision_agent/agent/vision_agent.py,sha256=Bk2PkYj7dqawbGyapmgHtKFEZMr1BmqCvzkTwbMxTkw,25228
 vision_agent/agent/vision_agent_prompts.py,sha256=bMXdZYf6kbikHn__tCGrYE1QvXC88EmpMpM_97V6szA,8472
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/lmm/__init__.py,sha256=3ro5lCIoS3DgEghOy0SPFrEhYvFnWZpVC5S5kSnIx6A,57
-vision_agent/lmm/lmm.py,sha256=
-vision_agent/tools/__init__.py,sha256=
+vision_agent/lmm/lmm.py,sha256=ihmLYL_291HnELyMtfFKTCnPWnmuoEH2DDFmc4ynMG8,8945
+vision_agent/tools/__init__.py,sha256=aE1O8cMeLDPO50Sc-CuAQ_Akh0viz7vBxDcVeZNqsA0,1604
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=wzRacbUpqk9hhfX_Y08rL8qP0XCN2w-8IZoYLi3Upn4,869
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=Qzwm_wu6KJh-3DSoNmZ4Lv8jCCNJMwKIPBFxxN6FmDo,31397
 vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
 vision_agent/utils/execute.py,sha256=GqoAodxtwTPBr1nujPTsWiZO2rBGvWVXTe8lgxY4d_g,20603
 vision_agent/utils/image_utils.py,sha256=_cdiS5YrLzqkq_ZgFUO897m5M4_SCIThwUy4lOklfB8,7700
 vision_agent/utils/sim.py,sha256=ci6Eta73dDgLP1Ajtknbgmf1g8aAvBHqlVQvBuLMKXQ,4427
 vision_agent/utils/type_defs.py,sha256=BlI8ywWHAplC7kYWLvt4AOdnKpEW3qWEFm-GEOSkrFQ,1792
 vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.64.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.64.dist-info/METADATA,sha256=nfNPxFtWxFNev_MmWviQEL-xOliYZAicZq1uqQVPHwM,8363
+vision_agent-0.2.64.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.64.dist-info/RECORD,,

{vision_agent-0.2.62.dist-info → vision_agent-0.2.64.dist-info}/LICENSE: file without changes
{vision_agent-0.2.62.dist-info → vision_agent-0.2.64.dist-info}/WHEEL: file without changes