vision-agent 0.2.74__tar.gz → 0.2.76__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vision_agent-0.2.74 → vision_agent-0.2.76}/PKG-INFO +4 -3
- {vision_agent-0.2.74 → vision_agent-0.2.76}/README.md +2 -2
- {vision_agent-0.2.74 → vision_agent-0.2.76}/pyproject.toml +2 -1
- {vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/lmm/lmm.py +5 -2
- {vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/tools/__init__.py +9 -0
- {vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/tools/tools.py +373 -12
- {vision_agent-0.2.74 → vision_agent-0.2.76}/LICENSE +0 -0
- {vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/utils/__init__.py +0 -0
- {vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/utils/image_utils.py +0 -0
- {vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/utils/type_defs.py +0 -0
- {vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/utils/video.py +0 -0

{vision_agent-0.2.74 → vision_agent-0.2.76}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.74
+Version: 0.2.76
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

@@ -23,6 +23,7 @@ Requires-Dist: pandas (>=2.0.0,<3.0.0)
 Requires-Dist: pillow (>=10.0.0,<11.0.0)
 Requires-Dist: pillow-heif (>=0.16.0,<0.17.0)
 Requires-Dist: pydantic-settings (>=2.2.1,<3.0.0)
+Requires-Dist: pytube (==15.0.0)
 Requires-Dist: requests (>=2.0.0,<3.0.0)
 Requires-Dist: rich (>=13.7.1,<14.0.0)
 Requires-Dist: scipy (>=1.13.0,<1.14.0)

@@ -182,8 +183,8 @@ you. For example:
 
 ```python
 >>> import vision_agent as va
->>>
->>> detector =
+>>> lmm = va.lmm.OpenAILMM()
+>>> detector = lmm.generate_detector("Can you build a jar detector for me?")
 >>> detector(va.tools.load_image("jar.jpg"))
 [{"labels": ["jar",],
   "scores": [0.99],

{vision_agent-0.2.74 → vision_agent-0.2.76}/README.md

@@ -145,8 +145,8 @@ you. For example:
 
 ```python
 >>> import vision_agent as va
->>>
->>> detector =
+>>> lmm = va.lmm.OpenAILMM()
+>>> detector = lmm.generate_detector("Can you build a jar detector for me?")
 >>> detector(va.tools.load_image("jar.jpg"))
 [{"labels": ["jar",],
   "scores": [0.99],

{vision_agent-0.2.74 → vision_agent-0.2.76}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.74"
+version = "0.2.76"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"

@@ -38,6 +38,7 @@ e2b = "^0.17.1"
 e2b-code-interpreter = "^0.0.9"
 tenacity = "^8.3.0"
 pillow-heif = "^0.16.0"
+pytube = "15.0.0"
 
 [tool.poetry.group.dev.dependencies]
 autoflake = "1.*"

{vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/lmm/lmm.py

@@ -164,6 +164,7 @@ class OpenAILMM(LMM):
                 {"role": "system", "content": SYSTEM_PROMPT},
                 {"role": "user", "content": prompt},
             ],
+            response_format={"type": "json_object"},
         )
 
         try:

@@ -179,7 +180,7 @@ class OpenAILMM(LMM):
         return lambda x: T.clip(x, params["prompt"])
 
     def generate_detector(self, question: str) -> Callable:
-        api_doc = T.get_tool_documentation([T.
+        api_doc = T.get_tool_documentation([T.owl_v2])
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
             model=self.model_name,

@@ -187,6 +188,7 @@ class OpenAILMM(LMM):
                 {"role": "system", "content": SYSTEM_PROMPT},
                 {"role": "user", "content": prompt},
             ],
+            response_format={"type": "json_object"},
        )
 
         try:

@@ -199,7 +201,7 @@ class OpenAILMM(LMM):
             )
             raise ValueError("Failed to decode response")
 
-        return lambda x: T.
+        return lambda x: T.owl_v2(params["prompt"], x)
 
     def generate_segmentor(self, question: str) -> Callable:
         api_doc = T.get_tool_documentation([T.grounding_sam])

@@ -210,6 +212,7 @@ class OpenAILMM(LMM):
                 {"role": "system", "content": SYSTEM_PROMPT},
                 {"role": "user", "content": prompt},
             ],
+            response_format={"type": "json_object"},
         )
 
         try:
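
Taken together, the lmm.py changes re-point `generate_detector` at `owl_v2` and force JSON-mode completions from the OpenAI chat API. A minimal usage sketch, mirroring the README example updated in this release (the image path is a placeholder, and a configured OpenAI API key plus the Landing AI tools endpoint are assumed):

```python
import vision_agent as va

# The LMM picks a text prompt for owl_v2 from the question; the returned
# callable then runs owl_v2(params["prompt"], image) against the hosted tools API.
lmm = va.lmm.OpenAILMM()
detector = lmm.generate_detector("Can you build a jar detector for me?")

detections = detector(va.tools.load_image("jar.jpg"))  # "jar.jpg" is a placeholder image
print(detections)  # e.g. [{"labels": ["jar"], "scores": [0.99], ...}]
```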

{vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/tools/__init__.py

@@ -12,10 +12,18 @@ from .tools import (
     closest_box_distance,
     closest_mask_distance,
     extract_frames,
+    florencev2_image_caption,
     get_tool_documentation,
+    florencev2_object_detection,
+    detr_segmentation,
+    depth_anything_v2,
+    generate_soft_edge_image,
+    dpt_hybrid_midas,
+    generate_pose_image,
     git_vqa_v2,
     grounding_dino,
     grounding_sam,
+    florencev2_roberta_vqa,
     load_image,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,

@@ -27,6 +35,7 @@ from .tools import (
     save_image,
     save_json,
     save_video,
+    template_match,
     vit_image_classification,
     vit_nsfw_classification,
 )

{vision_agent-0.2.74 → vision_agent-0.2.76}/vision_agent/tools/tools.py

@@ -14,6 +14,7 @@ import requests
 from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
+from pytube import YouTube  # type: ignore
 
 from vision_agent.tools.tool_utils import send_inference_request
 from vision_agent.utils import extract_frames_from_video

@@ -126,7 +127,7 @@ def owl_v2(
 ) -> List[Dict[str, Any]]:
     """'owl_v2' is a tool that can detect and count multiple objects given a text
     prompt such as category names or referring expressions. The categories in text prompt
-    are separated by commas
+    are separated by commas. It returns a list of bounding boxes with
     normalized coordinates, label names and associated probability scores.
 
     Parameters:

@@ -136,7 +137,6 @@ def owl_v2(
             to 0.10.
         iou_threshold (float, optional): The threshold for the Intersection over Union
             (IoU). Defaults to 0.10.
-        model_size (str, optional): The size of the model to use.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and

@@ -180,7 +180,7 @@ def grounding_sam(
     box_threshold: float = 0.20,
     iou_threshold: float = 0.20,
 ) -> List[Dict[str, Any]]:
-    """'grounding_sam' is a tool that can
+    """'grounding_sam' is a tool that can segment multiple objects given a
     text prompt such as category names or referring expressions. The categories in text
     prompt are separated by commas or periods. It returns a list of bounding boxes,
     label names, mask file names and associated probability scores.

@@ -242,12 +242,12 @@ def grounding_sam(
 def extract_frames(
     video_uri: Union[str, Path], fps: float = 0.5
 ) -> List[Tuple[np.ndarray, float]]:
-    """'extract_frames' extracts frames from a video
-    timestamp), where timestamp is the relative
-    captured. The frame is a numpy array.
+    """'extract_frames' extracts frames from a video which can be a file path or youtube
+    link, returns a list of tuples (frame, timestamp), where timestamp is the relative
+    time in seconds where the frame was captured. The frame is a numpy array.
 
     Parameters:
-        video_uri (Union[str, Path]): The path to the video file
+        video_uri (Union[str, Path]): The path to the video file or youtube link
         fps (float, optional): The frame rate per second to extract the frames. Defaults
             to 0.5.
 

@@ -261,6 +261,29 @@ def extract_frames(
     [(frame1, 0.0), (frame2, 0.5), ...]
     """
 
+    if str(video_uri).startswith(
+        (
+            "http://www.youtube.com/",
+            "https://www.youtube.com/",
+            "http://youtu.be/",
+            "https://youtu.be/",
+        )
+    ):
+        with tempfile.TemporaryDirectory() as temp_dir:
+            yt = YouTube(str(video_uri))
+            # Download the highest resolution video
+            video = (
+                yt.streams.filter(progressive=True, file_extension="mp4")
+                .order_by("resolution")
+                .desc()
+                .first()
+            )
+            if not video:
+                raise Exception("No suitable video stream found")
+            video_file_path = video.download(output_path=temp_dir)
+
+            return extract_frames_from_video(video_file_path, fps)
+
     return extract_frames_from_video(str(video_uri), fps)
 
 
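
With the pytube branch above, `extract_frames` accepts a YouTube URL as well as a local file path. A rough sketch of both call patterns (the file name and URL are placeholders, and the YouTube path depends on pytube finding a progressive mp4 stream):

```python
from vision_agent.tools import extract_frames

# Local file: frames are sampled straight from the video on disk.
frames = extract_frames("clip.mp4", fps=0.5)  # placeholder file name

# YouTube link: the highest-resolution progressive mp4 stream is downloaded
# to a temporary directory first, then sampled the same way.
yt_frames = extract_frames("https://www.youtube.com/watch?v=VIDEO_ID", fps=0.5)  # placeholder URL

for frame, timestamp in frames:
    print(frame.shape, timestamp)  # numpy array and seconds into the video
```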
@@ -381,6 +404,35 @@ def loca_visual_prompt_counting(
     return resp_data
 
 
+def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
+    """'florencev2_roberta_vqa' is a tool that takes an image and analyzes
+    its contents, generates detailed captions and then tries to answer the given
+    question using the generated context. It returns text as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the image
+        image (np.ndarray): The reference image used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+    >>> florencev2_roberta_vqa('What is the top left animal in this image ?', image)
+    'white tiger'
+    """
+
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "prompt": prompt,
+        "tool": "image_question_answering_with_context",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return answer["text"][0]  # type: ignore
+
+
 def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
     """'git_vqa_v2' is a tool that can answer questions about the visual
     contents of an image given a question and an image. It returns an answer to the

@@ -391,8 +443,7 @@ def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
         image (np.ndarray): The reference image used for the question
 
     Returns:
-        str: A string which is the answer to the given prompt.
-        image contains a cat sitting on a table with a bowl of milk.'}.
+        str: A string which is the answer to the given prompt.
 
     Example
     -------
@@ -521,6 +572,309 @@ def blip_image_caption(image: np.ndarray) -> str:
     return answer["text"][0]  # type: ignore
 
 
+def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str:
+    """'florencev2_image_caption' is a tool that can caption or describe an image based
+    on its contents. It returns a text describing the image.
+
+    Parameters:
+        image (np.ndarray): The image to caption
+        detail_caption (bool): If True, the caption will be as detailed as possible else
+            the caption will be a brief description.
+
+    Returns:
+        str: A string which is the caption for the given image.
+
+    Example
+    -------
+    >>> florencev2_image_caption(image, False)
+    'This image contains a cat sitting on a table with a bowl of milk.'
+    """
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "florence2_image_captioning",
+        "detail_caption": detail_caption,
+    }
+
+    answer = send_inference_request(data, "tools")
+    return answer["text"][0]  # type: ignore
+
+
+def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florencev2_object_detection' is a tool that can detect common objects in an
+    image without any text prompt or thresholding. It returns a list of detected objects
+    as labels and their location as bounding boxes.
+
+    Parameters:
+        image (np.ndarray): The image to used to detect objects
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box. The scores are always 1.0 and cannot be thresholded
+
+    Example
+    -------
+    >>> florencev2_object_detection(image)
+    [
+        {'score': 1.0, 'label': 'window', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+        {'score': 1.0, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
+        {'score': 1.0, 'label': 'person', 'bbox': [0.34, 0.21, 0.85, 0.5},
+    ]
+    """
+    image_size = image.shape[:2]
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "object_detection",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = []
+    for i in range(len(answer["bboxes"])):
+        return_data.append(
+            {
+                "score": round(answer["scores"][i], 2),
+                "label": answer["labels"][i],
+                "bbox": normalize_bbox(answer["bboxes"][i], image_size),
+            }
+        )
+    return return_data
+
+
+def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'detr_segmentation' is a tool that can segment common objects in an
+    image without any text prompt. It returns a list of detected objects
+    as labels, their regions as masks and their scores.
+
+    Parameters:
+        image (np.ndarray): The image used to segment things and objects
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label
+            and mask of the detected objects. The mask is binary 2D numpy array where 1
+            indicates the object and 0 indicates the background.
+
+    Example
+    -------
+    >>> detr_segmentation(image)
+    [
+        {
+            'score': 0.45,
+            'label': 'window',
+            'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        },
+        {
+            'score': 0.70,
+            'label': 'bird',
+            'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0],
+                ...,
+                [0, 0, 0, ..., 0, 0, 0],
+                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        },
+    ]
+    """
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "panoptic_segmentation",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = []
+
+    for i in range(len(answer["scores"])):
+        return_data.append(
+            {
+                "score": round(answer["scores"][i], 2),
+                "label": answer["labels"][i],
+                "mask": rle_decode(
+                    mask_rle=answer["masks"][i], shape=answer["mask_shape"][0]
+                ),
+            }
+        )
+    return return_data
+
+
+def depth_anything_v2(image: np.ndarray) -> np.ndarray:
+    """'depth_anything_v2' is a tool that runs depth_anythingv2 model to generate a
+    depth image from a given RGB image. The returned depth image is monochrome and
+    represents depth values as pixel intesities with pixel values ranging from 0 to 255.
+
+    Parameters:
+        image (np.ndarray): The image to used to generate depth image
+
+    Returns:
+        np.ndarray: A grayscale depth image with pixel values ranging from 0 to 255.
+
+    Example
+    -------
+    >>> depth_anything_v2(image)
+    array([[0, 0, 0, ..., 0, 0, 0],
+        [0, 20, 24, ..., 0, 100, 103],
+        ...,
+        [10, 11, 15, ..., 202, 202, 205],
+        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
+    """
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "generate_depth",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = np.array(b64_to_pil(answer["masks"][0]).convert("L"))
+    return return_data
+
+
+def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
+    """'generate_soft_edge_image' is a tool that runs Holistically Nested edge detection
+    to generate a soft edge image (HED) from a given RGB image. The returned image is
+    monochrome and represents object boundaries as soft white edges on black background
+
+    Parameters:
+        image (np.ndarray): The image to used to generate soft edge image
+
+    Returns:
+        np.ndarray: A soft edge image with pixel values ranging from 0 to 255.
+
+    Example
+    -------
+    >>> generate_soft_edge_image(image)
+    array([[0, 0, 0, ..., 0, 0, 0],
+        [0, 20, 24, ..., 0, 100, 103],
+        ...,
+        [10, 11, 15, ..., 202, 202, 205],
+        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
+    """
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "generate_hed",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = np.array(b64_to_pil(answer["masks"][0]).convert("L"))
+    return return_data
+
+
+def dpt_hybrid_midas(image: np.ndarray) -> np.ndarray:
+    """'dpt_hybrid_midas' is a tool that generates a normal mapped from a given RGB
+    image. The returned RGB image is texture mapped image of the surface normals and the
+    RGB values represent the surface normals in the x, y, z directions.
+
+    Parameters:
+        image (np.ndarray): The image to used to generate normal image
+
+    Returns:
+        np.ndarray: A mapped normal image with RGB pixel values indicating surface
+            normals in x, y, z directions.
+
+    Example
+    -------
+    >>> dpt_hybrid_midas(image)
+    array([[0, 0, 0, ..., 0, 0, 0],
+        [0, 20, 24, ..., 0, 100, 103],
+        ...,
+        [10, 11, 15, ..., 202, 202, 205],
+        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
+    """
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "generate_normal",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = np.array(b64_to_pil(answer["masks"][0]).convert("RGB"))
+    return return_data
+
+
+def generate_pose_image(image: np.ndarray) -> np.ndarray:
+    """'generate_pose_image' is a tool that generates a open pose bone/stick image from
+    a given RGB image. The returned bone image is RGB with the pose amd keypoints colored
+    and background as black.
+
+    Parameters:
+        image (np.ndarray): The image to used to generate pose image
+
+    Returns:
+        np.ndarray: A bone or pose image indicating the pose and keypoints
+
+    Example
+    -------
+    >>> generate_pose_image(image)
+    array([[0, 0, 0, ..., 0, 0, 0],
+        [0, 20, 24, ..., 0, 100, 103],
+        ...,
+        [10, 11, 15, ..., 202, 202, 205],
+        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
+    """
+    image_b64 = convert_to_b64(image)
+    data = {
+        "image": image_b64,
+        "tool": "generate_pose",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = np.array(b64_to_pil(answer["masks"][0]).convert("RGB"))
+    return return_data
+
+
+def template_match(
+    image: np.ndarray, template_image: np.ndarray
+) -> List[Dict[str, Any]]:
+    """'template_match' is a tool that can detect all instances of a template in
+    a given image. It returns the locations of the detected template, a corresponding
+    similarity score of the same
+
+    Parameters:
+        image (np.ndarray): The image used for searching the template
+        template_image (np.ndarray): The template image or crop to search in the image
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score and
+            bounding box of the detected template with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box.
+
+    Example
+    -------
+    >>> template_match(image, template)
+    [
+        {'score': 0.79, 'bbox': [0.1, 0.11, 0.35, 0.4]},
+        {'score': 0.38, 'bbox': [0.2, 0.21, 0.45, 0.5},
+    ]
+    """
+    image_size = image.shape[:2]
+    image_b64 = convert_to_b64(image)
+    template_image_b64 = convert_to_b64(template_image)
+    data = {
+        "image": image_b64,
+        "template": template_image_b64,
+        "tool": "template_match",
+    }
+
+    answer = send_inference_request(data, "tools")
+    return_data = []
+    for i in range(len(answer["bboxes"])):
+        return_data.append(
+            {
+                "score": round(answer["scores"][i], 2),
+                "bbox": normalize_bbox(answer["bboxes"][i], image_size),
+            }
+        )
+    return return_data
+
+
 def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float:
     """'closest_mask_distance' calculates the closest distance between two masks.
 
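
Every new tool above follows the same pattern: base64-encode the input with `convert_to_b64`, post it to the hosted "tools" endpoint via `send_inference_request`, and post-process the response. A brief sketch combining two of them (the image path and crop are placeholders, and a configured Landing AI API key is assumed):

```python
from vision_agent.tools import florencev2_image_caption, load_image, template_match

image = load_image("shelf.jpg")             # placeholder path
template = image[100:200, 100:200].copy()   # an arbitrary crop used as the template

caption = florencev2_image_caption(image, detail_caption=False)
matches = template_match(image, template)   # [{'score': ..., 'bbox': [xmin, ymin, xmax, ymax]}, ...]

print(caption)
print(matches[:3])
```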
@@ -733,7 +1087,7 @@ def overlay_bounding_boxes(
         image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
     )
     """
-    pil_image = Image.fromarray(image.astype(np.uint8))
+    pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
 
     if len(set([box["label"] for box in bboxes])) > len(COLORS):
         _LOGGER.warning(
@@ -920,8 +1274,14 @@ TOOLS = [
     vit_nsfw_classification,
     loca_zero_shot_counting,
     loca_visual_prompt_counting,
-
-
+    florencev2_roberta_vqa,
+    florencev2_image_caption,
+    florencev2_object_detection,
+    detr_segmentation,
+    depth_anything_v2,
+    generate_soft_edge_image,
+    dpt_hybrid_midas,
+    generate_pose_image,
     closest_mask_distance,
     closest_box_distance,
     save_json,

@@ -931,6 +1291,7 @@ TOOLS = [
     overlay_bounding_boxes,
     overlay_segmentation_masks,
     overlay_heat_map,
+    template_match,
 ]
 TOOLS_DF = get_tools_df(TOOLS)  # type: ignore
 TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore