vision-agent 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +39 -5
- vision_agent/image_utils.py +2 -2
- vision_agent/tools/tools.py +7 -14
- {vision_agent-0.2.1.dist-info → vision_agent-0.2.3.dist-info}/METADATA +6 -3
- {vision_agent-0.2.1.dist-info → vision_agent-0.2.3.dist-info}/RECORD +7 -7
- {vision_agent-0.2.1.dist-info → vision_agent-0.2.3.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.1.dist-info → vision_agent-0.2.3.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent.py
CHANGED
@@ -8,7 +8,12 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 from PIL import Image
 from tabulate import tabulate
 
-from vision_agent.image_utils import
+from vision_agent.image_utils import (
+    convert_to_b64,
+    overlay_bboxes,
+    overlay_heat_map,
+    overlay_masks,
+)
 from vision_agent.llm import LLM, OpenAILLM
 from vision_agent.lmm import LMM, OpenAILMM
 from vision_agent.tools import TOOLS
@@ -423,7 +428,7 @@ class VisionAgent(Agent):
     ):
         """VisionAgent constructor.
 
-        Parameters
+        Parameters:
             task_model: the model to use for task decomposition.
             answer_model: the model to use for reasoning and concluding the answer.
             reflect_model: the model to use for self reflection.
@@ -481,6 +486,17 @@ class VisionAgent(Agent):
         if self.report_progress_callback:
             self.report_progress_callback(description)
 
+    def _report_visualization_via_callback(
+        self, images: Sequence[Union[str, Path]]
+    ) -> None:
+        """This is intended for streaming the visualization images via the callback to the client side."""
+        if self.report_progress_callback:
+            self.report_progress_callback("<VIZ>")
+            if images:
+                for img in images:
+                    self.report_progress_callback(f"<IMG>{convert_to_b64(img)}</IMG>")
+            self.report_progress_callback("</VIZ>")
+
     def chat_with_workflow(
         self,
         chat: List[Dict[str, str]],
@@ -488,6 +504,21 @@ class VisionAgent(Agent):
         reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
     ) -> Tuple[str, List[Dict]]:
+        """Chat with the vision agent and return the final answer and all tool results.
+
+        Parameters:
+            chat: a conversation in the format of
+                [{"role": "user", "content": "describe your task here..."}].
+            image: the input image referenced in the chat parameter.
+            reference_data: a dictionary containing the reference image and mask. in the
+                format of {"image": "image.jpg", "mask": "mask.jpg}
+            visualize_output: whether to visualize the output.
+
+        Returns:
+            A tuple where the first item is the final answer and the second item is a
+            list of all the tool results. The last item in the tool results also
+            contains the visualized output.
+        """
         question = chat[0]["content"]
         if image:
             question += f" Image name: {image}"
@@ -577,9 +608,12 @@ class VisionAgent(Agent):
         )
 
         if visualize_output:
-
-
-
+            viz_images: Sequence[Union[str, Path]] = all_tool_results[-1][
+                "visualized_output"
+            ]
+            self._report_visualization_via_callback(viz_images)
+            for img in viz_images:
+                Image.open(img).show()
 
         return final_answer, all_tool_results
 
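The new _report_visualization_via_callback streams each visualized output through the progress callback as a base64-encoded image wrapped in <VIZ>, <IMG>...</IMG>, and </VIZ> markers. Below is a minimal client-side sketch of a callback that reassembles those payloads; it assumes the callback can be supplied to VisionAgent (the diff only shows the report_progress_callback attribute being used), and the names on_progress and collected are illustrative, not part of the package API.

# Client-side sketch: collect base64 PNG payloads streamed via the progress callback.
# Hooking it up (e.g. VisionAgent(report_progress_callback=on_progress)) is an assumption.
import base64
from pathlib import Path

collected: list = []
in_viz = False

def on_progress(message: str) -> None:
    global in_viz
    if message == "<VIZ>":
        in_viz = True
    elif message == "</VIZ>":
        in_viz = False
    elif in_viz and message.startswith("<IMG>") and message.endswith("</IMG>"):
        # Strip the markers, keeping only the base64 image payload.
        collected.append(message[len("<IMG>"):-len("</IMG>")])

# Later, decode each payload back into an image file on the client side.
for i, payload in enumerate(collected):
    Path(f"viz_{i}.png").write_bytes(base64.b64decode(payload))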
vision_agent/image_utils.py
CHANGED
@@ -4,7 +4,7 @@ import base64
 from importlib import resources
 from io import BytesIO
 from pathlib import Path
-from typing import Dict, Tuple, Union
+from typing import Dict, List, Tuple, Union
 
 import numpy as np
 from PIL import Image, ImageDraw, ImageFont
@@ -108,7 +108,7 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
         data = Image.open(data)
     if isinstance(data, Image.Image):
         buffer = BytesIO()
-        data.convert("RGB").save(buffer, format="
+        data.convert("RGB").save(buffer, format="PNG")
         return base64.b64encode(buffer.getvalue()).decode("utf-8")
     else:
         arr_bytes = data.tobytes()
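The convert_to_b64 change serializes PIL images as PNG before base64 encoding. A small round-trip sketch of that same encode/decode path is shown below; the helper names to_b64_png and from_b64_png are illustrative and not functions from the package.

# Sketch of the base64-PNG round trip performed by the branch shown above.
import base64
from io import BytesIO

from PIL import Image

def to_b64_png(img: Image.Image) -> str:
    buffer = BytesIO()
    img.convert("RGB").save(buffer, format="PNG")  # PNG is lossless
    return base64.b64encode(buffer.getvalue()).decode("utf-8")

def from_b64_png(data: str) -> Image.Image:
    return Image.open(BytesIO(base64.b64decode(data)))

# Round-trip a tiny test image to confirm the pixels survive encoding.
original = Image.new("RGB", (4, 4), color=(255, 0, 0))
assert list(from_b64_png(to_b64_png(original)).getdata()) == list(original.getdata())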
vision_agent/tools/tools.py
CHANGED
@@ -108,8 +108,7 @@ class CLIP(Tool):
 
 
 class ImageCaption(Tool):
-    r"""ImageCaption is a tool that can caption an image based on its contents
-    or tags.
+    r"""ImageCaption is a tool that can caption an image based on its contents or tags.
 
     Example
     -------
@@ -120,26 +119,20 @@ class ImageCaption(Tool):
     """
 
     name = "image_caption_"
-    description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image"
+    description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image."
     usage = {
         "required_parameters": [
             {"name": "image", "type": "str"},
         ],
         "examples": [
             {
-                "scenario": "Can you describe this image
+                "scenario": "Can you describe this image? Image name: cat.jpg",
                 "parameters": {"image": "cat.jpg"},
             },
             {
-                "scenario": "Can you caption this image with their main contents
+                "scenario": "Can you caption this image with their main contents? Image name: cat_dog.jpg",
                 "parameters": {"image": "cat_dog.jpg"},
            },
-            {
-                "scenario": "Can you build me a image captioning tool ? Image name: shirts.jpg",
-                "parameters": {
-                    "image": "shirts.jpg",
-                },
-            },
         ],
     }
 
@@ -487,15 +480,15 @@ class ZeroShotCounting(Tool):
         ],
         "examples": [
             {
-                "scenario": "Can you count the lids in the image
+                "scenario": "Can you count the lids in the image? Image name: lids.jpg",
                 "parameters": {"image": "lids.jpg"},
             },
             {
-                "scenario": "Can you count the total number of objects in this image
+                "scenario": "Can you count the total number of objects in this image? Image name: tray.jpg",
                 "parameters": {"image": "tray.jpg"},
             },
             {
-                "scenario": "Can you build me an object counting tool
+                "scenario": "Can you build me an object counting tool? Image name: shirts.jpg",
                 "parameters": {
                     "image": "shirts.jpg",
                 },
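The tools.py edits tighten the example "scenario" strings so each one ends with an explicit "Image name: ..." reference. Below is an illustrative usage dict for a hypothetical tool following that same pattern (the structure mirrors the diff above; the scenarios themselves are made up), plus a quick check of the convention.

# Illustrative usage dict for a hypothetical tool; structure follows the pattern in the diff.
usage = {
    "required_parameters": [
        {"name": "image", "type": "str"},
    ],
    "examples": [
        {
            "scenario": "Can you describe this image? Image name: cat.jpg",
            "parameters": {"image": "cat.jpg"},
        },
        {
            "scenario": "Can you count the objects here? Image name: tray.jpg",
            "parameters": {"image": "tray.jpg"},
        },
    ],
}

# Sanity check: every example scenario names its image.
assert all("Image name:" in ex["scenario"] for ex in usage["examples"])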
{vision_agent-0.2.1.dist-info → vision_agent-0.2.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.1
+Version: 0.2.3
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -126,15 +126,18 @@ you. For example:
 | Tool | Description |
 | --- | --- |
 | CLIP | CLIP is a tool that can classify or tag any image given a set of input classes or tags. |
+| ImageCaption| ImageCaption is a tool that can generate a caption for an image. |
 | GroundingDINO | GroundingDINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. |
 | GroundingSAM | GroundingSAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. |
-|
+| DINOv | DINOv is a tool that can detect arbitrary objects with using a referring mask. |
+| ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. |
 | BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. |
 | SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. |
 | BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
 | SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
-|
+| BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
+| BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is good for check if one box is contained within another box. |
 | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image |
 | VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and visual prompt |
{vision_agent-0.2.1.dist-info → vision_agent-0.2.3.dist-info}/RECORD
CHANGED
@@ -5,21 +5,21 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
 vision_agent/agent/easytool_prompts.py,sha256=zdQQw6WpXOmvwOMtlBlNKY5a3WNlr65dbUvMIGiqdeo,4526
 vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=
+vision_agent/agent/vision_agent.py,sha256=6AtVaEQL0ksg1QkUBn_YhytYjRfH7-M4q7G6pnds9Ds,25002
 vision_agent/agent/vision_agent_prompts.py,sha256=W3Z72FpUt71UIJSkjAcgtQqxeMqkYuATqHAN5fYY26c,7342
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
-vision_agent/image_utils.py,sha256=
+vision_agent/image_utils.py,sha256=YvP5KE9NrWdgJKuHW2NR1glzfObkxtcXBknpmj3Gsbs,7554
 vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
 vision_agent/llm/llm.py,sha256=gwDQ9-p9wEn24xi1019e5jzTGQg4xWDSqBCsqIqGcU4,5168
 vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
 vision_agent/lmm/lmm.py,sha256=FjxCuIk0KXuWnfY4orVmdyhJW2I4C6i5QNNEXk7gybk,10197
 vision_agent/tools/__init__.py,sha256=BlfxqbYkB0oODhnSmQg1UyzQm73AvvjCjrIiOWBIYDs,328
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=Cwh7GNSnCYxyKKgusHlf-Cqd9NBjlbZG7d-GauQJCwI,34751
 vision_agent/tools/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
 vision_agent/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.3.dist-info/METADATA,sha256=cQnQTRlWBxf0aVwsMoJS4TiiAtN3SbU00nlCrbNNb9w,6748
+vision_agent-0.2.3.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.3.dist-info/RECORD,,
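The RECORD entries above pair each file with sha256=<digest> and a byte size, where the digest is the urlsafe base64 encoding of the SHA-256 hash with padding stripped, as in the standard wheel RECORD format. A small sketch for recomputing such a line locally follows; the record_entry helper is illustrative and assumes you have the extracted 0.2.3 wheel contents on disk.

# Sketch: recompute a RECORD line for a file from the extracted wheel.
import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"

# e.g. record_entry("vision_agent/image_utils.py") should reproduce the new
# RECORD line when run against the unpacked 0.2.3 wheel.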
{vision_agent-0.2.1.dist-info → vision_agent-0.2.3.dist-info}/LICENSE
File without changes
{vision_agent-0.2.1.dist-info → vision_agent-0.2.3.dist-info}/WHEEL
File without changes