vision-agent 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +28 -12
- vision_agent/llm/llm.py +5 -0
- vision_agent/lmm/lmm.py +13 -4
- vision_agent/tools/__init__.py +4 -0
- vision_agent/tools/tools.py +233 -20
- {vision_agent-0.2.2.dist-info → vision_agent-0.2.4.dist-info}/METADATA +34 -5
- {vision_agent-0.2.2.dist-info → vision_agent-0.2.4.dist-info}/RECORD +9 -9
- {vision_agent-0.2.2.dist-info → vision_agent-0.2.4.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.2.dist-info → vision_agent-0.2.4.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent.py
CHANGED
@@ -377,6 +377,7 @@ def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]
             "dinov_",
             "zero_shot_counting_",
             "visual_prompt_counting_",
+            "ocr_",
         ]:
             continue
 
@@ -428,7 +429,7 @@ class VisionAgent(Agent):
     ):
         """VisionAgent constructor.
 
-        Parameters
+        Parameters:
            task_model: the model to use for task decomposition.
            answer_model: the model to use for reasoning and concluding the answer.
            reflect_model: the model to use for self reflection.
@@ -504,24 +505,39 @@ class VisionAgent(Agent):
         reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
     ) -> Tuple[str, List[Dict]]:
+        """Chat with the vision agent and return the final answer and all tool results.
+
+        Parameters:
+            chat: a conversation in the format of
+                [{"role": "user", "content": "describe your task here..."}].
+            image: the input image referenced in the chat parameter.
+            reference_data: a dictionary containing the reference image and mask. in the
+                format of {"image": "image.jpg", "mask": "mask.jpg}
+            visualize_output: whether to visualize the output.
+
+        Returns:
+            A tuple where the first item is the final answer and the second item is a
+            list of all the tool results. The last item in the tool results also
+            contains the visualized output.
+        """
         question = chat[0]["content"]
         if image:
             question += f" Image name: {image}"
         if reference_data:
-
-            "image
-
-
-
-
-
-            visual_prompt_data = (
-                f"Reference mask: {reference_data['mask']}"
+            question += (
+                f" Reference image: {reference_data['image']}"
+                if "image" in reference_data
+                else ""
+            )
+            question += (
+                f" Reference mask: {reference_data['mask']}"
                 if "mask" in reference_data
-                else
+                else ""
             )
             question += (
-                f" Reference
+                f" Reference bbox: {reference_data['bbox']}"
+                if "bbox" in reference_data
+                else ""
             )
 
         reflections = ""
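For reference, the rewritten block appends each piece of reference data to the question only when its key is present. A minimal standalone sketch of that behaviour (the values below are made up for illustration and are not part of the package):

```python
# Standalone sketch of the reference_data handling added above; values are
# illustrative only.
question = "Can you detect the object in the image? Image name: image.jpg"
reference_data = {"image": "ref.jpg", "mask": "ref_mask.jpg"}

question += (
    f" Reference image: {reference_data['image']}" if "image" in reference_data else ""
)
question += (
    f" Reference mask: {reference_data['mask']}" if "mask" in reference_data else ""
)
question += (
    f" Reference bbox: {reference_data['bbox']}" if "bbox" in reference_data else ""
)

print(question)
# Can you detect the object in the image? Image name: image.jpg Reference image: ref.jpg Reference mask: ref_mask.jpg
```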
vision_agent/llm/llm.py
CHANGED
@@ -131,6 +131,11 @@ class OpenAILLM(LLM):
     def generate_zero_shot_counter(self, question: str) -> Callable:
         return lambda x: ZeroShotCounting()(**{"image": x})
 
+    def generate_image_qa_tool(self, question: str) -> Callable:
+        from vision_agent.tools import ImageQuestionAnswering
+
+        return lambda x: ImageQuestionAnswering()(**{"prompt": question, "image": x})
+
 
 class AzureOpenAILLM(OpenAILLM):
     def __init__(
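The new generate_image_qa_tool mirrors the other generate_* factories: it captures the question and returns a callable that runs the ImageQuestionAnswering tool on an image. A hedged usage sketch (assumes OPENAI_API_KEY is set for the OpenAILLM constructor; car.jpg is a made-up file name):

```python
from vision_agent.llm.llm import OpenAILLM

llm = OpenAILLM()  # assumption: default construction picks up OPENAI_API_KEY
qa_tool = llm.generate_image_qa_tool("What color is the car?")

# The returned callable forwards the fixed prompt plus the image to
# ImageQuestionAnswering, which posts to the hosted "tools" endpoint.
result = qa_tool("car.jpg")
print(result)  # e.g. {"text": "The car is red."}
```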
vision_agent/lmm/lmm.py
CHANGED
@@ -11,11 +11,7 @@ from openai import AzureOpenAI, OpenAI
 
 from vision_agent.tools import (
     CHOOSE_PARAMS,
-    CLIP,
     SYSTEM_PROMPT,
-    GroundingDINO,
-    GroundingSAM,
-    ZeroShotCounting,
 )
 
 _LOGGER = logging.getLogger(__name__)
@@ -205,6 +201,8 @@ class OpenAILMM(LMM):
         return cast(str, response.choices[0].message.content)
 
     def generate_classifier(self, question: str) -> Callable:
+        from vision_agent.tools import CLIP
+
         api_doc = CLIP.description + "\n" + str(CLIP.usage)
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
@@ -228,6 +226,8 @@ class OpenAILMM(LMM):
         return lambda x: CLIP()(**{"prompt": params["prompt"], "image": x})
 
     def generate_detector(self, question: str) -> Callable:
+        from vision_agent.tools import GroundingDINO
+
         api_doc = GroundingDINO.description + "\n" + str(GroundingDINO.usage)
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
@@ -251,6 +251,8 @@ class OpenAILMM(LMM):
         return lambda x: GroundingDINO()(**{"prompt": params["prompt"], "image": x})
 
     def generate_segmentor(self, question: str) -> Callable:
+        from vision_agent.tools import GroundingSAM
+
         api_doc = GroundingSAM.description + "\n" + str(GroundingSAM.usage)
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
@@ -274,8 +276,15 @@ class OpenAILMM(LMM):
         return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x})
 
     def generate_zero_shot_counter(self, question: str) -> Callable:
+        from vision_agent.tools import ZeroShotCounting
+
         return lambda x: ZeroShotCounting()(**{"image": x})
 
+    def generate_image_qa_tool(self, question: str) -> Callable:
+        from vision_agent.tools import ImageQuestionAnswering
+
+        return lambda x: ImageQuestionAnswering()(**{"prompt": question, "image": x})
+
 
 class AzureOpenAILMM(OpenAILMM):
     def __init__(
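The CLIP, GroundingDINO, GroundingSAM and ZeroShotCounting imports move from module level into the individual generate_* methods, presumably because tools.py now imports OpenAILMM from vision_agent.lmm (see the tools.py diff below), which would otherwise create a circular import. Usage of the factories is unchanged; a hedged sketch (assumes an OpenAI key is configured and that the listed image files exist, which are made-up names):

```python
from vision_agent.lmm import OpenAILMM

lmm = OpenAILMM()

detector = lmm.generate_detector("Can you detect the cars in this image?")
print(detector("street.jpg"))  # GroundingDINO called with the chosen prompt

counter = lmm.generate_zero_shot_counter("How many bottles are on the shelf?")
print(counter("shelf.jpg"))  # ZeroShotCounting output

qa = lmm.generate_image_qa_tool("What does the sign say?")
print(qa("sign.jpg"))  # ImageQuestionAnswering output, e.g. {"text": "..."}
```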
vision_agent/tools/__init__.py
CHANGED
@@ -1,6 +1,7 @@
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 from .tools import ( # Counter,
     CLIP,
+    OCR,
     TOOLS,
     BboxArea,
     BboxIoU,
@@ -13,7 +14,10 @@ from .tools import ( # Counter,
     ImageCaption,
     ZeroShotCounting,
     VisualPromptCounting,
+    VisualQuestionAnswering,
+    ImageQuestionAnswering,
     SegArea,
     SegIoU,
     Tool,
+    register_tool,
 )
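With these additions, the new symbols are importable directly from the package, for example:

```python
from vision_agent.tools import (
    OCR,
    ImageQuestionAnswering,
    VisualQuestionAnswering,
    register_tool,
)
```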
vision_agent/tools/tools.py
CHANGED
@@ -1,8 +1,9 @@
+import io
 import logging
 import tempfile
 from abc import ABC
 from pathlib import Path
-from typing import Any, Dict, List, Tuple, Union, cast
+from typing import Any, Dict, List, Tuple, Type, Union, cast
 
 import numpy as np
 import requests
@@ -11,13 +12,14 @@ from PIL.Image import Image as ImageType
 
 from vision_agent.image_utils import (
     convert_to_b64,
+    denormalize_bbox,
     get_image_size,
-    rle_decode,
     normalize_bbox,
-
+    rle_decode,
 )
 from vision_agent.tools.video import extract_frames_from_video
 from vision_agent.type_defs import LandingaiAPIKey
+from vision_agent.lmm import OpenAILMM
 
 _LOGGER = logging.getLogger(__name__)
 _LND_API_KEY = LandingaiAPIKey().api_key
@@ -29,6 +31,9 @@ class Tool(ABC):
     description: str
     usage: Dict
 
+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        raise NotImplementedError
+
 
 class NoOp(Tool):
     name = "noop_"
@@ -108,8 +113,7 @@ class CLIP(Tool):
 
 
 class ImageCaption(Tool):
-    r"""ImageCaption is a tool that can caption an image based on its contents
-    or tags.
+    r"""ImageCaption is a tool that can caption an image based on its contents or tags.
 
     Example
     -------
@@ -120,26 +124,20 @@ class ImageCaption(Tool):
     """
 
     name = "image_caption_"
-    description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image"
+    description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image."
     usage = {
         "required_parameters": [
             {"name": "image", "type": "str"},
         ],
         "examples": [
             {
-                "scenario": "Can you describe this image
+                "scenario": "Can you describe this image? Image name: cat.jpg",
                 "parameters": {"image": "cat.jpg"},
             },
             {
-                "scenario": "Can you caption this image with their main contents
+                "scenario": "Can you caption this image with their main contents? Image name: cat_dog.jpg",
                 "parameters": {"image": "cat_dog.jpg"},
             },
-            {
-                "scenario": "Can you build me a image captioning tool ? Image name: shirts.jpg",
-                "parameters": {
-                    "image": "shirts.jpg",
-                },
-            },
         ],
     }
 
@@ -487,15 +485,15 @@ class ZeroShotCounting(Tool):
         ],
         "examples": [
             {
-                "scenario": "Can you count the lids in the image
+                "scenario": "Can you count the lids in the image? Image name: lids.jpg",
                 "parameters": {"image": "lids.jpg"},
             },
             {
-                "scenario": "Can you count the total number of objects in this image
+                "scenario": "Can you count the total number of objects in this image? Image name: tray.jpg",
                 "parameters": {"image": "tray.jpg"},
             },
             {
-                "scenario": "Can you build me an object counting tool
+                "scenario": "Can you build me an object counting tool? Image name: shirts.jpg",
                 "parameters": {
                     "image": "shirts.jpg",
                 },
@@ -505,7 +503,7 @@ class ZeroShotCounting(Tool):
 
     # TODO: Add support for input multiple images, which aligns with the output type.
     def __call__(self, image: Union[str, ImageType]) -> Dict:
-        """Invoke the
+        """Invoke the Zero shot counting model.
 
         Parameters:
             image: the input image.
@@ -569,7 +567,7 @@ class VisualPromptCounting(Tool):
 
     # TODO: Add support for input multiple images, which aligns with the output type.
     def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict:
-        """Invoke the
+        """Invoke the few shot counting model.
 
         Parameters:
             image: the input image.
@@ -590,6 +588,144 @@ class VisualPromptCounting(Tool):
         return _send_inference_request(data, "tools")
 
 
+class VisualQuestionAnswering(Tool):
+    r"""VisualQuestionAnswering is a tool that can explain contents of an image and answer questions about the same
+
+    Example
+    -------
+        >>> import vision_agent as va
+        >>> vqa_tool = va.tools.VisualQuestionAnswering()
+        >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail")
+        {'text': "The image contains a cat sitting on a table with a bowl of milk."}
+    """
+
+    name = "visual_question_answering_"
+    description = "'visual_question_answering_' is a tool that can describe the contents of the image and it can also answer basic questions about the image."
+
+    usage = {
+        "required_parameters": [
+            {"name": "image", "type": "str"},
+            {"name": "prompt", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Describe this image in detail. Image name: cat.jpg",
+                "parameters": {
+                    "image": "cats.jpg",
+                    "prompt": "Describe this image in detail",
+                },
+            },
+            {
+                "scenario": "Can you help me with this street sign in this image ? What does it say ? Image name: sign.jpg",
+                "parameters": {
+                    "image": "sign.jpg",
+                    "prompt": "Can you help me with this street sign ? What does it say ?",
+                },
+            },
+            {
+                "scenario": "Describe the weather in the image for me ? Image name: weather.jpg",
+                "parameters": {
+                    "image": "weather.jpg",
+                    "prompt": "Describe the weather in the image for me ",
+                },
+            },
+            {
+                "scenario": "Which 2 are the least frequent bins in this histogram ? Image name: chart.jpg",
+                "parameters": {
+                    "image": "chart.jpg",
+                    "prompt": "Which 2 are the least frequent bins in this histogram",
+                },
+            },
+        ],
+    }
+
+    def __call__(self, image: str, prompt: str) -> Dict:
+        """Invoke the visual question answering model.
+
+        Parameters:
+            image: the input image.
+
+        Returns:
+            A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}
+        """
+
+        gpt = OpenAILMM()
+        return {"text": gpt(input=prompt, images=[image])}
+
+
+class ImageQuestionAnswering(Tool):
+    r"""ImageQuestionAnswering is a tool that can explain contents of an image and answer questions about the same
+    It is same as VisualQuestionAnswering but this tool is not used by agents. It is used when user requests a tool for VQA using generate_image_qa_tool function.
+    It is also useful if the user wants the data to be not exposed to OpenAI endpoints
+
+    Example
+    -------
+        >>> import vision_agent as va
+        >>> vqa_tool = va.tools.ImageQuestionAnswering()
+        >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail")
+        {'text': "The image contains a cat sitting on a table with a bowl of milk."}
+    """
+
+    name = "image_question_answering_"
+    description = "'image_question_answering_' is a tool that can describe the contents of the image and it can also answer basic questions about the image."
+
+    usage = {
+        "required_parameters": [
+            {"name": "image", "type": "str"},
+            {"name": "prompt", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Describe this image in detail. Image name: cat.jpg",
+                "parameters": {
+                    "image": "cats.jpg",
+                    "prompt": "Describe this image in detail",
+                },
+            },
+            {
+                "scenario": "Can you help me with this street sign in this image ? What does it say ? Image name: sign.jpg",
+                "parameters": {
+                    "image": "sign.jpg",
+                    "prompt": "Can you help me with this street sign ? What does it say ?",
+                },
+            },
+            {
+                "scenario": "Describe the weather in the image for me ? Image name: weather.jpg",
+                "parameters": {
+                    "image": "weather.jpg",
+                    "prompt": "Describe the weather in the image for me ",
+                },
+            },
+            {
+                "scenario": "Can you generate an image question answering tool ? Image name: chart.jpg, prompt: Which 2 are the least frequent bins in this histogram",
+                "parameters": {
+                    "image": "chart.jpg",
+                    "prompt": "Which 2 are the least frequent bins in this histogram",
+                },
+            },
+        ],
+    }
+
+    def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict:
+        """Invoke the visual question answering model.
+
+        Parameters:
+            image: the input image.
+
+        Returns:
+            A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}
+        """
+
+        image_b64 = convert_to_b64(image)
+        data = {
+            "image": image_b64,
+            "prompt": prompt,
+            "tool": "image_question_answering",
+        }
+
+        return _send_inference_request(data, "tools")
+
+
 class Crop(Tool):
     r"""Crop crops an image given a bounding box and returns a file name of the cropped image."""
 
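The two new tools expose the same interface but differ in where inference runs: VisualQuestionAnswering routes the prompt and image through OpenAILMM (so the data goes to OpenAI endpoints), while ImageQuestionAnswering base64-encodes the image and posts it to the hosted "tools" inference endpoint. A hedged usage sketch (assumes the relevant API keys are configured; image1.jpg is a made-up file name):

```python
from vision_agent.tools import ImageQuestionAnswering, VisualQuestionAnswering

# Uses OpenAILMM under the hood, so the image is sent to OpenAI endpoints.
vqa = VisualQuestionAnswering()
print(vqa(image="image1.jpg", prompt="describe this image in detail"))

# Sends the image to the hosted "tools" endpoint instead; useful when the
# data should not be exposed to OpenAI.
iqa = ImageQuestionAnswering()
print(iqa(image="image1.jpg", prompt="describe this image in detail"))

# Both return a dictionary like {"text": "The image contains a cat ..."}
```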
@@ -865,6 +1001,57 @@ class ExtractFrames(Tool):
         return result
 
 
+class OCR(Tool):
+    name = "ocr_"
+    description = "'ocr_' extracts text from an image."
+    usage = {
+        "required_parameters": [
+            {"name": "image", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Can you extract the text from this image? Image name: image.png",
+                "parameters": {"image": "image.png"},
+            },
+        ],
+    }
+    _API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV"
+    _URL = "https://app.landing.ai/ocr/v1/detect-text"
+
+    def __call__(self, image: str) -> dict:
+        pil_image = Image.open(image).convert("RGB")
+        image_size = pil_image.size[::-1]
+        image_buffer = io.BytesIO()
+        pil_image.save(image_buffer, format="PNG")
+        buffer_bytes = image_buffer.getvalue()
+        image_buffer.close()
+
+        res = requests.post(
+            self._URL,
+            files={"images": buffer_bytes},
+            data={"language": "en"},
+            headers={"contentType": "multipart/form-data", "apikey": self._API_KEY},
+        )
+        if res.status_code != 200:
+            _LOGGER.error(f"Request failed: {res.text}")
+            raise ValueError(f"Request failed: {res.text}")
+
+        data = res.json()
+        output: Dict[str, List] = {"labels": [], "bboxes": [], "scores": []}
+        for det in data[0]:
+            output["labels"].append(det["text"])
+            box = [
+                det["location"][0]["x"],
+                det["location"][0]["y"],
+                det["location"][2]["x"],
+                det["location"][2]["y"],
+            ]
+            box = normalize_bbox(box, image_size)
+            output["bboxes"].append(box)
+            output["scores"].append(round(det["score"], 2))
+        return output
+
+
 class Calculator(Tool):
     r"""Calculator is a tool that can perform basic arithmetic operations."""
 
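Based on the __call__ implementation above, the OCR tool returns one entry per detected text region, with bounding boxes normalized to the image size. A hedged usage sketch (receipt.png is a made-up file name):

```python
from vision_agent.tools import OCR

ocr = OCR()
result = ocr(image="receipt.png")

# result has the shape {"labels": [...], "bboxes": [...], "scores": [...]},
# where each bbox is [x1, y1, x2, y2] normalized by normalize_bbox.
for text, box, score in zip(result["labels"], result["bboxes"], result["scores"]):
    print(f"{score:.2f} {box} {text}")
```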
@@ -896,11 +1083,11 @@ TOOLS = {
         [
             NoOp,
             CLIP,
-            ImageCaption,
             GroundingDINO,
             AgentGroundingSAM,
             ZeroShotCounting,
             VisualPromptCounting,
+            VisualQuestionAnswering,
             AgentDINOv,
             ExtractFrames,
             Crop,
@@ -910,6 +1097,7 @@ TOOLS = {
             SegIoU,
             BboxContains,
             BoxDistance,
+            OCR,
             Calculator,
         ]
     )
@@ -917,6 +1105,31 @@ TOOLS = {
 }
 
 
+def register_tool(tool: Type[Tool]) -> Type[Tool]:
+    r"""Add a tool to the list of available tools.
+
+    Parameters:
+        tool: The tool to add.
+    """
+
+    if (
+        not hasattr(tool, "name")
+        or not hasattr(tool, "description")
+        or not hasattr(tool, "usage")
+    ):
+        raise ValueError(
+            "The tool must have 'name', 'description' and 'usage' attributes."
+        )
+
+    TOOLS[len(TOOLS)] = {
+        "name": tool.name,
+        "description": tool.description,
+        "usage": tool.usage,
+        "class": tool,
+    }
+    return tool
+
+
 def _send_inference_request(
     payload: Dict[str, Any], endpoint_name: str
 ) -> Dict[str, Any]:
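register_tool appends the decorated class to the TOOLS registry after checking for the name, description and usage attributes, so custom tools become selectable by the agent. A hedged sketch with a made-up tool (the class below is illustrative, not part of the package):

```python
from PIL import Image

from vision_agent.tools import Tool, register_tool


@register_tool
class ImageSize(Tool):
    name = "image_size_"
    description = "'image_size_' returns the width and height of an image in pixels."
    usage = {
        "required_parameters": [{"name": "image", "type": "str"}],
        "examples": [
            {
                "scenario": "How large is this image? Image name: cat.jpg",
                "parameters": {"image": "cat.jpg"},
            },
        ],
    }

    def __call__(self, image: str) -> dict:
        width, height = Image.open(image).size
        return {"width": width, "height": height}


# The decorator added an entry to TOOLS under the next free index, so the
# agent can now pick this tool based on its description and usage examples.
```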
{vision_agent-0.2.2.dist-info → vision_agent-0.2.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.2
+Version: 0.2.4
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -58,7 +58,7 @@ pip install vision-agent
 ```
 
 Ensure you have an OpenAI API key and set it as an environment variable (if you are
-using Azure OpenAI please see the
+using Azure OpenAI please see the Azure setup section):
 
 ```bash
 export OPENAI_API_KEY="your-api-key"
@@ -123,26 +123,55 @@ you. For example:
 }]
 ```
 
+#### Custom Tools
+You can also add your own custom tools for your vision agent to use:
+
+```python
+>>> from vision_agent.tools import Tool, register_tool
+>>> @register_tool
+>>> class NumItems(Tool):
+>>>     name = "num_items_"
+>>>     description = "Returns the number of items in a list."
+>>>     usage = {
+>>>         "required_parameters": [{"name": "prompt", "type": "list"}],
+>>>         "examples": [
+>>>             {
+>>>                 "scenario": "How many items are in this list? ['a', 'b', 'c']",
+>>>                 "parameters": {"prompt": "['a', 'b', 'c']"},
+>>>             }
+>>>         ],
+>>>     }
+>>>     def __call__(self, prompt: list[str]) -> int:
+>>>         return len(prompt)
+```
+This will register it with the list of tools Vision Agent has access to. It will be able
+to pick it based on the tool description and use it based on the usage provided.
+
+#### Tool List
 | Tool | Description |
 | --- | --- |
 | CLIP | CLIP is a tool that can classify or tag any image given a set of input classes or tags. |
+| ImageCaption| ImageCaption is a tool that can generate a caption for an image. |
 | GroundingDINO | GroundingDINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. |
 | GroundingSAM | GroundingSAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. |
-|
+| DINOv | DINOv is a tool that can detect arbitrary objects with using a referring mask. |
+| ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. |
 | BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. |
 | SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. |
 | BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
 | SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
-|
+| BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
+| BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is good for check if one box is contained within another box. |
 | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image |
 | VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and visual prompt |
+| OCR | OCR returns the text detected in an image along with the location. |
 
 
 It also has a basic set of calculate tools such as add, subtract, multiply and divide.
 
-###
+### Azure Setup
 If you want to use Azure OpenAI models, you can set the environment variable:
 
 ```bash
{vision_agent-0.2.2.dist-info → vision_agent-0.2.4.dist-info}/RECORD
CHANGED
@@ -5,21 +5,21 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
 vision_agent/agent/easytool_prompts.py,sha256=zdQQw6WpXOmvwOMtlBlNKY5a3WNlr65dbUvMIGiqdeo,4526
 vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=
+vision_agent/agent/vision_agent.py,sha256=Ehb97lyPs7lYM9ipx07yxm6c2kUqz2OnjGQsv-nMwKA,24849
 vision_agent/agent/vision_agent_prompts.py,sha256=W3Z72FpUt71UIJSkjAcgtQqxeMqkYuATqHAN5fYY26c,7342
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/image_utils.py,sha256=YvP5KE9NrWdgJKuHW2NR1glzfObkxtcXBknpmj3Gsbs,7554
 vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
-vision_agent/llm/llm.py,sha256=
+vision_agent/llm/llm.py,sha256=1BkrSVBWEClyqLc0Rmyw4heLhi_ZVm6JO7-i1wd1ziw,5383
 vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
-vision_agent/lmm/lmm.py,sha256=
-vision_agent/tools/__init__.py,sha256=
+vision_agent/lmm/lmm.py,sha256=sECjGMaGrv1QHq7OiFr-9LoBM5uRLjAqd0Ypp-zyFlw,10552
+vision_agent/tools/__init__.py,sha256=X6yJhWa8iKkQm4Mgf1KcV0_o39-Nrg3E56QAB5gWCO0,413
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=hYgRTHMCBwjT0kkT2SY5MN0FK89vuuecu-x1VqRlGbU,42779
 vision_agent/tools/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
 vision_agent/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.4.dist-info/METADATA,sha256=2T1YLGMh2-n8F0gGf1P2BDhgzxmtmAiylpfW3E3Q4_c,7697
+vision_agent-0.2.4.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.4.dist-info/RECORD,,
File without changes
|
File without changes
|