vision-agent 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +27 -12
- vision_agent/llm/llm.py +5 -0
- vision_agent/lmm/lmm.py +14 -8
- vision_agent/tools/__init__.py +6 -2
- vision_agent/tools/tools.py +226 -6
- {vision_agent-0.2.3.dist-info → vision_agent-0.2.5.dist-info}/METADATA +29 -3
- {vision_agent-0.2.3.dist-info → vision_agent-0.2.5.dist-info}/RECORD +9 -9
- {vision_agent-0.2.3.dist-info → vision_agent-0.2.5.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.3.dist-info → vision_agent-0.2.5.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent.py CHANGED

@@ -366,6 +366,20 @@ def _handle_viz_tools(
     return image_to_data
 
 
+def sample_n_evenly_spaced(lst: Sequence, n: int) -> Sequence:
+    if n <= 0:
+        return []
+    elif len(lst) == 0:
+        return []
+    elif n == 1:
+        return [lst[0]]
+    elif n >= len(lst):
+        return lst
+
+    spacing = (len(lst) - 1) / (n - 1)
+    return [lst[round(spacing * i)] for i in range(n)]
+
+
 def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]:
     image_to_data: Dict[str, Dict] = {}
     for tool_result in all_tool_results:
@@ -377,6 +391,7 @@ def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]
             "dinov_",
            "zero_shot_counting_",
            "visual_prompt_counting_",
+            "ocr_",
         ]:
             continue
 
@@ -523,20 +538,20 @@ class VisionAgent(Agent):
         if image:
             question += f" Image name: {image}"
         if reference_data:
-
-            "image
-
-
-
-
-
-            visual_prompt_data = (
-                f"Reference mask: {reference_data['mask']}"
+            question += (
+                f" Reference image: {reference_data['image']}"
+                if "image" in reference_data
+                else ""
+            )
+            question += (
+                f" Reference mask: {reference_data['mask']}"
                 if "mask" in reference_data
-                else
+                else ""
             )
             question += (
-                f" Reference
+                f" Reference bbox: {reference_data['bbox']}"
+                if "bbox" in reference_data
+                else ""
             )
 
         reflections = ""
@@ -583,7 +598,7 @@ class VisionAgent(Agent):
             visualized_output = visualize_result(all_tool_results)
             all_tool_results.append({"visualized_output": visualized_output})
             if len(visualized_output) > 0:
-                reflection_images = visualized_output
+                reflection_images = sample_n_evenly_spaced(visualized_output, 3)
             elif image is not None:
                 reflection_images = [image]
             else:
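The new `sample_n_evenly_spaced` helper is used in the last hunk to cap the number of reflection images at three. A quick check of its behavior (the frame names are illustrative; assumes vision-agent 0.2.5 is installed):

```python
from vision_agent.agent.vision_agent import sample_n_evenly_spaced

frames = [f"frame_{i}.png" for i in range(7)]  # illustrative file names
print(sample_n_evenly_spaced(frames, 3))
# ['frame_0.png', 'frame_3.png', 'frame_6.png'] -> first, middle and last element
print(sample_n_evenly_spaced(frames, 10))
# n >= len(lst) returns the whole list unchanged
```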
vision_agent/llm/llm.py CHANGED

@@ -131,6 +131,11 @@ class OpenAILLM(LLM):
     def generate_zero_shot_counter(self, question: str) -> Callable:
         return lambda x: ZeroShotCounting()(**{"image": x})
 
+    def generate_image_qa_tool(self, question: str) -> Callable:
+        from vision_agent.tools import ImageQuestionAnswering
+
+        return lambda x: ImageQuestionAnswering()(**{"prompt": question, "image": x})
+
 
 class AzureOpenAILLM(OpenAILLM):
     def __init__(
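This adds an image-QA counterpart to the existing `generate_*` factories. A usage sketch, assuming the default `OpenAILLM()` constructor works with just `OPENAI_API_KEY` set, as in earlier releases; the question and image path are illustrative:

```python
from vision_agent.llm.llm import OpenAILLM

llm = OpenAILLM()
# qa is a callable that runs ImageQuestionAnswering with the question baked in.
qa = llm.generate_image_qa_tool("What color is the car?")
print(qa("car.jpg"))  # e.g. {'text': 'The car is red.'}
```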
vision_agent/lmm/lmm.py CHANGED

@@ -9,14 +9,7 @@ from typing import Any, Callable, Dict, List, Optional, Union, cast
 import requests
 from openai import AzureOpenAI, OpenAI
 
-from vision_agent.tools import (
-    CHOOSE_PARAMS,
-    CLIP,
-    SYSTEM_PROMPT,
-    GroundingDINO,
-    GroundingSAM,
-    ZeroShotCounting,
-)
+from vision_agent.tools import CHOOSE_PARAMS, SYSTEM_PROMPT
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -205,6 +198,8 @@ class OpenAILMM(LMM):
         return cast(str, response.choices[0].message.content)
 
     def generate_classifier(self, question: str) -> Callable:
+        from vision_agent.tools import CLIP
+
         api_doc = CLIP.description + "\n" + str(CLIP.usage)
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
@@ -228,6 +223,8 @@ class OpenAILMM(LMM):
         return lambda x: CLIP()(**{"prompt": params["prompt"], "image": x})
 
     def generate_detector(self, question: str) -> Callable:
+        from vision_agent.tools import GroundingDINO
+
         api_doc = GroundingDINO.description + "\n" + str(GroundingDINO.usage)
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
@@ -251,6 +248,8 @@ class OpenAILMM(LMM):
         return lambda x: GroundingDINO()(**{"prompt": params["prompt"], "image": x})
 
     def generate_segmentor(self, question: str) -> Callable:
+        from vision_agent.tools import GroundingSAM
+
         api_doc = GroundingSAM.description + "\n" + str(GroundingSAM.usage)
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
@@ -274,8 +273,15 @@ class OpenAILMM(LMM):
         return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x})
 
     def generate_zero_shot_counter(self, question: str) -> Callable:
+        from vision_agent.tools import ZeroShotCounting
+
         return lambda x: ZeroShotCounting()(**{"image": x})
 
+    def generate_image_qa_tool(self, question: str) -> Callable:
+        from vision_agent.tools import ImageQuestionAnswering
+
+        return lambda x: ImageQuestionAnswering()(**{"prompt": question, "image": x})
+
 
 class AzureOpenAILMM(OpenAILMM):
     def __init__(
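The tool imports move from module level into the individual `generate_*` methods, presumably because tools.py now imports `OpenAILMM` from `vision_agent.lmm` (see the tools.py diff below) and a module-level import in both directions would create a cycle. A minimal sketch of the deferred-import pattern, with hypothetical module names:

```python
# a.py (stands in for vision_agent/tools/tools.py)
from b import Model  # safe: b no longer imports a at module load time


class QATool:
    def __call__(self, prompt: str) -> dict:
        return {"text": Model()(prompt)}


# b.py (stands in for vision_agent/lmm/lmm.py)
class Model:
    def __call__(self, prompt: str) -> str:
        return f"answer to: {prompt}"

    def generate_qa_tool(self, question: str):
        # Deferred import: resolved only when the method is called, after both
        # modules are fully loaded, so a.py's module-level "from b import Model"
        # no longer forms an import cycle.
        from a import QATool

        return lambda image: QATool()(question)
```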
vision_agent/tools/__init__.py CHANGED

@@ -1,6 +1,7 @@
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 from .tools import ( # Counter,
     CLIP,
+    OCR,
     TOOLS,
     BboxArea,
     BboxIoU,
@@ -11,9 +12,12 @@ from .tools import ( # Counter,
     GroundingDINO,
     GroundingSAM,
     ImageCaption,
-
-    VisualPromptCounting,
+    ImageQuestionAnswering,
     SegArea,
     SegIoU,
     Tool,
+    VisualPromptCounting,
+    VisualQuestionAnswering,
+    ZeroShotCounting,
+    register_tool,
 )
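A quick smoke test of the new exports (a sketch; assumes vision-agent 0.2.5 is installed):

```python
from vision_agent.tools import (
    OCR,
    ImageQuestionAnswering,
    VisualQuestionAnswering,
    register_tool,
)

print(OCR.name, VisualQuestionAnswering.name)  # ocr_ visual_question_answering_
```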
vision_agent/tools/tools.py CHANGED

@@ -1,8 +1,9 @@
+import io
 import logging
 import tempfile
 from abc import ABC
 from pathlib import Path
-from typing import Any, Dict, List, Tuple, Union, cast
+from typing import Any, Dict, List, Tuple, Type, Union, cast
 
 import numpy as np
 import requests
@@ -11,11 +12,12 @@ from PIL.Image import Image as ImageType
 
 from vision_agent.image_utils import (
     convert_to_b64,
+    denormalize_bbox,
     get_image_size,
-    rle_decode,
     normalize_bbox,
-
+    rle_decode,
 )
+from vision_agent.lmm import OpenAILMM
 from vision_agent.tools.video import extract_frames_from_video
 from vision_agent.type_defs import LandingaiAPIKey
 
@@ -29,6 +31,9 @@ class Tool(ABC):
     description: str
     usage: Dict
 
+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        raise NotImplementedError
+
 
 class NoOp(Tool):
     name = "noop_"
@@ -498,7 +503,7 @@ class ZeroShotCounting(Tool):
 
     # TODO: Add support for input multiple images, which aligns with the output type.
     def __call__(self, image: Union[str, ImageType]) -> Dict:
-        """Invoke the
+        """Invoke the Zero shot counting model.
 
         Parameters:
             image: the input image.
@@ -562,7 +567,7 @@ class VisualPromptCounting(Tool):
 
     # TODO: Add support for input multiple images, which aligns with the output type.
    def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict:
-        """Invoke the
+        """Invoke the few shot counting model.
 
         Parameters:
             image: the input image.
@@ -583,6 +588,144 @@ class VisualPromptCounting(Tool):
         return _send_inference_request(data, "tools")
 
 
+class VisualQuestionAnswering(Tool):
+    r"""VisualQuestionAnswering is a tool that can explain contents of an image and answer questions about the same
+
+    Example
+    -------
+        >>> import vision_agent as va
+        >>> vqa_tool = va.tools.VisualQuestionAnswering()
+        >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail")
+        {'text': "The image contains a cat sitting on a table with a bowl of milk."}
+    """
+
+    name = "visual_question_answering_"
+    description = "'visual_question_answering_' is a tool that can describe the contents of the image and it can also answer basic questions about the image."
+
+    usage = {
+        "required_parameters": [
+            {"name": "image", "type": "str"},
+            {"name": "prompt", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Describe this image in detail. Image name: cat.jpg",
+                "parameters": {
+                    "image": "cats.jpg",
+                    "prompt": "Describe this image in detail",
+                },
+            },
+            {
+                "scenario": "Can you help me with this street sign in this image ? What does it say ? Image name: sign.jpg",
+                "parameters": {
+                    "image": "sign.jpg",
+                    "prompt": "Can you help me with this street sign ? What does it say ?",
+                },
+            },
+            {
+                "scenario": "Describe the weather in the image for me ? Image name: weather.jpg",
+                "parameters": {
+                    "image": "weather.jpg",
+                    "prompt": "Describe the weather in the image for me ",
+                },
+            },
+            {
+                "scenario": "Which 2 are the least frequent bins in this histogram ? Image name: chart.jpg",
+                "parameters": {
+                    "image": "chart.jpg",
+                    "prompt": "Which 2 are the least frequent bins in this histogram",
+                },
+            },
+        ],
+    }
+
+    def __call__(self, image: str, prompt: str) -> Dict:
+        """Invoke the visual question answering model.
+
+        Parameters:
+            image: the input image.
+
+        Returns:
+            A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}
+        """
+
+        gpt = OpenAILMM()
+        return {"text": gpt(input=prompt, images=[image])}
+
+
+class ImageQuestionAnswering(Tool):
+    r"""ImageQuestionAnswering is a tool that can explain contents of an image and answer questions about the same
+    It is same as VisualQuestionAnswering but this tool is not used by agents. It is used when user requests a tool for VQA using generate_image_qa_tool function.
+    It is also useful if the user wants the data to be not exposed to OpenAI endpoints
+
+    Example
+    -------
+        >>> import vision_agent as va
+        >>> vqa_tool = va.tools.ImageQuestionAnswering()
+        >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail")
+        {'text': "The image contains a cat sitting on a table with a bowl of milk."}
+    """
+
+    name = "image_question_answering_"
+    description = "'image_question_answering_' is a tool that can describe the contents of the image and it can also answer basic questions about the image."
+
+    usage = {
+        "required_parameters": [
+            {"name": "image", "type": "str"},
+            {"name": "prompt", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Describe this image in detail. Image name: cat.jpg",
+                "parameters": {
+                    "image": "cats.jpg",
+                    "prompt": "Describe this image in detail",
+                },
+            },
+            {
+                "scenario": "Can you help me with this street sign in this image ? What does it say ? Image name: sign.jpg",
+                "parameters": {
+                    "image": "sign.jpg",
+                    "prompt": "Can you help me with this street sign ? What does it say ?",
+                },
+            },
+            {
+                "scenario": "Describe the weather in the image for me ? Image name: weather.jpg",
+                "parameters": {
+                    "image": "weather.jpg",
+                    "prompt": "Describe the weather in the image for me ",
+                },
+            },
+            {
+                "scenario": "Can you generate an image question answering tool ? Image name: chart.jpg, prompt: Which 2 are the least frequent bins in this histogram",
+                "parameters": {
+                    "image": "chart.jpg",
+                    "prompt": "Which 2 are the least frequent bins in this histogram",
+                },
+            },
+        ],
+    }
+
+    def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict:
+        """Invoke the visual question answering model.
+
+        Parameters:
+            image: the input image.
+
+        Returns:
+            A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}
+        """
+
+        image_b64 = convert_to_b64(image)
+        data = {
+            "image": image_b64,
+            "prompt": prompt,
+            "tool": "image_question_answering",
+        }
+
+        return _send_inference_request(data, "tools")
+
+
 class Crop(Tool):
     r"""Crop crops an image given a bounding box and returns a file name of the cropped image."""
 
@@ -858,6 +1001,57 @@ class ExtractFrames(Tool):
         return result
 
 
+class OCR(Tool):
+    name = "ocr_"
+    description = "'ocr_' extracts text from an image."
+    usage = {
+        "required_parameters": [
+            {"name": "image", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Can you extract the text from this image? Image name: image.png",
+                "parameters": {"image": "image.png"},
+            },
+        ],
+    }
+    _API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV"
+    _URL = "https://app.landing.ai/ocr/v1/detect-text"
+
+    def __call__(self, image: str) -> dict:
+        pil_image = Image.open(image).convert("RGB")
+        image_size = pil_image.size[::-1]
+        image_buffer = io.BytesIO()
+        pil_image.save(image_buffer, format="PNG")
+        buffer_bytes = image_buffer.getvalue()
+        image_buffer.close()
+
+        res = requests.post(
+            self._URL,
+            files={"images": buffer_bytes},
+            data={"language": "en"},
+            headers={"contentType": "multipart/form-data", "apikey": self._API_KEY},
+        )
+        if res.status_code != 200:
+            _LOGGER.error(f"Request failed: {res.text}")
+            raise ValueError(f"Request failed: {res.text}")
+
+        data = res.json()
+        output: Dict[str, List] = {"labels": [], "bboxes": [], "scores": []}
+        for det in data[0]:
+            output["labels"].append(det["text"])
+            box = [
+                det["location"][0]["x"],
+                det["location"][0]["y"],
+                det["location"][2]["x"],
+                det["location"][2]["y"],
+            ]
+            box = normalize_bbox(box, image_size)
+            output["bboxes"].append(box)
+            output["scores"].append(round(det["score"], 2))
+        return output
+
+
 class Calculator(Tool):
     r"""Calculator is a tool that can perform basic arithmetic operations."""
 
@@ -889,11 +1083,11 @@ TOOLS = {
     [
         NoOp,
         CLIP,
-        ImageCaption,
         GroundingDINO,
         AgentGroundingSAM,
         ZeroShotCounting,
         VisualPromptCounting,
+        VisualQuestionAnswering,
         AgentDINOv,
         ExtractFrames,
         Crop,
@@ -903,6 +1097,7 @@ TOOLS = {
         SegIoU,
         BboxContains,
         BoxDistance,
+        OCR,
         Calculator,
     ]
 )
@@ -910,6 +1105,31 @@ TOOLS = {
 }
 
 
+def register_tool(tool: Type[Tool]) -> Type[Tool]:
+    r"""Add a tool to the list of available tools.
+
+    Parameters:
+        tool: The tool to add.
+    """
+
+    if (
+        not hasattr(tool, "name")
+        or not hasattr(tool, "description")
+        or not hasattr(tool, "usage")
+    ):
+        raise ValueError(
+            "The tool must have 'name', 'description' and 'usage' attributes."
+        )
+
+    TOOLS[len(TOOLS)] = {
+        "name": tool.name,
+        "description": tool.description,
+        "usage": tool.usage,
+        "class": tool,
+    }
+    return tool
+
+
 def _send_inference_request(
     payload: Dict[str, Any], endpoint_name: str
 ) -> Dict[str, Any]:
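The two user-facing additions above are the OCR tool and the VisualQuestionAnswering tool. A short usage sketch; the file names, prompt, and sample output are illustrative, and the OCR call hits the Landing AI endpoint while VisualQuestionAnswering wraps OpenAILMM and therefore needs OPENAI_API_KEY:

```python
import vision_agent as va

# OCR: posts the image to the text-detection endpoint and returns detected
# strings with normalized bounding boxes and confidence scores.
ocr_tool = va.tools.OCR()
print(ocr_tool(image="receipt.png"))
# e.g. {'labels': ['TOTAL', '$12.40'], 'bboxes': [[...], [...]], 'scores': [0.99, 0.98]}

# VisualQuestionAnswering: free-form questions about the same image.
vqa_tool = va.tools.VisualQuestionAnswering()
print(vqa_tool(image="receipt.png", prompt="What is the total amount?"))
```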
{vision_agent-0.2.3.dist-info → vision_agent-0.2.5.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.3
+Version: 0.2.5
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -58,7 +58,7 @@ pip install vision-agent
 ```
 
 Ensure you have an OpenAI API key and set it as an environment variable (if you are
-using Azure OpenAI please see the
+using Azure OpenAI please see the Azure setup section):
 
 ```bash
 export OPENAI_API_KEY="your-api-key"
@@ -123,6 +123,31 @@ you. For example:
 }]
 ```
 
+#### Custom Tools
+You can also add your own custom tools for your vision agent to use:
+
+```python
+>>> from vision_agent.tools import Tool, register_tool
+>>> @register_tool
+>>> class NumItems(Tool):
+>>>     name = "num_items_"
+>>>     description = "Returns the number of items in a list."
+>>>     usage = {
+>>>         "required_parameters": [{"name": "prompt", "type": "list"}],
+>>>         "examples": [
+>>>             {
+>>>                 "scenario": "How many items are in this list? ['a', 'b', 'c']",
+>>>                 "parameters": {"prompt": "['a', 'b', 'c']"},
+>>>             }
+>>>         ],
+>>>     }
+>>>     def __call__(self, prompt: list[str]) -> int:
+>>>         return len(prompt)
+```
+This will register it with the list of tools Vision Agent has access to. It will be able
+to pick it based on the tool description and use it based on the usage provided.
+
+#### Tool List
 | Tool | Description |
 | --- | --- |
 | CLIP | CLIP is a tool that can classify or tag any image given a set of input classes or tags. |
@@ -141,11 +166,12 @@ you. For example:
 | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image |
 | VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and visual prompt |
+| OCR | OCR returns the text detected in an image along with the location. |
 
 
 It also has a basic set of calculate tools such as add, subtract, multiply and divide.
 
-###
+### Azure Setup
 If you want to use Azure OpenAI models, you can set the environment variable:
 
 ```bash
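The README snippet added above is written with REPL-style `>>>` prompts. Stripped of those, the same custom tool reads as a plain module-level definition (a sketch copied from the hunk; `NumItems` is the README's own illustrative tool):

```python
from vision_agent.tools import Tool, register_tool


@register_tool
class NumItems(Tool):
    name = "num_items_"
    description = "Returns the number of items in a list."
    usage = {
        "required_parameters": [{"name": "prompt", "type": "list"}],
        "examples": [
            {
                "scenario": "How many items are in this list? ['a', 'b', 'c']",
                "parameters": {"prompt": "['a', 'b', 'c']"},
            }
        ],
    }

    def __call__(self, prompt: list[str]) -> int:
        return len(prompt)
```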
{vision_agent-0.2.3.dist-info → vision_agent-0.2.5.dist-info}/RECORD CHANGED

@@ -5,21 +5,21 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
 vision_agent/agent/easytool_prompts.py,sha256=zdQQw6WpXOmvwOMtlBlNKY5a3WNlr65dbUvMIGiqdeo,4526
 vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=
+vision_agent/agent/vision_agent.py,sha256=SFdw6OBqWj0cr-YthFMM_x-Urg86CggazYQG4wy0n-U,25195
 vision_agent/agent/vision_agent_prompts.py,sha256=W3Z72FpUt71UIJSkjAcgtQqxeMqkYuATqHAN5fYY26c,7342
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/image_utils.py,sha256=YvP5KE9NrWdgJKuHW2NR1glzfObkxtcXBknpmj3Gsbs,7554
 vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
-vision_agent/llm/llm.py,sha256=
+vision_agent/llm/llm.py,sha256=1BkrSVBWEClyqLc0Rmyw4heLhi_ZVm6JO7-i1wd1ziw,5383
 vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
-vision_agent/lmm/lmm.py,sha256=
-vision_agent/tools/__init__.py,sha256=
+vision_agent/lmm/lmm.py,sha256=gK90vMxh0OcGSuIZQikBkDXm4pfkdFk1R2y7rtWDl84,10539
+vision_agent/tools/__init__.py,sha256=HfUr0JQUwk0Kyieen93df9lMbbdpVf9Q6CcVFmKv_q4,413
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=GvRDLeMVS9C7z56hlSpThGoV0r_x5pKSFw-g4JW_qnw,42779
 vision_agent/tools/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
 vision_agent/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.5.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.5.dist-info/METADATA,sha256=zSTYpM893hERFpO2j7-YdRmRPKeGI6-qU_wkq5MitFY,7697
+vision_agent-0.2.5.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.5.dist-info/RECORD,,
{vision_agent-0.2.3.dist-info → vision_agent-0.2.5.dist-info}/LICENSE (file without changes)
{vision_agent-0.2.3.dist-info → vision_agent-0.2.5.dist-info}/WHEEL (file without changes)