vision-agent 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent.py +12 -11
- vision_agent/llm/llm.py +5 -0
- vision_agent/lmm/lmm.py +13 -4
- vision_agent/tools/__init__.py +4 -0
- vision_agent/tools/tools.py +226 -6
- {vision_agent-0.2.3.dist-info → vision_agent-0.2.4.dist-info}/METADATA +29 -3
- {vision_agent-0.2.3.dist-info → vision_agent-0.2.4.dist-info}/RECORD +9 -9
- {vision_agent-0.2.3.dist-info → vision_agent-0.2.4.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.3.dist-info → vision_agent-0.2.4.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent.py
CHANGED
@@ -377,6 +377,7 @@ def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]
             "dinov_",
             "zero_shot_counting_",
             "visual_prompt_counting_",
+            "ocr_",
         ]:
             continue
 
@@ -523,20 +524,20 @@ class VisionAgent(Agent):
         if image:
             question += f" Image name: {image}"
         if reference_data:
-
-                "image
-
-
-
-
-
-            visual_prompt_data = (
-                f"Reference mask: {reference_data['mask']}"
+            question += (
+                f" Reference image: {reference_data['image']}"
+                if "image" in reference_data
+                else ""
+            )
+            question += (
+                f" Reference mask: {reference_data['mask']}"
                 if "mask" in reference_data
-                else
+                else ""
             )
             question += (
-                f" Reference
+                f" Reference bbox: {reference_data['bbox']}"
+                if "bbox" in reference_data
+                else ""
             )
 
         reflections = ""
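For reference, the added lines build the question string purely from whichever keys are present in `reference_data`; a minimal standalone sketch of that behavior (the question text and file paths are hypothetical):

```python
question = "Segment the object that matches my reference."
reference_data = {"image": "ref.jpg", "mask": "ref_mask.png"}  # hypothetical paths; "bbox" omitted

# Mirrors the new code above: absent keys contribute an empty string.
question += (
    f" Reference image: {reference_data['image']}" if "image" in reference_data else ""
)
question += (
    f" Reference mask: {reference_data['mask']}" if "mask" in reference_data else ""
)
question += (
    f" Reference bbox: {reference_data['bbox']}" if "bbox" in reference_data else ""
)
# question is now:
# "Segment the object that matches my reference. Reference image: ref.jpg Reference mask: ref_mask.png"
```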
vision_agent/llm/llm.py
CHANGED
@@ -131,6 +131,11 @@ class OpenAILLM(LLM):
     def generate_zero_shot_counter(self, question: str) -> Callable:
         return lambda x: ZeroShotCounting()(**{"image": x})
 
+    def generate_image_qa_tool(self, question: str) -> Callable:
+        from vision_agent.tools import ImageQuestionAnswering
+
+        return lambda x: ImageQuestionAnswering()(**{"prompt": question, "image": x})
+
 
 class AzureOpenAILLM(OpenAILLM):
     def __init__(
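For context, the new `generate_image_qa_tool` mirrors `generate_zero_shot_counter`: it closes over the question and returns a one-argument callable over the image. A minimal usage sketch; the `vision_agent.llm` re-export and the image path are assumptions, not taken from this diff:

```python
from vision_agent.llm import OpenAILLM  # assumed re-export; the class is defined in vision_agent/llm/llm.py

llm = OpenAILLM()
# Returns a callable that runs ImageQuestionAnswering with the fixed prompt on whatever image it receives.
answer_color = llm.generate_image_qa_tool("What color is the car?")
result = answer_color("car.jpg")  # hypothetical image path; result is the tool's {"text": ...}-style output
```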
vision_agent/lmm/lmm.py
CHANGED
@@ -11,11 +11,7 @@ from openai import AzureOpenAI, OpenAI
 
 from vision_agent.tools import (
     CHOOSE_PARAMS,
-    CLIP,
     SYSTEM_PROMPT,
-    GroundingDINO,
-    GroundingSAM,
-    ZeroShotCounting,
 )
 
 _LOGGER = logging.getLogger(__name__)
@@ -205,6 +201,8 @@ class OpenAILMM(LMM):
         return cast(str, response.choices[0].message.content)
 
     def generate_classifier(self, question: str) -> Callable:
+        from vision_agent.tools import CLIP
+
         api_doc = CLIP.description + "\n" + str(CLIP.usage)
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
@@ -228,6 +226,8 @@ class OpenAILMM(LMM):
         return lambda x: CLIP()(**{"prompt": params["prompt"], "image": x})
 
     def generate_detector(self, question: str) -> Callable:
+        from vision_agent.tools import GroundingDINO
+
         api_doc = GroundingDINO.description + "\n" + str(GroundingDINO.usage)
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
@@ -251,6 +251,8 @@ class OpenAILMM(LMM):
         return lambda x: GroundingDINO()(**{"prompt": params["prompt"], "image": x})
 
     def generate_segmentor(self, question: str) -> Callable:
+        from vision_agent.tools import GroundingSAM
+
         api_doc = GroundingSAM.description + "\n" + str(GroundingSAM.usage)
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
@@ -274,8 +276,15 @@ class OpenAILMM(LMM):
         return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x})
 
     def generate_zero_shot_counter(self, question: str) -> Callable:
+        from vision_agent.tools import ZeroShotCounting
+
         return lambda x: ZeroShotCounting()(**{"image": x})
 
+    def generate_image_qa_tool(self, question: str) -> Callable:
+        from vision_agent.tools import ImageQuestionAnswering
+
+        return lambda x: ImageQuestionAnswering()(**{"prompt": question, "image": x})
+
 
 class AzureOpenAILMM(OpenAILMM):
     def __init__(
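The tool imports above move from module level into the generator methods, presumably so that `vision_agent.tools` can import `OpenAILMM` (see tools.py below) without creating a circular import. Usage is unchanged; a brief sketch with a hypothetical prompt and image path:

```python
from vision_agent.lmm import OpenAILMM

lmm = OpenAILMM()
# GroundingDINO is only imported when the generator is invoked, not when this module loads.
detect_cars = lmm.generate_detector("Detect all the cars in the image")
detections = detect_cars("street.jpg")  # hypothetical image path
```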
vision_agent/tools/__init__.py
CHANGED
@@ -1,6 +1,7 @@
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 from .tools import (  # Counter,
     CLIP,
+    OCR,
     TOOLS,
     BboxArea,
     BboxIoU,
@@ -13,7 +14,10 @@ from .tools import (  # Counter,
     ImageCaption,
     ZeroShotCounting,
     VisualPromptCounting,
+    VisualQuestionAnswering,
+    ImageQuestionAnswering,
     SegArea,
     SegIoU,
     Tool,
+    register_tool,
 )
vision_agent/tools/tools.py
CHANGED
@@ -1,8 +1,9 @@
+import io
 import logging
 import tempfile
 from abc import ABC
 from pathlib import Path
-from typing import Any, Dict, List, Tuple, Union, cast
+from typing import Any, Dict, List, Tuple, Type, Union, cast
 
 import numpy as np
 import requests
@@ -11,13 +12,14 @@ from PIL.Image import Image as ImageType
 
 from vision_agent.image_utils import (
     convert_to_b64,
+    denormalize_bbox,
     get_image_size,
-    rle_decode,
     normalize_bbox,
-
+    rle_decode,
 )
 from vision_agent.tools.video import extract_frames_from_video
 from vision_agent.type_defs import LandingaiAPIKey
+from vision_agent.lmm import OpenAILMM
 
 _LOGGER = logging.getLogger(__name__)
 _LND_API_KEY = LandingaiAPIKey().api_key
@@ -29,6 +31,9 @@ class Tool(ABC):
     description: str
     usage: Dict
 
+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        raise NotImplementedError
+
 
 class NoOp(Tool):
     name = "noop_"
@@ -498,7 +503,7 @@ class ZeroShotCounting(Tool):
 
     # TODO: Add support for input multiple images, which aligns with the output type.
     def __call__(self, image: Union[str, ImageType]) -> Dict:
-        """Invoke the
+        """Invoke the Zero shot counting model.
 
         Parameters:
             image: the input image.
@@ -562,7 +567,7 @@ class VisualPromptCounting(Tool):
 
     # TODO: Add support for input multiple images, which aligns with the output type.
     def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict:
-        """Invoke the
+        """Invoke the few shot counting model.
 
         Parameters:
             image: the input image.
@@ -583,6 +588,144 @@ class VisualPromptCounting(Tool):
         return _send_inference_request(data, "tools")
 
 
+class VisualQuestionAnswering(Tool):
+    r"""VisualQuestionAnswering is a tool that can explain contents of an image and answer questions about the same
+
+    Example
+    -------
+        >>> import vision_agent as va
+        >>> vqa_tool = va.tools.VisualQuestionAnswering()
+        >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail")
+        {'text': "The image contains a cat sitting on a table with a bowl of milk."}
+    """
+
+    name = "visual_question_answering_"
+    description = "'visual_question_answering_' is a tool that can describe the contents of the image and it can also answer basic questions about the image."
+
+    usage = {
+        "required_parameters": [
+            {"name": "image", "type": "str"},
+            {"name": "prompt", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Describe this image in detail. Image name: cat.jpg",
+                "parameters": {
+                    "image": "cats.jpg",
+                    "prompt": "Describe this image in detail",
+                },
+            },
+            {
+                "scenario": "Can you help me with this street sign in this image ? What does it say ? Image name: sign.jpg",
+                "parameters": {
+                    "image": "sign.jpg",
+                    "prompt": "Can you help me with this street sign ? What does it say ?",
+                },
+            },
+            {
+                "scenario": "Describe the weather in the image for me ? Image name: weather.jpg",
+                "parameters": {
+                    "image": "weather.jpg",
+                    "prompt": "Describe the weather in the image for me ",
+                },
+            },
+            {
+                "scenario": "Which 2 are the least frequent bins in this histogram ? Image name: chart.jpg",
+                "parameters": {
+                    "image": "chart.jpg",
+                    "prompt": "Which 2 are the least frequent bins in this histogram",
+                },
+            },
+        ],
+    }
+
+    def __call__(self, image: str, prompt: str) -> Dict:
+        """Invoke the visual question answering model.
+
+        Parameters:
+            image: the input image.
+
+        Returns:
+            A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}
+        """
+
+        gpt = OpenAILMM()
+        return {"text": gpt(input=prompt, images=[image])}
+
+
+class ImageQuestionAnswering(Tool):
+    r"""ImageQuestionAnswering is a tool that can explain contents of an image and answer questions about the same
+    It is same as VisualQuestionAnswering but this tool is not used by agents. It is used when user requests a tool for VQA using generate_image_qa_tool function.
+    It is also useful if the user wants the data to be not exposed to OpenAI endpoints
+
+    Example
+    -------
+        >>> import vision_agent as va
+        >>> vqa_tool = va.tools.ImageQuestionAnswering()
+        >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail")
+        {'text': "The image contains a cat sitting on a table with a bowl of milk."}
+    """
+
+    name = "image_question_answering_"
+    description = "'image_question_answering_' is a tool that can describe the contents of the image and it can also answer basic questions about the image."
+
+    usage = {
+        "required_parameters": [
+            {"name": "image", "type": "str"},
+            {"name": "prompt", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Describe this image in detail. Image name: cat.jpg",
+                "parameters": {
+                    "image": "cats.jpg",
+                    "prompt": "Describe this image in detail",
+                },
+            },
+            {
+                "scenario": "Can you help me with this street sign in this image ? What does it say ? Image name: sign.jpg",
+                "parameters": {
+                    "image": "sign.jpg",
+                    "prompt": "Can you help me with this street sign ? What does it say ?",
+                },
+            },
+            {
+                "scenario": "Describe the weather in the image for me ? Image name: weather.jpg",
+                "parameters": {
+                    "image": "weather.jpg",
+                    "prompt": "Describe the weather in the image for me ",
+                },
+            },
+            {
+                "scenario": "Can you generate an image question answering tool ? Image name: chart.jpg, prompt: Which 2 are the least frequent bins in this histogram",
+                "parameters": {
+                    "image": "chart.jpg",
+                    "prompt": "Which 2 are the least frequent bins in this histogram",
+                },
+            },
+        ],
+    }
+
+    def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict:
+        """Invoke the visual question answering model.
+
+        Parameters:
+            image: the input image.
+
+        Returns:
+            A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}
+        """
+
+        image_b64 = convert_to_b64(image)
+        data = {
+            "image": image_b64,
+            "prompt": prompt,
+            "tool": "image_question_answering",
+        }
+
+        return _send_inference_request(data, "tools")
+
+
 class Crop(Tool):
     r"""Crop crops an image given a bounding box and returns a file name of the cropped image."""
 
@@ -858,6 +1001,57 @@ class ExtractFrames(Tool):
         return result
 
 
+class OCR(Tool):
+    name = "ocr_"
+    description = "'ocr_' extracts text from an image."
+    usage = {
+        "required_parameters": [
+            {"name": "image", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Can you extract the text from this image? Image name: image.png",
+                "parameters": {"image": "image.png"},
+            },
+        ],
+    }
+    _API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV"
+    _URL = "https://app.landing.ai/ocr/v1/detect-text"
+
+    def __call__(self, image: str) -> dict:
+        pil_image = Image.open(image).convert("RGB")
+        image_size = pil_image.size[::-1]
+        image_buffer = io.BytesIO()
+        pil_image.save(image_buffer, format="PNG")
+        buffer_bytes = image_buffer.getvalue()
+        image_buffer.close()
+
+        res = requests.post(
+            self._URL,
+            files={"images": buffer_bytes},
+            data={"language": "en"},
+            headers={"contentType": "multipart/form-data", "apikey": self._API_KEY},
+        )
+        if res.status_code != 200:
+            _LOGGER.error(f"Request failed: {res.text}")
+            raise ValueError(f"Request failed: {res.text}")
+
+        data = res.json()
+        output: Dict[str, List] = {"labels": [], "bboxes": [], "scores": []}
+        for det in data[0]:
+            output["labels"].append(det["text"])
+            box = [
+                det["location"][0]["x"],
+                det["location"][0]["y"],
+                det["location"][2]["x"],
+                det["location"][2]["y"],
+            ]
+            box = normalize_bbox(box, image_size)
+            output["bboxes"].append(box)
+            output["scores"].append(round(det["score"], 2))
+        return output
+
+
 class Calculator(Tool):
     r"""Calculator is a tool that can perform basic arithmetic operations."""
 
@@ -889,11 +1083,11 @@ TOOLS = {
         [
             NoOp,
             CLIP,
-            ImageCaption,
             GroundingDINO,
             AgentGroundingSAM,
             ZeroShotCounting,
             VisualPromptCounting,
+            VisualQuestionAnswering,
             AgentDINOv,
             ExtractFrames,
             Crop,
@@ -903,6 +1097,7 @@ TOOLS = {
             SegIoU,
             BboxContains,
             BoxDistance,
+            OCR,
             Calculator,
         ]
     )
@@ -910,6 +1105,31 @@ TOOLS = {
 }
 
 
+def register_tool(tool: Type[Tool]) -> Type[Tool]:
+    r"""Add a tool to the list of available tools.
+
+    Parameters:
+        tool: The tool to add.
+    """
+
+    if (
+        not hasattr(tool, "name")
+        or not hasattr(tool, "description")
+        or not hasattr(tool, "usage")
+    ):
+        raise ValueError(
+            "The tool must have 'name', 'description' and 'usage' attributes."
+        )
+
+    TOOLS[len(TOOLS)] = {
+        "name": tool.name,
+        "description": tool.description,
+        "usage": tool.usage,
+        "class": tool,
+    }
+    return tool
+
+
 def _send_inference_request(
     payload: Dict[str, Any], endpoint_name: str
 ) -> Dict[str, Any]:
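A minimal usage sketch for the new `OCR` tool; the image path and the example output values are hypothetical, but the output keys match the class above:

```python
from vision_agent.tools import OCR

ocr_tool = OCR()
result = ocr_tool(image="sign.jpg")  # hypothetical image path
# e.g. {"labels": ["STOP"], "bboxes": [[0.12, 0.34, 0.56, 0.78]], "scores": [0.99]}
# bboxes are normalized to the image size via normalize_bbox; scores are rounded to 2 decimals.
```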
{vision_agent-0.2.3.dist-info → vision_agent-0.2.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.3
+Version: 0.2.4
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -58,7 +58,7 @@ pip install vision-agent
 ```
 
 Ensure you have an OpenAI API key and set it as an environment variable (if you are
-using Azure OpenAI please see the
+using Azure OpenAI please see the Azure setup section):
 
 ```bash
 export OPENAI_API_KEY="your-api-key"
@@ -123,6 +123,31 @@ you. For example:
 }]
 ```
 
+#### Custom Tools
+You can also add your own custom tools for your vision agent to use:
+
+```python
+>>> from vision_agent.tools import Tool, register_tool
+>>> @register_tool
+>>> class NumItems(Tool):
+>>>     name = "num_items_"
+>>>     description = "Returns the number of items in a list."
+>>>     usage = {
+>>>         "required_parameters": [{"name": "prompt", "type": "list"}],
+>>>         "examples": [
+>>>             {
+>>>                 "scenario": "How many items are in this list? ['a', 'b', 'c']",
+>>>                 "parameters": {"prompt": "['a', 'b', 'c']"},
+>>>             }
+>>>         ],
+>>>     }
+>>>     def __call__(self, prompt: list[str]) -> int:
+>>>         return len(prompt)
+```
+This will register it with the list of tools Vision Agent has access to. It will be able
+to pick it based on the tool description and use it based on the usage provided.
+
+#### Tool List
 | Tool | Description |
 | --- | --- |
 | CLIP | CLIP is a tool that can classify or tag any image given a set of input classes or tags. |
@@ -141,11 +166,12 @@ you. For example:
 | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image |
 | VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and visual prompt |
+| OCR | OCR returns the text detected in an image along with the location. |
 
 
 It also has a basic set of calculate tools such as add, subtract, multiply and divide.
 
-###
+### Azure Setup
 If you want to use Azure OpenAI models, you can set the environment variable:
 
 ```bash
{vision_agent-0.2.3.dist-info → vision_agent-0.2.4.dist-info}/RECORD
CHANGED
@@ -5,21 +5,21 @@ vision_agent/agent/easytool.py,sha256=oMHnBg7YBtIPgqQUNcZgq7uMgpPThs99_UnO7ERkMV
 vision_agent/agent/easytool_prompts.py,sha256=zdQQw6WpXOmvwOMtlBlNKY5a3WNlr65dbUvMIGiqdeo,4526
 vision_agent/agent/reflexion.py,sha256=4gz30BuFMeGxSsTzoDV4p91yE0R8LISXp28IaOI6wdM,10506
 vision_agent/agent/reflexion_prompts.py,sha256=G7UAeNz_g2qCb2yN6OaIC7bQVUkda4m3z42EG8wAyfE,9342
-vision_agent/agent/vision_agent.py,sha256=
+vision_agent/agent/vision_agent.py,sha256=Ehb97lyPs7lYM9ipx07yxm6c2kUqz2OnjGQsv-nMwKA,24849
 vision_agent/agent/vision_agent_prompts.py,sha256=W3Z72FpUt71UIJSkjAcgtQqxeMqkYuATqHAN5fYY26c,7342
 vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
 vision_agent/image_utils.py,sha256=YvP5KE9NrWdgJKuHW2NR1glzfObkxtcXBknpmj3Gsbs,7554
 vision_agent/llm/__init__.py,sha256=BoUm_zSAKnLlE8s-gKTSQugXDqVZKPqYlWwlTLdhcz4,48
-vision_agent/llm/llm.py,sha256=
+vision_agent/llm/llm.py,sha256=1BkrSVBWEClyqLc0Rmyw4heLhi_ZVm6JO7-i1wd1ziw,5383
 vision_agent/lmm/__init__.py,sha256=nnNeKD1k7q_4vLb1x51O_EUTYaBgGfeiCx5F433gr3M,67
-vision_agent/lmm/lmm.py,sha256=
-vision_agent/tools/__init__.py,sha256=
+vision_agent/lmm/lmm.py,sha256=sECjGMaGrv1QHq7OiFr-9LoBM5uRLjAqd0Ypp-zyFlw,10552
+vision_agent/tools/__init__.py,sha256=X6yJhWa8iKkQm4Mgf1KcV0_o39-Nrg3E56QAB5gWCO0,413
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=hYgRTHMCBwjT0kkT2SY5MN0FK89vuuecu-x1VqRlGbU,42779
 vision_agent/tools/video.py,sha256=xTElFSFp1Jw4ulOMnk81Vxsh-9dTxcWUO6P9fzEi3AM,7653
 vision_agent/type_defs.py,sha256=4LTnTL4HNsfYqCrDn9Ppjg9bSG2ZGcoKSSd9YeQf4Bw,1792
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.4.dist-info/METADATA,sha256=2T1YLGMh2-n8F0gGf1P2BDhgzxmtmAiylpfW3E3Q4_c,7697
+vision_agent-0.2.4.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.4.dist-info/RECORD,,
{vision_agent-0.2.3.dist-info → vision_agent-0.2.4.dist-info}/LICENSE
File without changes
{vision_agent-0.2.3.dist-info → vision_agent-0.2.4.dist-info}/WHEEL
File without changes