vision-agent 0.0.43__tar.gz → 0.0.45__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vision_agent-0.0.43 → vision_agent-0.0.45}/PKG-INFO +16 -17
- {vision_agent-0.0.43 → vision_agent-0.0.45}/README.md +15 -16
- {vision_agent-0.0.43 → vision_agent-0.0.45}/pyproject.toml +1 -1
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/tools/tools.py +5 -14
- vision_agent-0.0.43/vision_agent/tools/tools.json +0 -154
- {vision_agent-0.0.43 → vision_agent-0.0.45}/LICENSE +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/__init__.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/agent/easytool.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/agent/easytool_prompts.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/agent/reflexion.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/agent/reflexion_prompts.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/data/__init__.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/data/data.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/emb/__init__.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/emb/emb.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/image_utils.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/llm/__init__.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/llm/llm.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/tools/__init__.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/tools/video.py +0 -0

{vision_agent-0.0.43 → vision_agent-0.0.45}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.0.43
+Version: 0.0.45
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

@@ -71,8 +71,8 @@ You can interact with the agents as you would with any LLM or LMM model:
 ```python
 >>> import vision_agent as va
 >>> agent = VisionAgent()
->>> agent("
-"
+>>> agent("What percentage of the area of this jar is filled with coffee beans?", image="jar.jpg")
+"The percentage of area of the jar filled with coffee beans is 25%."
 ```
 
 To better understand how the model came up with it's answer, you can also run it in

@@ -86,22 +86,22 @@ You can also have it return the workflow it used to complete the task along with
 the individual steps and tools to get the answer:
 
 ```python
->>> resp, workflow = agent.chat_with_workflow([{"role": "user", "content": "
+>>> resp, workflow = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of this jar is filled with coffee beans?"}], image="jar.jpg")
 >>> print(workflow)
-[{"task": "
-"tool": "
-"parameters": {"prompt": "
+[{"task": "Segment the jar using 'grounding_sam_'.",
+"tool": "grounding_sam_",
+"parameters": {"prompt": "jar", "image": "jar.jpg"},
 "call_results": [[
 {
-"labels": ["
-"scores": [0.99
+"labels": ["jar"],
+"scores": [0.99],
 "bboxes": [
 [0.58, 0.2, 0.72, 0.45],
-
-
+],
+"masks": "mask.png"
 }
 ]],
-"answer": "
+"answer": "The jar is located at [0.58, 0.2, 0.72, 0.45].",
 }]
 ```
 

@@ -113,13 +113,12 @@ you. For example:
 ```python
 >>> import vision_agent as va
 >>> llm = va.llm.OpenAILLM()
->>> detector = llm.generate_detector("Can you build
->>> detector("
-[{"labels": ["
-"scores": [0.99
+>>> detector = llm.generate_detector("Can you build a jar detector for me?")
+>>> detector("jar.jpg")
+[{"labels": ["jar",],
+"scores": [0.99],
 "bboxes": [
 [0.58, 0.2, 0.72, 0.45],
-[0.94, 0.57, 0.98, 0.66],
 ]
 }]
 ```

{vision_agent-0.0.43 → vision_agent-0.0.45}/README.md

@@ -42,8 +42,8 @@ You can interact with the agents as you would with any LLM or LMM model:
 ```python
 >>> import vision_agent as va
 >>> agent = VisionAgent()
->>> agent("
-"
+>>> agent("What percentage of the area of this jar is filled with coffee beans?", image="jar.jpg")
+"The percentage of area of the jar filled with coffee beans is 25%."
 ```
 
 To better understand how the model came up with it's answer, you can also run it in

@@ -57,22 +57,22 @@ You can also have it return the workflow it used to complete the task along with
 the individual steps and tools to get the answer:
 
 ```python
->>> resp, workflow = agent.chat_with_workflow([{"role": "user", "content": "
+>>> resp, workflow = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of this jar is filled with coffee beans?"}], image="jar.jpg")
 >>> print(workflow)
-[{"task": "
-"tool": "
-"parameters": {"prompt": "
+[{"task": "Segment the jar using 'grounding_sam_'.",
+"tool": "grounding_sam_",
+"parameters": {"prompt": "jar", "image": "jar.jpg"},
 "call_results": [[
 {
-"labels": ["
-"scores": [0.99
+"labels": ["jar"],
+"scores": [0.99],
 "bboxes": [
 [0.58, 0.2, 0.72, 0.45],
-
-
+],
+"masks": "mask.png"
 }
 ]],
-"answer": "
+"answer": "The jar is located at [0.58, 0.2, 0.72, 0.45].",
 }]
 ```
 
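For readers trying the updated `chat_with_workflow` example, here is a minimal, hedged sketch of how the returned workflow list could be inspected. It assumes only the keys that appear in the diffed README output ("task", "tool", "parameters", "answer"); `print_workflow` is a hypothetical helper, not part of the package.

```python
# Hypothetical helper (not part of vision-agent) for inspecting the workflow
# structure shown in the README example above.
def print_workflow(workflow: list) -> None:
    for step in workflow:
        # Keys taken from the diffed README output; missing keys print as None.
        print("task:      ", step.get("task"))
        print("tool:      ", step.get("tool"))
        print("parameters:", step.get("parameters"))
        print("answer:    ", step.get("answer"))

# Usage, following the README: resp, workflow = agent.chat_with_workflow([...], image="jar.jpg")
# then print_workflow(workflow)
```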

@@ -84,13 +84,12 @@ you. For example:
 ```python
 >>> import vision_agent as va
 >>> llm = va.llm.OpenAILLM()
->>> detector = llm.generate_detector("Can you build
->>> detector("
-[{"labels": ["
-"scores": [0.99
+>>> detector = llm.generate_detector("Can you build a jar detector for me?")
+>>> detector("jar.jpg")
+[{"labels": ["jar",],
+"scores": [0.99],
 "bboxes": [
 [0.58, 0.2, 0.72, 0.45],
-[0.94, 0.57, 0.98, 0.66],
 ]
 }]
 ```
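As a usage note, the detector example above can be written as a short script. The sketch below assumes the API exactly as shown in the diffed README (`va.llm.OpenAILLM`, `generate_detector`, and result dicts with "labels", "scores", "bboxes"); "jar.jpg" is a placeholder path and an OpenAI API key is assumed to be configured.

```python
import vision_agent as va

# Sketch of the README detector example in script form; assumes the API shown
# in the diff above and credentials configured for OpenAILLM.
llm = va.llm.OpenAILLM()
detector = llm.generate_detector("Can you build a jar detector for me?")

results = detector("jar.jpg")  # "jar.jpg" is a placeholder image path
for det in results:
    # Result keys ("labels", "scores", "bboxes") follow the README example.
    for label, score, bbox in zip(det["labels"], det["scores"], det["bboxes"]):
        print(f"{label}: score={score}, bbox={bbox}")
```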

{vision_agent-0.0.43 → vision_agent-0.0.45}/vision_agent/tools/tools.py

@@ -1,6 +1,5 @@
 import logging
 import tempfile
-import os
 from abc import ABC
 from collections import Counter as CounterClass
 from pathlib import Path

@@ -140,7 +139,7 @@ class GroundingDINO(Tool):
     'scores': [0.98, 0.02]}]
     """
 
-    _ENDPOINT = "https://
+    _ENDPOINT = "https://chnicr4kes5ku77niv2zoytggq0qyqlp.lambda-url.us-east-2.on.aws"
 
     name = "grounding_dino_"
     description = "'grounding_dino_' is a tool that can detect arbitrary objects with inputs such as category names or referring expressions."

@@ -183,15 +182,11 @@ class GroundingDINO(Tool):
         image_b64 = convert_to_b64(image)
         data = {
             "prompt": prompt,
-            "images": image_b64,
-            "tool": "visual_grounding",
+            "images": [image_b64],
         }
         res = requests.post(
             self._ENDPOINT,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}",
-            },
+            headers={"Content-Type": "application/json"},
             json=data,
         )
         resp_json: Dict[str, Any] = res.json()
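To make the request change above concrete, here is a hedged sketch of an equivalent standalone call to the new GroundingDINO endpoint: the Baseten `Authorization` header is gone and `"images"` is now a list of base64 strings. It assumes `convert_to_b64` is the helper used in tools.py and lives in `vision_agent.image_utils`; the prompt and image path are placeholders, and everything else is taken from the hunk.

```python
import requests

from vision_agent.image_utils import convert_to_b64  # assumed location of the helper used in tools.py

# Standalone sketch of the updated GroundingDINO request: public Lambda URL,
# plain JSON headers (no Baseten API key), and "images" as a list of base64 strings.
ENDPOINT = "https://chnicr4kes5ku77niv2zoytggq0qyqlp.lambda-url.us-east-2.on.aws"

image_b64 = convert_to_b64("jar.jpg")  # placeholder image path
data = {"prompt": "jar", "images": [image_b64]}
res = requests.post(ENDPOINT, headers={"Content-Type": "application/json"}, json=data)
print(res.json())
```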

@@ -235,7 +230,7 @@ class GroundingSAM(Tool):
     [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}]
     """
 
-    _ENDPOINT = "https://
+    _ENDPOINT = "https://cou5lfmus33jbddl6hoqdfbw7e0qidrw.lambda-url.us-east-2.on.aws"
 
    name = "grounding_sam_"
     description = "'grounding_sam_' is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions."

@@ -279,14 +274,10 @@ class GroundingSAM(Tool):
         data = {
             "classes": prompt,
             "image": image_b64,
-            "tool": "visual_grounding_segment",
         }
         res = requests.post(
             self._ENDPOINT,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}",
-            },
+            headers={"Content-Type": "application/json"},
             json=data,
         )
         resp_json: Dict[str, Any] = res.json()
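The same simplification applies to GroundingSAM. Below is a hedged sketch of the equivalent direct call, using only what the hunk shows (a `"classes"` field plus a single base64 `"image"`, no Baseten header). The `convert_to_b64` import location is assumed as above, and the exact shape expected for `"classes"` (here a list with one placeholder class) is an assumption.

```python
import requests

from vision_agent.image_utils import convert_to_b64  # assumed helper location, as above

# Standalone sketch of the updated GroundingSAM request: classes plus a single
# base64-encoded image, sent to the public Lambda URL without an API key header.
ENDPOINT = "https://cou5lfmus33jbddl6hoqdfbw7e0qidrw.lambda-url.us-east-2.on.aws"

data = {
    "classes": ["jar"],               # placeholder; shape of the classes value is assumed
    "image": convert_to_b64("jar.jpg"),  # placeholder image path
}
res = requests.post(ENDPOINT, headers={"Content-Type": "application/json"}, json=data)
print(res.json())
```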

vision_agent-0.0.43/vision_agent/tools/tools.json

@@ -1,154 +0,0 @@
-[
-  {
-    "name": "image_question_answering",
-    "description": "answers a question about an image"
-  },
-  {
-    "name": "text_question_answering",
-    "description": "answers a question provided a context in text form"
-  },
-  {
-    "name": "image_captioning",
-    "description": "Generate a caption for the image. It can generate a brief description that can be used for image perception and image generation. For example: a) you can use this tool when you want to know what is it in the image\"; and b) when you want to generate a new image similar or resemble to input.png, you can use `image_captioning` to obtain the description about image input.png."
-  },
-  {
-    "name": "image_to_text",
-    "description": "Generate a description for the image. It can generate a detailed description that can be used for image perception and image generation. For example: a) you can use this tool when you want to know what is it in the image\"; and b) when you want to generate a new image similar or resemble to input.png, you can use `text_to_image` to obtain the description about image input.png."
-  },
-  {
-    "name": "image_to_edge",
-    "description": "get the edge map of the image."
-  },
-  {
-    "name": "image_to_line",
-    "description": "get the line map of the image."
-  },
-  {
-    "name": "image_to_hed",
-    "description": "get the HED map of the image."
-  },
-  {
-    "name": "image_to_scribble",
-    "description": "get the scribble of the image."
-  },
-  {
-    "name": "image_to_pose",
-    "description": "Get the pose of the image. It is usually used in image generation conditioned on pose map from input image."
-  },
-  {
-    "name": "image_to_depth",
-    "description": "get the depth map of the image."
-  },
-  {
-    "name": "image_to_normal",
-    "description": "get the normal map of the image."
-  },
-  {
-    "name": "object_detection",
-    "description": "detect all the objects in the image."
-  },
-  {
-    "name": "image_classification",
-    "description": "classify the objects in the image."
-  },
-  {
-    "name": "closed_set_image_classification",
-    "description": "Given a set of classes as a text prompt, classify the objects in the image based on the given classes."
-  },
-  {
-    "name": "panoptic_segmentation",
-    "description": "segment the common objects in the given image."
-  },
-  {
-    "name": "visual_grounding",
-    "description": "Visual Grounding (VG) aims to locate the most relevant object or region in an image, based on a natural language query. The query can be a phrase, a sentence or even a multi-round dialogue."
-  },
-  {
-    "name": "visual_grounding_segment",
-    "description": "Visual Grounding (VG) aims to locate the most relevant object or region in an image, based on a natural language query. The query can be a phrase, a sentence or even a multi-round dialogue."
-  },
-  {
-    "name": "optical_character_recognition",
-    "description": "Optical Character Recognition (OCR) is the process that converts an image of text into a machine-readable text format."
-  },
-  {
-    "name": "select_category",
-    "description": "select the target classes in category list with the given condition."
-  },
-  {
-    "name": "select_bbox",
-    "description": "select the bounding boxes with the given condition."
-  },
-  {
-    "name": "select_mask",
-    "description": "select the masks with the given condition."
-  },
-  {
-    "name": "count_categories",
-    "description": "count target categories in the given list."
-  },
-  {
-    "name": "count_objects",
-    "description": "count target objects in the given list. It is useful when you want to count the number of objects in the image"
-  },
-  {
-    "name": "count_masks",
-    "description": "count target mask in the given list."
-  },
-  {
-    "name": "video_captioning",
-    "description": "Generate a caption or description for video. It can generate a detailed description that can be used for video perception and video generation. For example: a) you can use this tool when you want to know what happened in the video\"; and b) when you want to generate tags for input video, you can use translate description obtained from `image_captioning` into tags."
-  },
-  {
-    "name": "video_classification",
-    "description": "Classify the video and detect the actions in the video."
-  },
-  {
-    "name": "frame_captioning",
-    "description": "Generate a caption or description for video every n seconds. It can generate a detailed description that can be used for video perception and video generation. For example: a) you can use this tool when you want to know what happened in the video\"; and b) when you want to generate tags for input video, you can use translate description obtained from `image_captioning` into tags."
-  },
-  {
-    "name": "frame_classification",
-    "description": "Classify the video and detect the actions in the every n seconds."
-  },
-  {
-    "name": "text_to_text_generation",
-    "description": "Text to text generation. It can be used for sentence acceptability judgment, Sentiment analysis, Paraphrasing/sentence similarity, Natural language inference, Sentence completion, Word sense disambiguation, Question answering."
-  },
-  {
-    "name": "openai_chat_model",
-    "description": "Answer the question by Large Language Model. It is useful for tasks such as generating content, answering questions, engaging in conversations and providing explanations. However, it still has some limitations. For example, it can not directly access the up-to-date information like time, weather, etc."
-  },
-  {
-    "name": "summarization",
-    "description": "Summarize sentences, long narratives, articles, papers, textbooks."
-  },
-  {
-    "name": "text_to_tags",
-    "description": "Predict the tags of text, article and papers by using the their textual content as input"
-  },
-  {
-    "name": "sentiment_analysis",
-    "description": "Sentiment analysis is the process of analyzing digital text to determine if the emotional tone of the message is positive, negative, or neutral."
-  }
-]
-
-input_keys: [
-  { "image": "image file"},
-  { "prompt": "text"},
-  { "video": "video file"},
-  { "context": "context for text qa"},
-  { "tool": "tool name mentioned above"},
-]
-
-output_keys: [
-  { "labels": "list of labels for image tasks"},
-  { "scores": "list of scores for image and text tasks"},
-  { "bboxes": "list of bounding boxes for detection tasks"},
-  { "masks": "list of masks for segmentation tasks"},
-  { "text": "list of text for text tasks"},
-  { "frames": "list of frame numbers for video tasks"},
-]
-
-
-