vision-agent 1.0.3__py3-none-any.whl → 1.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/.sim_tools/df.csv +46 -47
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/agent/vision_agent_planner_prompts_v2.py +57 -58
- vision_agent/agent/vision_agent_planner_v2.py +3 -2
- vision_agent/configs/anthropic_config.py +29 -16
- vision_agent/configs/config.py +14 -15
- vision_agent/configs/openai_config.py +10 -10
- vision_agent/lmm/lmm.py +2 -2
- vision_agent/sim/sim.py +4 -1
- vision_agent/tools/planner_tools.py +13 -14
- vision_agent/tools/tools.py +16 -27
- vision_agent/utils/tools.py +8 -2
- {vision_agent-1.0.3.dist-info → vision_agent-1.0.5.dist-info}/METADATA +31 -3
- {vision_agent-1.0.3.dist-info → vision_agent-1.0.5.dist-info}/RECORD +16 -17
- vision_agent/configs/anthropic_openai_config.py +0 -164
- {vision_agent-1.0.3.dist-info → vision_agent-1.0.5.dist-info}/LICENSE +0 -0
- {vision_agent-1.0.3.dist-info → vision_agent-1.0.5.dist-info}/WHEEL +0 -0
vision_agent/configs/anthropic_config.py
CHANGED
@@ -2,7 +2,7 @@ from typing import Type
 
 from pydantic import BaseModel, Field
 
-from vision_agent.lmm import LMM, AnthropicLMM
+from vision_agent.lmm import LMM, AnthropicLMM, OpenAILMM
 
 
 class Config(BaseModel):
@@ -10,7 +10,7 @@ class Config(BaseModel):
     agent: Type[LMM] = Field(default=AnthropicLMM)
     agent_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -20,18 +20,17 @@ class Config(BaseModel):
     planner: Type[LMM] = Field(default=AnthropicLMM)
     planner_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
     )
 
-    # for vision_agent_planner_v2
     summarizer: Type[LMM] = Field(default=AnthropicLMM)
     summarizer_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
-            "temperature":
+            "model_name": "claude-3-7-sonnet-20250219",
+            "temperature": 1.0, # o1 has fixed temperature
             "image_size": 768,
         }
     )
@@ -40,7 +39,7 @@ class Config(BaseModel):
     critic: Type[LMM] = Field(default=AnthropicLMM)
     critic_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -50,7 +49,7 @@ class Config(BaseModel):
     coder: Type[LMM] = Field(default=AnthropicLMM)
     coder_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -60,7 +59,7 @@ class Config(BaseModel):
     tester: Type[LMM] = Field(default=AnthropicLMM)
     tester_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -70,7 +69,7 @@ class Config(BaseModel):
     debugger: Type[LMM] = Field(default=AnthropicLMM)
     debugger_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -80,7 +79,7 @@ class Config(BaseModel):
     tool_tester: Type[LMM] = Field(default=AnthropicLMM)
     tool_tester_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -90,19 +89,30 @@ class Config(BaseModel):
     tool_chooser: Type[LMM] = Field(default=AnthropicLMM)
     tool_chooser_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 1.0,
             "image_size": 768,
         }
     )
 
+    # for get_tool_for_task
+    od_judge: Type[LMM] = Field(default=AnthropicLMM)
+    od_judge_kwargs: dict = Field(
+        default_factory=lambda: {
+            "model_name": "claude-3-7-sonnet-20250219",
+            "temperature": 0.0,
+            "image_size": 512,
+        }
+    )
+
     # for suggestions module
-    suggester: Type[LMM] = Field(default=
+    suggester: Type[LMM] = Field(default=OpenAILMM)
     suggester_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "
+            "model_name": "o1",
             "temperature": 1.0,
-            "
+            "image_detail": "high",
+            "image_size": 1024,
         }
     )
 
@@ -110,7 +120,7 @@ class Config(BaseModel):
     vqa: Type[LMM] = Field(default=AnthropicLMM)
     vqa_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -143,6 +153,9 @@ class Config(BaseModel):
     def create_tool_chooser(self) -> LMM:
         return self.tool_chooser(**self.tool_chooser_kwargs)
 
+    def create_od_judge(self) -> LMM:
+        return self.od_judge(**self.od_judge_kwargs)
+
     def create_suggester(self) -> LMM:
         return self.suggester(**self.suggester_kwargs)
 
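Not part of the diff: a minimal sketch of how the new `od_judge` entry and the `create_od_judge()` factory added above might be consumed. It assumes vision-agent >= 1.0.5 is installed, that `Config` is importable from `vision_agent.configs.config` (the path shown in this diff), and that an `ANTHROPIC_API_KEY` is available in the environment.

```python
# Sketch only: exercising the od_judge config entry introduced in this release.
from vision_agent.configs.config import Config

config = Config()

# create_od_judge() is the factory method added in this diff; by default it
# instantiates AnthropicLMM with the od_judge_kwargs shown above
# (claude-3-7-sonnet, temperature 0.0, image_size 512).
od_judge = config.create_od_judge()
print(type(od_judge).__name__)
```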
vision_agent/configs/config.py
CHANGED
@@ -2,7 +2,7 @@ from typing import Type
 
 from pydantic import BaseModel, Field
 
-from vision_agent.lmm import LMM, AnthropicLMM, OpenAILMM
+from vision_agent.lmm import LMM, AnthropicLMM, OpenAILMM, GoogleLMM
 
 
 class Config(BaseModel):
@@ -10,7 +10,7 @@ class Config(BaseModel):
     agent: Type[LMM] = Field(default=AnthropicLMM)
     agent_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -20,17 +20,16 @@ class Config(BaseModel):
     planner: Type[LMM] = Field(default=AnthropicLMM)
     planner_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
     )
 
-
-    summarizer: Type[LMM] = Field(default=OpenAILMM)
+    summarizer: Type[LMM] = Field(default=AnthropicLMM)
     summarizer_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 1.0, # o1 has fixed temperature
             "image_size": 768,
         }
@@ -40,7 +39,7 @@ class Config(BaseModel):
     critic: Type[LMM] = Field(default=AnthropicLMM)
     critic_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -50,7 +49,7 @@ class Config(BaseModel):
     coder: Type[LMM] = Field(default=AnthropicLMM)
     coder_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -60,7 +59,7 @@ class Config(BaseModel):
     tester: Type[LMM] = Field(default=AnthropicLMM)
     tester_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -70,7 +69,7 @@ class Config(BaseModel):
     debugger: Type[LMM] = Field(default=AnthropicLMM)
     debugger_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -80,7 +79,7 @@ class Config(BaseModel):
     tool_tester: Type[LMM] = Field(default=AnthropicLMM)
     tool_tester_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -90,7 +89,7 @@ class Config(BaseModel):
     tool_chooser: Type[LMM] = Field(default=AnthropicLMM)
     tool_chooser_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 1.0,
             "image_size": 768,
         }
@@ -100,7 +99,7 @@ class Config(BaseModel):
     od_judge: Type[LMM] = Field(default=AnthropicLMM)
     od_judge_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 512,
         }
@@ -118,10 +117,10 @@ class Config(BaseModel):
     )
 
     # for vqa module
-    vqa: Type[LMM] = Field(default=
+    vqa: Type[LMM] = Field(default=GoogleLMM)
     vqa_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "
+            "model_name": "gemini-2.0-flash-exp",
             "temperature": 0.0,
             "image_size": 768,
         }
vision_agent/configs/openai_config.py
CHANGED
@@ -10,7 +10,7 @@ class Config(BaseModel):
     agent: Type[LMM] = Field(default=OpenAILMM)
     agent_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 0.0,
             "image_size": 768,
             "image_detail": "low",
@@ -21,7 +21,7 @@ class Config(BaseModel):
     planner: Type[LMM] = Field(default=OpenAILMM)
     planner_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 0.0,
             "image_size": 768,
             "image_detail": "low",
@@ -42,7 +42,7 @@ class Config(BaseModel):
     critic: Type[LMM] = Field(default=OpenAILMM)
     critic_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 0.0,
             "image_size": 768,
             "image_detail": "low",
@@ -53,7 +53,7 @@ class Config(BaseModel):
     coder: Type[LMM] = Field(default=OpenAILMM)
     coder_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 0.0,
             "image_size": 768,
             "image_detail": "low",
@@ -64,7 +64,7 @@ class Config(BaseModel):
     tester: Type[LMM] = Field(default=OpenAILMM)
     tester_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 0.0,
             "image_size": 768,
             "image_detail": "low",
@@ -75,7 +75,7 @@ class Config(BaseModel):
     debugger: Type[LMM] = Field(default=OpenAILMM)
     debugger_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 0.0,
             "image_size": 768,
             "image_detail": "low",
@@ -86,7 +86,7 @@ class Config(BaseModel):
     tool_tester: Type[LMM] = Field(default=OpenAILMM)
     tool_tester_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 0.0,
             "image_size": 768,
             "image_detail": "low",
@@ -97,7 +97,7 @@ class Config(BaseModel):
     tool_chooser: Type[LMM] = Field(default=OpenAILMM)
     tool_chooser_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 1.0,
             "image_size": 768,
             "image_detail": "low",
@@ -108,7 +108,7 @@ class Config(BaseModel):
     suggester: Type[LMM] = Field(default=OpenAILMM)
     suggester_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 1.0,
             "image_size": 768,
             "image_detail": "low",
@@ -119,7 +119,7 @@ class Config(BaseModel):
     vqa: Type[LMM] = Field(default=OpenAILMM)
     vqa_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 0.0,
             "image_size": 768,
             "image_detail": "low",
vision_agent/lmm/lmm.py
CHANGED
@@ -98,7 +98,7 @@ class OpenAILMM(LMM):
         for c in chat:
             fixed_c = {"role": c["role"]}
             fixed_c["content"] = [{"type": "text", "text": c["content"]}]  # type: ignore
-            if "media" in c:
+            if "media" in c and self.model_name != "o3-mini":
                 for media in c["media"]:
                     resize = kwargs["resize"] if "resize" in kwargs else self.image_size
                     image_detail = (
@@ -154,7 +154,7 @@ class OpenAILMM(LMM):
                 ],
             }
         ]
-        if media and len(media) > 0:
+        if media and len(media) > 0 and self.model_name != "o3-mini":
            for m in media:
                resize = kwargs["resize"] if "resize" in kwargs else None
                image_detail = (
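The two hunks above guard image handling behind a model check. Below is a self-contained sketch (illustration only, not library code) of the idea: when the configured model is text-only, such as `o3-mini`, attached media are skipped rather than added to the request payload.

```python
# Illustration of the guard added above, independent of the vision_agent package.
from typing import Any, Dict, List, Optional

TEXT_ONLY_MODELS = {"o3-mini"}  # assumption: o3-mini is the only excluded model in 1.0.5


def build_content(
    text: str, media_b64: Optional[List[str]], model_name: str
) -> List[Dict[str, Any]]:
    content: List[Dict[str, Any]] = [{"type": "text", "text": text}]
    # Only attach images for models that accept them.
    if media_b64 and model_name not in TEXT_ONLY_MODELS:
        for m in media_b64:
            content.append({"type": "image_url", "image_url": {"url": m, "detail": "low"}})
    return content


# With o3-mini the media list is ignored and only the text part is sent.
print(build_content("describe the scene", ["data:image/png;base64,..."], "o3-mini"))
```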
vision_agent/sim/sim.py
CHANGED
@@ -58,7 +58,10 @@ def stella_embeddings(prompts: List[str]) -> List[np.ndarray]:
     }
     url = f"{_LND_API_URL_v2}/embeddings"
     vision_agent_api_key = get_vision_agent_api_key()
-    headers = {
+    headers = {
+        "Authorization": f"Basic {vision_agent_api_key}",
+        "X-Source": "vision_agent",
+    }
     session = _create_requests_session(
         url=url,
         num_retry=3,
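Both this hunk and the two in `vision_agent/utils/tools.py` further down replace a single-line header assignment with an explicit dictionary carrying Basic auth plus an `X-Source` tag. A minimal sketch of how such headers would be sent; the API key and endpoint URL here are placeholders, not values taken from the library.

```python
# Sketch (not library code) of the header shape used after this change.
import requests

api_key = "your-vision-agent-api-key"  # placeholder for get_vision_agent_api_key()
headers = {
    "Authorization": f"Basic {api_key}",
    "X-Source": "vision_agent",
}
# Hypothetical endpoint for illustration; the library builds its URL from an
# internal _LND_API_URL_v2 constant.
resp = requests.post(
    "https://api.example.com/v1/embeddings",
    json={"input": ["hello"]},
    headers=headers,
)
print(resp.status_code)
```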
vision_agent/tools/planner_tools.py
CHANGED
@@ -236,7 +236,7 @@ def retrieve_tool_docs(lmm: LMM, task: str, exclude_tools: Optional[List[str]])
     all_tool_docs = []
     all_tool_doc_names = set()
     exclude_tools = [] if exclude_tools is None else exclude_tools
-    for category in categories:
+    for category in categories + [task]:
         tool_docs = sim.top_k(category, k=3, thresh=0.3)
 
         for tool_doc in tool_docs:
@@ -248,9 +248,7 @@ def retrieve_tool_docs(lmm: LMM, task: str, exclude_tools: Optional[List[str]])
                 all_tool_doc_names.add(tool_doc["name"])
 
     tool_docs_str = explanation + "\n\n" + "\n".join([e["doc"] for e in all_tool_docs])
-    tool_docs_str += (
-        "\n" + get_load_tools_docstring() + get_tool_documentation([judge_od_results])
-    )
+    tool_docs_str += get_load_tools_docstring()
     return tool_docs_str
 
 
@@ -346,22 +344,22 @@ def get_tool_for_task(
     and output signatures are.
 
     Parameters:
-        task
-        images
+        task (str): The task to accomplish.
+        images (Union[Dict[str, List[np.ndarray]], List[np.ndarray]]): The images to use
             for the task. If a key is provided, it is used as the file name.
-        exclude_tools
+        exclude_tools (Optional[List[str]]): A list of tool names to exclude from the
             recommendations. This is helpful if you are calling get_tool_for_task twice
             and do not want the same tool recommended.
 
     Returns:
-        The
+        None: The function does not return the tool but prints it to stdout.
 
     Examples
     --------
     >>> get_tool_for_task(
     >>> "Give me an OCR model that can find 'hot chocolate' in the image",
     >>> {"image": [image]})
-    >>>
+    >>> get_tool_for_task(
     >>> "I need a tool that can paint a background for this image and maks",
     >>> {"image": [image], "mask": [mask]})
     """
@@ -497,8 +495,8 @@ def finalize_plan(user_request: str, chain_of_thoughts: str) -> str:
     return finalized_plan
 
 
-def
-    """Asks the
+def vqa(prompt: str, medias: List[np.ndarray]) -> None:
+    """Asks the VQA model a question about the given media and returns an answer.
 
     Parameters:
         prompt: str: The question to ask the model.
@@ -515,13 +513,14 @@ def claude35_vqa(prompt: str, medias: List[np.ndarray]) -> None:
     ]
 
     response = cast(str, vqa.generate(prompt, media=all_media_b64))
-    print(f"[
+    print(f"[vqa output]\n{response}\n[end of vqa output]")
 
 
 def suggestion(prompt: str, medias: List[np.ndarray]) -> None:
     """Given your problem statement and the images, this will provide you with a
     suggested plan on how to proceed. Always call suggestion when starting to solve
-    a problem.
+    a problem. 'suggestion' will only print pseudo code for you to execute, it will not
+    execute the code for you.
 
     Parameters:
         prompt: str: The problem statement, provide a detailed description of the
@@ -538,7 +537,7 @@ def suggestion(prompt: str, medias: List[np.ndarray]) -> None:
 
 
 PLANNER_TOOLS = [
-
+    vqa,
     suggestion,
     get_tool_for_task,
 ]
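Not from the diff: a short sketch of how the renamed planner tool might be called once 1.0.5 is installed. It assumes `vqa` is importable from `vision_agent.tools.planner_tools` (the module shown above) and that credentials for the configured VQA model are available; the function prints its answer rather than returning it.

```python
import numpy as np

from vision_agent.tools.planner_tools import vqa  # replaces the old claude35_vqa

# Placeholder image; in practice this would be a decoded frame or photo.
image = np.zeros((256, 256, 3), dtype=np.uint8)

# Prints "[vqa output] ... [end of vqa output]" to stdout and returns None.
vqa("What objects are visible in this image?", [image])
```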
vision_agent/tools/tools.py
CHANGED
@@ -1488,8 +1488,8 @@ def agentic_object_detection(
     """'agentic_object_detection' is a tool that can detect multiple objects given a
     text prompt such as object names or referring expressions on images. It's
     particularly good at detecting specific objects given detailed descriptive prompts
-    but runs slower. It returns a list of bounding boxes
-    label names and associated
+    but runs slower so not ideal for high counts. It returns a list of bounding boxes
+    with normalized coordinates, label names and associated confidence score of 1.0.
 
     Parameters:
         prompt (str): The prompt to ground to the image, only supports a single prompt
@@ -1533,8 +1533,9 @@ def agentic_sam2_instance_segmentation(
     """'agentic_sam2_instance_segmentation' is a tool that can detect multiple
     instances given a text prompt such as object names or referring expressions on
     images. It's particularly good at detecting specific objects given detailed
-    descriptive prompts but runs slower. It returns a list
-    normalized coordinates, label names, masks and associated
+    descriptive prompts but runs slower so not ideal for high counts. It returns a list
+    of bounding boxes with normalized coordinates, label names, masks and associated
+    confidence score of 1.0.
 
     Parameters:
         prompt (str): The object that needs to be counted, only supports a single
@@ -1591,9 +1592,9 @@ def agentic_sam2_video_tracking(
     """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
     objects in a video given a text prompt such as object names or referring
     expressions. It's particularly good at detecting specific objects given detailed
-    descriptive prompts but runs slower
-    names, masks and associated
-    counting without duplicating counts.
+    descriptive prompts but runs slower so not ideal for high counts. It returns a list
+    of bounding boxes, label names, masks and associated confidence score of 1.0 and is
+    useful for tracking and counting without duplicating counts.
 
     Parameters:
         prompt (str): The prompt to ground to the image, only supports a single prompt
@@ -2307,22 +2308,10 @@ def _qwenvl_activity_recognition(
         return [0.0] * len(segment)
 
 
-def _qwen2vl_activity_recognition(
-    segment: List[np.ndarray], prompt: str
-) -> List[float]:
-    return _qwenvl_activity_recognition(segment, prompt, model_name="qwen2vl")
-
-
-def _qwen25vl_activity_recognition(
-    segment: List[np.ndarray], prompt: str
-) -> List[float]:
-    return _qwenvl_activity_recognition(segment, prompt, model_name="qwen25vl")
-
-
 def activity_recognition(
     prompt: str,
     frames: List[np.ndarray],
-    model: str = "
+    model: str = "qwen25vl",
     chunk_length_frames: int = 10,
 ) -> List[float]:
     """'activity_recognition' is a tool that can recognize activities in a video given a
@@ -2371,12 +2360,12 @@ def activity_recognition(
     elif model == "qwen2vl":
 
         def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
-            return
+            return _qwenvl_activity_recognition(segment, prompt, model_name="qwen2vl")
 
     elif model == "qwen25vl":
 
         def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
-            return
+            return _qwenvl_activity_recognition(segment, prompt, model_name="qwen25vl")
 
     else:
         raise ValueError(f"Invalid model: {model}")
@@ -3488,9 +3477,9 @@ def _plot_counting(
 
 
 FUNCTION_TOOLS = [
-
-
-
+    glee_object_detection,
+    glee_sam2_instance_segmentation,
+    glee_sam2_video_tracking,
     countgd_object_detection,
     countgd_sam2_instance_segmentation,
     countgd_sam2_video_tracking,
@@ -3502,8 +3491,8 @@ FUNCTION_TOOLS = [
     document_extraction,
     document_qa,
     ocr,
-
-
+    qwen25_vl_images_vqa,
+    qwen25_vl_video_vqa,
     activity_recognition,
     depth_anything_v2,
     generate_pose_image,
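A usage sketch reflecting the new default above, where `activity_recognition` now falls back to the `qwen25vl` model. It assumes vision-agent >= 1.0.5, a VisionAgent API key, and that the function is re-exported from `vision_agent.tools`; the frames below are placeholders.

```python
import numpy as np

from vision_agent.tools import activity_recognition  # assumed re-export of tools.py

# Placeholder frames; real use would pass decoded video frames.
frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(30)]

scores = activity_recognition(
    "person waving",
    frames,
    model="qwen25vl",          # explicit here, but now also the default
    chunk_length_frames=10,
)
print(len(scores))  # one score per frame
```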
vision_agent/utils/tools.py
CHANGED
@@ -56,7 +56,10 @@ def send_inference_request(
         url = os.environ["TOOL_ENDPOINT_URL"]
 
     vision_agent_api_key = get_vision_agent_api_key()
-    headers = {
+    headers = {
+        "Authorization": f"Basic {vision_agent_api_key}",
+        "X-Source": "vision_agent",
+    }
     if "TOOL_ENDPOINT_AUTH" in os.environ:
         headers["Authorization"] = os.environ["TOOL_ENDPOINT_AUTH"]
         headers.pop("apikey")
@@ -90,7 +93,10 @@ def send_task_inference_request(
 ) -> Any:
     url = f"{_LND_API_URL_v2}/{task_name}"
     vision_agent_api_key = get_vision_agent_api_key()
-    headers = {
+    headers = {
+        "Authorization": f"Basic {vision_agent_api_key}",
+        "X-Source": "vision_agent",
+    }
     session = _create_requests_session(
         url=url,
         num_retry=3,
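Illustration only: how the rebuilt header logic in `send_inference_request` behaves when a custom tool endpoint is configured. The helper below is a stand-in, not the library function; note that the library's own code pops an `apikey` entry, which this sketch guards with a default.

```python
import os


def build_headers(api_key: str) -> dict:
    # Mirrors the header shape added in this diff.
    headers = {
        "Authorization": f"Basic {api_key}",
        "X-Source": "vision_agent",
    }
    # When a custom tool endpoint is configured, its auth value wins.
    if "TOOL_ENDPOINT_AUTH" in os.environ:
        headers["Authorization"] = os.environ["TOOL_ENDPOINT_AUTH"]
        headers.pop("apikey", None)  # library pops "apikey"; default added here for safety
    return headers


print(build_headers("example-key"))
```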
{vision_agent-1.0.3.dist-info → vision_agent-1.0.5.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 1.0.
+Version: 1.0.5
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -65,10 +65,10 @@ pip install vision-agent
 
 ```bash
 export ANTHROPIC_API_KEY="your-api-key"
-export
+export GEMINI_API_KEY="your-api-key"
 ```
 
-> **_NOTE:_** We found using both Anthropic Claude-3.
+> **_NOTE:_** We found using both Anthropic Claude-3.7 and Gemini-2.0-Flash-Exp to be provide the best performance for VisionAgent. If you want to use a different LLM provider or only one, see 'Using Other LLM Providers' below.
 
 You will also need to set your VisionAgent API key to be able to authenticate when using the hosted vision tools that we provide through our APIs. Currently, the APIs are free to use so you will only need to get it from [here](https://va.landing.ai/account/api-key).
 
@@ -147,5 +147,33 @@ directory. For example to change to Anthropic simply just run:
 cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
 ```
 
+You can also modify the existing `config.py` file yourself to use a different LLM
+provider, for example if you wanted to change the planner from Anthropic inside
+`config.py` to OpenAI you would replace this code:
+```python
+planner: Type[LMM] = Field(default=AnthropicLMM)
+planner_kwargs: dict = Field(
+    default_factory=lambda: {
+        "model_name": "claude-3-7-sonnet-20250219",
+        "temperature": 0.0,
+        "image_size": 768,
+    }
+)
+```
+
+with this code:
+
+```python
+planner: Type[LMM] = Field(default=OpenAILMM)
+planner_kwargs: dict = Field(
+    default_factory=lambda: {
+        "model_name": "gpt-4o-2024-11-20",
+        "temperature": 0.0,
+        "image_size": 768,
+        "image_detail": "low",
+    }
+)
+```
+
 > **_NOTE:_** VisionAgent moves fast and we are constantly updating and changing the library. If you have any questions or need help, please reach out to us on our discord channel.
 