vision-agent 1.0.4__py3-none-any.whl → 1.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/.sim_tools/df.csv +46 -47
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/agent/__init__.py +0 -16
- vision_agent/agent/vision_agent_planner_prompts_v2.py +57 -58
- vision_agent/agent/vision_agent_planner_v2.py +3 -2
- vision_agent/configs/anthropic_config.py +29 -16
- vision_agent/configs/config.py +14 -15
- vision_agent/configs/openai_config.py +10 -10
- vision_agent/lmm/lmm.py +2 -2
- vision_agent/tools/__init__.py +0 -6
- vision_agent/tools/meta_tools.py +1 -492
- vision_agent/tools/planner_tools.py +13 -14
- vision_agent/tools/tools.py +16 -27
- {vision_agent-1.0.4.dist-info → vision_agent-1.0.7.dist-info}/METADATA +31 -3
- {vision_agent-1.0.4.dist-info → vision_agent-1.0.7.dist-info}/RECORD +17 -24
- vision_agent/agent/vision_agent.py +0 -605
- vision_agent/agent/vision_agent_coder.py +0 -742
- vision_agent/agent/vision_agent_coder_prompts.py +0 -290
- vision_agent/agent/vision_agent_planner.py +0 -564
- vision_agent/agent/vision_agent_planner_prompts.py +0 -199
- vision_agent/agent/vision_agent_prompts.py +0 -312
- vision_agent/configs/anthropic_openai_config.py +0 -164
- {vision_agent-1.0.4.dist-info → vision_agent-1.0.7.dist-info}/LICENSE +0 -0
- {vision_agent-1.0.4.dist-info → vision_agent-1.0.7.dist-info}/WHEEL +0 -0
@@ -2,7 +2,7 @@ from typing import Type
|
|
2
2
|
|
3
3
|
from pydantic import BaseModel, Field
|
4
4
|
|
5
|
-
from vision_agent.lmm import LMM, AnthropicLMM
|
5
|
+
from vision_agent.lmm import LMM, AnthropicLMM, OpenAILMM
|
6
6
|
|
7
7
|
|
8
8
|
class Config(BaseModel):
|
@@ -10,7 +10,7 @@ class Config(BaseModel):
|
|
10
10
|
agent: Type[LMM] = Field(default=AnthropicLMM)
|
11
11
|
agent_kwargs: dict = Field(
|
12
12
|
default_factory=lambda: {
|
13
|
-
"model_name": "claude-3-
|
13
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
14
14
|
"temperature": 0.0,
|
15
15
|
"image_size": 768,
|
16
16
|
}
|
@@ -20,18 +20,17 @@ class Config(BaseModel):
|
|
20
20
|
planner: Type[LMM] = Field(default=AnthropicLMM)
|
21
21
|
planner_kwargs: dict = Field(
|
22
22
|
default_factory=lambda: {
|
23
|
-
"model_name": "claude-3-
|
23
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
24
24
|
"temperature": 0.0,
|
25
25
|
"image_size": 768,
|
26
26
|
}
|
27
27
|
)
|
28
28
|
|
29
|
-
# for vision_agent_planner_v2
|
30
29
|
summarizer: Type[LMM] = Field(default=AnthropicLMM)
|
31
30
|
summarizer_kwargs: dict = Field(
|
32
31
|
default_factory=lambda: {
|
33
|
-
"model_name": "claude-3-
|
34
|
-
"temperature":
|
32
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
33
|
+
"temperature": 1.0, # o1 has fixed temperature
|
35
34
|
"image_size": 768,
|
36
35
|
}
|
37
36
|
)
|
@@ -40,7 +39,7 @@ class Config(BaseModel):
|
|
40
39
|
critic: Type[LMM] = Field(default=AnthropicLMM)
|
41
40
|
critic_kwargs: dict = Field(
|
42
41
|
default_factory=lambda: {
|
43
|
-
"model_name": "claude-3-
|
42
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
44
43
|
"temperature": 0.0,
|
45
44
|
"image_size": 768,
|
46
45
|
}
|
@@ -50,7 +49,7 @@ class Config(BaseModel):
|
|
50
49
|
coder: Type[LMM] = Field(default=AnthropicLMM)
|
51
50
|
coder_kwargs: dict = Field(
|
52
51
|
default_factory=lambda: {
|
53
|
-
"model_name": "claude-3-
|
52
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
54
53
|
"temperature": 0.0,
|
55
54
|
"image_size": 768,
|
56
55
|
}
|
@@ -60,7 +59,7 @@ class Config(BaseModel):
|
|
60
59
|
tester: Type[LMM] = Field(default=AnthropicLMM)
|
61
60
|
tester_kwargs: dict = Field(
|
62
61
|
default_factory=lambda: {
|
63
|
-
"model_name": "claude-3-
|
62
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
64
63
|
"temperature": 0.0,
|
65
64
|
"image_size": 768,
|
66
65
|
}
|
@@ -70,7 +69,7 @@ class Config(BaseModel):
|
|
70
69
|
debugger: Type[LMM] = Field(default=AnthropicLMM)
|
71
70
|
debugger_kwargs: dict = Field(
|
72
71
|
default_factory=lambda: {
|
73
|
-
"model_name": "claude-3-
|
72
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
74
73
|
"temperature": 0.0,
|
75
74
|
"image_size": 768,
|
76
75
|
}
|
@@ -80,7 +79,7 @@ class Config(BaseModel):
|
|
80
79
|
tool_tester: Type[LMM] = Field(default=AnthropicLMM)
|
81
80
|
tool_tester_kwargs: dict = Field(
|
82
81
|
default_factory=lambda: {
|
83
|
-
"model_name": "claude-3-
|
82
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
84
83
|
"temperature": 0.0,
|
85
84
|
"image_size": 768,
|
86
85
|
}
|
@@ -90,19 +89,30 @@ class Config(BaseModel):
|
|
90
89
|
tool_chooser: Type[LMM] = Field(default=AnthropicLMM)
|
91
90
|
tool_chooser_kwargs: dict = Field(
|
92
91
|
default_factory=lambda: {
|
93
|
-
"model_name": "claude-3-
|
92
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
94
93
|
"temperature": 1.0,
|
95
94
|
"image_size": 768,
|
96
95
|
}
|
97
96
|
)
|
98
97
|
|
98
|
+
# for get_tool_for_task
|
99
|
+
od_judge: Type[LMM] = Field(default=AnthropicLMM)
|
100
|
+
od_judge_kwargs: dict = Field(
|
101
|
+
default_factory=lambda: {
|
102
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
103
|
+
"temperature": 0.0,
|
104
|
+
"image_size": 512,
|
105
|
+
}
|
106
|
+
)
|
107
|
+
|
99
108
|
# for suggestions module
|
100
|
-
suggester: Type[LMM] = Field(default=
|
109
|
+
suggester: Type[LMM] = Field(default=OpenAILMM)
|
101
110
|
suggester_kwargs: dict = Field(
|
102
111
|
default_factory=lambda: {
|
103
|
-
"model_name": "
|
112
|
+
"model_name": "o1",
|
104
113
|
"temperature": 1.0,
|
105
|
-
"
|
114
|
+
"image_detail": "high",
|
115
|
+
"image_size": 1024,
|
106
116
|
}
|
107
117
|
)
|
108
118
|
|
@@ -110,7 +120,7 @@ class Config(BaseModel):
|
|
110
120
|
vqa: Type[LMM] = Field(default=AnthropicLMM)
|
111
121
|
vqa_kwargs: dict = Field(
|
112
122
|
default_factory=lambda: {
|
113
|
-
"model_name": "claude-3-
|
123
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
114
124
|
"temperature": 0.0,
|
115
125
|
"image_size": 768,
|
116
126
|
}
|
@@ -143,6 +153,9 @@ class Config(BaseModel):
|
|
143
153
|
def create_tool_chooser(self) -> LMM:
|
144
154
|
return self.tool_chooser(**self.tool_chooser_kwargs)
|
145
155
|
|
156
|
+
def create_od_judge(self) -> LMM:
|
157
|
+
return self.od_judge(**self.od_judge_kwargs)
|
158
|
+
|
146
159
|
def create_suggester(self) -> LMM:
|
147
160
|
return self.suggester(**self.suggester_kwargs)
|
148
161
|
|
vision_agent/configs/config.py
CHANGED
@@ -2,7 +2,7 @@ from typing import Type
|
|
2
2
|
|
3
3
|
from pydantic import BaseModel, Field
|
4
4
|
|
5
|
-
from vision_agent.lmm import LMM, AnthropicLMM, OpenAILMM
|
5
|
+
from vision_agent.lmm import LMM, AnthropicLMM, OpenAILMM, GoogleLMM
|
6
6
|
|
7
7
|
|
8
8
|
class Config(BaseModel):
|
@@ -10,7 +10,7 @@ class Config(BaseModel):
|
|
10
10
|
agent: Type[LMM] = Field(default=AnthropicLMM)
|
11
11
|
agent_kwargs: dict = Field(
|
12
12
|
default_factory=lambda: {
|
13
|
-
"model_name": "claude-3-
|
13
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
14
14
|
"temperature": 0.0,
|
15
15
|
"image_size": 768,
|
16
16
|
}
|
@@ -20,17 +20,16 @@ class Config(BaseModel):
|
|
20
20
|
planner: Type[LMM] = Field(default=AnthropicLMM)
|
21
21
|
planner_kwargs: dict = Field(
|
22
22
|
default_factory=lambda: {
|
23
|
-
"model_name": "claude-3-
|
23
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
24
24
|
"temperature": 0.0,
|
25
25
|
"image_size": 768,
|
26
26
|
}
|
27
27
|
)
|
28
28
|
|
29
|
-
|
30
|
-
summarizer: Type[LMM] = Field(default=OpenAILMM)
|
29
|
+
summarizer: Type[LMM] = Field(default=AnthropicLMM)
|
31
30
|
summarizer_kwargs: dict = Field(
|
32
31
|
default_factory=lambda: {
|
33
|
-
"model_name": "
|
32
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
34
33
|
"temperature": 1.0, # o1 has fixed temperature
|
35
34
|
"image_size": 768,
|
36
35
|
}
|
@@ -40,7 +39,7 @@ class Config(BaseModel):
|
|
40
39
|
critic: Type[LMM] = Field(default=AnthropicLMM)
|
41
40
|
critic_kwargs: dict = Field(
|
42
41
|
default_factory=lambda: {
|
43
|
-
"model_name": "claude-3-
|
42
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
44
43
|
"temperature": 0.0,
|
45
44
|
"image_size": 768,
|
46
45
|
}
|
@@ -50,7 +49,7 @@ class Config(BaseModel):
|
|
50
49
|
coder: Type[LMM] = Field(default=AnthropicLMM)
|
51
50
|
coder_kwargs: dict = Field(
|
52
51
|
default_factory=lambda: {
|
53
|
-
"model_name": "claude-3-
|
52
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
54
53
|
"temperature": 0.0,
|
55
54
|
"image_size": 768,
|
56
55
|
}
|
@@ -60,7 +59,7 @@ class Config(BaseModel):
|
|
60
59
|
tester: Type[LMM] = Field(default=AnthropicLMM)
|
61
60
|
tester_kwargs: dict = Field(
|
62
61
|
default_factory=lambda: {
|
63
|
-
"model_name": "claude-3-
|
62
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
64
63
|
"temperature": 0.0,
|
65
64
|
"image_size": 768,
|
66
65
|
}
|
@@ -70,7 +69,7 @@ class Config(BaseModel):
|
|
70
69
|
debugger: Type[LMM] = Field(default=AnthropicLMM)
|
71
70
|
debugger_kwargs: dict = Field(
|
72
71
|
default_factory=lambda: {
|
73
|
-
"model_name": "claude-3-
|
72
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
74
73
|
"temperature": 0.0,
|
75
74
|
"image_size": 768,
|
76
75
|
}
|
@@ -80,7 +79,7 @@ class Config(BaseModel):
|
|
80
79
|
tool_tester: Type[LMM] = Field(default=AnthropicLMM)
|
81
80
|
tool_tester_kwargs: dict = Field(
|
82
81
|
default_factory=lambda: {
|
83
|
-
"model_name": "claude-3-
|
82
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
84
83
|
"temperature": 0.0,
|
85
84
|
"image_size": 768,
|
86
85
|
}
|
@@ -90,7 +89,7 @@ class Config(BaseModel):
|
|
90
89
|
tool_chooser: Type[LMM] = Field(default=AnthropicLMM)
|
91
90
|
tool_chooser_kwargs: dict = Field(
|
92
91
|
default_factory=lambda: {
|
93
|
-
"model_name": "claude-3-
|
92
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
94
93
|
"temperature": 1.0,
|
95
94
|
"image_size": 768,
|
96
95
|
}
|
@@ -100,7 +99,7 @@ class Config(BaseModel):
|
|
100
99
|
od_judge: Type[LMM] = Field(default=AnthropicLMM)
|
101
100
|
od_judge_kwargs: dict = Field(
|
102
101
|
default_factory=lambda: {
|
103
|
-
"model_name": "claude-3-
|
102
|
+
"model_name": "claude-3-7-sonnet-20250219",
|
104
103
|
"temperature": 0.0,
|
105
104
|
"image_size": 512,
|
106
105
|
}
|
@@ -118,10 +117,10 @@ class Config(BaseModel):
|
|
118
117
|
)
|
119
118
|
|
120
119
|
# for vqa module
|
121
|
-
vqa: Type[LMM] = Field(default=
|
120
|
+
vqa: Type[LMM] = Field(default=GoogleLMM)
|
122
121
|
vqa_kwargs: dict = Field(
|
123
122
|
default_factory=lambda: {
|
124
|
-
"model_name": "
|
123
|
+
"model_name": "gemini-2.0-flash-exp",
|
125
124
|
"temperature": 0.0,
|
126
125
|
"image_size": 768,
|
127
126
|
}
|
@@ -10,7 +10,7 @@ class Config(BaseModel):
|
|
10
10
|
agent: Type[LMM] = Field(default=OpenAILMM)
|
11
11
|
agent_kwargs: dict = Field(
|
12
12
|
default_factory=lambda: {
|
13
|
-
"model_name": "gpt-4o-2024-
|
13
|
+
"model_name": "gpt-4o-2024-11-20",
|
14
14
|
"temperature": 0.0,
|
15
15
|
"image_size": 768,
|
16
16
|
"image_detail": "low",
|
@@ -21,7 +21,7 @@ class Config(BaseModel):
|
|
21
21
|
planner: Type[LMM] = Field(default=OpenAILMM)
|
22
22
|
planner_kwargs: dict = Field(
|
23
23
|
default_factory=lambda: {
|
24
|
-
"model_name": "gpt-4o-2024-
|
24
|
+
"model_name": "gpt-4o-2024-11-20",
|
25
25
|
"temperature": 0.0,
|
26
26
|
"image_size": 768,
|
27
27
|
"image_detail": "low",
|
@@ -42,7 +42,7 @@ class Config(BaseModel):
|
|
42
42
|
critic: Type[LMM] = Field(default=OpenAILMM)
|
43
43
|
critic_kwargs: dict = Field(
|
44
44
|
default_factory=lambda: {
|
45
|
-
"model_name": "gpt-4o-2024-
|
45
|
+
"model_name": "gpt-4o-2024-11-20",
|
46
46
|
"temperature": 0.0,
|
47
47
|
"image_size": 768,
|
48
48
|
"image_detail": "low",
|
@@ -53,7 +53,7 @@ class Config(BaseModel):
|
|
53
53
|
coder: Type[LMM] = Field(default=OpenAILMM)
|
54
54
|
coder_kwargs: dict = Field(
|
55
55
|
default_factory=lambda: {
|
56
|
-
"model_name": "gpt-4o-2024-
|
56
|
+
"model_name": "gpt-4o-2024-11-20",
|
57
57
|
"temperature": 0.0,
|
58
58
|
"image_size": 768,
|
59
59
|
"image_detail": "low",
|
@@ -64,7 +64,7 @@ class Config(BaseModel):
|
|
64
64
|
tester: Type[LMM] = Field(default=OpenAILMM)
|
65
65
|
tester_kwargs: dict = Field(
|
66
66
|
default_factory=lambda: {
|
67
|
-
"model_name": "gpt-4o-2024-
|
67
|
+
"model_name": "gpt-4o-2024-11-20",
|
68
68
|
"temperature": 0.0,
|
69
69
|
"image_size": 768,
|
70
70
|
"image_detail": "low",
|
@@ -75,7 +75,7 @@ class Config(BaseModel):
|
|
75
75
|
debugger: Type[LMM] = Field(default=OpenAILMM)
|
76
76
|
debugger_kwargs: dict = Field(
|
77
77
|
default_factory=lambda: {
|
78
|
-
"model_name": "gpt-4o-2024-
|
78
|
+
"model_name": "gpt-4o-2024-11-20",
|
79
79
|
"temperature": 0.0,
|
80
80
|
"image_size": 768,
|
81
81
|
"image_detail": "low",
|
@@ -86,7 +86,7 @@ class Config(BaseModel):
|
|
86
86
|
tool_tester: Type[LMM] = Field(default=OpenAILMM)
|
87
87
|
tool_tester_kwargs: dict = Field(
|
88
88
|
default_factory=lambda: {
|
89
|
-
"model_name": "gpt-4o-2024-
|
89
|
+
"model_name": "gpt-4o-2024-11-20",
|
90
90
|
"temperature": 0.0,
|
91
91
|
"image_size": 768,
|
92
92
|
"image_detail": "low",
|
@@ -97,7 +97,7 @@ class Config(BaseModel):
|
|
97
97
|
tool_chooser: Type[LMM] = Field(default=OpenAILMM)
|
98
98
|
tool_chooser_kwargs: dict = Field(
|
99
99
|
default_factory=lambda: {
|
100
|
-
"model_name": "gpt-4o-2024-
|
100
|
+
"model_name": "gpt-4o-2024-11-20",
|
101
101
|
"temperature": 1.0,
|
102
102
|
"image_size": 768,
|
103
103
|
"image_detail": "low",
|
@@ -108,7 +108,7 @@ class Config(BaseModel):
|
|
108
108
|
suggester: Type[LMM] = Field(default=OpenAILMM)
|
109
109
|
suggester_kwargs: dict = Field(
|
110
110
|
default_factory=lambda: {
|
111
|
-
"model_name": "gpt-4o-2024-
|
111
|
+
"model_name": "gpt-4o-2024-11-20",
|
112
112
|
"temperature": 1.0,
|
113
113
|
"image_size": 768,
|
114
114
|
"image_detail": "low",
|
@@ -119,7 +119,7 @@ class Config(BaseModel):
|
|
119
119
|
vqa: Type[LMM] = Field(default=OpenAILMM)
|
120
120
|
vqa_kwargs: dict = Field(
|
121
121
|
default_factory=lambda: {
|
122
|
-
"model_name": "gpt-4o-2024-
|
122
|
+
"model_name": "gpt-4o-2024-11-20",
|
123
123
|
"temperature": 0.0,
|
124
124
|
"image_size": 768,
|
125
125
|
"image_detail": "low",
|
vision_agent/lmm/lmm.py
CHANGED
@@ -98,7 +98,7 @@ class OpenAILMM(LMM):
|
|
98
98
|
for c in chat:
|
99
99
|
fixed_c = {"role": c["role"]}
|
100
100
|
fixed_c["content"] = [{"type": "text", "text": c["content"]}] # type: ignore
|
101
|
-
if "media" in c:
|
101
|
+
if "media" in c and self.model_name != "o3-mini":
|
102
102
|
for media in c["media"]:
|
103
103
|
resize = kwargs["resize"] if "resize" in kwargs else self.image_size
|
104
104
|
image_detail = (
|
@@ -154,7 +154,7 @@ class OpenAILMM(LMM):
|
|
154
154
|
],
|
155
155
|
}
|
156
156
|
]
|
157
|
-
if media and len(media) > 0:
|
157
|
+
if media and len(media) > 0 and self.model_name != "o3-mini":
|
158
158
|
for m in media:
|
159
159
|
resize = kwargs["resize"] if "resize" in kwargs else None
|
160
160
|
image_detail = (
|
vision_agent/tools/__init__.py
CHANGED
@@ -1,13 +1,7 @@
|
|
1
1
|
from typing import Callable, List, Optional
|
2
2
|
|
3
3
|
from .meta_tools import (
|
4
|
-
create_code_artifact,
|
5
|
-
edit_code_artifact,
|
6
|
-
edit_vision_code,
|
7
|
-
generate_vision_code,
|
8
4
|
get_tool_descriptions,
|
9
|
-
list_artifacts,
|
10
|
-
open_code_artifact,
|
11
5
|
view_media_artifact,
|
12
6
|
)
|
13
7
|
from .planner_tools import judge_od_results
|