vision-agent 1.0.4__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,199 +0,0 @@
1
- USER_REQ = """
2
- ## User Request
3
- {user_request}
4
- """
5
-
6
- PLAN = """
7
- **Context**:
8
- {context}
9
-
10
- **Tools Available**:
11
- {tool_desc}
12
-
13
- **Previous Feedback**:
14
- {feedback}
15
-
16
- **Instructions**:
17
- 1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
18
- 2. For each subtask, be sure to include the tool(s) you want to use to accomplish that subtask.
19
- 3. Output three different plans, each utilizing a different strategy or set of tools, ordered from most likely to least likely to succeed.
20
-
21
- Output a list of jsons in the following format:
22
-
23
- ```json
24
- {{
25
- "plan1":
26
- {{
27
- "thoughts": str # your thought process for choosing this plan
28
- "instructions": [
29
- str # what you should do in this task associated with a tool
30
- ]
31
- }},
32
- "plan2": ...,
33
- "plan3": ...
34
- }}
35
- ```
36
- """
37
-
38
- TEST_PLANS = """
39
- **Role**: You are a software programmer responsible for testing different tools.
40
-
41
- **Task**: Your responsibility is to take a set of several plans and test the different tools for each plan.
42
-
43
- **Documentation**:
44
- This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
45
-
46
- {docstring}
47
-
48
- **Plans**:
49
- {plans}
50
-
51
- **Previous Attempts**:
52
- {previous_attempts}
53
-
54
- **Examples**:
55
- --- EXAMPLE1 ---
56
- plan1:
57
- - Load the image from the provided file path 'image.jpg'.
58
- - Use the 'owlv2_object_detection' tool with the prompt 'person' to detect and count the number of people in the image.
59
- plan2:
60
- - Load the image from the provided file path 'image.jpg'.
61
- - Use the 'florence2_sam2_instance_segmentation' tool with the prompt 'person' to detect and count the number of people in the image.
62
- - Count the number of detected objects labeled as 'person'.
63
- plan3:
64
- - Load the image from the provided file path 'image.jpg'.
65
- - Use the 'countgd_object_detection' tool to count the dominant foreground object, which in this case is people.
66
-
67
- ```python
68
- from vision_agent.tools import load_image, owlv2_object_detection, florence2_sam2_instance_segmentation, countgd_object_detection
69
- image = load_image("image.jpg")
70
- owl_v2_out = owlv2_object_detection("person", image)
71
-
72
- f2s2_out = florence2_sam2_instance_segmentation("person", image)
73
- # strip out the masks from the output because they don't provide useful information when printed
74
- f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
75
-
76
- cgd_out = countgd_object_detection("person", image)
77
-
78
- final_out = {{"owlv2_object_detection": owl_v2_out, "florence2_sam2_instance_segmentation": f2s2_out, "countgd_object_detection": cgd_out}}
79
- print(final_out)
80
- --- END EXAMPLE1 ---
81
-
82
- --- EXAMPLE2 ---
83
- plan1:
84
- - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
85
- - Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
86
- plan2:
87
- - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
88
- - Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
89
- plan3:
90
- - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
91
- - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
92
-
93
-
94
- ```python
95
- import numpy as np
96
- from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
97
-
98
- # sample at 1 FPS and use the first 10 frames to reduce processing time
99
- frames = extract_frames_and_timestamps("video.mp4", 1)
100
- frames = [f["frame"] for f in frames][:10]
101
-
102
- # strip arrays from the output to make it easier to read
103
- def remove_arrays(o):
104
- if isinstance(o, list):
105
- return [remove_arrays(e) for e in o]
106
- elif isinstance(o, dict):
107
- return {{k: remove_arrays(v) for k, v in o.items()}}
108
- elif isinstance(o, np.ndarray):
109
- return "array: " + str(o.shape)
110
- else:
111
- return o
112
-
113
- # return the counts of each label per frame to help determine the stability of the model results
114
- def get_counts(preds):
115
- counts = {{}}
116
- for i, pred_frame in enumerate(preds):
117
- counts_i = {{}}
118
- for pred in pred_frame:
119
- label = pred["label"].split(":")[1] if ":" in pred["label"] else pred["label"]
120
- counts_i[label] = counts_i.get(label, 0) + 1
121
- counts[f"frame_{{i}}"] = counts_i
122
- return counts
123
-
124
-
125
- # plan1
126
- owl_v2_out = owl_v2_video("person", frames)
127
- owl_v2_counts = get_counts(owl_v2_out)
128
-
129
- # plan2
130
- florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
131
- florence2_counts = get_counts(florence2_out)
132
-
133
- # plan3
134
- f2s2_tracking_out = florence2_sam2_video_tracking("person", frames)
135
- remove_arrays(f2s2_tracking_out)
136
- f2s2_counts = get_counts(f2s2_tracking_out)
137
-
138
- final_out = {{
139
- "owl_v2_video": owl_v2_out,
140
- "florence2_phrase_grounding": florence2_out,
141
- "florence2_sam2_video_tracking": f2s2_tracking_out,
142
- }}
143
-
144
- counts = {{
145
- "owl_v2_video": owl_v2_counts,
146
- "florence2_phrase_grounding": florence2_counts,
147
- "florence2_sam2_video_tracking": f2s2_counts,
148
- }}
149
-
150
- print(final_out)
151
- print(labels_and_scores)
152
- print(counts)
153
- ```
154
- --- END EXAMPLE2 ---
155
-
156
- **Instructions**:
157
- 1. Write a program to load the media and call each tool and print its output along with other relevant information.
158
- 2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
159
- 3. Your test case MUST run only on the given images which are {media}
160
- 4. Print this final dictionary.
161
- 5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
162
- """
163
-
164
- PREVIOUS_FAILED = """
165
- **Previous Failed Attempts**:
166
- You previously ran this code:
167
- ```python
168
- {code}
169
- ```
170
-
171
- But got the following error or no stdout:
172
- {error}
173
- """
174
-
175
- PICK_PLAN = """
176
- **Role**: You are an advanced AI model that can understand the user request and construct plans to accomplish it.
177
-
178
- **Task**: Your responsibility is to pick the best plan from the three plans provided.
179
-
180
- **Context**:
181
- {context}
182
-
183
- **Plans**:
184
- {plans}
185
-
186
- **Tool Output**:
187
- {tool_output}
188
-
189
- **Instructions**:
190
- 1. Re-read the user request, plans, tool outputs and examine the image.
191
- 2. Solve the problem yourself given the image and pick the most accurate plan that matches your solution the best.
192
- 3. Add modifications to improve the plan including: changing a tool, adding thresholds, string matching.
193
- 4. Output a JSON object with the following format:
194
- {{
195
- "predicted_answer": str # the answer you would expect from the best plan
196
- "thoughts": str # your thought process for choosing the best plan over other plans and any modifications you made
197
- "best_plan": str # the best plan you have chosen, must be `plan1`, `plan2`, or `plan3`
198
- }}
199
- """
@@ -1,312 +0,0 @@
1
- VA_CODE = """
2
- **Role**: You are a helpful agent that assists users with writing code.
3
-
4
- **Task**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be saved only AFTER you execute python code with `save_image` or `save_video`. The user can see all `artifacts`.
5
-
6
- <execute_python>
7
- print("Hello World!")
8
- </execute_python>
9
-
10
- **Documentation**:
11
- This is the documentation for the different actions you can take:
12
-
13
- {documentation}
14
-
15
- **Examples**:
16
- Here is an example of how you can interact with a user and Actions to complete a task:
17
- --- START EXAMPLES ---
18
- {examples}
19
- --- END EXAMPLES ---
20
-
21
- **Conversation**:
22
- Here is the current conversation so far:
23
- --- START CONVERSATION ---
24
- {conversation}
25
- --- END CONVERSATION ---
26
-
27
- **Instructions**:
28
- 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
29
- 2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`. DO NOT run `edit_vision_code` or `edit_code_artifact` more than 2 times in a row and instead ask the user for help.
30
- 3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set <let_user_respond> to `true`.
31
- 4. **Artifacts**: Files are only saved in `artifacts` after <execute_python>, do not try to access artifacts until you observe that they are loaded.
32
- 5. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
33
- 6. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.
34
-
35
- <thinking>Your thoughts here...</thinking>
36
- <response>Your response to the user here...</response>
37
- <execute_python>Your code here...</execute_python>
38
- <let_user_respond>true/false for whether or not you want to let the user respond.</let_user_respond>
39
- """
40
-
41
-
42
- EXAMPLES_CODE1 = """
43
- USER: Can you write code to detect the dogs in this image? Media name dog.jpg
44
-
45
- OBSERVATION:
46
- [Artifacts loaded]
47
- Artifact dog.jpg loaded to /path/to/images/dog.jpg
48
- [End of artifacts]
49
-
50
- AGENT: <thinking>I will use the generate_vision_code to detect the dogs in the image.</thinking>
51
- <execute_python>
52
- generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])
53
- </execute_python>
54
- <let_user_respond>false</let_user_respond>
55
-
56
- OBSERVATION:
57
- [Artifact dog_detector.py (5 lines total)]
58
- 0|from vision_agent.tools import load_image, owlv2_object_detection
59
- 1|def detect_dogs(image_path: str):
60
- 2| image = load_image(image_path)
61
- 3| dogs = owlv2_object_detection("dog", image)
62
- 4| return dogs
63
- [End of artifact]
64
-
65
- AGENT: <thinking>I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.</thinking>
66
- <execute_python>
67
- from dog_detector import detect_dogs
68
- print(detect_dogs('/path/to/images/dog.jpg'))
69
- </execute_python>
70
- <let_user_respond>false</let_user_respond>
71
-
72
- OBSERVATION:
73
- ----- stdout -----
74
- [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]
75
-
76
-
77
- AGENT: <thinking>Two dogs are detected, I will show this to the user and ask them if the result looks good.</thinking>
78
- <response>The code detected two dogs, do the results look good to you?</response>
79
- <let_user_respond>true</let_user_respond>
80
- """
81
-
82
- EXAMPLES_CODE1_EXTRA = """
83
- USER: The image only has one dog, can you fix this?
84
-
85
- OBSERVATION:
86
- [Artifacts loaded]
87
- Artifact dog.jpg loaded to /path/to/images/dog.jpg
88
- Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
89
- [End of artifacts]
90
-
91
- AGENT: <thinking>I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that</thinking>
92
- <execute_python>
93
- edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])
94
- </execute_python>
95
- <let_user_respond>false</let_user_respond>
96
-
97
- OBSERVATION:
98
- [Artifact dog_detector.py (5 lines total)]
99
- 0|from vision_agent.tools import load_image, owlv2_object_detection
100
- 1|def detect_dogs(image_path: str):
101
- 2| image = load_image(image_path)
102
- 3| dogs = owlv2_object_detection("dog", image, threshold=0.24)
103
- 4| return dogs
104
- [End of artifact]
105
-
106
- AGENT: <thinking>I have edited the code to detect only one dog, I must now run the code and print the results to get the output.</thinking>
107
- <execute_python>
108
- from dog_detector import detect_dogs
109
- print(detect_dogs('/path/to/images/dog.jpg'))
110
- </execute_python>
111
- <let_user_respond>false</let_user_respond>
112
-
113
- OBSERVATION:
114
- ----- stdout -----
115
- [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]
116
-
117
- AGENT: <thinking>One dog is detected, I will show this to the user and ask them if the result looks good.</thinking>
118
- <response>The code detected one dog, do these results look good to you?</response>
119
- <let_user_respond>true</let_user_respond>
120
- """
121
-
122
- EXAMPLES_CODE2 = """
123
- USER: Can you describe this image?
124
-
125
- OBSERVATION:
126
- [Artifacts loaded]
127
- Artifact image.jpg loaded to /path/to/images/image.jpg
128
- [End of artifacts]
129
-
130
- AGENT: <thinking>The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.</thinking>
131
- <execute_python>
132
- view_media_artifacts('image.jpg')
133
- </execute_python>
134
- <let_user_respond>false</let_user_respond>
135
-
136
- OBSERVATION:
137
- [Image image.jpg displayed]
138
-
139
- AGENT: <thinking>The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.</thinking>
140
- <response>The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?</response>
141
- <let_user_respond>true</let_user_respond>
142
- """
143
-
144
-
145
- EXAMPLES_CODE3 = """
146
- USER: Can you create a function to count workers with helmets, return the count and a visualization of the bounding boxes?
147
-
148
- OBSERVATION:
149
- [Artifacts loaded]
150
- [End of artifacts]
151
-
152
- AGENT: <thinking>The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.</thinking>
153
- <response>Can you provide an image of workers with helmets?</response>
154
- <let_user_respond>true</let_user_respond>
155
-
156
- USER: Yes you can use workers.png
157
-
158
- OBSERVATION:
159
- [Artifacts loaded]
160
- Artifact workers.png loaded to /path/to/images/workers.png
161
- [End of artifacts]
162
-
163
- AGENT: <thinking>I will use the generate_vision_code to count the workers with helmets in the image.</thinking>
164
- <execute_python>
165
- generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visualization of the bounding boxes?', media=['/path/to/images/workers.png'])
166
- </execute_python>
167
- <let_user_respond>false</let_user_respond>
168
-
169
- OBSERVATION:
170
- [Artifact code.py (17 lines total)]
171
- 0|from vision_agent.tools import load_image, florence2_object_detection, closest_box_distance, overlay_bounding_boxes, save_image
172
- 1|def count_workers_with_helmets(image_path: str):
173
- 2| image = load_image(image_path)
174
- 3| detections = florence2_object_detection("worker, helmet", image)
175
- 4| workers = [d for d in detections if d['label'] == 'worker']
176
- 5| helmets = [d for d in detections if d['label'] == 'helmet']
177
- 6| count = 0
178
- 7| for worker in workers:
179
- 8| person_has_helmet = False
180
- 9| for helmet in helmets:
181
- 10| if closest_box_distance(worker['box'], helmet['box']) < 0.01:
182
- 11| person_has_helmet = True
183
- 12| break
184
- 13| if person_has_helmet:
185
- 14| count += 1
186
- 15| viz = overlay_bounding_boxes(image, detections)
187
- 16| return count, viz
188
- [End of artifact]
189
-
190
- AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code, print the counts and save the visualization.</thinking>
191
- <execute_python>
192
- from code import count_workers_with_helmets
193
- count, viz = count_workers_with_helmets('/path/to/images/workers.png')
194
- save_image(viz, 'workers_viz.png')
195
- print(count)
196
- </execute_python>
197
- <let_user_respond>false</let_user_respond>
198
-
199
- OBSERVATION:
200
- ----- stdout -----
201
- 2
202
-
203
- AGENT: <thinking>Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
204
- <response>The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.</response>
205
- <let_user_respond>true</let_user_respond>
206
- """
207
-
208
- EXAMPLES_CODE3_EXTRA1 = """
209
- USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
210
-
211
- AGENT: <thinking>Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model</thinking>
212
- <execute_python>
213
- object_detection_fine_tuning([{'image_path': 'image1.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])
214
- </execute_python>
215
- <let_user_respond>false</let_user_respond>
216
-
217
- OBSERVATION:
218
- [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
219
-
220
- AGENT: <thinking>The model has finished fine tuning, I will now replace the original florence2_object_detection call with the fine tuning id.</thinking>
221
- <execute_python>
222
- use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
223
- </execute_python>
224
- <let_user_respond>false</let_user_respond>
225
-
226
- OBSERVATION:
227
- [Artifact code.py edits]
228
- ---
229
- +++
230
- @@ -1,7 +1,7 @@
231
- from vision_agent.tools import load_image, florence2_object_detection, closest_box_distance, overlay_bounding_boxes, save_image
232
- def count_workers_with_helmets(image_path: str, output_path: str):
233
- image = load_image(image_path)
234
- - detections = florence2_object_detection("worker, helmet", image)
235
- + detections = florence2_object_detection("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
236
- workers = [d for d in detections if d['label'] == 'worker']
237
- helmets = [d for d in detections if d['label'] == 'helmet']
238
- count = 0
239
- [End of edits]
240
-
241
- AGENT: <thinking>I have updated the code to use the fine tuned model, I must now run the code and print the output.</thinking>
242
- <execute_python>
243
- from code import count_workers_with_helmets
244
- print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
245
- </execute_python>
246
- <let_user_respond>false</let_user_respond>
247
-
248
- OBSERVATION:
249
- ----- stdout -----
250
- 3
251
-
252
- AGENT: <thinking>Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
253
- <response>I have updated the code using the fine tuned florence2_object_detection model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
254
- <let_user_respond>true</let_user_respond>
255
- """
256
-
257
- EXAMPLES_CODE3_EXTRA2 = """
258
- USER: Can you write a streamlit app that allows me to upload an image and count the workers with helmets?
259
-
260
- AGENT: <thinking>I will use edit_code_artifact to generate a streamlit app that allows the user to upload an image and count the workers with helmets.</thinking>
261
- <execute_python>
262
- CODE = '''
263
- import streamlit as st
264
- from code import count_workers_with_helmets
265
- import os
266
-
267
- def main():
268
- st.title("Worker with Helmet Counter")
269
- uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
270
- if uploaded_file is not None:
271
- with open("temp_image.jpg", "wb") as f:
272
- f.write(uploaded_file.read())
273
-
274
- count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
275
- st.image("temp_image_viz.jpg")
276
- st.write(f"Number of workers with helmets: {count}")
277
- os.remove("temp_image.jpg")
278
- os.remove("temp_image_viz.jpg")
279
-
280
- if __name__ == "__main__":
281
- main()
282
- '''
283
- edit_code_artifact(artifacts, 'streamlit_app.py', 0, 0, CODE)
284
- </execute_python>
285
-
286
- OBSERVATION:
287
- [Artifact streamlit_app.py (19 lines total)]
288
- 0|import streamlit as st
289
- 1|from code import count_workers_with_helmets
290
- 2|import os
291
- 3|
292
- 4|def main():
293
- 5| st.title("Worker with Helmet Counter")
294
- 6| uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
295
- 7| if uploaded_file is not None:
296
- 8| with open("temp_image.jpg", "wb") as f:
297
- 9| f.write(uploaded_file.read())
298
- 10|
299
- 11| count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
300
- 12| st.image("temp_image_viz.jpg")
301
- 13| st.write(f"Number of workers with helmets: {count}")
302
- 14| os.remove("temp_image.jpg")
303
- 15| os.remove("temp_image_viz.jpg")
304
- 16|
305
- 17|if __name__ == "__main__":
306
- 18| main()
307
- [End of artifact]
308
-
309
- AGENT: <thinking>I have generated the streamlit app that allows the user to upload an image and count the workers</thinking>
310
- <response>The streamlit app is saved in streamlit_app.py, you can run the app by running `streamlit run streamlit_app.py`.</response>
311
- <let_user_respond>true</let_user_respond>
312
- """
@@ -1,164 +0,0 @@
1
- from typing import Type
2
-
3
- from pydantic import BaseModel, Field
4
-
5
- from vision_agent.lmm import LMM, AnthropicLMM, OpenAILMM
6
-
7
-
8
- class Config(BaseModel):
9
- # for vision_agent_v2
10
- agent: Type[LMM] = Field(default=AnthropicLMM)
11
- agent_kwargs: dict = Field(
12
- default_factory=lambda: {
13
- "model_name": "claude-3-5-sonnet-20241022",
14
- "temperature": 0.0,
15
- "image_size": 768,
16
- }
17
- )
18
-
19
- # for vision_agent_planner_v2
20
- planner: Type[LMM] = Field(default=AnthropicLMM)
21
- planner_kwargs: dict = Field(
22
- default_factory=lambda: {
23
- "model_name": "claude-3-5-sonnet-20241022",
24
- "temperature": 0.0,
25
- "image_size": 768,
26
- }
27
- )
28
-
29
- # for vision_agent_planner_v2
30
- summarizer: Type[LMM] = Field(default=OpenAILMM)
31
- summarizer_kwargs: dict = Field(
32
- default_factory=lambda: {
33
- "model_name": "o1",
34
- "temperature": 1.0, # o1 has fixed temperature
35
- "image_size": 768,
36
- }
37
- )
38
-
39
- # for vision_agent_planner_v2
40
- critic: Type[LMM] = Field(default=AnthropicLMM)
41
- critic_kwargs: dict = Field(
42
- default_factory=lambda: {
43
- "model_name": "claude-3-5-sonnet-20241022",
44
- "temperature": 0.0,
45
- "image_size": 768,
46
- }
47
- )
48
-
49
- # for vision_agent_coder_v2
50
- coder: Type[LMM] = Field(default=AnthropicLMM)
51
- coder_kwargs: dict = Field(
52
- default_factory=lambda: {
53
- "model_name": "claude-3-5-sonnet-20241022",
54
- "temperature": 0.0,
55
- "image_size": 768,
56
- }
57
- )
58
-
59
- # for vision_agent_coder_v2
60
- tester: Type[LMM] = Field(default=AnthropicLMM)
61
- tester_kwargs: dict = Field(
62
- default_factory=lambda: {
63
- "model_name": "claude-3-5-sonnet-20241022",
64
- "temperature": 0.0,
65
- "image_size": 768,
66
- }
67
- )
68
-
69
- # for vision_agent_coder_v2
70
- debugger: Type[LMM] = Field(default=AnthropicLMM)
71
- debugger_kwargs: dict = Field(
72
- default_factory=lambda: {
73
- "model_name": "claude-3-5-sonnet-20241022",
74
- "temperature": 0.0,
75
- "image_size": 768,
76
- }
77
- )
78
-
79
- # for get_tool_for_task
80
- tool_tester: Type[LMM] = Field(default=AnthropicLMM)
81
- tool_tester_kwargs: dict = Field(
82
- default_factory=lambda: {
83
- "model_name": "claude-3-5-sonnet-20241022",
84
- "temperature": 0.0,
85
- "image_size": 768,
86
- }
87
- )
88
-
89
- # for get_tool_for_task
90
- tool_chooser: Type[LMM] = Field(default=AnthropicLMM)
91
- tool_chooser_kwargs: dict = Field(
92
- default_factory=lambda: {
93
- "model_name": "claude-3-5-sonnet-20241022",
94
- "temperature": 1.0,
95
- "image_size": 768,
96
- }
97
- )
98
-
99
- # for get_tool_for_task
100
- od_judge: Type[LMM] = Field(default=AnthropicLMM)
101
- od_judge_kwargs: dict = Field(
102
- default_factory=lambda: {
103
- "model_name": "claude-3-5-sonnet-20241022",
104
- "temperature": 0.0,
105
- "image_size": 512,
106
- }
107
- )
108
-
109
- # for suggestions module
110
- suggester: Type[LMM] = Field(default=OpenAILMM)
111
- suggester_kwargs: dict = Field(
112
- default_factory=lambda: {
113
- "model_name": "o1",
114
- "temperature": 1.0,
115
- "image_detail": "high",
116
- "image_size": 1024,
117
- }
118
- )
119
-
120
- # for vqa module
121
- vqa: Type[LMM] = Field(default=AnthropicLMM)
122
- vqa_kwargs: dict = Field(
123
- default_factory=lambda: {
124
- "model_name": "claude-3-5-sonnet-20241022",
125
- "temperature": 0.0,
126
- "image_size": 768,
127
- }
128
- )
129
-
130
- def create_agent(self) -> LMM:
131
- return self.agent(**self.agent_kwargs)
132
-
133
- def create_planner(self) -> LMM:
134
- return self.planner(**self.planner_kwargs)
135
-
136
- def create_summarizer(self) -> LMM:
137
- return self.summarizer(**self.summarizer_kwargs)
138
-
139
- def create_critic(self) -> LMM:
140
- return self.critic(**self.critic_kwargs)
141
-
142
- def create_coder(self) -> LMM:
143
- return self.coder(**self.coder_kwargs)
144
-
145
- def create_tester(self) -> LMM:
146
- return self.tester(**self.tester_kwargs)
147
-
148
- def create_debugger(self) -> LMM:
149
- return self.debugger(**self.debugger_kwargs)
150
-
151
- def create_tool_tester(self) -> LMM:
152
- return self.tool_tester(**self.tool_tester_kwargs)
153
-
154
- def create_tool_chooser(self) -> LMM:
155
- return self.tool_chooser(**self.tool_chooser_kwargs)
156
-
157
- def create_od_judge(self) -> LMM:
158
- return self.od_judge(**self.od_judge_kwargs)
159
-
160
- def create_suggester(self) -> LMM:
161
- return self.suggester(**self.suggester_kwargs)
162
-
163
- def create_vqa(self) -> LMM:
164
- return self.vqa(**self.vqa_kwargs)