vision-agent 0.2.139__py3-none-any.whl → 0.2.141__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- vision_agent/agent/__init__.py +2 -1
- vision_agent/agent/agent_utils.py +8 -2
- vision_agent/agent/vision_agent.py +97 -17
- vision_agent/agent/vision_agent_coder.py +93 -66
- vision_agent/agent/vision_agent_coder_prompts.py +53 -19
- vision_agent/agent/vision_agent_prompts.py +31 -9
- vision_agent/lmm/__init__.py +1 -1
- vision_agent/lmm/lmm.py +6 -9
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/meta_tools.py +64 -32
- vision_agent/tools/tool_utils.py +12 -6
- vision_agent/tools/tools.py +115 -30
- vision_agent/tools/tools_types.py +1 -0
- vision_agent/utils/image_utils.py +18 -7
- vision_agent/utils/video.py +2 -1
- {vision_agent-0.2.139.dist-info → vision_agent-0.2.141.dist-info}/METADATA +60 -12
- vision_agent-0.2.141.dist-info/RECORD +33 -0
- vision_agent-0.2.139.dist-info/RECORD +0 -33
- {vision_agent-0.2.139.dist-info → vision_agent-0.2.141.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.139.dist-info → vision_agent-0.2.141.dist-info}/WHEEL +0 -0
@@ -30,9 +30,10 @@ PLAN = """
|
|
30
30
|
|
31
31
|
**Instructions**:
|
32
32
|
1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
|
33
|
-
2.
|
33
|
+
2. For each subtask, be sure to include the tool(s) you want to use to accomplish that subtask.
|
34
|
+
3. Output three different plans, each utilizing a different strategy or set of tools, ordered from most likely to least likely to succeed.
|
34
35
|
|
35
|
-
Output a list of jsons in the following format
|
36
|
+
Output a list of jsons in the following format:
|
36
37
|
|
37
38
|
```json
|
38
39
|
{{
|
@@ -67,7 +68,7 @@ This is the documentation for the functions you have access to. You may call any
|
|
67
68
|
{previous_attempts}
|
68
69
|
|
69
70
|
**Instructions**:
|
70
|
-
1. Write a program to load the media and call each tool and
|
71
|
+
1. Write a program to load the media and call each tool and print its output along with other relevant information.
|
71
72
|
2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
|
72
73
|
3. Your test case MUST run only on the given images which are {media}
|
73
74
|
4. Print this final dictionary.
|
@@ -102,24 +103,25 @@ print(final_out)
|
|
102
103
|
|
103
104
|
--- EXAMPLE2 ---
|
104
105
|
plan1:
|
105
|
-
- Extract frames from 'video.mp4' at 10 FPS using the '
|
106
|
-
- Use the '
|
106
|
+
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
|
107
|
+
- Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
|
107
108
|
plan2:
|
108
|
-
- Extract frames from 'video.mp4' at 10 FPS using the '
|
109
|
+
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
|
109
110
|
- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
|
110
111
|
plan3:
|
111
|
-
- Extract frames from 'video.mp4' at 10 FPS using the '
|
112
|
+
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
|
112
113
|
- Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
|
113
114
|
|
114
115
|
|
115
116
|
```python
|
116
117
|
import numpy as np
|
117
|
-
from vision_agent.tools import
|
118
|
+
from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
|
118
119
|
|
119
120
|
# sample at 1 FPS and use the first 10 frames to reduce processing time
|
120
|
-
frames =
|
121
|
-
frames = [f[
|
121
|
+
frames = extract_frames_and_timestamps("video.mp4", 1)
|
122
|
+
frames = [f["frame"] for f in frames][:10]
|
122
123
|
|
124
|
+
# strip arrays from the output to make it easier to read
|
123
125
|
def remove_arrays(o):
|
124
126
|
if isinstance(o, list):
|
125
127
|
return [remove_arrays(e) for e in o]
|
@@ -130,18 +132,46 @@ def remove_arrays(o):
|
|
130
132
|
else:
|
131
133
|
return o
|
132
134
|
|
135
|
+
# return the counts of each label per frame to help determine the stability of the model results
|
136
|
+
def get_counts(preds):
|
137
|
+
counts = {{}}
|
138
|
+
for i, pred_frame in enumerate(preds):
|
139
|
+
counts_i = {{}}
|
140
|
+
for pred in pred_frame:
|
141
|
+
label = pred["label"].split(":")[1] if ":" in pred["label"] else pred["label"]
|
142
|
+
counts_i[label] = counts_i.get(label, 0) + 1
|
143
|
+
counts[f"frame_{{i}}"] = counts_i
|
144
|
+
return counts
|
145
|
+
|
146
|
+
|
133
147
|
# plan1
|
134
|
-
owl_v2_out =
|
148
|
+
owl_v2_out = owl_v2_video("person", frames)
|
149
|
+
owl_v2_counts = get_counts(owl_v2_out)
|
135
150
|
|
136
151
|
# plan2
|
137
152
|
florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
|
153
|
+
florence2_counts = get_counts(florence2_out)
|
138
154
|
|
139
155
|
# plan3
|
140
156
|
f2s2_tracking_out = florence2_sam2_video_tracking("person", frames)
|
141
157
|
remove_arrays(f2s2_tracking_out)
|
158
|
+
f2s2_counts = get_counts(f2s2_tracking_out)
|
159
|
+
|
160
|
+
final_out = {{
|
161
|
+
"owl_v2_video": owl_v2_out,
|
162
|
+
"florence2_phrase_grounding": florence2_out,
|
163
|
+
"florence2_sam2_video_tracking": f2s2_tracking_out,
|
164
|
+
}}
|
165
|
+
|
166
|
+
counts = {{
|
167
|
+
"owl_v2_video": owl_v2_counts,
|
168
|
+
"florence2_phrase_grounding": florence2_counts,
|
169
|
+
"florence2_sam2_video_tracking": f2s2_counts,
|
170
|
+
}}
|
142
171
|
|
143
|
-
final_out = {{"owl_v2_image": owl_v2_out, "florence2_phrase_grounding": florence2_out, "florence2_sam2_video_tracking": f2s2_tracking_out}}
|
144
172
|
print(final_out)
|
173
|
+
print(labels_and_scores)
|
174
|
+
print(counts)
|
145
175
|
```
|
146
176
|
"""
|
147
177
|
|
@@ -159,7 +189,7 @@ But got the following error or no stdout:
|
|
159
189
|
|
160
190
|
|
161
191
|
PICK_PLAN = """
|
162
|
-
**Role**: You are
|
192
|
+
**Role**: You are an advanced AI model that can understand the user request and construct plans to accomplish it.
|
163
193
|
|
164
194
|
**Task**: Your responsibility is to pick the best plan from the three plans provided.
|
165
195
|
|
@@ -173,13 +203,14 @@ PICK_PLAN = """
|
|
173
203
|
{tool_output}
|
174
204
|
|
175
205
|
**Instructions**:
|
176
|
-
1.
|
177
|
-
2. Solve the problem yourself given the image and pick the plan that matches your solution the best.
|
206
|
+
1. Re-read the user request, plans, tool outputs and examine the image.
|
207
|
+
2. Solve the problem yourself given the image and pick the most accurate plan that matches your solution the best.
|
208
|
+
3. Add modifications to improve the plan including: changing a tool, adding thresholds, string matching.
|
178
209
|
4. Output a JSON object with the following format:
|
179
210
|
{{
|
180
211
|
"predicted_answer": str # the answer you would expect from the best plan
|
181
|
-
"thoughts": str # your thought process for choosing the best plan
|
182
|
-
"best_plan": str # the best plan you have chosen
|
212
|
+
"thoughts": str # your thought process for choosing the best plan over other plans and any modifications you made
|
213
|
+
"best_plan": str # the best plan you have chosen, must be `plan1`, `plan2`, or `plan3`
|
183
214
|
}}
|
184
215
|
"""
|
185
216
|
|
@@ -201,15 +232,18 @@ This is the documentation for the functions you have access to. You may call any
|
|
201
232
|
**User Instructions**:
|
202
233
|
{question}
|
203
234
|
|
204
|
-
**Tool
|
235
|
+
**Tool Tests and Outputs**:
|
205
236
|
{tool_output}
|
206
237
|
|
238
|
+
**Tool Output Thoughts**:
|
239
|
+
{plan_thoughts}
|
240
|
+
|
207
241
|
**Previous Feedback**:
|
208
242
|
{feedback}
|
209
243
|
|
210
244
|
**Instructions**:
|
211
245
|
1. **Understand and Clarify**: Make sure you understand the task.
|
212
|
-
2. **Algorithm/Method Selection**: Decide on the most efficient
|
246
|
+
2. **Algorithm/Method Selection**: Decide on the most efficient method, use the tool outputs and tool thoughts to guide you.
|
213
247
|
3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode.
|
214
248
|
4. **Code Generation**: Translate your pseudocode into executable Python code. Ensure you use correct arguments, remember coordinates are always returned normalized from `vision_agent.tools`. All images from `vision_agent.tools` are in RGB format, red is (255, 0, 0) and blue is (0, 0, 255).
|
215
249
|
"""
|
@@ -18,19 +18,24 @@ Here is an example of how you can interact with a user and Actions to complete a
|
|
18
18
|
{examples}
|
19
19
|
--- END EXAMPLES ---
|
20
20
|
|
21
|
-
**Instructions**:
|
22
|
-
1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
|
23
|
-
2. **Output in JSON**: Respond in JSON format, {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
|
24
|
-
|
25
21
|
**Conversation**:
|
26
22
|
Here is the current conversation so far:
|
27
23
|
--- START CONVERSATION ---
|
28
24
|
{conversation}
|
25
|
+
--- END CONVERSATION ---
|
26
|
+
|
27
|
+
**Instructions**:
|
28
|
+
1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
|
29
|
+
2. **Output in JSON**: Respond in the following format in JSON:
|
30
|
+
|
31
|
+
```json
|
32
|
+
{{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}
|
33
|
+
```
|
29
34
|
"""
|
30
35
|
|
31
36
|
|
32
37
|
EXAMPLES_CODE1 = """
|
33
|
-
USER: Can you detect the dogs in this image? Media name dog.jpg
|
38
|
+
USER: Can you write code to detect the dogs in this image? Media name dog.jpg
|
34
39
|
|
35
40
|
OBSERVATION:
|
36
41
|
[Artifacts loaded]
|
@@ -61,6 +66,7 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask
|
|
61
66
|
EXAMPLES_CODE1_EXTRA = """
|
62
67
|
USER: The image only has one dog, can you fix this?
|
63
68
|
|
69
|
+
OBSERVATION:
|
64
70
|
[Artifacts loaded]
|
65
71
|
Artifact dog.jpg loaded to /path/to/images/dog.jpg
|
66
72
|
Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
|
@@ -86,8 +92,24 @@ OBSERVATION:
|
|
86
92
|
AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
|
87
93
|
"""
|
88
94
|
|
89
|
-
|
90
95
|
EXAMPLES_CODE2 = """
|
96
|
+
USER: Can you describe this image?
|
97
|
+
|
98
|
+
OBSERVATION:
|
99
|
+
[Artifacts loaded]
|
100
|
+
Artifact image.jpg loaded to /path/to/images/image.jpg
|
101
|
+
[End of artifacts]
|
102
|
+
|
103
|
+
AGENT: {"thoughts": "The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.", "response": "<execute_python>view_media_artifact(artifacts, 'image.jpg')</execute_python>", "let_user_respond": false}
|
104
|
+
|
105
|
+
OBSERVATION:
|
106
|
+
[Image image.jpg displayed]
|
107
|
+
|
108
|
+
AGENT: {"thoughts": "The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.", "response": "The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?", "let_user_respond": true}
|
109
|
+
"""
|
110
|
+
|
111
|
+
|
112
|
+
EXAMPLES_CODE3 = """
|
91
113
|
USER: Can you create a function to count workers with helmets, return the count and save a visualization of the bounding boxes?
|
92
114
|
|
93
115
|
OBSERVATION:
|
@@ -137,13 +159,13 @@ AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to
|
|
137
159
|
|
138
160
|
USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
|
139
161
|
|
140
|
-
AGENT: {"thoughts": "Because the user has supplied me with labels I can call
|
162
|
+
AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided <execute_python>object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}
|
141
163
|
|
142
164
|
OBSERVATION:
|
143
|
-
[
|
165
|
+
[Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
|
144
166
|
|
145
167
|
|
146
|
-
AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>
|
168
|
+
AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will now update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
|
147
169
|
|
148
170
|
OBSERVATION:
|
149
171
|
[Artifact code.py edits]
|
vision_agent/lmm/__init__.py
CHANGED
@@ -1,2 +1,2 @@
|
|
1
|
-
from .lmm import LMM,
|
1
|
+
from .lmm import LMM, AnthropicLMM, AzureOpenAILMM, OllamaLMM, OpenAILMM
|
2
2
|
from .types import Message
|
vision_agent/lmm/lmm.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
import json
|
2
|
-
import logging
|
3
2
|
import os
|
4
3
|
from abc import ABC, abstractmethod
|
5
4
|
from pathlib import Path
|
@@ -14,8 +13,6 @@ from vision_agent.utils.image_utils import encode_media
|
|
14
13
|
|
15
14
|
from .types import Message
|
16
15
|
|
17
|
-
_LOGGER = logging.getLogger(__name__)
|
18
|
-
|
19
16
|
|
20
17
|
class LMM(ABC):
|
21
18
|
@abstractmethod
|
@@ -45,11 +42,11 @@ class LMM(ABC):
|
|
45
42
|
|
46
43
|
|
47
44
|
class OpenAILMM(LMM):
|
48
|
-
r"""An LMM class for the OpenAI
|
45
|
+
r"""An LMM class for the OpenAI LMMs."""
|
49
46
|
|
50
47
|
def __init__(
|
51
48
|
self,
|
52
|
-
model_name: str = "gpt-4o",
|
49
|
+
model_name: str = "gpt-4o-2024-05-13",
|
53
50
|
api_key: Optional[str] = None,
|
54
51
|
max_tokens: int = 4096,
|
55
52
|
json_mode: bool = False,
|
@@ -365,8 +362,8 @@ class OllamaLMM(LMM):
|
|
365
362
|
return resp["response"] # type: ignore
|
366
363
|
|
367
364
|
|
368
|
-
class
|
369
|
-
r"""An LMM class for Anthropic's
|
365
|
+
class AnthropicLMM(LMM):
|
366
|
+
r"""An LMM class for Anthropic's LMMs."""
|
370
367
|
|
371
368
|
def __init__(
|
372
369
|
self,
|
@@ -402,7 +399,7 @@ class ClaudeSonnetLMM(LMM):
|
|
402
399
|
]
|
403
400
|
if "media" in msg:
|
404
401
|
for media_path in msg["media"]:
|
405
|
-
encoded_media = encode_media(media_path)
|
402
|
+
encoded_media = encode_media(media_path, resize=768)
|
406
403
|
content.append(
|
407
404
|
ImageBlockParam(
|
408
405
|
type="image",
|
@@ -449,7 +446,7 @@ class ClaudeSonnetLMM(LMM):
|
|
449
446
|
]
|
450
447
|
if media:
|
451
448
|
for m in media:
|
452
|
-
encoded_media = encode_media(m)
|
449
|
+
encoded_media = encode_media(m, resize=768)
|
453
450
|
content.append(
|
454
451
|
ImageBlockParam(
|
455
452
|
type="image",
|
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/meta_tools.py
CHANGED
@@ -486,6 +486,33 @@ def list_artifacts(artifacts: Artifacts) -> str:
|
|
486
486
|
return output_str
|
487
487
|
|
488
488
|
|
489
|
+
def check_and_load_image(code: str) -> List[str]:
    """Return the artifact name passed to the first media-viewing call in `code`.

    The code is scanned for a call of the form
    `view_media_artifact(<first arg>, "<name>")`. The legacy spelling
    `show_media_artifact` is still accepted so existing callers keep working.
    Only the first matching call is reported.

    Parameters:
        code (str): The source code to scan.

    Returns:
        List[str]: A single-element list holding the quoted second argument of
            the first matching call, or an empty list when `code` is blank or
            contains no matching call.
    """
    if not code.strip():
        return []

    # The tool exposed in this module is `view_media_artifact`; matching only
    # `show_media_artifact` would never detect calls to the real tool.
    pattern = (
        r"(?:show|view)_media_artifact"
        r"\(\s*([^\)]+),\s*['\"]([^\)]+)['\"]\s*\)"
    )
    match = re.search(pattern, code)
    if match:
        # Group 1 is the first argument; group 2 is the quoted name.
        return [match.group(2)]
    return []
|
499
|
+
|
500
|
+
|
501
|
+
def view_media_artifact(artifacts: Artifacts, name: str) -> str:
    """Report whether the named image artifact is available for viewing.

    This function only checks membership of `name` in `artifacts` and emits a
    bracketed status marker; it does not itself read or render the image.

    Parameters:
        artifacts (Artifacts): The artifacts object to look the image up in.
        name (str): The name of the image artifact to view.

    Returns:
        str: "[Image <name> displayed]" when the artifact exists, otherwise
            "[Artifact <name> does not exist]". The same text is printed.
    """
    message = (
        f"[Image {name} displayed]"
        if name in artifacts
        else f"[Artifact {name} does not exist]"
    )
    print(message)
    return message
|
514
|
+
|
515
|
+
|
489
516
|
def get_tool_descriptions() -> str:
|
490
517
|
"""Returns a description of all the tools that `generate_vision_code` has access to.
|
491
518
|
Helpful for answering questions about what types of vision tasks you can do with
|
@@ -493,16 +520,15 @@ def get_tool_descriptions() -> str:
|
|
493
520
|
return TOOL_DESCRIPTIONS
|
494
521
|
|
495
522
|
|
496
|
-
def
|
523
|
+
def object_detection_fine_tuning(bboxes: List[Dict[str, Any]]) -> str:
|
497
524
|
"""DO NOT use this function unless the user has supplied you with bboxes.
|
498
|
-
'
|
499
|
-
objects in an image based on a given dataset. It returns the fine
|
525
|
+
'object_detection_fine_tuning' is a tool that fine-tunes object detection models to
|
526
|
+
be able to detect objects in an image based on a given dataset. It returns the fine
|
527
|
+
tuning job id.
|
500
528
|
|
501
529
|
Parameters:
|
502
530
|
bboxes (List[BboxInput]): A list of BboxInput containing the image path, labels
|
503
531
|
and bounding boxes. The coordinates are unnormalized.
|
504
|
-
task (str): The florencev2 fine-tuning task. The options are
|
505
|
-
'phrase_grounding'.
|
506
532
|
|
507
533
|
Returns:
|
508
534
|
str: The fine tuning job id, this id will be used to retrieve the fine tuned
|
@@ -510,12 +536,13 @@ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
|
|
510
536
|
|
511
537
|
Example
|
512
538
|
-------
|
513
|
-
>>> fine_tuning_job_id =
|
539
|
+
>>> fine_tuning_job_id = object_detection_fine_tuning(
|
514
540
|
[{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
|
515
541
|
{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
|
516
542
|
"phrase_grounding"
|
517
543
|
)
|
518
544
|
"""
|
545
|
+
task = "phrase_grounding"
|
519
546
|
bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
|
520
547
|
task_type = PromptTask[task.upper()]
|
521
548
|
fine_tuning_request = [
|
@@ -531,7 +558,7 @@ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
|
|
531
558
|
fine_tune_id = str(
|
532
559
|
landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request)
|
533
560
|
)
|
534
|
-
print(f"[
|
561
|
+
print(f"[Fine tuning id: {fine_tune_id}]")
|
535
562
|
return fine_tune_id
|
536
563
|
|
537
564
|
|
@@ -564,7 +591,7 @@ def use_extra_vision_agent_args(
|
|
564
591
|
Returns:
|
565
592
|
str: The edited code.
|
566
593
|
"""
|
567
|
-
generate_pattern = r"generate_vision_code\(\s*([^\)]+)\)"
|
594
|
+
generate_pattern = r"generate_vision_code\(\s*([^\)]+)\s*\)"
|
568
595
|
|
569
596
|
def generate_replacer(match: re.Match) -> str:
|
570
597
|
arg = match.group(1)
|
@@ -575,7 +602,7 @@ def use_extra_vision_agent_args(
|
|
575
602
|
out_str += ")"
|
576
603
|
return out_str
|
577
604
|
|
578
|
-
edit_pattern = r"edit_vision_code\(\s*([^\)]+)\)"
|
605
|
+
edit_pattern = r"edit_vision_code\(\s*([^\)]+)\s*\)"
|
579
606
|
|
580
607
|
def edit_replacer(match: re.Match) -> str:
|
581
608
|
arg = match.group(1)
|
@@ -591,48 +618,52 @@ def use_extra_vision_agent_args(
|
|
591
618
|
return new_code
|
592
619
|
|
593
620
|
|
594
|
-
def
|
595
|
-
artifacts: Artifacts, name: str,
|
621
|
+
def use_object_detection_fine_tuning(
|
622
|
+
artifacts: Artifacts, name: str, fine_tune_id: str
|
596
623
|
) -> str:
|
597
|
-
"""Replaces
|
598
|
-
|
599
|
-
|
624
|
+
"""Replaces calls to 'owl_v2_image', 'florence2_phrase_grounding' and
|
625
|
+
'florence2_sam2_image' with the fine tuning id. This ensures that the code utilizes
|
626
|
+
the fine tuned florence2 model. Returns the diff between the original code and the
|
627
|
+
new code.
|
600
628
|
|
601
629
|
Parameters:
|
602
630
|
artifacts (Artifacts): The artifacts object to edit the code from.
|
603
631
|
name (str): The name of the artifact to edit.
|
604
|
-
task (str): The task to fine tune the model for. The options are
|
605
|
-
'phrase_grounding'.
|
606
632
|
fine_tune_id (str): The fine tuning job id.
|
607
633
|
|
608
634
|
Examples
|
609
635
|
--------
|
610
|
-
>>> diff =
|
636
|
+
>>> diff = use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
|
611
637
|
"""
|
612
638
|
|
613
|
-
task_to_fn = {"phrase_grounding": "florence2_phrase_grounding"}
|
614
|
-
|
615
639
|
if name not in artifacts:
|
616
640
|
output_str = f"[Artifact {name} does not exist]"
|
617
641
|
print(output_str)
|
618
642
|
return output_str
|
619
643
|
|
620
644
|
code = artifacts[name]
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
645
|
+
patterns = [
|
646
|
+
(
|
647
|
+
r"florence2_phrase_grounding\(\s*([^\)]+)\s*\)",
|
648
|
+
lambda match: f'florence2_phrase_grounding({match.group(1)}, "{fine_tune_id}")',
|
649
|
+
),
|
650
|
+
(
|
651
|
+
r"owl_v2_image\(\s*([^\)]+)\s*\)",
|
652
|
+
lambda match: f'owl_v2_image({match.group(1)}, "{fine_tune_id}")',
|
653
|
+
),
|
654
|
+
(
|
655
|
+
r"florence2_sam2_image\(\s*([^\)]+)\s*\)",
|
656
|
+
lambda match: f'florence2_sam2_image({match.group(1)}, "{fine_tune_id}")',
|
657
|
+
),
|
658
|
+
]
|
630
659
|
|
631
|
-
new_code =
|
660
|
+
new_code = code
|
661
|
+
for pattern, replacer in patterns:
|
662
|
+
new_code = re.sub(pattern, replacer, new_code)
|
632
663
|
|
633
664
|
if new_code == code:
|
634
665
|
output_str = (
|
635
|
-
f"[
|
666
|
+
f"[No function calls to replace with fine tuning id in artifact {name}]"
|
636
667
|
)
|
637
668
|
print(output_str)
|
638
669
|
return output_str
|
@@ -662,8 +693,9 @@ META_TOOL_DOCSTRING = get_tool_documentation(
|
|
662
693
|
generate_vision_code,
|
663
694
|
edit_vision_code,
|
664
695
|
write_media_artifact,
|
665
|
-
|
666
|
-
|
696
|
+
view_media_artifact,
|
697
|
+
object_detection_fine_tuning,
|
698
|
+
use_object_detection_fine_tuning,
|
667
699
|
list_artifacts,
|
668
700
|
]
|
669
701
|
)
|
vision_agent/tools/tool_utils.py
CHANGED
@@ -208,19 +208,25 @@ def _call_post(
|
|
208
208
|
if files:
|
209
209
|
files_in_b64 = [(file[0], b64encode(file[1]).decode("utf-8")) for file in files]
|
210
210
|
try:
|
211
|
+
if files is not None:
|
212
|
+
response = session.post(url, data=payload, files=files)
|
213
|
+
else:
|
214
|
+
response = session.post(url, json=payload)
|
215
|
+
|
216
|
+
# make sure function_name is in the payload so we can display it
|
217
|
+
tool_call_trace_payload = (
|
218
|
+
payload
|
219
|
+
if "function_name" in payload
|
220
|
+
else {**payload, **{"function_name": function_name}}
|
221
|
+
)
|
211
222
|
tool_call_trace = ToolCallTrace(
|
212
223
|
endpoint_url=url,
|
213
|
-
request=
|
224
|
+
request=tool_call_trace_payload,
|
214
225
|
response={},
|
215
226
|
error=None,
|
216
227
|
files=files_in_b64,
|
217
228
|
)
|
218
229
|
|
219
|
-
if files is not None:
|
220
|
-
response = session.post(url, data=payload, files=files)
|
221
|
-
else:
|
222
|
-
response = session.post(url, json=payload)
|
223
|
-
|
224
230
|
if response.status_code != 200:
|
225
231
|
tool_call_trace.error = Error(
|
226
232
|
name="RemoteToolCallFailed",
|