vision-agent 0.2.140__py3-none-any.whl → 0.2.141__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/__init__.py +2 -1
- vision_agent/agent/agent_utils.py +8 -2
- vision_agent/agent/vision_agent.py +97 -17
- vision_agent/agent/vision_agent_coder.py +93 -66
- vision_agent/agent/vision_agent_coder_prompts.py +53 -19
- vision_agent/agent/vision_agent_prompts.py +31 -9
- vision_agent/lmm/__init__.py +1 -1
- vision_agent/lmm/lmm.py +6 -9
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/meta_tools.py +64 -32
- vision_agent/tools/tools.py +115 -30
- vision_agent/tools/tools_types.py +1 -0
- vision_agent/utils/image_utils.py +18 -7
- vision_agent/utils/video.py +2 -1
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.141.dist-info}/METADATA +60 -12
- vision_agent-0.2.141.dist-info/RECORD +33 -0
- vision_agent-0.2.140.dist-info/RECORD +0 -33
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.141.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.141.dist-info}/WHEEL +0 -0
@@ -30,9 +30,10 @@ PLAN = """
|
|
30
30
|
|
31
31
|
**Instructions**:
|
32
32
|
1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
|
33
|
-
2.
|
33
|
+
2. For each subtask, be sure to include the tool(s) you want to use to accomplish that subtask.
|
34
|
+
3. Output three different plans each utilize a different strategy or set of tools ordering them from most likely to least likely to succeed.
|
34
35
|
|
35
|
-
Output a list of jsons in the following format
|
36
|
+
Output a list of jsons in the following format:
|
36
37
|
|
37
38
|
```json
|
38
39
|
{{
|
@@ -67,7 +68,7 @@ This is the documentation for the functions you have access to. You may call any
|
|
67
68
|
{previous_attempts}
|
68
69
|
|
69
70
|
**Instructions**:
|
70
|
-
1. Write a program to load the media and call each tool and
|
71
|
+
1. Write a program to load the media and call each tool and print it's output along with other relevant information.
|
71
72
|
2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
|
72
73
|
3. Your test case MUST run only on the given images which are {media}
|
73
74
|
4. Print this final dictionary.
|
@@ -102,24 +103,25 @@ print(final_out)
|
|
102
103
|
|
103
104
|
--- EXAMPLE2 ---
|
104
105
|
plan1:
|
105
|
-
- Extract frames from 'video.mp4' at 10 FPS using the '
|
106
|
-
- Use the '
|
106
|
+
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
|
107
|
+
- Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
|
107
108
|
plan2:
|
108
|
-
- Extract frames from 'video.mp4' at 10 FPS using the '
|
109
|
+
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
|
109
110
|
- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
|
110
111
|
plan3:
|
111
|
-
- Extract frames from 'video.mp4' at 10 FPS using the '
|
112
|
+
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
|
112
113
|
- Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
|
113
114
|
|
114
115
|
|
115
116
|
```python
|
116
117
|
import numpy as np
|
117
|
-
from vision_agent.tools import
|
118
|
+
from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
|
118
119
|
|
119
120
|
# sample at 1 FPS and use the first 10 frames to reduce processing time
|
120
|
-
frames =
|
121
|
-
frames = [f[
|
121
|
+
frames = extract_frames_and_timestamps("video.mp4", 1)
|
122
|
+
frames = [f["frame"] for f in frames][:10]
|
122
123
|
|
124
|
+
# strip arrays from the output to make it easier to read
|
123
125
|
def remove_arrays(o):
|
124
126
|
if isinstance(o, list):
|
125
127
|
return [remove_arrays(e) for e in o]
|
@@ -130,18 +132,46 @@ def remove_arrays(o):
|
|
130
132
|
else:
|
131
133
|
return o
|
132
134
|
|
135
|
+
# return the counts of each label per frame to help determine the stability of the model results
|
136
|
+
def get_counts(preds):
|
137
|
+
counts = {{}}
|
138
|
+
for i, pred_frame in enumerate(preds):
|
139
|
+
counts_i = {{}}
|
140
|
+
for pred in pred_frame:
|
141
|
+
label = pred["label"].split(":")[1] if ":" in pred["label"] else pred["label"]
|
142
|
+
counts_i[label] = counts_i.get(label, 0) + 1
|
143
|
+
counts[f"frame_{{i}}"] = counts_i
|
144
|
+
return counts
|
145
|
+
|
146
|
+
|
133
147
|
# plan1
|
134
|
-
owl_v2_out =
|
148
|
+
owl_v2_out = owl_v2_video("person", frames)
|
149
|
+
owl_v2_counts = get_counts(owl_v2_out)
|
135
150
|
|
136
151
|
# plan2
|
137
152
|
florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
|
153
|
+
florence2_counts = get_counts(florence2_out)
|
138
154
|
|
139
155
|
# plan3
|
140
156
|
f2s2_tracking_out = florence2_sam2_video_tracking("person", frames)
|
141
157
|
remove_arrays(f2s2_tracking_out)
|
158
|
+
f2s2_counts = get_counts(f2s2_tracking_out)
|
159
|
+
|
160
|
+
final_out = {{
|
161
|
+
"owl_v2_video": owl_v2_out,
|
162
|
+
"florence2_phrase_grounding": florence2_out,
|
163
|
+
"florence2_sam2_video_tracking": f2s2_out,
|
164
|
+
}}
|
165
|
+
|
166
|
+
counts = {{
|
167
|
+
"owl_v2_video": owl_v2_counts,
|
168
|
+
"florence2_phrase_grounding": florence2_counts,
|
169
|
+
"florence2_sam2_video_tracking": f2s2_counts,
|
170
|
+
}}
|
142
171
|
|
143
|
-
final_out = {{"owl_v2_image": owl_v2_out, "florence2_phrase_grounding": florence2_out, "florence2_sam2_video_tracking": f2s2_tracking_out}}
|
144
172
|
print(final_out)
|
173
|
+
print(labels_and_scores)
|
174
|
+
print(counts)
|
145
175
|
```
|
146
176
|
"""
|
147
177
|
|
@@ -159,7 +189,7 @@ But got the following error or no stdout:
|
|
159
189
|
|
160
190
|
|
161
191
|
PICK_PLAN = """
|
162
|
-
**Role**: You are
|
192
|
+
**Role**: You are an advanced AI model that can understand the user request and construct plans to accomplish it.
|
163
193
|
|
164
194
|
**Task**: Your responsibility is to pick the best plan from the three plans provided.
|
165
195
|
|
@@ -173,13 +203,14 @@ PICK_PLAN = """
|
|
173
203
|
{tool_output}
|
174
204
|
|
175
205
|
**Instructions**:
|
176
|
-
1.
|
177
|
-
2. Solve the problem yourself given the image and pick the plan that matches your solution the best.
|
206
|
+
1. Re-read the user request, plans, tool outputs and examine the image.
|
207
|
+
2. Solve the problem yourself given the image and pick the most accurate plan that matches your solution the best.
|
208
|
+
3. Add modifications to improve the plan including: changing a tool, adding thresholds, string matching.
|
178
209
|
3. Output a JSON object with the following format:
|
179
210
|
{{
|
180
211
|
"predicted_answer": str # the answer you would expect from the best plan
|
181
|
-
"thoughts": str # your thought process for choosing the best plan
|
182
|
-
"best_plan": str # the best plan you have chosen
|
212
|
+
"thoughts": str # your thought process for choosing the best plan over other plans and any modifications you made
|
213
|
+
"best_plan": str # the best plan you have chosen, must be `plan1`, `plan2`, or `plan3`
|
183
214
|
}}
|
184
215
|
"""
|
185
216
|
|
@@ -201,15 +232,18 @@ This is the documentation for the functions you have access to. You may call any
|
|
201
232
|
**User Instructions**:
|
202
233
|
{question}
|
203
234
|
|
204
|
-
**Tool
|
235
|
+
**Tool Tests and Outputs**:
|
205
236
|
{tool_output}
|
206
237
|
|
238
|
+
**Tool Output Thoughts**:
|
239
|
+
{plan_thoughts}
|
240
|
+
|
207
241
|
**Previous Feedback**:
|
208
242
|
{feedback}
|
209
243
|
|
210
244
|
**Instructions**:
|
211
245
|
1. **Understand and Clarify**: Make sure you understand the task.
|
212
|
-
2. **Algorithm/Method Selection**: Decide on the most efficient
|
246
|
+
2. **Algorithm/Method Selection**: Decide on the most efficient method, use the tool outputs and tool thoughts to guide you.
|
213
247
|
3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode.
|
214
248
|
4. **Code Generation**: Translate your pseudocode into executable Python code. Ensure you use correct arguments, remember coordinates are always returned normalized from `vision_agent.tools`. All images from `vision_agent.tools` are in RGB format, red is (255, 0, 0) and blue is (0, 0, 255).
|
215
249
|
"""
|
@@ -18,19 +18,24 @@ Here is an example of how you can interact with a user and Actions to complete a
|
|
18
18
|
{examples}
|
19
19
|
--- END EXAMPLES ---
|
20
20
|
|
21
|
-
**Instructions**:
|
22
|
-
1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
|
23
|
-
2. **Output in JSON**: Respond in JSON format, {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
|
24
|
-
|
25
21
|
**Conversation**:
|
26
22
|
Here is the current conversation so far:
|
27
23
|
--- START CONVERSATION ---
|
28
24
|
{conversation}
|
25
|
+
--- END CONVERSATION ---
|
26
|
+
|
27
|
+
**Instructions**:
|
28
|
+
1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
|
29
|
+
2. **Output in JSON**: Respond in the following format in JSON:
|
30
|
+
|
31
|
+
```json
|
32
|
+
{{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
|
33
|
+
```
|
29
34
|
"""
|
30
35
|
|
31
36
|
|
32
37
|
EXAMPLES_CODE1 = """
|
33
|
-
USER: Can you detect the dogs in this image? Media name dog.jpg
|
38
|
+
USER: Can you write code to detect the dogs in this image? Media name dog.jpg
|
34
39
|
|
35
40
|
OBSERVATION:
|
36
41
|
[Artifacts loaded]
|
@@ -61,6 +66,7 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask
|
|
61
66
|
EXAMPLES_CODE1_EXTRA = """
|
62
67
|
USER: The the image only has one dog, can you fix this?
|
63
68
|
|
69
|
+
OBSERVATION:
|
64
70
|
[Artifacts loaded]
|
65
71
|
Artifact dog.jpg loaded to /path/to/images/dog.jpg
|
66
72
|
Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
|
@@ -86,8 +92,24 @@ OBSERVATION:
|
|
86
92
|
AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
|
87
93
|
"""
|
88
94
|
|
89
|
-
|
90
95
|
EXAMPLES_CODE2 = """
|
96
|
+
USER: Can you describe this image?
|
97
|
+
|
98
|
+
OBSERVATION:
|
99
|
+
[Artifacts loaded]
|
100
|
+
Artifact image.jpg loaded to /path/to/images/image.jpg
|
101
|
+
[End of artifacts]
|
102
|
+
|
103
|
+
AGENT: {"thoughts": "The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.", "response": "<execute_python>view_media_artifacts('image.jpg')</execute_python>", "let_user_respond": false}
|
104
|
+
|
105
|
+
OBSERVATION:
|
106
|
+
[Image image.jpg displayed]
|
107
|
+
|
108
|
+
AGENT: {"thoughts": "The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.", "response": "The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?", "let_user_respond": true}
|
109
|
+
"""
|
110
|
+
|
111
|
+
|
112
|
+
EXAMPLES_CODE3 = """
|
91
113
|
USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?
|
92
114
|
|
93
115
|
OBSERVATION:
|
@@ -137,13 +159,13 @@ AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to
|
|
137
159
|
|
138
160
|
USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
|
139
161
|
|
140
|
-
AGENT: {"thoughts": "Because the user has supplied me with labels I can call
|
162
|
+
AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided <execute_python>object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}
|
141
163
|
|
142
164
|
OBSERVATION:
|
143
|
-
[
|
165
|
+
[Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
|
144
166
|
|
145
167
|
|
146
|
-
AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>
|
168
|
+
AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
|
147
169
|
|
148
170
|
OBSERVATION:
|
149
171
|
[Artifact code.py edits]
|
vision_agent/lmm/__init__.py
CHANGED
@@ -1,2 +1,2 @@
|
|
1
|
-
from .lmm import LMM,
|
1
|
+
from .lmm import LMM, AnthropicLMM, AzureOpenAILMM, OllamaLMM, OpenAILMM
|
2
2
|
from .types import Message
|
vision_agent/lmm/lmm.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
import json
|
2
|
-
import logging
|
3
2
|
import os
|
4
3
|
from abc import ABC, abstractmethod
|
5
4
|
from pathlib import Path
|
@@ -14,8 +13,6 @@ from vision_agent.utils.image_utils import encode_media
|
|
14
13
|
|
15
14
|
from .types import Message
|
16
15
|
|
17
|
-
_LOGGER = logging.getLogger(__name__)
|
18
|
-
|
19
16
|
|
20
17
|
class LMM(ABC):
|
21
18
|
@abstractmethod
|
@@ -45,11 +42,11 @@ class LMM(ABC):
|
|
45
42
|
|
46
43
|
|
47
44
|
class OpenAILMM(LMM):
|
48
|
-
r"""An LMM class for the OpenAI
|
45
|
+
r"""An LMM class for the OpenAI LMMs."""
|
49
46
|
|
50
47
|
def __init__(
|
51
48
|
self,
|
52
|
-
model_name: str = "gpt-4o",
|
49
|
+
model_name: str = "gpt-4o-2024-05-13",
|
53
50
|
api_key: Optional[str] = None,
|
54
51
|
max_tokens: int = 4096,
|
55
52
|
json_mode: bool = False,
|
@@ -365,8 +362,8 @@ class OllamaLMM(LMM):
|
|
365
362
|
return resp["response"] # type: ignore
|
366
363
|
|
367
364
|
|
368
|
-
class
|
369
|
-
r"""An LMM class for Anthropic's
|
365
|
+
class AnthropicLMM(LMM):
|
366
|
+
r"""An LMM class for Anthropic's LMMs."""
|
370
367
|
|
371
368
|
def __init__(
|
372
369
|
self,
|
@@ -402,7 +399,7 @@ class ClaudeSonnetLMM(LMM):
|
|
402
399
|
]
|
403
400
|
if "media" in msg:
|
404
401
|
for media_path in msg["media"]:
|
405
|
-
encoded_media = encode_media(media_path)
|
402
|
+
encoded_media = encode_media(media_path, resize=768)
|
406
403
|
content.append(
|
407
404
|
ImageBlockParam(
|
408
405
|
type="image",
|
@@ -449,7 +446,7 @@ class ClaudeSonnetLMM(LMM):
|
|
449
446
|
]
|
450
447
|
if media:
|
451
448
|
for m in media:
|
452
|
-
encoded_media = encode_media(m)
|
449
|
+
encoded_media = encode_media(m, resize=768)
|
453
450
|
content.append(
|
454
451
|
ImageBlockParam(
|
455
452
|
type="image",
|
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/meta_tools.py
CHANGED
@@ -486,6 +486,33 @@ def list_artifacts(artifacts: Artifacts) -> str:
|
|
486
486
|
return output_str
|
487
487
|
|
488
488
|
|
489
|
+
def check_and_load_image(code: str) -> List[str]:
    """Extract the artifact name from a media-viewing tool call found in `code`.

    The agent views an image by calling `view_media_artifact(artifacts, "<name>")`.
    The older spelling `show_media_artifact` is still recognised so that existing
    callers keep working.

    Parameters:
        code (str): The source code emitted by the agent.

    Returns:
        List[str]: A single-element list holding the quoted artifact name of the
            first matching call, or an empty list when `code` is blank or holds
            no such call.
    """
    if not code.strip():
        return []

    # Group 1 is the first argument (the artifacts object); group 2 is the quoted
    # artifact name. Both tool spellings are accepted, because the tool exposed to
    # the agent is `view_media_artifact`, not `show_media_artifact`.
    pattern = (
        r"(?:show|view)_media_artifact\(\s*([^\)]+),\s*['\"]([^\)]+)['\"]\s*\)"
    )
    match = re.search(pattern, code)
    if match is None:
        return []
    return [match.group(2)]
|
499
|
+
|
500
|
+
|
501
|
+
def view_media_artifact(artifacts: Artifacts, name: str) -> str:
    """Report whether the image artifact with the given name can be viewed.

    Parameters:
        artifacts (Artifacts): The artifacts object to look the image up in.
        name (str): The name of the image artifact to view.

    Returns:
        str: A bracketed status message, which is also printed to stdout.
    """
    message = (
        f"[Image {name} displayed]"
        if name in artifacts
        else f"[Artifact {name} does not exist]"
    )
    print(message)
    return message
|
514
|
+
|
515
|
+
|
489
516
|
def get_tool_descriptions() -> str:
|
490
517
|
"""Returns a description of all the tools that `generate_vision_code` has access to.
|
491
518
|
Helpful for answering questions about what types of vision tasks you can do with
|
@@ -493,16 +520,15 @@ def get_tool_descriptions() -> str:
|
|
493
520
|
return TOOL_DESCRIPTIONS
|
494
521
|
|
495
522
|
|
496
|
-
def
|
523
|
+
def object_detection_fine_tuning(bboxes: List[Dict[str, Any]]) -> str:
|
497
524
|
"""DO NOT use this function unless the user has supplied you with bboxes.
|
498
|
-
'
|
499
|
-
objects in an image based on a given dataset. It returns the fine
|
525
|
+
'object_detection_fine_tuning' is a tool that fine-tunes object detection models to
|
526
|
+
be able to detect objects in an image based on a given dataset. It returns the fine
|
527
|
+
tuning job id.
|
500
528
|
|
501
529
|
Parameters:
|
502
530
|
bboxes (List[BboxInput]): A list of BboxInput containing the image path, labels
|
503
531
|
and bounding boxes. The coordinates are unnormalized.
|
504
|
-
task (str): The florencev2 fine-tuning task. The options are
|
505
|
-
'phrase_grounding'.
|
506
532
|
|
507
533
|
Returns:
|
508
534
|
str: The fine tuning job id, this id will used to retrieve the fine tuned
|
@@ -510,12 +536,13 @@ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
|
|
510
536
|
|
511
537
|
Example
|
512
538
|
-------
|
513
|
-
>>> fine_tuning_job_id =
|
539
|
+
>>> fine_tuning_job_id = object_detection_fine_tuning(
|
514
540
|
[{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
|
515
541
|
{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
|
516
542
|
"phrase_grounding"
|
517
543
|
)
|
518
544
|
"""
|
545
|
+
task = "phrase_grounding"
|
519
546
|
bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
|
520
547
|
task_type = PromptTask[task.upper()]
|
521
548
|
fine_tuning_request = [
|
@@ -531,7 +558,7 @@ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
|
|
531
558
|
fine_tune_id = str(
|
532
559
|
landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request)
|
533
560
|
)
|
534
|
-
print(f"[
|
561
|
+
print(f"[Fine tuning id: {fine_tune_id}]")
|
535
562
|
return fine_tune_id
|
536
563
|
|
537
564
|
|
@@ -564,7 +591,7 @@ def use_extra_vision_agent_args(
|
|
564
591
|
Returns:
|
565
592
|
str: The edited code.
|
566
593
|
"""
|
567
|
-
generate_pattern = r"generate_vision_code\(\s*([^\)]+)\)"
|
594
|
+
generate_pattern = r"generate_vision_code\(\s*([^\)]+)\s*\)"
|
568
595
|
|
569
596
|
def generate_replacer(match: re.Match) -> str:
|
570
597
|
arg = match.group(1)
|
@@ -575,7 +602,7 @@ def use_extra_vision_agent_args(
|
|
575
602
|
out_str += ")"
|
576
603
|
return out_str
|
577
604
|
|
578
|
-
edit_pattern = r"edit_vision_code\(\s*([^\)]+)\)"
|
605
|
+
edit_pattern = r"edit_vision_code\(\s*([^\)]+)\s*\)"
|
579
606
|
|
580
607
|
def edit_replacer(match: re.Match) -> str:
|
581
608
|
arg = match.group(1)
|
@@ -591,48 +618,52 @@ def use_extra_vision_agent_args(
|
|
591
618
|
return new_code
|
592
619
|
|
593
620
|
|
594
|
-
def
|
595
|
-
artifacts: Artifacts, name: str,
|
621
|
+
def use_object_detection_fine_tuning(
|
622
|
+
artifacts: Artifacts, name: str, fine_tune_id: str
|
596
623
|
) -> str:
|
597
|
-
"""Replaces
|
598
|
-
|
599
|
-
|
624
|
+
"""Replaces calls to 'owl_v2_image', 'florence2_phrase_detection' and
|
625
|
+
'florence2_sam2_image' with the fine tuning id. This ensures that the code utilizes
|
626
|
+
the fined tuned florence2 model. Returns the diff between the original code and the
|
627
|
+
new code.
|
600
628
|
|
601
629
|
Parameters:
|
602
630
|
artifacts (Artifacts): The artifacts object to edit the code from.
|
603
631
|
name (str): The name of the artifact to edit.
|
604
|
-
task (str): The task to fine tune the model for. The options are
|
605
|
-
'phrase_grounding'.
|
606
632
|
fine_tune_id (str): The fine tuning job id.
|
607
633
|
|
608
634
|
Examples
|
609
635
|
--------
|
610
|
-
>>> diff =
|
636
|
+
>>> diff = use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
|
611
637
|
"""
|
612
638
|
|
613
|
-
task_to_fn = {"phrase_grounding": "florence2_phrase_grounding"}
|
614
|
-
|
615
639
|
if name not in artifacts:
|
616
640
|
output_str = f"[Artifact {name} does not exist]"
|
617
641
|
print(output_str)
|
618
642
|
return output_str
|
619
643
|
|
620
644
|
code = artifacts[name]
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
645
|
+
patterns = [
|
646
|
+
(
|
647
|
+
r"florence2_phrase_grounding\(\s*([^\)]+)\s*\)",
|
648
|
+
lambda match: f'florence2_phrase_grounding({match.group(1)}, "{fine_tune_id}")',
|
649
|
+
),
|
650
|
+
(
|
651
|
+
r"owl_v2_image\(\s*([^\)]+)\s*\)",
|
652
|
+
lambda match: f'owl_v2_image({match.group(1)}, "{fine_tune_id}")',
|
653
|
+
),
|
654
|
+
(
|
655
|
+
r"florence2_sam2_image\(\s*([^\)]+)\s*\)",
|
656
|
+
lambda match: f'florence2_sam2_image({match.group(1)}, "{fine_tune_id}")',
|
657
|
+
),
|
658
|
+
]
|
630
659
|
|
631
|
-
new_code =
|
660
|
+
new_code = code
|
661
|
+
for pattern, replacer in patterns:
|
662
|
+
new_code = re.sub(pattern, replacer, new_code)
|
632
663
|
|
633
664
|
if new_code == code:
|
634
665
|
output_str = (
|
635
|
-
f"[
|
666
|
+
f"[No function calls to replace with fine tuning id in artifact {name}]"
|
636
667
|
)
|
637
668
|
print(output_str)
|
638
669
|
return output_str
|
@@ -662,8 +693,9 @@ META_TOOL_DOCSTRING = get_tool_documentation(
|
|
662
693
|
generate_vision_code,
|
663
694
|
edit_vision_code,
|
664
695
|
write_media_artifact,
|
665
|
-
|
666
|
-
|
696
|
+
view_media_artifact,
|
697
|
+
object_detection_fine_tuning,
|
698
|
+
use_object_detection_fine_tuning,
|
667
699
|
list_artifacts,
|
668
700
|
]
|
669
701
|
)
|