vision-agent 0.2.139__py3-none-any.whl → 0.2.141__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -30,9 +30,10 @@ PLAN = """
30
30
 
31
31
  **Instructions**:
32
32
  1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
33
- 2. Output three different plans each utilize a different strategy or set of tools.
33
+ 2. For each subtask, be sure to include the tool(s) you want to use to accomplish that subtask.
34
+ 3. Output three different plans, each utilizing a different strategy or set of tools, ordered from most likely to least likely to succeed.
34
35
 
35
- Output a list of jsons in the following format
36
+ Output a list of jsons in the following format:
36
37
 
37
38
  ```json
38
39
  {{
@@ -67,7 +68,7 @@ This is the documentation for the functions you have access to. You may call any
67
68
  {previous_attempts}
68
69
 
69
70
  **Instructions**:
70
- 1. Write a program to load the media and call each tool and save it's output.
71
+ 1. Write a program to load the media and call each tool and print its output along with other relevant information.
71
72
  2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
72
73
  3. Your test case MUST run only on the given images which are {media}
73
74
  4. Print this final dictionary.
@@ -102,24 +103,25 @@ print(final_out)
102
103
 
103
104
  --- EXAMPLE2 ---
104
105
  plan1:
105
- - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
106
- - Use the 'owl_v2_image' tool with the prompt 'person' to detect where the people are in the video.
106
+ - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
107
+ - Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
107
108
  plan2:
108
- - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
109
+ - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
109
110
  - Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
110
111
  plan3:
111
- - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
112
+ - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
112
113
  - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
113
114
 
114
115
 
115
116
  ```python
116
117
  import numpy as np
117
- from vision_agent.tools import extract_frames, owl_v2_image, florence2_phrase_grounding, florence2_sam2_video_tracking
118
+ from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
118
119
 
119
120
  # sample at 1 FPS and use the first 10 frames to reduce processing time
120
- frames = extract_frames("video.mp4", 1)
121
- frames = [f[0] for f in frames][:10]
121
+ frames = extract_frames_and_timestamps("video.mp4", 1)
122
+ frames = [f["frame"] for f in frames][:10]
122
123
 
124
+ # strip arrays from the output to make it easier to read
123
125
  def remove_arrays(o):
124
126
  if isinstance(o, list):
125
127
  return [remove_arrays(e) for e in o]
@@ -130,18 +132,46 @@ def remove_arrays(o):
130
132
  else:
131
133
  return o
132
134
 
135
+ # return the counts of each label per frame to help determine the stability of the model results
136
+ def get_counts(preds):
137
+ counts = {{}}
138
+ for i, pred_frame in enumerate(preds):
139
+ counts_i = {{}}
140
+ for pred in pred_frame:
141
+ label = pred["label"].split(":")[1] if ":" in pred["label"] else pred["label"]
142
+ counts_i[label] = counts_i.get(label, 0) + 1
143
+ counts[f"frame_{{i}}"] = counts_i
144
+ return counts
145
+
146
+
133
147
  # plan1
134
- owl_v2_out = [owl_v2_image("person", f) for f in frames]
148
+ owl_v2_out = owl_v2_video("person", frames)
149
+ owl_v2_counts = get_counts(owl_v2_out)
135
150
 
136
151
  # plan2
137
152
  florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
153
+ florence2_counts = get_counts(florence2_out)
138
154
 
139
155
  # plan3
140
156
  f2s2_tracking_out = florence2_sam2_video_tracking("person", frames)
141
157
  remove_arrays(f2s2_tracking_out)
158
+ f2s2_counts = get_counts(f2s2_tracking_out)
159
+
160
+ final_out = {{
161
+ "owl_v2_video": owl_v2_out,
162
+ "florence2_phrase_grounding": florence2_out,
163
+ "florence2_sam2_video_tracking": f2s2_tracking_out,
164
+ }}
165
+
166
+ counts = {{
167
+ "owl_v2_video": owl_v2_counts,
168
+ "florence2_phrase_grounding": florence2_counts,
169
+ "florence2_sam2_video_tracking": f2s2_counts,
170
+ }}
142
171
 
143
- final_out = {{"owl_v2_image": owl_v2_out, "florence2_phrase_grounding": florence2_out, "florence2_sam2_video_tracking": f2s2_tracking_out}}
144
172
  print(final_out)
173
+ print(labels_and_scores)
174
+ print(counts)
145
175
  ```
146
176
  """
147
177
 
@@ -159,7 +189,7 @@ But got the following error or no stdout:
159
189
 
160
190
 
161
191
  PICK_PLAN = """
162
- **Role**: You are a software programmer.
192
+ **Role**: You are an advanced AI model that can understand the user request and construct plans to accomplish it.
163
193
 
164
194
  **Task**: Your responsibility is to pick the best plan from the three plans provided.
165
195
 
@@ -173,13 +203,14 @@ PICK_PLAN = """
173
203
  {tool_output}
174
204
 
175
205
  **Instructions**:
176
- 1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
177
- 2. Solve the problem yourself given the image and pick the plan that matches your solution the best.
206
+ 1. Re-read the user request, plans, tool outputs and examine the image.
207
+ 2. Solve the problem yourself given the image and pick the most accurate plan that matches your solution the best.
208
+ 3. Add modifications to improve the plan including: changing a tool, adding thresholds, string matching.
178
209
  4. Output a JSON object with the following format:
179
210
  {{
180
211
  "predicted_answer": str # the answer you would expect from the best plan
181
- "thoughts": str # your thought process for choosing the best plan
182
- "best_plan": str # the best plan you have chosen
212
+ "thoughts": str # your thought process for choosing the best plan over other plans and any modifications you made
213
+ "best_plan": str # the best plan you have chosen, must be `plan1`, `plan2`, or `plan3`
183
214
  }}
184
215
  """
185
216
 
@@ -201,15 +232,18 @@ This is the documentation for the functions you have access to. You may call any
201
232
  **User Instructions**:
202
233
  {question}
203
234
 
204
- **Tool Output**:
235
+ **Tool Tests and Outputs**:
205
236
  {tool_output}
206
237
 
238
+ **Tool Output Thoughts**:
239
+ {plan_thoughts}
240
+
207
241
  **Previous Feedback**:
208
242
  {feedback}
209
243
 
210
244
  **Instructions**:
211
245
  1. **Understand and Clarify**: Make sure you understand the task.
212
- 2. **Algorithm/Method Selection**: Decide on the most efficient way.
246
+ 2. **Algorithm/Method Selection**: Decide on the most efficient method, use the tool outputs and tool thoughts to guide you.
213
247
  3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode.
214
248
  4. **Code Generation**: Translate your pseudocode into executable Python code. Ensure you use correct arguments, remember coordinates are always returned normalized from `vision_agent.tools`. All images from `vision_agent.tools` are in RGB format, red is (255, 0, 0) and blue is (0, 0, 255).
215
249
  """
@@ -18,19 +18,24 @@ Here is an example of how you can interact with a user and Actions to complete a
18
18
  {examples}
19
19
  --- END EXAMPLES ---
20
20
 
21
- **Instructions**:
22
- 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
23
- 2. **Output in JSON**: Respond in JSON format, {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
24
-
25
21
  **Conversation**:
26
22
  Here is the current conversation so far:
27
23
  --- START CONVERSATION ---
28
24
  {conversation}
25
+ --- END CONVERSATION ---
26
+
27
+ **Instructions**:
28
+ 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
29
+ 2. **Output in JSON**: Respond in the following format in JSON:
30
+
31
+ ```json
32
+ {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
33
+ ```
29
34
  """
30
35
 
31
36
 
32
37
  EXAMPLES_CODE1 = """
33
- USER: Can you detect the dogs in this image? Media name dog.jpg
38
+ USER: Can you write code to detect the dogs in this image? Media name dog.jpg
34
39
 
35
40
  OBSERVATION:
36
41
  [Artifacts loaded]
@@ -61,6 +66,7 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask
61
66
  EXAMPLES_CODE1_EXTRA = """
62
67
  USER: The image only has one dog, can you fix this?
63
68
 
69
+ OBSERVATION:
64
70
  [Artifacts loaded]
65
71
  Artifact dog.jpg loaded to /path/to/images/dog.jpg
66
72
  Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
@@ -86,8 +92,24 @@ OBSERVATION:
86
92
  AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
87
93
  """
88
94
 
89
-
90
95
  EXAMPLES_CODE2 = """
96
+ USER: Can you describe this image?
97
+
98
+ OBSERVATION:
99
+ [Artifacts loaded]
100
+ Artifact image.jpg loaded to /path/to/images/image.jpg
101
+ [End of artifacts]
102
+
103
+ AGENT: {"thoughts": "The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.", "response": "<execute_python>view_media_artifact(artifacts, 'image.jpg')</execute_python>", "let_user_respond": false}
104
+
105
+ OBSERVATION:
106
+ [Image image.jpg displayed]
107
+
108
+ AGENT: {"thoughts": "The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.", "response": "The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?", "let_user_respond": true}
109
+ """
110
+
111
+
112
+ EXAMPLES_CODE3 = """
91
113
  USER: Can you create a function to count workers with helmets, return the count and save a visualization of the bounding boxes?
92
114
 
93
115
  OBSERVATION:
@@ -137,13 +159,13 @@ AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to
137
159
 
138
160
  USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
139
161
 
140
- AGENT: {"thoughts": "Because the user has supplied me with labels I can call florence2_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided <execute_python>florence2_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}], "phrase_grounding")</execute_python>", "let_user_respond": false}
162
+ AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided <execute_python>object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}
141
163
 
142
164
  OBSERVATION:
143
- [Florence2 fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
165
+ [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
144
166
 
145
167
 
146
- AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
168
+ AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will now update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
147
169
 
148
170
  OBSERVATION:
149
171
  [Artifact code.py edits]
@@ -1,2 +1,2 @@
1
- from .lmm import LMM, AzureOpenAILMM, ClaudeSonnetLMM, OllamaLMM, OpenAILMM
1
+ from .lmm import LMM, AnthropicLMM, AzureOpenAILMM, OllamaLMM, OpenAILMM
2
2
  from .types import Message
vision_agent/lmm/lmm.py CHANGED
@@ -1,5 +1,4 @@
1
1
  import json
2
- import logging
3
2
  import os
4
3
  from abc import ABC, abstractmethod
5
4
  from pathlib import Path
@@ -14,8 +13,6 @@ from vision_agent.utils.image_utils import encode_media
14
13
 
15
14
  from .types import Message
16
15
 
17
- _LOGGER = logging.getLogger(__name__)
18
-
19
16
 
20
17
  class LMM(ABC):
21
18
  @abstractmethod
@@ -45,11 +42,11 @@ class LMM(ABC):
45
42
 
46
43
 
47
44
  class OpenAILMM(LMM):
48
- r"""An LMM class for the OpenAI GPT-4 Vision model."""
45
+ r"""An LMM class for the OpenAI LMMs."""
49
46
 
50
47
  def __init__(
51
48
  self,
52
- model_name: str = "gpt-4o",
49
+ model_name: str = "gpt-4o-2024-05-13",
53
50
  api_key: Optional[str] = None,
54
51
  max_tokens: int = 4096,
55
52
  json_mode: bool = False,
@@ -365,8 +362,8 @@ class OllamaLMM(LMM):
365
362
  return resp["response"] # type: ignore
366
363
 
367
364
 
368
- class ClaudeSonnetLMM(LMM):
369
- r"""An LMM class for Anthropic's Claude Sonnet model."""
365
+ class AnthropicLMM(LMM):
366
+ r"""An LMM class for Anthropic's LMMs."""
370
367
 
371
368
  def __init__(
372
369
  self,
@@ -402,7 +399,7 @@ class ClaudeSonnetLMM(LMM):
402
399
  ]
403
400
  if "media" in msg:
404
401
  for media_path in msg["media"]:
405
- encoded_media = encode_media(media_path)
402
+ encoded_media = encode_media(media_path, resize=768)
406
403
  content.append(
407
404
  ImageBlockParam(
408
405
  type="image",
@@ -449,7 +446,7 @@ class ClaudeSonnetLMM(LMM):
449
446
  ]
450
447
  if media:
451
448
  for m in media:
452
- encoded_media = encode_media(m)
449
+ encoded_media = encode_media(m, resize=768)
453
450
  content.append(
454
451
  ImageBlockParam(
455
452
  type="image",
@@ -21,7 +21,7 @@ from .tools import (
21
21
  depth_anything_v2,
22
22
  detr_segmentation,
23
23
  dpt_hybrid_midas,
24
- extract_frames,
24
+ extract_frames_and_timestamps,
25
25
  florence2_image_caption,
26
26
  florence2_ocr,
27
27
  florence2_phrase_grounding,
@@ -486,6 +486,33 @@ def list_artifacts(artifacts: Artifacts) -> str:
486
486
  return output_str
487
487
 
488
488
 
489
+ def check_and_load_image(code: str) -> List[str]:
490
+ if not code.strip():
491
+ return []
492
+
493
+ pattern = r"view_media_artifact\(\s*([^\)]+),\s*['\"]([^\)]+)['\"]\s*\)"
494
+ match = re.search(pattern, code)
495
+ if match:
496
+ name = match.group(2)
497
+ return [name]
498
+ return []
499
+
500
+
501
+ def view_media_artifact(artifacts: Artifacts, name: str) -> str:
502
+ """Views the image artifact with the given name.
503
+
504
+ Parameters:
505
+ artifacts (Artifacts): The artifacts object to show the image from.
506
+ name (str): The name of the image artifact to show.
507
+ """
508
+ if name not in artifacts:
509
+ output_str = f"[Artifact {name} does not exist]"
510
+ else:
511
+ output_str = f"[Image {name} displayed]"
512
+ print(output_str)
513
+ return output_str
514
+
515
+
489
516
  def get_tool_descriptions() -> str:
490
517
  """Returns a description of all the tools that `generate_vision_code` has access to.
491
518
  Helpful for answering questions about what types of vision tasks you can do with
@@ -493,16 +520,15 @@ def get_tool_descriptions() -> str:
493
520
  return TOOL_DESCRIPTIONS
494
521
 
495
522
 
496
- def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
523
+ def object_detection_fine_tuning(bboxes: List[Dict[str, Any]]) -> str:
497
524
  """DO NOT use this function unless the user has supplied you with bboxes.
498
- 'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect
499
- objects in an image based on a given dataset. It returns the fine tuning job id.
525
+ 'object_detection_fine_tuning' is a tool that fine-tunes object detection models to
526
+ be able to detect objects in an image based on a given dataset. It returns the fine
527
+ tuning job id.
500
528
 
501
529
  Parameters:
502
530
  bboxes (List[BboxInput]): A list of BboxInput containing the image path, labels
503
531
  and bounding boxes. The coordinates are unnormalized.
504
- task (str): The florencev2 fine-tuning task. The options are
505
- 'phrase_grounding'.
506
532
 
507
533
  Returns:
508
534
  str: The fine tuning job id, this id will used to retrieve the fine tuned
@@ -510,12 +536,13 @@ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
510
536
 
511
537
  Example
512
538
  -------
513
- >>> fine_tuning_job_id = florencev2_fine_tuning(
539
+ >>> fine_tuning_job_id = object_detection_fine_tuning(
514
540
  [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
515
541
  {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
516
542
  "phrase_grounding"
517
543
  )
518
544
  """
545
+ task = "phrase_grounding"
519
546
  bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
520
547
  task_type = PromptTask[task.upper()]
521
548
  fine_tuning_request = [
@@ -531,7 +558,7 @@ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
531
558
  fine_tune_id = str(
532
559
  landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request)
533
560
  )
534
- print(f"[Florence2 fine tuning id: {fine_tune_id}]")
561
+ print(f"[Fine tuning id: {fine_tune_id}]")
535
562
  return fine_tune_id
536
563
 
537
564
 
@@ -564,7 +591,7 @@ def use_extra_vision_agent_args(
564
591
  Returns:
565
592
  str: The edited code.
566
593
  """
567
- generate_pattern = r"generate_vision_code\(\s*([^\)]+)\)"
594
+ generate_pattern = r"generate_vision_code\(\s*([^\)]+)\s*\)"
568
595
 
569
596
  def generate_replacer(match: re.Match) -> str:
570
597
  arg = match.group(1)
@@ -575,7 +602,7 @@ def use_extra_vision_agent_args(
575
602
  out_str += ")"
576
603
  return out_str
577
604
 
578
- edit_pattern = r"edit_vision_code\(\s*([^\)]+)\)"
605
+ edit_pattern = r"edit_vision_code\(\s*([^\)]+)\s*\)"
579
606
 
580
607
  def edit_replacer(match: re.Match) -> str:
581
608
  arg = match.group(1)
@@ -591,48 +618,52 @@ def use_extra_vision_agent_args(
591
618
  return new_code
592
619
 
593
620
 
594
- def use_florence2_fine_tuning(
595
- artifacts: Artifacts, name: str, task: str, fine_tune_id: str
621
+ def use_object_detection_fine_tuning(
622
+ artifacts: Artifacts, name: str, fine_tune_id: str
596
623
  ) -> str:
597
- """Replaces florence2 calls with the fine tuning id. This ensures that the code
598
- utilizes the fined tuned florence2 model. Returns the diff between the original
599
- code and the new code.
624
+ """Replaces calls to 'owl_v2_image', 'florence2_phrase_grounding' and
625
+ 'florence2_sam2_image' with the fine tuning id. This ensures that the code utilizes
626
+ the fine tuned florence2 model. Returns the diff between the original code and the
627
+ new code.
600
628
 
601
629
  Parameters:
602
630
  artifacts (Artifacts): The artifacts object to edit the code from.
603
631
  name (str): The name of the artifact to edit.
604
- task (str): The task to fine tune the model for. The options are
605
- 'phrase_grounding'.
606
632
  fine_tune_id (str): The fine tuning job id.
607
633
 
608
634
  Examples
609
635
  --------
610
- >>> diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf")
636
+ >>> diff = use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
611
637
  """
612
638
 
613
- task_to_fn = {"phrase_grounding": "florence2_phrase_grounding"}
614
-
615
639
  if name not in artifacts:
616
640
  output_str = f"[Artifact {name} does not exist]"
617
641
  print(output_str)
618
642
  return output_str
619
643
 
620
644
  code = artifacts[name]
621
- if task.lower() == "phrase_grounding":
622
- pattern = r"florence2_phrase_grounding\(\s*([^\)]+)\)"
623
-
624
- def replacer(match: re.Match) -> str:
625
- arg = match.group(1) # capture all initial arguments
626
- return f'florence2_phrase_grounding({arg}, "{fine_tune_id}")'
627
-
628
- else:
629
- raise ValueError(f"Task {task} is not supported.")
645
+ patterns = [
646
+ (
647
+ r"florence2_phrase_grounding\(\s*([^\)]+)\s*\)",
648
+ lambda match: f'florence2_phrase_grounding({match.group(1)}, "{fine_tune_id}")',
649
+ ),
650
+ (
651
+ r"owl_v2_image\(\s*([^\)]+)\s*\)",
652
+ lambda match: f'owl_v2_image({match.group(1)}, "{fine_tune_id}")',
653
+ ),
654
+ (
655
+ r"florence2_sam2_image\(\s*([^\)]+)\s*\)",
656
+ lambda match: f'florence2_sam2_image({match.group(1)}, "{fine_tune_id}")',
657
+ ),
658
+ ]
630
659
 
631
- new_code = re.sub(pattern, replacer, code)
660
+ new_code = code
661
+ for pattern, replacer in patterns:
662
+ new_code = re.sub(pattern, replacer, new_code)
632
663
 
633
664
  if new_code == code:
634
665
  output_str = (
635
- f"[Fine tuning task {task} function {task_to_fn[task]} not found in code]"
666
+ f"[No function calls to replace with fine tuning id in artifact {name}]"
636
667
  )
637
668
  print(output_str)
638
669
  return output_str
@@ -662,8 +693,9 @@ META_TOOL_DOCSTRING = get_tool_documentation(
662
693
  generate_vision_code,
663
694
  edit_vision_code,
664
695
  write_media_artifact,
665
- florence2_fine_tuning,
666
- use_florence2_fine_tuning,
696
+ view_media_artifact,
697
+ object_detection_fine_tuning,
698
+ use_object_detection_fine_tuning,
667
699
  list_artifacts,
668
700
  ]
669
701
  )
@@ -208,19 +208,25 @@ def _call_post(
208
208
  if files:
209
209
  files_in_b64 = [(file[0], b64encode(file[1]).decode("utf-8")) for file in files]
210
210
  try:
211
+ if files is not None:
212
+ response = session.post(url, data=payload, files=files)
213
+ else:
214
+ response = session.post(url, json=payload)
215
+
216
+ # make sure function_name is in the payload so we can display it
217
+ tool_call_trace_payload = (
218
+ payload
219
+ if "function_name" in payload
220
+ else {**payload, **{"function_name": function_name}}
221
+ )
211
222
  tool_call_trace = ToolCallTrace(
212
223
  endpoint_url=url,
213
- request=payload,
224
+ request=tool_call_trace_payload,
214
225
  response={},
215
226
  error=None,
216
227
  files=files_in_b64,
217
228
  )
218
229
 
219
- if files is not None:
220
- response = session.post(url, data=payload, files=files)
221
- else:
222
- response = session.post(url, json=payload)
223
-
224
230
  if response.status_code != 200:
225
231
  tool_call_trace.error = Error(
226
232
  name="RemoteToolCallFailed",