vision-agent 0.2.121__py3-none-any.whl → 0.2.122__py3-none-any.whl
- vision_agent/agent/vision_agent.py +10 -6
- vision_agent/agent/vision_agent_coder.py +1 -9
- vision_agent/agent/vision_agent_prompts.py +3 -3
- vision_agent/tools/meta_tools.py +140 -8
- vision_agent/tools/tools.py +32 -124
- vision_agent/tools/tools_types.py +3 -10
- vision_agent/utils/execute.py +13 -4
- {vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/METADATA +1 -1
- {vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/RECORD +11 -11
- {vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent.py CHANGED

@@ -30,7 +30,7 @@ class BoilerplateCode:
     pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact",
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning",
         "artifacts = Artifacts('{remote_path}')",
         "artifacts.load('{remote_path}')",
     ]

@@ -76,11 +76,16 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:

 def run_code_action(
     code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
-) -> Execution:
-
+) -> Tuple[Execution, str]:
+    result = code_interpreter.exec_isolation(
         BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
     )

+    obs = str(result.logs)
+    if result.error:
+        obs += f"\n{result.error}"
+    return result, obs
+

 def parse_execution(response: str) -> Optional[str]:
     code = None

@@ -192,7 +197,7 @@ class VisionAgent(Agent):
         artifacts = Artifacts(WORKSPACE / "artifacts.pkl")

         with CodeInterpreterFactory.new_instance(
-            code_sandbox_runtime=self.code_sandbox_runtime
+            code_sandbox_runtime=self.code_sandbox_runtime,
         ) as code_interpreter:
             orig_chat = copy.deepcopy(chat)
             int_chat = copy.deepcopy(chat)

@@ -260,10 +265,9 @@ class VisionAgent(Agent):
                 code_action = parse_execution(response["response"])

                 if code_action is not None:
-                    result = run_code_action(
+                    result, obs = run_code_action(
                         code_action, code_interpreter, str(remote_artifacts_path)
                     )
-                    obs = str(result.logs)

                 if self.verbosity >= 1:
                     _LOGGER.info(obs)
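Taken together, the vision_agent.py changes make run_code_action build the observation string itself (logs plus any error) and return it next to the Execution, so the chat loop above only unpacks a tuple. A minimal, self-contained sketch of that contract follows; FakeExecution and run_code_action_sketch are illustrative stand-ins, not the library's classes.

# Sketch of the (Execution, str) return contract introduced in this release.
# FakeExecution is a hypothetical stand-in for vision_agent's Execution type.
from dataclasses import dataclass
from typing import Optional, Tuple


@dataclass
class FakeExecution:
    logs: str
    error: Optional[str] = None


def run_code_action_sketch(code: str) -> Tuple[FakeExecution, str]:
    result = FakeExecution(logs=f"ran: {code!r}")
    obs = str(result.logs)          # observation starts from the logs
    if result.error:
        obs += f"\n{result.error}"  # ...and appends any error text
    return result, obs


result, obs = run_code_action_sketch("print('hello')")
print(obs)  # callers log obs directly instead of rebuilding it from result.logs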
vision_agent/agent/vision_agent_coder.py CHANGED

@@ -1,5 +1,4 @@
 import copy
-import difflib
 import logging
 import os
 import sys

@@ -29,6 +28,7 @@ from vision_agent.agent.vision_agent_coder_prompts import (
     USER_REQ,
 )
 from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OllamaLMM, OpenAILMM
+from vision_agent.tools.meta_tools import get_diff
 from vision_agent.utils import CodeInterpreterFactory, Execution
 from vision_agent.utils.execute import CodeInterpreter
 from vision_agent.utils.image_utils import b64_to_pil

@@ -63,14 +63,6 @@ class DefaultImports:
         return DefaultImports.to_code_string() + "\n\n" + code


-def get_diff(before: str, after: str) -> str:
-    return "".join(
-        difflib.unified_diff(
-            before.splitlines(keepends=True), after.splitlines(keepends=True)
-        )
-    )
-
-
 def format_memory(memory: List[Dict[str, str]]) -> str:
     output_str = ""
     for i, m in enumerate(memory):
vision_agent/agent/vision_agent_prompts.py CHANGED

@@ -48,7 +48,7 @@ OBSERVATION:
 4| return dogs
 [End of artifact]

-AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----

@@ -75,7 +75,7 @@ OBSERVATION:
 4| return dogs
 [End of artifact]

-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----

@@ -126,7 +126,7 @@ OBSERVATION:
 15| return count
 [End of artifact]

-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
vision_agent/tools/meta_tools.py CHANGED

@@ -1,5 +1,7 @@
+import difflib
 import os
 import pickle as pkl
+import re
 import subprocess
 import tempfile
 from pathlib import Path

@@ -8,10 +10,13 @@ from typing import Any, Dict, List, Optional, Union
 from IPython.display import display

 import vision_agent as va
+from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
+from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
 from vision_agent.utils.execute import Execution, MimeType
+from vision_agent.utils.image_utils import convert_to_b64

 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent

@@ -99,13 +104,14 @@ class Artifacts:

     def show(self) -> str:
         """Shows the artifacts that have been loaded and their remote save paths."""
-
+        output_str = "[Artifacts loaded]\n"
         for k in self.artifacts.keys():
-
+            output_str += (
                 f"Artifact {k} loaded to {str(self.remote_save_path.parent / k)}\n"
             )
-
-
+        output_str += "[End of artifacts]\n"
+        print(output_str)
+        return output_str

     def save(self, local_path: Optional[Union[str, Path]] = None) -> None:
         save_path = (

@@ -135,7 +141,12 @@ def format_lines(lines: List[str], start_idx: int) -> str:


 def view_lines(
-    lines: List[str],
+    lines: List[str],
+    line_num: int,
+    window_size: int,
+    name: str,
+    total_lines: int,
+    print_output: bool = True,
 ) -> str:
     start = max(0, line_num - window_size)
     end = min(len(lines), line_num + window_size)

@@ -148,7 +159,9 @@ def view_lines(
             else f"[{len(lines) - end} more lines]"
         )
     )
-
+
+    if print_output:
+        print(return_str)
     return return_str


@@ -231,7 +244,7 @@ def edit_code_artifact(
     new_content_lines = [
         line if line.endswith("\n") else line + "\n" for line in new_content_lines
     ]
-    lines = artifacts[name].splitlines()
+    lines = artifacts[name].splitlines(keepends=True)
     edited_lines = lines[:start] + new_content_lines + lines[end:]

     cur_line = start + len(content.split("\n")) // 2

@@ -261,13 +274,20 @@ def edit_code_artifact(
             DEFAULT_WINDOW_SIZE,
             name,
             total_lines,
+            print_output=False,
         )
         total_lines_edit = sum(1 for _ in edited_lines)
         edited_view = view_lines(
-            edited_lines,
+            edited_lines,
+            cur_line,
+            DEFAULT_WINDOW_SIZE,
+            name,
+            total_lines_edit,
+            print_output=False,
         )

         error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}"
+        print(error_msg)
         return error_msg

     artifacts[name] = "".join(edited_lines)

@@ -390,6 +410,13 @@ def write_media_artifact(artifacts: Artifacts, local_path: str) -> str:
     return f"[Media {Path(local_path).name} saved]"


+def list_artifacts(artifacts: Artifacts) -> str:
+    """Lists all the artifacts that have been loaded into the artifacts object."""
+    output_str = artifacts.show()
+    print(output_str)
+    return output_str
+
+
 def get_tool_descriptions() -> str:
     """Returns a description of all the tools that `generate_vision_code` has access to.
     Helpful for answering questions about what types of vision tasks you can do with

@@ -397,6 +424,108 @@ def get_tool_descriptions() -> str:
     return TOOL_DESCRIPTIONS


+def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
+    """'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect
+    objects in an image based on a given dataset. It returns the fine tuning job id.
+
+    Parameters:
+        bboxes (List[BboxInput]): A list of BboxInput containing the
+            image path, labels and bounding boxes.
+        task (str): The florencev2 fine-tuning task. The options are
+            'phrase_grounding'.
+
+    Returns:
+        UUID: The fine tuning job id, this id will used to retrieve the fine
+            tuned model.
+
+    Example
+    -------
+        >>> fine_tuning_job_id = florencev2_fine_tuning(
+            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
+             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
+            "phrase_grounding"
+        )
+    """
+    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+    task_type = PromptTask[task.upper()]
+    fine_tuning_request = [
+        BboxInputBase64(
+            image=convert_to_b64(bbox_input.image_path),
+            filename=Path(bbox_input.image_path).name,
+            labels=bbox_input.labels,
+            bboxes=bbox_input.bboxes,
+        )
+        for bbox_input in bboxes_input
+    ]
+    landing_api = LandingPublicAPI()
+    fine_tune_id = str(
+        landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request)
+    )
+    print(f"[Florence2 fine tuning id: {fine_tune_id}]")
+    return fine_tune_id
+
+
+def get_diff(before: str, after: str) -> str:
+    return "".join(
+        difflib.unified_diff(
+            before.splitlines(keepends=True), after.splitlines(keepends=True)
+        )
+    )
+
+
+def use_florence2_fine_tuning(
+    artifacts: Artifacts, name: str, task: str, fine_tune_id: str
+) -> str:
+    """Replaces florence2 calls with the fine tuning id. This ensures that the code
+    utilizes the fined tuned florence2 model. Returns the diff between the original
+    code and the new code.
+
+    Parameters:
+        artifacts (Artifacts): The artifacts object to edit the code from.
+        name (str): The name of the artifact to edit.
+        task (str): The task to fine tune the model for. The options are
+            'phrase_grounding'.
+        fine_tune_id (str): The fine tuning job id.
+
+    Examples
+    --------
+        >>> diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf")
+    """
+
+    task_to_fn = {"phrase_grounding": "florence2_phrase_grounding"}
+
+    if name not in artifacts:
+        output_str = f"[Artifact {name} does not exist]"
+        print(output_str)
+        return output_str
+
+    code = artifacts[name]
+    if task.lower() == "phrase_grounding":
+        pattern = r"florence2_phrase_grounding\(\s*([^\)]+)\)"
+
+        def replacer(match: re.Match) -> str:
+            arg = match.group(1)  # capture all initial arguments
+            return f'florence2_phrase_grounding({arg}, "{fine_tune_id}")'
+
+    else:
+        raise ValueError(f"Task {task} is not supported.")
+
+    new_code = re.sub(pattern, replacer, code)
+
+    if new_code == code:
+        output_str = (
+            f"[Fine tuning task {task} function {task_to_fn[task]} not found in code]"
+        )
+        print(output_str)
+        return output_str
+
+    artifacts[name] = new_code
+
+    diff = get_diff(code, new_code)
+    print(diff)
+    return diff
+
+
 META_TOOL_DOCSTRING = get_tool_documentation(
     [
         get_tool_descriptions,

@@ -406,5 +535,8 @@ META_TOOL_DOCSTRING = get_tool_documentation(
         generate_vision_code,
         edit_vision_code,
         write_media_artifact,
+        florence2_fine_tuning,
+        use_florence2_fine_tuning,
+        list_artifacts,
     ]
 )
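The new use_florence2_fine_tuning meta tool works by rewriting florence2_phrase_grounding(...) calls inside an artifact so they carry the fine-tune id as an extra argument, then returning the diff. A standalone sketch of that regex rewrite is below; the sample code string is made up, and the id is the example value from the docstring above.

# Standalone sketch of the re.sub rewrite that use_florence2_fine_tuning performs.
import re

fine_tune_id = "23b3b022-5ebf-4798-9373-20ef36429abf"  # example id from the docstring
code = "dets = florence2_phrase_grounding('screw', image)"

pattern = r"florence2_phrase_grounding\(\s*([^\)]+)\)"


def replacer(match: re.Match) -> str:
    arg = match.group(1)  # everything between the parentheses
    return f'florence2_phrase_grounding({arg}, "{fine_tune_id}")'


new_code = re.sub(pattern, replacer, code)
print(new_code)
# dets = florence2_phrase_grounding('screw', image, "23b3b022-5ebf-4798-9373-20ef36429abf")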
vision_agent/tools/tools.py CHANGED

@@ -28,10 +28,8 @@ from vision_agent.tools.tool_utils import (
     filter_bboxes_by_threshold,
 )
 from vision_agent.tools.tools_types import (
-    BboxInput,
-    BboxInputBase64,
     FineTuning,
-    Florencev2FtRequest,
+    Florence2FtRequest,
     JobStatus,
     PromptTask,
     ODResponseData,

@@ -867,7 +865,9 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task]  # type: ignore


-def florence2_phrase_grounding(
+def florence2_phrase_grounding(
+    prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
+) -> List[Dict[str, Any]]:
     """'florence2_phrase_grounding' is a tool that can detect multiple
     objects given a text prompt which can be object names or caption. You
     can optionally separate the object names in the text with commas. It returns a list

@@ -877,6 +877,8 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str,
     Parameters:
         prompt (str): The prompt to ground to the image.
         image (np.ndarray): The image to used to detect objects
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.

     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and

@@ -895,14 +897,33 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str,
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
-    data = {
-        "image": image_b64,
-        "task": "<CAPTION_TO_PHRASE_GROUNDING>",
-        "prompt": prompt,
-        "function_name": "florence2_phrase_grounding",
-    }

-
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        data_obj = Florence2FtRequest(
+            image=image_b64,
+            task=PromptTask.PHRASE_GROUNDING,
+            tool="florencev2_fine_tuning",
+            prompt=prompt,
+            fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+        )
+        data = data_obj.model_dump(by_alias=True)
+        detections = send_inference_request(data, "tools", v2=False)
+    else:
+        data = {
+            "image": image_b64,
+            "task": "<CAPTION_TO_PHRASE_GROUNDING>",
+            "prompt": prompt,
+            "function_name": "florence2_phrase_grounding",
+        }
+        detections = send_inference_request(data, "florence2", v2=True)
+
     detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
     return_data = []
     for i in range(len(detections["bboxes"])):

@@ -1732,119 +1753,6 @@ def overlay_counting_results(
     return np.array(pil_image)


-# TODO: add this function to the imports so that is picked in the agent
-def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
-    """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
-    to detect objects in an image based on a given dataset. It returns the fine
-    tuning job id.
-
-    Parameters:
-        bboxes (List[BboxInput]): A list of BboxInput containing the
-            image path, labels and bounding boxes.
-        task (PromptTask): The florencev2 fine-tuning task. The options are
-            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-
-    Returns:
-        UUID: The fine tuning job id, this id will used to retrieve the fine
-            tuned model.
-
-    Example
-    -------
-        >>> fine_tuning_job_id = florencev2_fine_tuning(
-            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
-             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
-            "OBJECT_DETECTION"
-        )
-    """
-    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
-    task_input = PromptTask[task]
-    fine_tuning_request = [
-        BboxInputBase64(
-            image=convert_to_b64(bbox_input.image_path),
-            filename=bbox_input.image_path.split("/")[-1],
-            labels=bbox_input.labels,
-            bboxes=bbox_input.bboxes,
-        )
-        for bbox_input in bboxes_input
-    ]
-    landing_api = LandingPublicAPI()
-    return landing_api.launch_fine_tuning_job(
-        "florencev2", task_input, fine_tuning_request
-    )
-
-
-# TODO: add this function to the imports so that is picked in the agent
-def florencev2_fine_tuned_object_detection(
-    image: np.ndarray, prompt: str, model_id: UUID, task: str
-) -> List[Dict[str, Any]]:
-    """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
-    to detect objects given a text prompt such as a phrase or class names separated by
-    commas. It returns a list of detected objects as labels and their location as
-    bounding boxes with score of 1.0.
-
-    Parameters:
-        image (np.ndarray): The image to used to detect objects.
-        prompt (str): The prompt to help find objects in the image.
-        model_id (UUID): The fine-tuned model id.
-        task (PromptTask): The florencev2 fine-tuning task. The options are
-            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
-            bounding box of the detected objects with normalized coordinates between 0
-            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
-            top-left and xmax and ymax are the coordinates of the bottom-right of the
-            bounding box. The scores are always 1.0 and cannot be thresholded
-
-    Example
-    -------
-        >>> florencev2_fine_tuned_object_detection(
-            image,
-            'person looking at a coyote',
-            UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
-        )
-        [
-            {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
-            {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
-        ]
-    """
-    # check if job succeeded first
-    landing_api = LandingPublicAPI()
-    status = landing_api.check_fine_tuning_job(model_id)
-    if status is not JobStatus.SUCCEEDED:
-        raise FineTuneModelIsNotReady()
-
-    task = PromptTask[task]
-    if task is PromptTask.OBJECT_DETECTION:
-        prompt = ""
-
-    data_obj = Florencev2FtRequest(
-        image=convert_to_b64(image),
-        task=task,
-        tool="florencev2_fine_tuning",
-        prompt=prompt,
-        fine_tuning=FineTuning(job_id=model_id),
-    )
-    data = data_obj.model_dump(by_alias=True)
-    metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"}
-    detections = send_inference_request(
-        data, "tools", v2=False, metadata_payload=metadata_payload
-    )
-
-    detections = detections[task.value]
-    return_data = []
-    image_size = image.shape[:2]
-    for i in range(len(detections["bboxes"])):
-        return_data.append(
-            {
-                "score": 1.0,
-                "label": detections["labels"][i],
-                "bbox": normalize_bbox(detections["bboxes"][i], image_size),
-            }
-        )
-    return return_data
-
-
 FUNCTION_TOOLS = [
     owl_v2,
     extract_frames,
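In tools.py, florence2_phrase_grounding now routes on the optional fine_tune_id: with an id it checks the fine-tuning job status and builds a Florence2FtRequest payload for the legacy "tools" endpoint, otherwise it keeps the original "florence2" request. A simplified, runnable sketch of that routing decision follows; build_request and its return shape are illustrative only, not the library API.

# Illustrative sketch of the optional fine_tune_id routing; the endpoint names
# mirror the diff above, everything else is a stand-in.
from typing import Any, Dict, Optional, Tuple


def build_request(prompt: str, fine_tune_id: Optional[str] = None) -> Tuple[str, Dict[str, Any]]:
    if fine_tune_id is not None:
        # the real code first verifies the fine-tuning job has SUCCEEDED and
        # raises FineTuneModelIsNotReady otherwise
        data = {
            "task": "<CAPTION_TO_PHRASE_GROUNDING>",
            "prompt": prompt,
            "fine_tuning": {"job_id": fine_tune_id},
        }
        return "tools", data  # legacy endpoint (v2=False in the diff)
    data = {
        "task": "<CAPTION_TO_PHRASE_GROUNDING>",
        "prompt": prompt,
        "function_name": "florence2_phrase_grounding",
    }
    return "florence2", data  # default endpoint (v2=True in the diff)


endpoint, payload = build_request("screw", "23b3b022-5ebf-4798-9373-20ef36429abf")
print(endpoint, payload)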
vision_agent/tools/tools_types.py CHANGED

@@ -19,16 +19,9 @@ class BboxInputBase64(BaseModel):


 class PromptTask(str, Enum):
-    """
-    Valid task prompts options for the Florencev2 model.
-    """
+    """Valid task prompts options for the Florence2 model."""

-
-    """"""
-    CAPTION_TO_PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
-    """"""
-    OBJECT_DETECTION = "<OD>"
-    """"""
+    PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"


 class FineTuning(BaseModel):

@@ -41,7 +34,7 @@ class FineTuning(BaseModel):
         return str(job_id)


-class Florencev2FtRequest(BaseModel):
+class Florence2FtRequest(BaseModel):
     model_config = ConfigDict(populate_by_name=True)

     image: str
vision_agent/utils/execute.py CHANGED

@@ -564,7 +564,13 @@ class LocalCodeInterpreter(CodeInterpreter):
     ) -> None:
         super().__init__(timeout=timeout)
         self.nb = nbformat.v4.new_notebook()
-
+        # Set the notebook execution path to the remote path
+        self.resources = {"metadata": {"path": str(self.remote_path)}}
+        self.nb_client = NotebookClient(
+            self.nb,
+            timeout=self.timeout,
+            resources=self.resources,
+        )
         _LOGGER.info(
             f"""Local code interpreter initialized
 Python version: {sys.version}

@@ -606,7 +612,9 @@ Timeout: {self.timeout}"""
     def restart_kernel(self) -> None:
         self.close()
         self.nb = nbformat.v4.new_notebook()
-        self.nb_client = NotebookClient(
+        self.nb_client = NotebookClient(
+            self.nb, timeout=self.timeout, resources=self.resources
+        )
         sleep(1)
         self._new_kernel()

@@ -636,7 +644,7 @@ Timeout: {self.timeout}"""
             f.write(contents)
         _LOGGER.info(f"File ({file_path}) is uploaded to: {str(self.remote_path)}")

-        return Path(self.remote_path / file_path)
+        return Path(self.remote_path / Path(file_path).name)

     def download_file(
         self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]

@@ -672,7 +680,8 @@ class CodeInterpreterFactory:

     @staticmethod
     def new_instance(
-        code_sandbox_runtime: Optional[str] = None,
+        code_sandbox_runtime: Optional[str] = None,
+        remote_path: Optional[Union[str, Path]] = None,
     ) -> CodeInterpreter:
         if not code_sandbox_runtime:
             code_sandbox_runtime = os.getenv("CODE_SANDBOX_RUNTIME", "local")
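The execute.py change pins the local notebook interpreter's working directory by passing resources={"metadata": {"path": ...}} to NotebookClient, and reuses those resources when the kernel restarts. A minimal sketch of that setup, assuming nbformat and nbclient are installed; the directory is a placeholder.

# Minimal sketch: set the notebook execution path via NotebookClient resources.
# "/tmp" stands in for the remote/artifacts directory the library would use.
import nbformat
from nbclient import NotebookClient

nb = nbformat.v4.new_notebook()
resources = {"metadata": {"path": "/tmp"}}
client = NotebookClient(nb, timeout=600, resources=resources)
# client.execute() would then run the notebook's cells with /tmp as the working directory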
{vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/RECORD CHANGED

@@ -2,10 +2,10 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
 vision_agent/agent/__init__.py,sha256=FRwiux1FGvGccetyUCtY46KP01fQteqorm-JtFepovI,176
 vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
 vision_agent/agent/agent_utils.py,sha256=22LiPhkJlS5mVeo2dIi259pc2NgA7PGHRpcbnrtKo78,1930
-vision_agent/agent/vision_agent.py,sha256=
-vision_agent/agent/vision_agent_coder.py,sha256=
+vision_agent/agent/vision_agent.py,sha256=WM1_o0VAQokAKlDr-0lpFxCRwUm_eFfFNWP-wSNjo7s,11180
+vision_agent/agent/vision_agent_coder.py,sha256=ujctkpmQkX2C6YXjlp7VLZFqSB00xwkGe-9swA8Gv8s,34240
 vision_agent/agent/vision_agent_coder_prompts.py,sha256=Rg7-Ih7oFgFbHFFno0EHpaZEgm0SYj_nTdqqdp21YLo,11246
-vision_agent/agent/vision_agent_prompts.py,sha256=
+vision_agent/agent/vision_agent_prompts.py,sha256=K1nLo3XKQ-IqCom1TRwh3cMoGZNxNwEgZqf3uJ6eL18,7221
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
 vision_agent/clients/landing_public_api.py,sha256=rGtACkr8o5egDuMHQ5MBO4NuvsgPTp9Ew3rbq4R-vs0,1507

@@ -15,19 +15,19 @@ vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,
 vision_agent/lmm/lmm.py,sha256=H3a5V7c073-vXRJfQOblE2j_CsZkH1CNNRoQgLjJZuQ,20751
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
 vision_agent/tools/__init__.py,sha256=TILaqdFYicScvpnCXMxgBsFmSW22NQDIvucvEgo0etw,2289
-vision_agent/tools/meta_tools.py,sha256=
+vision_agent/tools/meta_tools.py,sha256=KeGiw2OtY8ARpGbtWjoNAoO1dwevt7LbCupaJX61MkE,18929
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=e_p-G2nwgWOpoaqpDitY3FJ6fFuTEg5GhDOD67wI2bE,7527
-vision_agent/tools/tools.py,sha256=
-vision_agent/tools/tools_types.py,sha256=
+vision_agent/tools/tools.py,sha256=jOBsuN-spY_2TlvpahoRYGvyInhQDTPXXukx9q72lEU,63454
+vision_agent/tools/tools_types.py,sha256=qs11HGLRXc9zytahBtG6TQxCh8Gigvn232at3jk54jI,2356
 vision_agent/utils/__init__.py,sha256=pWk0ktvR4aUEhuEIzSLM9kSgW4WDVqptdvOTeGLkJ6M,230
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
-vision_agent/utils/execute.py,sha256=
+vision_agent/utils/execute.py,sha256=gc4R_0BKUrZyhiKvIxOpYuzQPYVWQEqxr3ANy1lJAw4,27037
 vision_agent/utils/image_utils.py,sha256=UloC4byIQLM4CSCaH41SBciQ7X2OqKvsVvNOVKqIH_k,9856
 vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.122.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.122.dist-info/METADATA,sha256=WMdLNPyKY4Ot6ifOzwXNDiVm2TsStY-l-ge8t72Ynhk,12255
+vision_agent-0.2.122.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.122.dist-info/RECORD,,
{vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/LICENSE: File without changes
{vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/WHEEL: File without changes