vision-agent 0.2.121__py3-none-any.whl → 0.2.122__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- vision_agent/agent/vision_agent.py +10 -6
- vision_agent/agent/vision_agent_coder.py +1 -9
- vision_agent/agent/vision_agent_prompts.py +3 -3
- vision_agent/tools/meta_tools.py +140 -8
- vision_agent/tools/tools.py +32 -124
- vision_agent/tools/tools_types.py +3 -10
- vision_agent/utils/execute.py +13 -4
- {vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/METADATA +1 -1
- {vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/RECORD +11 -11
- {vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/WHEEL +0 -0
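At a glance: this release promotes Florence-2 fine-tuning from TODO-gated helpers in vision_agent/tools/tools.py into proper meta tools (florence2_fine_tuning and use_florence2_fine_tuning, plus a new list_artifacts), threads an optional fine_tune_id through florence2_phrase_grounding, changes run_code_action to return its observation string alongside the Execution, and fixes working-directory and upload-path handling in the local code interpreter.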
vision_agent/agent/vision_agent.py CHANGED

@@ -30,7 +30,7 @@ class BoilerplateCode:
     pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact",
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning",
         "artifacts = Artifacts('{remote_path}')",
         "artifacts.load('{remote_path}')",
     ]
@@ -76,11 +76,16 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
 
 def run_code_action(
     code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
-) -> Execution:
-    return code_interpreter.exec_isolation(
+) -> Tuple[Execution, str]:
+    result = code_interpreter.exec_isolation(
         BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
     )
 
+    obs = str(result.logs)
+    if result.error:
+        obs += f"\n{result.error}"
+    return result, obs
+
 
 def parse_execution(response: str) -> Optional[str]:
     code = None
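For context, a minimal sketch of the new calling convention, assuming vision-agent 0.2.122 is installed (the surrounding agent loop is elided):

# run_code_action now returns (Execution, observation) so callers no longer
# rebuild the observation string from result.logs themselves.
from vision_agent.agent.vision_agent import run_code_action
from vision_agent.utils import CodeInterpreterFactory

with CodeInterpreterFactory.new_instance() as interpreter:
    result, obs = run_code_action("print('hello')", interpreter, "artifacts.pkl")
    # obs is str(result.logs), with result.error appended when present
    print(obs)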
@@ -192,7 +197,7 @@ class VisionAgent(Agent):
         artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
 
         with CodeInterpreterFactory.new_instance(
-            code_sandbox_runtime=self.code_sandbox_runtime
+            code_sandbox_runtime=self.code_sandbox_runtime,
         ) as code_interpreter:
             orig_chat = copy.deepcopy(chat)
             int_chat = copy.deepcopy(chat)
@@ -260,10 +265,9 @@ class VisionAgent(Agent):
             code_action = parse_execution(response["response"])
 
             if code_action is not None:
-                result = run_code_action(
+                result, obs = run_code_action(
                     code_action, code_interpreter, str(remote_artifacts_path)
                 )
-                obs = str(result.logs)
 
                 if self.verbosity >= 1:
                     _LOGGER.info(obs)
vision_agent/agent/vision_agent_coder.py CHANGED

@@ -1,5 +1,4 @@
 import copy
-import difflib
 import logging
 import os
 import sys

@@ -29,6 +28,7 @@ from vision_agent.agent.vision_agent_coder_prompts import (
     USER_REQ,
 )
 from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OllamaLMM, OpenAILMM
+from vision_agent.tools.meta_tools import get_diff
 from vision_agent.utils import CodeInterpreterFactory, Execution
 from vision_agent.utils.execute import CodeInterpreter
 from vision_agent.utils.image_utils import b64_to_pil
@@ -63,14 +63,6 @@ class DefaultImports:
         return DefaultImports.to_code_string() + "\n\n" + code
 
 
-def get_diff(before: str, after: str) -> str:
-    return "".join(
-        difflib.unified_diff(
-            before.splitlines(keepends=True), after.splitlines(keepends=True)
-        )
-    )
-
-
 def format_memory(memory: List[Dict[str, str]]) -> str:
     output_str = ""
     for i, m in enumerate(memory):
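The helper is not dropped: it moves into vision_agent/tools/meta_tools.py (see that file's diff below), so the coder now imports it instead of defining its own copy. For reference, the function is self-contained and behaves like this:

import difflib

def get_diff(before: str, after: str) -> str:
    # keepends=True preserves each line's newline so the joined
    # unified-diff output stays line-accurate
    return "".join(
        difflib.unified_diff(
            before.splitlines(keepends=True), after.splitlines(keepends=True)
        )
    )

print(get_diff("a\nb\n", "a\nc\n"))  # ---/+++ headers, then " a", "-b", "+c"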
vision_agent/agent/vision_agent_prompts.py CHANGED

@@ -48,7 +48,7 @@ OBSERVATION:
 4| return dogs
 [End of artifact]
 
-AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----

@@ -75,7 +75,7 @@ OBSERVATION:
 4| return dogs
 [End of artifact]
 
-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----

@@ -126,7 +126,7 @@ OBSERVATION:
 15| return count
 [End of artifact]
 
-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
vision_agent/tools/meta_tools.py CHANGED

@@ -1,5 +1,7 @@
+import difflib
 import os
 import pickle as pkl
+import re
 import subprocess
 import tempfile
 from pathlib import Path

@@ -8,10 +10,13 @@ from typing import Any, Dict, List, Optional, Union
 from IPython.display import display
 
 import vision_agent as va
+from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
+from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
 from vision_agent.utils.execute import Execution, MimeType
+from vision_agent.utils.image_utils import convert_to_b64
 
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
 
@@ -99,13 +104,14 @@ class Artifacts:
 
     def show(self) -> str:
         """Shows the artifacts that have been loaded and their remote save paths."""
-        out_str = "[Artifacts loaded]\n"
+        output_str = "[Artifacts loaded]\n"
         for k in self.artifacts.keys():
-            out_str += (
+            output_str += (
                 f"Artifact {k} loaded to {str(self.remote_save_path.parent / k)}\n"
             )
-        out_str += "[End of artifacts]\n"
-        return out_str
+        output_str += "[End of artifacts]\n"
+        print(output_str)
+        return output_str
 
     def save(self, local_path: Optional[Union[str, Path]] = None) -> None:
         save_path = (
@@ -135,7 +141,12 @@ def format_lines(lines: List[str], start_idx: int) -> str:
 
 
 def view_lines(
-    lines: List[str], line_num: int, window_size: int, name: str, total_lines: int
+    lines: List[str],
+    line_num: int,
+    window_size: int,
+    name: str,
+    total_lines: int,
+    print_output: bool = True,
 ) -> str:
     start = max(0, line_num - window_size)
     end = min(len(lines), line_num + window_size)
@@ -148,7 +159,9 @@ def view_lines(
             else f"[{len(lines) - end} more lines]"
         )
     )
-    print(return_str)
+
+    if print_output:
+        print(return_str)
     return return_str
 
 
@@ -231,7 +244,7 @@ def edit_code_artifact(
     new_content_lines = [
         line if line.endswith("\n") else line + "\n" for line in new_content_lines
     ]
-    lines = artifacts[name].splitlines()
+    lines = artifacts[name].splitlines(keepends=True)
     edited_lines = lines[:start] + new_content_lines + lines[end:]
 
     cur_line = start + len(content.split("\n")) // 2
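The keepends=True change matters because edit_code_artifact later re-joins the lines with "".join(edited_lines); without the terminating newlines the whole artifact would collapse onto one line. A quick illustration:

text = "line 1\nline 2\n"

# 0.2.121 behavior: newlines are dropped, so joining glues lines together
print("".join(text.splitlines()))               # line 1line 2

# 0.2.122 behavior: each element keeps its newline and the text round-trips
print("".join(text.splitlines(keepends=True)))  # line 1\nline 2\n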
@@ -261,13 +274,20 @@ def edit_code_artifact(
             DEFAULT_WINDOW_SIZE,
             name,
             total_lines,
+            print_output=False,
         )
         total_lines_edit = sum(1 for _ in edited_lines)
         edited_view = view_lines(
-            edited_lines, cur_line, DEFAULT_WINDOW_SIZE, name, total_lines_edit
+            edited_lines,
+            cur_line,
+            DEFAULT_WINDOW_SIZE,
+            name,
+            total_lines_edit,
+            print_output=False,
         )
 
         error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}"
+        print(error_msg)
         return error_msg
 
     artifacts[name] = "".join(edited_lines)
@@ -390,6 +410,13 @@ def write_media_artifact(artifacts: Artifacts, local_path: str) -> str:
     return f"[Media {Path(local_path).name} saved]"
 
 
+def list_artifacts(artifacts: Artifacts) -> str:
+    """Lists all the artifacts that have been loaded into the artifacts object."""
+    output_str = artifacts.show()
+    print(output_str)
+    return output_str
+
+
 def get_tool_descriptions() -> str:
     """Returns a description of all the tools that `generate_vision_code` has access to.
     Helpful for answering questions about what types of vision tasks you can do with
@@ -397,6 +424,108 @@ def get_tool_descriptions() -> str:
     return TOOL_DESCRIPTIONS
 
 
+def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
+    """'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect
+    objects in an image based on a given dataset. It returns the fine tuning job id.
+
+    Parameters:
+        bboxes (List[BboxInput]): A list of BboxInput containing the
+            image path, labels and bounding boxes.
+        task (str): The florencev2 fine-tuning task. The options are
+            'phrase_grounding'.
+
+    Returns:
+        UUID: The fine tuning job id, this id will used to retrieve the fine
+            tuned model.
+
+    Example
+    -------
+        >>> fine_tuning_job_id = florencev2_fine_tuning(
+            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
+             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
+            "phrase_grounding"
+        )
+    """
+    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+    task_type = PromptTask[task.upper()]
+    fine_tuning_request = [
+        BboxInputBase64(
+            image=convert_to_b64(bbox_input.image_path),
+            filename=Path(bbox_input.image_path).name,
+            labels=bbox_input.labels,
+            bboxes=bbox_input.bboxes,
+        )
+        for bbox_input in bboxes_input
+    ]
+    landing_api = LandingPublicAPI()
+    fine_tune_id = str(
+        landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request)
+    )
+    print(f"[Florence2 fine tuning id: {fine_tune_id}]")
+    return fine_tune_id
+
+
+def get_diff(before: str, after: str) -> str:
+    return "".join(
+        difflib.unified_diff(
+            before.splitlines(keepends=True), after.splitlines(keepends=True)
+        )
+    )
+
+
+def use_florence2_fine_tuning(
+    artifacts: Artifacts, name: str, task: str, fine_tune_id: str
+) -> str:
+    """Replaces florence2 calls with the fine tuning id. This ensures that the code
+    utilizes the fined tuned florence2 model. Returns the diff between the original
+    code and the new code.
+
+    Parameters:
+        artifacts (Artifacts): The artifacts object to edit the code from.
+        name (str): The name of the artifact to edit.
+        task (str): The task to fine tune the model for. The options are
+            'phrase_grounding'.
+        fine_tune_id (str): The fine tuning job id.
+
+    Examples
+    --------
+        >>> diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf")
+    """
+
+    task_to_fn = {"phrase_grounding": "florence2_phrase_grounding"}
+
+    if name not in artifacts:
+        output_str = f"[Artifact {name} does not exist]"
+        print(output_str)
+        return output_str
+
+    code = artifacts[name]
+    if task.lower() == "phrase_grounding":
+        pattern = r"florence2_phrase_grounding\(\s*([^\)]+)\)"
+
+        def replacer(match: re.Match) -> str:
+            arg = match.group(1)  # capture all initial arguments
+            return f'florence2_phrase_grounding({arg}, "{fine_tune_id}")'
+
+    else:
+        raise ValueError(f"Task {task} is not supported.")
+
+    new_code = re.sub(pattern, replacer, code)
+
+    if new_code == code:
+        output_str = (
+            f"[Fine tuning task {task} function {task_to_fn[task]} not found in code]"
+        )
+        print(output_str)
+        return output_str
+
+    artifacts[name] = new_code
+
+    diff = get_diff(code, new_code)
+    print(diff)
+    return diff
+
+
 META_TOOL_DOCSTRING = get_tool_documentation(
     [
         get_tool_descriptions,
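The pattern-and-replacer pair above is the whole rewriting mechanism: any florence2_phrase_grounding(...) call found in the artifact gets the fine-tune id appended as an extra argument. A self-contained sketch of just that substitution (the id and code string are illustrative):

import re

fine_tune_id = "23b3b022-5ebf-4798-9373-20ef36429abf"
code = "dets = florence2_phrase_grounding('screw', image)"

pattern = r"florence2_phrase_grounding\(\s*([^\)]+)\)"

def replacer(match: re.Match) -> str:
    arg = match.group(1)  # everything up to the closing parenthesis
    return f'florence2_phrase_grounding({arg}, "{fine_tune_id}")'

print(re.sub(pattern, replacer, code))
# dets = florence2_phrase_grounding('screw', image, "23b3b022-5ebf-4798-9373-20ef36429abf")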
@@ -406,5 +535,8 @@ META_TOOL_DOCSTRING = get_tool_documentation(
         generate_vision_code,
         edit_vision_code,
         write_media_artifact,
+        florence2_fine_tuning,
+        use_florence2_fine_tuning,
+        list_artifacts,
     ]
 )
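Taken together, the new meta tools give the agent an end-to-end fine-tuning loop. A hedged usage sketch: the artifact name and label data are illustrative, and florence2_fine_tuning launches a remote job through LandingPublicAPI, so it needs network access and credentials:

from vision_agent.tools.meta_tools import (
    Artifacts,
    florence2_fine_tuning,
    list_artifacts,
    use_florence2_fine_tuning,
)

artifacts = Artifacts("artifacts.pkl")
artifacts["detect.py"] = "dets = florence2_phrase_grounding('screw', image)\n"

# 1. Launch a fine-tuning job from labeled boxes and get its job id back.
job_id = florence2_fine_tuning(
    [{"image_path": "screws.png", "labels": ["screw"], "bboxes": [[370, 30, 560, 290]]}],
    "phrase_grounding",
)

# 2. Rewrite the stored code so it routes through the fine-tuned model.
use_florence2_fine_tuning(artifacts, "detect.py", "phrase_grounding", job_id)

# 3. Show what has been loaded.
list_artifacts(artifacts)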
vision_agent/tools/tools.py CHANGED

@@ -28,10 +28,8 @@ from vision_agent.tools.tool_utils import (
     filter_bboxes_by_threshold,
 )
 from vision_agent.tools.tools_types import (
-    BboxInput,
-    BboxInputBase64,
     FineTuning,
-    Florencev2FtRequest,
+    Florence2FtRequest,
     JobStatus,
     PromptTask,
     ODResponseData,
@@ -867,7 +865,9 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task]  # type: ignore
 
 
-def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+def florence2_phrase_grounding(
+    prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
+) -> List[Dict[str, Any]]:
     """'florence2_phrase_grounding' is a tool that can detect multiple
     objects given a text prompt which can be object names or caption. You
     can optionally separate the object names in the text with commas. It returns a list

@@ -877,6 +877,8 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str,
     Parameters:
         prompt (str): The prompt to ground to the image.
         image (np.ndarray): The image to used to detect objects
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -895,14 +897,33 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str,
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
-    data = {
-        "image": image_b64,
-        "task": "<CAPTION_TO_PHRASE_GROUNDING>",
-        "prompt": prompt,
-        "function_name": "florence2_phrase_grounding",
-    }
 
-    detections = send_inference_request(data, "florence2", v2=True)
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        data_obj = Florence2FtRequest(
+            image=image_b64,
+            task=PromptTask.PHRASE_GROUNDING,
+            tool="florencev2_fine_tuning",
+            prompt=prompt,
+            fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+        )
+        data = data_obj.model_dump(by_alias=True)
+        detections = send_inference_request(data, "tools", v2=False)
+    else:
+        data = {
+            "image": image_b64,
+            "task": "<CAPTION_TO_PHRASE_GROUNDING>",
+            "prompt": prompt,
+            "function_name": "florence2_phrase_grounding",
+        }
+        detections = send_inference_request(data, "florence2", v2=True)
+
     detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
     return_data = []
     for i in range(len(detections["bboxes"])):
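With this branch in place, the same tool call transparently switches backends: no id hits the standard "florence2" endpoint, while a ready fine-tune id is validated and routed through the fine-tuning service. A sketch, assuming the tool is re-exported from vision_agent.tools as in prior releases (the image and id are placeholders, and both calls require API access):

import numpy as np
from vision_agent.tools import florence2_phrase_grounding

image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image

# Base model: sends the <CAPTION_TO_PHRASE_GROUNDING> task to "florence2" (v2=True)
dets = florence2_phrase_grounding("screw", image)

# Fine-tuned model: checks the job SUCCEEDED, then sends a Florence2FtRequest
# to the "tools" endpoint (v2=False); raises FineTuneModelIsNotReady otherwise
dets_ft = florence2_phrase_grounding(
    "screw", image, fine_tune_id="23b3b022-5ebf-4798-9373-20ef36429abf"
)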
@@ -1732,119 +1753,6 @@ def overlay_counting_results(
     return np.array(pil_image)
 
 
-# TODO: add this function to the imports so that is picked in the agent
-def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
-    """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
-    to detect objects in an image based on a given dataset. It returns the fine
-    tuning job id.
-
-    Parameters:
-        bboxes (List[BboxInput]): A list of BboxInput containing the
-            image path, labels and bounding boxes.
-        task (PromptTask): The florencev2 fine-tuning task. The options are
-            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-
-    Returns:
-        UUID: The fine tuning job id, this id will used to retrieve the fine
-            tuned model.
-
-    Example
-    -------
-        >>> fine_tuning_job_id = florencev2_fine_tuning(
-            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
-             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
-            "OBJECT_DETECTION"
-        )
-    """
-    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
-    task_input = PromptTask[task]
-    fine_tuning_request = [
-        BboxInputBase64(
-            image=convert_to_b64(bbox_input.image_path),
-            filename=bbox_input.image_path.split("/")[-1],
-            labels=bbox_input.labels,
-            bboxes=bbox_input.bboxes,
-        )
-        for bbox_input in bboxes_input
-    ]
-    landing_api = LandingPublicAPI()
-    return landing_api.launch_fine_tuning_job(
-        "florencev2", task_input, fine_tuning_request
-    )
-
-
-# TODO: add this function to the imports so that is picked in the agent
-def florencev2_fine_tuned_object_detection(
-    image: np.ndarray, prompt: str, model_id: UUID, task: str
-) -> List[Dict[str, Any]]:
-    """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
-    to detect objects given a text prompt such as a phrase or class names separated by
-    commas. It returns a list of detected objects as labels and their location as
-    bounding boxes with score of 1.0.
-
-    Parameters:
-        image (np.ndarray): The image to used to detect objects.
-        prompt (str): The prompt to help find objects in the image.
-        model_id (UUID): The fine-tuned model id.
-        task (PromptTask): The florencev2 fine-tuning task. The options are
-            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
-            bounding box of the detected objects with normalized coordinates between 0
-            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
-            top-left and xmax and ymax are the coordinates of the bottom-right of the
-            bounding box. The scores are always 1.0 and cannot be thresholded
-
-    Example
-    -------
-        >>> florencev2_fine_tuned_object_detection(
-            image,
-            'person looking at a coyote',
-            UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
-        )
-        [
-            {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
-            {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
-        ]
-    """
-    # check if job succeeded first
-    landing_api = LandingPublicAPI()
-    status = landing_api.check_fine_tuning_job(model_id)
-    if status is not JobStatus.SUCCEEDED:
-        raise FineTuneModelIsNotReady()
-
-    task = PromptTask[task]
-    if task is PromptTask.OBJECT_DETECTION:
-        prompt = ""
-
-    data_obj = Florencev2FtRequest(
-        image=convert_to_b64(image),
-        task=task,
-        tool="florencev2_fine_tuning",
-        prompt=prompt,
-        fine_tuning=FineTuning(job_id=model_id),
-    )
-    data = data_obj.model_dump(by_alias=True)
-    metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"}
-    detections = send_inference_request(
-        data, "tools", v2=False, metadata_payload=metadata_payload
-    )
-
-    detections = detections[task.value]
-    return_data = []
-    image_size = image.shape[:2]
-    for i in range(len(detections["bboxes"])):
-        return_data.append(
-            {
-                "score": 1.0,
-                "label": detections["labels"][i],
-                "bbox": normalize_bbox(detections["bboxes"][i], image_size),
-            }
-        )
-    return return_data
-
-
 FUNCTION_TOOLS = [
     owl_v2,
     extract_frames,
vision_agent/tools/tools_types.py CHANGED

@@ -19,16 +19,9 @@ class BboxInputBase64(BaseModel):
 
 
 class PromptTask(str, Enum):
-    """
-    Valid task prompts options for the Florencev2 model.
-    """
+    """Valid task prompts options for the Florence2 model."""
 
-    CAPTION = "<CAPTION>"
-    """"""
-    CAPTION_TO_PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
-    """"""
-    OBJECT_DETECTION = "<OD>"
-    """"""
+    PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
 
 
 class FineTuning(BaseModel):

@@ -41,7 +34,7 @@ class FineTuning(BaseModel):
         return str(job_id)
 
 
-class Florencev2FtRequest(BaseModel):
+class Florence2FtRequest(BaseModel):
     model_config = ConfigDict(populate_by_name=True)
 
     image: str
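The rename from Florencev2FtRequest to Florence2FtRequest keeps the pydantic mechanics intact: populate_by_name lets callers use Python field names while model_dump(by_alias=True) emits the API's wire names, as tools.py does above. A generic illustration of that pattern (the alias and fields here are illustrative, not the package's full model):

from pydantic import BaseModel, ConfigDict, Field

class FtRequest(BaseModel):
    model_config = ConfigDict(populate_by_name=True)

    image: str
    job_id: str = Field(alias="jobId")  # wire name differs from the Python name

req = FtRequest(image="...", job_id="23b3b022")  # field name accepted via populate_by_name
print(req.model_dump(by_alias=True))             # {'image': '...', 'jobId': '23b3b022'}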
vision_agent/utils/execute.py CHANGED

@@ -564,7 +564,13 @@ class LocalCodeInterpreter(CodeInterpreter):
     ) -> None:
         super().__init__(timeout=timeout)
         self.nb = nbformat.v4.new_notebook()
-        self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+        # Set the notebook execution path to the remote path
+        self.resources = {"metadata": {"path": str(self.remote_path)}}
+        self.nb_client = NotebookClient(
+            self.nb,
+            timeout=self.timeout,
+            resources=self.resources,
+        )
         _LOGGER.info(
             f"""Local code interpreter initialized
 Python version: {sys.version}
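Passing resources={"metadata": {"path": ...}} is nbclient's standard way to pick the directory a notebook executes in; the diff uses it to pin execution to the interpreter's remote path. A minimal standalone sketch:

import nbformat
from nbclient import NotebookClient

nb = nbformat.v4.new_notebook()
nb.cells.append(nbformat.v4.new_code_cell("import os; print(os.getcwd())"))

# resources["metadata"]["path"] becomes the kernel's working directory,
# exactly as the diff does with str(self.remote_path)
client = NotebookClient(nb, timeout=60, resources={"metadata": {"path": "/tmp"}})
client.execute()
print(nb.cells[0].outputs[0]["text"])  # /tmp (or its resolved form)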
@@ -606,7 +612,9 @@ Timeout: {self.timeout}"""
     def restart_kernel(self) -> None:
         self.close()
         self.nb = nbformat.v4.new_notebook()
-        self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+        self.nb_client = NotebookClient(
+            self.nb, timeout=self.timeout, resources=self.resources
+        )
         sleep(1)
         self._new_kernel()
 
@@ -636,7 +644,7 @@ Timeout: {self.timeout}"""
             f.write(contents)
         _LOGGER.info(f"File ({file_path}) is uploaded to: {str(self.remote_path)}")
 
-        return Path(self.remote_path / file_path)
+        return Path(self.remote_path / Path(file_path).name)
 
     def download_file(
         self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]
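The one-line fix keeps the returned remote path consistent with the flat layout the interpreter writes to: joining the full relative path would point at subdirectories that were never created under remote_path, while Path(file_path).name keeps only the file name. A quick illustration:

from pathlib import Path

remote_path = Path("/workspace")
file_path = "inputs/images/dog.jpg"

print(Path(remote_path / file_path))             # /workspace/inputs/images/dog.jpg (0.2.121)
print(Path(remote_path / Path(file_path).name))  # /workspace/dog.jpg (0.2.122)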
@@ -672,7 +680,8 @@ class CodeInterpreterFactory:
 
     @staticmethod
     def new_instance(
-        code_sandbox_runtime: Optional[str] = None,
+        code_sandbox_runtime: Optional[str] = None,
+        remote_path: Optional[Union[str, Path]] = None,
     ) -> CodeInterpreter:
         if not code_sandbox_runtime:
             code_sandbox_runtime = os.getenv("CODE_SANDBOX_RUNTIME", "local")
{vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/RECORD CHANGED

@@ -2,10 +2,10 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
 vision_agent/agent/__init__.py,sha256=FRwiux1FGvGccetyUCtY46KP01fQteqorm-JtFepovI,176
 vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
 vision_agent/agent/agent_utils.py,sha256=22LiPhkJlS5mVeo2dIi259pc2NgA7PGHRpcbnrtKo78,1930
-vision_agent/agent/vision_agent.py,sha256=
-vision_agent/agent/vision_agent_coder.py,sha256=
+vision_agent/agent/vision_agent.py,sha256=WM1_o0VAQokAKlDr-0lpFxCRwUm_eFfFNWP-wSNjo7s,11180
+vision_agent/agent/vision_agent_coder.py,sha256=ujctkpmQkX2C6YXjlp7VLZFqSB00xwkGe-9swA8Gv8s,34240
 vision_agent/agent/vision_agent_coder_prompts.py,sha256=Rg7-Ih7oFgFbHFFno0EHpaZEgm0SYj_nTdqqdp21YLo,11246
-vision_agent/agent/vision_agent_prompts.py,sha256=
+vision_agent/agent/vision_agent_prompts.py,sha256=K1nLo3XKQ-IqCom1TRwh3cMoGZNxNwEgZqf3uJ6eL18,7221
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
 vision_agent/clients/landing_public_api.py,sha256=rGtACkr8o5egDuMHQ5MBO4NuvsgPTp9Ew3rbq4R-vs0,1507

@@ -15,19 +15,19 @@ vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,
 vision_agent/lmm/lmm.py,sha256=H3a5V7c073-vXRJfQOblE2j_CsZkH1CNNRoQgLjJZuQ,20751
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
 vision_agent/tools/__init__.py,sha256=TILaqdFYicScvpnCXMxgBsFmSW22NQDIvucvEgo0etw,2289
-vision_agent/tools/meta_tools.py,sha256=
+vision_agent/tools/meta_tools.py,sha256=KeGiw2OtY8ARpGbtWjoNAoO1dwevt7LbCupaJX61MkE,18929
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=e_p-G2nwgWOpoaqpDitY3FJ6fFuTEg5GhDOD67wI2bE,7527
-vision_agent/tools/tools.py,sha256=
-vision_agent/tools/tools_types.py,sha256=
+vision_agent/tools/tools.py,sha256=jOBsuN-spY_2TlvpahoRYGvyInhQDTPXXukx9q72lEU,63454
+vision_agent/tools/tools_types.py,sha256=qs11HGLRXc9zytahBtG6TQxCh8Gigvn232at3jk54jI,2356
 vision_agent/utils/__init__.py,sha256=pWk0ktvR4aUEhuEIzSLM9kSgW4WDVqptdvOTeGLkJ6M,230
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
-vision_agent/utils/execute.py,sha256=
+vision_agent/utils/execute.py,sha256=gc4R_0BKUrZyhiKvIxOpYuzQPYVWQEqxr3ANy1lJAw4,27037
 vision_agent/utils/image_utils.py,sha256=UloC4byIQLM4CSCaH41SBciQ7X2OqKvsVvNOVKqIH_k,9856
 vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.122.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.122.dist-info/METADATA,sha256=WMdLNPyKY4Ot6ifOzwXNDiVm2TsStY-l-ge8t72Ynhk,12255
+vision_agent-0.2.122.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.122.dist-info/RECORD,,
{vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/LICENSE: file without changes
{vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/WHEEL: file without changes