vision-agent 0.2.121__py3-none-any.whl → 0.2.122__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -30,7 +30,7 @@ class BoilerplateCode:
     pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact",
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning",
         "artifacts = Artifacts('{remote_path}')",
         "artifacts.load('{remote_path}')",
     ]
@@ -76,11 +76,16 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
 
 def run_code_action(
     code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
-) -> Execution:
-    return code_interpreter.exec_isolation(
+) -> Tuple[Execution, str]:
+    result = code_interpreter.exec_isolation(
         BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
     )
 
+    obs = str(result.logs)
+    if result.error:
+        obs += f"\n{result.error}"
+    return result, obs
+
 
 def parse_execution(response: str) -> Optional[str]:
     code = None
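
run_code_action now returns the observation string alongside the Execution, folding the log-plus-error formatting that callers previously did by hand into one place. A minimal sketch of the new contract, using a hypothetical stand-in rather than the real vision_agent.utils.execute.Execution:

    from dataclasses import dataclass
    from typing import Optional, Tuple

    @dataclass
    class FakeExecution:
        # Hypothetical stand-in for Execution; only the two fields used above.
        logs: str
        error: Optional[str] = None

    def build_obs(result: FakeExecution) -> Tuple[FakeExecution, str]:
        # Mirrors the new body: logs first, the error (if any) appended on a new line.
        obs = str(result.logs)
        if result.error:
            obs += f"\n{result.error}"
        return result, obs

    _, obs = build_obs(FakeExecution(logs="[3 dogs detected]", error="NameError: name 'x' is not defined"))
    print(obs)  # logs followed by the error text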
@@ -192,7 +197,7 @@ class VisionAgent(Agent):
         artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
 
         with CodeInterpreterFactory.new_instance(
-            code_sandbox_runtime=self.code_sandbox_runtime
+            code_sandbox_runtime=self.code_sandbox_runtime,
         ) as code_interpreter:
             orig_chat = copy.deepcopy(chat)
             int_chat = copy.deepcopy(chat)
@@ -260,10 +265,9 @@ class VisionAgent(Agent):
                 code_action = parse_execution(response["response"])
 
                 if code_action is not None:
-                    result = run_code_action(
+                    result, obs = run_code_action(
                         code_action, code_interpreter, str(remote_artifacts_path)
                     )
-                    obs = str(result.logs)
 
                 if self.verbosity >= 1:
                     _LOGGER.info(obs)

--- a/vision_agent/agent/vision_agent_coder.py
+++ b/vision_agent/agent/vision_agent_coder.py
@@ -1,5 +1,4 @@
 import copy
-import difflib
 import logging
 import os
 import sys
@@ -29,6 +28,7 @@ from vision_agent.agent.vision_agent_coder_prompts import (
     USER_REQ,
 )
 from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OllamaLMM, OpenAILMM
+from vision_agent.tools.meta_tools import get_diff
 from vision_agent.utils import CodeInterpreterFactory, Execution
 from vision_agent.utils.execute import CodeInterpreter
 from vision_agent.utils.image_utils import b64_to_pil
@@ -63,14 +63,6 @@ class DefaultImports:
         return DefaultImports.to_code_string() + "\n\n" + code
 
 
-def get_diff(before: str, after: str) -> str:
-    return "".join(
-        difflib.unified_diff(
-            before.splitlines(keepends=True), after.splitlines(keepends=True)
-        )
-    )
-
-
 def format_memory(memory: List[Dict[str, str]]) -> str:
     output_str = ""
     for i, m in enumerate(memory):
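
get_diff itself is unchanged; it has simply moved from vision_agent_coder.py into meta_tools.py (and is imported back above), where the new use_florence2_fine_tuning helper also needs it. Its output is a plain difflib unified diff, for example:

    import difflib

    def get_diff(before: str, after: str) -> str:
        # Same body as the relocated helper.
        return "".join(
            difflib.unified_diff(
                before.splitlines(keepends=True), after.splitlines(keepends=True)
            )
        )

    print(get_diff("a = 1\nb = 2\n", "a = 1\nb = 3\n"))
    # ---/+++ header lines, then " a = 1", "-b = 2", "+b = 3"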

--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -48,7 +48,7 @@ OBSERVATION:
 4| return dogs
 [End of artifact]
 
-AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
@@ -75,7 +75,7 @@ OBSERVATION:
 4| return dogs
 [End of artifact]
 
-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
@@ -126,7 +126,7 @@ OBSERVATION:
 15| return count
 [End of artifact]
 
-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output and write the visualization to the artifacts so the user can see it.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----

--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -1,5 +1,7 @@
+import difflib
 import os
 import pickle as pkl
+import re
 import subprocess
 import tempfile
 from pathlib import Path
@@ -8,10 +10,13 @@ from typing import Any, Dict, List, Optional, Union
 from IPython.display import display
 
 import vision_agent as va
+from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
+from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
 from vision_agent.utils.execute import Execution, MimeType
+from vision_agent.utils.image_utils import convert_to_b64
 
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
 
@@ -99,13 +104,14 @@ class Artifacts:
 
     def show(self) -> str:
         """Shows the artifacts that have been loaded and their remote save paths."""
-        out_str = "[Artifacts loaded]\n"
+        output_str = "[Artifacts loaded]\n"
         for k in self.artifacts.keys():
-            out_str += (
+            output_str += (
                 f"Artifact {k} loaded to {str(self.remote_save_path.parent / k)}\n"
             )
-        out_str += "[End of artifacts]\n"
-        return out_str
+        output_str += "[End of artifacts]\n"
+        print(output_str)
+        return output_str
 
     def save(self, local_path: Optional[Union[str, Path]] = None) -> None:
         save_path = (
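
Besides the out_str → output_str rename, show() now prints its listing as well as returning it, so the text lands in the notebook's stdout and therefore in the agent's observation. A trimmed stand-in for the new behavior (the artifact names and parent path here are illustrative):

    def show(artifacts: dict, remote_parent: str = "/home/user/artifacts") -> str:
        # Condensed version of Artifacts.show() after this change.
        output_str = "[Artifacts loaded]\n"
        for k in artifacts:
            output_str += f"Artifact {k} loaded to {remote_parent}/{k}\n"
        output_str += "[End of artifacts]\n"
        print(output_str)  # new in 0.2.122: print in addition to returning
        return output_str

    show({"code.py": "...", "dog.jpg": "..."})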
@@ -135,7 +141,12 @@ def format_lines(lines: List[str], start_idx: int) -> str:
 
 
 def view_lines(
-    lines: List[str], line_num: int, window_size: int, name: str, total_lines: int
+    lines: List[str],
+    line_num: int,
+    window_size: int,
+    name: str,
+    total_lines: int,
+    print_output: bool = True,
 ) -> str:
     start = max(0, line_num - window_size)
     end = min(len(lines), line_num + window_size)
@@ -148,7 +159,9 @@ def view_lines(
             else f"[{len(lines) - end} more lines]"
         )
     )
-    print(return_str)
+
+    if print_output:
+        print(return_str)
     return return_str
 
 
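The print_output flag (default True, so existing callers keep their behavior) lets edit_code_artifact below render before/after views silently and print them once as part of a combined error message. A trimmed sketch of the flag's effect:

    from typing import List

    def view_lines(lines: List[str], line_num: int, window_size: int,
                   print_output: bool = True) -> str:
        # Condensed view_lines: slice a window around line_num.
        start = max(0, line_num - window_size)
        end = min(len(lines), line_num + window_size)
        return_str = "".join(lines[start:end])
        if print_output:
            print(return_str)
        return return_str

    lines = [f"line {i}\n" for i in range(10)]
    silent = view_lines(lines, 5, 2, print_output=False)  # returns the window, prints nothing
    view_lines(lines, 5, 2)                               # prints the window as before
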
@@ -231,7 +244,7 @@ def edit_code_artifact(
     new_content_lines = [
         line if line.endswith("\n") else line + "\n" for line in new_content_lines
     ]
-    lines = artifacts[name].splitlines()
+    lines = artifacts[name].splitlines(keepends=True)
     edited_lines = lines[:start] + new_content_lines + lines[end:]
 
     cur_line = start + len(content.split("\n")) // 2
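
The keepends=True fix matters because edited_lines is later rejoined with "".join(...): splitlines() strips the trailing newlines, so every surviving line of the artifact would be glued onto one line. A quick demonstration of the difference:

    text = "a = 1\nb = 2\n"
    print("".join(text.splitlines()))               # 'a = 1b = 2' -- newlines lost
    print("".join(text.splitlines(keepends=True)))  # round-trips the original text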
@@ -261,13 +274,20 @@ def edit_code_artifact(
             DEFAULT_WINDOW_SIZE,
             name,
             total_lines,
+            print_output=False,
         )
         total_lines_edit = sum(1 for _ in edited_lines)
         edited_view = view_lines(
-            edited_lines, cur_line, DEFAULT_WINDOW_SIZE, name, total_lines_edit
+            edited_lines,
+            cur_line,
+            DEFAULT_WINDOW_SIZE,
+            name,
+            total_lines_edit,
+            print_output=False,
         )
 
         error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}"
+        print(error_msg)
         return error_msg
 
     artifacts[name] = "".join(edited_lines)
@@ -390,6 +410,13 @@ def write_media_artifact(artifacts: Artifacts, local_path: str) -> str:
     return f"[Media {Path(local_path).name} saved]"
 
 
+def list_artifacts(artifacts: Artifacts) -> str:
+    """Lists all the artifacts that have been loaded into the artifacts object."""
+    output_str = artifacts.show()
+    print(output_str)
+    return output_str
+
+
 def get_tool_descriptions() -> str:
     """Returns a description of all the tools that `generate_vision_code` has access to.
     Helpful for answering questions about what types of vision tasks you can do with
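
list_artifacts is a thin wrapper over Artifacts.show(), registered as a meta-tool below so the agent can invoke it by name. Illustrative usage (the pickle path is hypothetical and assumes artifacts were saved there earlier):

    from vision_agent.tools.meta_tools import Artifacts, list_artifacts

    artifacts = Artifacts("/home/user/artifacts.pkl")
    artifacts.load("/home/user/artifacts.pkl")
    listing = list_artifacts(artifacts)  # prints and returns the "[Artifacts loaded]" block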
@@ -397,6 +424,108 @@ def get_tool_descriptions() -> str:
     return TOOL_DESCRIPTIONS
 
 
+def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
+    """'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect
+    objects in an image based on a given dataset. It returns the fine tuning job id.
+
+    Parameters:
+        bboxes (List[BboxInput]): A list of BboxInput containing the
+            image path, labels and bounding boxes.
+        task (str): The florencev2 fine-tuning task. The options are
+            'phrase_grounding'.
+
+    Returns:
+        UUID: The fine tuning job id, this id will used to retrieve the fine
+            tuned model.
+
+    Example
+    -------
+        >>> fine_tuning_job_id = florencev2_fine_tuning(
+            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
+             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
+            "phrase_grounding"
+        )
+    """
+    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+    task_type = PromptTask[task.upper()]
+    fine_tuning_request = [
+        BboxInputBase64(
+            image=convert_to_b64(bbox_input.image_path),
+            filename=Path(bbox_input.image_path).name,
+            labels=bbox_input.labels,
+            bboxes=bbox_input.bboxes,
+        )
+        for bbox_input in bboxes_input
+    ]
+    landing_api = LandingPublicAPI()
+    fine_tune_id = str(
+        landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request)
+    )
+    print(f"[Florence2 fine tuning id: {fine_tune_id}]")
+    return fine_tune_id
+
+
+def get_diff(before: str, after: str) -> str:
+    return "".join(
+        difflib.unified_diff(
+            before.splitlines(keepends=True), after.splitlines(keepends=True)
+        )
+    )
+
+
+def use_florence2_fine_tuning(
+    artifacts: Artifacts, name: str, task: str, fine_tune_id: str
+) -> str:
+    """Replaces florence2 calls with the fine tuning id. This ensures that the code
+    utilizes the fined tuned florence2 model. Returns the diff between the original
+    code and the new code.
+
+    Parameters:
+        artifacts (Artifacts): The artifacts object to edit the code from.
+        name (str): The name of the artifact to edit.
+        task (str): The task to fine tune the model for. The options are
+            'phrase_grounding'.
+        fine_tune_id (str): The fine tuning job id.
+
+    Examples
+    --------
+        >>> diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf")
+    """
+
+    task_to_fn = {"phrase_grounding": "florence2_phrase_grounding"}
+
+    if name not in artifacts:
+        output_str = f"[Artifact {name} does not exist]"
+        print(output_str)
+        return output_str
+
+    code = artifacts[name]
+    if task.lower() == "phrase_grounding":
+        pattern = r"florence2_phrase_grounding\(\s*([^\)]+)\)"
+
+        def replacer(match: re.Match) -> str:
+            arg = match.group(1)  # capture all initial arguments
+            return f'florence2_phrase_grounding({arg}, "{fine_tune_id}")'
+
+    else:
+        raise ValueError(f"Task {task} is not supported.")
+
+    new_code = re.sub(pattern, replacer, code)
+
+    if new_code == code:
+        output_str = (
+            f"[Fine tuning task {task} function {task_to_fn[task]} not found in code]"
+        )
+        print(output_str)
+        return output_str
+
+    artifacts[name] = new_code
+
+    diff = get_diff(code, new_code)
+    print(diff)
+    return diff
+
+
 META_TOOL_DOCSTRING = get_tool_documentation(
     [
         get_tool_descriptions,
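
Together with the boilerplate import change at the top of this diff, these two meta-tools give the agent a complete fine-tuning loop: launch a job from labeled bounding boxes, then rewrite existing artifact code to pass the resulting id. A hedged end-to-end sketch (image path, labels, pickle path, and artifact contents are illustrative):

    from vision_agent.tools.meta_tools import (
        Artifacts,
        florence2_fine_tuning,
        use_florence2_fine_tuning,
    )

    # 1. Launch a phrase-grounding fine-tuning job from image/label/bbox records.
    job_id = florence2_fine_tuning(
        [{"image_path": "screws.png", "labels": ["screw"], "bboxes": [[370, 30, 560, 290]]}],
        "phrase_grounding",
    )

    # 2. Once the job has succeeded, patch florence2_phrase_grounding(...) calls in
    #    an artifact so they carry the fine-tuned model id; the diff is returned.
    artifacts = Artifacts("/home/user/artifacts.pkl")
    artifacts["code.py"] = "dets = florence2_phrase_grounding('screw', image)\n"
    diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", job_id)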
@@ -406,5 +535,8 @@ META_TOOL_DOCSTRING = get_tool_documentation(
         generate_vision_code,
         edit_vision_code,
         write_media_artifact,
+        florence2_fine_tuning,
+        use_florence2_fine_tuning,
+        list_artifacts,
     ]
 )

--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -28,10 +28,8 @@ from vision_agent.tools.tool_utils import (
     filter_bboxes_by_threshold,
 )
 from vision_agent.tools.tools_types import (
-    BboxInput,
-    BboxInputBase64,
     FineTuning,
-    Florencev2FtRequest,
+    Florence2FtRequest,
     JobStatus,
     PromptTask,
     ODResponseData,
@@ -867,7 +865,9 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task]  # type: ignore
 
 
-def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+def florence2_phrase_grounding(
+    prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
+) -> List[Dict[str, Any]]:
     """'florence2_phrase_grounding' is a tool that can detect multiple
     objects given a text prompt which can be object names or caption. You
     can optionally separate the object names in the text with commas. It returns a list
@@ -877,6 +877,8 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str,
     Parameters:
         prompt (str): The prompt to ground to the image.
         image (np.ndarray): The image to used to detect objects
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -895,14 +897,33 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str,
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
-    data = {
-        "image": image_b64,
-        "task": "<CAPTION_TO_PHRASE_GROUNDING>",
-        "prompt": prompt,
-        "function_name": "florence2_phrase_grounding",
-    }
 
-    detections = send_inference_request(data, "florence2", v2=True)
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        data_obj = Florence2FtRequest(
+            image=image_b64,
+            task=PromptTask.PHRASE_GROUNDING,
+            tool="florencev2_fine_tuning",
+            prompt=prompt,
+            fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+        )
+        data = data_obj.model_dump(by_alias=True)
+        detections = send_inference_request(data, "tools", v2=False)
+    else:
+        data = {
+            "image": image_b64,
+            "task": "<CAPTION_TO_PHRASE_GROUNDING>",
+            "prompt": prompt,
+            "function_name": "florence2_phrase_grounding",
+        }
+        detections = send_inference_request(data, "florence2", v2=True)
+
     detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
     return_data = []
     for i in range(len(detections["bboxes"])):
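
With this change, florence2_phrase_grounding checks the job status via LandingPublicAPI when a fine_tune_id is given (raising FineTuneModelIsNotReady until it has SUCCEEDED) and routes the request through the older "tools" endpoint; without an id it behaves exactly as before. An illustrative call, assuming the function is exported from vision_agent.tools as usual and the id belongs to a completed job:

    import numpy as np
    from vision_agent.tools import florence2_phrase_grounding

    image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image

    dets = florence2_phrase_grounding("screw", image)  # stock Florence2 endpoint
    dets_ft = florence2_phrase_grounding(              # fine-tuned model (id illustrative)
        "screw", image, fine_tune_id="23b3b022-5ebf-4798-9373-20ef36429abf"
    )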
@@ -1732,119 +1753,6 @@ def overlay_counting_results(
     return np.array(pil_image)
 
 
-# TODO: add this function to the imports so that is picked in the agent
-def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
-    """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
-    to detect objects in an image based on a given dataset. It returns the fine
-    tuning job id.
-
-    Parameters:
-        bboxes (List[BboxInput]): A list of BboxInput containing the
-            image path, labels and bounding boxes.
-        task (PromptTask): The florencev2 fine-tuning task. The options are
-            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-
-    Returns:
-        UUID: The fine tuning job id, this id will used to retrieve the fine
-            tuned model.
-
-    Example
-    -------
-        >>> fine_tuning_job_id = florencev2_fine_tuning(
-            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
-             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
-            "OBJECT_DETECTION"
-        )
-    """
-    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
-    task_input = PromptTask[task]
-    fine_tuning_request = [
-        BboxInputBase64(
-            image=convert_to_b64(bbox_input.image_path),
-            filename=bbox_input.image_path.split("/")[-1],
-            labels=bbox_input.labels,
-            bboxes=bbox_input.bboxes,
-        )
-        for bbox_input in bboxes_input
-    ]
-    landing_api = LandingPublicAPI()
-    return landing_api.launch_fine_tuning_job(
-        "florencev2", task_input, fine_tuning_request
-    )
-
-
-# TODO: add this function to the imports so that is picked in the agent
-def florencev2_fine_tuned_object_detection(
-    image: np.ndarray, prompt: str, model_id: UUID, task: str
-) -> List[Dict[str, Any]]:
-    """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
-    to detect objects given a text prompt such as a phrase or class names separated by
-    commas. It returns a list of detected objects as labels and their location as
-    bounding boxes with score of 1.0.
-
-    Parameters:
-        image (np.ndarray): The image to used to detect objects.
-        prompt (str): The prompt to help find objects in the image.
-        model_id (UUID): The fine-tuned model id.
-        task (PromptTask): The florencev2 fine-tuning task. The options are
-            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
-            bounding box of the detected objects with normalized coordinates between 0
-            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
-            top-left and xmax and ymax are the coordinates of the bottom-right of the
-            bounding box. The scores are always 1.0 and cannot be thresholded
-
-    Example
-    -------
-        >>> florencev2_fine_tuned_object_detection(
-            image,
-            'person looking at a coyote',
-            UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
-        )
-        [
-            {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
-            {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
-        ]
-    """
-    # check if job succeeded first
-    landing_api = LandingPublicAPI()
-    status = landing_api.check_fine_tuning_job(model_id)
-    if status is not JobStatus.SUCCEEDED:
-        raise FineTuneModelIsNotReady()
-
-    task = PromptTask[task]
-    if task is PromptTask.OBJECT_DETECTION:
-        prompt = ""
-
-    data_obj = Florencev2FtRequest(
-        image=convert_to_b64(image),
-        task=task,
-        tool="florencev2_fine_tuning",
-        prompt=prompt,
-        fine_tuning=FineTuning(job_id=model_id),
-    )
-    data = data_obj.model_dump(by_alias=True)
-    metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"}
-    detections = send_inference_request(
-        data, "tools", v2=False, metadata_payload=metadata_payload
-    )
-
-    detections = detections[task.value]
-    return_data = []
-    image_size = image.shape[:2]
-    for i in range(len(detections["bboxes"])):
-        return_data.append(
-            {
-                "score": 1.0,
-                "label": detections["labels"][i],
-                "bbox": normalize_bbox(detections["bboxes"][i], image_size),
-            }
-        )
-    return return_data
-
-
 FUNCTION_TOOLS = [
     owl_v2,
     extract_frames,

--- a/vision_agent/tools/tools_types.py
+++ b/vision_agent/tools/tools_types.py
@@ -19,16 +19,9 @@ class BboxInputBase64(BaseModel):
 
 
 class PromptTask(str, Enum):
-    """
-    Valid task prompts options for the Florencev2 model.
-    """
+    """Valid task prompts options for the Florence2 model."""
 
-    CAPTION = "<CAPTION>"
-    """"""
-    CAPTION_TO_PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
-    """"""
-    OBJECT_DETECTION = "<OD>"
-    """"""
+    PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
 
 
 class FineTuning(BaseModel):
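
PromptTask is now a single-member enum, matching the one task florence2_fine_tuning accepts; that function's PromptTask[task.upper()] lookup maps the user-facing 'phrase_grounding' string onto the member by name:

    from enum import Enum

    class PromptTask(str, Enum):
        """Valid task prompts options for the Florence2 model."""

        PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"

    task = "phrase_grounding"
    member = PromptTask[task.upper()]  # name-based lookup -> PromptTask.PHRASE_GROUNDING
    print(member.value)                # '<CAPTION_TO_PHRASE_GROUNDING>'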
@@ -41,7 +34,7 @@ class FineTuning(BaseModel):
         return str(job_id)
 
 
-class Florencev2FtRequest(BaseModel):
+class Florence2FtRequest(BaseModel):
     model_config = ConfigDict(populate_by_name=True)
 
     image: str

--- a/vision_agent/utils/execute.py
+++ b/vision_agent/utils/execute.py
@@ -564,7 +564,13 @@ class LocalCodeInterpreter(CodeInterpreter):
     ) -> None:
         super().__init__(timeout=timeout)
         self.nb = nbformat.v4.new_notebook()
-        self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+        # Set the notebook execution path to the remote path
+        self.resources = {"metadata": {"path": str(self.remote_path)}}
+        self.nb_client = NotebookClient(
+            self.nb,
+            timeout=self.timeout,
+            resources=self.resources,
+        )
         _LOGGER.info(
             f"""Local code interpreter initialized
 Python version: {sys.version}
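
nbclient's NotebookClient accepts a resources mapping whose metadata.path entry sets the working directory the kernel starts in; keeping it on self.resources lets restart_kernel (next hunk) rebuild the client with the same path. A minimal sketch, assuming nbclient and nbformat are installed and /tmp exists:

    import nbformat
    from nbclient import NotebookClient

    nb = nbformat.v4.new_notebook()
    nb.cells = [nbformat.v4.new_code_cell("import os; print(os.getcwd())")]

    # metadata.path becomes the kernel's working directory.
    client = NotebookClient(nb, timeout=60, resources={"metadata": {"path": "/tmp"}})
    client.execute()
    print(nb.cells[0].outputs[0]["text"])  # /tmp, not the caller's cwd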
@@ -606,7 +612,9 @@ Timeout: {self.timeout}"""
     def restart_kernel(self) -> None:
         self.close()
         self.nb = nbformat.v4.new_notebook()
-        self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+        self.nb_client = NotebookClient(
+            self.nb, timeout=self.timeout, resources=self.resources
+        )
         sleep(1)
         self._new_kernel()
 
@@ -636,7 +644,7 @@ Timeout: {self.timeout}"""
             f.write(contents)
         _LOGGER.info(f"File ({file_path}) is uploaded to: {str(self.remote_path)}")
 
-        return Path(self.remote_path / file_path)
+        return Path(self.remote_path / Path(file_path).name)
 
     def download_file(
         self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]
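
The upload_file fix addresses a pathlib gotcha: joining a base path with an absolute path discards the base entirely, so the old return value could point outside the remote directory whenever file_path was absolute. A worked example:

    from pathlib import Path

    remote = Path("/workspace/remote")
    print(remote / "/tmp/upload/dog.jpg")             # /tmp/upload/dog.jpg -- base discarded
    print(remote / Path("/tmp/upload/dog.jpg").name)  # /workspace/remote/dog.jpg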
@@ -672,7 +680,8 @@ class CodeInterpreterFactory:
 
     @staticmethod
     def new_instance(
-        code_sandbox_runtime: Optional[str] = None, remote_path: Optional[str] = None
+        code_sandbox_runtime: Optional[str] = None,
+        remote_path: Optional[Union[str, Path]] = None,
     ) -> CodeInterpreter:
         if not code_sandbox_runtime:
             code_sandbox_runtime = os.getenv("CODE_SANDBOX_RUNTIME", "local")

--- a/vision_agent-0.2.121.dist-info/METADATA
+++ b/vision_agent-0.2.122.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.121
+Version: 0.2.122
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

--- a/vision_agent-0.2.121.dist-info/RECORD
+++ b/vision_agent-0.2.122.dist-info/RECORD
@@ -2,10 +2,10 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
 vision_agent/agent/__init__.py,sha256=FRwiux1FGvGccetyUCtY46KP01fQteqorm-JtFepovI,176
 vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
 vision_agent/agent/agent_utils.py,sha256=22LiPhkJlS5mVeo2dIi259pc2NgA7PGHRpcbnrtKo78,1930
-vision_agent/agent/vision_agent.py,sha256=IEyXT_JPCuWmBHdEnM1Wrsj7hmCe5pKLf0gnZFJTddI,11046
-vision_agent/agent/vision_agent_coder.py,sha256=DOTmDdGPxcI06Jp6yx4ekRMP0vhiVaK9B9Dl8UyJHeo,34396
+vision_agent/agent/vision_agent.py,sha256=WM1_o0VAQokAKlDr-0lpFxCRwUm_eFfFNWP-wSNjo7s,11180
+vision_agent/agent/vision_agent_coder.py,sha256=ujctkpmQkX2C6YXjlp7VLZFqSB00xwkGe-9swA8Gv8s,34240
 vision_agent/agent/vision_agent_coder_prompts.py,sha256=Rg7-Ih7oFgFbHFFno0EHpaZEgm0SYj_nTdqqdp21YLo,11246
-vision_agent/agent/vision_agent_prompts.py,sha256=0GliXFtBf32aPu2ClU63FI5ii5CTxWYsvrsmnnDp-gs,7134
+vision_agent/agent/vision_agent_prompts.py,sha256=K1nLo3XKQ-IqCom1TRwh3cMoGZNxNwEgZqf3uJ6eL18,7221
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
 vision_agent/clients/landing_public_api.py,sha256=rGtACkr8o5egDuMHQ5MBO4NuvsgPTp9Ew3rbq4R-vs0,1507
@@ -15,19 +15,19 @@ vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,
 vision_agent/lmm/lmm.py,sha256=H3a5V7c073-vXRJfQOblE2j_CsZkH1CNNRoQgLjJZuQ,20751
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
 vision_agent/tools/__init__.py,sha256=TILaqdFYicScvpnCXMxgBsFmSW22NQDIvucvEgo0etw,2289
-vision_agent/tools/meta_tools.py,sha256=Vu9WnKicGhafx9dPzDbQjQdcIzRCYYFPF68o79hDP-8,14616
+vision_agent/tools/meta_tools.py,sha256=KeGiw2OtY8ARpGbtWjoNAoO1dwevt7LbCupaJX61MkE,18929
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=e_p-G2nwgWOpoaqpDitY3FJ6fFuTEg5GhDOD67wI2bE,7527
-vision_agent/tools/tools.py,sha256=Eec7-3ecjv_8s0CJcDMibDD5z99CLHMOx7SOL3kilVE,67010
-vision_agent/tools/tools_types.py,sha256=1AvGEb-eslXjz4iWQGNQIatgKm6JDoBCDP0h7TjsNkU,2468
+vision_agent/tools/tools.py,sha256=jOBsuN-spY_2TlvpahoRYGvyInhQDTPXXukx9q72lEU,63454
+vision_agent/tools/tools_types.py,sha256=qs11HGLRXc9zytahBtG6TQxCh8Gigvn232at3jk54jI,2356
 vision_agent/utils/__init__.py,sha256=pWk0ktvR4aUEhuEIzSLM9kSgW4WDVqptdvOTeGLkJ6M,230
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
-vision_agent/utils/execute.py,sha256=Ap8Yx80spQq5f2QtKGx1MK03BR45mJKhlp1kfh-rIao,26751
+vision_agent/utils/execute.py,sha256=gc4R_0BKUrZyhiKvIxOpYuzQPYVWQEqxr3ANy1lJAw4,27037
 vision_agent/utils/image_utils.py,sha256=UloC4byIQLM4CSCaH41SBciQ7X2OqKvsVvNOVKqIH_k,9856
 vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
-vision_agent-0.2.121.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.121.dist-info/METADATA,sha256=OEbC_dogT2Hg9xLN2H8Zb2FCLQjxf1wfx_0TM1aJrYU,12255
-vision_agent-0.2.121.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.121.dist-info/RECORD,,
+vision_agent-0.2.122.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.122.dist-info/METADATA,sha256=WMdLNPyKY4Ot6ifOzwXNDiVm2TsStY-l-ge8t72Ynhk,12255
+vision_agent-0.2.122.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.122.dist-info/RECORD,,