vision-agent 0.2.121__py3-none-any.whl → 0.2.123__py3-none-any.whl

@@ -30,7 +30,7 @@ class BoilerplateCode:
  pre_code = [
  "from typing import *",
  "from vision_agent.utils.execute import CodeInterpreter",
- "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact",
+ "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning",
  "artifacts = Artifacts('{remote_path}')",
  "artifacts.load('{remote_path}')",
  ]
@@ -76,11 +76,16 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:

  def run_code_action(
  code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
- ) -> Execution:
- return code_interpreter.exec_isolation(
+ ) -> Tuple[Execution, str]:
+ result = code_interpreter.exec_isolation(
  BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
  )

+ obs = str(result.logs)
+ if result.error:
+ obs += f"\n{result.error}"
+ return result, obs
+

  def parse_execution(response: str) -> Optional[str]:
  code = None
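run_code_action now returns an (Execution, str) tuple, so callers get a ready-made observation string (logs plus any error text) instead of rebuilding it themselves. A minimal caller sketch under that assumption; the interpreter setup and the code string below are hypothetical:

    from vision_agent.utils import CodeInterpreterFactory

    with CodeInterpreterFactory.new_instance() as code_interpreter:
        result, obs = run_code_action(
            "print('hello')", code_interpreter, "artifacts.pkl"
        )
        print(obs)  # stdout/stderr logs, with the error appended if one occurred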
@@ -192,7 +197,7 @@ class VisionAgent(Agent):
  artifacts = Artifacts(WORKSPACE / "artifacts.pkl")

  with CodeInterpreterFactory.new_instance(
- code_sandbox_runtime=self.code_sandbox_runtime
+ code_sandbox_runtime=self.code_sandbox_runtime,
  ) as code_interpreter:
  orig_chat = copy.deepcopy(chat)
  int_chat = copy.deepcopy(chat)
@@ -260,10 +265,9 @@ class VisionAgent(Agent):
  code_action = parse_execution(response["response"])

  if code_action is not None:
- result = run_code_action(
+ result, obs = run_code_action(
  code_action, code_interpreter, str(remote_artifacts_path)
  )
- obs = str(result.logs)

  if self.verbosity >= 1:
  _LOGGER.info(obs)
@@ -1,5 +1,4 @@
  import copy
- import difflib
  import logging
  import os
  import sys
@@ -29,6 +28,7 @@ from vision_agent.agent.vision_agent_coder_prompts import (
  USER_REQ,
  )
  from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OllamaLMM, OpenAILMM
+ from vision_agent.tools.meta_tools import get_diff
  from vision_agent.utils import CodeInterpreterFactory, Execution
  from vision_agent.utils.execute import CodeInterpreter
  from vision_agent.utils.image_utils import b64_to_pil
@@ -63,14 +63,6 @@ class DefaultImports:
  return DefaultImports.to_code_string() + "\n\n" + code


- def get_diff(before: str, after: str) -> str:
- return "".join(
- difflib.unified_diff(
- before.splitlines(keepends=True), after.splitlines(keepends=True)
- )
- )
-
-
  def format_memory(memory: List[Dict[str, str]]) -> str:
  output_str = ""
  for i, m in enumerate(memory):
@@ -48,7 +48,7 @@ OBSERVATION:
  4| return dogs
  [End of artifact]

- AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
+ AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}

  OBSERVATION:
  ----- stdout -----
@@ -75,7 +75,7 @@ OBSERVATION:
  4| return dogs
  [End of artifact]

- AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
+ AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}

  OBSERVATION:
  ----- stdout -----
@@ -126,7 +126,7 @@ OBSERVATION:
  15| return count
  [End of artifact]

- AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output and write the visualization to the artifacts so the user can see it.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
+ AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visualization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}

  OBSERVATION:
  ----- stdout -----
@@ -12,7 +12,7 @@ from vision_agent.utils.type_defs import LandingaiAPIKey

  class LandingPublicAPI(BaseHTTP):
  def __init__(self) -> None:
- landing_url = os.environ.get("LANDINGAI_URL", "https://api.dev.landing.ai")
+ landing_url = os.environ.get("LANDINGAI_URL", "https://api.landing.ai")
  landing_api_key = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)
  headers = {"Content-Type": "application/json", "apikey": landing_api_key}
  super().__init__(base_endpoint=landing_url, headers=headers)
@@ -1,5 +1,7 @@
+ import difflib
  import os
  import pickle as pkl
+ import re
  import subprocess
  import tempfile
  from pathlib import Path
@@ -8,10 +10,13 @@ from typing import Any, Dict, List, Optional, Union
  from IPython.display import display

  import vision_agent as va
+ from vision_agent.clients.landing_public_api import LandingPublicAPI
  from vision_agent.lmm.types import Message
  from vision_agent.tools.tool_utils import get_tool_documentation
  from vision_agent.tools.tools import TOOL_DESCRIPTIONS
+ from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
  from vision_agent.utils.execute import Execution, MimeType
+ from vision_agent.utils.image_utils import convert_to_b64

  # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent

@@ -99,13 +104,14 @@ class Artifacts:

  def show(self) -> str:
  """Shows the artifacts that have been loaded and their remote save paths."""
- out_str = "[Artifacts loaded]\n"
+ output_str = "[Artifacts loaded]\n"
  for k in self.artifacts.keys():
- out_str += (
+ output_str += (
  f"Artifact {k} loaded to {str(self.remote_save_path.parent / k)}\n"
  )
- out_str += "[End of artifacts]\n"
- return out_str
+ output_str += "[End of artifacts]\n"
+ print(output_str)
+ return output_str

  def save(self, local_path: Optional[Union[str, Path]] = None) -> None:
  save_path = (
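Artifacts.show() now prints the listing as well as returning it, so the listing shows up in the notebook output the agent reads. A small usage sketch; the save path and artifact name are hypothetical:

    from vision_agent.tools.meta_tools import Artifacts

    artifacts = Artifacts("artifacts.pkl")                    # hypothetical remote save path
    artifacts["dog_detector.py"] = "def detect_dogs(path): ...\n"
    listing = artifacts.show()                                # prints '[Artifacts loaded] ...' and returns the same string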
@@ -135,7 +141,12 @@ def format_lines(lines: List[str], start_idx: int) -> str:


  def view_lines(
- lines: List[str], line_num: int, window_size: int, name: str, total_lines: int
+ lines: List[str],
+ line_num: int,
+ window_size: int,
+ name: str,
+ total_lines: int,
+ print_output: bool = True,
  ) -> str:
  start = max(0, line_num - window_size)
  end = min(len(lines), line_num + window_size)
@@ -148,7 +159,9 @@ def view_lines(
  else f"[{len(lines) - end} more lines]"
  )
  )
- print(return_str)
+
+ if print_output:
+ print(return_str)
  return return_str


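The new print_output flag lets callers build the windowed view without echoing it, which edit_code_artifact uses below to avoid printing the before/after views twice. A sketch with hypothetical contents:

    lines = ["def detect_dogs(path):\n", "    return []\n"]
    view = view_lines(
        lines, line_num=1, window_size=10, name="dog_detector.py",
        total_lines=len(lines), print_output=False,   # build the view silently
    )
    print(view)                                        # print once, at the call site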
@@ -231,7 +244,7 @@ def edit_code_artifact(
  new_content_lines = [
  line if line.endswith("\n") else line + "\n" for line in new_content_lines
  ]
- lines = artifacts[name].splitlines()
+ lines = artifacts[name].splitlines(keepends=True)
  edited_lines = lines[:start] + new_content_lines + lines[end:]

  cur_line = start + len(content.split("\n")) // 2
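Switching to splitlines(keepends=True) matters because the edited lines are later re-joined with "".join(...); without the trailing newlines the artifact would collapse onto a single line. A quick illustration:

    text = "a\nb\nc\n"
    "".join(text.splitlines())               # 'abc'        - newlines lost
    "".join(text.splitlines(keepends=True))  # 'a\nb\nc\n'  - round-trips cleanly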
@@ -261,13 +274,20 @@ def edit_code_artifact(
  DEFAULT_WINDOW_SIZE,
  name,
  total_lines,
+ print_output=False,
  )
  total_lines_edit = sum(1 for _ in edited_lines)
  edited_view = view_lines(
- edited_lines, cur_line, DEFAULT_WINDOW_SIZE, name, total_lines_edit
+ edited_lines,
+ cur_line,
+ DEFAULT_WINDOW_SIZE,
+ name,
+ total_lines_edit,
+ print_output=False,
  )

  error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}"
+ print(error_msg)
  return error_msg

  artifacts[name] = "".join(edited_lines)
@@ -390,6 +410,13 @@ def write_media_artifact(artifacts: Artifacts, local_path: str) -> str:
  return f"[Media {Path(local_path).name} saved]"


+ def list_artifacts(artifacts: Artifacts) -> str:
+ """Lists all the artifacts that have been loaded into the artifacts object."""
+ output_str = artifacts.show()
+ print(output_str)
+ return output_str
+
+
  def get_tool_descriptions() -> str:
  """Returns a description of all the tools that `generate_vision_code` has access to.
  Helpful for answering questions about what types of vision tasks you can do with
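list_artifacts is a thin, importable wrapper around Artifacts.show() that generated code can call directly. A usage sketch with hypothetical names:

    artifacts = Artifacts("artifacts.pkl")
    artifacts["dog_detector.py"] = "def detect_dogs(path): ...\n"
    list_artifacts(artifacts)   # prints the '[Artifacts loaded] ...' listing and returns it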
@@ -397,6 +424,108 @@ def get_tool_descriptions() -> str:
  return TOOL_DESCRIPTIONS


+ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
+ """'florence2_fine_tuning' is a tool that fine-tunes florence2 to be able to detect
+ objects in an image based on a given dataset. It returns the fine tuning job id.
+
+ Parameters:
+ bboxes (List[BboxInput]): A list of BboxInput containing the
+ image path, labels and bounding boxes.
+ task (str): The florencev2 fine-tuning task. The options are
+ 'phrase_grounding'.
+
+ Returns:
+ UUID: The fine tuning job id, this id will be used to retrieve the fine
+ tuned model.
+
+ Example
+ -------
+ >>> fine_tuning_job_id = florence2_fine_tuning(
+ [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
+ {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
+ "phrase_grounding"
+ )
+ """
+ bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+ task_type = PromptTask[task.upper()]
+ fine_tuning_request = [
+ BboxInputBase64(
+ image=convert_to_b64(bbox_input.image_path),
+ filename=Path(bbox_input.image_path).name,
+ labels=bbox_input.labels,
+ bboxes=bbox_input.bboxes,
+ )
+ for bbox_input in bboxes_input
+ ]
+ landing_api = LandingPublicAPI()
+ fine_tune_id = str(
+ landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request)
+ )
+ print(f"[Florence2 fine tuning id: {fine_tune_id}]")
+ return fine_tune_id
+
+
+ def get_diff(before: str, after: str) -> str:
+ return "".join(
+ difflib.unified_diff(
+ before.splitlines(keepends=True), after.splitlines(keepends=True)
+ )
+ )
+
+
+ def use_florence2_fine_tuning(
+ artifacts: Artifacts, name: str, task: str, fine_tune_id: str
+ ) -> str:
+ """Replaces florence2 calls with the fine tuning id. This ensures that the code
+ utilizes the fine-tuned florence2 model. Returns the diff between the original
+ code and the new code.
+
+ Parameters:
+ artifacts (Artifacts): The artifacts object to edit the code from.
+ name (str): The name of the artifact to edit.
+ task (str): The task to fine tune the model for. The options are
+ 'phrase_grounding'.
+ fine_tune_id (str): The fine tuning job id.
+
+ Examples
+ --------
+ >>> diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf")
+ """
+
+ task_to_fn = {"phrase_grounding": "florence2_phrase_grounding"}
+
+ if name not in artifacts:
+ output_str = f"[Artifact {name} does not exist]"
+ print(output_str)
+ return output_str
+
+ code = artifacts[name]
+ if task.lower() == "phrase_grounding":
+ pattern = r"florence2_phrase_grounding\(\s*([^\)]+)\)"
+
+ def replacer(match: re.Match) -> str:
+ arg = match.group(1) # capture all initial arguments
+ return f'florence2_phrase_grounding({arg}, "{fine_tune_id}")'
+
+ else:
+ raise ValueError(f"Task {task} is not supported.")
+
+ new_code = re.sub(pattern, replacer, code)
+
+ if new_code == code:
+ output_str = (
+ f"[Fine tuning task {task} function {task_to_fn[task]} not found in code]"
+ )
+ print(output_str)
+ return output_str
+
+ artifacts[name] = new_code
+
+ diff = get_diff(code, new_code)
+ print(diff)
+ return diff
+
+
  META_TOOL_DOCSTRING = get_tool_documentation(
  [
  get_tool_descriptions,
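Taken together, the two new meta tools let the agent launch a phrase-grounding fine-tune and then rewrite existing code to call the resulting model. A hedged end-to-end sketch; the file names, artifact contents, and the returned job id are hypothetical, and the job must reach SUCCEEDED before the model is usable:

    bboxes = [
        {"image_path": "screws.png", "labels": ["screw"], "bboxes": [[370, 30, 560, 290]]},
        {"image_path": "screws.png", "labels": ["screw"], "bboxes": [[120, 0, 300, 170]]},
    ]
    job_id = florence2_fine_tuning(bboxes, "phrase_grounding")

    artifacts = Artifacts("artifacts.pkl")
    artifacts["code.py"] = "dets = florence2_phrase_grounding('screw', image)\n"
    diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", job_id)
    # code.py now calls florence2_phrase_grounding('screw', image, "<job_id>")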
@@ -406,5 +535,8 @@ META_TOOL_DOCSTRING = get_tool_documentation(
  generate_vision_code,
  edit_vision_code,
  write_media_artifact,
+ florence2_fine_tuning,
+ use_florence2_fine_tuning,
+ list_artifacts,
  ]
  )
@@ -28,10 +28,8 @@ from vision_agent.tools.tool_utils import (
  filter_bboxes_by_threshold,
  )
  from vision_agent.tools.tools_types import (
- BboxInput,
- BboxInputBase64,
  FineTuning,
- Florencev2FtRequest,
+ Florence2FtRequest,
  JobStatus,
  PromptTask,
  ODResponseData,
@@ -867,7 +865,9 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
  return answer[task] # type: ignore


- def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+ def florence2_phrase_grounding(
+ prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
+ ) -> List[Dict[str, Any]]:
  """'florence2_phrase_grounding' is a tool that can detect multiple
  objects given a text prompt which can be object names or caption. You
  can optionally separate the object names in the text with commas. It returns a list
@@ -877,6 +877,8 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str,
  Parameters:
  prompt (str): The prompt to ground to the image.
  image (np.ndarray): The image to used to detect objects
+ fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+ fine-tuned model ID here to use it.

  Returns:
  List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -895,14 +897,33 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str,
  """
  image_size = image.shape[:2]
  image_b64 = convert_to_b64(image)
- data = {
- "image": image_b64,
- "task": "<CAPTION_TO_PHRASE_GROUNDING>",
- "prompt": prompt,
- "function_name": "florence2_phrase_grounding",
- }

- detections = send_inference_request(data, "florence2", v2=True)
+ if fine_tune_id is not None:
+ landing_api = LandingPublicAPI()
+ status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+ if status is not JobStatus.SUCCEEDED:
+ raise FineTuneModelIsNotReady(
+ f"Fine-tuned model {fine_tune_id} is not ready yet"
+ )
+
+ data_obj = Florence2FtRequest(
+ image=image_b64,
+ task=PromptTask.PHRASE_GROUNDING,
+ tool="florencev2_fine_tuning",
+ prompt=prompt,
+ fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+ )
+ data = data_obj.model_dump(by_alias=True)
+ detections = send_inference_request(data, "tools", v2=False)
+ else:
+ data = {
+ "image": image_b64,
+ "task": "<CAPTION_TO_PHRASE_GROUNDING>",
+ "prompt": prompt,
+ "function_name": "florence2_phrase_grounding",
+ }
+ detections = send_inference_request(data, "florence2", v2=True)
+
  detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
  return_data = []
  for i in range(len(detections["bboxes"])):
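With the optional fine_tune_id, the same tool call can target a fine-tuned model once its job has succeeded; the request is then routed through the fine-tuning endpoint instead of the stock florence2 endpoint. A usage sketch; the image and the id are hypothetical:

    import numpy as np

    image = np.zeros((480, 640, 3), dtype=np.uint8)    # placeholder image
    dets = florence2_phrase_grounding("screw", image)   # base model
    dets = florence2_phrase_grounding(
        "screw", image, fine_tune_id="23b3b022-5ebf-4798-9373-20ef36429abf"
    )                                                    # fine-tuned model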
@@ -1732,119 +1753,6 @@ def overlay_counting_results(
  return np.array(pil_image)


- # TODO: add this function to the imports so that is picked in the agent
- def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
- """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
- to detect objects in an image based on a given dataset. It returns the fine
- tuning job id.
-
- Parameters:
- bboxes (List[BboxInput]): A list of BboxInput containing the
- image path, labels and bounding boxes.
- task (PromptTask): The florencev2 fine-tuning task. The options are
- CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-
- Returns:
- UUID: The fine tuning job id, this id will used to retrieve the fine
- tuned model.
-
- Example
- -------
- >>> fine_tuning_job_id = florencev2_fine_tuning(
- [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
- {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
- "OBJECT_DETECTION"
- )
- """
- bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
- task_input = PromptTask[task]
- fine_tuning_request = [
- BboxInputBase64(
- image=convert_to_b64(bbox_input.image_path),
- filename=bbox_input.image_path.split("/")[-1],
- labels=bbox_input.labels,
- bboxes=bbox_input.bboxes,
- )
- for bbox_input in bboxes_input
- ]
- landing_api = LandingPublicAPI()
- return landing_api.launch_fine_tuning_job(
- "florencev2", task_input, fine_tuning_request
- )
-
-
- # TODO: add this function to the imports so that is picked in the agent
- def florencev2_fine_tuned_object_detection(
- image: np.ndarray, prompt: str, model_id: UUID, task: str
- ) -> List[Dict[str, Any]]:
- """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
- to detect objects given a text prompt such as a phrase or class names separated by
- commas. It returns a list of detected objects as labels and their location as
- bounding boxes with score of 1.0.
-
- Parameters:
- image (np.ndarray): The image to used to detect objects.
- prompt (str): The prompt to help find objects in the image.
- model_id (UUID): The fine-tuned model id.
- task (PromptTask): The florencev2 fine-tuning task. The options are
- CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-
- Returns:
- List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
- bounding box of the detected objects with normalized coordinates between 0
- and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
- top-left and xmax and ymax are the coordinates of the bottom-right of the
- bounding box. The scores are always 1.0 and cannot be thresholded
-
- Example
- -------
- >>> florencev2_fine_tuned_object_detection(
- image,
- 'person looking at a coyote',
- UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
- )
- [
- {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
- {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
- ]
- """
- # check if job succeeded first
- landing_api = LandingPublicAPI()
- status = landing_api.check_fine_tuning_job(model_id)
- if status is not JobStatus.SUCCEEDED:
- raise FineTuneModelIsNotReady()
-
- task = PromptTask[task]
- if task is PromptTask.OBJECT_DETECTION:
- prompt = ""
-
- data_obj = Florencev2FtRequest(
- image=convert_to_b64(image),
- task=task,
- tool="florencev2_fine_tuning",
- prompt=prompt,
- fine_tuning=FineTuning(job_id=model_id),
- )
- data = data_obj.model_dump(by_alias=True)
- metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"}
- detections = send_inference_request(
- data, "tools", v2=False, metadata_payload=metadata_payload
- )
-
- detections = detections[task.value]
- return_data = []
- image_size = image.shape[:2]
- for i in range(len(detections["bboxes"])):
- return_data.append(
- {
- "score": 1.0,
- "label": detections["labels"][i],
- "bbox": normalize_bbox(detections["bboxes"][i], image_size),
- }
- )
- return return_data
-
-
  FUNCTION_TOOLS = [
  owl_v2,
  extract_frames,
@@ -19,16 +19,9 @@ class BboxInputBase64(BaseModel):


  class PromptTask(str, Enum):
- """
- Valid task prompts options for the Florencev2 model.
- """
+ """Valid task prompts options for the Florence2 model."""

- CAPTION = "<CAPTION>"
- """"""
- CAPTION_TO_PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
- """"""
- OBJECT_DETECTION = "<OD>"
- """"""
+ PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"


  class FineTuning(BaseModel):
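The enum now carries only the task the fine-tuning flow supports; florence2_fine_tuning resolves the user-facing task string to it by member name, roughly:

    PromptTask["phrase_grounding".upper()]   # PromptTask.PHRASE_GROUNDING
    PromptTask.PHRASE_GROUNDING.value        # '<CAPTION_TO_PHRASE_GROUNDING>'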
@@ -41,7 +34,7 @@ class FineTuning(BaseModel):
  return str(job_id)


- class Florencev2FtRequest(BaseModel):
+ class Florence2FtRequest(BaseModel):
  model_config = ConfigDict(populate_by_name=True)

  image: str
@@ -564,7 +564,13 @@ class LocalCodeInterpreter(CodeInterpreter):
  ) -> None:
  super().__init__(timeout=timeout)
  self.nb = nbformat.v4.new_notebook()
- self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+ # Set the notebook execution path to the remote path
+ self.resources = {"metadata": {"path": str(self.remote_path)}}
+ self.nb_client = NotebookClient(
+ self.nb,
+ timeout=self.timeout,
+ resources=self.resources,
+ )
  _LOGGER.info(
  f"""Local code interpreter initialized
  Python version: {sys.version}
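Passing resources={"metadata": {"path": ...}} tells nbclient to execute cells with that directory as the working directory, so relative paths in agent code resolve against the interpreter's remote path. A standalone sketch with a hypothetical path:

    import nbformat
    from nbclient import NotebookClient

    nb = nbformat.v4.new_notebook()
    client = NotebookClient(
        nb, timeout=600, resources={"metadata": {"path": "/tmp/workspace"}}
    )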
@@ -606,7 +612,9 @@ Timeout: {self.timeout}"""
  def restart_kernel(self) -> None:
  self.close()
  self.nb = nbformat.v4.new_notebook()
- self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+ self.nb_client = NotebookClient(
+ self.nb, timeout=self.timeout, resources=self.resources
+ )
  sleep(1)
  self._new_kernel()

@@ -636,7 +644,7 @@ Timeout: {self.timeout}"""
  f.write(contents)
  _LOGGER.info(f"File ({file_path}) is uploaded to: {str(self.remote_path)}")

- return Path(self.remote_path / file_path)
+ return Path(self.remote_path / Path(file_path).name)

  def download_file(
  self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]
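The upload fix keeps only the file's basename when building the remote path; previously a nested local path leaked into the remote location. A pathlib illustration:

    from pathlib import Path

    remote = Path("/workspace")
    remote / "data/images/dog.jpg"             # /workspace/data/images/dog.jpg (old behaviour)
    remote / Path("data/images/dog.jpg").name  # /workspace/dog.jpg (new behaviour)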
@@ -672,7 +680,8 @@ class CodeInterpreterFactory:

  @staticmethod
  def new_instance(
- code_sandbox_runtime: Optional[str] = None, remote_path: Optional[str] = None
+ code_sandbox_runtime: Optional[str] = None,
+ remote_path: Optional[Union[str, Path]] = None,
  ) -> CodeInterpreter:
  if not code_sandbox_runtime:
  code_sandbox_runtime = os.getenv("CODE_SANDBOX_RUNTIME", "local")
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.121
+ Version: 0.2.123
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -2,32 +2,32 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
  vision_agent/agent/__init__.py,sha256=FRwiux1FGvGccetyUCtY46KP01fQteqorm-JtFepovI,176
  vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
  vision_agent/agent/agent_utils.py,sha256=22LiPhkJlS5mVeo2dIi259pc2NgA7PGHRpcbnrtKo78,1930
- vision_agent/agent/vision_agent.py,sha256=IEyXT_JPCuWmBHdEnM1Wrsj7hmCe5pKLf0gnZFJTddI,11046
- vision_agent/agent/vision_agent_coder.py,sha256=DOTmDdGPxcI06Jp6yx4ekRMP0vhiVaK9B9Dl8UyJHeo,34396
+ vision_agent/agent/vision_agent.py,sha256=WM1_o0VAQokAKlDr-0lpFxCRwUm_eFfFNWP-wSNjo7s,11180
+ vision_agent/agent/vision_agent_coder.py,sha256=ujctkpmQkX2C6YXjlp7VLZFqSB00xwkGe-9swA8Gv8s,34240
  vision_agent/agent/vision_agent_coder_prompts.py,sha256=Rg7-Ih7oFgFbHFFno0EHpaZEgm0SYj_nTdqqdp21YLo,11246
- vision_agent/agent/vision_agent_prompts.py,sha256=0GliXFtBf32aPu2ClU63FI5ii5CTxWYsvrsmnnDp-gs,7134
+ vision_agent/agent/vision_agent_prompts.py,sha256=K1nLo3XKQ-IqCom1TRwh3cMoGZNxNwEgZqf3uJ6eL18,7221
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
- vision_agent/clients/landing_public_api.py,sha256=rGtACkr8o5egDuMHQ5MBO4NuvsgPTp9Ew3rbq4R-vs0,1507
+ vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
  vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
  vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
  vision_agent/lmm/lmm.py,sha256=H3a5V7c073-vXRJfQOblE2j_CsZkH1CNNRoQgLjJZuQ,20751
  vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
  vision_agent/tools/__init__.py,sha256=TILaqdFYicScvpnCXMxgBsFmSW22NQDIvucvEgo0etw,2289
- vision_agent/tools/meta_tools.py,sha256=Vu9WnKicGhafx9dPzDbQjQdcIzRCYYFPF68o79hDP-8,14616
+ vision_agent/tools/meta_tools.py,sha256=KeGiw2OtY8ARpGbtWjoNAoO1dwevt7LbCupaJX61MkE,18929
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
  vision_agent/tools/tool_utils.py,sha256=e_p-G2nwgWOpoaqpDitY3FJ6fFuTEg5GhDOD67wI2bE,7527
- vision_agent/tools/tools.py,sha256=Eec7-3ecjv_8s0CJcDMibDD5z99CLHMOx7SOL3kilVE,67010
- vision_agent/tools/tools_types.py,sha256=1AvGEb-eslXjz4iWQGNQIatgKm6JDoBCDP0h7TjsNkU,2468
+ vision_agent/tools/tools.py,sha256=jOBsuN-spY_2TlvpahoRYGvyInhQDTPXXukx9q72lEU,63454
+ vision_agent/tools/tools_types.py,sha256=qs11HGLRXc9zytahBtG6TQxCh8Gigvn232at3jk54jI,2356
  vision_agent/utils/__init__.py,sha256=pWk0ktvR4aUEhuEIzSLM9kSgW4WDVqptdvOTeGLkJ6M,230
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
- vision_agent/utils/execute.py,sha256=Ap8Yx80spQq5f2QtKGx1MK03BR45mJKhlp1kfh-rIao,26751
+ vision_agent/utils/execute.py,sha256=gc4R_0BKUrZyhiKvIxOpYuzQPYVWQEqxr3ANy1lJAw4,27037
  vision_agent/utils/image_utils.py,sha256=UloC4byIQLM4CSCaH41SBciQ7X2OqKvsVvNOVKqIH_k,9856
  vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
  vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
- vision_agent-0.2.121.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.2.121.dist-info/METADATA,sha256=OEbC_dogT2Hg9xLN2H8Zb2FCLQjxf1wfx_0TM1aJrYU,12255
- vision_agent-0.2.121.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.2.121.dist-info/RECORD,,
+ vision_agent-0.2.123.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.123.dist-info/METADATA,sha256=eoydeqKc5SAqpYMoNHLp_rajkn6zYy91wQTaWtjFv2c,12255
+ vision_agent-0.2.123.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.123.dist-info/RECORD,,