vision-agent 0.2.121__py3-none-any.whl → 0.2.122__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- vision_agent/agent/vision_agent.py +10 -6
- vision_agent/agent/vision_agent_coder.py +1 -9
- vision_agent/agent/vision_agent_prompts.py +3 -3
- vision_agent/tools/meta_tools.py +140 -8
- vision_agent/tools/tools.py +32 -124
- vision_agent/tools/tools_types.py +3 -10
- vision_agent/utils/execute.py +13 -4
- {vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/METADATA +1 -1
- {vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/RECORD +11 -11
- {vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/WHEEL +0 -0
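At a glance: this release promotes Florence-2 fine-tuning from TODO-gated helpers in vision_agent/tools/tools.py into proper meta tools (florence2_fine_tuning and use_florence2_fine_tuning, plus a new list_artifacts), threads an optional fine_tune_id through florence2_phrase_grounding, changes run_code_action to return its observation string alongside the Execution, and fixes working-directory and upload-path handling in the local code interpreter.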
vision_agent/agent/vision_agent.py CHANGED

@@ -30,7 +30,7 @@ class BoilerplateCode:
     pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact",
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning",
         "artifacts = Artifacts('{remote_path}')",
         "artifacts.load('{remote_path}')",
     ]
@@ -76,11 +76,16 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
 
 def run_code_action(
     code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
-) -> Execution:
-    return code_interpreter.exec_isolation(
+) -> Tuple[Execution, str]:
+    result = code_interpreter.exec_isolation(
         BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
     )
 
+    obs = str(result.logs)
+    if result.error:
+        obs += f"\n{result.error}"
+    return result, obs
+
 
 def parse_execution(response: str) -> Optional[str]:
     code = None
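For context, a minimal sketch of the new calling convention, assuming vision-agent 0.2.122 is installed (the surrounding agent loop is elided):

# run_code_action now returns (Execution, observation) so callers no longer
# rebuild the observation string from result.logs themselves.
from vision_agent.agent.vision_agent import run_code_action
from vision_agent.utils import CodeInterpreterFactory

with CodeInterpreterFactory.new_instance() as interpreter:
    result, obs = run_code_action("print('hello')", interpreter, "artifacts.pkl")
    # obs is str(result.logs), with result.error appended when present
    print(obs)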
@@ -192,7 +197,7 @@ class VisionAgent(Agent):
         artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
 
         with CodeInterpreterFactory.new_instance(
-            code_sandbox_runtime=self.code_sandbox_runtime
+            code_sandbox_runtime=self.code_sandbox_runtime,
         ) as code_interpreter:
             orig_chat = copy.deepcopy(chat)
             int_chat = copy.deepcopy(chat)
@@ -260,10 +265,9 @@ class VisionAgent(Agent):
             code_action = parse_execution(response["response"])
 
             if code_action is not None:
-                result = run_code_action(
+                result, obs = run_code_action(
                     code_action, code_interpreter, str(remote_artifacts_path)
                 )
-                obs = str(result.logs)
 
                 if self.verbosity >= 1:
                     _LOGGER.info(obs)
vision_agent/agent/vision_agent_coder.py CHANGED

@@ -1,5 +1,4 @@
 import copy
-import difflib
 import logging
 import os
 import sys

@@ -29,6 +28,7 @@ from vision_agent.agent.vision_agent_coder_prompts import (
     USER_REQ,
 )
 from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OllamaLMM, OpenAILMM
+from vision_agent.tools.meta_tools import get_diff
 from vision_agent.utils import CodeInterpreterFactory, Execution
 from vision_agent.utils.execute import CodeInterpreter
 from vision_agent.utils.image_utils import b64_to_pil
@@ -63,14 +63,6 @@ class DefaultImports:
         return DefaultImports.to_code_string() + "\n\n" + code
 
 
-def get_diff(before: str, after: str) -> str:
-    return "".join(
-        difflib.unified_diff(
-            before.splitlines(keepends=True), after.splitlines(keepends=True)
-        )
-    )
-
-
 def format_memory(memory: List[Dict[str, str]]) -> str:
     output_str = ""
     for i, m in enumerate(memory):
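The helper is not dropped: it moves into vision_agent/tools/meta_tools.py (see that file's diff below), so the coder now imports it instead of defining its own copy. For reference, the function is self-contained and behaves like this:

import difflib

def get_diff(before: str, after: str) -> str:
    # keepends=True preserves each line's newline so the joined
    # unified-diff output stays line-accurate
    return "".join(
        difflib.unified_diff(
            before.splitlines(keepends=True), after.splitlines(keepends=True)
        )
    )

print(get_diff("a\nb\n", "a\nc\n"))  # ---/+++ headers, then " a", "-b", "+c"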
vision_agent/agent/vision_agent_prompts.py CHANGED

@@ -48,7 +48,7 @@ OBSERVATION:
 4| return dogs
 [End of artifact]
 
-AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----

@@ -75,7 +75,7 @@ OBSERVATION:
 4| return dogs
 [End of artifact]
 
-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----

@@ -126,7 +126,7 @@ OBSERVATION:
 15| return count
 [End of artifact]
 
-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
vision_agent/tools/meta_tools.py CHANGED

@@ -1,5 +1,7 @@
+import difflib
 import os
 import pickle as pkl
+import re
 import subprocess
 import tempfile
 from pathlib import Path

@@ -8,10 +10,13 @@ from typing import Any, Dict, List, Optional, Union
 from IPython.display import display
 
 import vision_agent as va
+from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
+from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
 from vision_agent.utils.execute import Execution, MimeType
+from vision_agent.utils.image_utils import convert_to_b64
 
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
 
@@ -99,13 +104,14 @@ class Artifacts:
 
     def show(self) -> str:
         """Shows the artifacts that have been loaded and their remote save paths."""
-        out_str = "[Artifacts loaded]\n"
+        output_str = "[Artifacts loaded]\n"
         for k in self.artifacts.keys():
-            out_str += (
+            output_str += (
                 f"Artifact {k} loaded to {str(self.remote_save_path.parent / k)}\n"
             )
-        out_str += "[End of artifacts]\n"
-        return out_str
+        output_str += "[End of artifacts]\n"
+        print(output_str)
+        return output_str
 
     def save(self, local_path: Optional[Union[str, Path]] = None) -> None:
         save_path = (
@@ -135,7 +141,12 @@ def format_lines(lines: List[str], start_idx: int) -> str:
 
 
 def view_lines(
-    lines: List[str], line_num: int, window_size: int, name: str, total_lines: int
+    lines: List[str],
+    line_num: int,
+    window_size: int,
+    name: str,
+    total_lines: int,
+    print_output: bool = True,
 ) -> str:
     start = max(0, line_num - window_size)
     end = min(len(lines), line_num + window_size)
@@ -148,7 +159,9 @@ def view_lines(
             else f"[{len(lines) - end} more lines]"
         )
     )
-    print(return_str)
+
+    if print_output:
+        print(return_str)
     return return_str
 
 
@@ -231,7 +244,7 @@ def edit_code_artifact(
     new_content_lines = [
         line if line.endswith("\n") else line + "\n" for line in new_content_lines
     ]
-    lines = artifacts[name].splitlines()
+    lines = artifacts[name].splitlines(keepends=True)
     edited_lines = lines[:start] + new_content_lines + lines[end:]
 
     cur_line = start + len(content.split("\n")) // 2
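The keepends=True change matters because edit_code_artifact later re-joins the lines with "".join(edited_lines); without the terminating newlines the whole artifact would collapse onto one line. A quick illustration:

text = "line 1\nline 2\n"

# 0.2.121 behavior: newlines are dropped, so joining glues lines together
print("".join(text.splitlines()))               # line 1line 2

# 0.2.122 behavior: each element keeps its newline and the text round-trips
print("".join(text.splitlines(keepends=True)))  # line 1\nline 2\n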
@@ -261,13 +274,20 @@ def edit_code_artifact(
             DEFAULT_WINDOW_SIZE,
             name,
             total_lines,
+            print_output=False,
         )
         total_lines_edit = sum(1 for _ in edited_lines)
         edited_view = view_lines(
-            edited_lines, cur_line, DEFAULT_WINDOW_SIZE, name, total_lines_edit
+            edited_lines,
+            cur_line,
+            DEFAULT_WINDOW_SIZE,
+            name,
+            total_lines_edit,
+            print_output=False,
         )
 
         error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}"
+        print(error_msg)
         return error_msg
 
     artifacts[name] = "".join(edited_lines)
@@ -390,6 +410,13 @@ def write_media_artifact(artifacts: Artifacts, local_path: str) -> str:
     return f"[Media {Path(local_path).name} saved]"
 
 
+def list_artifacts(artifacts: Artifacts) -> str:
+    """Lists all the artifacts that have been loaded into the artifacts object."""
+    output_str = artifacts.show()
+    print(output_str)
+    return output_str
+
+
 def get_tool_descriptions() -> str:
     """Returns a description of all the tools that `generate_vision_code` has access to.
     Helpful for answering questions about what types of vision tasks you can do with
@@ -397,6 +424,108 @@ def get_tool_descriptions() -> str:
     return TOOL_DESCRIPTIONS
 
 
+def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
+    """'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect
+    objects in an image based on a given dataset. It returns the fine tuning job id.
+
+    Parameters:
+        bboxes (List[BboxInput]): A list of BboxInput containing the
+            image path, labels and bounding boxes.
+        task (str): The florencev2 fine-tuning task. The options are
+            'phrase_grounding'.
+
+    Returns:
+        UUID: The fine tuning job id, this id will used to retrieve the fine
+            tuned model.
+
+    Example
+    -------
+        >>> fine_tuning_job_id = florencev2_fine_tuning(
+            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
+             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
+            "phrase_grounding"
+        )
+    """
+    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+    task_type = PromptTask[task.upper()]
+    fine_tuning_request = [
+        BboxInputBase64(
+            image=convert_to_b64(bbox_input.image_path),
+            filename=Path(bbox_input.image_path).name,
+            labels=bbox_input.labels,
+            bboxes=bbox_input.bboxes,
+        )
+        for bbox_input in bboxes_input
+    ]
+    landing_api = LandingPublicAPI()
+    fine_tune_id = str(
+        landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request)
+    )
+    print(f"[Florence2 fine tuning id: {fine_tune_id}]")
+    return fine_tune_id
+
+
+def get_diff(before: str, after: str) -> str:
+    return "".join(
+        difflib.unified_diff(
+            before.splitlines(keepends=True), after.splitlines(keepends=True)
+        )
+    )
+
+
+def use_florence2_fine_tuning(
+    artifacts: Artifacts, name: str, task: str, fine_tune_id: str
+) -> str:
+    """Replaces florence2 calls with the fine tuning id. This ensures that the code
+    utilizes the fined tuned florence2 model. Returns the diff between the original
+    code and the new code.
+
+    Parameters:
+        artifacts (Artifacts): The artifacts object to edit the code from.
+        name (str): The name of the artifact to edit.
+        task (str): The task to fine tune the model for. The options are
+            'phrase_grounding'.
+        fine_tune_id (str): The fine tuning job id.
+
+    Examples
+    --------
+        >>> diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf")
+    """
+
+    task_to_fn = {"phrase_grounding": "florence2_phrase_grounding"}
+
+    if name not in artifacts:
+        output_str = f"[Artifact {name} does not exist]"
+        print(output_str)
+        return output_str
+
+    code = artifacts[name]
+    if task.lower() == "phrase_grounding":
+        pattern = r"florence2_phrase_grounding\(\s*([^\)]+)\)"
+
+        def replacer(match: re.Match) -> str:
+            arg = match.group(1)  # capture all initial arguments
+            return f'florence2_phrase_grounding({arg}, "{fine_tune_id}")'
+
+    else:
+        raise ValueError(f"Task {task} is not supported.")
+
+    new_code = re.sub(pattern, replacer, code)
+
+    if new_code == code:
+        output_str = (
+            f"[Fine tuning task {task} function {task_to_fn[task]} not found in code]"
+        )
+        print(output_str)
+        return output_str
+
+    artifacts[name] = new_code
+
+    diff = get_diff(code, new_code)
+    print(diff)
+    return diff
+
+
 META_TOOL_DOCSTRING = get_tool_documentation(
     [
         get_tool_descriptions,
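The pattern-and-replacer pair above is the whole rewriting mechanism: any florence2_phrase_grounding(...) call found in the artifact gets the fine-tune id appended as an extra argument. A self-contained sketch of just that substitution (the id and code string are illustrative):

import re

fine_tune_id = "23b3b022-5ebf-4798-9373-20ef36429abf"
code = "dets = florence2_phrase_grounding('screw', image)"

pattern = r"florence2_phrase_grounding\(\s*([^\)]+)\)"

def replacer(match: re.Match) -> str:
    arg = match.group(1)  # everything up to the closing parenthesis
    return f'florence2_phrase_grounding({arg}, "{fine_tune_id}")'

print(re.sub(pattern, replacer, code))
# dets = florence2_phrase_grounding('screw', image, "23b3b022-5ebf-4798-9373-20ef36429abf")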
@@ -406,5 +535,8 @@ META_TOOL_DOCSTRING = get_tool_documentation(
         generate_vision_code,
         edit_vision_code,
         write_media_artifact,
+        florence2_fine_tuning,
+        use_florence2_fine_tuning,
+        list_artifacts,
     ]
 )
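Taken together, the new meta tools give the agent an end-to-end fine-tuning loop. A hedged usage sketch: the artifact name and label data are illustrative, and florence2_fine_tuning launches a remote job through LandingPublicAPI, so it needs network access and credentials:

from vision_agent.tools.meta_tools import (
    Artifacts,
    florence2_fine_tuning,
    list_artifacts,
    use_florence2_fine_tuning,
)

artifacts = Artifacts("artifacts.pkl")
artifacts["detect.py"] = "dets = florence2_phrase_grounding('screw', image)\n"

# 1. Launch a fine-tuning job from labeled boxes and get its job id back.
job_id = florence2_fine_tuning(
    [{"image_path": "screws.png", "labels": ["screw"], "bboxes": [[370, 30, 560, 290]]}],
    "phrase_grounding",
)

# 2. Rewrite the stored code so it routes through the fine-tuned model.
use_florence2_fine_tuning(artifacts, "detect.py", "phrase_grounding", job_id)

# 3. Show what has been loaded.
list_artifacts(artifacts)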
vision_agent/tools/tools.py CHANGED

@@ -28,10 +28,8 @@ from vision_agent.tools.tool_utils import (
     filter_bboxes_by_threshold,
 )
 from vision_agent.tools.tools_types import (
-    BboxInput,
-    BboxInputBase64,
     FineTuning,
-    Florencev2FtRequest,
+    Florence2FtRequest,
     JobStatus,
     PromptTask,
     ODResponseData,
@@ -867,7 +865,9 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task]  # type: ignore
 
 
-def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+def florence2_phrase_grounding(
+    prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
+) -> List[Dict[str, Any]]:
     """'florence2_phrase_grounding' is a tool that can detect multiple
     objects given a text prompt which can be object names or caption. You
     can optionally separate the object names in the text with commas. It returns a list

@@ -877,6 +877,8 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str,
     Parameters:
         prompt (str): The prompt to ground to the image.
         image (np.ndarray): The image to used to detect objects
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -895,14 +897,33 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str,
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
-    data = {
-        "image": image_b64,
-        "task": "<CAPTION_TO_PHRASE_GROUNDING>",
-        "prompt": prompt,
-        "function_name": "florence2_phrase_grounding",
-    }
 
-    detections = send_inference_request(data, "florence2", v2=True)
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        data_obj = Florence2FtRequest(
+            image=image_b64,
+            task=PromptTask.PHRASE_GROUNDING,
+            tool="florencev2_fine_tuning",
+            prompt=prompt,
+            fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+        )
+        data = data_obj.model_dump(by_alias=True)
+        detections = send_inference_request(data, "tools", v2=False)
+    else:
+        data = {
+            "image": image_b64,
+            "task": "<CAPTION_TO_PHRASE_GROUNDING>",
+            "prompt": prompt,
+            "function_name": "florence2_phrase_grounding",
+        }
+        detections = send_inference_request(data, "florence2", v2=True)
+
     detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
     return_data = []
     for i in range(len(detections["bboxes"])):
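With this branch in place, the same tool call transparently switches backends: no id hits the standard "florence2" endpoint, while a ready fine-tune id is validated and routed through the fine-tuning service. A sketch, assuming the tool is re-exported from vision_agent.tools as in prior releases (the image and id are placeholders, and both calls require API access):

import numpy as np
from vision_agent.tools import florence2_phrase_grounding

image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image

# Base model: sends the <CAPTION_TO_PHRASE_GROUNDING> task to "florence2" (v2=True)
dets = florence2_phrase_grounding("screw", image)

# Fine-tuned model: checks the job SUCCEEDED, then sends a Florence2FtRequest
# to the "tools" endpoint (v2=False); raises FineTuneModelIsNotReady otherwise
dets_ft = florence2_phrase_grounding(
    "screw", image, fine_tune_id="23b3b022-5ebf-4798-9373-20ef36429abf"
)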
@@ -1732,119 +1753,6 @@ def overlay_counting_results(
     return np.array(pil_image)
 
 
-# TODO: add this function to the imports so that is picked in the agent
-def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
-    """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
-    to detect objects in an image based on a given dataset. It returns the fine
-    tuning job id.
-
-    Parameters:
-        bboxes (List[BboxInput]): A list of BboxInput containing the
-            image path, labels and bounding boxes.
-        task (PromptTask): The florencev2 fine-tuning task. The options are
-            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-
-    Returns:
-        UUID: The fine tuning job id, this id will used to retrieve the fine
-            tuned model.
-
-    Example
-    -------
-        >>> fine_tuning_job_id = florencev2_fine_tuning(
-            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
-             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
-            "OBJECT_DETECTION"
-        )
-    """
-    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
-    task_input = PromptTask[task]
-    fine_tuning_request = [
-        BboxInputBase64(
-            image=convert_to_b64(bbox_input.image_path),
-            filename=bbox_input.image_path.split("/")[-1],
-            labels=bbox_input.labels,
-            bboxes=bbox_input.bboxes,
-        )
-        for bbox_input in bboxes_input
-    ]
-    landing_api = LandingPublicAPI()
-    return landing_api.launch_fine_tuning_job(
-        "florencev2", task_input, fine_tuning_request
-    )
-
-
-# TODO: add this function to the imports so that is picked in the agent
-def florencev2_fine_tuned_object_detection(
-    image: np.ndarray, prompt: str, model_id: UUID, task: str
-) -> List[Dict[str, Any]]:
-    """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
-    to detect objects given a text prompt such as a phrase or class names separated by
-    commas. It returns a list of detected objects as labels and their location as
-    bounding boxes with score of 1.0.
-
-    Parameters:
-        image (np.ndarray): The image to used to detect objects.
-        prompt (str): The prompt to help find objects in the image.
-        model_id (UUID): The fine-tuned model id.
-        task (PromptTask): The florencev2 fine-tuning task. The options are
-            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
-            bounding box of the detected objects with normalized coordinates between 0
-            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
-            top-left and xmax and ymax are the coordinates of the bottom-right of the
-            bounding box. The scores are always 1.0 and cannot be thresholded
-
-    Example
-    -------
-        >>> florencev2_fine_tuned_object_detection(
-            image,
-            'person looking at a coyote',
-            UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
-        )
-        [
-            {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
-            {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
-        ]
-    """
-    # check if job succeeded first
-    landing_api = LandingPublicAPI()
-    status = landing_api.check_fine_tuning_job(model_id)
-    if status is not JobStatus.SUCCEEDED:
-        raise FineTuneModelIsNotReady()
-
-    task = PromptTask[task]
-    if task is PromptTask.OBJECT_DETECTION:
-        prompt = ""
-
-    data_obj = Florencev2FtRequest(
-        image=convert_to_b64(image),
-        task=task,
-        tool="florencev2_fine_tuning",
-        prompt=prompt,
-        fine_tuning=FineTuning(job_id=model_id),
-    )
-    data = data_obj.model_dump(by_alias=True)
-    metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"}
-    detections = send_inference_request(
-        data, "tools", v2=False, metadata_payload=metadata_payload
-    )
-
-    detections = detections[task.value]
-    return_data = []
-    image_size = image.shape[:2]
-    for i in range(len(detections["bboxes"])):
-        return_data.append(
-            {
-                "score": 1.0,
-                "label": detections["labels"][i],
-                "bbox": normalize_bbox(detections["bboxes"][i], image_size),
-            }
-        )
-    return return_data
-
-
 FUNCTION_TOOLS = [
     owl_v2,
     extract_frames,
vision_agent/tools/tools_types.py CHANGED

@@ -19,16 +19,9 @@ class BboxInputBase64(BaseModel):
 
 
 class PromptTask(str, Enum):
-    """
-    Valid task prompts options for the Florencev2 model.
-    """
+    """Valid task prompts options for the Florence2 model."""
 
-    CAPTION = "<CAPTION>"
-    """"""
-    CAPTION_TO_PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
-    """"""
-    OBJECT_DETECTION = "<OD>"
-    """"""
+    PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
 
 
 class FineTuning(BaseModel):

@@ -41,7 +34,7 @@ class FineTuning(BaseModel):
         return str(job_id)
 
 
-class Florencev2FtRequest(BaseModel):
+class Florence2FtRequest(BaseModel):
     model_config = ConfigDict(populate_by_name=True)
 
     image: str
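The rename from Florencev2FtRequest to Florence2FtRequest keeps the pydantic mechanics intact: populate_by_name lets callers use Python field names while model_dump(by_alias=True) emits the API's wire names, as tools.py does above. A generic illustration of that pattern (the alias and fields here are illustrative, not the package's full model):

from pydantic import BaseModel, ConfigDict, Field

class FtRequest(BaseModel):
    model_config = ConfigDict(populate_by_name=True)

    image: str
    job_id: str = Field(alias="jobId")  # wire name differs from the Python name

req = FtRequest(image="...", job_id="23b3b022")  # field name accepted via populate_by_name
print(req.model_dump(by_alias=True))             # {'image': '...', 'jobId': '23b3b022'}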
vision_agent/utils/execute.py CHANGED

@@ -564,7 +564,13 @@ class LocalCodeInterpreter(CodeInterpreter):
     ) -> None:
         super().__init__(timeout=timeout)
         self.nb = nbformat.v4.new_notebook()
-        self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+        # Set the notebook execution path to the remote path
+        self.resources = {"metadata": {"path": str(self.remote_path)}}
+        self.nb_client = NotebookClient(
+            self.nb,
+            timeout=self.timeout,
+            resources=self.resources,
+        )
         _LOGGER.info(
             f"""Local code interpreter initialized
 Python version: {sys.version}
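Passing resources={"metadata": {"path": ...}} is nbclient's standard way to pick the directory a notebook executes in; the diff uses it to pin execution to the interpreter's remote path. A minimal standalone sketch:

import nbformat
from nbclient import NotebookClient

nb = nbformat.v4.new_notebook()
nb.cells.append(nbformat.v4.new_code_cell("import os; print(os.getcwd())"))

# resources["metadata"]["path"] becomes the kernel's working directory,
# exactly as the diff does with str(self.remote_path)
client = NotebookClient(nb, timeout=60, resources={"metadata": {"path": "/tmp"}})
client.execute()
print(nb.cells[0].outputs[0]["text"])  # /tmp (or its resolved form)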
@@ -606,7 +612,9 @@ Timeout: {self.timeout}"""
     def restart_kernel(self) -> None:
         self.close()
         self.nb = nbformat.v4.new_notebook()
-        self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+        self.nb_client = NotebookClient(
+            self.nb, timeout=self.timeout, resources=self.resources
+        )
         sleep(1)
         self._new_kernel()
 
@@ -636,7 +644,7 @@ Timeout: {self.timeout}"""
             f.write(contents)
         _LOGGER.info(f"File ({file_path}) is uploaded to: {str(self.remote_path)}")
 
-        return Path(self.remote_path / file_path)
+        return Path(self.remote_path / Path(file_path).name)
 
     def download_file(
         self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]
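The one-line fix keeps the returned remote path consistent with the flat layout the interpreter writes to: joining the full relative path would point at subdirectories that were never created under remote_path, while Path(file_path).name keeps only the file name. A quick illustration:

from pathlib import Path

remote_path = Path("/workspace")
file_path = "inputs/images/dog.jpg"

print(Path(remote_path / file_path))             # /workspace/inputs/images/dog.jpg (0.2.121)
print(Path(remote_path / Path(file_path).name))  # /workspace/dog.jpg (0.2.122)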
@@ -672,7 +680,8 @@ class CodeInterpreterFactory:
 
     @staticmethod
     def new_instance(
-        code_sandbox_runtime: Optional[str] = None,
+        code_sandbox_runtime: Optional[str] = None,
+        remote_path: Optional[Union[str, Path]] = None,
     ) -> CodeInterpreter:
         if not code_sandbox_runtime:
             code_sandbox_runtime = os.getenv("CODE_SANDBOX_RUNTIME", "local")
{vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/RECORD CHANGED

@@ -2,10 +2,10 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
 vision_agent/agent/__init__.py,sha256=FRwiux1FGvGccetyUCtY46KP01fQteqorm-JtFepovI,176
 vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
 vision_agent/agent/agent_utils.py,sha256=22LiPhkJlS5mVeo2dIi259pc2NgA7PGHRpcbnrtKo78,1930
-vision_agent/agent/vision_agent.py,sha256=
-vision_agent/agent/vision_agent_coder.py,sha256=
+vision_agent/agent/vision_agent.py,sha256=WM1_o0VAQokAKlDr-0lpFxCRwUm_eFfFNWP-wSNjo7s,11180
+vision_agent/agent/vision_agent_coder.py,sha256=ujctkpmQkX2C6YXjlp7VLZFqSB00xwkGe-9swA8Gv8s,34240
 vision_agent/agent/vision_agent_coder_prompts.py,sha256=Rg7-Ih7oFgFbHFFno0EHpaZEgm0SYj_nTdqqdp21YLo,11246
-vision_agent/agent/vision_agent_prompts.py,sha256=
+vision_agent/agent/vision_agent_prompts.py,sha256=K1nLo3XKQ-IqCom1TRwh3cMoGZNxNwEgZqf3uJ6eL18,7221
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
 vision_agent/clients/landing_public_api.py,sha256=rGtACkr8o5egDuMHQ5MBO4NuvsgPTp9Ew3rbq4R-vs0,1507

@@ -15,19 +15,19 @@ vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,
 vision_agent/lmm/lmm.py,sha256=H3a5V7c073-vXRJfQOblE2j_CsZkH1CNNRoQgLjJZuQ,20751
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
 vision_agent/tools/__init__.py,sha256=TILaqdFYicScvpnCXMxgBsFmSW22NQDIvucvEgo0etw,2289
-vision_agent/tools/meta_tools.py,sha256=
+vision_agent/tools/meta_tools.py,sha256=KeGiw2OtY8ARpGbtWjoNAoO1dwevt7LbCupaJX61MkE,18929
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
 vision_agent/tools/tool_utils.py,sha256=e_p-G2nwgWOpoaqpDitY3FJ6fFuTEg5GhDOD67wI2bE,7527
-vision_agent/tools/tools.py,sha256=
-vision_agent/tools/tools_types.py,sha256=
+vision_agent/tools/tools.py,sha256=jOBsuN-spY_2TlvpahoRYGvyInhQDTPXXukx9q72lEU,63454
+vision_agent/tools/tools_types.py,sha256=qs11HGLRXc9zytahBtG6TQxCh8Gigvn232at3jk54jI,2356
 vision_agent/utils/__init__.py,sha256=pWk0ktvR4aUEhuEIzSLM9kSgW4WDVqptdvOTeGLkJ6M,230
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
-vision_agent/utils/execute.py,sha256=
+vision_agent/utils/execute.py,sha256=gc4R_0BKUrZyhiKvIxOpYuzQPYVWQEqxr3ANy1lJAw4,27037
 vision_agent/utils/image_utils.py,sha256=UloC4byIQLM4CSCaH41SBciQ7X2OqKvsVvNOVKqIH_k,9856
 vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.122.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.122.dist-info/METADATA,sha256=WMdLNPyKY4Ot6ifOzwXNDiVm2TsStY-l-ge8t72Ynhk,12255
+vision_agent-0.2.122.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.122.dist-info/RECORD,,
{vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/LICENSE: file without changes
{vision_agent-0.2.121.dist-info → vision_agent-0.2.122.dist-info}/WHEEL: file without changes