PyPI - vision-agent - Versions diffs - 0.2.174__tar.gz → 0.2.175__tar.gz - Mend

vision-agent 0.2.174.tar.gz → 0.2.175.tar.gz

Files changed (35)

{vision_agent-0.2.174 → vision_agent-0.2.175}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.174
+Version: 0.2.175
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

{vision_agent-0.2.174 → vision_agent-0.2.175}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.174"
+version = "0.2.175"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"

{vision_agent-0.2.174 → vision_agent-0.2.175}/vision_agent/agent/agent_utils.py RENAMED Viewed

@@ -72,7 +72,9 @@ def extract_json(json_str: str) -> Dict[str, Any]:
         if json_dict is None:
             error_msg = f"Could not extract JSON from the given str: {json_orig}"
             _LOGGER.exception(error_msg)
-            raise ValueError(error_msg)
+            raise json.JSONDecodeError(
+                msg="Could not extract JSON", doc=json_orig, pos=0
+            )
         return json_dict

{vision_agent-0.2.174 → vision_agent-0.2.175}/vision_agent/agent/vision_agent.py RENAMED Viewed

@@ -2,7 +2,6 @@ import copy
 import json
 import logging
 import os
-import tempfile
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
@@ -12,6 +11,7 @@ from vision_agent.agent.vision_agent_prompts import (
     EXAMPLES_CODE1,
     EXAMPLES_CODE2,
     EXAMPLES_CODE3,
+    EXAMPLES_CODE3_EXTRA2,
     VA_CODE,
 )
 from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
@@ -19,7 +19,6 @@ from vision_agent.tools.meta_tools import (
     META_TOOL_DOCSTRING,
     Artifacts,
     check_and_load_image,
-    extract_and_save_files_to_artifacts,
     use_extra_vision_agent_args,
 )
 from vision_agent.utils import CodeInterpreterFactory
@@ -37,11 +36,12 @@ class BoilerplateCode:
     pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
-        "artifacts = Artifacts('{remote_path}')",
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning, list_artifacts, capture_files_into_artifacts",
+        "artifacts = Artifacts('{remote_path}', '{remote_path}')",
         "artifacts.load('{remote_path}')",
     ]
     post_code = [
+        "capture_files_into_artifacts(artifacts)",
         "artifacts.save()",
     ]
@@ -97,8 +97,9 @@ def _clean_response(response: str) -> str:
 def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
     chat = copy.deepcopy(chat)
+    # only add 10 most recent messages in the chat to not go over token limit
     conversation = ""
-    for chat_i in chat:
+    for chat_i in chat[-10:]:
         if chat_i["role"] == "user":
             conversation += f"USER: {chat_i['content']}\n\n"
         elif chat_i["role"] == "observation":
@@ -110,7 +111,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
     prompt = VA_CODE.format(
         documentation=META_TOOL_DOCSTRING,
-        examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}",
+        examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}\n{EXAMPLES_CODE3_EXTRA2}",
         conversation=conversation,
     )
     message: Message = {"role": "user", "content": prompt}
@@ -120,7 +121,9 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
         and "media" in chat[-1]
         and len(chat[-1]["media"]) > 0  # type: ignore
     ):
-        message["media"] = chat[-1]["media"]
+        media_obs = [media for media in chat[-1]["media"] if Path(media).exists()]  # type: ignore
+        if len(media_obs) > 0:
+            message["media"] = media_obs  # type: ignore
     conv_resp = cast(str, orch([message], stream=False))
     # clean the response first, if we are executing code, do not respond or end
@@ -144,16 +147,16 @@ def execute_code_action(
     artifacts: Artifacts,
     code: str,
     code_interpreter: CodeInterpreter,
-    artifact_remote_path: str,
 ) -> Tuple[Execution, str]:
     result = code_interpreter.exec_isolation(
-        BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
+        BoilerplateCode.add_boilerplate(
+            code, remote_path=str(artifacts.remote_save_path)
+        )
     )
     obs = str(result.logs)
     if result.error:
         obs += f"\n{result.error}"
-    extract_and_save_files_to_artifacts(artifacts, code, obs)
     return result, obs
@@ -161,7 +164,6 @@ def execute_user_code_action(
     artifacts: Artifacts,
     last_user_message: Message,
     code_interpreter: CodeInterpreter,
-    artifact_remote_path: str,
 ) -> Tuple[Optional[Execution], Optional[str]]:
     user_result = None
     user_obs = None
@@ -178,11 +180,10 @@ def execute_user_code_action(
     if user_code_action is not None:
         user_code_action = use_extra_vision_agent_args(user_code_action, False)
         user_result, user_obs = execute_code_action(
-            artifacts, user_code_action, code_interpreter, artifact_remote_path
+            artifacts, user_code_action, code_interpreter
         )
         if user_result.error:
             user_obs += f"\n{user_result.error}"
-        extract_and_save_files_to_artifacts(artifacts, user_code_action, user_obs)
     return user_result, user_obs
@@ -231,9 +232,18 @@ def old_format_to_new_format(old_format_str: str) -> str:
     except json.JSONDecodeError:
         return old_format_str
-    thinking = old_format["thoughts"] if old_format["thoughts"].strip() != "" else None
-    let_user_respond = old_format["let_user_respond"]
-    if "<execute_python>" in old_format["response"]:
+    if "thoughts" in old_format:
+        thinking = (
+            old_format["thoughts"] if old_format["thoughts"].strip() != "" else None
+        )
+    else:
+        thinking = None
+    let_user_respond = (
+        old_format["let_user_respond"] if "let_user_respond" in old_format else True
+    )
+    if "response" in old_format and "<execute_python>" in old_format["response"]:
         execute_python = extract_tag(old_format["response"], "execute_python")
         response = (
             old_format["response"]
@@ -244,7 +254,7 @@ def old_format_to_new_format(old_format_str: str) -> str:
         )
     else:
         execute_python = None
-        response = old_format["response"]
+        response = old_format["response"] if "response" in old_format else None
     return json.dumps(
         {
@@ -275,7 +285,6 @@ class VisionAgent(Agent):
         self,
         agent: Optional[LMM] = None,
         verbosity: int = 0,
-        local_artifacts_path: Optional[Union[str, Path]] = None,
         callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
         code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
     ) -> None:
@@ -285,8 +294,6 @@ class VisionAgent(Agent):
             agent (Optional[LMM]): The agent to use for conversation and orchestration
                 of other agents.
             verbosity (int): The verbosity level of the agent.
-            local_artifacts_path (Optional[Union[str, Path]]): The path to the local
-                artifacts file.
             callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
                 function to send intermediate update messages.
             code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
@@ -302,14 +309,6 @@ class VisionAgent(Agent):
         self.callback_message = callback_message
         if self.verbosity >= 1:
             _LOGGER.setLevel(logging.INFO)
-        self.local_artifacts_path = cast(
-            str,
-            (
-                Path(local_artifacts_path)
-                if local_artifacts_path is not None
-                else Path(tempfile.NamedTemporaryFile(delete=False).name)
-            ),
-        )
     def __call__(
         self,
@@ -386,7 +385,7 @@ class VisionAgent(Agent):
         if not artifacts:
             # this is setting remote artifacts path
-            artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
+            artifacts = Artifacts("", "")
         # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues
         code_interpreter = (
@@ -395,8 +394,15 @@ class VisionAgent(Agent):
             and not isinstance(self.code_interpreter, str)
             else CodeInterpreterFactory.new_instance(
                 code_sandbox_runtime=self.code_interpreter,
+                remote_path=artifacts.remote_save_path.parent,
             )
         )
+        if code_interpreter.remote_path != artifacts.remote_save_path.parent:
+            raise ValueError(
+                f"Code interpreter remote path {code_interpreter.remote_path} does not match {artifacts.remote_save_path.parent}"
+            )
         with code_interpreter:
             orig_chat = copy.deepcopy(chat)
             int_chat = copy.deepcopy(chat)
@@ -436,15 +442,13 @@ class VisionAgent(Agent):
             # Save the current state of artifacts, will include any images the user
             # passed in.
-            artifacts.save(self.local_artifacts_path)
+            artifacts.save()
             # Upload artifacts to remote location and show where they are going
             # to be loaded to. The actual loading happens in BoilerplateCode as
             # part of the pre_code.
-            remote_artifacts_path = code_interpreter.upload_file(
-                self.local_artifacts_path
-            )
-            artifacts_loaded = artifacts.show(code_interpreter.remote_path)
+            code_interpreter.upload_file(artifacts.local_save_path)
+            artifacts_loaded = artifacts.show(artifacts.remote_save_path.parent)
             int_chat.append({"role": "observation", "content": artifacts_loaded})
             orig_chat.append({"role": "observation", "content": artifacts_loaded})
             self.streaming_message({"role": "observation", "content": artifacts_loaded})
@@ -453,7 +457,6 @@ class VisionAgent(Agent):
                 artifacts,
                 last_user_message,
                 code_interpreter,
-                str(remote_artifacts_path),
             )
             finished = user_result is not None and user_obs is not None
             if user_result is not None and user_obs is not None:
@@ -472,7 +475,16 @@ class VisionAgent(Agent):
                 )
             while not finished and iterations < self.max_iterations:
+                # ensure we upload the artifacts before each turn, so any local
+                # modifications we made to it will be reflected in the remote
+                code_interpreter.upload_file(artifacts.local_save_path)
                 response = run_conversation(self.agent, int_chat)
+                code_action = use_extra_vision_agent_args(
+                    response.get("execute_python", None),
+                    test_multi_plan,
+                    custom_tool_names,
+                )
                 if self.verbosity >= 1:
                     _LOGGER.info(response)
                 int_chat.append(
@@ -532,31 +544,20 @@ class VisionAgent(Agent):
                         artifacts,
                         code_action,
                         code_interpreter,
-                        str(remote_artifacts_path),
                     )
-                    media_obs = check_and_load_image(code_action)
-                    if self.verbosity >= 1:
-                        _LOGGER.info(obs)
                     obs_chat_elt: Message = {"role": "observation", "content": obs}
+                    media_obs = check_and_load_image(code_action)
                     if media_obs and result.success:
-                        # for view_media_artifact, we need to ensure the media is loaded
-                        # locally so the conversation agent can actually see it
-                        code_interpreter.download_file(
-                            str(remote_artifacts_path.name),
-                            str(self.local_artifacts_path),
-                        )
-                        artifacts.load(
-                            self.local_artifacts_path,
-                            Path(self.local_artifacts_path).parent,
-                        )
+                        # media paths will be under the local_save_path when we download
+                        # them after each turn
                         obs_chat_elt["media"] = [
-                            Path(self.local_artifacts_path).parent / media_ob
+                            artifacts.local_save_path.parent / media_ob
                             for media_ob in media_obs
                         ]
+                    if self.verbosity >= 1:
+                        _LOGGER.info(obs)
                     # don't add execution results to internal chat
                     int_chat.append(obs_chat_elt)
                     obs_chat_elt["execution"] = result
@@ -573,13 +574,15 @@ class VisionAgent(Agent):
                 iterations += 1
                 last_response = response
-            # after running the agent, download the artifacts locally
-            code_interpreter.download_file(
-                str(remote_artifacts_path.name), str(self.local_artifacts_path)
-            )
-            artifacts.load(
-                self.local_artifacts_path, Path(self.local_artifacts_path).parent
-            )
+                # after each turn, download the artifacts locally
+                code_interpreter.download_file(
+                    str(artifacts.remote_save_path.name),
+                    str(artifacts.local_save_path),
+                )
+                artifacts.load(
+                    artifacts.local_save_path, artifacts.local_save_path.parent
+                )
         return orig_chat, artifacts
     def streaming_message(self, message: Dict[str, Any]) -> None:
@@ -595,7 +598,6 @@ class OpenAIVisionAgent(VisionAgent):
         self,
         agent: Optional[LMM] = None,
         verbosity: int = 0,
-        local_artifacts_path: Optional[Union[str, Path]] = None,
         callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
         code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
     ) -> None:
@@ -605,8 +607,6 @@ class OpenAIVisionAgent(VisionAgent):
             agent (Optional[LMM]): The agent to use for conversation and orchestration
                 of other agents.
             verbosity (int): The verbosity level of the agent.
-            local_artifacts_path (Optional[Union[str, Path]]): The path to the local
-                artifacts file.
             callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
                 function to send intermediate update messages.
             code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
@@ -619,7 +619,6 @@ class OpenAIVisionAgent(VisionAgent):
         super().__init__(
             agent,
             verbosity,
-            local_artifacts_path,
             callback_message,
             code_interpreter,
         )
@@ -630,7 +629,6 @@ class AnthropicVisionAgent(VisionAgent):
         self,
         agent: Optional[LMM] = None,
         verbosity: int = 0,
-        local_artifacts_path: Optional[Union[str, Path]] = None,
         callback_message: Optional[Callable[[Dict[str, Any]], None]] = None,
         code_interpreter: Optional[Union[str, CodeInterpreter]] = None,
     ) -> None:
@@ -640,8 +638,6 @@ class AnthropicVisionAgent(VisionAgent):
             agent (Optional[LMM]): The agent to use for conversation and orchestration
                 of other agents.
             verbosity (int): The verbosity level of the agent.
-            local_artifacts_path (Optional[Union[str, Path]]): The path to the local
-                artifacts file.
             callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
                 function to send intermediate update messages.
             code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
@@ -654,7 +650,6 @@ class AnthropicVisionAgent(VisionAgent):
         super().__init__(
             agent,
             verbosity,
-            local_artifacts_path,
             callback_message,
             code_interpreter,
         )

{vision_agent-0.2.174 → vision_agent-0.2.175}/vision_agent/agent/vision_agent_prompts.py RENAMED Viewed

@@ -1,7 +1,7 @@
 VA_CODE = """
 **Role**: You are a helpful agent that assists users with writing code.
-**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code.
+**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be saved only AFTER you execute python code with `save_imgae` or `save_video`. The user can see all `artifacts`.
 <execute_python>
 print("Hello World!")
@@ -26,10 +26,11 @@ Here is the current conversation so far:
 **Instructions**:
 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
-2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
+2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`. DO NOT run `edit_vision_code` or `edit_code_artifact` more than 2 times in a row and instead ask the user for help.
 3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set <let_user_respond> to `true`.
-4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
-5. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.
+4. **Artifacts**: Files are only saved in `artifacts` after <execute_python>, do not try to access artifacts until you observe that they are loaded.
+5. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
+6. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.
 <thinking>Your thoughts here...</thinking>
 <response>Your response to the user here...</response>
@@ -53,7 +54,7 @@ generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect
 <let_user_respond>false</let_user_respond>
 OBSERVATION:
-[Artifact dog_detector.py]
+[Artifact dog_detector.py (5 lines total)]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
@@ -94,7 +95,7 @@ edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect do
 <let_user_respond>false</let_user_respond>
 OBSERVATION:
-[Artifact dog_detector.py]
+[Artifact dog_detector.py (5 lines total)]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
@@ -142,7 +143,7 @@ AGENT: <thinking>The image shows a cat and a dog sitting on the couch, I will te
 EXAMPLES_CODE3 = """
-USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?
+USER: Can you create a function to count workers with helmets, return the count and a visaulization of the bounding boxes?
 OBSERVATION:
 [Artifacts loaded]
@@ -166,9 +167,9 @@ generate_vision_code(artifacts, 'code.py', 'Can you create a function to count w
 <let_user_respond>false</let_user_respond>
 OBSERVATION:
-[Artifact code.py]
+[Artifact code.py (17 lines total)]
 0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
-1|def count_workers_with_helmets(image_path: str, output_path: str):
+1|def count_workers_with_helmets(image_path: str):
 2|    image = load_image(image_path)
 3|    detections = florence2_phrase_grounding("worker, helmet", image)
 4|    workers = [d for d in detections if d['label'] == 'worker']
@@ -182,15 +183,16 @@ OBSERVATION:
 11|                break
 11|        if person_has_helmet:
 13|            count += 1
-14|    overlay_bounding_boxes(image, detections)
-15|    save_image(output_path, image)
-16|    return count
+14|    viz = overlay_bounding_boxes(image, detections)
+16|    return count, viz
 [End of artifact]
-AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code and print the output.</thinking>
+AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code, print the counts and save the visualization.</thinking>
 <execute_python>
 from code import count_workers_with_helmets
-print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
+count, viz = count_workers_with_helmets('/path/to/images/workers.png')
+save_image(viz, 'workers_viz.png')
+print(count)
 </execute_python>
 <let_user_respond>false</let_user_respond>
@@ -201,7 +203,9 @@ OBSERVATION:
 AGENT: <thinking>Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
 <response>The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.</response>
 <let_user_respond>true</let_user_respond>
+"""
+EXAMPLES_CODE3_EXTRA1 = """
 USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
 AGENT: <thinking>Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model</thinking>
@@ -249,3 +253,60 @@ AGENT: <thinking>Three workers with helmets are detected, I will show this to th
 <response>I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
 <let_user_respond>true</let_user_respond>
 """
+EXAMPLES_CODE3_EXTRA2 = """
+USER: Can you write a streamlit app that allows me to upload an image and count the workers with helmets?
+AGENT: <thinking>I will use edit_code_artifact to generate a streamlit app that allows the user to upload an image and count the workers with helmets.</thinking>
+<execute_python>
+CODE = '''
+import streamlit as st
+from code imoprt count_workers_with_helmets
+import os
+def main():
+    st.title("Worker with Helmet Counter")
+    uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
+    if uploaded_file is not None:
+        with open("temp_image.jpg", "wb") as f:
+            f.write(uploaded_file.read())
+        count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
+        st.image("temp_image_viz.jpg")
+        st.write(f"Number of workers with helmets: {count}")
+        os.remove("temp_image.jpg")
+        os.remove("temp_image_viz.jpg")
+if __name__ == "__main__":
+    main()
+'''
+edit_code_artifact(artifacts, 'streamlit_app.py', CODE, 0, 0)
+</execute_python>
+OBSERVATION:
+[Artifact streamlit_app.py (19 lines total)]
+0|import streamlit as st
+1|from code imoprt count_workers_with_helmets
+2|import os
+3|
+4|def main():
+5|    st.title("Worker with Helmet Counter")
+6|    uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
+7|    if uploaded_file is not None:
+8|        with open("temp_image.jpg", "wb") as f:
+9|            f.write(uploaded_file.read())
+10|
+11|        count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
+12|        st.image("temp_image_viz.jpg")
+13|        st.write(f"Number of workers with helmets: {count}")
+14|        os.remove("temp_image.jpg")
+15|        os.remove("temp_image_viz.jpg")
+16|
+17|if __name__ == "__main__":
+18|    main()
+[End of artifact]
+AGENT: <thinking>I have generated the streamlit app that allows the user to upload an image and count the workers</thinking>
+<response>The streamlit app is saved in streamlit_app.py, you can run the app by running `streamlit run streamlit_app.py`.</response>
+<let_user_respond>true</let_user_respond>
+"""

{vision_agent-0.2.174 → vision_agent-0.2.175}/vision_agent/tools/meta_tools.py RENAMED Viewed

@@ -6,15 +6,13 @@ import re
 import subprocess
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 import libcst as cst
 from IPython.display import display
 import vision_agent as va
-from vision_agent.agent.agent_utils import extract_json
 from vision_agent.clients.landing_public_api import LandingPublicAPI
-from vision_agent.lmm import AnthropicLMM
 from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
@@ -26,7 +24,6 @@ CURRENT_FILE = None
 CURRENT_LINE = 0
 DEFAULT_WINDOW_SIZE = 100
 ZMQ_PORT = os.environ.get("ZMQ_PORT", None)
-VERBOSITY = os.environ.get("VERBOSITY", 0)
 def report_progress_callback(port: int, inp: Dict[str, Any]) -> None:
@@ -38,16 +35,6 @@ def report_progress_callback(port: int, inp: Dict[str, Any]) -> None:
     socket.send_json(inp)
-def filter_file(file_name: Union[str, Path]) -> bool:
-    file_name_p = Path(file_name)
-    return (
-        file_name_p.is_file()
-        and "__pycache__" not in str(file_name_p)
-        and file_name_p.suffix in [".py", ".txt"]
-        and not file_name_p.name.startswith(".")
-    )
 def redisplay_results(execution: Execution) -> None:
     """This function is used to add previous execution results to the current output.
     This is handy if you are inside a notebook environment, call it notebook1, and you
@@ -86,8 +73,11 @@ class Artifacts:
     need to be in sync with the remote environment the VisionAgent is running in.
     """
-    def __init__(self, remote_save_path: Union[str, Path]) -> None:
+    def __init__(
+        self, remote_save_path: Union[str, Path], local_save_path: Union[str, Path]
+    ) -> None:
         self.remote_save_path = Path(remote_save_path)
+        self.local_save_path = Path(local_save_path)
         self.artifacts: Dict[str, Any] = {}
         self.code_sandbox_runtime = None
@@ -131,9 +121,7 @@ class Artifacts:
         return output_str
     def save(self, local_path: Optional[Union[str, Path]] = None) -> None:
-        save_path = (
-            Path(local_path) if local_path is not None else self.remote_save_path
-        )
+        save_path = Path(local_path) if local_path is not None else self.local_save_path
         with open(save_path, "wb") as f:
             pkl.dump(self.artifacts, f)
@@ -150,6 +138,38 @@ class Artifacts:
         return name in self.artifacts
+def filter_file(file_name: Union[str, Path]) -> Tuple[bool, bool]:
+    file_name_p = Path(file_name)
+    return (
+        file_name_p.is_file()
+        and "__pycache__" not in str(file_name_p)
+        and not file_name_p.name.startswith(".")
+        and file_name_p.suffix
+        in [".png", ".jpeg", ".jpg", ".mp4", ".txt", ".json", ".csv"]
+    ), file_name_p.suffix in [".png", ".jpeg", ".jpg", ".mp4"]
+def capture_files_into_artifacts(artifacts: Artifacts) -> None:
+    """This function is used to capture all files in the current directory into an
+    artifact object. This is useful if you want to capture all files in the current
+    directory and use them in a different environment where you don't have access to
+    the file system.
+    Parameters:
+        artifacts (Artifacts): The artifacts object to save the files to.
+    """
+    for file in Path(".").glob("**/*"):
+        usable_file, is_media = filter_file(file)
+        mode = "rb" if is_media else "r"
+        if usable_file:
+            file_name = file.name
+            if file_name.startswith(str(Path(artifacts.remote_save_path).parents)):
+                idx = len(Path(artifacts.remote_save_path).parents)
+                file_name = file_name[idx:]
+            with open(file, mode) as f:
+                artifacts[file_name] = f.read()
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
@@ -174,9 +194,9 @@ def view_lines(
         f"[Artifact: {name} ({total_lines} lines total)]\n"
         + format_lines(lines[start:end], start)
         + (
-            "[End of artifact]"
+            "\n[End of artifact]"
             if end == len(lines)
-            else f"[{len(lines) - end} more lines]"
+            else f"\n[{len(lines) - end} more lines]"
         )
     )
@@ -256,8 +276,10 @@ def edit_code_artifact(
     Parameters:
         artifacts (Artifacts): The artifacts object to edit the artifact from.
         name (str): The name of the artifact to edit.
-        start (int): The line number to start the edit.
-        end (int): The line number to end the edit.
+        start (int): The line number to start the edit, can be in [-1, total_lines]
+            where -1 represents the end of the file.
+        end (int): The line number to end the edit, can be in [-1, total_lines] where
+            -1 represents the end of the file.
         content (str): The content to insert.
     """
     # just make the artifact if it doesn't exist instead of forcing agent to call
@@ -266,17 +288,21 @@ def edit_code_artifact(
         artifacts[name] = ""
     total_lines = len(artifacts[name].splitlines())
+    if start == -1:
+        start = total_lines
+    if end == -1:
+        end = total_lines
     if start < 0 or end < 0 or start > end or end > total_lines:
         print("[Invalid line range]")
         return "[Invalid line range]"
-    if start == end:
-        end += 1
     new_content_lines = content.splitlines(keepends=True)
     new_content_lines = [
         line if line.endswith("\n") else line + "\n" for line in new_content_lines
     ]
     lines = artifacts[name].splitlines(keepends=True)
+    lines = [line if line.endswith("\n") else line + "\n" for line in lines]
     edited_lines = lines[:start] + new_content_lines + lines[end:]
     cur_line = start + len(content.split("\n")) // 2
@@ -371,14 +397,16 @@ def generate_vision_plan(
         [End Plan Context]
     """
+    # verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
     if ZMQ_PORT is not None:
         agent = va.agent.VisionAgentPlanner(
             report_progress_callback=lambda inp: report_progress_callback(
                 int(ZMQ_PORT), inp
-            )
+            ),
+            verbosity=0,
         )
     else:
-        agent = va.agent.VisionAgentPlanner()
+        agent = va.agent.VisionAgentPlanner(verbosity=0)
     fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
     response = agent.generate_plan(
@@ -435,14 +463,16 @@ def generate_vision_code(
             dogs = owl_v2("dog", image)
             return dogs
     """
+    # verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
     if ZMQ_PORT is not None:
         agent = va.agent.VisionAgentCoder(
             report_progress_callback=lambda inp: report_progress_callback(
                 int(ZMQ_PORT), inp
-            )
+            ),
+            verbosity=0,
         )
     else:
-        agent = va.agent.VisionAgentCoder(verbosity=int(VERBOSITY))
+        agent = va.agent.VisionAgentCoder(verbosity=0)
     fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
     response = agent.generate_code(
@@ -506,7 +536,8 @@ def edit_vision_code(
             return dogs
     """
-    agent = va.agent.VisionAgentCoder(verbosity=int(VERBOSITY))
+    # verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation
+    agent = va.agent.VisionAgentCoder(verbosity=0)
     if name not in artifacts:
         print(f"[Artifact {name} does not exist]")
         return f"[Artifact {name} does not exist]"
@@ -570,8 +601,9 @@ def check_and_load_image(code: str) -> List[str]:
 def view_media_artifact(artifacts: Artifacts, name: str) -> str:
-    """Allows you to view the media artifact with the given name. This does not show
-    the media to the user, the user can already see all media saved in the artifacts.
+    """Allows only the agent to view the media artifact with the given name. DO NOT use
+    this to show media to the user; the user can already see all media saved in the
+    artifacts.
     Parameters:
         artifacts (Artifacts): The artifacts object to show the image from.
@@ -648,10 +680,10 @@ def get_diff_with_prompts(name: str, before: str, after: str) -> str:
 def use_extra_vision_agent_args(
-    code: str,
+    code: Optional[str],
     test_multi_plan: bool = True,
     custom_tool_names: Optional[List[str]] = None,
-) -> str:
+) -> Optional[str]:
     """This is for forcing arguments passed by the user to VisionAgent into the
     VisionAgentCoder call.
@@ -663,6 +695,8 @@ def use_extra_vision_agent_args(
     Returns:
         Optional[str]: The edited code, or None if code is None.
     """
+    if code is None:
+        return None
     class VisionAgentTransformer(cst.CSTTransformer):
         def __init__(
@@ -815,74 +849,12 @@ def use_object_detection_fine_tuning(
     return diff
-def extract_and_save_files_to_artifacts(
-    artifacts: Artifacts, code: str, obs: str
-) -> None:
-    """Extracts and saves files used in the code to the artifacts object.
-    Parameters:
-        artifacts (Artifacts): The artifacts object to save the files to.
-        code (str): The code to extract the files from.
-    """
-    try:
-        response = extract_json(
-            AnthropicLMM()(  # type: ignore
-                f"""You are a helpful AI assistant. Your job is to look at a snippet of code and the output of running that code and return the file paths that are being saved in the file. Below is the code snippet:
-```python
-{code}
-```
-```output
-{obs}
-```
-Return the file paths in the following JSON format:
-{{"file_paths": ["/path/to/image1.jpg", "/other/path/to/data.json"]}}"""
-            )
-        )
-    except json.JSONDecodeError:
-        return
-    text_file_ext = [
-        ".txt",
-        ".md",
-        "rtf",
-        ".html",
-        ".htm",
-        "xml",
-        ".json",
-        ".csv",
-        ".tsv",
-        ".yaml",
-        ".yml",
-        ".toml",
-        ".conf",
-        ".env" ".ini",
-        ".log",
-        ".py",
-        ".java",
-        ".js",
-        ".cpp",
-        ".c" ".sql",
-        ".sh",
-    ]
-    if "file_paths" in response and isinstance(response["file_paths"], list):
-        for file_path in response["file_paths"]:
-            read_mode = "r" if Path(file_path).suffix in text_file_ext else "rb"
-            if Path(file_path).is_file():
-                with open(file_path, read_mode) as f:
-                    artifacts[Path(file_path).name] = f.read()
 META_TOOL_DOCSTRING = get_tool_documentation(
     [
         get_tool_descriptions,
         open_code_artifact,
         create_code_artifact,
         edit_code_artifact,
-        generate_vision_plan,
         generate_vision_code,
         edit_vision_code,
         view_media_artifact,

{vision_agent-0.2.174 → vision_agent-0.2.175}/vision_agent/utils/execute.py RENAMED Viewed

@@ -575,6 +575,7 @@ class LocalCodeInterpreter(CodeInterpreter):
         super().__init__(timeout=timeout)
         self.nb = nbformat.v4.new_notebook()
         # Set the notebook execution path to the remote path
+        self.remote_path = Path(remote_path if remote_path is not None else WORKSPACE)
         self.resources = {"metadata": {"path": str(self.remote_path)}}
         self.nb_client = NotebookClient(
             self.nb,
@@ -591,7 +592,6 @@ Timeout: {self.timeout}"""
         )
         sleep(1)
         self._new_kernel()
-        self.remote_path = Path(remote_path if remote_path is not None else WORKSPACE)
     def _new_kernel(self) -> None:
         if self.nb_client.kc is None or not run_sync(self.nb_client.kc.is_alive)():  # type: ignore
@@ -659,7 +659,7 @@ Timeout: {self.timeout}"""
     def download_file(
         self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]
     ) -> Path:
-        with open(self.remote_path / remote_file_path, "rb") as f:
+        with open(self.remote_path / Path(remote_file_path).name, "rb") as f:
             contents = f.read()
         with open(local_file_path, "wb") as f:
             f.write(contents)