PyPI - vision-agent - Versions diffs - 0.2.158__tar.gz → 0.2.159__tar.gz - Mend

vision-agent 0.2.158.tar.gz → 0.2.159.tar.gz

Files changed (33) hide show

{vision_agent-0.2.158 → vision_agent-0.2.159}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.158
+Version: 0.2.159
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai

{vision_agent-0.2.158 → vision_agent-0.2.159}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.158"
+version = "0.2.159"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"

{vision_agent-0.2.158 → vision_agent-0.2.159}/vision_agent/agent/vision_agent.py RENAMED Viewed

@@ -149,6 +149,32 @@ def execute_user_code_action(
     return user_result, user_obs
+def add_step_descriptions(response: Dict[str, str]) -> Dict[str, str]:
+    response = copy.deepcopy(response)
+    if "response" in response:
+        resp_str = response["response"]
+        if "<execute_python>" in resp_str:
+            # only include descriptions for these, the rest will just have executing
+            # code
+            description_map = {
+                "open_code_artifact": "Reading file.",
+                "create_code_artifact": "Creating file.",
+                "edit_code_artifact": "Editing file.",
+                "generate_vision_code": "Generating vision code.",
+                "edit_vision_code": "Editing vision code.",
+            }
+            description = ""
+            for k, v in description_map.items():
+                if k in resp_str:
+                    description += v + " "
+            if description == "":
+                description = "Executing code."
+            resp_str = resp_str[resp_str.find("<execute_python>") :]
+            resp_str = description + resp_str
+        response["response"] = resp_str
+    return response
 class VisionAgent(Agent):
     """Vision Agent is an agent that can chat with the user and call tools or other
     agents to generate code for it. Vision Agent uses python code to execute actions
@@ -335,8 +361,18 @@ class VisionAgent(Agent):
                 response = run_conversation(self.agent, int_chat)
                 if self.verbosity >= 1:
                     _LOGGER.info(response)
-                int_chat.append({"role": "assistant", "content": str(response)})
-                orig_chat.append({"role": "assistant", "content": str(response)})
+                int_chat.append(
+                    {
+                        "role": "assistant",
+                        "content": str(add_step_descriptions(response)),
+                    }
+                )
+                orig_chat.append(
+                    {
+                        "role": "assistant",
+                        "content": str(add_step_descriptions(response)),
+                    }
+                )
                 # sometimes it gets stuck in a loop, so we force it to exit
                 if last_response == response:
@@ -382,6 +418,16 @@ class VisionAgent(Agent):
                     obs_chat_elt: Message = {"role": "observation", "content": obs}
                     if media_obs and result.success:
+                        # for view_media_artifact, we need to ensure the media is loaded
+                        # locally so the conversation agent can actually see it
+                        code_interpreter.download_file(
+                            str(remote_artifacts_path.name),
+                            str(self.local_artifacts_path),
+                        )
+                        artifacts.load(
+                            self.local_artifacts_path,
+                            Path(self.local_artifacts_path).parent,
+                        )
                         obs_chat_elt["media"] = [
                             Path(self.local_artifacts_path).parent / media_ob
                             for media_ob in media_obs
@@ -407,8 +453,9 @@ class VisionAgent(Agent):
             code_interpreter.download_file(
                 str(remote_artifacts_path.name), str(self.local_artifacts_path)
             )
-            artifacts.load(self.local_artifacts_path)
-            artifacts.save()
+            artifacts.load(
+                self.local_artifacts_path, Path(self.local_artifacts_path).parent
+            )
         return orig_chat, artifacts
     def streaming_message(self, message: Dict[str, Any]) -> None:

{vision_agent-0.2.158 → vision_agent-0.2.159}/vision_agent/tools/meta_tools.py RENAMED Viewed

@@ -92,19 +92,26 @@ class Artifacts:
         self.code_sandbox_runtime = None
-    def load(self, file_path: Union[str, Path]) -> None:
-        """Loads are artifacts into the remote environment. If an artifact value is None
-        it will skip loading it.
+    def load(
+        self,
+        artifacts_path: Union[str, Path],
+        load_to: Optional[Union[str, Path]] = None,
+    ) -> None:
+        """Loads are artifacts into the load_to path. If load_to is None, it will load
+        into remote_save_path. If an artifact value is None it will skip loading it.
         Parameters:
-            file_path (Union[str, Path]): The file path to load the artifacts from
+            artifacts_path (Union[str, Path]): The file path to load the artifacts from
         """
-        with open(file_path, "rb") as f:
+        with open(artifacts_path, "rb") as f:
             self.artifacts = pkl.load(f)
+        load_to = self.remote_save_path.parent if load_to is None else Path(load_to)
         for k, v in self.artifacts.items():
             if v is not None:
                 mode = "w" if isinstance(v, str) else "wb"
-                with open(self.remote_save_path.parent / k, mode) as f:
+                with open(load_to / k, mode) as f:
                     f.write(v)
     def show(self, uploaded_file_path: Optional[Union[str, Path]] = None) -> str:

{vision_agent-0.2.158 → vision_agent-0.2.159}/vision_agent/tools/tools.py RENAMED Viewed

@@ -700,6 +700,7 @@ def countgd_counting(
             {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
         ]
     """
+    image_size = image.shape[:2]
     buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
     prompt = prompt.replace(", ", " .")
@@ -712,7 +713,7 @@ def countgd_counting(
     bboxes_formatted = [
         ODResponseData(
             label=bbox["label"],
-            bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
+            bbox=normalize_bbox(bbox["bounding_box"], image_size),
             score=round(bbox["score"], 2),
         )
         for bbox in bboxes_per_frame
@@ -757,6 +758,7 @@ def countgd_example_based_counting(
             {'score': 0.98, 'label': 'object', 'bounding_box': [0.44, 0.24, 0.49, 0.58},
         ]
     """
+    image_size = image.shape[:2]
     buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
     visual_prompts = [
@@ -771,7 +773,7 @@ def countgd_example_based_counting(
     bboxes_formatted = [
         ODResponseData(
             label=bbox["label"],
-            bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
+            bbox=normalize_bbox(bbox["bounding_box"], image_size),
             score=round(bbox["score"], 2),
         )
         for bbox in bboxes_per_frame