vision-agent 0.2.111__tar.gz → 0.2.113__tar.gz

Files changed (33)
  1. {vision_agent-0.2.111 → vision_agent-0.2.113}/PKG-INFO +1 -1
  2. {vision_agent-0.2.111 → vision_agent-0.2.113}/pyproject.toml +1 -1
  3. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/agent/agent_utils.py +3 -8
  4. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/agent/vision_agent_coder.py +39 -23
  5. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/agent/vision_agent_coder_prompts.py +9 -7
  6. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/agent/vision_agent_prompts.py +11 -10
  7. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/tools/__init__.py +12 -7
  8. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/tools/meta_tools.py +0 -1
  9. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/tools/tool_utils.py +33 -3
  10. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/tools/tools.py +357 -111
  11. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/utils/execute.py +0 -1
  12. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/utils/image_utils.py +52 -0
  13. {vision_agent-0.2.111 → vision_agent-0.2.113}/LICENSE +0 -0
  14. {vision_agent-0.2.111 → vision_agent-0.2.113}/README.md +0 -0
  15. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/__init__.py +0 -0
  16. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/agent/__init__.py +0 -0
  17. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/agent/agent.py +0 -0
  18. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/agent/vision_agent.py +0 -0
  19. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/clients/__init__.py +0 -0
  20. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/clients/http.py +0 -0
  21. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/clients/landing_public_api.py +1 -1
  22. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/fonts/__init__.py +0 -0
  23. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
  24. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/lmm/__init__.py +0 -0
  25. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/lmm/lmm.py +0 -0
  26. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/lmm/types.py +0 -0
  27. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/tools/prompts.py +0 -0
  28. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/tools/tools_types.py +0 -0
  29. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/utils/__init__.py +0 -0
  30. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/utils/exceptions.py +0 -0
  31. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/utils/sim.py +0 -0
  32. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/utils/type_defs.py +0 -0
  33. {vision_agent-0.2.111 → vision_agent-0.2.113}/vision_agent/utils/video.py +0 -0
--- vision_agent-0.2.111/PKG-INFO
+++ vision_agent-0.2.113/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.111
+Version: 0.2.113
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
--- vision_agent-0.2.111/pyproject.toml
+++ vision_agent-0.2.113/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.111"
+version = "0.2.113"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"
--- vision_agent-0.2.111/vision_agent/agent/agent_utils.py
+++ vision_agent-0.2.113/vision_agent/agent/agent_utils.py
@@ -4,14 +4,13 @@ import sys
 from typing import Any, Dict
 
 logging.basicConfig(stream=sys.stdout)
-_LOGGER = logging.getLogger(__name__)
 
 
 def extract_json(json_str: str) -> Dict[str, Any]:
     try:
+        json_str = json_str.replace("\n", " ")
         json_dict = json.loads(json_str)
     except json.JSONDecodeError:
-        input_json_str = json_str
         if "```json" in json_str:
             json_str = json_str[json_str.find("```json") + len("```json") :]
             json_str = json_str[: json_str.find("```")]
@@ -19,12 +18,8 @@ def extract_json(json_str: str) -> Dict[str, Any]:
             json_str = json_str[json_str.find("```") + len("```") :]
             # get the last ``` not one from an intermediate string
             json_str = json_str[: json_str.find("}```")]
-        try:
-            json_dict = json.loads(json_str)
-        except json.JSONDecodeError as e:
-            error_msg = f"Could not extract JSON from the given str: {json_str}.\nFunction input:\n{input_json_str}"
-            _LOGGER.exception(error_msg)
-            raise ValueError(error_msg) from e
+
+        json_dict = json.loads(json_str)
     return json_dict  # type: ignore
 
 
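The reworked `extract_json` strips newlines before the first parse, falls back to slicing out a fenced block, and now lets `json.JSONDecodeError` propagate instead of logging and raising `ValueError`. A minimal sketch of the new behavior, with hypothetical inputs:

```python
# Minimal sketch (hypothetical inputs): plain JSON and fenced ```json blocks
# both parse; an unparseable string now surfaces as json.JSONDecodeError,
# which callers such as pick_plan retry on (see below).
from vision_agent.agent.agent_utils import extract_json

print(extract_json('{"best_plan": "plan1"}'))         # {'best_plan': 'plan1'}
print(extract_json('Sure!\n```json\n{"a": 1}\n```'))  # {'a': 1}

try:
    extract_json("not json at all")
except Exception as e:
    print(type(e).__name__)  # JSONDecodeError
```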
--- vision_agent-0.2.111/vision_agent/agent/vision_agent_coder.py
+++ vision_agent-0.2.113/vision_agent/agent/vision_agent_coder.py
@@ -4,6 +4,7 @@ import logging
 import os
 import sys
 import tempfile
+from json import JSONDecodeError
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast
 
@@ -86,8 +87,8 @@ def format_memory(memory: List[Dict[str, str]]) -> str:
 def format_plans(plans: Dict[str, Any]) -> str:
     plan_str = ""
     for k, v in plans.items():
-        plan_str += f"{k}:\n"
-        plan_str += "-" + "\n-".join([e["instructions"] for e in v])
+        plan_str += "\n" + f"{k}: {v['thoughts']}\n"
+        plan_str += " -" + "\n -".join([e for e in v["instructions"]])
 
     return plan_str
 
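The plan objects that `format_plans` (and `retrieve_tools` below) consume changed shape in this release: each plan is now a dict carrying `"thoughts"` plus a flat list of instruction strings, rather than a list of `{"instructions": ...}` dicts. A hypothetical instance:

```python
# Hypothetical plan dict in the new schema consumed by format_plans.
plans = {
    "plan1": {
        "thoughts": "owl_v2 is a good fit for open-vocabulary detection.",
        "instructions": [
            "Load the image with load_image",
            "Detect dogs with owl_v2",
        ],
    },
}

print(format_plans(plans))
# plan1: owl_v2 is a good fit for open-vocabulary detection.
#  -Load the image with load_image
#  -Detect dogs with owl_v2
```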
@@ -127,7 +128,11 @@ def write_plans(
 
     user_request = chat[-1]["content"]
     context = USER_REQ.format(user_request=user_request)
-    prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
+    prompt = PLAN.format(
+        context=context,
+        tool_desc=tool_desc,
+        feedback=working_memory,
+    )
     chat[-1]["content"] = prompt
     return extract_json(model(chat, stream=False))  # type: ignore
 
@@ -228,13 +233,11 @@ def pick_plan(
                 "status": "completed" if tool_output.success else "failed",
             }
         )
-        tool_output_str = ""
-        if len(tool_output.logs.stdout) > 0:
-            tool_output_str = tool_output.logs.stdout[0]
+        tool_output_str = tool_output.text().strip()
 
         if verbosity == 2:
             _print_code("Code and test after attempted fix:", code)
-            _LOGGER.info(f"Code execution result after attempte {count}")
+            _LOGGER.info(f"Code execution result after attempt {count}")
 
         count += 1
 
@@ -251,7 +254,21 @@ def pick_plan(
         tool_output=tool_output_str[:20_000],
     )
     chat[-1]["content"] = prompt
-    best_plan = extract_json(model(chat, stream=False))  # type: ignore
+
+    count = 0
+    best_plan = None
+    while best_plan is None and count < max_retries:
+        try:
+            best_plan = extract_json(model(chat, stream=False))  # type: ignore
+        except JSONDecodeError as e:
+            _LOGGER.exception(
+                f"Error while extracting JSON during picking best plan {str(e)}"
+            )
+            pass
+        count += 1
+
+    if best_plan is None:
+        best_plan = {"best_plan": list(plans.keys())[0]}
 
     if verbosity >= 1:
         _LOGGER.info(f"Best plan:\n{best_plan}")
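Since Python dicts preserve insertion order, the fallback `list(plans.keys())[0]` deterministically picks the first plan the planner emitted. A hypothetical illustration:

```python
# Hypothetical illustration of the fallback: if JSON extraction still fails
# after max_retries attempts, the first plan key wins (dicts preserve
# insertion order in Python 3.7+).
plans = {"plan1": {"thoughts": "...", "instructions": []},
         "plan2": {"thoughts": "...", "instructions": []}}
best_plan = {"best_plan": list(plans.keys())[0]}  # {'best_plan': 'plan1'}
```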
@@ -525,7 +542,7 @@ def _print_code(title: str, code: str, test: Optional[str] = None) -> None:
 
 
 def retrieve_tools(
-    plans: Dict[str, List[Dict[str, str]]],
+    plans: Dict[str, Dict[str, Any]],
     tool_recommender: Sim,
     log_progress: Callable[[Dict[str, Any]], None],
     verbosity: int = 0,
@@ -542,8 +559,8 @@ def retrieve_tools(
     tool_lists: Dict[str, List[Dict[str, str]]] = {}
     for k, plan in plans.items():
         tool_lists[k] = []
-        for task in plan:
-            tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3)
+        for task in plan["instructions"]:
+            tools = tool_recommender.top_k(task, k=2, thresh=0.3)
             tool_info.extend([e["doc"] for e in tools])
             tool_desc.extend([e["desc"] for e in tools])
             tool_lists[k].extend(
@@ -661,6 +678,7 @@ class VisionAgentCoder(Agent):
         chat: List[Message],
         test_multi_plan: bool = True,
         display_visualization: bool = False,
+        customized_tool_names: Optional[List[str]] = None,
     ) -> Dict[str, Any]:
         """Chat with VisionAgentCoder and return intermediate information regarding the
         task.
@@ -676,6 +694,8 @@ class VisionAgentCoder(Agent):
                 with the first plan.
             display_visualization (bool): If True, it opens a new window locally to
                 show the image(s) created by visualization code (if there is any).
+            customized_tool_names (List[str]): A list of customized tools for the agent to pick and use.
+                If not provided, it defaults to the full tool set from vision_agent.tools.
 
         Returns:
             Dict[str, Any]: A dictionary containing the code, test, test result, plan,
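For callers, the new parameter restricts which tools the planner may draw on. A hedged usage sketch (the chat method shown above is `chat_with_workflow`; the import path, message format, `owl_v2` tool name, and media path are assumptions not shown in this diff):

```python
# Usage sketch for customized_tool_names; owl_v2 is assumed to be a
# registered function tool, and the media path is hypothetical.
from vision_agent.agent import VisionAgentCoder

agent = VisionAgentCoder()
result = agent.chat_with_workflow(
    [{"role": "user", "content": "Detect the dogs", "media": ["dog.jpg"]}],
    customized_tool_names=["owl_v2"],  # planner sees only owl_v2 + util tools
)
```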
@@ -729,7 +749,9 @@ class VisionAgentCoder(Agent):
             )
             plans = write_plans(
                 int_chat,
-                T.TOOL_DESCRIPTIONS,
+                T.get_tool_descriptions_by_names(
+                    customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS  # type: ignore
+                ),
                 format_memory(working_memory),
                 self.planner,
             )
@@ -737,18 +759,10 @@ class VisionAgentCoder(Agent):
             if self.verbosity >= 1:
                 for p in plans:
                     # tabulate will fail if the keys are not the same for all elements
-                    p_fixed = [
-                        {
-                            "instructions": (
-                                e["instructions"] if "instructions" in e else ""
-                            )
-                        }
-                        for e in plans[p]
-                    ]
+                    p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
                     _LOGGER.info(
                         f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
                     )
-
             tool_infos = retrieve_tools(
                 plans,
                 self.tool_recommender,
@@ -793,13 +807,15 @@ class VisionAgentCoder(Agent):
             )
 
             if self.verbosity >= 1:
+                plan_i_fixed = [{"instructions": e} for e in plan_i["instructions"]]
                 _LOGGER.info(
-                    f"Picked best plan:\n{tabulate(tabular_data=plan_i, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
+                    f"Picked best plan:\n{tabulate(tabular_data=plan_i_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
                 )
 
             results = write_and_test_code(
                 chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
-                plan="\n-" + "\n-".join([e["instructions"] for e in plan_i]),
+                plan=f"\n{plan_i['thoughts']}\n-"
+                + "\n-".join([e for e in plan_i["instructions"]]),
                 tool_info=tool_info,
                 tool_output=tool_output_str,
                 tool_utils=T.UTILITIES_DOCSTRING,
--- vision_agent-0.2.111/vision_agent/agent/vision_agent_coder_prompts.py
+++ vision_agent-0.2.113/vision_agent/agent/vision_agent_coder_prompts.py
@@ -30,18 +30,19 @@ PLAN = """
 
 **Instructions**:
 1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
-2. Output three different plans each utilize a different strategy or tool.
+2. Output three different plans each utilize a different strategy or set of tools.
 
 Output a list of jsons in the following format
 
 ```json
 {{
     "plan1":
-        [
-            {{
-                "instructions": str # what you should do in this task associated with a tool
-            }}
-        ],
+        {{
+            "thoughts": str # your thought process for choosing this plan
+            "instructions": [
+                str # what you should do in this task associated with a tool
+            ]
+        }},
     "plan2": ...,
     "plan3": ...
 }}
@@ -127,7 +128,8 @@ PICK_PLAN = """
 
 **Instructions**:
 1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
-2. Output a JSON object with the following format:
+2. Try solving the problem yourself given the image and pick the plan that matches your solution the best.
+3. Output a JSON object with the following format:
 {{
     "thoughts": str # your thought process for choosing the best plan
     "best_plan": str # the best plan you have chosen
--- vision_agent-0.2.111/vision_agent/agent/vision_agent_prompts.py
+++ vision_agent-0.2.113/vision_agent/agent/vision_agent_prompts.py
@@ -15,7 +15,7 @@ This is the documentation for the different actions you can take:
 **Examples**:
 Here is an example of how you can interact with a user and Actions to complete a task:
 --- START EXAMPLES ---
-[Current directory: /workspace/test]
+[Current directory: /example/workspace]
 {examples}
 --- END EXAMPLES ---
 
@@ -27,16 +27,17 @@ Here is an example of how you can interact with a user and Actions to complete a
 Here is the current conversation so far:
 --- START CONVERSATION ---
 [Current directory: {dir}]
+
 {conversation}
 """
 
 EXAMPLES_CODE1 = """
 USER: Can you detect the dogs in this image? Media name dog.jpg
 
-AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /workspace/test/dog_detector.py]
+[File /example/workspace/dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
@@ -44,7 +45,7 @@ OBSERVATION:
 4|    return dogs
 [End of file]
 
-AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
@@ -55,10 +56,10 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask
 
 USER: The the image only has one dog, can you fix this?
 
-AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/workspace/test/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /workspace/test/dog_detector.py]
+[File /example/workspace/dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
@@ -66,7 +67,7 @@ OBSERVATION:
 4|    return dogs
 [End of file]
 
-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
@@ -82,10 +83,10 @@ AGENT: {"thoughts": "The user has asked to count workers with helmets but has no
 
 USER: Yes you can use workers.png
 
-AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/code.py', 'Can you write code to count workers with helmets in this image?', media=['/workspace/test/workers.png'])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /workspace/test/code.py]
+[File /example/workspace/code.py]
 0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
 1|def count_workers_with_helmets(image_path: str):
 2|    image = load_image(image_path)
@@ -104,7 +105,7 @@ OBSERVATION:
 15|    return count
 [End of file]
 
-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/workspace/test/workers.png'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
--- vision_agent-0.2.111/vision_agent/tools/__init__.py
+++ vision_agent-0.2.113/vision_agent/tools/__init__.py
@@ -1,15 +1,16 @@
 from typing import Callable, List, Optional
 
-from .meta_tools import (
-    META_TOOL_DOCSTRING,
-)
+from .meta_tools import META_TOOL_DOCSTRING
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
+from .tool_utils import get_tool_descriptions_by_names
 from .tools import (
+    FUNCTION_TOOLS,
     TOOL_DESCRIPTIONS,
     TOOL_DOCSTRING,
     TOOLS,
     TOOLS_DF,
     TOOLS_INFO,
+    UTIL_TOOLS,
     UTILITIES_DOCSTRING,
     blip_image_caption,
     clip,
@@ -19,16 +20,20 @@ from .tools import (
     detr_segmentation,
     dpt_hybrid_midas,
     extract_frames,
-    florencev2_image_caption,
-    florencev2_object_detection,
-    florencev2_roberta_vqa,
-    florencev2_ocr,
+    florence2_image_caption,
+    florence2_object_detection,
+    florence2_ocr,
+    florence2_roberta_vqa,
+    florence2_sam2_image,
+    florence2_sam2_video,
     generate_pose_image,
     generate_soft_edge_image,
     get_tool_documentation,
     git_vqa_v2,
     grounding_dino,
     grounding_sam,
+    ixc25_image_vqa,
+    ixc25_video_vqa,
     load_image,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,
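Note the breaking rename here: every `florencev2_*` export becomes `florence2_*`, and new tools (`florence2_sam2_image`, `florence2_sam2_video`, `ixc25_image_vqa`, `ixc25_video_vqa`) are exported. A migration sketch for downstream code:

```python
# Migration sketch: imports that worked in 0.2.111 break in 0.2.113 and need
# the new florence2_* names.
# 0.2.111:
#   from vision_agent.tools import florencev2_ocr, florencev2_roberta_vqa
# 0.2.113:
from vision_agent.tools import florence2_ocr, florence2_roberta_vqa
```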
--- vision_agent-0.2.111/vision_agent/tools/meta_tools.py
+++ vision_agent-0.2.113/vision_agent/tools/meta_tools.py
@@ -8,7 +8,6 @@ from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
 
-
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
 
 CURRENT_FILE = None
--- vision_agent-0.2.111/vision_agent/tools/tool_utils.py
+++ vision_agent-0.2.113/vision_agent/tools/tool_utils.py
@@ -1,7 +1,7 @@
 import inspect
 import logging
 import os
-from typing import Any, Callable, Dict, List, MutableMapping, Optional
+from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple
 
 import pandas as pd
 from IPython.display import display
@@ -31,6 +31,7 @@ class ToolCallTrace(BaseModel):
 def send_inference_request(
     payload: Dict[str, Any],
     endpoint_name: str,
+    files: Optional[List[Tuple[Any, ...]]] = None,
     v2: bool = False,
     metadata_payload: Optional[Dict[str, Any]] = None,
 ) -> Dict[str, Any]:
@@ -50,7 +51,7 @@ def send_inference_request(
         response={},
         error=None,
     )
-    headers = {"Content-Type": "application/json", "apikey": _LND_API_KEY}
+    headers = {"apikey": _LND_API_KEY}
     if "TOOL_ENDPOINT_AUTH" in os.environ:
         headers["Authorization"] = os.environ["TOOL_ENDPOINT_AUTH"]
         headers.pop("apikey")
@@ -60,7 +61,11 @@ def send_inference_request(
         num_retry=3,
         headers=headers,
     )
-    res = session.post(url, json=payload)
+
+    if files is not None:
+        res = session.post(url, data=payload, files=files)
+    else:
+        res = session.post(url, json=payload)
     if res.status_code != 200:
         tool_call_trace.error = Error(
             name="RemoteToolCallFailed",
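Dropping the hard-coded `Content-Type` header above is what makes the new `files=` path work: with `json=` the underlying `requests` session sets `application/json` itself, while with `files=` it must set its own `multipart/form-data` boundary. A hedged sketch of the two call shapes (endpoint and field names are hypothetical):

```python
# Sketch of the two request paths (hypothetical endpoint and field names).
from vision_agent.tools.tool_utils import send_inference_request

# JSON body, as before:
send_inference_request({"prompt": "dog"}, "text-endpoint", v2=True)

# Multipart body: payload entries become form fields, files are uploaded.
with open("dog.jpg", "rb") as f:
    send_inference_request(
        {"prompt": "dog"},
        "image-endpoint",
        files=[("image", f.read())],
        v2=True,
    )
```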
@@ -137,6 +142,31 @@ def get_tool_descriptions(funcs: List[Callable[..., Any]]) -> str:
     return descriptions
 
 
+def get_tool_descriptions_by_names(
+    tool_name: Optional[List[str]],
+    funcs: List[Callable[..., Any]],
+    util_funcs: List[
+        Callable[..., Any]
+    ],  # util_funcs will always be added to the list of functions
+) -> str:
+    if tool_name is None:
+        return get_tool_descriptions(funcs + util_funcs)
+
+    invalid_names = [
+        name for name in tool_name if name not in {func.__name__ for func in funcs}
+    ]
+
+    if invalid_names:
+        raise ValueError(f"Invalid customized tool names: {', '.join(invalid_names)}")
+
+    filtered_funcs = (
+        funcs
+        if not tool_name
+        else [func for func in funcs if func.__name__ in tool_name]
+    )
+    return get_tool_descriptions(filtered_funcs + util_funcs)
+
+
 def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
     data: Dict[str, List[str]] = {"desc": [], "doc": []}
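A hedged usage sketch of the new helper (assuming `owl_v2` is among the exported `FUNCTION_TOOLS`): it validates names against `funcs`, always appends the utility tools, and falls back to the full set when the name list is `None` or empty.

```python
# Usage sketch; assumes owl_v2 is registered in FUNCTION_TOOLS.
from vision_agent.tools import FUNCTION_TOOLS, UTIL_TOOLS
from vision_agent.tools.tool_utils import get_tool_descriptions_by_names

# Restrict the planner's view to owl_v2 plus the always-included util tools.
desc = get_tool_descriptions_by_names(["owl_v2"], FUNCTION_TOOLS, UTIL_TOOLS)

# None (or an empty list) means the full tool set:
all_desc = get_tool_descriptions_by_names(None, FUNCTION_TOOLS, UTIL_TOOLS)

# Unknown names fail fast:
# get_tool_descriptions_by_names(["not_a_tool"], FUNCTION_TOOLS, UTIL_TOOLS)
# -> ValueError: Invalid customized tool names: not_a_tool
```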