vision-agent 0.2.236__py3-none-any.whl → 0.2.237__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/.sim_tools/df.csv +57 -80
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/agent/agent.py +2 -2
- vision_agent/agent/vision_agent.py +3 -2
- vision_agent/agent/vision_agent_coder.py +13 -19
- vision_agent/agent/vision_agent_coder_v2.py +17 -17
- vision_agent/agent/vision_agent_planner.py +16 -21
- vision_agent/agent/vision_agent_planner_prompts_v2.py +19 -20
- vision_agent/agent/vision_agent_planner_v2.py +29 -15
- vision_agent/agent/vision_agent_v2.py +12 -12
- vision_agent/clients/landing_public_api.py +1 -1
- vision_agent/configs/config.py +17 -3
- vision_agent/lmm/__init__.py +0 -1
- vision_agent/lmm/lmm.py +4 -3
- vision_agent/models/__init__.py +11 -0
- vision_agent/{lmm/types.py → models/lmm_types.py} +4 -1
- vision_agent/sim/__init__.py +8 -0
- vision_agent/{utils → sim}/sim.py +3 -3
- vision_agent/tools/__init__.py +10 -23
- vision_agent/tools/meta_tools.py +4 -5
- vision_agent/tools/planner_tools.py +127 -37
- vision_agent/tools/tools.py +388 -302
- vision_agent/utils/__init__.py +0 -1
- vision_agent/{agent/agent_utils.py → utils/agent.py} +11 -2
- vision_agent/utils/image_utils.py +18 -7
- vision_agent/{tools/tool_utils.py → utils/tools.py} +1 -93
- vision_agent/utils/tools_doc.py +87 -0
- vision_agent/utils/video.py +15 -0
- vision_agent/utils/video_tracking.py +38 -5
- {vision_agent-0.2.236.dist-info → vision_agent-0.2.237.dist-info}/METADATA +2 -2
- vision_agent-0.2.237.dist-info/RECORD +55 -0
- vision_agent-0.2.236.dist-info/RECORD +0 -52
- /vision_agent/{agent/types.py → models/agent_types.py} +0 -0
- /vision_agent/{tools → models}/tools_types.py +0 -0
- {vision_agent-0.2.236.dist-info → vision_agent-0.2.237.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.236.dist-info → vision_agent-0.2.237.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent_v2.py CHANGED
@@ -4,23 +4,23 @@ from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast

 from vision_agent.agent import Agent, AgentCoder, VisionAgentCoderV2
-from vision_agent.agent.agent_utils import (
-    add_media_to_chat,
-    convert_message_to_agentmessage,
-    extract_tag,
-    format_conversation,
-)
-from vision_agent.agent.types import (
+from vision_agent.agent.vision_agent_coder_v2 import format_code_context
+from vision_agent.agent.vision_agent_prompts_v2 import CONVERSATION
+from vision_agent.configs import Config
+from vision_agent.lmm import LMM
+from vision_agent.models import (
     AgentMessage,
     CodeContext,
     InteractionContext,
+    Message,
     PlanContext,
 )
-from vision_agent.agent.vision_agent_coder_v2 import format_code_context
-from vision_agent.agent.vision_agent_prompts_v2 import CONVERSATION
-from vision_agent.configs import Config
-from vision_agent.lmm import LMM
-from vision_agent.lmm.types import Message
+from vision_agent.utils.agent import (
+    add_media_to_chat,
+    convert_message_to_agentmessage,
+    extract_tag,
+    format_conversation,
+)
 from vision_agent.utils.execute import CodeInterpreter, CodeInterpreterFactory

 CONFIG = Config()

vision_agent/clients/landing_public_api.py CHANGED
@@ -5,7 +5,7 @@ from uuid import UUID
 from requests.exceptions import HTTPError

 from vision_agent.clients.http import BaseHTTP
-from vision_agent.tools.tools_types import BboxInputBase64, JobStatus, PromptTask
+from vision_agent.models import BboxInputBase64, JobStatus, PromptTask
 from vision_agent.utils.exceptions import FineTuneModelNotFound
 from vision_agent.utils.type_defs import LandingaiAPIKey

vision_agent/configs/config.py CHANGED
@@ -96,13 +96,24 @@ class Config(BaseModel):
         }
     )

+    # for get_tool_for_task
+    od_judge: Type[LMM] = Field(default=AnthropicLMM)
+    od_judge_kwargs: dict = Field(
+        default_factory=lambda: {
+            "model_name": "claude-3-5-sonnet-20241022",
+            "temperature": 0.0,
+            "image_size": 512,
+        }
+    )
+
     # for suggestions module
-    suggester: Type[LMM] = Field(default=
+    suggester: Type[LMM] = Field(default=OpenAILMM)
     suggester_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "
+            "model_name": "o1",
             "temperature": 1.0,
-            "
+            "image_detail": "high",
+            "image_size": 1024,
         }
     )

@@ -143,6 +154,9 @@ class Config(BaseModel):
     def create_tool_chooser(self) -> LMM:
         return self.tool_chooser(**self.tool_chooser_kwargs)

+    def create_od_judge(self) -> LMM:
+        return self.od_judge(**self.od_judge_kwargs)
+
     def create_suggester(self) -> LMM:
         return self.suggester(**self.suggester_kwargs)

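Usage sketch (not part of the diff; assumes the default config above and an Anthropic API key in the environment):

    from vision_agent.configs import Config

    CONFIG = Config()
    # Instantiates AnthropicLMM with the od_judge_kwargs defaults:
    # model_name="claude-3-5-sonnet-20241022", temperature=0.0, image_size=512
    od_judge = CONFIG.create_od_judge()
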
vision_agent/lmm/__init__.py CHANGED

vision_agent/lmm/lmm.py CHANGED
@@ -9,10 +9,9 @@ import requests
 from anthropic.types import ImageBlockParam, MessageParam, TextBlockParam
 from openai import AzureOpenAI, OpenAI

+from vision_agent.models import Message
 from vision_agent.utils.image_utils import encode_media

-from .types import Message
-

 class LMM(ABC):
     @abstractmethod

@@ -64,7 +63,9 @@ class OpenAILMM(LMM):
         self.image_size = image_size
         self.image_detail = image_detail
         # o1 does not use max_tokens
-        if "max_tokens" not in kwargs and not model_name.startswith("o1"):
+        if "max_tokens" not in kwargs and not (
+            model_name.startswith("o1") or model_name.startswith("o3")
+        ):
             kwargs["max_tokens"] = max_tokens
         if json_mode:
             kwargs["response_format"] = {"type": "json_object"}

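Effect of the widened guard, as a sketch (assumes OPENAI_API_KEY is set; the model names are illustrative):

    from vision_agent.lmm import OpenAILMM

    chat = OpenAILMM(model_name="gpt-4o")  # max_tokens is injected into kwargs
    o3 = OpenAILMM(model_name="o3-mini")   # max_tokens is omitted; the o1/o3
                                           # reasoning models do not accept it
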
vision_agent/models/__init__.py ADDED
@@ -0,0 +1,11 @@
+from .agent_types import AgentMessage, CodeContext, InteractionContext, PlanContext
+from .lmm_types import Message, TextOrImage
+from .tools_types import (
+    BboxInput,
+    BboxInputBase64,
+    BoundingBoxes,
+    Florence2FtRequest,
+    JobStatus,
+    ODResponseData,
+    PromptTask,
+)
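The new package gives the shared types a single import root, e.g. (sketch):

    # 0.2.236: from vision_agent.lmm.types import Message
    #          from vision_agent.tools.tools_types import PromptTask
    # 0.2.237:
    from vision_agent.models import AgentMessage, Message, PromptTask
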
vision_agent/{lmm/types.py → models/lmm_types.py} RENAMED
@@ -1,7 +1,10 @@
 from pathlib import Path
 from typing import Dict, Sequence, Union

+import numpy as np
+from PIL.Image import Image as ImageType
+
 from vision_agent.utils.execute import Execution

-TextOrImage = Union[str, Sequence[Union[str, Path]]]
+TextOrImage = Union[str, Sequence[Union[str, Path, ImageType, np.ndarray]]]
 Message = Dict[str, Union[TextOrImage, Execution]]
vision_agent/{utils → sim}/sim.py RENAMED
@@ -12,17 +12,17 @@ import requests
 from openai import AzureOpenAI, OpenAI
 from scipy.spatial.distance import cosine  # type: ignore

-from vision_agent.tools.tool_utils import (
+from vision_agent.tools.tools import get_tools_df
+from vision_agent.utils.tools import (
     _LND_API_KEY,
     _create_requests_session,
     _LND_API_URL_v2,
 )
-from vision_agent.tools.tools import TOOLS_DF


 @lru_cache(maxsize=1)
 def get_tool_recommender() -> "Sim":
-    return load_cached_sim(TOOLS_DF)
+    return load_cached_sim(get_tools_df())


 @lru_cache(maxsize=512)
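Replacing the module-level TOOLS_DF import with a call to get_tools_df() makes the tool dataframe lazy, while lru_cache(maxsize=1) keeps the recommender a process-wide singleton. A lookup sketch (the k/thresh values mirror retrieve_tool_docs further down in this diff):

    from vision_agent.sim import get_tool_recommender

    sim = get_tool_recommender()  # built on first call, cached afterwards
    docs = sim.top_k("object detection", k=3, thresh=0.3)
    print([d["name"] for d in docs])
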
vision_agent/tools/__init__.py CHANGED
@@ -12,17 +12,10 @@ from .meta_tools import (
     use_object_detection_fine_tuning,
     view_media_artifact,
 )
+from .planner_tools import judge_od_results
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
-from .tool_utils import add_bboxes_from_masks, get_tool_descriptions_by_names
 from .tools import (
-
-    TOOL_DESCRIPTIONS,
-    TOOL_DOCSTRING,
-    TOOLS,
-    TOOLS_DF,
-    TOOLS_INFO,
-    UTIL_TOOLS,
-    UTILITIES_DOCSTRING,
+    activity_recognition,
     agentic_object_detection,
     agentic_sam2_instance_segmentation,
     agentic_sam2_video_tracking,

@@ -45,7 +38,11 @@ from .tools import (
     florence2_sam2_video_tracking,
     flux_image_inpainting,
     generate_pose_image,
-
+    get_tools,
+    get_tools_descriptions,
+    get_tools_df,
+    get_tools_docstring,
+    get_utilties_docstring,
     load_image,
     minimum_distance,
     ocr,

@@ -64,7 +61,6 @@ from .tools import (
     save_video,
     siglip_classification,
     template_match,
-    video_temporal_localization,
     vit_image_classification,
     vit_nsfw_classification,
 )

@@ -79,20 +75,11 @@ def register_tool(imports: Optional[List] = None) -> Callable:
     def decorator(tool: Callable) -> Callable:
         import inspect

-        from .tools import (  # noqa: F811
-            get_tool_descriptions,
-            get_tools_df,
-            get_tools_info,
-        )
-
         global TOOLS, TOOLS_DF, TOOL_DESCRIPTIONS, TOOL_DOCSTRING, TOOLS_INFO
+        from vision_agent.tools.tools import TOOLS

-        if tool not in TOOLS:
-            TOOLS.append(tool)
-            TOOLS_DF = get_tools_df(TOOLS)  # type: ignore
-            TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore
-            TOOL_DOCSTRING = get_tool_documentation(TOOLS)  # type: ignore
-            TOOLS_INFO = get_tools_info(TOOLS)  # type: ignore
+        if tool not in TOOLS:  # type: ignore
+            TOOLS.append(tool)  # type: ignore

         globals()[tool.__name__] = tool
         if imports is not None:

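After this rewrite, register_tool only appends to the canonical TOOLS list; the derived dataframes and docstrings are recomputed on demand by the new get_tools_* helpers instead of being patched as module globals. Registering a hypothetical custom tool:

    import numpy as np

    import vision_agent.tools as T

    @T.register_tool(imports=["import numpy as np"])
    def mean_brightness(image: np.ndarray) -> float:
        """'mean_brightness' returns the average pixel intensity of an image."""
        return float(image.mean())
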
vision_agent/tools/meta_tools.py CHANGED
@@ -12,12 +12,11 @@ from IPython.display import display

 import vision_agent as va
 from vision_agent.clients.landing_public_api import LandingPublicAPI
-from vision_agent.lmm.types import Message
-from vision_agent.tools.tool_utils import get_tool_documentation
-from vision_agent.tools.tools import TOOL_DESCRIPTIONS
-from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
+from vision_agent.models import BboxInput, BboxInputBase64, Message, PromptTask
+from vision_agent.tools.tools import get_tools_descriptions as _get_tool_descriptions
 from vision_agent.utils.execute import Execution, MimeType
 from vision_agent.utils.image_utils import convert_to_b64
+from vision_agent.utils.tools_doc import get_tool_documentation

 CURRENT_FILE = None
 CURRENT_LINE = 0

@@ -571,7 +570,7 @@ def get_tool_descriptions() -> str:
     """Returns a description of all the tools that `generate_vision_code` has access to.
     Helpful for answering questions about what types of vision tasks you can do with
     `generate_vision_code`."""
-    return TOOL_DESCRIPTIONS
+    return _get_tool_descriptions()


 def object_detection_fine_tuning(bboxes: List[Dict[str, Any]]) -> str:
vision_agent/tools/planner_tools.py CHANGED
@@ -1,5 +1,7 @@
 import inspect
 import logging
+import math
+import random
 import tempfile
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast

@@ -10,7 +12,6 @@ from IPython.display import display
 from PIL import Image

 import vision_agent.tools as T
-from vision_agent.agent.agent_utils import DefaultImports, extract_json, extract_tag
 from vision_agent.agent.vision_agent_planner_prompts_v2 import (
     CATEGORIZE_TOOL_REQUEST,
     FINALIZE_PLAN,

@@ -21,6 +22,9 @@ from vision_agent.agent.vision_agent_planner_prompts_v2 import (
 )
 from vision_agent.configs import Config
 from vision_agent.lmm import LMM, AnthropicLMM
+from vision_agent.sim import get_tool_recommender
+from vision_agent.tools.tools import get_tools, get_tools_info
+from vision_agent.utils.agent import DefaultImports, extract_json, extract_tag
 from vision_agent.utils.execute import (
     CodeInterpreter,
     CodeInterpreterFactory,

@@ -28,12 +32,16 @@ from vision_agent.utils.execute import (
     MimeType,
 )
 from vision_agent.utils.image_utils import convert_to_b64
-from vision_agent.utils.sim import get_tool_recommender
+from vision_agent.utils.tools_doc import get_tool_documentation
+
+
+def get_tool_functions() -> Dict[str, Callable]:
+    return {tool.__name__: tool for tool in get_tools()}
+
+
+def get_load_tools_docstring() -> str:
+    return get_tool_documentation([T.load_image, T.extract_frames_and_timestamps])

-TOOL_FUNCTIONS = {tool.__name__: tool for tool in T.TOOLS}
-LOAD_TOOLS_DOCSTRING = T.get_tool_documentation(
-    [T.load_image, T.extract_frames_and_timestamps]
-)

 CONFIG = Config()
 _LOGGER = logging.getLogger(__name__)
@@ -50,6 +58,59 @@ def format_tool_output(tool_thoughts: str, tool_docstring: str) -> str:
     return return_str


+def judge_od_results(
+    prompt: str,
+    image: np.ndarray,
+    detections: List[Dict[str, Any]],
+) -> str:
+    """Given an image and the detections, this function will judge the results and
+    return the thoughts on the results.
+
+    Parameters:
+        prompt (str): The prompt that was used to generate the detections.
+        image (np.ndarray): The image that the detections were made on.
+        detections (List[Dict[str, Any]]): The detections made on the image.
+
+    Returns:
+        str: The thoughts on the results.
+    """
+
+    if not detections:
+        return "No detections found in the image."
+
+    od_judge = CONFIG.create_od_judge()
+    max_crop_size = (512, 512)
+
+    # Randomly sample up to 10 detections
+    num_samples = min(10, len(detections))
+    sampled_detections = random.sample(detections, num_samples)
+    crops = []
+    h, w = image.shape[:2]
+
+    for detection in sampled_detections:
+        if "bbox" not in detection:
+            continue
+        x1, y1, x2, y2 = detection["bbox"]
+        crop = image[int(y1 * h) : int(y2 * h), int(x1 * w) : int(x2 * w)]
+        if crop.shape[0] > max_crop_size[0] or crop.shape[1] > max_crop_size[1]:
+            crop = Image.fromarray(crop)  # type: ignore
+            crop.thumbnail(max_crop_size)  # type: ignore
+            crop = np.array(crop)
+        crops.append("data:image/png;base64," + convert_to_b64(crop))
+
+    sampled_detection_info = [
+        {"score": d["score"], "label": d["label"]} for d in sampled_detections
+    ]
+
+    prompt = f"""The user is trying to detect '{prompt}' in an image. You are shown 10 images which represent crops of the detected objects. Below are the detection labels and scores:
+{sampled_detection_info}
+
+Look over each of the cropped images and corresponding labels and scores. Provide a judgement on whether or not the results are correct. If the results are incorrect you can only suggest a different prompt or a threshold."""
+
+    response = cast(str, od_judge.generate(prompt, media=crops))
+    return response
+
+
 def run_multi_judge(
     tool_chooser: LMM,
     tool_docs_str: str,
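Calling sketch (hypothetical values; the crop arithmetic above implies bbox is a normalized [x1, y1, x2, y2], and each detection needs score and label keys):

    import numpy as np

    from vision_agent.tools import judge_od_results

    image = np.zeros((480, 640, 3), dtype=np.uint8)
    detections = [
        {"label": "person", "score": 0.91, "bbox": [0.10, 0.20, 0.45, 0.85]},
    ]
    print(judge_od_results("person", image, detections))  # requires an Anthropic key
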
@@ -57,6 +118,7 @@ def run_multi_judge(
     code: str,
     tool_output_str: str,
     image_paths: List[str],
+    n_judges: int = 3,
 ) -> Tuple[Optional[Callable], str, str]:
     error_message = ""
     prompt = PICK_TOOL.format(

@@ -77,7 +139,7 @@ def run_multi_judge(

     responses = []
     with ThreadPoolExecutor() as executor:
-        futures = [executor.submit(run_judge) for _ in range(3)]
+        futures = [executor.submit(run_judge) for _ in range(n_judges)]
         for future in as_completed(futures):
             responses.append(future.result())

@@ -86,7 +148,7 @@ def run_multi_judge(
     for tool, tool_thoughts, tool_docstring in responses:
         if tool is not None:
             counts[tool.__name__] = counts.get(tool.__name__, 0) + 1
-            if counts[tool.__name__] >= 2:
+            if counts[tool.__name__] >= math.ceil(n_judges / 2):
                 return tool, tool_thoughts, tool_docstring

     if len(responses) == 0:
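With the default n_judges=3, math.ceil(3 / 2) == 2, so the vote threshold matches the previously hard-coded >= 2 while generalizing to other judge counts (five judges would require three agreeing votes).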
@@ -104,9 +166,12 @@ def extract_tool_info(
     tool_thoughts = tool_choice_context.get("thoughts", "")
     tool_docstring = ""
     tool = tool_choice_context.get("best_tool", None)
-
-
-
+    tools_info = get_tools_info()
+
+    tool_functions = get_tool_functions()
+    if tool in tool_functions:
+        tool = tool_functions[tool]
+        tool_docstring = tools_info[tool.__name__]

     return tool, tool_thoughts, tool_docstring, ""

@@ -153,6 +218,42 @@ def replace_box_threshold(code: str, functions: List[str], box_threshold: float)
     return new_tree.code


+def retrieve_tool_docs(lmm: LMM, task: str, exclude_tools: Optional[List[str]]) -> str:
+    query = cast(str, lmm.generate(CATEGORIZE_TOOL_REQUEST.format(task=task)))
+    categories_str = extract_tag(query, "category")
+    if categories_str is None:
+        categories = []
+    else:
+        categories = [e.strip() for e in categories_str.split(",")]
+
+    explanation = query.split("<category>")[0].strip()
+    if "</category>" in query:
+        explanation += " " + query.split("</category>")[1].strip()
+    explanation = explanation.strip()
+
+    sim = get_tool_recommender()
+
+    all_tool_docs = []
+    all_tool_doc_names = set()
+    exclude_tools = [] if exclude_tools is None else exclude_tools
+    for category in categories:
+        tool_docs = sim.top_k(category, k=3, thresh=0.3)
+
+        for tool_doc in tool_docs:
+            if (
+                tool_doc["name"] not in all_tool_doc_names
+                and tool_doc["name"] not in exclude_tools
+            ):
+                all_tool_docs.append(tool_doc)
+                all_tool_doc_names.add(tool_doc["name"])
+
+    tool_docs_str = explanation + "\n\n" + "\n".join([e["doc"] for e in all_tool_docs])
+    tool_docs_str += (
+        "\n" + get_load_tools_docstring() + get_tool_documentation([judge_od_results])
+    )
+    return tool_docs_str
+
+
 def run_tool_testing(
     task: str,
     image_paths: List[str],
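The parsing above expects the LMM to wrap a comma-separated category list in a <category> tag, with any explanation outside the tag. A self-contained sketch of that format (hypothetical response text):

    query = (
        "The task involves finding and counting people.\n"
        "<category>object detection, counting</category>"
    )
    categories_str = query.split("<category>")[1].split("</category>")[0]
    categories = [c.strip() for c in categories_str.split(",")]
    assert categories == ["object detection", "counting"]
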
@@ -162,22 +263,8 @@
     process_code: Callable[[str], str] = lambda x: x,
 ) -> tuple[str, str, Execution]:
     """Helper function to generate and run tool testing code."""
-    query = lmm.generate(CATEGORIZE_TOOL_REQUEST.format(task=task))
-    category = extract_tag(query, "category")  # type: ignore
-    if category is None:
-        query = task
-    else:
-        query = f"{category.strip()}. {task}"

-
-    if exclude_tools is not None and len(exclude_tools) > 0:
-        cleaned_tool_docs = []
-        for tool_doc in tool_docs:
-            if not tool_doc["name"] in exclude_tools:
-                cleaned_tool_docs.append(tool_doc)
-        tool_docs = cleaned_tool_docs
-    tool_docs_str = "\n".join([e["doc"] for e in tool_docs])
-    tool_docs_str += "\n" + LOAD_TOOLS_DOCSTRING
+    tool_docs_str = retrieve_tool_docs(lmm, task, exclude_tools)

     prompt = TEST_TOOLS.format(
         tool_docs=tool_docs_str,
@@ -295,24 +382,26 @@ def get_tool_for_task(
         Image.fromarray(image).save(image_path)
         image_paths.append(image_path)

+    # run no more than 3 images or else it overloads the LLM
+    image_paths = image_paths[:3]
     code, tool_docs_str, tool_output = run_tool_testing(
         task, image_paths, tool_tester, exclude_tools, code_interpreter
     )
     tool_output_str = tool_output.text(include_results=False).strip()

     _, tool_thoughts, tool_docstring = run_multi_judge(
-        tool_chooser, tool_docs_str, task, code, tool_output_str, image_paths
+        tool_chooser,
+        tool_docs_str,
+        task,
+        code,
+        tool_output_str,
+        image_paths,
+        n_judges=3,
     )

     print(format_tool_output(tool_thoughts, tool_docstring))


-def get_tool_documentation(tool_name: str) -> str:
-    # use same format as get_tool_for_task
-    tool_doc = T.TOOLS_DF[T.TOOLS_DF["name"] == tool_name]["doc"].values[0]
-    return format_tool_output("", tool_doc)
-
-
 def get_tool_for_task_human_reviewer(
     task: str,
     images: Union[Dict[str, List[np.ndarray]], List[np.ndarray]],
@@ -337,8 +426,8 @@ def get_tool_for_task_human_reviewer(

     tools = [
         t.__name__
-        for t in T.TOOLS
-        if inspect.signature(t).parameters.get("box_threshold")
+        for t in get_tools()
+        if inspect.signature(t).parameters.get("box_threshold")
     ]

     _, _, tool_output = run_tool_testing(
@@ -414,7 +503,8 @@ def suggestion(prompt: str, medias: List[np.ndarray]) -> None:
     a problem.

     Parameters:
-        prompt: str: The problem statement
+        prompt: str: The problem statement, provide a detailed description of the
+            problem you are trying to solve.
         medias: List[np.ndarray]: The images to use for the problem
     """
     try:
@@ -431,4 +521,4 @@ PLANNER_TOOLS = [
     suggestion,
     get_tool_for_task,
 ]
-PLANNER_DOCSTRING =
+PLANNER_DOCSTRING = get_tool_documentation(PLANNER_TOOLS)  # type: ignore