vision-agent 1.0.3__py3-none-any.whl → 1.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/.sim_tools/df.csv +46 -47
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/agent/vision_agent_planner_prompts_v2.py +57 -58
- vision_agent/agent/vision_agent_planner_v2.py +3 -2
- vision_agent/configs/anthropic_config.py +29 -16
- vision_agent/configs/config.py +14 -15
- vision_agent/configs/openai_config.py +10 -10
- vision_agent/lmm/lmm.py +2 -2
- vision_agent/sim/sim.py +4 -1
- vision_agent/tools/planner_tools.py +13 -14
- vision_agent/tools/tools.py +16 -27
- vision_agent/utils/tools.py +8 -2
- {vision_agent-1.0.3.dist-info → vision_agent-1.0.5.dist-info}/METADATA +31 -3
- {vision_agent-1.0.3.dist-info → vision_agent-1.0.5.dist-info}/RECORD +16 -17
- vision_agent/configs/anthropic_openai_config.py +0 -164
- {vision_agent-1.0.3.dist-info → vision_agent-1.0.5.dist-info}/LICENSE +0 -0
- {vision_agent-1.0.3.dist-info → vision_agent-1.0.5.dist-info}/WHEEL +0 -0
vision_agent/configs/anthropic_config.py
CHANGED
@@ -2,7 +2,7 @@ from typing import Type
 
 from pydantic import BaseModel, Field
 
-from vision_agent.lmm import LMM, AnthropicLMM
+from vision_agent.lmm import LMM, AnthropicLMM, OpenAILMM
 
 
 class Config(BaseModel):
@@ -10,7 +10,7 @@ class Config(BaseModel):
     agent: Type[LMM] = Field(default=AnthropicLMM)
     agent_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -20,18 +20,17 @@ class Config(BaseModel):
     planner: Type[LMM] = Field(default=AnthropicLMM)
     planner_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
     )
 
-    # for vision_agent_planner_v2
     summarizer: Type[LMM] = Field(default=AnthropicLMM)
     summarizer_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
-            "temperature":
+            "model_name": "claude-3-7-sonnet-20250219",
+            "temperature": 1.0, # o1 has fixed temperature
             "image_size": 768,
         }
     )
@@ -40,7 +39,7 @@ class Config(BaseModel):
     critic: Type[LMM] = Field(default=AnthropicLMM)
     critic_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -50,7 +49,7 @@ class Config(BaseModel):
     coder: Type[LMM] = Field(default=AnthropicLMM)
     coder_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -60,7 +59,7 @@ class Config(BaseModel):
     tester: Type[LMM] = Field(default=AnthropicLMM)
     tester_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -70,7 +69,7 @@ class Config(BaseModel):
     debugger: Type[LMM] = Field(default=AnthropicLMM)
     debugger_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -80,7 +79,7 @@ class Config(BaseModel):
     tool_tester: Type[LMM] = Field(default=AnthropicLMM)
     tool_tester_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -90,19 +89,30 @@ class Config(BaseModel):
     tool_chooser: Type[LMM] = Field(default=AnthropicLMM)
     tool_chooser_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 1.0,
             "image_size": 768,
         }
     )
 
+    # for get_tool_for_task
+    od_judge: Type[LMM] = Field(default=AnthropicLMM)
+    od_judge_kwargs: dict = Field(
+        default_factory=lambda: {
+            "model_name": "claude-3-7-sonnet-20250219",
+            "temperature": 0.0,
+            "image_size": 512,
+        }
+    )
+
     # for suggestions module
-    suggester: Type[LMM] = Field(default=
+    suggester: Type[LMM] = Field(default=OpenAILMM)
     suggester_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "
+            "model_name": "o1",
             "temperature": 1.0,
-            "
+            "image_detail": "high",
+            "image_size": 1024,
         }
     )
 
@@ -110,7 +120,7 @@ class Config(BaseModel):
     vqa: Type[LMM] = Field(default=AnthropicLMM)
     vqa_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -143,6 +153,9 @@ class Config(BaseModel):
     def create_tool_chooser(self) -> LMM:
         return self.tool_chooser(**self.tool_chooser_kwargs)
 
+    def create_od_judge(self) -> LMM:
+        return self.od_judge(**self.od_judge_kwargs)
+
     def create_suggester(self) -> LMM:
         return self.suggester(**self.suggester_kwargs)
 
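Not part of the diff: a minimal sketch of how the new `od_judge` entry and the `create_od_judge()` factory added above might be consumed. It assumes vision-agent >= 1.0.5 is installed, that `Config` is importable from `vision_agent.configs.config` (the path shown in this diff), and that an `ANTHROPIC_API_KEY` is available in the environment.

```python
# Sketch only: exercising the od_judge config entry introduced in this release.
from vision_agent.configs.config import Config

config = Config()

# create_od_judge() is the factory method added in this diff; by default it
# instantiates AnthropicLMM with the od_judge_kwargs shown above
# (claude-3-7-sonnet, temperature 0.0, image_size 512).
od_judge = config.create_od_judge()
print(type(od_judge).__name__)
```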
vision_agent/configs/config.py
CHANGED
@@ -2,7 +2,7 @@ from typing import Type
 
 from pydantic import BaseModel, Field
 
-from vision_agent.lmm import LMM, AnthropicLMM, OpenAILMM
+from vision_agent.lmm import LMM, AnthropicLMM, OpenAILMM, GoogleLMM
 
 
 class Config(BaseModel):
@@ -10,7 +10,7 @@ class Config(BaseModel):
     agent: Type[LMM] = Field(default=AnthropicLMM)
     agent_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -20,17 +20,16 @@ class Config(BaseModel):
     planner: Type[LMM] = Field(default=AnthropicLMM)
     planner_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
     )
 
-
-    summarizer: Type[LMM] = Field(default=OpenAILMM)
+    summarizer: Type[LMM] = Field(default=AnthropicLMM)
     summarizer_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 1.0, # o1 has fixed temperature
             "image_size": 768,
         }
@@ -40,7 +39,7 @@ class Config(BaseModel):
     critic: Type[LMM] = Field(default=AnthropicLMM)
     critic_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -50,7 +49,7 @@ class Config(BaseModel):
     coder: Type[LMM] = Field(default=AnthropicLMM)
     coder_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -60,7 +59,7 @@ class Config(BaseModel):
     tester: Type[LMM] = Field(default=AnthropicLMM)
     tester_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -70,7 +69,7 @@ class Config(BaseModel):
     debugger: Type[LMM] = Field(default=AnthropicLMM)
     debugger_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -80,7 +79,7 @@ class Config(BaseModel):
     tool_tester: Type[LMM] = Field(default=AnthropicLMM)
     tool_tester_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 768,
         }
@@ -90,7 +89,7 @@ class Config(BaseModel):
     tool_chooser: Type[LMM] = Field(default=AnthropicLMM)
     tool_chooser_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 1.0,
             "image_size": 768,
         }
@@ -100,7 +99,7 @@ class Config(BaseModel):
     od_judge: Type[LMM] = Field(default=AnthropicLMM)
     od_judge_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "claude-3-
+            "model_name": "claude-3-7-sonnet-20250219",
             "temperature": 0.0,
             "image_size": 512,
         }
@@ -118,10 +117,10 @@ class Config(BaseModel):
     )
 
     # for vqa module
-    vqa: Type[LMM] = Field(default=
+    vqa: Type[LMM] = Field(default=GoogleLMM)
     vqa_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "
+            "model_name": "gemini-2.0-flash-exp",
             "temperature": 0.0,
             "image_size": 768,
         }
vision_agent/configs/openai_config.py
CHANGED
@@ -10,7 +10,7 @@ class Config(BaseModel):
     agent: Type[LMM] = Field(default=OpenAILMM)
     agent_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 0.0,
             "image_size": 768,
             "image_detail": "low",
@@ -21,7 +21,7 @@ class Config(BaseModel):
     planner: Type[LMM] = Field(default=OpenAILMM)
     planner_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 0.0,
             "image_size": 768,
             "image_detail": "low",
@@ -42,7 +42,7 @@ class Config(BaseModel):
     critic: Type[LMM] = Field(default=OpenAILMM)
     critic_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 0.0,
             "image_size": 768,
             "image_detail": "low",
@@ -53,7 +53,7 @@ class Config(BaseModel):
     coder: Type[LMM] = Field(default=OpenAILMM)
     coder_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 0.0,
             "image_size": 768,
             "image_detail": "low",
@@ -64,7 +64,7 @@ class Config(BaseModel):
     tester: Type[LMM] = Field(default=OpenAILMM)
     tester_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 0.0,
             "image_size": 768,
             "image_detail": "low",
@@ -75,7 +75,7 @@ class Config(BaseModel):
     debugger: Type[LMM] = Field(default=OpenAILMM)
     debugger_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 0.0,
             "image_size": 768,
             "image_detail": "low",
@@ -86,7 +86,7 @@ class Config(BaseModel):
     tool_tester: Type[LMM] = Field(default=OpenAILMM)
     tool_tester_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 0.0,
             "image_size": 768,
             "image_detail": "low",
@@ -97,7 +97,7 @@ class Config(BaseModel):
     tool_chooser: Type[LMM] = Field(default=OpenAILMM)
     tool_chooser_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 1.0,
             "image_size": 768,
             "image_detail": "low",
@@ -108,7 +108,7 @@ class Config(BaseModel):
     suggester: Type[LMM] = Field(default=OpenAILMM)
     suggester_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 1.0,
             "image_size": 768,
             "image_detail": "low",
@@ -119,7 +119,7 @@ class Config(BaseModel):
     vqa: Type[LMM] = Field(default=OpenAILMM)
     vqa_kwargs: dict = Field(
         default_factory=lambda: {
-            "model_name": "gpt-4o-2024-
+            "model_name": "gpt-4o-2024-11-20",
             "temperature": 0.0,
             "image_size": 768,
             "image_detail": "low",
vision_agent/lmm/lmm.py
CHANGED
@@ -98,7 +98,7 @@ class OpenAILMM(LMM):
         for c in chat:
             fixed_c = {"role": c["role"]}
             fixed_c["content"] = [{"type": "text", "text": c["content"]}]  # type: ignore
-            if "media" in c:
+            if "media" in c and self.model_name != "o3-mini":
                 for media in c["media"]:
                     resize = kwargs["resize"] if "resize" in kwargs else self.image_size
                     image_detail = (
@@ -154,7 +154,7 @@ class OpenAILMM(LMM):
                 ],
             }
         ]
-        if media and len(media) > 0:
+        if media and len(media) > 0 and self.model_name != "o3-mini":
            for m in media:
                resize = kwargs["resize"] if "resize" in kwargs else None
                image_detail = (
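The two hunks above guard image handling behind a model check. Below is a self-contained sketch (illustration only, not library code) of the idea: when the configured model is text-only, such as `o3-mini`, attached media are skipped rather than added to the request payload.

```python
# Illustration of the guard added above, independent of the vision_agent package.
from typing import Any, Dict, List, Optional

TEXT_ONLY_MODELS = {"o3-mini"}  # assumption: o3-mini is the only excluded model in 1.0.5


def build_content(
    text: str, media_b64: Optional[List[str]], model_name: str
) -> List[Dict[str, Any]]:
    content: List[Dict[str, Any]] = [{"type": "text", "text": text}]
    # Only attach images for models that accept them.
    if media_b64 and model_name not in TEXT_ONLY_MODELS:
        for m in media_b64:
            content.append({"type": "image_url", "image_url": {"url": m, "detail": "low"}})
    return content


# With o3-mini the media list is ignored and only the text part is sent.
print(build_content("describe the scene", ["data:image/png;base64,..."], "o3-mini"))
```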
vision_agent/sim/sim.py
CHANGED
@@ -58,7 +58,10 @@ def stella_embeddings(prompts: List[str]) -> List[np.ndarray]:
     }
     url = f"{_LND_API_URL_v2}/embeddings"
     vision_agent_api_key = get_vision_agent_api_key()
-    headers = {
+    headers = {
+        "Authorization": f"Basic {vision_agent_api_key}",
+        "X-Source": "vision_agent",
+    }
     session = _create_requests_session(
         url=url,
         num_retry=3,
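Both this hunk and the two in `vision_agent/utils/tools.py` further down replace a single-line header assignment with an explicit dictionary carrying Basic auth plus an `X-Source` tag. A minimal sketch of how such headers would be sent; the API key and endpoint URL here are placeholders, not values taken from the library.

```python
# Sketch (not library code) of the header shape used after this change.
import requests

api_key = "your-vision-agent-api-key"  # placeholder for get_vision_agent_api_key()
headers = {
    "Authorization": f"Basic {api_key}",
    "X-Source": "vision_agent",
}
# Hypothetical endpoint for illustration; the library builds its URL from an
# internal _LND_API_URL_v2 constant.
resp = requests.post(
    "https://api.example.com/v1/embeddings",
    json={"input": ["hello"]},
    headers=headers,
)
print(resp.status_code)
```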
vision_agent/tools/planner_tools.py
CHANGED
@@ -236,7 +236,7 @@ def retrieve_tool_docs(lmm: LMM, task: str, exclude_tools: Optional[List[str]])
     all_tool_docs = []
     all_tool_doc_names = set()
     exclude_tools = [] if exclude_tools is None else exclude_tools
-    for category in categories:
+    for category in categories + [task]:
         tool_docs = sim.top_k(category, k=3, thresh=0.3)
 
         for tool_doc in tool_docs:
@@ -248,9 +248,7 @@ def retrieve_tool_docs(lmm: LMM, task: str, exclude_tools: Optional[List[str]])
                 all_tool_doc_names.add(tool_doc["name"])
 
     tool_docs_str = explanation + "\n\n" + "\n".join([e["doc"] for e in all_tool_docs])
-    tool_docs_str += (
-        "\n" + get_load_tools_docstring() + get_tool_documentation([judge_od_results])
-    )
+    tool_docs_str += get_load_tools_docstring()
     return tool_docs_str
 
 
@@ -346,22 +344,22 @@ def get_tool_for_task(
     and output signatures are.
 
     Parameters:
-        task
-        images
+        task (str): The task to accomplish.
+        images (Union[Dict[str, List[np.ndarray]], List[np.ndarray]]): The images to use
             for the task. If a key is provided, it is used as the file name.
-        exclude_tools
+        exclude_tools (Optional[List[str]]): A list of tool names to exclude from the
             recommendations. This is helpful if you are calling get_tool_for_task twice
             and do not want the same tool recommended.
 
     Returns:
-        The
+        None: The function does not return the tool but prints it to stdout.
 
     Examples
     --------
     >>> get_tool_for_task(
     >>> "Give me an OCR model that can find 'hot chocolate' in the image",
     >>> {"image": [image]})
-    >>>
+    >>> get_tool_for_task(
     >>> "I need a tool that can paint a background for this image and maks",
     >>> {"image": [image], "mask": [mask]})
     """
@@ -497,8 +495,8 @@ def finalize_plan(user_request: str, chain_of_thoughts: str) -> str:
     return finalized_plan
 
 
-def
-    """Asks the
+def vqa(prompt: str, medias: List[np.ndarray]) -> None:
+    """Asks the VQA model a question about the given media and returns an answer.
 
     Parameters:
         prompt: str: The question to ask the model.
@@ -515,13 +513,14 @@ def claude35_vqa(prompt: str, medias: List[np.ndarray]) -> None:
     ]
 
     response = cast(str, vqa.generate(prompt, media=all_media_b64))
-    print(f"[
+    print(f"[vqa output]\n{response}\n[end of vqa output]")
 
 
 def suggestion(prompt: str, medias: List[np.ndarray]) -> None:
     """Given your problem statement and the images, this will provide you with a
     suggested plan on how to proceed. Always call suggestion when starting to solve
-    a problem.
+    a problem. 'suggestion' will only print pseudo code for you to execute, it will not
+    execute the code for you.
 
     Parameters:
         prompt: str: The problem statement, provide a detailed description of the
@@ -538,7 +537,7 @@ def suggestion(prompt: str, medias: List[np.ndarray]) -> None:
 
 
 PLANNER_TOOLS = [
-
+    vqa,
     suggestion,
     get_tool_for_task,
 ]
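Not from the diff: a short sketch of how the renamed planner tool might be called once 1.0.5 is installed. It assumes `vqa` is importable from `vision_agent.tools.planner_tools` (the module shown above) and that credentials for the configured VQA model are available; the function prints its answer rather than returning it.

```python
import numpy as np

from vision_agent.tools.planner_tools import vqa  # replaces the old claude35_vqa

# Placeholder image; in practice this would be a decoded frame or photo.
image = np.zeros((256, 256, 3), dtype=np.uint8)

# Prints "[vqa output] ... [end of vqa output]" to stdout and returns None.
vqa("What objects are visible in this image?", [image])
```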
vision_agent/tools/tools.py
CHANGED
@@ -1488,8 +1488,8 @@ def agentic_object_detection(
     """'agentic_object_detection' is a tool that can detect multiple objects given a
     text prompt such as object names or referring expressions on images. It's
     particularly good at detecting specific objects given detailed descriptive prompts
-    but runs slower. It returns a list of bounding boxes
-    label names and associated
+    but runs slower so not ideal for high counts. It returns a list of bounding boxes
+    with normalized coordinates, label names and associated confidence score of 1.0.
 
     Parameters:
         prompt (str): The prompt to ground to the image, only supports a single prompt
@@ -1533,8 +1533,9 @@ def agentic_sam2_instance_segmentation(
     """'agentic_sam2_instance_segmentation' is a tool that can detect multiple
     instances given a text prompt such as object names or referring expressions on
     images. It's particularly good at detecting specific objects given detailed
-    descriptive prompts but runs slower. It returns a list
-    normalized coordinates, label names, masks and associated
+    descriptive prompts but runs slower so not ideal for high counts. It returns a list
+    of bounding boxes with normalized coordinates, label names, masks and associated
+    confidence score of 1.0.
 
     Parameters:
         prompt (str): The object that needs to be counted, only supports a single
@@ -1591,9 +1592,9 @@ def agentic_sam2_video_tracking(
     """'agentic_sam2_video_tracking' is a tool that can track and segment multiple
     objects in a video given a text prompt such as object names or referring
     expressions. It's particularly good at detecting specific objects given detailed
-    descriptive prompts but runs slower
-    names, masks and associated
-    counting without duplicating counts.
+    descriptive prompts but runs slower so not ideal for high counts. It returns a list
+    of bounding boxes, label names, masks and associated confidence score of 1.0 and is
+    useful for tracking and counting without duplicating counts.
 
     Parameters:
         prompt (str): The prompt to ground to the image, only supports a single prompt
@@ -2307,22 +2308,10 @@ def _qwenvl_activity_recognition(
         return [0.0] * len(segment)
 
 
-def _qwen2vl_activity_recognition(
-    segment: List[np.ndarray], prompt: str
-) -> List[float]:
-    return _qwenvl_activity_recognition(segment, prompt, model_name="qwen2vl")
-
-
-def _qwen25vl_activity_recognition(
-    segment: List[np.ndarray], prompt: str
-) -> List[float]:
-    return _qwenvl_activity_recognition(segment, prompt, model_name="qwen25vl")
-
-
 def activity_recognition(
     prompt: str,
     frames: List[np.ndarray],
-    model: str = "
+    model: str = "qwen25vl",
     chunk_length_frames: int = 10,
 ) -> List[float]:
     """'activity_recognition' is a tool that can recognize activities in a video given a
@@ -2371,12 +2360,12 @@ def activity_recognition(
     elif model == "qwen2vl":
 
         def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
-            return
+            return _qwenvl_activity_recognition(segment, prompt, model_name="qwen2vl")
 
     elif model == "qwen25vl":
 
         def _apply_activity_recognition(segment: List[np.ndarray]) -> List[float]:
-            return
+            return _qwenvl_activity_recognition(segment, prompt, model_name="qwen25vl")
 
     else:
         raise ValueError(f"Invalid model: {model}")
@@ -3488,9 +3477,9 @@ def _plot_counting(
 
 
 FUNCTION_TOOLS = [
-
-
-
+    glee_object_detection,
+    glee_sam2_instance_segmentation,
+    glee_sam2_video_tracking,
     countgd_object_detection,
     countgd_sam2_instance_segmentation,
     countgd_sam2_video_tracking,
@@ -3502,8 +3491,8 @@ FUNCTION_TOOLS = [
     document_extraction,
     document_qa,
     ocr,
-
-
+    qwen25_vl_images_vqa,
+    qwen25_vl_video_vqa,
     activity_recognition,
     depth_anything_v2,
     generate_pose_image,
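A usage sketch reflecting the new default above, where `activity_recognition` now falls back to the `qwen25vl` model. It assumes vision-agent >= 1.0.5, a VisionAgent API key, and that the function is re-exported from `vision_agent.tools`; the frames below are placeholders.

```python
import numpy as np

from vision_agent.tools import activity_recognition  # assumed re-export of tools.py

# Placeholder frames; real use would pass decoded video frames.
frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(30)]

scores = activity_recognition(
    "person waving",
    frames,
    model="qwen25vl",          # explicit here, but now also the default
    chunk_length_frames=10,
)
print(len(scores))  # one score per frame
```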
vision_agent/utils/tools.py
CHANGED
@@ -56,7 +56,10 @@ def send_inference_request(
         url = os.environ["TOOL_ENDPOINT_URL"]
 
     vision_agent_api_key = get_vision_agent_api_key()
-    headers = {
+    headers = {
+        "Authorization": f"Basic {vision_agent_api_key}",
+        "X-Source": "vision_agent",
+    }
     if "TOOL_ENDPOINT_AUTH" in os.environ:
         headers["Authorization"] = os.environ["TOOL_ENDPOINT_AUTH"]
         headers.pop("apikey")
@@ -90,7 +93,10 @@ def send_task_inference_request(
 ) -> Any:
     url = f"{_LND_API_URL_v2}/{task_name}"
     vision_agent_api_key = get_vision_agent_api_key()
-    headers = {
+    headers = {
+        "Authorization": f"Basic {vision_agent_api_key}",
+        "X-Source": "vision_agent",
+    }
     session = _create_requests_session(
         url=url,
         num_retry=3,
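Illustration only: how the rebuilt header logic in `send_inference_request` behaves when a custom tool endpoint is configured. The helper below is a stand-in, not the library function; note that the library's own code pops an `apikey` entry, which this sketch guards with a default.

```python
import os


def build_headers(api_key: str) -> dict:
    # Mirrors the header shape added in this diff.
    headers = {
        "Authorization": f"Basic {api_key}",
        "X-Source": "vision_agent",
    }
    # When a custom tool endpoint is configured, its auth value wins.
    if "TOOL_ENDPOINT_AUTH" in os.environ:
        headers["Authorization"] = os.environ["TOOL_ENDPOINT_AUTH"]
        headers.pop("apikey", None)  # library pops "apikey"; default added here for safety
    return headers


print(build_headers("example-key"))
```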
{vision_agent-1.0.3.dist-info → vision_agent-1.0.5.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 1.0.
+Version: 1.0.5
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
@@ -65,10 +65,10 @@ pip install vision-agent
 
 ```bash
 export ANTHROPIC_API_KEY="your-api-key"
-export
+export GEMINI_API_KEY="your-api-key"
 ```
 
-> **_NOTE:_** We found using both Anthropic Claude-3.
+> **_NOTE:_** We found using both Anthropic Claude-3.7 and Gemini-2.0-Flash-Exp to be provide the best performance for VisionAgent. If you want to use a different LLM provider or only one, see 'Using Other LLM Providers' below.
 
 You will also need to set your VisionAgent API key to be able to authenticate when using the hosted vision tools that we provide through our APIs. Currently, the APIs are free to use so you will only need to get it from [here](https://va.landing.ai/account/api-key).
 
@@ -147,5 +147,33 @@ directory. For example to change to Anthropic simply just run:
 cp vision_agent/configs/anthropic_config.py vision_agent/configs/config.py
 ```
 
+You can also modify the existing `config.py` file yourself to use a different LLM
+provider, for example if you wanted to change the planner from Anthropic inside
+`config.py` to OpenAI you would replace this code:
+```python
+planner: Type[LMM] = Field(default=AnthropicLMM)
+planner_kwargs: dict = Field(
+    default_factory=lambda: {
+        "model_name": "claude-3-7-sonnet-20250219",
+        "temperature": 0.0,
+        "image_size": 768,
+    }
+)
+```
+
+with this code:
+
+```python
+planner: Type[LMM] = Field(default=OpenAILMM)
+planner_kwargs: dict = Field(
+    default_factory=lambda: {
+        "model_name": "gpt-4o-2024-11-20",
+        "temperature": 0.0,
+        "image_size": 768,
+        "image_detail": "low",
+    }
+)
+```
+
 > **_NOTE:_** VisionAgent moves fast and we are constantly updating and changing the library. If you have any questions or need help, please reach out to us on our discord channel.
 