vision-agent 0.2.140__py3-none-any.whl → 0.2.142__py3-none-any.whl
- vision_agent/agent/__init__.py +2 -1
- vision_agent/agent/agent_utils.py +8 -2
- vision_agent/agent/vision_agent.py +97 -17
- vision_agent/agent/vision_agent_coder.py +93 -66
- vision_agent/agent/vision_agent_coder_prompts.py +53 -19
- vision_agent/agent/vision_agent_prompts.py +31 -9
- vision_agent/lmm/__init__.py +1 -1
- vision_agent/lmm/lmm.py +6 -9
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/meta_tools.py +65 -33
- vision_agent/tools/tools.py +115 -30
- vision_agent/tools/tools_types.py +1 -0
- vision_agent/utils/image_utils.py +18 -7
- vision_agent/utils/video.py +2 -1
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.142.dist-info}/METADATA +60 -12
- vision_agent-0.2.142.dist-info/RECORD +33 -0
- vision_agent-0.2.140.dist-info/RECORD +0 -33
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.142.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.142.dist-info}/WHEEL +0 -0
@@ -30,9 +30,10 @@ PLAN = """

 **Instructions**:
 1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
-2.
+2. For each subtask, be sure to include the tool(s) you want to use to accomplish that subtask.
+3. Output three different plans each utilize a different strategy or set of tools ordering them from most likely to least likely to succeed.

-Output a list of jsons in the following format
+Output a list of jsons in the following format:

 ```json
 {{
@@ -67,7 +68,7 @@ This is the documentation for the functions you have access to. You may call any
 {previous_attempts}

 **Instructions**:
-1. Write a program to load the media and call each tool and
+1. Write a program to load the media and call each tool and print it's output along with other relevant information.
 2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
 3. Your test case MUST run only on the given images which are {media}
 4. Print this final dictionary.
@@ -102,24 +103,25 @@ print(final_out)

 --- EXAMPLE2 ---
 plan1:
-- Extract frames from 'video.mp4' at 10 FPS using the '
-- Use the '
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
+- Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
 plan2:
-- Extract frames from 'video.mp4' at 10 FPS using the '
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
 - Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
 plan3:
-- Extract frames from 'video.mp4' at 10 FPS using the '
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
 - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.


 ```python
 import numpy as np
-from vision_agent.tools import
+from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking

 # sample at 1 FPS and use the first 10 frames to reduce processing time
-frames =
-frames = [f[
+frames = extract_frames_and_timestamps("video.mp4", 1)
+frames = [f["frame"] for f in frames][:10]

+# strip arrays from the output to make it easier to read
 def remove_arrays(o):
     if isinstance(o, list):
         return [remove_arrays(e) for e in o]
@@ -130,18 +132,46 @@ def remove_arrays(o):
     else:
         return o

+# return the counts of each label per frame to help determine the stability of the model results
+def get_counts(preds):
+    counts = {{}}
+    for i, pred_frame in enumerate(preds):
+        counts_i = {{}}
+        for pred in pred_frame:
+            label = pred["label"].split(":")[1] if ":" in pred["label"] else pred["label"]
+            counts_i[label] = counts_i.get(label, 0) + 1
+        counts[f"frame_{{i}}"] = counts_i
+    return counts
+
+
 # plan1
-owl_v2_out =
+owl_v2_out = owl_v2_video("person", frames)
+owl_v2_counts = get_counts(owl_v2_out)

 # plan2
 florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
+florence2_counts = get_counts(florence2_out)

 # plan3
 f2s2_tracking_out = florence2_sam2_video_tracking("person", frames)
 remove_arrays(f2s2_tracking_out)
+f2s2_counts = get_counts(f2s2_tracking_out)
+
+final_out = {{
+    "owl_v2_video": owl_v2_out,
+    "florence2_phrase_grounding": florence2_out,
+    "florence2_sam2_video_tracking": f2s2_out,
+}}
+
+counts = {{
+    "owl_v2_video": owl_v2_counts,
+    "florence2_phrase_grounding": florence2_counts,
+    "florence2_sam2_video_tracking": f2s2_counts,
+}}

-final_out = {{"owl_v2_image": owl_v2_out, "florence2_phrase_grounding": florence2_out, "florence2_sam2_video_tracking": f2s2_tracking_out}}
 print(final_out)
+print(labels_and_scores)
+print(counts)
 ```
 """

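The `get_counts` helper added to this prompt example summarizes per-frame detections into label counts so the picker can judge how stable each plan's results are. The standalone sketch below (with made-up prediction dicts; the `label` key and the `"id: label"` prefix convention are taken from the prompt's tracking example, and normal braces are used here since the prompt doubles them only because it is a format string) illustrates the intended behavior:

```python
# Illustrative only: standalone version of the per-frame counting helper.
def get_counts(preds):
    counts = {}
    for i, pred_frame in enumerate(preds):
        counts_i = {}
        for pred in pred_frame:
            # tracking tools may prefix labels with an id, e.g. "0: person"
            label = pred["label"].split(":")[1] if ":" in pred["label"] else pred["label"]
            counts_i[label] = counts_i.get(label, 0) + 1
        counts[f"frame_{i}"] = counts_i
    return counts

sample_preds = [
    [{"label": "0: person"}, {"label": "1: person"}],  # frame 0: two tracked people
    [{"label": "0: person"}],                          # frame 1: one tracked person
]
print(get_counts(sample_preds))
# {'frame_0': {' person': 2}, 'frame_1': {' person': 1}}
```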
@@ -159,7 +189,7 @@ But got the following error or no stdout:


 PICK_PLAN = """
-**Role**: You are
+**Role**: You are an advanced AI model that can understand the user request and construct plans to accomplish it.

 **Task**: Your responsibility is to pick the best plan from the three plans provided.

@@ -173,13 +203,14 @@ PICK_PLAN = """
 {tool_output}

 **Instructions**:
-1.
-2. Solve the problem yourself given the image and pick the plan that matches your solution the best.
+1. Re-read the user request, plans, tool outputs and examine the image.
+2. Solve the problem yourself given the image and pick the most accurate plan that matches your solution the best.
+3. Add modifications to improve the plan including: changing a tool, adding thresholds, string matching.
 3. Output a JSON object with the following format:
 {{
 "predicted_answer": str # the answer you would expect from the best plan
-"thoughts": str # your thought process for choosing the best plan
-"best_plan": str # the best plan you have chosen
+"thoughts": str # your thought process for choosing the best plan over other plans and any modifications you made
+"best_plan": str # the best plan you have chosen, must be `plan1`, `plan2`, or `plan3`
 }}
 """

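For reference, a response that follows this updated PICK_PLAN output format is a single JSON object with the three fields above; the sketch below uses made-up values purely for illustration:

```python
import json

# Illustrative only: a hand-written response matching the PICK_PLAN output format.
example_response = {
    "predicted_answer": "There are 2 people in the video",  # made-up value
    "thoughts": "plan2's per-frame counts matched my own count most closely; "
                "I would add a confidence threshold to reduce false positives.",
    "best_plan": "plan2",
}
print(json.dumps(example_response, indent=2))
```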
@@ -201,15 +232,18 @@ This is the documentation for the functions you have access to. You may call any
 **User Instructions**:
 {question}

-**Tool
+**Tool Tests and Outputs**:
 {tool_output}

+**Tool Output Thoughts**:
+{plan_thoughts}
+
 **Previous Feedback**:
 {feedback}

 **Instructions**:
 1. **Understand and Clarify**: Make sure you understand the task.
-2. **Algorithm/Method Selection**: Decide on the most efficient
+2. **Algorithm/Method Selection**: Decide on the most efficient method, use the tool outputs and tool thoughts to guide you.
 3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode.
 4. **Code Generation**: Translate your pseudocode into executable Python code. Ensure you use correct arguments, remember coordinates are always returned normalized from `vision_agent.tools`. All images from `vision_agent.tools` are in RGB format, red is (255, 0, 0) and blue is (0, 0, 255).
 """
@@ -18,19 +18,24 @@ Here is an example of how you can interact with a user and Actions to complete a
 {examples}
 --- END EXAMPLES ---

-**Instructions**:
-1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
-2. **Output in JSON**: Respond in JSON format, {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
-
 **Conversation**:
 Here is the current conversation so far:
 --- START CONVERSATION ---
 {conversation}
+--- END CONVERSATION ---
+
+**Instructions**:
+1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
+2. **Output in JSON**: Respond in the following format in JSON:
+
+```json
+{{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
+```
 """


 EXAMPLES_CODE1 = """
-USER: Can you detect the dogs in this image? Media name dog.jpg
+USER: Can you write code to detect the dogs in this image? Media name dog.jpg

 OBSERVATION:
 [Artifacts loaded]
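Because every agent turn must now be a single JSON object in this format, a caller can parse it mechanically. The sketch below is illustrative only (it is not the actual vision_agent control loop); the field names come from the prompt above and the sample reply is made up:

```python
import json

# Illustrative only: consuming one agent turn that follows the required JSON format.
raw_reply = (
    '{"thoughts": "I will detect the dogs", '
    '"response": "Running the detector now.", '
    '"let_user_respond": false}'
)

reply = json.loads(raw_reply)
print(reply["thoughts"])   # the agent's internal reasoning
print(reply["response"])   # the message (or <execute_python> block) for this turn
if reply["let_user_respond"]:
    # hand control back to the user for the next message
    pass
```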
@@ -61,6 +66,7 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask
 EXAMPLES_CODE1_EXTRA = """
 USER: The the image only has one dog, can you fix this?

+OBSERVATION:
 [Artifacts loaded]
 Artifact dog.jpg loaded to /path/to/images/dog.jpg
 Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
@@ -86,8 +92,24 @@ OBSERVATION:
 AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
 """

-
 EXAMPLES_CODE2 = """
+USER: Can you describe this image?
+
+OBSERVATION:
+[Artifacts loaded]
+Artifact image.jpg loaded to /path/to/images/image.jpg
+[End of artifacts]
+
+AGENT: {"thoughts": "The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.", "response": "<execute_python>view_media_artifacts('image.jpg')</execute_python>", "let_user_respond": false}
+
+OBSERVATION:
+[Image image.jpg displayed]
+
+AGENT: {"thoughts": "The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.", "response": "The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?", "let_user_respond": true}
+"""
+
+
+EXAMPLES_CODE3 = """
 USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?

 OBSERVATION:
@@ -137,13 +159,13 @@ AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to

 USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"

-AGENT: {"thoughts": "Because the user has supplied me with labels I can call
+AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided <execute_python>object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[
+[Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]


-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}

 OBSERVATION:
 [Artifact code.py edits]
vision_agent/lmm/__init__.py
CHANGED
@@ -1,2 +1,2 @@
-from .lmm import LMM,
+from .lmm import LMM, AnthropicLMM, AzureOpenAILMM, OllamaLMM, OpenAILMM
 from .types import Message
vision_agent/lmm/lmm.py
CHANGED
@@ -1,5 +1,4 @@
 import json
-import logging
 import os
 from abc import ABC, abstractmethod
 from pathlib import Path
@@ -14,8 +13,6 @@ from vision_agent.utils.image_utils import encode_media

 from .types import Message

-_LOGGER = logging.getLogger(__name__)
-

 class LMM(ABC):
     @abstractmethod
@@ -45,11 +42,11 @@ class LMM(ABC):


 class OpenAILMM(LMM):
-    r"""An LMM class for the OpenAI
+    r"""An LMM class for the OpenAI LMMs."""

     def __init__(
         self,
-        model_name: str = "gpt-4o",
+        model_name: str = "gpt-4o-2024-05-13",
         api_key: Optional[str] = None,
         max_tokens: int = 4096,
         json_mode: bool = False,
@@ -365,8 +362,8 @@ class OllamaLMM(LMM):
         return resp["response"]  # type: ignore


-class
-    r"""An LMM class for Anthropic's
+class AnthropicLMM(LMM):
+    r"""An LMM class for Anthropic's LMMs."""

     def __init__(
         self,
@@ -402,7 +399,7 @@ class ClaudeSonnetLMM(LMM):
             ]
             if "media" in msg:
                 for media_path in msg["media"]:
-                    encoded_media = encode_media(media_path)
+                    encoded_media = encode_media(media_path, resize=768)
                     content.append(
                         ImageBlockParam(
                             type="image",
@@ -449,7 +446,7 @@ class ClaudeSonnetLMM(LMM):
         ]
         if media:
             for m in media:
-                encoded_media = encode_media(m)
+                encoded_media = encode_media(m, resize=768)
                 content.append(
                     ImageBlockParam(
                         type="image",
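The new `resize=768` argument passed to `encode_media` for Anthropic requests suggests images are downscaled before being base64-encoded, but the implementation of `encode_media` (in `vision_agent/utils/image_utils.py`) is not shown in this diff. The sketch below is only an assumption about the general idea, capping the longest side at 768 pixels with Pillow to keep the encoded payload small:

```python
from PIL import Image

def resize_longest_side(path: str, resize: int = 768) -> Image.Image:
    """Illustrative only: shrink an image so its longest side is at most `resize` px.
    This is an assumed behavior, not the actual encode_media implementation."""
    img = Image.open(path)
    img.thumbnail((resize, resize))  # preserves aspect ratio, never upscales
    return img
```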
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/meta_tools.py
CHANGED
@@ -486,6 +486,33 @@ def list_artifacts(artifacts: Artifacts) -> str:
     return output_str


+def check_and_load_image(code: str) -> List[str]:
+    if not code.strip():
+        return []
+
+    pattern = r"show_media_artifact\(\s*([^\)]+),\s*['\"]([^\)]+)['\"]\s*\)"
+    match = re.search(pattern, code)
+    if match:
+        name = match.group(2)
+        return [name]
+    return []
+
+
+def view_media_artifact(artifacts: Artifacts, name: str) -> str:
+    """Views the image artifact with the given name.
+
+    Parameters:
+        artifacts (Artifacts): The artifacts object to show the image from.
+        name (str): The name of the image artifact to show.
+    """
+    if name not in artifacts:
+        output_str = f"[Artifact {name} does not exist]"
+    else:
+        output_str = f"[Image {name} displayed]"
+    print(output_str)
+    return output_str
+
+
 def get_tool_descriptions() -> str:
     """Returns a description of all the tools that `generate_vision_code` has access to.
     Helpful for answering questions about what types of vision tasks you can do with
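`check_and_load_image` scans generated code for a `show_media_artifact(...)` call and pulls out the artifact name from the second argument. A quick, self-contained check of that extraction (the sample code string is made up for illustration):

```python
import re

# Illustrative only: the same pattern check_and_load_image applies to generated code.
pattern = r"show_media_artifact\(\s*([^\)]+),\s*['\"]([^\)]+)['\"]\s*\)"
code = "show_media_artifact(artifacts, 'dog.jpg')"

match = re.search(pattern, code)
if match:
    print(match.group(2))  # -> dog.jpg, the artifact name to load
```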
@@ -493,16 +520,15 @@ def get_tool_descriptions() -> str:
     return TOOL_DESCRIPTIONS


-def
+def object_detection_fine_tuning(bboxes: List[Dict[str, Any]]) -> str:
     """DO NOT use this function unless the user has supplied you with bboxes.
-    '
-    objects in an image based on a given dataset. It returns the fine
+    'object_detection_fine_tuning' is a tool that fine-tunes object detection models to
+    be able to detect objects in an image based on a given dataset. It returns the fine
+    tuning job id.

     Parameters:
         bboxes (List[BboxInput]): A list of BboxInput containing the image path, labels
             and bounding boxes. The coordinates are unnormalized.
-        task (str): The florencev2 fine-tuning task. The options are
-            'phrase_grounding'.

     Returns:
         str: The fine tuning job id, this id will used to retrieve the fine tuned
@@ -510,12 +536,13 @@ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:

     Example
     -------
-        >>> fine_tuning_job_id =
+        >>> fine_tuning_job_id = object_detection_fine_tuning(
             [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
              {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
             "phrase_grounding"
         )
     """
+    task = "phrase_grounding"
     bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
     task_type = PromptTask[task.upper()]
     fine_tuning_request = [
@@ -531,7 +558,7 @@ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
     fine_tune_id = str(
         landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request)
     )
-    print(f"[
+    print(f"[Fine tuning id: {fine_tune_id}]")
     return fine_tune_id


@@ -564,7 +591,7 @@ def use_extra_vision_agent_args(
     Returns:
         str: The edited code.
     """
-    generate_pattern = r"generate_vision_code\(\s*([^\)]+)\)"
+    generate_pattern = r"generate_vision_code\(\s*([^\)]+)\s*\)"

     def generate_replacer(match: re.Match) -> str:
         arg = match.group(1)
@@ -575,7 +602,7 @@ def use_extra_vision_agent_args(
         out_str += ")"
         return out_str

-    edit_pattern = r"edit_vision_code\(\s*([^\)]+)\)"
+    edit_pattern = r"edit_vision_code\(\s*([^\)]+)\s*\)"

     def edit_replacer(match: re.Match) -> str:
         arg = match.group(1)
@@ -591,48 +618,52 @@ def use_extra_vision_agent_args(
     return new_code


-def
-    artifacts: Artifacts, name: str,
+def use_object_detection_fine_tuning(
+    artifacts: Artifacts, name: str, fine_tune_id: str
 ) -> str:
-    """Replaces
-
-
+    """Replaces calls to 'owl_v2_image', 'florence2_phrase_detection' and
+    'florence2_sam2_image' with the fine tuning id. This ensures that the code utilizes
+    the fined tuned florence2 model. Returns the diff between the original code and the
+    new code.

     Parameters:
         artifacts (Artifacts): The artifacts object to edit the code from.
         name (str): The name of the artifact to edit.
-        task (str): The task to fine tune the model for. The options are
-            'phrase_grounding'.
         fine_tune_id (str): The fine tuning job id.

     Examples
     --------
-        >>> diff =
+        >>> diff = use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
     """

-    task_to_fn = {"phrase_grounding": "florence2_phrase_grounding"}
-
     if name not in artifacts:
         output_str = f"[Artifact {name} does not exist]"
         print(output_str)
         return output_str

     code = artifacts[name]
-
-
-
-
-
-
-
-
-
+    patterns = [
+        (
+            r"florence2_phrase_grounding\(\s*([^\)]+)\s*\)",
+            lambda match: f'florence2_phrase_grounding({match.group(1)}, "{fine_tune_id}")',
+        ),
+        (
+            r"owl_v2_image\(\s*([^\)]+)\s*\)",
+            lambda match: f'owl_v2_image({match.group(1)}, "{fine_tune_id}")',
+        ),
+        (
+            r"florence2_sam2_image\(\s*([^\)]+)\s*\)",
+            lambda match: f'florence2_sam2_image({match.group(1)}, "{fine_tune_id}")',
+        ),
+    ]

-    new_code =
+    new_code = code
+    for pattern, replacer in patterns:
+        new_code = re.sub(pattern, replacer, new_code)

     if new_code == code:
         output_str = (
-            f"[
+            f"[No function calls to replace with fine tuning id in artifact {name}]"
         )
         print(output_str)
         return output_str
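To see what this rewrite does in practice, here is a small self-contained check of one of the (pattern, replacer) pairs; the sample code string and fine tuning id are made up for illustration:

```python
import re

# Illustrative only: append a fine tuning id to an existing tool call via regex rewrite.
fine_tune_id = "23b3b022-5ebf-4798-9373-20ef36429abf"
pattern = r"florence2_phrase_grounding\(\s*([^\)]+)\s*\)"
replacer = lambda match: f'florence2_phrase_grounding({match.group(1)}, "{fine_tune_id}")'

code = 'dets = florence2_phrase_grounding("worker, helmet", image)'
print(re.sub(pattern, replacer, code))
# dets = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
```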
@@ -645,7 +676,7 @@ def use_florence2_fine_tuning(
     display(
         {
             MimeType.APPLICATION_ARTIFACT: json.dumps(
-                {"name": name, "content": new_code}
+                {"name": name, "content": new_code, "action": "edit"}
             )
         },
         raw=True,
@@ -662,8 +693,9 @@ META_TOOL_DOCSTRING = get_tool_documentation(
         generate_vision_code,
         edit_vision_code,
         write_media_artifact,
-
-
+        view_media_artifact,
+        object_detection_fine_tuning,
+        use_object_detection_fine_tuning,
         list_artifacts,
     ]
 )