PyPI - vision-agent - Versions diffs - 1.0.4__py3-none-any.whl → 1.0.7__py3-none-any.whl - Mend

vision-agent 1.0.4__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

vision_agent/.sim_tools/df.csv +46 -47
vision_agent/.sim_tools/embs.npy +0 -0
vision_agent/agent/__init__.py +0 -16
vision_agent/agent/vision_agent_planner_prompts_v2.py +57 -58
vision_agent/agent/vision_agent_planner_v2.py +3 -2
vision_agent/configs/anthropic_config.py +29 -16
vision_agent/configs/config.py +14 -15
vision_agent/configs/openai_config.py +10 -10
vision_agent/lmm/lmm.py +2 -2
vision_agent/tools/__init__.py +0 -6
vision_agent/tools/meta_tools.py +1 -492
vision_agent/tools/planner_tools.py +13 -14
vision_agent/tools/tools.py +16 -27
{vision_agent-1.0.4.dist-info → vision_agent-1.0.7.dist-info}/METADATA +31 -3
{vision_agent-1.0.4.dist-info → vision_agent-1.0.7.dist-info}/RECORD +17 -24
vision_agent/agent/vision_agent.py +0 -605
vision_agent/agent/vision_agent_coder.py +0 -742
vision_agent/agent/vision_agent_coder_prompts.py +0 -290
vision_agent/agent/vision_agent_planner.py +0 -564
vision_agent/agent/vision_agent_planner_prompts.py +0 -199
vision_agent/agent/vision_agent_prompts.py +0 -312
vision_agent/configs/anthropic_openai_config.py +0 -164
{vision_agent-1.0.4.dist-info → vision_agent-1.0.7.dist-info}/LICENSE +0 -0
{vision_agent-1.0.4.dist-info → vision_agent-1.0.7.dist-info}/WHEEL +0 -0

vision_agent/agent/vision_agent_planner_prompts.py DELETED Viewed

@@ -1,199 +0,0 @@
-USER_REQ = """
-## User Request
-{user_request}
-"""
-PLAN = """
-**Context**:
-{context}
-**Tools Available**:
-{tool_desc}
-**Previous Feedback**:
-{feedback}
-**Instructions**:
-1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
-2. For each subtask, be sure to include the tool(s) you want to use to accomplish that subtask.
-3. Output three different plans each utilize a different strategy or set of tools ordering them from most likely to least likely to succeed.
-Output a list of jsons in the following format:
-```json
-{{
-    "plan1":
-        {{
-            "thoughts": str # your thought process for choosing this plan
-            "instructions": [
-                str # what you should do in this task associated with a tool
-            ]
-        }},
-    "plan2": ...,
-    "plan3": ...
-}}
-```
-"""
-TEST_PLANS = """
-**Role**: You are a software programmer responsible for testing different tools.
-**Task**: Your responsibility is to take a set of several plans and test the different tools for each plan.
-**Documentation**:
-This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
-{docstring}
-**Plans**:
-{plans}
-**Previous Attempts**:
-{previous_attempts}
-**Examples**:
---- EXAMPLE1 ---
-plan1:
-- Load the image from the provided file path 'image.jpg'.
-- Use the 'owlv2_object_detection' tool with the prompt 'person' to detect and count the number of people in the image.
-plan2:
-- Load the image from the provided file path 'image.jpg'.
-- Use the 'florence2_sam2_instance_segmentation' tool with the prompt 'person' to detect and count the number of people in the image.
-- Count the number of detected objects labeled as 'person'.
-plan3:
-- Load the image from the provided file path 'image.jpg'.
-- Use the 'countgd_object_detection' tool to count the dominant foreground object, which in this case is people.
-```python
-from vision_agent.tools import load_image, owlv2_object_detection, florence2_sam2_instance_segmentation, countgd_object_detection
-image = load_image("image.jpg")
-owl_v2_out = owlv2_object_detection("person", image)
-f2s2_out = florence2_sam2_instance_segmentation("person", image)
-# strip out the masks from the output becuase they don't provide useful information when printed
-f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
-cgd_out = countgd_object_detection("person", image)
-final_out = {{"owlv2_object_detection": owl_v2_out, "florence2_sam2_instance_segmentation": f2s2, "countgd_object_detection": cgd_out}}
-print(final_out)
---- END EXAMPLE1 ---
---- EXAMPLE2 ---
-plan1:
-- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
-- Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
-plan2:
-- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
-- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
-plan3:
-- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
-- Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
-```python
-import numpy as np
-from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
-# sample at 1 FPS and use the first 10 frames to reduce processing time
-frames = extract_frames_and_timestamps("video.mp4", 1)
-frames = [f["frame"] for f in frames][:10]
-# strip arrays from the output to make it easier to read
-def remove_arrays(o):
-    if isinstance(o, list):
-        return [remove_arrays(e) for e in o]
-    elif isinstance(o, dict):
-        return {{k: remove_arrays(v) for k, v in o.items()}}
-    elif isinstance(o, np.ndarray):
-        return "array: " + str(o.shape)
-    else:
-        return o
-# return the counts of each label per frame to help determine the stability of the model results
-def get_counts(preds):
-    counts = {{}}
-    for i, pred_frame in enumerate(preds):
-        counts_i = {{}}
-        for pred in pred_frame:
-            label = pred["label"].split(":")[1] if ":" in pred["label"] else pred["label"]
-            counts_i[label] = counts_i.get(label, 0) + 1
-        counts[f"frame_{{i}}"] = counts_i
-    return counts
-# plan1
-owl_v2_out = owl_v2_video("person", frames)
-owl_v2_counts = get_counts(owl_v2_out)
-# plan2
-florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
-florence2_counts = get_counts(florence2_out)
-# plan3
-f2s2_tracking_out = florence2_sam2_video_tracking("person", frames)
-remove_arrays(f2s2_tracking_out)
-f2s2_counts = get_counts(f2s2_tracking_out)
-final_out = {{
-    "owl_v2_video": owl_v2_out,
-    "florence2_phrase_grounding": florence2_out,
-    "florence2_sam2_video_tracking": f2s2_out,
-}}
-counts = {{
-    "owl_v2_video": owl_v2_counts,
-    "florence2_phrase_grounding": florence2_counts,
-    "florence2_sam2_video_tracking": f2s2_counts,
-}}
-print(final_out)
-print(labels_and_scores)
-print(counts)
-```
---- END EXAMPLE2 ---
-**Instructions**:
-1. Write a program to load the media and call each tool and print it's output along with other relevant information.
-2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
-3. Your test case MUST run only on the given images which are {media}
-4. Print this final dictionary.
-5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
-"""
-PREVIOUS_FAILED = """
-**Previous Failed Attempts**:
-You previously ran this code:
-```python
-{code}
-```
-But got the following error or no stdout:
-{error}
-"""
-PICK_PLAN = """
-**Role**: You are an advanced AI model that can understand the user request and construct plans to accomplish it.
-**Task**: Your responsibility is to pick the best plan from the three plans provided.
-**Context**:
-{context}
-**Plans**:
-{plans}
-**Tool Output**:
-{tool_output}
-**Instructions**:
-1. Re-read the user request, plans, tool outputs and examine the image.
-2. Solve the problem yourself given the image and pick the most accurate plan that matches your solution the best.
-3. Add modifications to improve the plan including: changing a tool, adding thresholds, string matching.
-4. Output a JSON object with the following format:
-{{
-    "predicted_answer": str # the answer you would expect from the best plan
-    "thoughts": str # your thought process for choosing the best plan over other plans and any modifications you made
-    "best_plan": str # the best plan you have chosen, must be `plan1`, `plan2`, or `plan3`
-}}
-"""

vision_agent/agent/vision_agent_prompts.py DELETED Viewed

@@ -1,312 +0,0 @@
-VA_CODE = """
-**Role**: You are a helpful agent that assists users with writing code.
-**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be saved only AFTER you execute python code with `save_imgae` or `save_video`. The user can see all `artifacts`.
-<execute_python>
-print("Hello World!")
-</execute_python>
-**Documentation**:
-This is the documentation for the different actions you can take:
-{documentation}
-**Examples**:
-Here is an example of how you can interact with a user and Actions to complete a task:
---- START EXAMPLES ---
-{examples}
---- END EXAMPLES ---
-**Conversation**:
-Here is the current conversation so far:
---- START CONVERSATION ---
-{conversation}
---- END CONVERSATION ---
-**Instructions**:
-1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
-2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`. DO NOT run `edit_vision_code` or `edit_code_artifact` more than 2 times in a row and instead ask the user for help.
-3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set <let_user_respond> to `true`.
-4. **Artifacts**: Files are only saved in `artifacts` after <execute_python>, do not try to access artifacts until you observe that they are loaded.
-5. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
-6. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.
-<thinking>Your thoughts here...</thinking>
-<response>Your response to the user here...</response>
-<execute_python>Your code here...</execute_python>
-<let_user_respond>true/false for whether or not to you want to let the user respond.</let_user_respond>
-"""
-EXAMPLES_CODE1 = """
-USER: Can you write code to detect the dogs in this image? Media name dog.jpg
-OBSERVATION:
-[Artifacts loaded]
-Artifact dog.jpg loaded to /path/to/images/dog.jpg
-[End of artifacts]
-AGENT: <thinking>I will use the generate_vision_code to detect the dogs in the image.</thinking>
-<execute_python>
-generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])
-</execute_python>
-<let_user_respond>false</let_user_respond>
-OBSERVATION:
-[Artifact dog_detector.py (5 lines total)]
-0|from vision_agent.tools import load_image, owlv2_object_detection
-1|def detect_dogs(image_path: str):
-2|    image = load_image(image_path)
-3|    dogs = owlv2_object_detection("dog", image)
-4|    return dogs
-[End of artifact]
-AGENT: <thinking>I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.</thinking>
-<execute_python>
-from dog_detector import detect_dogs
-print(detect_dogs('/example/workspace/dog.jpg'))
-</execute_python>
-<let_user_respond>false</let_user_respond>
-OBSERVATION:
------ stdout -----
-[{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]
-AGENT: <thinking>Two dogs are detected, I will show this to the user and ask them if the result looks good.</thinking>
-<response>The code detectd two dogs, do the results look good to you?</response>
-<let_user_respond>true</let_user_respond>
-"""
-EXAMPLES_CODE1_EXTRA = """
-USER: The the image only has one dog, can you fix this?
-OBSERVATION:
-[Artifacts loaded]
-Artifact dog.jpg loaded to /path/to/images/dog.jpg
-Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
-[End of artifacts]
-AGENT: <thinking>I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that</thinking>
-<execute_python>
-edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])
-</execute_python>
-<let_user_respond>false</let_user_respond>
-OBSERVATION:
-[Artifact dog_detector.py (5 lines total)]
-0|from vision_agent.tools import load_image, owlv2_object_detection
-1|def detect_dogs(image_path: str):
-2|    image = load_image(image_path)
-3|    dogs = owlv2_object_detection("dog", image, threshold=0.24)
-4|    return dogs
-[End of artifact]
-AGENT: <thinking>I have edited the code to detect only one dog, I must now run the code and print the results to get the output.</thinking>
-<execute_python>
-from dog_detector import detect_dogs
-print(detect_dogs('/path/to/images/dog.jpg'))
-</execute_python>
-<let_user_respond>false</let_user_respond>
-OBSERVATION:
------ stdout -----
-[{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]
-AGENT: <thinking>One dog is detected, I will show this to the user and ask them if the result looks good.</thinking>
-<response>The code detected one dog, do these results look good to you?</response>
-<let_user_respond>true</let_user_respond>
-"""
-EXAMPLES_CODE2 = """
-USER: Can you describe this image?
-OBSERVATION:
-[Artifacts loaded]
-Artifact image.jpg loaded to /path/to/images/image.jpg
-[End of artifacts]
-AGENT: <thinking>The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.</thinking>
-<execute_python>
-view_media_artifacts('image.jpg')
-</execute_python>
-<let_user_respond>false</let_user_respond>
-OBSERVATION:
-[Image image.jpg displayed]
-AGENT: <thinking>The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.</thinking>
-<response>The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?</response>
-<let_user_respond>true</let_user_respond>
-"""
-EXAMPLES_CODE3 = """
-USER: Can you create a function to count workers with helmets, return the count and a visaulization of the bounding boxes?
-OBSERVATION:
-[Artifacts loaded]
-[End of artifacts]
-AGENT: <thinking>The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.</thinking>
-<response>Can you provide an image of workers with helmets?</response>
-<let_user_respond>true</let_user_respond>
-USER: Yes you can use workers.png
-OBSERVATION:
-[Artifacts loaded]
-Artifact workers.png loaded to /path/to/images/workers.png
-[End of artifacts]
-AGENT: <thinking>I will use the generate_vision_code to count the workers with helmets in the image.</thinking>
-<execute_python>
-generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])
-</execute_python>
-<let_user_respond>false</let_user_respond>
-OBSERVATION:
-[Artifact code.py (17 lines total)]
-0|from vision_agent.tools import load_image, florence2_object_detection, closest_box_distance, overlay_bounding_boxes, save_image
-1|def count_workers_with_helmets(image_path: str):
-2|    image = load_image(image_path)
-3|    detections = florence2_object_detection("worker, helmet", image)
-4|    workers = [d for d in detections if d['label'] == 'worker']
-5|    helmets = [d for d in detections if d['label'] == 'helmet']
-6|    count = 0
-6|    for worker in workers:
-7|        person_has_helmet = False
-8|        for helmet in helmets:
-9|            if closest_box_distance(worker['box'], helmet['box']) < 0.01:
-10|                person_has_helmet = True
-11|                break
-11|        if person_has_helmet:
-13|            count += 1
-14|    viz = overlay_bounding_boxes(image, detections)
-16|    return count, viz
-[End of artifact]
-AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code, print the counts and save the visualization.</thinking>
-<execute_python>
-from code import count_workers_with_helmets
-count, viz = count_workers_with_helmets('/path/to/images/workers.png')
-save_image(viz, 'workers_viz.png')
-print(count)
-</execute_python>
-<let_user_respond>false</let_user_respond>
-OBSERVATION:
------ stdout -----
-2
-AGENT: <thinking>Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
-<response>The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.</response>
-<let_user_respond>true</let_user_respond>
-"""
-EXAMPLES_CODE3_EXTRA1 = """
-USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
-AGENT: <thinking>Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model</thinking>
-<execute_python>
-object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])
-</execute_python>
-<let_user_respond>false</let_user_respond>
-OBSERVATION:
-[Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
-AGENT: <thinking>The model has finished fine tuning, I will now replace the original florence2_object_detection call with the fine tuning id.</thinking>
-<execute_python>
-use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
-</execute_python>
-<let_user_respond>false</let_user_respond>
-OBSERVATION:
-[Artifact code.py edits]
----
-+++
-@@ -1,7 +1,7 @@
- from vision_agent.tools import load_image, florence2_object_detection, closest_box_distance, overlay_bounding_boxes, save_image
- def count_workers_with_helmets(image_path: str, output_path: str):
-     image = load_image(image_path)
--    detections = florence2_object_detection("worker, helmet", image)
-+    detections = florence2_object_detection("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
-     workers = [d for d in detections if d['label'] == 'worker']
-     helmets = [d for d in detections if d['label'] == 'helmet']
-     count = 0
-[End of edits]
-AGENT: <thinking>I have updated the code to use the fine tuned model, I must now run the code and print the output.</thinking>
-<execute_python>
-from code import count_workers_with_helmets
-print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
-</execute_python>
-<let_user_respond>false</let_user_respond>
-OBSERVATION:
------ stdout -----
-3
-AGENT: <thinking>Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
-<response>I have updated the code using the fine tuned florence2_object_detection model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
-<let_user_respond>true</let_user_respond>
-"""
-EXAMPLES_CODE3_EXTRA2 = """
-USER: Can you write a streamlit app that allows me to upload an image and count the workers with helmets?
-AGENT: <thinking>I will use edit_code_artifact to generate a streamlit app that allows the user to upload an image and count the workers with helmets.</thinking>
-<execute_python>
-CODE = '''
-import streamlit as st
-from code imoprt count_workers_with_helmets
-import os
-def main():
-    st.title("Worker with Helmet Counter")
-    uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
-    if uploaded_file is not None:
-        with open("temp_image.jpg", "wb") as f:
-            f.write(uploaded_file.read())
-        count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
-        st.image("temp_image_viz.jpg")
-        st.write(f"Number of workers with helmets: {count}")
-        os.remove("temp_image.jpg")
-        os.remove("temp_image_viz.jpg")
-if __name__ == "__main__":
-    main()
-'''
-edit_code_artifact(artifacts, 'streamlit_app.py', 0, 0, CODE)
-</execute_python>
-OBSERVATION:
-[Artifact streamlit_app.py (19 lines total)]
-0|import streamlit as st
-1|from code imoprt count_workers_with_helmets
-2|import os
-3|
-4|def main():
-5|    st.title("Worker with Helmet Counter")
-6|    uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
-7|    if uploaded_file is not None:
-8|        with open("temp_image.jpg", "wb") as f:
-9|            f.write(uploaded_file.read())
-10|
-11|        count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
-12|        st.image("temp_image_viz.jpg")
-13|        st.write(f"Number of workers with helmets: {count}")
-14|        os.remove("temp_image.jpg")
-15|        os.remove("temp_image_viz.jpg")
-16|
-17|if __name__ == "__main__":
-18|    main()
-[End of artifact]
-AGENT: <thinking>I have generated the streamlit app that allows the user to upload an image and count the workers</thinking>
-<response>The streamlit app is saved in streamlit_app.py, you can run the app by running `streamlit run streamlit_app.py`.</response>
-<let_user_respond>true</let_user_respond>
-"""

vision_agent/configs/anthropic_openai_config.py DELETED Viewed

@@ -1,164 +0,0 @@
-from typing import Type
-from pydantic import BaseModel, Field
-from vision_agent.lmm import LMM, AnthropicLMM, OpenAILMM
-class Config(BaseModel):
-    # for vision_agent_v2
-    agent: Type[LMM] = Field(default=AnthropicLMM)
-    agent_kwargs: dict = Field(
-        default_factory=lambda: {
-            "model_name": "claude-3-5-sonnet-20241022",
-            "temperature": 0.0,
-            "image_size": 768,
-        }
-    )
-    # for vision_agent_planner_v2
-    planner: Type[LMM] = Field(default=AnthropicLMM)
-    planner_kwargs: dict = Field(
-        default_factory=lambda: {
-            "model_name": "claude-3-5-sonnet-20241022",
-            "temperature": 0.0,
-            "image_size": 768,
-        }
-    )
-    # for vision_agent_planner_v2
-    summarizer: Type[LMM] = Field(default=OpenAILMM)
-    summarizer_kwargs: dict = Field(
-        default_factory=lambda: {
-            "model_name": "o1",
-            "temperature": 1.0,  # o1 has fixed temperature
-            "image_size": 768,
-        }
-    )
-    # for vision_agent_planner_v2
-    critic: Type[LMM] = Field(default=AnthropicLMM)
-    critic_kwargs: dict = Field(
-        default_factory=lambda: {
-            "model_name": "claude-3-5-sonnet-20241022",
-            "temperature": 0.0,
-            "image_size": 768,
-        }
-    )
-    # for vision_agent_coder_v2
-    coder: Type[LMM] = Field(default=AnthropicLMM)
-    coder_kwargs: dict = Field(
-        default_factory=lambda: {
-            "model_name": "claude-3-5-sonnet-20241022",
-            "temperature": 0.0,
-            "image_size": 768,
-        }
-    )
-    # for vision_agent_coder_v2
-    tester: Type[LMM] = Field(default=AnthropicLMM)
-    tester_kwargs: dict = Field(
-        default_factory=lambda: {
-            "model_name": "claude-3-5-sonnet-20241022",
-            "temperature": 0.0,
-            "image_size": 768,
-        }
-    )
-    # for vision_agent_coder_v2
-    debugger: Type[LMM] = Field(default=AnthropicLMM)
-    debugger_kwargs: dict = Field(
-        default_factory=lambda: {
-            "model_name": "claude-3-5-sonnet-20241022",
-            "temperature": 0.0,
-            "image_size": 768,
-        }
-    )
-    # for get_tool_for_task
-    tool_tester: Type[LMM] = Field(default=AnthropicLMM)
-    tool_tester_kwargs: dict = Field(
-        default_factory=lambda: {
-            "model_name": "claude-3-5-sonnet-20241022",
-            "temperature": 0.0,
-            "image_size": 768,
-        }
-    )
-    # for get_tool_for_task
-    tool_chooser: Type[LMM] = Field(default=AnthropicLMM)
-    tool_chooser_kwargs: dict = Field(
-        default_factory=lambda: {
-            "model_name": "claude-3-5-sonnet-20241022",
-            "temperature": 1.0,
-            "image_size": 768,
-        }
-    )
-    # for get_tool_for_task
-    od_judge: Type[LMM] = Field(default=AnthropicLMM)
-    od_judge_kwargs: dict = Field(
-        default_factory=lambda: {
-            "model_name": "claude-3-5-sonnet-20241022",
-            "temperature": 0.0,
-            "image_size": 512,
-        }
-    )
-    # for suggestions module
-    suggester: Type[LMM] = Field(default=OpenAILMM)
-    suggester_kwargs: dict = Field(
-        default_factory=lambda: {
-            "model_name": "o1",
-            "temperature": 1.0,
-            "image_detail": "high",
-            "image_size": 1024,
-        }
-    )
-    # for vqa module
-    vqa: Type[LMM] = Field(default=AnthropicLMM)
-    vqa_kwargs: dict = Field(
-        default_factory=lambda: {
-            "model_name": "claude-3-5-sonnet-20241022",
-            "temperature": 0.0,
-            "image_size": 768,
-        }
-    )
-    def create_agent(self) -> LMM:
-        return self.agent(**self.agent_kwargs)
-    def create_planner(self) -> LMM:
-        return self.planner(**self.planner_kwargs)
-    def create_summarizer(self) -> LMM:
-        return self.summarizer(**self.summarizer_kwargs)
-    def create_critic(self) -> LMM:
-        return self.critic(**self.critic_kwargs)
-    def create_coder(self) -> LMM:
-        return self.coder(**self.coder_kwargs)
-    def create_tester(self) -> LMM:
-        return self.tester(**self.tester_kwargs)
-    def create_debugger(self) -> LMM:
-        return self.debugger(**self.debugger_kwargs)
-    def create_tool_tester(self) -> LMM:
-        return self.tool_tester(**self.tool_tester_kwargs)
-    def create_tool_chooser(self) -> LMM:
-        return self.tool_chooser(**self.tool_chooser_kwargs)
-    def create_od_judge(self) -> LMM:
-        return self.od_judge(**self.od_judge_kwargs)
-    def create_suggester(self) -> LMM:
-        return self.suggester(**self.suggester_kwargs)
-    def create_vqa(self) -> LMM:
-        return self.vqa(**self.vqa_kwargs)

{vision_agent-1.0.4.dist-info → vision_agent-1.0.7.dist-info}/LICENSE RENAMED Viewed

File without changes

{vision_agent-1.0.4.dist-info → vision_agent-1.0.7.dist-info}/WHEEL RENAMED Viewed

File without changes

vision-agent 1.0.4__py3-none-any.whl → 1.0.7__py3-none-any.whl

vision-agent 1.0.4__py3-none-any.whl → 1.0.7__py3-none-any.whl