vision-agent 0.2.140__py3-none-any.whl → 0.2.142__py3-none-any.whl
- vision_agent/agent/__init__.py +2 -1
- vision_agent/agent/agent_utils.py +8 -2
- vision_agent/agent/vision_agent.py +97 -17
- vision_agent/agent/vision_agent_coder.py +93 -66
- vision_agent/agent/vision_agent_coder_prompts.py +53 -19
- vision_agent/agent/vision_agent_prompts.py +31 -9
- vision_agent/lmm/__init__.py +1 -1
- vision_agent/lmm/lmm.py +6 -9
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/meta_tools.py +65 -33
- vision_agent/tools/tools.py +115 -30
- vision_agent/tools/tools_types.py +1 -0
- vision_agent/utils/image_utils.py +18 -7
- vision_agent/utils/video.py +2 -1
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.142.dist-info}/METADATA +60 -12
- vision_agent-0.2.142.dist-info/RECORD +33 -0
- vision_agent-0.2.140.dist-info/RECORD +0 -33
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.142.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.140.dist-info → vision_agent-0.2.142.dist-info}/WHEEL +0 -0
vision_agent/tools/tools.py
CHANGED
```diff
@@ -149,6 +149,7 @@ def owl_v2_image(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.10,
+    fine_tune_id: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
     """'owl_v2_image' is a tool that can detect and count multiple objects given a text
     prompt such as category names or referring expressions on images. The categories in
```
```diff
@@ -160,6 +161,8 @@ def owl_v2_image(
         image (np.ndarray): The image to ground the prompt to.
         box_threshold (float, optional): The threshold for the box detection. Defaults
             to 0.10.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
```
```diff
@@ -176,7 +179,38 @@ def owl_v2_image(
         {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
     ]
     """
+
     image_size = image.shape[:2]
+
+    if fine_tune_id is not None:
+        image_b64 = convert_to_b64(image)
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        data_obj = Florence2FtRequest(
+            image=image_b64,
+            task=PromptTask.PHRASE_GROUNDING,
+            tool="florencev2_fine_tuning",
+            prompt=prompt,
+            fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+        )
+        data = data_obj.model_dump(by_alias=True)
+        detections = send_inference_request(data, "tools", v2=False)
+        detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
+        bboxes_formatted = [
+            ODResponseData(
+                label=detections["labels"][i],
+                bbox=normalize_bbox(detections["bboxes"][i], image_size),
+                score=1.0,
+            )
+            for i in range(len(detections["bboxes"]))
+        ]
+        return [bbox.model_dump() for bbox in bboxes_formatted]
+
     buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
     payload = {
```
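The hunk above is the heart of the 0.2.142 fine-tuning support: when `fine_tune_id` is given, `owl_v2_image` short-circuits to a fine-tuned Florence-2 phrase-grounding deployment instead of the stock OWLv2 endpoint, and raises `FineTuneModelIsNotReady` while the job is still training. A minimal usage sketch (the UUID is a placeholder, and the exception's import path from `vision_agent.utils.exceptions` is an assumption):

```python
import vision_agent.tools as T
from vision_agent.utils.exceptions import FineTuneModelIsNotReady  # assumed location

image = T.load_image("cereal.jpg")

try:
    # Routed through the fine-tuned Florence-2 model; scores come back fixed
    # at 1.0 because the grounding endpoint does not return confidences.
    dets = T.owl_v2_image(
        "cereal box",
        image,
        fine_tune_id="00000000-0000-0000-0000-000000000000",  # placeholder job ID
    )
except FineTuneModelIsNotReady:
    dets = T.owl_v2_image("cereal box", image)  # fall back to the base model
```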
```diff
@@ -206,10 +240,10 @@ def owl_v2_video(
     box_threshold: float = 0.10,
 ) -> List[List[Dict[str, Any]]]:
     """'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple
-    objects per frame given a text prompt
-    expression
-
-    detections for that frame.
+    objects indepdently per frame given a text prompt such as a category name or
+    referring expression but does not track objects across frames. The categories in
+    text prompt are separated by commas. It returns a list of lists where each inner
+    list contains the score, label, and bounding box of the detections for that frame.
 
     Parameters:
         prompt (str): The prompt to ground to the video.
```
```diff
@@ -335,7 +369,9 @@ def grounding_sam(
     return return_data
 
 
-def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+def florence2_sam2_image(
+    prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
+) -> List[Dict[str, Any]]:
     """'florence2_sam2_image' is a tool that can segment multiple objects given a text
     prompt such as category names or referring expressions. The categories in the text
     prompt are separated by commas. It returns a list of bounding boxes, label names,
```
```diff
@@ -344,6 +380,8 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]
     Parameters:
         prompt (str): The prompt to ground to the image.
         image (np.ndarray): The image to ground the prompt to.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label,
```
```diff
@@ -369,18 +407,52 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]
         },
     ]
     """
-
+    if fine_tune_id is not None:
+        image_b64 = convert_to_b64(image)
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
 
+        req_data_obj = Florence2FtRequest(
+            image=image_b64,
+            task=PromptTask.PHRASE_GROUNDING,
+            tool="florencev2_fine_tuning",
+            prompt=prompt,
+            fine_tuning=FineTuning(
+                job_id=UUID(fine_tune_id),
+                postprocessing="sam2",
+            ),
+        )
+        req_data = req_data_obj.model_dump(by_alias=True)
+        detections_ft = send_inference_request(req_data, "tools", v2=False)
+        detections_ft = detections_ft["<CAPTION_TO_PHRASE_GROUNDING>"]
+        return_data = []
+        all_masks = np.array(detections_ft["masks"])
+        for i in range(len(detections_ft["bboxes"])):
+            return_data.append(
+                {
+                    "score": 1.0,
+                    "label": detections_ft["labels"][i],
+                    "bbox": detections_ft["bboxes"][i],
+                    "mask": all_masks[i, :, :].astype(np.uint8),
+                }
+            )
+        return return_data
+
+    buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
     payload = {
         "prompts": [s.strip() for s in prompt.split(",")],
         "function_name": "florence2_sam2_image",
     }
-
+    detections: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
     )
     return_data = []
-    for _, data_i in
+    for _, data_i in detections["0"].items():
         mask = rle_decode_array(data_i["mask"])
         label = data_i["label"]
         bbox = normalize_bbox(data_i["bounding_box"], data_i["mask"]["size"])
```
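`florence2_sam2_image` gets the same fine-tune gating, plus `postprocessing="sam2"` so the fine-tuned boxes come back with full SAM-2 masks as `np.uint8` arrays. A sketch of consuming that output (placeholder job ID; mask shape per the dict built above):

```python
import vision_agent.tools as T

image = T.load_image("defects.jpg")
segs = T.florence2_sam2_image(
    "scratch",
    image,
    fine_tune_id="00000000-0000-0000-0000-000000000000",  # placeholder job ID
)

for seg in segs:
    mask = seg["mask"]  # np.uint8 array of 0/1 values, same H x W as the image
    print(seg["label"], seg["bbox"], f"area={int(mask.sum())}px")
```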
```diff
@@ -389,17 +461,19 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]
 
 
 def florence2_sam2_video_tracking(
-    prompt: str, frames: List[np.ndarray]
+    prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = None
 ) -> List[List[Dict[str, Any]]]:
     """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
     entities in a video given a text prompt such as category names or referring
     expressions. You can optionally separate the categories in the text with commas. It
-
-
+    can find new objects every 'chunk_length' frames and is useful for tracking and
+    counting without duplicating counts and always outputs scores of 1.0.
 
     Parameters:
         prompt (str): The prompt to ground to the video.
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run florence2 to find
+            new objects.
 
     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label
```
```diff
@@ -432,6 +506,8 @@ def florence2_sam2_video_tracking(
         "prompts": [s.strip() for s in prompt.split(",")],
         "function_name": "florence2_sam2_video_tracking",
     }
+    if chunk_length is not None:
+        payload["chunk_length"] = chunk_length  # type: ignore
     data: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
     )
```
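`chunk_length` is simply forwarded in the payload: the hosted pipeline re-runs Florence-2 detection every `chunk_length` frames and lets SAM-2 track in between, so objects entering mid-video are picked up without double counting. A sketch of counting distinct objects this way; note the assumption that returned label strings carry a stable per-track identity, which the docstring implies but does not spell out:

```python
import vision_agent.tools as T

frames_ts = T.extract_frames_and_timestamps("warehouse.mp4", fps=2)
frames = [f["frame"] for f in frames_ts]

# Re-detect every 25 frames; SAM-2 tracks identities between detections.
tracks = T.florence2_sam2_video_tracking("box", frames, chunk_length=25)

# Assumes each tracked object keeps one label string across frames.
unique = {det["label"] for frame_dets in tracks for det in frame_dets}
print(f"{len(unique)} distinct objects tracked")
```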
```diff
@@ -1119,13 +1195,13 @@ def florence2_phrase_grounding(
     return_data = []
     for i in range(len(detections["bboxes"])):
         return_data.append(
-            {
-                "score": 1.0,
-                "label": detections["labels"][i],
-                "bbox": normalize_bbox(detections["bboxes"][i], image_size),
-            }
+            ODResponseData(
+                label=detections["labels"][i],
+                bbox=normalize_bbox(detections["bboxes"][i], image_size),
+                score=1.0,
+            )
         )
-    return return_data
+    return [bbox.model_dump() for bbox in return_data]
 
 
 def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
```
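This rewrite funnels `florence2_phrase_grounding` results through the same `ODResponseData` pydantic model the new owl_v2 fine-tune path uses, so every detector emits an identical dict shape. Roughly how the model behaves (its import path and exact field types are inferred from usage here, not from its definition):

```python
from vision_agent.tools.tools_types import ODResponseData  # assumed location

det = ODResponseData(
    label="car",
    bbox=[0.2, 0.21, 0.45, 0.5],  # already normalized to [0, 1]
    score=1.0,
)
print(det.model_dump())
# -> {"label": "car", "bbox": [0.2, 0.21, 0.45, 0.5], "score": 1.0}
```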
```diff
@@ -1497,12 +1573,14 @@ def closest_box_distance(
 # Utility and visualization functions
 
 
-def extract_frames(
+def extract_frames_and_timestamps(
     video_uri: Union[str, Path], fps: float = 1
-) -> List[Tuple[np.ndarray, float]]:
-    """'
-
-
+) -> List[Dict[str, Union[np.ndarray, float]]]:
+    """'extract_frames_and_timestamps' extracts frames and timestamps from a video
+    which can be a file path, url or youtube link, returns a list of dictionaries
+    with keys "frame" and "timestamp" where "frame" is a numpy array and "timestamp" is
+    the relative time in seconds where the frame was captured. The frame is a numpy
+    array.
 
     Parameters:
         video_uri (Union[str, Path]): The path to the video file, url or youtube link
```
```diff
@@ -1510,15 +1588,23 @@ def extract_frames(
         to 1.
 
     Returns:
-        List[
-        as a numpy array and the timestamp in seconds.
+        List[Dict[str, Union[np.ndarray, float]]]: A list of dictionaries containing the
+            extracted frame as a numpy array and the timestamp in seconds.
 
     Example
     -------
     >>> extract_frames("path/to/video.mp4")
-    [
+    [{"frame": np.ndarray, "timestamp": 0.0}, ...]
     """
 
+    def reformat(
+        frames_and_timestamps: List[Tuple[np.ndarray, float]]
+    ) -> List[Dict[str, Union[np.ndarray, float]]]:
+        return [
+            {"frame": frame, "timestamp": timestamp}
+            for frame, timestamp in frames_and_timestamps
+        ]
+
     if str(video_uri).startswith(
         (
             "http://www.youtube.com/",
```
```diff
@@ -1540,16 +1626,16 @@ def extract_frames(
             raise Exception("No suitable video stream found")
         video_file_path = video.download(output_path=temp_dir)
 
-        return extract_frames_from_video(video_file_path, fps)
+        return reformat(extract_frames_from_video(video_file_path, fps))
     elif str(video_uri).startswith(("http", "https")):
         _, image_suffix = os.path.splitext(video_uri)
         with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file:
             # Download the video and save it to the temporary file
             with urllib.request.urlopen(str(video_uri)) as response:
                 tmp_file.write(response.read())
-        return extract_frames_from_video(tmp_file.name, fps)
+        return reformat(extract_frames_from_video(tmp_file.name, fps))
 
-    return extract_frames_from_video(str(video_uri), fps)
+    return reformat(extract_frames_from_video(str(video_uri), fps))
 
 
 def save_json(data: Any, file_path: str) -> None:
```
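Besides the rename, the return shape changes from `(frame, timestamp)` tuples to explicit dicts, produced by the nested `reformat` helper regardless of which branch (YouTube, plain URL, or local file) sourced the video. Consuming the new shape:

```python
import vision_agent.tools as T

frames_ts = T.extract_frames_and_timestamps("path/to/video.mp4", fps=1)

for item in frames_ts:
    frame = item["frame"]          # np.ndarray, H x W x C
    timestamp = item["timestamp"]  # float, seconds from the start of the video
    print(f"{timestamp:.1f}s -> frame {frame.shape}")
```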
```diff
@@ -1953,7 +2039,6 @@ FUNCTION_TOOLS = [
     vit_image_classification,
     vit_nsfw_classification,
     countgd_counting,
-    florence2_image_caption,
     florence2_ocr,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
```
```diff
@@ -1968,7 +2053,7 @@ FUNCTION_TOOLS = [
 ]
 
 UTIL_TOOLS = [
-    extract_frames,
+    extract_frames_and_timestamps,
     save_json,
     load_image,
     save_image,
```
vision_agent/tools/tools_types.py
CHANGED
```diff
@@ -28,6 +28,7 @@ class FineTuning(BaseModel):
     model_config = ConfigDict(populate_by_name=True)
 
     job_id: UUID = Field(alias="jobId")
+    postprocessing: Optional[str] = None
 
     @field_serializer("job_id")
     def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
```
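The new optional `postprocessing` field is what `florence2_sam2_image` sets to `"sam2"` above. Since requests dump with `by_alias=True`, the UUID serializes to a string under `jobId` (per the `Field(alias=...)` and `field_serializer` shown) while `postprocessing` keeps its name. A quick sketch of the expected wire shape:

```python
from uuid import UUID
from vision_agent.tools.tools_types import FineTuning  # assumed location

ft = FineTuning(
    job_id=UUID("00000000-0000-0000-0000-000000000000"),  # placeholder job ID
    postprocessing="sam2",  # None (the default) skips SAM-2 mask generation
)
print(ft.model_dump(by_alias=True))
# -> {"jobId": "00000000-0000-0000-0000-000000000000", "postprocessing": "sam2"}
```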
vision_agent/utils/image_utils.py
CHANGED
```diff
@@ -5,7 +5,7 @@ import io
 from importlib import resources
 from io import BytesIO
 from pathlib import Path
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
 from PIL import Image, ImageDraw, ImageFont
```
```diff
@@ -154,15 +154,20 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str:
     )
 
 
-def encode_image_bytes(image: bytes) -> str:
-
+def encode_image_bytes(image: bytes, resize: Optional[int] = None) -> str:
+    if resize is not None:
+        image_pil = Image.open(io.BytesIO(image)).convert("RGB")
+        if image_pil.size[0] > resize or image_pil.size[1] > resize:
+            image_pil.thumbnail((resize, resize))
+    else:
+        image_pil = Image.open(io.BytesIO(image)).convert("RGB")
     buffer = io.BytesIO()
-
+    image_pil.save(buffer, format="PNG")
     encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
     return encoded_image
 
 
-def encode_media(media: Union[str, Path]) -> str:
+def encode_media(media: Union[str, Path], resize: Optional[int] = None) -> str:
     if isinstance(media, str) and media.startswith(("http", "https")):
         # for mp4 video url, we assume there is a same url but ends with png
         # vision-agent-ui will upload this png when uploading the video
```
```diff
@@ -192,11 +197,17 @@ def encode_media(media: Union[str, Path]) -> str:
         frames = extract_frames_from_video(str(media), fps=1)
         image = frames[len(frames) // 2]
         buffer = io.BytesIO()
-
+        if resize is not None:
+            image_pil = Image.fromarray(image[0]).convert("RGB")
+            if image_pil.size[0] > resize or image_pil.size[1] > resize:
+                image_pil.thumbnail((resize, resize))
+        else:
+            image_pil = Image.fromarray(image[0]).convert("RGB")
+        image_pil.save(buffer, format="PNG")
         image_bytes = buffer.getvalue()
     else:
         image_bytes = open(media, "rb").read()
-    return encode_image_bytes(image_bytes)
+    return encode_image_bytes(image_bytes, resize=resize)
 
 
 def denormalize_bbox(
```
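Both branches rely on `Image.thumbnail`, which resizes in place, never upscales, and preserves aspect ratio, so `resize` caps the longest side rather than forcing a square output. A standalone check of that Pillow behavior:

```python
from PIL import Image

large = Image.new("RGB", (1920, 1080))
large.thumbnail((768, 768))  # in-place; keeps aspect ratio
print(large.size)            # (768, 432): longest side capped at 768

small = Image.new("RGB", (512, 256))
small.thumbnail((768, 768))  # thumbnail never enlarges
print(small.size)            # (512, 256): unchanged
```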
vision_agent/utils/video.py
CHANGED
```diff
@@ -61,6 +61,7 @@ def video_writer(
     stream.height = height - (height % 2)
     stream.width = width - (width % 2)
     stream.pix_fmt = "yuv420p"
+    stream.options = {"crf": "10"}
    for frame in frames:
         # Remove the alpha channel (convert RGBA to RGB)
         frame_rgb = frame[:, :, :3]
```
```diff
@@ -77,7 +78,7 @@ def video_writer(
 
 
 def frames_to_bytes(
-    frames: List[np.ndarray], fps: float =
+    frames: List[np.ndarray], fps: float = 1.0, file_ext: str = ".mp4"
 ) -> bytes:
     r"""Convert a list of frames to a video file encoded into a byte string.
 
```
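The `crf` option (constant rate factor) sets x264's quality/size trade-off; 10 is near-lossless versus the encoder default of 23, which keeps overlaid boxes and text crisp after encoding. A hedged sketch of the PyAV pattern these two functions wrap, not the library's exact code:

```python
import io
import av
import numpy as np

frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(10)]

buf = io.BytesIO()
container = av.open(buf, mode="w", format="mp4")
stream = container.add_stream("h264", rate=1)  # matches the new 1.0 fps default
stream.height, stream.width = 480, 640
stream.pix_fmt = "yuv420p"
stream.options = {"crf": "10"}  # lower CRF = higher quality, larger output

for arr in frames:
    frame = av.VideoFrame.from_ndarray(arr, format="rgb24")
    for packet in stream.encode(frame):
        container.mux(packet)
for packet in stream.encode():  # flush buffered packets
    container.mux(packet)
container.close()

video_bytes = buf.getvalue()
```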
{vision_agent-0.2.140.dist-info → vision_agent-0.2.142.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vision-agent
-Version: 0.2.140
+Version: 0.2.142
 Summary: Toolset for Vision Agent
 Author: Landing AI
 Author-email: dev@landing.ai
```
````diff
@@ -74,10 +74,11 @@ To get started, you can install the library using pip:
 pip install vision-agent
 ```
 
-Ensure you have an OpenAI API key and set it in your environment variables (if you are
-using Azure OpenAI please see the Azure setup section):
+Ensure you have an Anthropic key and an OpenAI API key and set in your environment
+variables (if you are using Azure OpenAI please see the Azure setup section):
 
 ```bash
+export ANTHROPIC_API_KEY="your-api-key"
 export OPENAI_API_KEY="your-api-key"
 ```
 
````
````diff
@@ -112,6 +113,9 @@ You can find more details about the streamlit app [here](examples/chat/).
 >>> resp = agent(resp)
 ```
 
+`VisionAgent` currently utilizes Claude-3.5 as it's default LMM and uses OpenAI for
+embeddings for tool searching.
+
 ### Vision Agent Coder
 #### Basic Usage
 You can interact with the agent as you would with any LLM or LMM model:
````
````diff
@@ -173,7 +177,8 @@ of the input is a list of dictionaries with the keys `role`, `content`, and `med
     "code": "from vision_agent.tools import ..."
     "test": "calculate_filled_percentage('jar.jpg')",
     "test_result": "...",
-    "
+    "plans": {"plan1": {"thoughts": "..."}, ...},
+    "plan_thoughts": "...",
     "working_memory": ...,
 }
 ```
````
````diff
@@ -210,20 +215,25 @@ result = agent.chat_with_workflow(conv)
 ### Tools
 There are a variety of tools for the model or the user to use. Some are executed locally
 while others are hosted for you. You can easily access them yourself, for example if
-you want to run `
+you want to run `owl_v2_image` and visualize the output you can run:
 
 ```python
 import vision_agent.tools as T
 import matplotlib.pyplot as plt
 
 image = T.load_image("dogs.jpg")
-dets = T.
+dets = T.owl_v2_image("dogs", image)
 viz = T.overlay_bounding_boxes(image, dets)
 plt.imshow(viz)
 plt.show()
 ```
 
-You can
+You can find all available tools in `vision_agent/tools/tools.py`, however,
+`VisionAgentCoder` only utilizes a subset of tools that have been tested and provide
+the best performance. Those can be found in the same file under the `TOOLS` variable.
+
+If you can't find the tool you are looking for you can also add custom tools to the
+agent:
 
 ```python
 import vision_agent as va
````
````diff
@@ -258,9 +268,48 @@ Can't find the tool you need and want add it to `VisionAgent`? Check out our
 we add the source code for all the tools used in `VisionAgent`.
 
 ## Additional Backends
+### Anthropic
+`AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an
+Anthropic API key and set it in your environment variables:
+
+```bash
+export ANTHROPIC_API_KEY="your-api-key"
+```
+
+Because Anthropic does not support embedding models, the default embedding model used
+is the OpenAI model so you will also need to set your OpenAI API key:
+
+```bash
+export OPEN_AI_API_KEY="your-api-key"
+```
+
+Usage is the same as `VisionAgentCoder`:
+
+```python
+>>> import vision_agent as va
+>>> agent = va.agent.AnthropicVisionAgentCoder()
+>>> agent("Count the apples in the image", media="apples.jpg")
+```
+
+### OpenAI
+`OpenAIVisionAgentCoder` uses OpenAI. To get started you just need to get an OpenAI API
+key and set it in your environment variables:
+
+```bash
+export OPEN_AI_API_KEY="your-api-key"
+```
+
+Usage is the same as `VisionAgentCoder`:
+
+```python
+>>> import vision_agent as va
+>>> agent = va.agent.OpenAIVisionAgentCoder()
+>>> agent("Count the apples in the image", media="apples.jpg")
+```
+
+
 ### Ollama
-
-a few models:
+`OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models:
 
 ```bash
 ollama pull llama3.1
````
````diff
@@ -281,9 +330,8 @@ tools. You can use it just like you would use `VisionAgentCoder`:
 > WARNING: VisionAgent doesn't work well unless the underlying LMM is sufficiently powerful. Do not expect good results or even working code with smaller models like Llama 3.1 8B.
 
 ### Azure OpenAI
-
-
-`VisionAgentCoder`:
+`AzureVisionAgentCoder` uses Azure OpenAI models. To get started follow the Azure Setup
+section below. You can use it just like you would use `VisionAgentCoder`:
 
 ```python
 >>> import vision_agent as va
````
vision_agent-0.2.142.dist-info/RECORD
ADDED
```diff
@@ -0,0 +1,33 @@
+vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
+vision_agent/agent/__init__.py,sha256=NF2LABqHixLvbsOIO-fe-VKZ7awvShLtcT0oQT4eWtI,235
+vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
+vision_agent/agent/agent_utils.py,sha256=PEUHqvnHmFL4np_TeFmKMwr5s_dWfdfJz6TF_ogd1dU,2353
+vision_agent/agent/vision_agent.py,sha256=k1bUmvoz0KjVEu62PYA9djnq3pqzv2S1UsW6gLnTd7w,17023
+vision_agent/agent/vision_agent_coder.py,sha256=4bbebV1sKE10vsxcZR-R8P54X2HjLeU9lDt7ylIZAT4,38429
+vision_agent/agent/vision_agent_coder_prompts.py,sha256=YWK4C--YRS1Kuab11Gn-AXBzar1j_GNnTnxi_nnaPRY,14901
+vision_agent/agent/vision_agent_prompts.py,sha256=e_ASPeRFU1yZsQhCkK_bIBG-eyIWyWXmN64lFk-r7e0,10897
+vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
+vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
+vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
+vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
+vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
+vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
+vision_agent/tools/__init__.py,sha256=zUv3aVPN1MXfyQiQi5To4rkQGtG7mxLQ1NjLI3pxM80,2412
+vision_agent/tools/meta_tools.py,sha256=XO5Ahe5ZauomynxgDcBuzmm0ocXwTnmZ0wjfgvOzDWc,23426
+vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
+vision_agent/tools/tool_utils.py,sha256=5ukuDMxbEH4iKetYR9I7twzsA8ECyP4tVwYXQq54mxI,8020
+vision_agent/tools/tools.py,sha256=dD_8AmAQb0oKVZHg2w2kSKlvWrG9yaKRbaHTz_kHgjA,73648
+vision_agent/tools/tools_types.py,sha256=JUOZWGW2q-dlJ85CHr9gvo9KQk_rXyjJhi-iwPNn4eM,2397
+vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
+vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
+vision_agent/utils/execute.py,sha256=Lb78YX34v2Ydr-Md25a_gylsdRVXBFbE-_dc_z6oHvg,27968
+vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwdn6sk,11303
+vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
+vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
+vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
+vision_agent-0.2.142.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.142.dist-info/METADATA,sha256=yP7ShheLQ_a50CME1rbSUifRlc4ylqmM6PeIKflW9Ig,13758
+vision_agent-0.2.142.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.142.dist-info/RECORD,,
```
vision_agent-0.2.140.dist-info/RECORD
REMOVED
```diff
@@ -1,33 +0,0 @@
-vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
-vision_agent/agent/__init__.py,sha256=TddDT4e3JVc68Dt0zSk0B4OBORx_R2WhAGK71uqEe2w,204
-vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
-vision_agent/agent/agent_utils.py,sha256=qOYQn-wJsa4j4YjFOBQ41xyklCg8Y94CIIGw9ZXmgIU,2053
-vision_agent/agent/vision_agent.py,sha256=Ed10_rWzHu-hejb5jF9lAF7xbmQ_qAGpCxDvByZw6M8,14100
-vision_agent/agent/vision_agent_coder.py,sha256=OI95goKTqVaEEPYwkn6bVsHsHZeifoBC8rjG9nD0Znc,36909
-vision_agent/agent/vision_agent_coder_prompts.py,sha256=a7P19QscKNiaweke0zHPCfi5GQImpG-ZGKv_kXz0seg,13452
-vision_agent/agent/vision_agent_prompts.py,sha256=-fXiIIb48duXVljWYcJ0Y4ZzfNnRFi3C5cKdF4SdDo8,10075
-vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
-vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
-vision_agent/fonts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1rtHiNC_6QosTE,1594400
-vision_agent/lmm/__init__.py,sha256=YuUZRsMHdn8cMOv6iBU8yUqlIOLrbZQqZl9KPnofsHQ,103
-vision_agent/lmm/lmm.py,sha256=soWmEjtleQUSH2G3tYZWxOmteIqkgMVcmuZfx4mxszU,16838
-vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=nufZNzbcLTuXwxFmvZNj99qE8EO2qtEPT8wFsuI9vyE,2397
-vision_agent/tools/meta_tools.py,sha256=orYbEPWOENXwmKSmbg52_2eMAoYT9ZbV5GjudUd-f0o,22563
-vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=5ukuDMxbEH4iKetYR9I7twzsA8ECyP4tVwYXQq54mxI,8020
-vision_agent/tools/tools.py,sha256=WKeB99ED0o_ISS_vZc-ch_1Dc8_Fl2fhnGlfVNwNouc,70024
-vision_agent/tools/tools_types.py,sha256=rLpCUODPY0yI65SLOTJOxfHFfqWM3WjOq-AYX25Chjk,2356
-vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
-vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
-vision_agent/utils/execute.py,sha256=Lb78YX34v2Ydr-Md25a_gylsdRVXBFbE-_dc_z6oHvg,27968
-vision_agent/utils/image_utils.py,sha256=zTTOJFOieMzwIquTFnW7T6ssx9o6XfoZ0Unqyk7GJrg,10746
-vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
-vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
-vision_agent/utils/video.py,sha256=hOjfEOZNcddYdoa0CoviXA4Vo9kwURKuojIJgLLJdp0,4745
-vision_agent-0.2.140.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-0.2.140.dist-info/METADATA,sha256=B33v0XI-5ZlEBBu-I8DT7JrbU04PophTYEmRQMVEkBQ,12291
-vision_agent-0.2.140.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
-vision_agent-0.2.140.dist-info/RECORD,,
```
{vision_agent-0.2.140.dist-info → vision_agent-0.2.142.dist-info}/LICENSE
File without changes
{vision_agent-0.2.140.dist-info → vision_agent-0.2.142.dist-info}/WHEEL
File without changes