vision-agent 0.2.151__py3-none-any.whl → 0.2.153__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/agent/vision_agent_coder_prompts.py +5 -5
- vision_agent/agent/vision_agent_prompts.py +7 -7
- vision_agent/tools/__init__.py +1 -1
- vision_agent/tools/meta_tools.py +6 -2
- vision_agent/tools/tool_utils.py +9 -4
- vision_agent/tools/tools.py +154 -53
- vision_agent/tools/tools_types.py +8 -13
- {vision_agent-0.2.151.dist-info → vision_agent-0.2.153.dist-info}/METADATA +1 -1
- {vision_agent-0.2.151.dist-info → vision_agent-0.2.153.dist-info}/RECORD +11 -11
- {vision_agent-0.2.151.dist-info → vision_agent-0.2.153.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.151.dist-info → vision_agent-0.2.153.dist-info}/WHEEL +0 -0
vision_agent/agent/vision_agent_coder_prompts.py
CHANGED
@@ -101,7 +101,7 @@ plan1:
     - Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
 plan2:
     - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
-    - Use the '
+    - Use the 'florence2_phrase_grounding_image' tool with the prompt 'person' to detect where the people are in the video.
 plan3:
     - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
     - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
@@ -109,7 +109,7 @@ plan3:

 ```python
 import numpy as np
-from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video,
+from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding_image, florence2_sam2_video_tracking

 # sample at 1 FPS and use the first 10 frames to reduce processing time
 frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -143,7 +143,7 @@ owl_v2_out = owl_v2_video("person", frames)
 owl_v2_counts = get_counts(owl_v2_out)

 # plan2
-florence2_out = [
+florence2_out = [florence2_phrase_grounding_image("person", f) for f in frames]
 florence2_counts = get_counts(florence2_out)

 # plan3
@@ -153,13 +153,13 @@ f2s2_counts = get_counts(f2s2_tracking_out)

 final_out = {{
     "owl_v2_video": owl_v2_out,
-    "
+    "florence2_phrase_grounding_image": florence2_out,
     "florence2_sam2_video_tracking": f2s2_out,
 }}

 counts = {{
     "owl_v2_video": owl_v2_counts,
-    "
+    "florence2_phrase_grounding_image": florence2_counts,
     "florence2_sam2_video_tracking": f2s2_counts,
 }}

vision_agent/agent/vision_agent_prompts.py
CHANGED
@@ -131,10 +131,10 @@ AGENT: {"thoughts": "I will use the generate_vision_code to count the workers wi

 OBSERVATION:
 [Artifact code.py]
-0|from vision_agent.tools import load_image,
+0|from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
 1|def count_workers_with_helmets(image_path: str, output_path: str):
 2| image = load_image(image_path)
-3| detections =
+3| detections = florence2_phrase_grounding_image("worker, helmet", image)
 4| workers = [d for d in detections if d['label'] == 'worker']
 5| helmets = [d for d in detections if d['label'] == 'helmet']
 6| count = 0
@@ -166,18 +166,18 @@ AGENT: {"thoughts": "Because the user has supplied me with labels I can call obj
 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]

-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding_image call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}

 OBSERVATION:
 [Artifact code.py edits]
 ---
 +++
 @@ -1,7 +1,7 @@
-from vision_agent.tools import load_image,
+from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
 def count_workers_with_helmets(image_path: str, output_path: str):
 image = load_image(image_path)
-- detections =
-+ detections =
+- detections = florence2_phrase_grounding_image("worker, helmet", image)
++ detections = florence2_phrase_grounding_image("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
 workers = [d for d in detections if d['label'] == 'worker']
 helmets = [d for d in detections if d['label'] == 'helmet']
 count = 0
@@ -189,5 +189,5 @@ OBSERVATION:
 ----- stdout -----
 3

-AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned
+AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding_image model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """
vision_agent/tools/__init__.py
CHANGED
vision_agent/tools/meta_tools.py
CHANGED
@@ -668,8 +668,12 @@ def use_object_detection_fine_tuning(

     patterns_with_fine_tune_id = [
         (
-            r'
-            lambda match: f'
+            r'florence2_phrase_grounding_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+            lambda match: f'florence2_phrase_grounding_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
+        ),
+        (
+            r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+            lambda match: f'florence2_phrase_grounding_video("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
         ),
         (
             r'owl_v2_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
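The two new (pattern, replacement) pairs above let `use_object_detection_fine_tuning` rewrite existing `florence2_phrase_grounding_image` and `florence2_phrase_grounding_video` calls in an artifact so they carry the fine-tune id. Below is a minimal, standalone sketch of how one such pair behaves under `re.sub`; the sample input line and fine-tune id are illustrative only.

```python
import re

# Placeholder fine-tune id, matching the example id used in the prompts above.
fine_tune_id = "23b3b022-5ebf-4798-9373-20ef36429abf"

# One (pattern, replacement) pair from the diff, unchanged.
pattern = r'florence2_phrase_grounding_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)'
replacement = lambda match: (
    f'florence2_phrase_grounding_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")'
)

# A hypothetical line of artifact code that the meta tool would rewrite in place.
code = 'detections = florence2_phrase_grounding_image("worker, helmet", image)'
print(re.sub(pattern, replacement, code))
# detections = florence2_phrase_grounding_image("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
```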
vision_agent/tools/tool_utils.py
CHANGED
@@ -1,6 +1,6 @@
+import os
 import inspect
 import logging
-import os
 from base64 import b64encode
 from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple

@@ -37,8 +37,9 @@ def send_inference_request(
     files: Optional[List[Tuple[Any, ...]]] = None,
     v2: bool = False,
     metadata_payload: Optional[Dict[str, Any]] = None,
+    is_form: bool = False,
 ) -> Any:
-    # TODO: runtime_tag and function_name should be metadata_payload and
+    # TODO: runtime_tag and function_name should be metadata_payload and not included
     # in the service payload
     if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
         payload["runtime_tag"] = runtime_tag
@@ -64,7 +65,7 @@ def send_inference_request(
     elif metadata_payload is not None and "function_name" in metadata_payload:
         function_name = metadata_payload["function_name"]

-    response = _call_post(url, payload, session, files, function_name)
+    response = _call_post(url, payload, session, files, function_name, is_form)

     # TODO: consider making the response schema the same between below two sources
     return response if "TOOL_ENDPOINT_AUTH" in os.environ else response["data"]
@@ -75,6 +76,7 @@ def send_task_inference_request(
     task_name: str,
     files: Optional[List[Tuple[Any, ...]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
+    is_form: bool = False,
 ) -> Any:
     url = f"{_LND_API_URL_v2}/{task_name}"
     headers = {"apikey": _LND_API_KEY}
@@ -87,7 +89,7 @@ def send_task_inference_request(
     function_name = "unknown"
     if metadata is not None and "function_name" in metadata:
         function_name = metadata["function_name"]
-    response = _call_post(url, payload, session, files, function_name)
+    response = _call_post(url, payload, session, files, function_name, is_form)
     return response["data"]


@@ -203,6 +205,7 @@ def _call_post(
     session: Session,
     files: Optional[List[Tuple[Any, ...]]] = None,
     function_name: str = "unknown",
+    is_form: bool = False,
 ) -> Any:
     files_in_b64 = None
     if files:
@@ -210,6 +213,8 @@ def _call_post(
     try:
         if files is not None:
             response = session.post(url, data=payload, files=files)
+        elif is_form:
+            response = session.post(url, data=payload)
         else:
             response = session.post(url, json=payload)

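The `is_form` flag added above is threaded from `send_inference_request` and `send_task_inference_request` down to `_call_post`, where it switches the POST body from JSON to form-encoded data when no files are attached. A standalone sketch of that branch, assuming a plain `requests.Session` and a placeholder endpoint; the helper name `post_payload` is not part of the package.

```python
import requests


def post_payload(url: str, payload: dict, is_form: bool = False) -> requests.Response:
    """Sketch of the branching added in _call_post (no file-upload case)."""
    session = requests.Session()
    if is_form:
        # Form-encoded body, mirroring `session.post(url, data=payload)` in the diff.
        return session.post(url, data=payload)
    # JSON body, mirroring `session.post(url, json=payload)` in the diff.
    return session.post(url, json=payload)


# Hypothetical usage against a placeholder endpoint:
# post_payload("https://example.com/v2/florence2-ft", {"prompt": "person"}, is_form=True)
```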
vision_agent/tools/tools.py
CHANGED
@@ -1,3 +1,4 @@
+import base64
 import io
 import json
 import logging
@@ -28,7 +29,6 @@ from vision_agent.tools.tool_utils import (
     send_task_inference_request,
 )
 from vision_agent.tools.tools_types import (
-    FineTuning,
     Florence2FtRequest,
     JobStatus,
     ODResponseData,
@@ -194,20 +194,26 @@ def owl_v2_image(
         data_obj = Florence2FtRequest(
             image=image_b64,
             task=PromptTask.PHRASE_GROUNDING,
-            tool="florencev2_fine_tuning",
             prompt=prompt,
-
+            job_id=UUID(fine_tune_id),
         )
-        data = data_obj.model_dump(by_alias=True)
-        detections = send_inference_request(
-
+        data = data_obj.model_dump(by_alias=True, exclude_none=True)
+        detections = send_inference_request(
+            data,
+            "florence2-ft",
+            v2=True,
+            is_form=True,
+            metadata_payload={"function_name": "owl_v2_image"},
+        )
+        # get the first frame
+        detection = detections[0]
         bboxes_formatted = [
             ODResponseData(
-                label=
-                bbox=normalize_bbox(
+                label=detection["labels"][i],
+                bbox=normalize_bbox(detection["bboxes"][i], image_size),
                 score=1.0,
             )
-            for i in range(len(
+            for i in range(len(detection["bboxes"]))
         ]
         return [bbox.model_dump() for bbox in bboxes_formatted]

@@ -419,25 +425,30 @@ def florence2_sam2_image(
         req_data_obj = Florence2FtRequest(
             image=image_b64,
             task=PromptTask.PHRASE_GROUNDING,
-            tool="florencev2_fine_tuning",
             prompt=prompt,
-
-
-
-
+            postprocessing="sam2",
+            job_id=UUID(fine_tune_id),
+        )
+        req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True)
+        detections_ft = send_inference_request(
+            req_data,
+            "florence2-ft",
+            v2=True,
+            is_form=True,
+            metadata_payload={"function_name": "florence2_sam2_image"},
         )
-
-
-        detections_ft = detections_ft["<CAPTION_TO_PHRASE_GROUNDING>"]
+        # get the first frame
+        detection = detections_ft[0]
         return_data = []
-
-        for i in range(len(detections_ft["bboxes"])):
+        for i in range(len(detection["bboxes"])):
             return_data.append(
                 {
                     "score": 1.0,
-                    "label":
-                    "bbox":
-
+                    "label": detection["labels"][i],
+                    "bbox": normalize_bbox(
+                        detection["bboxes"][i], detection["masks"][i]["size"]
+                    ),
+                    "mask": rle_decode_array(detection["masks"][i]),
                 }
             )
         return return_data
@@ -451,6 +462,7 @@ def florence2_sam2_image(
     detections: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
     )
+
     return_data = []
     for _, data_i in detections["0"].items():
         mask = rle_decode_array(data_i["mask"])
@@ -688,22 +700,18 @@ def countgd_counting(
         {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
     ]
     """
-
-    files = [("image", buffer_bytes)]
+    image_b64 = convert_to_b64(image)
     prompt = prompt.replace(", ", " .")
-    payload = {"
+    payload = {"prompt": prompt, "image": image_b64}
     metadata = {"function_name": "countgd_counting"}
-    resp_data = send_task_inference_request(
-        payload, "text-to-object-detection", files=files, metadata=metadata
-    )
-    bboxes_per_frame = resp_data[0]
+    resp_data = send_task_inference_request(payload, "countgd", metadata=metadata)
     bboxes_formatted = [
         ODResponseData(
             label=bbox["label"],
-            bbox=list(map(lambda x: round(x, 2), bbox["
+            bbox=list(map(lambda x: round(x, 2), bbox["bbox"])),
             score=round(bbox["score"], 2),
         )
-        for bbox in
+        for bbox in resp_data
     ]
     filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
    return [bbox.model_dump() for bbox in filtered_bboxes]
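`countgd_counting` now sends the image as a base64 string inside the JSON payload to the `countgd` task instead of uploading raw bytes as a multipart file. A rough sketch of building such a payload; `to_b64` is a hypothetical stand-in for the package's `convert_to_b64` helper and the image is a placeholder.

```python
import base64
import io

import numpy as np
from PIL import Image


def to_b64(image: np.ndarray) -> str:
    """Hypothetical stand-in for the package's convert_to_b64 helper."""
    buf = io.BytesIO()
    Image.fromarray(image).save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode("utf-8")


image = np.zeros((32, 32, 3), dtype=np.uint8)  # placeholder image
prompt = "flower, person".replace(", ", " .")  # same separator rewrite as the tool
payload = {"prompt": prompt, "image": to_b64(image)}
# payload is what countgd_counting now hands to
# send_task_inference_request(payload, "countgd", metadata={"function_name": "countgd_counting"})
```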
@@ -887,7 +895,10 @@ def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[b
         "function_name": "ixc25_temporal_localization",
     }
     data: List[int] = send_inference_request(
-        payload,
+        payload,
+        "video-temporal-localization?model=internlm-xcomposer",
+        files=files,
+        v2=True,
     )
     chunk_size = round(len(frames) / len(data))
     data_explode = [[elt] * chunk_size for elt in data]
@@ -1132,13 +1143,13 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task] # type: ignore


-def
+def florence2_phrase_grounding_image(
     prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
 ) -> List[Dict[str, Any]]:
-    """'
-    objects given a text prompt which can be object names or caption.
-    can optionally separate the object names in the text with commas. It returns
-    of bounding boxes with normalized coordinates, label names and associated
+    """'florence2_phrase_grounding_image' will run florence2 on a image. It can
+    detect multiple objects given a text prompt which can be object names or caption.
+    You can optionally separate the object names in the text with commas. It returns
+    a list of bounding boxes with normalized coordinates, label names and associated
     probability scores of 1.0.

     Parameters:
@@ -1156,7 +1167,7 @@ def florence2_phrase_grounding(

     Example
     -------
-    >>>
+    >>> florence2_phrase_grounding_image('person looking at a coyote', image)
     [
         {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
         {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
@@ -1176,39 +1187,128 @@ def florence2_phrase_grounding(
         data_obj = Florence2FtRequest(
             image=image_b64,
             task=PromptTask.PHRASE_GROUNDING,
-            tool="florencev2_fine_tuning",
             prompt=prompt,
-
+            job_id=UUID(fine_tune_id),
         )
-        data = data_obj.model_dump(by_alias=True)
+        data = data_obj.model_dump(by_alias=True, exclude_none=True)
         detections = send_inference_request(
             data,
-            "
-            v2=
-
+            "florence2-ft",
+            v2=True,
+            is_form=True,
+            metadata_payload={"function_name": "florence2_phrase_grounding_image"},
         )
+        # get the first frame
+        detection = detections[0]
     else:
         data = {
             "image": image_b64,
             "task": "<CAPTION_TO_PHRASE_GROUNDING>",
             "prompt": prompt,
-            "function_name": "
+            "function_name": "florence2_phrase_grounding_image",
         }
         detections = send_inference_request(data, "florence2", v2=True)
+        detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]

-    detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
     return_data = []
-    for i in range(len(
+    for i in range(len(detection["bboxes"])):
         return_data.append(
             ODResponseData(
-                label=
-                bbox=normalize_bbox(
+                label=detection["labels"][i],
+                bbox=normalize_bbox(detection["bboxes"][i], image_size),
                 score=1.0,
             )
         )
     return [bbox.model_dump() for bbox in return_data]


+def florence2_phrase_grounding_video(
+    prompt: str, frames: List[np.ndarray], fine_tune_id: Optional[str] = None
+) -> List[List[Dict[str, Any]]]:
+    """'florence2_phrase_grounding_video' will run florence2 on each frame of a video.
+    It can detect multiple objects given a text prompt which can be object names or
+    caption. You can optionally separate the object names in the text with commas.
+    It returns a list of lists where each inner list contains bounding boxes with
+    normalized coordinates, label names and associated probability scores of 1.0.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to detect objects.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the score,
+        label, and bounding box of the detected objects with normalized coordinates
+        between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
+        of the top-left and xmax and ymax are the coordinates of the bottom-right of
+        the bounding box. The scores are always 1.0 and cannot be thresholded.
+
+    Example
+    -------
+    >>> florence2_phrase_grounding_video('person looking at a coyote', frames)
+    [
+        [
+            {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
+        ],
+        ...
+    ]
+    """
+    if len(frames) == 0:
+        raise ValueError("No frames provided")
+
+    image_size = frames[0].shape[:2]
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        data_obj = Florence2FtRequest(
+            task=PromptTask.PHRASE_GROUNDING,
+            prompt=prompt,
+            job_id=UUID(fine_tune_id),
+        )
+
+        data = data_obj.model_dump(by_alias=True, exclude_none=True, mode="json")
+        detections = send_inference_request(
+            data,
+            "florence2-ft",
+            v2=True,
+            files=files,
+            metadata_payload={"function_name": "florence2_phrase_grounding_video"},
+        )
+    else:
+        data = {
+            "prompt": prompt,
+            "task": "<CAPTION_TO_PHRASE_GROUNDING>",
+            "function_name": "florence2_phrase_grounding_video",
+            "video": base64.b64encode(buffer_bytes).decode("utf-8"),
+        }
+        detections = send_inference_request(data, "florence2", v2=True)
+        detections = [d["<CAPTION_TO_PHRASE_GROUNDING>"] for d in detections]
+
+    bboxes_formatted = []
+    for frame_data in detections:
+        bboxes_formatted_per_frame = []
+        for idx in range(len(frame_data["bboxes"])):
+            bboxes_formatted_per_frame.append(
+                ODResponseData(
+                    label=frame_data["labels"][idx],
+                    bbox=normalize_bbox(frame_data["bboxes"][idx], image_size),
+                    score=1.0,
+                )
+            )
+        bboxes_formatted.append(bboxes_formatted_per_frame)
+    return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
+
+
 def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
     """'florence2_ocr' is a tool that can detect text and text regions in an image.
     Each text region contains one line of text. It returns a list of detected text,
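A usage sketch for the new `florence2_phrase_grounding_video`, assuming it is exported from `vision_agent.tools` (the updated `__init__.py` is listed as changed but its hunk is not shown) and that the environment is configured for the package's inference endpoints; the video path and fine-tune id are placeholders.

```python
from vision_agent.tools import extract_frames_and_timestamps, florence2_phrase_grounding_video

# Sample frames from a local video; "video.mp4" is a placeholder path.
frames_and_ts = extract_frames_and_timestamps("video.mp4", 1)
frames = [f["frame"] for f in frames_and_ts]

# Returns one inner list of {'score', 'label', 'bbox'} dicts per frame.
detections_per_frame = florence2_phrase_grounding_video("person", frames)

# Optionally pass a fine-tuned model id (placeholder shown) once the job has SUCCEEDED:
# detections_per_frame = florence2_phrase_grounding_video(
#     "person", frames, "23b3b022-5ebf-4798-9373-20ef36429abf"
# )

counts = [len(dets) for dets in detections_per_frame]
print(counts)
```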
@@ -1220,7 +1320,7 @@ def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:

     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
-        with
+        with normalized coordinates, and confidence score.

     Example
     -------
@@ -1603,7 +1703,7 @@ def extract_frames_and_timestamps(
     """

     def reformat(
-        frames_and_timestamps: List[Tuple[np.ndarray, float]]
+        frames_and_timestamps: List[Tuple[np.ndarray, float]],
     ) -> List[Dict[str, Union[np.ndarray, float]]]:
         return [
             {"frame": frame, "timestamp": timestamp}
@@ -2017,7 +2117,7 @@ def overlay_counting_results(
         fontsize,
     )

-    for i, elt in enumerate(instances):
+    for i, elt in enumerate(instances, 1):
         label = f"{i}"
         box = elt["bbox"]

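The `enumerate(instances, 1)` change only shifts the overlaid labels so numbering starts at 1 rather than 0, e.g.:

```python
# Two placeholder instances with normalized bounding boxes.
instances = [{"bbox": [0.1, 0.1, 0.2, 0.2]}, {"bbox": [0.3, 0.3, 0.4, 0.4]}]

# Before: labels "0", "1", ...; after: labels "1", "2", ...
labels = [f"{i}" for i, _ in enumerate(instances, 1)]
print(labels)  # ['1', '2']
```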
@@ -2064,7 +2164,8 @@ FUNCTION_TOOLS = [
     florence2_ocr,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
-
+    florence2_phrase_grounding_image,
+    florence2_phrase_grounding_video,
     ixc25_image_vqa,
     ixc25_video_vqa,
     detr_segmentation,
vision_agent/tools/tools_types.py
CHANGED
@@ -1,6 +1,6 @@
 from enum import Enum
-from typing import List, Optional, Tuple, Union
 from uuid import UUID
+from typing import List, Optional, Tuple, Union

 from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer

@@ -24,27 +24,22 @@ class PromptTask(str, Enum):
     PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"


-class
+class Florence2FtRequest(BaseModel):
     model_config = ConfigDict(populate_by_name=True)

-
+    image: Optional[str] = None
+    video: Optional[bytes] = None
+    task: PromptTask
+    prompt: Optional[str] = ""
+    chunk_length_frames: Optional[int] = None
     postprocessing: Optional[str] = None
+    job_id: Optional[UUID] = Field(None, alias="jobId")

     @field_serializer("job_id")
     def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
         return str(job_id)


-class Florence2FtRequest(BaseModel):
-    model_config = ConfigDict(populate_by_name=True)
-
-    image: str
-    task: PromptTask
-    tool: str
-    prompt: Optional[str] = ""
-    fine_tuning: Optional[FineTuning] = Field(None, alias="fineTuning")
-
-
 class JobStatus(str, Enum):
     """The status of a fine-tuning job.

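With `FineTuning` removed, a single `Florence2FtRequest` now carries the optional `image`/`video`/`job_id` fields and is dumped with `by_alias=True, exclude_none=True`, so unset fields are dropped and `job_id` is emitted as `jobId`. A minimal pydantic sketch of that serialization, with the model trimmed to the fields shown in this diff and a placeholder job id:

```python
from enum import Enum
from typing import Optional
from uuid import UUID

from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer


class PromptTask(str, Enum):
    PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"


class Florence2FtRequest(BaseModel):
    model_config = ConfigDict(populate_by_name=True)

    image: Optional[str] = None
    video: Optional[bytes] = None
    task: PromptTask
    prompt: Optional[str] = ""
    chunk_length_frames: Optional[int] = None
    postprocessing: Optional[str] = None
    job_id: Optional[UUID] = Field(None, alias="jobId")

    @field_serializer("job_id")
    def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
        return str(job_id)


req = Florence2FtRequest(
    task=PromptTask.PHRASE_GROUNDING,
    prompt="person",
    job_id=UUID("23b3b022-5ebf-4798-9373-20ef36429abf"),
)
# Unset optional fields are dropped and job_id is serialized as the "jobId" string.
print(req.model_dump(by_alias=True, exclude_none=True))
# {'task': <PromptTask.PHRASE_GROUNDING: ...>, 'prompt': 'person', 'jobId': '23b3b022-...'}
```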
{vision_agent-0.2.151.dist-info → vision_agent-0.2.153.dist-info}/RECORD
CHANGED
@@ -4,8 +4,8 @@ vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,5
 vision_agent/agent/agent_utils.py,sha256=PEUHqvnHmFL4np_TeFmKMwr5s_dWfdfJz6TF_ogd1dU,2353
 vision_agent/agent/vision_agent.py,sha256=m7apb1smJbRyj0VAellrN_mDrSPAee4DVm6FWRa-e78,18459
 vision_agent/agent/vision_agent_coder.py,sha256=9BT4gaXsqH5pvxo8WGwJN9MTvP1V3TgoJHBpjtlKP9I,38417
-vision_agent/agent/vision_agent_coder_prompts.py,sha256=
-vision_agent/agent/vision_agent_prompts.py,sha256=
+vision_agent/agent/vision_agent_coder_prompts.py,sha256=Ea_v_qLBJMVwQVLLIdNq15MgV2-6qqhcThHAHFwzv-o,18940
+vision_agent/agent/vision_agent_prompts.py,sha256=eOqluRb1R_SJFsdWXd9HJuiJnJccEnDDUkfPXlHOjyw,11293
 vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 vision_agent/clients/http.py,sha256=k883i6M_4nl7zwwHSI-yP5sAgQZIDPM1nrKD6YFJ3Xs,2009
 vision_agent/clients/landing_public_api.py,sha256=lU2ev6E8NICmR8DMUljuGcVFy5VNJQ4WQkWC8WnnJEc,1503
@@ -14,12 +14,12 @@ vision_agent/fonts/default_font_ch_en.ttf,sha256=1YM0Z3XqLDjSNbF7ihQFSAIUdjF9m1r
 vision_agent/lmm/__init__.py,sha256=jyY1sJb_tYKg5-Wzs3p1lvwFkc-aUNZfMcLy3TOC4Zg,100
 vision_agent/lmm/lmm.py,sha256=B5ClgwvbybVCWkf9opDMLjTtJZemUU4KUkQoRxGh43I,16787
 vision_agent/lmm/types.py,sha256=ZEXR_ptBL0ZwDMTDYkgxUCmSZFmBYPQd2jreNzr_8UY,221
-vision_agent/tools/__init__.py,sha256=
-vision_agent/tools/meta_tools.py,sha256=
+vision_agent/tools/__init__.py,sha256=cg4Axb9L3Z7WkdyEv5IyqDsmZKIrxmS4CmV3DEXURnU,2418
+vision_agent/tools/meta_tools.py,sha256=yrplxiDu-L9_Dw_L2ESehJabckAq59Q-xfMpIbYB0Ak,25179
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=
-vision_agent/tools/tools.py,sha256=
-vision_agent/tools/tools_types.py,sha256=
+vision_agent/tools/tool_utils.py,sha256=FTSboDmYPQLmIhsc9FeydcrdRZU6huBZKnyBmm0VsHE,8196
+vision_agent/tools/tools.py,sha256=Of7NTZTc1bim_fdAoDxx47WzttGI8VlMKKcId0sMwfk,78406
+vision_agent/tools/tools_types.py,sha256=Qijj5NmY6_Aq1fYwuQYf3J1TAQYTz_1mWkX3Dq4d4e0,2339
 vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
 vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
 vision_agent/utils/execute.py,sha256=FqSOr5gtBeKB1g2hbV6-bhox6qItDQNn2o9efq1w6f4,28017
@@ -27,7 +27,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
 vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
 vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=xbMEoRk13l4fHeQlbvMQhLCn8RNndYmsDhUf01TUeR8,4781
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
-vision_agent-0.2.
+vision_agent-0.2.153.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.153.dist-info/METADATA,sha256=zehWh4l1EfZeTKxSEgKXtQMb0EE5pvWP1UG0d2lyS44,13758
+vision_agent-0.2.153.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.153.dist-info/RECORD,,
{vision_agent-0.2.151.dist-info → vision_agent-0.2.153.dist-info}/LICENSE
File without changes
{vision_agent-0.2.151.dist-info → vision_agent-0.2.153.dist-info}/WHEEL
File without changes