vision-agent 0.2.180__py3-none-any.whl → 0.2.182__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
vision_agent/agent/vision_agent.py

@@ -492,29 +492,8 @@ class VisionAgent(Agent):
      code_interpreter.upload_file(artifacts.local_save_path)

      response = run_conversation(self.agent, int_chat)
-     code_action = use_extra_vision_agent_args(
-         response.get("execute_python", None),
-         test_multi_plan,
-         custom_tool_names,
-     )
      if self.verbosity >= 1:
          _LOGGER.info(response)
-     int_chat.append(
-         {
-             "role": "assistant",
-             "content": json.dumps(
-                 new_format_to_old_format(add_step_descriptions(response))
-             ),
-         }
-     )
-     orig_chat.append(
-         {
-             "role": "assistant",
-             "content": json.dumps(
-                 new_format_to_old_format(add_step_descriptions(response))
-             ),
-         }
-     )

      code_action = response.get("execute_python", None)
      # sometimes it gets stuck in a loop, so we force it to exit
@@ -529,7 +508,7 @@ class VisionAgent(Agent):
              "value": "Agent is stuck in conversation loop, exited",
              "traceback_raw": [],
          },
-         "finished": code_action is None,
+         "finished": True,
      }
  )
  else:
@@ -544,6 +523,22 @@ class VisionAgent(Agent):
          }
      )

+     int_chat.append(
+         {
+             "role": "assistant",
+             "content": json.dumps(
+                 new_format_to_old_format(add_step_descriptions(response))
+             ),
+         }
+     )
+     orig_chat.append(
+         {
+             "role": "assistant",
+             "content": json.dumps(
+                 new_format_to_old_format(add_step_descriptions(response))
+             ),
+         }
+     )
      finished = response.get("let_user_respond", False)

      if code_action is not None:
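
Net effect of the three vision_agent.py hunks above: the `use_extra_vision_agent_args` wrapper is gone, `code_action` is read directly off the response, a forced loop exit is now always reported as finished, and the assistant turn is appended to both chat histories only after the loop check. A minimal sketch of the new ordering, assuming the package's own helpers are passed in (the observation schema here is abbreviated; this is not the verbatim implementation):

    import json

    def handle_turn(agent, int_chat, orig_chat, run_conversation, to_old_format, is_stuck):
        # Sketch of the 0.2.182 turn ordering in the VisionAgent conversation loop.
        response = run_conversation(agent, int_chat)
        code_action = response.get("execute_python", None)

        if is_stuck(response):
            # The forced exit is now always marked finished, regardless of code_action.
            int_chat.append({"role": "observation",
                             "content": "stuck in loop, exited",
                             "finished": True})

        # The assistant message is recorded after the loop check, in both histories.
        message = {"role": "assistant", "content": json.dumps(to_old_format(response))}
        int_chat.append(message)
        orig_chat.append(message)

        return code_action, response.get("let_user_respond", False)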
vision_agent/tools/tools.py

@@ -1,4 +1,3 @@
- import base64
  import io
  import json
  import logging
@@ -184,8 +183,16 @@ def owl_v2_image(
      if image_size[0] < 1 or image_size[1] < 1:
          return []

+     buffer_bytes = numpy_to_bytes(image)
+     files = [("image", buffer_bytes)]
+     payload = {
+         "prompts": [s.strip() for s in prompt.split(",")],
+         "confidence": box_threshold,
+         "model": "owlv2",
+     }
+     metadata = {"function_name": "owl_v2_image"}
+
      if fine_tune_id is not None:
-         image_b64 = convert_to_b64(image)
          landing_api = LandingPublicAPI()
          status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
          if status is not JobStatus.SUCCEEDED:
@@ -193,43 +200,22 @@ def owl_v2_image(
              f"Fine-tuned model {fine_tune_id} is not ready yet"
          )

-         data_obj = Florence2FtRequest(
-             image=image_b64,
-             task=PromptTask.PHRASE_GROUNDING,
-             prompt=prompt,
-             job_id=UUID(fine_tune_id),
-         )
-         data = data_obj.model_dump(by_alias=True, exclude_none=True)
-         detections = send_inference_request(
-             data,
-             "florence2-ft",
-             v2=True,
-             is_form=True,
-             metadata_payload={"function_name": "owl_v2_image"},
-         )
-         # get the first frame
-         detection = detections[0]
-         bboxes_formatted = [
-             ODResponseData(
-                 label=detection["labels"][i],
-                 bbox=normalize_bbox(detection["bboxes"][i], image_size),
-                 score=1.0,
-             )
-             for i in range(len(detection["bboxes"]))
-         ]
-         return [bbox.model_dump() for bbox in bboxes_formatted]
+         # we can only execute fine-tuned models with florence2
+         payload = {
+             "prompts": payload["prompts"],
+             "jobId": fine_tune_id,
+             "model": "florence2",
+         }

-     buffer_bytes = numpy_to_bytes(image)
-     files = [("image", buffer_bytes)]
-     payload = {
-         "prompts": [s.strip() for s in prompt.split(",")],
-         "model": "owlv2",
-         "function_name": "owl_v2_image",
-     }
-     resp_data = send_inference_request(
-         payload, "text-to-object-detection", files=files, v2=True
+     detections = send_task_inference_request(
+         payload,
+         "text-to-object-detection",
+         files=files,
+         metadata=metadata,
      )
-     bboxes = resp_data[0]
+
+     # get the first frame
+     bboxes = detections[0]
      bboxes_formatted = [
          ODResponseData(
              label=bbox["label"],
@@ -238,17 +224,17 @@ def owl_v2_image(
          )
          for bbox in bboxes
      ]
-     filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
-     return [bbox.model_dump() for bbox in filtered_bboxes]
+     return [bbox.model_dump() for bbox in bboxes_formatted]


  def owl_v2_video(
      prompt: str,
      frames: List[np.ndarray],
      box_threshold: float = 0.10,
+     fine_tune_id: Optional[str] = None,
  ) -> List[List[Dict[str, Any]]]:
      """'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple
-     objects indepdently per frame given a text prompt such as a category name or
+     objects independently per frame given a text prompt such as a category name or
      referring expression but does not track objects across frames. The categories in
      text prompt are separated by commas. It returns a list of lists where each inner
      list contains the score, label, and bounding box of the detections for that frame.
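
With `send_task_inference_request`, `owl_v2_image` now sends `box_threshold` to the API as a `confidence` field, so the client-side `filter_bboxes_by_threshold` pass is dropped; with a `fine_tune_id` the request is rerouted to the florence2 job instead of the separate florence2-ft endpoint. A hypothetical call (placeholder image and job id):

    import numpy as np
    from vision_agent.tools import owl_v2_image

    image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image

    # Plain OWLv2 detection; thresholding now happens server-side via "confidence".
    detections = owl_v2_image("person, car", image, box_threshold=0.10)

    # Fine-tuned path: the payload swaps to model="florence2" with the job id.
    # detections = owl_v2_image("person", image, fine_tune_id="<your-job-uuid>")

    for det in detections:
        print(det["label"], det["score"], det["bbox"])  # bbox is normalized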
@@ -258,6 +244,8 @@ def owl_v2_video(
          frames (List[np.ndarray]): The list of frames to ground the prompt to.
          box_threshold (float, optional): The threshold for the box detection. Defaults
              to 0.30.
+         fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+             fine-tuned model ID here to use it.

      Returns:
          List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
@@ -285,30 +273,45 @@ def owl_v2_video(
      files = [("video", buffer_bytes)]
      payload = {
          "prompts": [s.strip() for s in prompt.split(",")],
+         "confidence": box_threshold,
          "model": "owlv2",
-         "function_name": "owl_v2_video",
      }
-     data: Dict[str, Any] = send_inference_request(
-         payload, "text-to-object-detection", files=files, v2=True
+     metadata = {"function_name": "owl_v2_video"}
+
+     if fine_tune_id is not None:
+         landing_api = LandingPublicAPI()
+         status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+         if status is not JobStatus.SUCCEEDED:
+             raise FineTuneModelIsNotReady(
+                 f"Fine-tuned model {fine_tune_id} is not ready yet"
+             )
+
+         # we can only execute fine-tuned models with florence2
+         payload = {
+             "prompts": payload["prompts"],
+             "jobId": fine_tune_id,
+             "model": "florence2",
+         }
+
+     detections = send_task_inference_request(
+         payload,
+         "text-to-object-detection",
+         files=files,
+         metadata=metadata,
      )
-     bboxes_formatted = []
-     if data is not None:
-         for frame_data in data:
-             bboxes_formated_frame = []
-             for elt in frame_data:
-                 bboxes_formated_frame.append(
-                     ODResponseData(
-                         label=elt["label"],  # type: ignore
-                         bbox=normalize_bbox(elt["bounding_box"], image_size),  # type: ignore
-                         score=round(elt["score"], 2),  # type: ignore
-                     )
-                 )
-             bboxes_formatted.append(bboxes_formated_frame)

-     filtered_bboxes = [
-         filter_bboxes_by_threshold(elt, box_threshold) for elt in bboxes_formatted
-     ]
-     return [[bbox.model_dump() for bbox in frame] for frame in filtered_bboxes]
+     bboxes_formatted = []
+     for frame_data in detections:
+         bboxes_formatted_per_frame = [
+             ODResponseData(
+                 label=bbox["label"],
+                 bbox=normalize_bbox(bbox["bounding_box"], image_size),
+                 score=round(bbox["score"], 2),
+             )
+             for bbox in frame_data
+         ]
+         bboxes_formatted.append(bboxes_formatted_per_frame)
+     return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]


  def grounding_sam(
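
`owl_v2_video` picks up the same `fine_tune_id` path as the image variant: the job is validated through `LandingPublicAPI` and the payload is swapped over to florence2 before a single `send_task_inference_request` call. A hedged usage sketch with dummy frames:

    import numpy as np
    from vision_agent.tools import owl_v2_video

    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(8)]  # dummy clip

    # Returns one inner list of detections per frame; no tracking across frames.
    per_frame = owl_v2_video("person, dog", frames, box_threshold=0.10)
    for i, dets in enumerate(per_frame):
        print(f"frame {i}: {len(dets)} detections")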
@@ -708,23 +711,31 @@ def countgd_counting(
      image_size = image.shape[:2]
      if image_size[0] < 1 or image_size[1] < 1:
          return []
+
      buffer_bytes = numpy_to_bytes(image)
      files = [("image", buffer_bytes)]
-     prompt = prompt.replace(", ", " .")
-     payload = {"prompts": [prompt], "model": "countgd"}
+     payload = {
+         "prompts": [prompt.replace(", ", " .")],
+         "confidence": box_threshold,  # still not being used in the API
+         "model": "countgd",
+     }
      metadata = {"function_name": "countgd_counting"}
-     resp_data = send_task_inference_request(
+
+     detections = send_task_inference_request(
          payload, "text-to-object-detection", files=files, metadata=metadata
      )
-     bboxes_per_frame = resp_data[0]
+
+     # get the first frame
+     bboxes = detections[0]
      bboxes_formatted = [
          ODResponseData(
              label=bbox["label"],
              bbox=normalize_bbox(bbox["bounding_box"], image_size),
              score=round(bbox["score"], 2),
          )
-         for bbox in bboxes_per_frame
+         for bbox in bboxes
      ]
+     # TODO: remove this once we start to use the confidence on countgd
      filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
      return [bbox.model_dump() for bbox in filtered_bboxes]

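`countgd_counting` sends the threshold as `confidence` too, but as the in-diff comments note the API does not use it yet, so the client-side `filter_bboxes_by_threshold` call stays in place for now. The calling convention is unchanged; a hypothetical example:

    import numpy as np
    from vision_agent.tools import countgd_counting

    image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image
    dets = countgd_counting("flower", image)
    print(f"counted {len(dets)} objects")
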
@@ -768,6 +779,7 @@ def countgd_example_based_counting(
      image_size = image.shape[:2]
      if image_size[0] < 1 or image_size[1] < 1:
          return []
+
      buffer_bytes = numpy_to_bytes(image)
      files = [("image", buffer_bytes)]
      visual_prompts = [
@@ -775,10 +787,13 @@ def countgd_example_based_counting(
      ]
      payload = {"visual_prompts": json.dumps(visual_prompts), "model": "countgd"}
      metadata = {"function_name": "countgd_example_based_counting"}
-     resp_data = send_task_inference_request(
+
+     detections = send_task_inference_request(
          payload, "visual-prompts-to-object-detection", files=files, metadata=metadata
      )
-     bboxes_per_frame = resp_data[0]
+
+     # get the first frame
+     bboxes_per_frame = detections[0]
      bboxes_formatted = [
          ODResponseData(
              label=bbox["label"],
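
The example-based variant gets the same cosmetic rename (`resp_data` to `detections`) plus the first-frame comment; behavior is unchanged. It counts by visual example rather than text, so it takes example boxes. A sketch assuming normalized `[x1, y1, x2, y2]` example boxes (the coordinate convention is the function's own, so check its docstring):

    import numpy as np
    from vision_agent.tools import countgd_example_based_counting

    image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image
    examples = [[0.1, 0.1, 0.4, 0.42]]  # illustrative example box
    dets = countgd_example_based_counting(visual_prompts=examples, image=image)
    print(f"found {len(dets)} matches")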
@@ -1240,7 +1255,14 @@ def florence2_phrase_grounding(
      image_size = image.shape[:2]
      if image_size[0] < 1 or image_size[1] < 1:
          return []
-     image_b64 = convert_to_b64(image)
+
+     buffer_bytes = numpy_to_bytes(image)
+     files = [("image", buffer_bytes)]
+     payload = {
+         "prompts": [s.strip() for s in prompt.split(",")],
+         "model": "florence2",
+     }
+     metadata = {"function_name": "florence2_phrase_grounding"}

      if fine_tune_id is not None:
          landing_api = LandingPublicAPI()
@@ -1250,42 +1272,27 @@ def florence2_phrase_grounding(
              f"Fine-tuned model {fine_tune_id} is not ready yet"
          )

-         data_obj = Florence2FtRequest(
-             image=image_b64,
-             task=PromptTask.PHRASE_GROUNDING,
-             prompt=prompt,
-             job_id=UUID(fine_tune_id),
-         )
-         data = data_obj.model_dump(by_alias=True, exclude_none=True)
-         detections = send_inference_request(
-             data,
-             "florence2-ft",
-             v2=True,
-             is_form=True,
-             metadata_payload={"function_name": "florence2_phrase_grounding"},
-         )
-         # get the first frame
-         detection = detections[0]
-     else:
-         data = {
-             "image": image_b64,
-             "task": "<CAPTION_TO_PHRASE_GROUNDING>",
-             "prompt": prompt,
-             "function_name": "florence2_phrase_grounding",
-         }
-         detections = send_inference_request(data, "florence2", v2=True)
-         detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
+         payload["jobId"] = fine_tune_id

-     return_data = []
-     for i in range(len(detection["bboxes"])):
-         return_data.append(
-             ODResponseData(
-                 label=detection["labels"][i],
-                 bbox=normalize_bbox(detection["bboxes"][i], image_size),
-                 score=1.0,
-             )
+     detections = send_task_inference_request(
+         payload,
+         "text-to-object-detection",
+         files=files,
+         metadata=metadata,
+     )
+
+     # get the first frame
+     bboxes = detections[0]
+     bboxes_formatted = [
+         ODResponseData(
+             label=bbox["label"],
+             bbox=normalize_bbox(bbox["bounding_box"], image_size),
+             score=round(bbox["score"], 2),
          )
-     return [bbox.model_dump() for bbox in return_data]
+         for bbox in bboxes
+     ]
+
+     return [bbox.model_dump() for bbox in bboxes_formatted]


  def florence2_phrase_grounding_video(
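
`florence2_phrase_grounding` now goes through the shared `text-to-object-detection` task endpoint, so it returns the API's actual confidence scores (rounded to two decimals) instead of a hardcoded 1.0, and the fine-tuned branch collapses to a single `payload["jobId"]` assignment. A hypothetical call:

    import numpy as np
    from vision_agent.tools import florence2_phrase_grounding

    image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image
    dets = florence2_phrase_grounding("person, car", image)
    # dets = florence2_phrase_grounding("person", image, fine_tune_id="<your-job-uuid>")
    for det in dets:
        print(det["label"], det["score"], det["bbox"])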
@@ -1327,6 +1334,11 @@ def florence2_phrase_grounding_video(
      image_size = frames[0].shape[:2]
      buffer_bytes = frames_to_bytes(frames)
      files = [("video", buffer_bytes)]
+     payload = {
+         "prompts": [s.strip() for s in prompt.split(",")],
+         "model": "florence2",
+     }
+     metadata = {"function_name": "florence2_phrase_grounding_video"}

      if fine_tune_id is not None:
          landing_api = LandingPublicAPI()
@@ -1336,41 +1348,25 @@ def florence2_phrase_grounding_video(
              f"Fine-tuned model {fine_tune_id} is not ready yet"
          )

-         data_obj = Florence2FtRequest(
-             task=PromptTask.PHRASE_GROUNDING,
-             prompt=prompt,
-             job_id=UUID(fine_tune_id),
-         )
+         payload["jobId"] = fine_tune_id

-         data = data_obj.model_dump(by_alias=True, exclude_none=True, mode="json")
-         detections = send_inference_request(
-             data,
-             "florence2-ft",
-             v2=True,
-             files=files,
-             metadata_payload={"function_name": "florence2_phrase_grounding_video"},
-         )
-     else:
-         data = {
-             "prompt": prompt,
-             "task": "<CAPTION_TO_PHRASE_GROUNDING>",
-             "function_name": "florence2_phrase_grounding_video",
-             "video": base64.b64encode(buffer_bytes).decode("utf-8"),
-         }
-         detections = send_inference_request(data, "florence2", v2=True)
-         detections = [d["<CAPTION_TO_PHRASE_GROUNDING>"] for d in detections]
+     detections = send_task_inference_request(
+         payload,
+         "text-to-object-detection",
+         files=files,
+         metadata=metadata,
+     )

      bboxes_formatted = []
      for frame_data in detections:
-         bboxes_formatted_per_frame = []
-         for idx in range(len(frame_data["bboxes"])):
-             bboxes_formatted_per_frame.append(
-                 ODResponseData(
-                     label=frame_data["labels"][idx],
-                     bbox=normalize_bbox(frame_data["bboxes"][idx], image_size),
-                     score=1.0,
-                 )
+         bboxes_formatted_per_frame = [
+             ODResponseData(
+                 label=bbox["label"],
+                 bbox=normalize_bbox(bbox["bounding_box"], image_size),
+                 score=round(bbox["score"], 2),
              )
+             for bbox in frame_data
+         ]
          bboxes_formatted.append(bboxes_formatted_per_frame)
      return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]

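The video variant makes the same switch, which also removes the module's last use of `base64` and explains the deleted import at the top of tools.py. A usage sketch with placeholder frames:

    import numpy as np
    from vision_agent.tools import florence2_phrase_grounding_video

    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(4)]  # dummy clip
    per_frame = florence2_phrase_grounding_video("person", frames)
    # per_frame = florence2_phrase_grounding_video("person", frames, fine_tune_id="<your-job-uuid>")
    print([len(dets) for dets in per_frame])
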
vision_agent-0.2.180.dist-info/METADATA → vision_agent-0.2.182.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.180
+ Version: 0.2.182
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
vision_agent-0.2.180.dist-info/RECORD → vision_agent-0.2.182.dist-info/RECORD

@@ -2,7 +2,7 @@ vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
  vision_agent/agent/__init__.py,sha256=RRMPhH8mgm_pCtEKiVFSjJyDi4lCr4F7k05AhK01xlM,436
  vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
  vision_agent/agent/agent_utils.py,sha256=WYJF11PfKXlRMPnogGz3s7c2TlWoxoGzuLiIptVYE1s,5524
- vision_agent/agent/vision_agent.py,sha256=x0-TElnTRW7abyq2wAwKRiTUExBGg24C-c74wO1oKtI,26336
+ vision_agent/agent/vision_agent.py,sha256=rr1P9iTbr7OsjgMYWCeIxQYI4cLwPWia3NIMJNi-9Yo,26110
  vision_agent/agent/vision_agent_coder.py,sha256=3Q1VWrN-BNUoSD4OAqKazvXkP2c04PXDYu2Z1f5dQb0,31960
  vision_agent/agent/vision_agent_coder_prompts.py,sha256=gPLVXQMNSzYnQYpNm0wlH_5FPkOTaFDV24bqzK3jQ40,12221
  vision_agent/agent/vision_agent_planner.py,sha256=mjmnXG9CvYf_ZA7ZJ3ri4H-2U_Km55gF1sZYRSOlxpY,19027
@@ -20,7 +20,7 @@ vision_agent/tools/__init__.py,sha256=OEBJGOXNpCG1Ye-N39ahjWR4lL0RPVkcX60s25LpdV
  vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
  vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
- vision_agent/tools/tools.py,sha256=9MbX3b_xff-cHeCh46_q6gt7b5jNSCVSwiu2rwM43Ws,81224
+ vision_agent/tools/tools.py,sha256=p0MBQnwA10NF48ZhTIRWzHaarkezjvDazk7VuvjH1-k,80142
  vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
  vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
@@ -29,7 +29,7 @@ vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwd
  vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
  vision_agent/utils/video.py,sha256=fOPR48-SuwMbE5eB5rc2F7lVo6k1mVHn26eEJ0QCslc,5602
- vision_agent-0.2.180.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.2.180.dist-info/METADATA,sha256=KHeuZn1H6KJXyMlkPyrmie_AqUL1MMALOIoU0kKzg2s,18330
- vision_agent-0.2.180.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.2.180.dist-info/RECORD,,
+ vision_agent-0.2.182.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.182.dist-info/METADATA,sha256=eLwHRDYfkonJsLN0ug1Sc2bqZv7SAHiDzVeYeTGCmj8,18330
+ vision_agent-0.2.182.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.182.dist-info/RECORD,,