vision-agent 0.2.99__py3-none-any.whl → 0.2.101__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- vision_agent/tools/tool_utils.py +7 -4
- vision_agent/tools/tools.py +51 -57
- vision_agent/utils/type_defs.py +1 -1
- {vision_agent-0.2.99.dist-info → vision_agent-0.2.101.dist-info}/METADATA +1 -1
- {vision_agent-0.2.99.dist-info → vision_agent-0.2.101.dist-info}/RECORD +7 -7
- {vision_agent-0.2.99.dist-info → vision_agent-0.2.101.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.99.dist-info → vision_agent-0.2.101.dist-info}/WHEEL +0 -0
vision_agent/tools/tool_utils.py
CHANGED
```diff
@@ -16,7 +16,8 @@ from vision_agent.utils.type_defs import LandingaiAPIKey
 
 _LOGGER = logging.getLogger(__name__)
 _LND_API_KEY = LandingaiAPIKey().api_key
-_LND_API_URL = "https://api.landing.ai/v1/agent"
+_LND_API_URL = "https://api.landing.ai/v1/agent/model"
+_LND_API_URL_v2 = "https://api.landing.ai/v1/tools"
 
 
 class ToolCallTrace(BaseModel):
@@ -27,13 +28,13 @@ class ToolCallTrace(BaseModel):
 
 
 def send_inference_request(
-    payload: Dict[str, Any], endpoint_name: str
+    payload: Dict[str, Any], endpoint_name: str, v2: bool = False
 ) -> Dict[str, Any]:
     try:
         if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
             payload["runtime_tag"] = runtime_tag
 
-        url = f"{_LND_API_URL}/{endpoint_name}"
+        url = f"{_LND_API_URL_v2 if v2 else _LND_API_URL}/{endpoint_name}"
         if "TOOL_ENDPOINT_URL" in os.environ:
             url = os.environ["TOOL_ENDPOINT_URL"]
@@ -61,7 +62,9 @@ def send_inference_request(
             traceback_raw=[],
         )
         _LOGGER.error(f"Request failed: {res.status_code} {res.text}")
-        raise RemoteToolCallFailed(payload["function_name"], res.status_code, res.text)
+        raise RemoteToolCallFailed(
+            payload["function_name"], res.status_code, res.text
+        )
 
     resp = res.json()
     tool_call_trace.response = resp
```
vision_agent/tools/tools.py
CHANGED
```diff
@@ -126,7 +126,6 @@ def owl_v2(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.10,
-    iou_threshold: float = 0.10,
 ) -> List[Dict[str, Any]]:
     """'owl_v2' is a tool that can detect and count multiple objects given a text
     prompt such as category names or referring expressions. The categories in text prompt
@@ -138,8 +137,6 @@ def owl_v2(
         image (np.ndarray): The image to ground the prompt to.
         box_threshold (float, optional): The threshold for the box detection. Defaults
             to 0.10.
-        iou_threshold (float, optional): The threshold for the Intersection over Union
-            (IoU). Defaults to 0.10.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -159,22 +156,22 @@ def owl_v2(
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
     request_data = {
-        "…
+        "prompts": prompt.split("."),
         "image": image_b64,
-        "…
-        "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+        "confidence": box_threshold,
         "function_name": "owl_v2",
     }
-    data: Dict[str, Any] = send_inference_request(request_data, "…
+    data: Dict[str, Any] = send_inference_request(request_data, "owlv2", v2=True)
     return_data = []
-    ... (8 removed lines truncated in the rendered diff)
+    if data is not None:
+        for elt in data:
+            return_data.append(
+                {
+                    "bbox": normalize_bbox(elt["bbox"], image_size),  # type: ignore
+                    "label": elt["label"],  # type: ignore
+                    "score": round(elt["score"], 2),  # type: ignore
+                }
+            )
     return return_data
```
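To make the new post-processing concrete, here is a hedged sketch of what the loop above produces for a single detection, assuming `normalize_bbox` divides pixel coordinates by image height and width (its implementation is not part of this diff); the detection values are made up:

```python
def normalize_bbox_sketch(bbox, image_size):
    # Assumed behaviour of vision_agent's normalize_bbox helper:
    # convert [x1, y1, x2, y2] pixel coordinates to 0-1 relative coordinates.
    height, width = image_size
    x1, y1, x2, y2 = bbox
    return [x1 / width, y1 / height, x2 / width, y2 / height]


# One element as the "owlv2" endpoint might return it (illustrative values only).
elt = {"bbox": [64, 48, 320, 240], "label": "person", "score": 0.873}
image_size = (480, 640)  # image.shape[:2] is (height, width)

formatted = {
    "bbox": normalize_bbox_sketch(elt["bbox"], image_size),
    "label": elt["label"],
    "score": round(elt["score"], 2),
}
print(formatted)  # {'bbox': [0.1, 0.1, 0.5, 0.5], 'label': 'person', 'score': 0.87}
```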
```diff
@@ -367,11 +364,10 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "tool": "zero_shot_counting",
         "function_name": "loca_zero_shot_counting",
     }
-    resp_data = send_inference_request(data, "…
-    resp_data["heat_map"] = np.array(…
+    resp_data = send_inference_request(data, "loca", v2=True)
+    resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
     return resp_data
@@ -397,17 +393,15 @@ def loca_visual_prompt_counting(
 
     image_size = get_image_size(image)
     bbox = visual_prompt["bbox"]
-    bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
     image_b64 = convert_to_b64(image)
 
     data = {
         "image": image_b64,
-        "…
-        "tool": "few_shot_counting",
+        "bbox": list(map(int, denormalize_bbox(bbox, image_size))),
         "function_name": "loca_visual_prompt_counting",
     }
-    resp_data = send_inference_request(data, "…
-    resp_data["heat_map"] = np.array(…
+    resp_data = send_inference_request(data, "loca", v2=True)
+    resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
     return resp_data
```
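The visual-prompt box is now sent as a list of pixel integers instead of the old comma-joined string. A small sketch of that payload difference, assuming `denormalize_bbox` scales 0-1 coordinates back to pixels (the helper itself is unchanged and not shown in this diff):

```python
def denormalize_bbox_sketch(bbox, image_size):
    # Assumed behaviour of vision_agent's denormalize_bbox helper:
    # scale [x1, y1, x2, y2] relative coordinates back to pixel values.
    height, width = image_size
    x1, y1, x2, y2 = bbox
    return [x1 * width, y1 * height, x2 * width, y2 * height]


visual_prompt = {"bbox": [0.1, 0.1, 0.4, 0.42]}
image_size = (500, 1000)  # (height, width)

new_payload_value = list(map(int, denormalize_bbox_sketch(visual_prompt["bbox"], image_size)))
old_payload_value = ", ".join(map(str, denormalize_bbox_sketch(visual_prompt["bbox"], image_size)))

print(new_payload_value)  # [100, 50, 400, 210]
print(old_payload_value)  # 100.0, 50.0, 400.0, 210.0
```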
```diff
@@ -432,13 +426,12 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "…
-        "tool": "image_question_answering_with_context",
+        "question": prompt,
         "function_name": "florencev2_roberta_vqa",
     }
 
-    answer = send_inference_request(data, "…
-    return answer
+    answer = send_inference_request(data, "florence2-qa", v2=True)
+    return answer  # type: ignore
 
 
 def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
@@ -544,17 +537,16 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
     Example
     -------
     >>> vit_nsfw_classification(image)
-    {"…
+    {"label": "normal", "scores": 0.68},
     """
 
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "tool": "nsfw_image_classification",
         "function_name": "vit_nsfw_classification",
     }
-    resp_data = send_inference_request(data, "…
-    resp_data["…
+    resp_data = send_inference_request(data, "nsfw-classification", v2=True)
+    resp_data["score"] = round(resp_data["score"], 4)
     return resp_data
@@ -603,21 +595,21 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
     'This image contains a cat sitting on a table with a bowl of milk.'
     """
     image_b64 = convert_to_b64(image)
+    task = "<MORE_DETAILED_CAPTION>" if detail_caption else "<DETAILED_CAPTION>"
     data = {
         "image": image_b64,
-        "…
-        "detail_caption": detail_caption,
+        "task": task,
         "function_name": "florencev2_image_caption",
     }
 
-    answer = send_inference_request(data, "…
-    return answer[…
+    answer = send_inference_request(data, "florence2", v2=True)
+    return answer[task]  # type: ignore
 
 
-def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
-    """'florencev2_object_detection' is a tool that can detect …
-    …
-    as labels and their location as bounding boxes.
+def florencev2_object_detection(image: np.ndarray, prompt: str) -> List[Dict[str, Any]]:
+    """'florencev2_object_detection' is a tool that can detect objects given a text
+    prompt such as a phrase or class names separated by commas. It returns a list of
+    detected objects as labels and their location as bounding boxes with score of 1.0.
 
     Parameters:
         image (np.ndarray): The image to used to detect objects
```
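In the caption hunk above, the boolean `detail_caption` flag is translated into a Florence-2 task token, and the response is indexed by that same token. A minimal sketch of that flow, with a made-up response payload standing in for what the "florence2" endpoint returns:

```python
def pick_caption_task(detail_caption: bool) -> str:
    # Same selection as in florencev2_image_caption.
    return "<MORE_DETAILED_CAPTION>" if detail_caption else "<DETAILED_CAPTION>"


task = pick_caption_task(detail_caption=True)

# Assumed response shape: a dict keyed by the task token (caption text is made up).
answer = {task: "A cat sitting on a table with a bowl of milk."}
print(answer[task])
```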
```diff
@@ -631,29 +623,30 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
 
     Example
     -------
-    >>> florencev2_object_detection(image)
+    >>> florencev2_object_detection(image, 'person looking at a coyote')
     [
-        {'score': 1.0, 'label': '…
-        {'score': 1.0, 'label': '…
-        {'score': 1.0, 'label': 'person', 'bbox': [0.34, 0.21, 0.85, 0.5]},
+        {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+        {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
     ]
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "…
+        "task": "<CAPTION_TO_PHRASE_GROUNDING>",
+        "prompt": prompt,
         "function_name": "florencev2_object_detection",
     }
 
-    …
+    detections = send_inference_request(data, "florence2", v2=True)
+    detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
     return_data = []
-    for i in range(len(…
+    for i in range(len(detections["bboxes"])):
         return_data.append(
             {
-                "score": …
-                "label": …
-                "bbox": normalize_bbox(…
+                "score": 1.0,
+                "label": detections["labels"][i],
+                "bbox": normalize_bbox(detections["bboxes"][i], image_size),
             }
         )
     return return_data
```
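A hedged usage sketch for the updated `florencev2_object_detection` signature. It needs network access and a valid LandingAI API key, the image path is illustrative, and the import goes through `vision_agent.tools.tools` because the function is dropped from the `TOOLS` list further down in this diff:

```python
import numpy as np
from PIL import Image

from vision_agent.tools.tools import florencev2_object_detection

# Hypothetical local image; any RGB numpy array works.
image = np.array(Image.open("street_scene.jpg"))

# The prompt is a phrase or comma-separated class names, per the new docstring.
detections = florencev2_object_detection(image, "person, bicycle")
for det in detections:
    # Every detection carries score 1.0; bboxes are normalized to [0, 1].
    print(det["label"], det["bbox"], det["score"])
```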
```diff
@@ -742,13 +735,16 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "tool": "generate_depth",
         "function_name": "depth_anything_v2",
     }
 
-    ... (3 removed lines truncated in the rendered diff)
+    depth_map = send_inference_request(data, "depth-anything-v2", v2=True)
+    depth_map_np = np.array(depth_map["map"])
+    depth_map_np = (depth_map_np - depth_map_np.min()) / (
+        depth_map_np.max() - depth_map_np.min()
+    )
+    depth_map_np = (255 * depth_map_np).astype(np.uint8)
+    return depth_map_np
 
 
 def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
```
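The new depth post-processing is a plain min-max rescale to 8-bit. A self-contained numpy illustration with made-up depth values (the real ones come from the "depth-anything-v2" endpoint):

```python
import numpy as np

# Made-up raw depth values standing in for depth_map["map"].
raw_depth = np.array([[0.2, 0.5], [1.1, 2.3]])

normalized = (raw_depth - raw_depth.min()) / (raw_depth.max() - raw_depth.min())
depth_uint8 = (255 * normalized).astype(np.uint8)

print(depth_uint8)
# [[  0  36]
#  [109 255]]
```

Note that a completely flat depth map would make the denominator zero; the diff adds no guard for that case.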
```diff
@@ -839,12 +835,11 @@ def generate_pose_image(image: np.ndarray) -> np.ndarray:
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "tool": "generate_pose",
         "function_name": "generate_pose_image",
     }
 
-    …
-    return_data = np.array(b64_to_pil(…
+    pos_img = send_inference_request(data, "pose-detector", v2=True)
+    return_data = np.array(b64_to_pil(pos_img["data"]).convert("RGB"))
     return return_data
@@ -1253,7 +1248,6 @@ TOOLS = [
     loca_visual_prompt_counting,
     florencev2_roberta_vqa,
     florencev2_image_caption,
-    florencev2_object_detection,
     detr_segmentation,
     depth_anything_v2,
     generate_soft_edge_image,
```
vision_agent/utils/type_defs.py
CHANGED
```diff
@@ -14,7 +14,7 @@ class LandingaiAPIKey(BaseSettings):
     """
 
     api_key: str = Field(
-        default="…
+        default="land_sk_zKvyPcPV2bVoq7q87KwduoerAxuQpx33DnqP8M1BliOCiZOSoI",
        alias="LANDINGAI_API_KEY",
        description="The API key of LandingAI.",
    )
```
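Because the field is aliased to LANDINGAI_API_KEY, the baked-in default above is only a fallback; setting that environment variable overrides it. A hedged sketch, assuming a standard pydantic BaseSettings lookup and a key value that is purely illustrative:

```python
import os

# Must be set before the settings object is constructed.
os.environ["LANDINGAI_API_KEY"] = "land_sk_your_own_key"

from vision_agent.utils.type_defs import LandingaiAPIKey

print(LandingaiAPIKey().api_key)  # land_sk_your_own_key
```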
{vision_agent-0.2.99.dist-info → vision_agent-0.2.101.dist-info}/RECORD
CHANGED
```diff
@@ -18,16 +18,16 @@ vision_agent/tools/__init__.py,sha256=e8q4lYD3acyX1ikMKLz4nlaAR_WZpBAIyq2CGYOYnv
 vision_agent/tools/meta_tools.py,sha256=v2FrLl0YwM7JwsVRfgfnryd9qorbPRiObestexbnNBs,15170
 vision_agent/tools/meta_tools_types.py,sha256=aU4knXEhm0AnDYW958T6Q6qPwN4yq8pQzQOxqFaOjzg,596
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=…
-vision_agent/tools/tools.py,sha256=…
+vision_agent/tools/tool_utils.py,sha256=ZhZ9oEcOvRSuWPy-gV0rx3pvaaXzBW-ZC3YQanXrq1g,4733
+vision_agent/tools/tools.py,sha256=jWWioqBNsoNaGa8WKVldKBk_y9ZD1shO52kSE-26MFc,43111
 vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
 vision_agent/utils/exceptions.py,sha256=isVH-SVL4vHj3q5kK4z7cy5_aOapAqHXWkpibfSNbUs,1659
 vision_agent/utils/execute.py,sha256=ZRxztUfZwvMvPnFbKx5W_LZzTuKl8Zf5dP3Y8P2-3nk,25093
 vision_agent/utils/image_utils.py,sha256=y69wtNla0xHZ1h1x0-vv7nOyKUq69jtjSJBiDCn6EM0,7703
 vision_agent/utils/sim.py,sha256=7JvtWGN0Ik5ife3qQYWs7Fm3T8AnAXGFd5HnvDC15mQ,4433
-vision_agent/utils/type_defs.py,sha256=…
+vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
-vision_agent-0.2.99.dist-info/LICENSE,…
-vision_agent-0.2.99.dist-info/METADATA,…
-vision_agent-0.2.99.dist-info/WHEEL,…
-vision_agent-0.2.99.dist-info/RECORD,,
+vision_agent-0.2.101.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.101.dist-info/METADATA,sha256=dgmoZNjCvvGK99H_Xt4aC3usp16r7g2yF-UJqTAn7RI,10729
+vision_agent-0.2.101.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.101.dist-info/RECORD,,
```
{vision_agent-0.2.99.dist-info → vision_agent-0.2.101.dist-info}/LICENSE
File without changes
{vision_agent-0.2.99.dist-info → vision_agent-0.2.101.dist-info}/WHEEL
File without changes