vision-agent 0.2.99__py3-none-any.whl → 0.2.101__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- vision_agent/tools/tool_utils.py +7 -4
- vision_agent/tools/tools.py +51 -57
- vision_agent/utils/type_defs.py +1 -1
- {vision_agent-0.2.99.dist-info → vision_agent-0.2.101.dist-info}/METADATA +1 -1
- {vision_agent-0.2.99.dist-info → vision_agent-0.2.101.dist-info}/RECORD +7 -7
- {vision_agent-0.2.99.dist-info → vision_agent-0.2.101.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.99.dist-info → vision_agent-0.2.101.dist-info}/WHEEL +0 -0
vision_agent/tools/tool_utils.py
CHANGED
```diff
@@ -16,7 +16,8 @@ from vision_agent.utils.type_defs import LandingaiAPIKey
 
 _LOGGER = logging.getLogger(__name__)
 _LND_API_KEY = LandingaiAPIKey().api_key
-_LND_API_URL = "https://api.landing.ai/v1/agent"
+_LND_API_URL = "https://api.landing.ai/v1/agent/model"
+_LND_API_URL_v2 = "https://api.landing.ai/v1/tools"
 
 
 class ToolCallTrace(BaseModel):
@@ -27,13 +28,13 @@ class ToolCallTrace(BaseModel):
 
 
 def send_inference_request(
-    payload: Dict[str, Any], endpoint_name: str
+    payload: Dict[str, Any], endpoint_name: str, v2: bool = False
 ) -> Dict[str, Any]:
     try:
         if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
             payload["runtime_tag"] = runtime_tag
 
-        url = f"{_LND_API_URL}/{endpoint_name}"
+        url = f"{_LND_API_URL_v2 if v2 else _LND_API_URL}/{endpoint_name}"
         if "TOOL_ENDPOINT_URL" in os.environ:
             url = os.environ["TOOL_ENDPOINT_URL"]
@@ -61,7 +62,9 @@ def send_inference_request(
             traceback_raw=[],
         )
         _LOGGER.error(f"Request failed: {res.status_code} {res.text}")
-        raise RemoteToolCallFailed(payload["function_name"], res.status_code, res.text)
+        raise RemoteToolCallFailed(
+            payload["function_name"], res.status_code, res.text
+        )
 
     resp = res.json()
     tool_call_trace.response = resp
```
vision_agent/tools/tools.py
CHANGED
```diff
@@ -126,7 +126,6 @@ def owl_v2(
     prompt: str,
     image: np.ndarray,
     box_threshold: float = 0.10,
-    iou_threshold: float = 0.10,
 ) -> List[Dict[str, Any]]:
     """'owl_v2' is a tool that can detect and count multiple objects given a text
     prompt such as category names or referring expressions. The categories in text prompt
@@ -138,8 +137,6 @@ def owl_v2(
         image (np.ndarray): The image to ground the prompt to.
         box_threshold (float, optional): The threshold for the box detection. Defaults
             to 0.10.
-        iou_threshold (float, optional): The threshold for the Intersection over Union
-            (IoU). Defaults to 0.10.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -159,22 +156,22 @@ def owl_v2(
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
     request_data = {
-        "…
+        "prompts": prompt.split("."),
         "image": image_b64,
-        "…
-        "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+        "confidence": box_threshold,
         "function_name": "owl_v2",
     }
-    data: Dict[str, Any] = send_inference_request(request_data, "…
+    data: Dict[str, Any] = send_inference_request(request_data, "owlv2", v2=True)
     return_data = []
-    ... (8 removed lines truncated in the rendered diff)
+    if data is not None:
+        for elt in data:
+            return_data.append(
+                {
+                    "bbox": normalize_bbox(elt["bbox"], image_size),  # type: ignore
+                    "label": elt["label"],  # type: ignore
+                    "score": round(elt["score"], 2),  # type: ignore
+                }
+            )
     return return_data
```
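To make the new post-processing concrete, here is a hedged sketch of what the loop above produces for a single detection, assuming `normalize_bbox` divides pixel coordinates by image height and width (its implementation is not part of this diff); the detection values are made up:

```python
def normalize_bbox_sketch(bbox, image_size):
    # Assumed behaviour of vision_agent's normalize_bbox helper:
    # convert [x1, y1, x2, y2] pixel coordinates to 0-1 relative coordinates.
    height, width = image_size
    x1, y1, x2, y2 = bbox
    return [x1 / width, y1 / height, x2 / width, y2 / height]


# One element as the "owlv2" endpoint might return it (illustrative values only).
elt = {"bbox": [64, 48, 320, 240], "label": "person", "score": 0.873}
image_size = (480, 640)  # image.shape[:2] is (height, width)

formatted = {
    "bbox": normalize_bbox_sketch(elt["bbox"], image_size),
    "label": elt["label"],
    "score": round(elt["score"], 2),
}
print(formatted)  # {'bbox': [0.1, 0.1, 0.5, 0.5], 'label': 'person', 'score': 0.87}
```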
```diff
@@ -367,11 +364,10 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "tool": "zero_shot_counting",
         "function_name": "loca_zero_shot_counting",
     }
-    resp_data = send_inference_request(data, "…
-    resp_data["heat_map"] = np.array(…
+    resp_data = send_inference_request(data, "loca", v2=True)
+    resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
     return resp_data
@@ -397,17 +393,15 @@ def loca_visual_prompt_counting(
 
     image_size = get_image_size(image)
     bbox = visual_prompt["bbox"]
-    bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
     image_b64 = convert_to_b64(image)
 
     data = {
         "image": image_b64,
-        "…
-        "tool": "few_shot_counting",
+        "bbox": list(map(int, denormalize_bbox(bbox, image_size))),
         "function_name": "loca_visual_prompt_counting",
     }
-    resp_data = send_inference_request(data, "…
-    resp_data["heat_map"] = np.array(…
+    resp_data = send_inference_request(data, "loca", v2=True)
+    resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
     return resp_data
```
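The visual-prompt box is now sent as a list of pixel integers instead of the old comma-joined string. A small sketch of that payload difference, assuming `denormalize_bbox` scales 0-1 coordinates back to pixels (the helper itself is unchanged and not shown in this diff):

```python
def denormalize_bbox_sketch(bbox, image_size):
    # Assumed behaviour of vision_agent's denormalize_bbox helper:
    # scale [x1, y1, x2, y2] relative coordinates back to pixel values.
    height, width = image_size
    x1, y1, x2, y2 = bbox
    return [x1 * width, y1 * height, x2 * width, y2 * height]


visual_prompt = {"bbox": [0.1, 0.1, 0.4, 0.42]}
image_size = (500, 1000)  # (height, width)

new_payload_value = list(map(int, denormalize_bbox_sketch(visual_prompt["bbox"], image_size)))
old_payload_value = ", ".join(map(str, denormalize_bbox_sketch(visual_prompt["bbox"], image_size)))

print(new_payload_value)  # [100, 50, 400, 210]
print(old_payload_value)  # 100.0, 50.0, 400.0, 210.0
```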
```diff
@@ -432,13 +426,12 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "…
-        "tool": "image_question_answering_with_context",
+        "question": prompt,
         "function_name": "florencev2_roberta_vqa",
     }
 
-    answer = send_inference_request(data, "…
-    return answer
+    answer = send_inference_request(data, "florence2-qa", v2=True)
+    return answer  # type: ignore
 
 
 def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
@@ -544,17 +537,16 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
     Example
     -------
     >>> vit_nsfw_classification(image)
-    {"…
+    {"label": "normal", "scores": 0.68},
     """
 
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "tool": "nsfw_image_classification",
         "function_name": "vit_nsfw_classification",
     }
-    resp_data = send_inference_request(data, "…
-    resp_data["…
+    resp_data = send_inference_request(data, "nsfw-classification", v2=True)
+    resp_data["score"] = round(resp_data["score"], 4)
     return resp_data
@@ -603,21 +595,21 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
     'This image contains a cat sitting on a table with a bowl of milk.'
     """
     image_b64 = convert_to_b64(image)
+    task = "<MORE_DETAILED_CAPTION>" if detail_caption else "<DETAILED_CAPTION>"
     data = {
         "image": image_b64,
-        "…
-        "detail_caption": detail_caption,
+        "task": task,
         "function_name": "florencev2_image_caption",
     }
 
-    answer = send_inference_request(data, "…
-    return answer[…
+    answer = send_inference_request(data, "florence2", v2=True)
+    return answer[task]  # type: ignore
 
 
-def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
-    """'florencev2_object_detection' is a tool that can detect …
-    …
-    as labels and their location as bounding boxes.
+def florencev2_object_detection(image: np.ndarray, prompt: str) -> List[Dict[str, Any]]:
+    """'florencev2_object_detection' is a tool that can detect objects given a text
+    prompt such as a phrase or class names separated by commas. It returns a list of
+    detected objects as labels and their location as bounding boxes with score of 1.0.
 
     Parameters:
         image (np.ndarray): The image to used to detect objects
```
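In the caption hunk above, the boolean `detail_caption` flag is translated into a Florence-2 task token, and the response is indexed by that same token. A minimal sketch of that flow, with a made-up response payload standing in for what the "florence2" endpoint returns:

```python
def pick_caption_task(detail_caption: bool) -> str:
    # Same selection as in florencev2_image_caption.
    return "<MORE_DETAILED_CAPTION>" if detail_caption else "<DETAILED_CAPTION>"


task = pick_caption_task(detail_caption=True)

# Assumed response shape: a dict keyed by the task token (caption text is made up).
answer = {task: "A cat sitting on a table with a bowl of milk."}
print(answer[task])
```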
```diff
@@ -631,29 +623,30 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
 
     Example
     -------
-    >>> florencev2_object_detection(image)
+    >>> florencev2_object_detection(image, 'person looking at a coyote')
     [
-        {'score': 1.0, 'label': '…
-        {'score': 1.0, 'label': '…
-        {'score': 1.0, 'label': 'person', 'bbox': [0.34, 0.21, 0.85, 0.5]},
+        {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+        {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
     ]
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "…
+        "task": "<CAPTION_TO_PHRASE_GROUNDING>",
+        "prompt": prompt,
         "function_name": "florencev2_object_detection",
     }
 
-    …
+    detections = send_inference_request(data, "florence2", v2=True)
+    detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
     return_data = []
-    for i in range(len(…
+    for i in range(len(detections["bboxes"])):
         return_data.append(
             {
-                "score": …
-                "label": …
-                "bbox": normalize_bbox(…
+                "score": 1.0,
+                "label": detections["labels"][i],
+                "bbox": normalize_bbox(detections["bboxes"][i], image_size),
             }
         )
     return return_data
```
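A hedged usage sketch for the updated `florencev2_object_detection` signature. It needs network access and a valid LandingAI API key, the image path is illustrative, and the import goes through `vision_agent.tools.tools` because the function is dropped from the `TOOLS` list further down in this diff:

```python
import numpy as np
from PIL import Image

from vision_agent.tools.tools import florencev2_object_detection

# Hypothetical local image; any RGB numpy array works.
image = np.array(Image.open("street_scene.jpg"))

# The prompt is a phrase or comma-separated class names, per the new docstring.
detections = florencev2_object_detection(image, "person, bicycle")
for det in detections:
    # Every detection carries score 1.0; bboxes are normalized to [0, 1].
    print(det["label"], det["bbox"], det["score"])
```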
```diff
@@ -742,13 +735,16 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "tool": "generate_depth",
         "function_name": "depth_anything_v2",
     }
 
-    ... (3 removed lines truncated in the rendered diff)
+    depth_map = send_inference_request(data, "depth-anything-v2", v2=True)
+    depth_map_np = np.array(depth_map["map"])
+    depth_map_np = (depth_map_np - depth_map_np.min()) / (
+        depth_map_np.max() - depth_map_np.min()
+    )
+    depth_map_np = (255 * depth_map_np).astype(np.uint8)
+    return depth_map_np
 
 
 def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
```
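The new depth post-processing is a plain min-max rescale to 8-bit. A self-contained numpy illustration with made-up depth values (the real ones come from the "depth-anything-v2" endpoint):

```python
import numpy as np

# Made-up raw depth values standing in for depth_map["map"].
raw_depth = np.array([[0.2, 0.5], [1.1, 2.3]])

normalized = (raw_depth - raw_depth.min()) / (raw_depth.max() - raw_depth.min())
depth_uint8 = (255 * normalized).astype(np.uint8)

print(depth_uint8)
# [[  0  36]
#  [109 255]]
```

Note that a completely flat depth map would make the denominator zero; the diff adds no guard for that case.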
```diff
@@ -839,12 +835,11 @@ def generate_pose_image(image: np.ndarray) -> np.ndarray:
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
-        "tool": "generate_pose",
         "function_name": "generate_pose_image",
     }
 
-    …
-    return_data = np.array(b64_to_pil(…
+    pos_img = send_inference_request(data, "pose-detector", v2=True)
+    return_data = np.array(b64_to_pil(pos_img["data"]).convert("RGB"))
     return return_data
@@ -1253,7 +1248,6 @@ TOOLS = [
     loca_visual_prompt_counting,
     florencev2_roberta_vqa,
     florencev2_image_caption,
-    florencev2_object_detection,
     detr_segmentation,
     depth_anything_v2,
     generate_soft_edge_image,
```
vision_agent/utils/type_defs.py
CHANGED
```diff
@@ -14,7 +14,7 @@ class LandingaiAPIKey(BaseSettings):
     """
 
     api_key: str = Field(
-        default="…
+        default="land_sk_zKvyPcPV2bVoq7q87KwduoerAxuQpx33DnqP8M1BliOCiZOSoI",
        alias="LANDINGAI_API_KEY",
        description="The API key of LandingAI.",
    )
```
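Because the field is aliased to LANDINGAI_API_KEY, the baked-in default above is only a fallback; setting that environment variable overrides it. A hedged sketch, assuming a standard pydantic BaseSettings lookup and a key value that is purely illustrative:

```python
import os

# Must be set before the settings object is constructed.
os.environ["LANDINGAI_API_KEY"] = "land_sk_your_own_key"

from vision_agent.utils.type_defs import LandingaiAPIKey

print(LandingaiAPIKey().api_key)  # land_sk_your_own_key
```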
{vision_agent-0.2.99.dist-info → vision_agent-0.2.101.dist-info}/RECORD
CHANGED
```diff
@@ -18,16 +18,16 @@ vision_agent/tools/__init__.py,sha256=e8q4lYD3acyX1ikMKLz4nlaAR_WZpBAIyq2CGYOYnv
 vision_agent/tools/meta_tools.py,sha256=v2FrLl0YwM7JwsVRfgfnryd9qorbPRiObestexbnNBs,15170
 vision_agent/tools/meta_tools_types.py,sha256=aU4knXEhm0AnDYW958T6Q6qPwN4yq8pQzQOxqFaOjzg,596
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tool_utils.py,sha256=…
-vision_agent/tools/tools.py,sha256=…
+vision_agent/tools/tool_utils.py,sha256=ZhZ9oEcOvRSuWPy-gV0rx3pvaaXzBW-ZC3YQanXrq1g,4733
+vision_agent/tools/tools.py,sha256=jWWioqBNsoNaGa8WKVldKBk_y9ZD1shO52kSE-26MFc,43111
 vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
 vision_agent/utils/exceptions.py,sha256=isVH-SVL4vHj3q5kK4z7cy5_aOapAqHXWkpibfSNbUs,1659
 vision_agent/utils/execute.py,sha256=ZRxztUfZwvMvPnFbKx5W_LZzTuKl8Zf5dP3Y8P2-3nk,25093
 vision_agent/utils/image_utils.py,sha256=y69wtNla0xHZ1h1x0-vv7nOyKUq69jtjSJBiDCn6EM0,7703
 vision_agent/utils/sim.py,sha256=7JvtWGN0Ik5ife3qQYWs7Fm3T8AnAXGFd5HnvDC15mQ,4433
-vision_agent/utils/type_defs.py,sha256=…
+vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
 vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
-vision_agent-0.2.99.dist-info/LICENSE,…
-vision_agent-0.2.99.dist-info/METADATA,…
-vision_agent-0.2.99.dist-info/WHEEL,…
-vision_agent-0.2.99.dist-info/RECORD,,
+vision_agent-0.2.101.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-0.2.101.dist-info/METADATA,sha256=dgmoZNjCvvGK99H_Xt4aC3usp16r7g2yF-UJqTAn7RI,10729
+vision_agent-0.2.101.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+vision_agent-0.2.101.dist-info/RECORD,,
```
{vision_agent-0.2.99.dist-info → vision_agent-0.2.101.dist-info}/LICENSE
File without changes
{vision_agent-0.2.99.dist-info → vision_agent-0.2.101.dist-info}/WHEEL
File without changes