vision-agent 0.2.99__py3-none-any.whl → 0.2.101__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,7 +16,8 @@ from vision_agent.utils.type_defs import LandingaiAPIKey
 
  _LOGGER = logging.getLogger(__name__)
  _LND_API_KEY = LandingaiAPIKey().api_key
- _LND_API_URL = "https://api.landing.ai/v1/agent"
+ _LND_API_URL = "https://api.landing.ai/v1/agent/model"
+ _LND_API_URL_v2 = "https://api.landing.ai/v1/tools"
 
 
  class ToolCallTrace(BaseModel):
@@ -27,13 +28,13 @@ class ToolCallTrace(BaseModel):
 
 
  def send_inference_request(
- payload: Dict[str, Any], endpoint_name: str
+ payload: Dict[str, Any], endpoint_name: str, v2: bool = False
  ) -> Dict[str, Any]:
  try:
  if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
  payload["runtime_tag"] = runtime_tag
 
- url = f"{_LND_API_URL}/model/{endpoint_name}"
+ url = f"{_LND_API_URL_v2 if v2 else _LND_API_URL}/{endpoint_name}"
  if "TOOL_ENDPOINT_URL" in os.environ:
  url = os.environ["TOOL_ENDPOINT_URL"]
 
@@ -61,7 +62,9 @@ def send_inference_request(
  traceback_raw=[],
  )
  _LOGGER.error(f"Request failed: {res.status_code} {res.text}")
- raise RemoteToolCallFailed(payload["tool"], res.status_code, res.text)
+ raise RemoteToolCallFailed(
+ payload["function_name"], res.status_code, res.text
+ )
 
  resp = res.json()
  tool_call_trace.response = resp
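
Note on the hunks above: send_inference_request now accepts a v2 flag that routes the call to the new /v1/tools base URL, and failed calls are reported under the payload's "function_name" key instead of "tool". A minimal sketch of the new URL selection implied by this diff (build_url is a hypothetical helper named here for illustration; the endpoint name is an example):

    import os

    _LND_API_URL = "https://api.landing.ai/v1/agent/model"
    _LND_API_URL_v2 = "https://api.landing.ai/v1/tools"

    def build_url(endpoint_name: str, v2: bool = False) -> str:
        # The v2 flag picks the new /v1/tools base URL; otherwise the
        # legacy agent/model endpoint is used, matching the diff above.
        url = f"{_LND_API_URL_v2 if v2 else _LND_API_URL}/{endpoint_name}"
        # The TOOL_ENDPOINT_URL environment override is unchanged.
        if "TOOL_ENDPOINT_URL" in os.environ:
            url = os.environ["TOOL_ENDPOINT_URL"]
        return url

    # build_url("owlv2", v2=True) -> "https://api.landing.ai/v1/tools/owlv2"
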
@@ -126,7 +126,6 @@ def owl_v2(
  prompt: str,
  image: np.ndarray,
  box_threshold: float = 0.10,
- iou_threshold: float = 0.10,
  ) -> List[Dict[str, Any]]:
  """'owl_v2' is a tool that can detect and count multiple objects given a text
  prompt such as category names or referring expressions. The categories in text prompt
@@ -138,8 +137,6 @@ def owl_v2(
  image (np.ndarray): The image to ground the prompt to.
  box_threshold (float, optional): The threshold for the box detection. Defaults
  to 0.10.
- iou_threshold (float, optional): The threshold for the Intersection over Union
- (IoU). Defaults to 0.10.
 
  Returns:
  List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -159,22 +156,22 @@ def owl_v2(
  image_size = image.shape[:2]
  image_b64 = convert_to_b64(image)
  request_data = {
- "prompt": prompt,
+ "prompts": prompt.split("."),
  "image": image_b64,
- "tool": "open_vocab_detection",
- "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+ "confidence": box_threshold,
  "function_name": "owl_v2",
  }
- data: Dict[str, Any] = send_inference_request(request_data, "tools")
+ data: Dict[str, Any] = send_inference_request(request_data, "owlv2", v2=True)
  return_data = []
- for i in range(len(data["bboxes"])):
- return_data.append(
- {
- "score": round(data["scores"][i], 2),
- "label": data["labels"][i].strip(),
- "bbox": normalize_bbox(data["bboxes"][i], image_size),
- }
- )
+ if data is not None:
+ for elt in data:
+ return_data.append(
+ {
+ "bbox": normalize_bbox(elt["bbox"], image_size), # type: ignore
+ "label": elt["label"], # type: ignore
+ "score": round(elt["score"], 2), # type: ignore
+ }
+ )
  return return_data
 
 
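
With this rework, owl_v2 drops the iou_threshold parameter, sends a "prompts" list plus a "confidence" value, and calls the dedicated "owlv2" endpoint, while the returned list keeps the score/label/normalized-bbox shape. A hedged usage sketch, assuming the tool is still importable from vision_agent.tools (the image and prompt below are placeholders):

    import numpy as np
    from vision_agent.tools import owl_v2  # assumed public import path

    image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image
    detections = owl_v2("person. coyote", image, box_threshold=0.10)
    # each entry looks like {"bbox": [x1, y1, x2, y2], "label": "person", "score": 0.92}
    # with bbox coordinates normalized to the image size
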
@@ -367,11 +364,10 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
  image_b64 = convert_to_b64(image)
  data = {
  "image": image_b64,
- "tool": "zero_shot_counting",
  "function_name": "loca_zero_shot_counting",
  }
- resp_data = send_inference_request(data, "tools")
- resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
+ resp_data = send_inference_request(data, "loca", v2=True)
+ resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
  return resp_data
 
 
@@ -397,17 +393,15 @@ def loca_visual_prompt_counting(
 
  image_size = get_image_size(image)
  bbox = visual_prompt["bbox"]
- bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
  image_b64 = convert_to_b64(image)
 
  data = {
  "image": image_b64,
- "prompt": bbox_str,
- "tool": "few_shot_counting",
+ "bbox": list(map(int, denormalize_bbox(bbox, image_size))),
  "function_name": "loca_visual_prompt_counting",
  }
- resp_data = send_inference_request(data, "tools")
- resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
+ resp_data = send_inference_request(data, "loca", v2=True)
+ resp_data["heat_map"] = np.array(resp_data["heat_map"][0]).astype(np.uint8)
  return resp_data
 
 
@@ -432,13 +426,12 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
  image_b64 = convert_to_b64(image)
  data = {
  "image": image_b64,
- "prompt": prompt,
- "tool": "image_question_answering_with_context",
+ "question": prompt,
  "function_name": "florencev2_roberta_vqa",
  }
 
- answer = send_inference_request(data, "tools")
- return answer["text"][0] # type: ignore
+ answer = send_inference_request(data, "florence2-qa", v2=True)
+ return answer # type: ignore
 
 
  def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
@@ -544,17 +537,16 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
  Example
  -------
  >>> vit_nsfw_classification(image)
- {"labels": "normal", "scores": 0.68},
+ {"label": "normal", "scores": 0.68},
  """
 
  image_b64 = convert_to_b64(image)
  data = {
  "image": image_b64,
- "tool": "nsfw_image_classification",
  "function_name": "vit_nsfw_classification",
  }
- resp_data = send_inference_request(data, "tools")
- resp_data["scores"] = round(resp_data["scores"], 4)
+ resp_data = send_inference_request(data, "nsfw-classification", v2=True)
+ resp_data["score"] = round(resp_data["score"], 4)
  return resp_data
 
 
@@ -603,21 +595,21 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
  'This image contains a cat sitting on a table with a bowl of milk.'
  """
  image_b64 = convert_to_b64(image)
+ task = "<MORE_DETAILED_CAPTION>" if detail_caption else "<DETAILED_CAPTION>"
  data = {
  "image": image_b64,
- "tool": "florence2_image_captioning",
- "detail_caption": detail_caption,
+ "task": task,
  "function_name": "florencev2_image_caption",
  }
 
- answer = send_inference_request(data, "tools")
- return answer["text"][0] # type: ignore
+ answer = send_inference_request(data, "florence2", v2=True)
+ return answer[task] # type: ignore
 
 
- def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
- """'florencev2_object_detection' is a tool that can detect common objects in an
- image without any text prompt or thresholding. It returns a list of detected objects
- as labels and their location as bounding boxes.
+ def florencev2_object_detection(image: np.ndarray, prompt: str) -> List[Dict[str, Any]]:
+ """'florencev2_object_detection' is a tool that can detect objects given a text
+ prompt such as a phrase or class names separated by commas. It returns a list of
+ detected objects as labels and their location as bounding boxes with score of 1.0.
 
  Parameters:
  image (np.ndarray): The image to used to detect objects
@@ -631,29 +623,30 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
 
  Example
  -------
- >>> florencev2_object_detection(image)
+ >>> florencev2_object_detection(image, 'person looking at a coyote')
  [
- {'score': 1.0, 'label': 'window', 'bbox': [0.1, 0.11, 0.35, 0.4]},
- {'score': 1.0, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
- {'score': 1.0, 'label': 'person', 'bbox': [0.34, 0.21, 0.85, 0.5},
+ {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+ {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
  ]
  """
  image_size = image.shape[:2]
  image_b64 = convert_to_b64(image)
  data = {
  "image": image_b64,
- "tool": "object_detection",
+ "task": "<CAPTION_TO_PHRASE_GROUNDING>",
+ "prompt": prompt,
  "function_name": "florencev2_object_detection",
  }
 
- answer = send_inference_request(data, "tools")
+ detections = send_inference_request(data, "florence2", v2=True)
+ detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
  return_data = []
- for i in range(len(answer["bboxes"])):
+ for i in range(len(detections["bboxes"])):
  return_data.append(
  {
- "score": round(answer["scores"][i], 2),
- "label": answer["labels"][i],
- "bbox": normalize_bbox(answer["bboxes"][i], image_size),
+ "score": 1.0,
+ "label": detections["labels"][i],
+ "bbox": normalize_bbox(detections["bboxes"][i], image_size),
  }
  )
  return return_data
@@ -742,13 +735,16 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
  image_b64 = convert_to_b64(image)
  data = {
  "image": image_b64,
- "tool": "generate_depth",
  "function_name": "depth_anything_v2",
  }
 
- answer = send_inference_request(data, "tools")
- return_data = np.array(b64_to_pil(answer["masks"][0]).convert("L"))
- return return_data
+ depth_map = send_inference_request(data, "depth-anything-v2", v2=True)
+ depth_map_np = np.array(depth_map["map"])
+ depth_map_np = (depth_map_np - depth_map_np.min()) / (
+ depth_map_np.max() - depth_map_np.min()
+ )
+ depth_map_np = (255 * depth_map_np).astype(np.uint8)
+ return depth_map_np
 
 
  def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
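
The new depth_anything_v2 path reads raw depth values from the "map" field and min-max normalizes them to an 8-bit image instead of decoding a base64 mask. A small worked sketch of that post-processing with made-up values:

    import numpy as np

    depth = np.array([[0.2, 1.4], [3.0, 5.8]])  # hypothetical raw depth values
    norm = (depth - depth.min()) / (depth.max() - depth.min())
    depth_u8 = (255 * norm).astype(np.uint8)
    # [[  0  54]
    #  [127 255]] as uint8
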
@@ -839,12 +835,11 @@ def generate_pose_image(image: np.ndarray) -> np.ndarray:
  image_b64 = convert_to_b64(image)
  data = {
  "image": image_b64,
- "tool": "generate_pose",
  "function_name": "generate_pose_image",
  }
 
- answer = send_inference_request(data, "tools")
- return_data = np.array(b64_to_pil(answer["masks"][0]).convert("RGB"))
+ pos_img = send_inference_request(data, "pose-detector", v2=True)
+ return_data = np.array(b64_to_pil(pos_img["data"]).convert("RGB"))
  return return_data
 
 
@@ -1253,7 +1248,6 @@ TOOLS = [
  loca_visual_prompt_counting,
  florencev2_roberta_vqa,
  florencev2_image_caption,
- florencev2_object_detection,
  detr_segmentation,
  depth_anything_v2,
  generate_soft_edge_image,
@@ -14,7 +14,7 @@ class LandingaiAPIKey(BaseSettings):
  """
 
  api_key: str = Field(
- default="land_sk_fnmSzD0ksknSfvhyD8UGu9R4ss3bKfLL1Im5gb6tDQTy2z1Oy5",
+ default="land_sk_zKvyPcPV2bVoq7q87KwduoerAxuQpx33DnqP8M1BliOCiZOSoI",
  alias="LANDINGAI_API_KEY",
  description="The API key of LandingAI.",
  )
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vision-agent
- Version: 0.2.99
+ Version: 0.2.101
  Summary: Toolset for Vision Agent
  Author: Landing AI
  Author-email: dev@landing.ai
@@ -18,16 +18,16 @@ vision_agent/tools/__init__.py,sha256=e8q4lYD3acyX1ikMKLz4nlaAR_WZpBAIyq2CGYOYnv
  vision_agent/tools/meta_tools.py,sha256=v2FrLl0YwM7JwsVRfgfnryd9qorbPRiObestexbnNBs,15170
  vision_agent/tools/meta_tools_types.py,sha256=aU4knXEhm0AnDYW958T6Q6qPwN4yq8pQzQOxqFaOjzg,596
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
- vision_agent/tools/tool_utils.py,sha256=XoB-iae8hHrBQgJd3fV6-UjZAkClysobUaOM17IcHuE,4597
- vision_agent/tools/tools.py,sha256=aYo0xSbdr-Q4gq_dKxa8yLyczmXoKv_vYYrZ7dM38bw,43219
+ vision_agent/tools/tool_utils.py,sha256=ZhZ9oEcOvRSuWPy-gV0rx3pvaaXzBW-ZC3YQanXrq1g,4733
+ vision_agent/tools/tools.py,sha256=jWWioqBNsoNaGa8WKVldKBk_y9ZD1shO52kSE-26MFc,43111
  vision_agent/utils/__init__.py,sha256=CW84HnhqI6XQVuxf2KifkLnSuO7EOhmuL09-gAymAak,219
  vision_agent/utils/exceptions.py,sha256=isVH-SVL4vHj3q5kK4z7cy5_aOapAqHXWkpibfSNbUs,1659
  vision_agent/utils/execute.py,sha256=ZRxztUfZwvMvPnFbKx5W_LZzTuKl8Zf5dP3Y8P2-3nk,25093
  vision_agent/utils/image_utils.py,sha256=y69wtNla0xHZ1h1x0-vv7nOyKUq69jtjSJBiDCn6EM0,7703
  vision_agent/utils/sim.py,sha256=7JvtWGN0Ik5ife3qQYWs7Fm3T8AnAXGFd5HnvDC15mQ,4433
- vision_agent/utils/type_defs.py,sha256=oVFJcicB-s_09lqvn61u0A5ncZsTqZArZledXWbrrg0,1384
+ vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
  vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
- vision_agent-0.2.99.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- vision_agent-0.2.99.dist-info/METADATA,sha256=QDiN7-jSVTpGtrwJLhvSUM1A7aj1baWhZ9eFf1GVn2E,10728
- vision_agent-0.2.99.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
- vision_agent-0.2.99.dist-info/RECORD,,
+ vision_agent-0.2.101.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ vision_agent-0.2.101.dist-info/METADATA,sha256=dgmoZNjCvvGK99H_Xt4aC3usp16r7g2yF-UJqTAn7RI,10729
+ vision_agent-0.2.101.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
+ vision_agent-0.2.101.dist-info/RECORD,,