vision-agent 1.1.16__py3-none-any.whl → 1.1.17__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
--- a/vision_agent/.sim_tools/df.csv
+++ b/vision_agent/.sim_tools/df.csv
@@ -388,8 +388,8 @@ desc,doc,name
     -------
     >>> document_qa(image, question)
     'The answer to the question ...'",document_qa
-"'ocr' extracts text from an image. It returns a list of detected text, bounding boxes with normalized coordinates, and confidence scores. The results are sorted from top-left to bottom right.","ocr(image: numpy.ndarray) -> List[Dict[str, Any]]:
-'ocr' extracts text from an image. It returns a list of detected text, bounding
+"'paddle_ocr' extracts text from an image. It returns a list of detected text, bounding boxes with normalized coordinates, and confidence scores. The results are sorted from top-left to bottom right.","paddle_ocr(image: numpy.ndarray) -> List[Dict[str, Any]]:
+'paddle_ocr' extracts text from an image. It returns a list of detected text, bounding
 boxes with normalized coordinates, and confidence scores. The results are sorted
 from top-left to bottom right.
 
@@ -402,10 +402,10 @@ desc,doc,name
 
 Example
 -------
->>> ocr(image)
+>>> paddle_ocr(image)
 [
     {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
-]",ocr
+]",paddle_ocr
 "'gemini_image_generation' performs either image inpainting given an image and text prompt, or image generation given a prompt. It can be used to edit parts of an image or the entire image according to the prompt given.","gemini_image_generation(prompt: str, image: Optional[numpy.ndarray] = None) -> numpy.ndarray:
 'gemini_image_generation' performs either image inpainting given an image and text prompt, or image generation given a prompt.
 It can be used to edit parts of an image or the entire image according to the prompt given.
@@ -484,26 +484,26 @@ desc,doc,name
     {'start_time': 2, 'end_time': 4, 'location': 'Outdoor area', 'description': 'A person approaches a white bicycle parked in a row. The person then swings their leg over the bike and gets on it.', 'label': 0},
     {'start_time': 10, 'end_time': 13, 'location': 'Outdoor area', 'description': 'A person gets off a white bicycle parked in a row. The person swings their leg over the bike and dismounts.', 'label': 1},
 ]",agentic_activity_recognition
-'depth_anything_v2' is a tool that runs depth anything v2 model to generate a depth image from a given RGB image. The returned depth image is monochrome and represents depth values as pixel intensities with pixel values ranging from 0 to 255.,"depth_anything_v2(image: numpy.ndarray) -> numpy.ndarray:
-'depth_anything_v2' is a tool that runs depth anything v2 model to generate a
-depth image from a given RGB image. The returned depth image is monochrome and
-represents depth values as pixel intensities with pixel values ranging from 0 to 255.
+"'depth_pro' is a tool that runs the Apple DepthPro model to generate a depth map from a given RGB image. The returned depth map has the same dimensions as the input image, with each pixel indicating the distance from the camera in meters.","depth_pro(image: numpy.ndarray) -> numpy.ndarray:
+'depth_pro' is a tool that runs the Apple DepthPro model to generate a
+depth map from a given RGB image. The returned depth map has the same dimensions
+as the input image, with each pixel indicating the distance from the camera in meters.
 
 Parameters:
     image (np.ndarray): The image to used to generate depth image
 
 Returns:
-    np.ndarray: A grayscale depth image with pixel values ranging from 0 to 255
-    where high values represent closer objects and low values further.
+    np.ndarray: A depth map with float32 pixel values that represent
+    the distance from the camera in meters.
 
 Example
 -------
->>> depth_anything_v2(image)
+>>> depth_pro(image)
 array([[0, 0, 0, ..., 0, 0, 0],
     [0, 20, 24, ..., 0, 100, 103],
     ...,
     [10, 11, 15, ..., 202, 202, 205],
-    [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),",depth_anything_v2
+    [10, 10, 10, ..., 200, 200, 200]], dtype=np.float32),",depth_pro
 'generate_pose_image' is a tool that generates a open pose bone/stick image from a given RGB image. The returned bone image is RGB with the pose amd keypoints colored and background as black.,"generate_pose_image(image: numpy.ndarray) -> numpy.ndarray:
 'generate_pose_image' is a tool that generates a open pose bone/stick image from
 a given RGB image. The returned bone image is RGB with the pose amd keypoints colored
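
These hunks update vision_agent/.sim_tools/df.csv, the tool-description table (columns desc, doc, name) that backs tool similarity search; the binary embs.npy below is presumably the matching embedding file. A minimal sketch for checking the renamed rows after upgrading, assuming the table is readable in place from the installed wheel (the inspection code is illustrative, not part of the package):

    import pandas as pd
    from importlib import resources

    # Open the tool table shipped inside the installed package.
    with resources.files("vision_agent").joinpath(".sim_tools/df.csv").open() as f:
        df = pd.read_csv(f)

    # After the upgrade the renamed tools appear under their new names.
    print(df.loc[df["name"].isin(["paddle_ocr", "depth_pro"]), ["name", "desc"]])
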
Binary file vision_agent/.sim_tools/embs.npy differs
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -21,7 +21,7 @@ from .tools import (
     countgd_sam2_visual_instance_segmentation,
     countgd_visual_object_detection,
     custom_object_detection,
-    depth_anything_v2,
+    depth_pro,
     detr_segmentation,
     document_extraction,
     document_qa,
@@ -42,7 +42,7 @@ from .tools import (
     glee_sam2_video_tracking,
     load_image,
     minimum_distance,
-    ocr,
+    paddle_ocr,
     od_sam2_video_tracking,
     overlay_bounding_boxes,
     overlay_heat_map,
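
For downstream code, these renames are the user-visible change in 1.1.17: `ocr` becomes `paddle_ocr` and `depth_anything_v2` becomes `depth_pro`, with no compatibility aliases left in the import list. A minimal migration sketch using only names from the import list above (the file name is illustrative):

    from vision_agent.tools import load_image, paddle_ocr, depth_pro

    image = load_image("document.jpg")  # hypothetical input file

    text_regions = paddle_ocr(image)   # 1.1.16: ocr(image)
    depth_map = depth_pro(image)       # 1.1.16: depth_anything_v2(image)
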
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -4,7 +4,7 @@ import logging
 import os
 import tempfile
 import urllib.request
-from base64 import b64encode
+from base64 import b64encode, b64decode
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from importlib import resources
 from pathlib import Path
@@ -15,7 +15,6 @@ import time
 import cv2
 import numpy as np
 import pandas as pd
-import requests
 from IPython.display import display
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
@@ -2034,8 +2033,8 @@ def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
     return cast(str, data)
 
 
-def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
-    """'ocr' extracts text from an image. It returns a list of detected text, bounding
+def paddle_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+    """'paddle_ocr' extracts text from an image. It returns a list of detected text, bounding
     boxes with normalized coordinates, and confidence scores. The results are sorted
     from top-left to bottom right.
 
@@ -2048,51 +2047,33 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:
 
     Example
     -------
-    >>> ocr(image)
+    >>> paddle_ocr(image)
     [
         {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
     ]
     """
 
-    pil_image = Image.fromarray(image).convert("RGB")
-    image_size = pil_image.size[::-1]
+    image_size = image.shape[:2]
     if image_size[0] < 1 or image_size[1] < 1:
         return []
-    image_buffer = io.BytesIO()
-    pil_image.save(image_buffer, format="PNG")
-    buffer_bytes = image_buffer.getvalue()
-    image_buffer.close()
-
-    res = requests.post(
-        _OCR_URL,
-        files={"images": buffer_bytes},
-        data={"language": "en"},
-        headers={"contentType": "multipart/form-data", "apikey": _API_KEY},
-    )
-
-    if res.status_code != 200:
-        raise ValueError(f"OCR request failed with status code {res.status_code}")
-
-    data = res.json()
-    output = []
-    for det in data[0]:
-        label = det["text"]
-        box = [
-            det["location"][0]["x"],
-            det["location"][0]["y"],
-            det["location"][2]["x"],
-            det["location"][2]["y"],
-        ]
-        box = normalize_bbox(box, image_size)
-        output.append({"label": label, "bbox": box, "score": round(det["score"], 2)})
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+
+    res = send_inference_request(
+        payload={"function_name": "paddle-ocr"},
+        endpoint_name="paddle-ocr",
+        files=files,
+        v2=True,
+    )
 
     _display_tool_trace(
-        ocr.__name__,
+        paddle_ocr.__name__,
         {},
-        data,
-        cast(List[Tuple[str, bytes]], [("image", buffer_bytes)]),
+        res,
+        files,
     )
-    return sorted(output, key=lambda x: (x["bbox"][1], x["bbox"][0]))
+
+    return sorted(res, key=lambda x: (x["bbox"][1], x["bbox"][0]))
 
 
 def claude35_text_extraction(image: np.ndarray) -> str:
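
The rewritten function now routes through `send_inference_request` against a `paddle-ocr` endpoint and returns the response sorted top-to-bottom, then left-to-right, but the documented result shape is unchanged. A sketch of consuming it (the 0.9 threshold is an arbitrary choice for illustration):

    results = paddle_ocr(image)
    # Each entry: {'label': <text>, 'bbox': [x1, y1, x2, y2] normalized, 'score': <float>},
    # already sorted top-left to bottom-right.
    lines = [r["label"] for r in results if r["score"] >= 0.9]
    print("\n".join(lines))
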
@@ -2370,7 +2351,12 @@ def agentic_activity_recognition(
     buffer_bytes = frames_to_bytes(frames, fps=fps)
     files = [("video", buffer_bytes)]
 
-    payload = {"prompt": prompt, "specificity": specificity, "with_audio": with_audio}
+    payload = {
+        "prompt": prompt,
+        "specificity": specificity,
+        "with_audio": with_audio,
+        "function_name": "agentic_activity_recognition",
+    }
 
     response = send_inference_request(
         payload=payload, endpoint_name="activity-recognition", files=files, v2=True
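
Callers of `agentic_activity_recognition` are unaffected; the new `function_name` field only travels in the request payload, matching the pattern used by the other rewritten tools. A usage sketch with the result shape taken from the df.csv example above (the prompt is illustrative, and the `frames` keyword is an assumption inferred from the function body shown here):

    events = agentic_activity_recognition(
        prompt="When does a person get on or off a bicycle?",
        frames=frames,  # List[np.ndarray] decoded from a video
    )
    for event in events:
        print(event["start_time"], event["end_time"], event["description"])
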
@@ -2529,48 +2515,53 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
     return return_data
 
 
-def depth_anything_v2(image: np.ndarray) -> np.ndarray:
-    """'depth_anything_v2' is a tool that runs depth anything v2 model to generate a
-    depth image from a given RGB image. The returned depth image is monochrome and
-    represents depth values as pixel intensities with pixel values ranging from 0 to 255.
+def depth_pro(
+    image: np.ndarray,
+) -> np.ndarray:
+    """'depth_pro' is a tool that runs the Apple DepthPro model to generate a
+    depth map from a given RGB image. The returned depth map has the same dimensions
+    as the input image, with each pixel indicating the distance from the camera in meters.
 
     Parameters:
         image (np.ndarray): The image to used to generate depth image
 
     Returns:
-        np.ndarray: A grayscale depth image with pixel values ranging from 0 to 255
-        where high values represent closer objects and low values further.
+        np.ndarray: A depth map with float32 pixel values that represent
+        the distance from the camera in meters.
 
     Example
     -------
-    >>> depth_anything_v2(image)
+    >>> depth_pro(image)
     array([[0, 0, 0, ..., 0, 0, 0],
         [0, 20, 24, ..., 0, 100, 103],
         ...,
         [10, 11, 15, ..., 202, 202, 205],
-        [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
+        [10, 10, 10, ..., 200, 200, 200]], dtype=np.float32),
     """
-    if image.shape[0] < 1 or image.shape[1] < 1:
-        raise ValueError(f"Image is empty, image shape: {image.shape}")
 
-    image_b64 = convert_to_b64(image)
-    data = {
-        "image": image_b64,
-        "function_name": "depth_anything_v2",
-    }
+    image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return np.empty(0)
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
 
-    depth_map = send_inference_request(data, "depth-anything-v2", v2=True)
-    depth_map_np = np.array(depth_map["map"])
-    depth_map_np = (depth_map_np - depth_map_np.min()) / (
-        depth_map_np.max() - depth_map_np.min()
+    detections = send_inference_request(
+        payload={"function_name": "depth-pro"},
+        endpoint_name="depth-pro",
+        files=files,
+        v2=True,
     )
-    depth_map_np = (255 * depth_map_np).astype(np.uint8)
+
+    depth_bytes = b64decode(detections["depth"])
+    depth_map_np = np.frombuffer(depth_bytes, dtype=np.float32).reshape(image_size)
+
     _display_tool_trace(
-        depth_anything_v2.__name__,
+        depth_pro.__name__,
         {},
-        depth_map,
-        image_b64,
+        response=detections,
+        files=files,
     )
+
     return depth_map_np
 
 
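Note the semantic change here, not just the rename: depth_anything_v2 returned a normalized uint8 map where high values meant closer objects, while depth_pro returns metric float32 depth where larger values mean farther. Code that displayed the old map can recover a comparable visualization from the new output; a sketch, assuming a non-degenerate depth range:

    import numpy as np

    depth_m = depth_pro(image)  # float32 meters, same HxW as the input image
    span = float(depth_m.max() - depth_m.min())
    if span > 0:
        # Invert so near pixels are bright, matching the old uint8 convention.
        vis = (255 * (1.0 - (depth_m - depth_m.min()) / span)).astype(np.uint8)
    else:
        vis = np.zeros(depth_m.shape, dtype=np.uint8)
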
@@ -3564,12 +3555,12 @@ FUNCTION_TOOLS = [
     claude35_text_extraction,
     agentic_document_extraction,
     document_qa,
-    ocr,
+    paddle_ocr,
     gemini_image_generation,
     qwen25_vl_images_vqa,
     qwen25_vl_video_vqa,
     agentic_activity_recognition,
-    depth_anything_v2,
+    depth_pro,
     generate_pose_image,
     vit_nsfw_classification,
     siglip_classification,
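
FUNCTION_TOOLS is the registry the agent selects tools from, so the renames propagate to tool selection automatically. A quick sketch to confirm an installed version exposes the new names (the import path follows this file, vision_agent/tools/tools.py, and assumes the list entries are plain functions, as the diff suggests):

    from vision_agent.tools.tools import FUNCTION_TOOLS

    names = sorted(t.__name__ for t in FUNCTION_TOOLS)
    assert "paddle_ocr" in names and "depth_pro" in names
    assert "ocr" not in names and "depth_anything_v2" not in names
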
--- vision_agent-1.1.16.dist-info/METADATA
+++ vision_agent-1.1.17.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vision-agent
-Version: 1.1.16
+Version: 1.1.17
 Summary: Toolset for Vision Agent
 Project-URL: Homepage, https://landing.ai
 Project-URL: repository, https://github.com/landing-ai/vision-agent
--- vision_agent-1.1.16.dist-info/RECORD
+++ vision_agent-1.1.17.dist-info/RECORD
@@ -1,6 +1,6 @@
 vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
-vision_agent/.sim_tools/df.csv,sha256=i732_U1KQf55UNhT-9srtZXF91XvDnfWBDdc8EqDmpw,41215
-vision_agent/.sim_tools/embs.npy,sha256=XCu3LnLS10IS3npfPMqX2VHIbDPq9iY_NPDBwq5AEj0,245888
+vision_agent/.sim_tools/df.csv,sha256=gheT5OXu68o0AfjV1623GzbD-T2csZ7GnkBbCMaVl8c,41188
+vision_agent/.sim_tools/embs.npy,sha256=OLj2rt4aBFze2HIf9bQ3yn0-_3RVPecrHWxm2CWvgn0,245888
 vision_agent/agent/README.md,sha256=3XSPG_VO7-6y6P8COvcgSSonWj5uvfgvfmOkBpfKK8Q,5527
 vision_agent/agent/__init__.py,sha256=_-nGLHhRTLViXxBSb9D4OwLTqk9HXKPEkTBkvK8c7OU,206
 vision_agent/agent/agent.py,sha256=o1Zuhl6h2R7uVwvUur0Aj38kak8U08plfeFWPst_ErM,1576
@@ -26,11 +26,11 @@ vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1c
 vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
 vision_agent/sim/sim.py,sha256=WQY_x9A4VT647qGDBScJ3R8_Iv0aoYLHTgwcQSCXwv4,10059
-vision_agent/tools/__init__.py,sha256=zf8HzjcMSgxKhtrxbqYe9hmvsfuweeDMrOc8eVA8Ya8,2477
+vision_agent/tools/__init__.py,sha256=WfynKGn0Zl2GPkyFhzA2YhGGC0Dtb1oei4Hk_GdSY1c,2476
 vision_agent/tools/meta_tools.py,sha256=9iJilpGYEiXW0nYPTYAWHa7l23wGN8IM5KbE7mWDOT0,6798
 vision_agent/tools/planner_tools.py,sha256=iQWtTgXdomn0IWrbmvXXM-y8Q_RSEOxyP04HIRLrgWI,19576
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tools.py,sha256=i9GGGu8tvo2M6O5fF4UUBTpn_Ul2KEN9mG3ZlJ95qao,124929
+vision_agent/tools/tools.py,sha256=lndSG8xrIWcs6Rpe1-Jq44niUDXQnWlYfGP2B1YjpI0,124216
 vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
 vision_agent/utils/agent.py,sha256=2ifTP5QElItnr4YHOJR6L5P1PUzV0GhChTTqVxuVyQg,15153
 vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
@@ -40,7 +40,7 @@ vision_agent/utils/tools.py,sha256=Days0dETPRQLSDamMKPnXFsc5g5IKX9QJcPPNmSHNdM,8
 vision_agent/utils/tools_doc.py,sha256=PKcXXbJktiuPi9q6Q1zXzFx24Dh229SNgWBDtZ2fQSQ,2730
 vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
 vision_agent/utils/video_tracking.py,sha256=DZLFpNCuzuPJQzbQoVNcp-m4dKxgiKdCNM5QTh_zURE,12245
-vision_agent-1.1.16.dist-info/METADATA,sha256=JMmL6rIdT1-WO6XTrjNHucAp4S_UlkjDW1dxznQJ994,12078
-vision_agent-1.1.16.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-vision_agent-1.1.16.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-vision_agent-1.1.16.dist-info/RECORD,,
+vision_agent-1.1.17.dist-info/METADATA,sha256=LDH3i8vb2g6aqoEuRSPHdigP1bmhBjxZTQ37-cD9RlA,12078
+vision_agent-1.1.17.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+vision_agent-1.1.17.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-1.1.17.dist-info/RECORD,,