vision-agent 1.1.15__py3-none-any.whl → 1.1.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/.sim_tools/df.csv +12 -12
- vision_agent/.sim_tools/embs.npy +0 -0
- vision_agent/tools/__init__.py +2 -2
- vision_agent/tools/tools.py +55 -64
- {vision_agent-1.1.15.dist-info → vision_agent-1.1.17.dist-info}/METADATA +3 -8
- {vision_agent-1.1.15.dist-info → vision_agent-1.1.17.dist-info}/RECORD +8 -8
- {vision_agent-1.1.15.dist-info → vision_agent-1.1.17.dist-info}/WHEEL +0 -0
- {vision_agent-1.1.15.dist-info → vision_agent-1.1.17.dist-info}/licenses/LICENSE +0 -0
vision_agent/.sim_tools/df.csv
CHANGED
@@ -388,8 +388,8 @@ desc,doc,name
 -------
 >>> document_qa(image, question)
 'The answer to the question ...'",document_qa
-"'
-'
+"'paddle_ocr' extracts text from an image. It returns a list of detected text, bounding boxes with normalized coordinates, and confidence scores. The results are sorted from top-left to bottom right.","paddle_ocr(image: numpy.ndarray) -> List[Dict[str, Any]]:
+'paddle_ocr' extracts text from an image. It returns a list of detected text, bounding
 boxes with normalized coordinates, and confidence scores. The results are sorted
 from top-left to bottom right.

@@ -402,10 +402,10 @@ desc,doc,name

 Example
 -------
->>>
+>>> paddle_ocr(image)
 [
 {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
-]",
+]",paddle_ocr
 "'gemini_image_generation' performs either image inpainting given an image and text prompt, or image generation given a prompt. It can be used to edit parts of an image or the entire image according to the prompt given.","gemini_image_generation(prompt: str, image: Optional[numpy.ndarray] = None) -> numpy.ndarray:
 'gemini_image_generation' performs either image inpainting given an image and text prompt, or image generation given a prompt.
 It can be used to edit parts of an image or the entire image according to the prompt given.

@@ -484,26 +484,26 @@ desc,doc,name
 {'start_time': 2, 'end_time': 4, 'location': 'Outdoor area', 'description': 'A person approaches a white bicycle parked in a row. The person then swings their leg over the bike and gets on it.', 'label': 0},
 {'start_time': 10, 'end_time': 13, 'location': 'Outdoor area', 'description': 'A person gets off a white bicycle parked in a row. The person swings their leg over the bike and dismounts.', 'label': 1},
 ]",agentic_activity_recognition
-'
-'
-depth
-
+"'depth_pro' is a tool that runs the Apple DepthPro model to generate a depth map from a given RGB image. The returned depth map has the same dimensions as the input image, with each pixel indicating the distance from the camera in meters.","depth_pro(image: numpy.ndarray) -> numpy.ndarray:
+'depth_pro' is a tool that runs the Apple DepthPro model to generate a
+depth map from a given RGB image. The returned depth map has the same dimensions
+as the input image, with each pixel indicating the distance from the camera in meters.

 Parameters:
 image (np.ndarray): The image to used to generate depth image

 Returns:
-np.ndarray: A
-
+np.ndarray: A depth map with float32 pixel values that represent
+the distance from the camera in meters.

 Example
 -------
->>>
+>>> depth_pro(image)
 array([[0, 0, 0, ..., 0, 0, 0],
 [0, 20, 24, ..., 0, 100, 103],
 ...,
 [10, 11, 15, ..., 202, 202, 205],
-[10, 10, 10, ..., 200, 200, 200]], dtype=
+[10, 10, 10, ..., 200, 200, 200]], dtype=np.float32),",depth_pro
 'generate_pose_image' is a tool that generates a open pose bone/stick image from a given RGB image. The returned bone image is RGB with the pose amd keypoints colored and background as black.,"generate_pose_image(image: numpy.ndarray) -> numpy.ndarray:
 'generate_pose_image' is a tool that generates a open pose bone/stick image from
 a given RGB image. The returned bone image is RGB with the pose amd keypoints colored
vision_agent/.sim_tools/embs.npy
CHANGED
Binary file
vision_agent/tools/__init__.py
CHANGED
@@ -21,7 +21,7 @@ from .tools import (
 countgd_sam2_visual_instance_segmentation,
 countgd_visual_object_detection,
 custom_object_detection,
-
+depth_pro,
 detr_segmentation,
 document_extraction,
 document_qa,
@@ -42,7 +42,7 @@ from .tools import (
 glee_sam2_video_tracking,
 load_image,
 minimum_distance,
-
+paddle_ocr,
 od_sam2_video_tracking,
 overlay_bounding_boxes,
 overlay_heat_map,
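With these export changes, `paddle_ocr` and `depth_pro` become importable directly from `vision_agent.tools`. A minimal usage sketch under that assumption (the file path and the API-key environment variable are illustrative, not taken from this diff):

```python
# Minimal sketch of the tools newly exported in 1.1.17.
# Assumes a VisionAgent API key is configured (e.g. via an environment variable)
# and that "photo.jpg" is a placeholder local file.
from vision_agent.tools import load_image, paddle_ocr, depth_pro

image = load_image("photo.jpg")
text_detections = paddle_ocr(image)   # [{'label': ..., 'bbox': [...], 'score': ...}, ...]
depth_map = depth_pro(image)          # float32 depth map in meters, same H x W as the input
```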
vision_agent/tools/tools.py
CHANGED
@@ -4,7 +4,7 @@ import logging
 import os
 import tempfile
 import urllib.request
-from base64 import b64encode
+from base64 import b64encode, b64decode
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from importlib import resources
 from pathlib import Path
@@ -15,7 +15,6 @@ import time
 import cv2
 import numpy as np
 import pandas as pd
-import requests
 from IPython.display import display
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener # type: ignore
@@ -2034,8 +2033,8 @@ def qwen2_vl_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
 return cast(str, data)


-def
-"""'
+def paddle_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+"""'paddle_ocr' extracts text from an image. It returns a list of detected text, bounding
 boxes with normalized coordinates, and confidence scores. The results are sorted
 from top-left to bottom right.

@@ -2048,51 +2047,33 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]:

 Example
 -------
->>>
+>>> paddle_ocr(image)
 [
 {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
 ]
 """

-
-image_size = pil_image.size[::-1]
+image_size = image.shape[:2]
 if image_size[0] < 1 or image_size[1] < 1:
 return []
-
-
-
-
-
-
-
-
-
-headers={"contentType": "multipart/form-data", "apikey": _API_KEY},
-)
-
-if res.status_code != 200:
-raise ValueError(f"OCR request failed with status code {res.status_code}")
-
-data = res.json()
-output = []
-for det in data[0]:
-label = det["text"]
-box = [
-det["location"][0]["x"],
-det["location"][0]["y"],
-det["location"][2]["x"],
-det["location"][2]["y"],
-]
-box = normalize_bbox(box, image_size)
-output.append({"label": label, "bbox": box, "score": round(det["score"], 2)})
+buffer_bytes = numpy_to_bytes(image)
+files = [("image", buffer_bytes)]
+
+res = send_inference_request(
+payload={"function_name": "paddle-ocr"},
+endpoint_name="paddle-ocr",
+files=files,
+v2=True,
+)

 _display_tool_trace(
-
+paddle_ocr.__name__,
 {},
-
-
+res,
+files,
 )
-
+
+return sorted(res, key=lambda x: (x["bbox"][1], x["bbox"][0]))


 def claude35_text_extraction(image: np.ndarray) -> str:
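Based on the docstring and return shape above, a caller-side sketch of consuming `paddle_ocr` results might look like the following (the input path is a placeholder; the tool itself sends a `paddle-ocr` inference request and returns detections sorted top-left to bottom-right):

```python
import numpy as np
from vision_agent.tools import load_image, paddle_ocr

image: np.ndarray = load_image("receipt.jpg")  # placeholder input image
for det in paddle_ocr(image):
    # Each detection: text label, normalized [x_min, y_min, x_max, y_max] box, confidence score.
    print(f"{det['score']:.2f}  {det['bbox']}  {det['label']}")
```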
@@ -2370,7 +2351,12 @@ def agentic_activity_recognition(
 buffer_bytes = frames_to_bytes(frames, fps=fps)
 files = [("video", buffer_bytes)]

-payload = {
+payload = {
+"prompt": prompt,
+"specificity": specificity,
+"with_audio": with_audio,
+"function_name": "agentic_activity_recognition",
+}

 response = send_inference_request(
 payload=payload, endpoint_name="activity-recognition", files=files, v2=True
@@ -2529,48 +2515,53 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
 return return_data


-def
-
-
-
+def depth_pro(
+image: np.ndarray,
+) -> np.ndarray:
+"""'depth_pro' is a tool that runs the Apple DepthPro model to generate a
+depth map from a given RGB image. The returned depth map has the same dimensions
+as the input image, with each pixel indicating the distance from the camera in meters.

 Parameters:
 image (np.ndarray): The image to used to generate depth image

 Returns:
-np.ndarray: A
-
+np.ndarray: A depth map with float32 pixel values that represent
+the distance from the camera in meters.

 Example
 -------
->>>
+>>> depth_pro(image)
 array([[0, 0, 0, ..., 0, 0, 0],
 [0, 20, 24, ..., 0, 100, 103],
 ...,
 [10, 11, 15, ..., 202, 202, 205],
-[10, 10, 10, ..., 200, 200, 200]], dtype=
+[10, 10, 10, ..., 200, 200, 200]], dtype=np.float32),
 """
-if image.shape[0] < 1 or image.shape[1] < 1:
-raise ValueError(f"Image is empty, image shape: {image.shape}")

-
-
-
-
-
+image_size = image.shape[:2]
+if image_size[0] < 1 or image_size[1] < 1:
+return np.empty(0)
+buffer_bytes = numpy_to_bytes(image)
+files = [("image", buffer_bytes)]

-
-
-
-
+detections = send_inference_request(
+payload={"function_name": "depth-pro"},
+endpoint_name="depth-pro",
+files=files,
+v2=True,
 )
-
+
+depth_bytes = b64decode(detections["depth"])
+depth_map_np = np.frombuffer(depth_bytes, dtype=np.float32).reshape(image_size)
+
 _display_tool_trace(
-
+depth_pro.__name__,
 {},
-
-
+response=detections,
+files=files,
 )
+
 return depth_map_np

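Since `depth_pro` now returns the decoded float32 depth map directly (base64-decoded and reshaped to the input's height and width), downstream code can treat it as an ordinary NumPy array. A small illustrative sketch; the 3-meter threshold and the file path are arbitrary examples, not part of the package:

```python
import numpy as np
from vision_agent.tools import load_image, depth_pro

image = load_image("scene.jpg")   # placeholder input image
depth = depth_pro(image)          # np.float32 array of per-pixel distances in meters

# Illustrative post-processing: flag pixels closer than 3 meters.
near_mask = depth < 3.0
print(f"median depth: {np.median(depth):.2f} m, near pixels: {near_mask.mean():.1%}")
```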
@@ -3564,12 +3555,12 @@ FUNCTION_TOOLS = [
 claude35_text_extraction,
 agentic_document_extraction,
 document_qa,
-
+paddle_ocr,
 gemini_image_generation,
 qwen25_vl_images_vqa,
 qwen25_vl_video_vqa,
 agentic_activity_recognition,
-
+depth_pro,
 generate_pose_image,
 vit_nsfw_classification,
 siglip_classification,
{vision_agent-1.1.15.dist-info → vision_agent-1.1.17.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vision-agent
-Version: 1.1.15
+Version: 1.1.17
 Summary: Toolset for Vision Agent
 Project-URL: Homepage, https://landing.ai
 Project-URL: repository, https://github.com/landing-ai/vision-agent
@@ -56,10 +56,8 @@ _Prompt with an image/video → Get runnable vision code → Build Visual AI App
 </div>

 <p align="center">
-<a href="https://va.landing.ai/agent" target="_blank"><strong>Web App</strong></a> ·
 <a href="https://discord.com/invite/RVcW3j9RgR" target="_blank"><strong>Discord</strong></a> ·
 <a href="https://landing.ai/blog/visionagent-an-agentic-approach-for-complex-visual-reasoning" target="_blank"><strong>Architecture</strong></a> ·
-<a href="https://support.landing.ai/docs/visionagent" target="_blank"><strong>Docs</strong></a> ·
 <a href="https://www.youtube.com/playlist?list=PLrKGAzovU85fvo22OnVtPl90mxBygIf79" target="_blank"><strong>YouTube</strong></a>
 </p>

@@ -67,12 +65,11 @@ _Prompt with an image/video → Get runnable vision code → Build Visual AI App

 **VisionAgent** is the Visual AI pilot from LandingAI. Give it a prompt and an image, and it automatically picks the right vision models and outputs ready‑to‑run code—letting you build vision‑enabled apps in minutes.

-Prefer full control? Install the library and run VisionAgent locally. Just want to dive in quickly? Use the [VisionAgent web app](https://va.landing.ai/).

 ## Steps to Set Up the Library

 ### Get Your VisionAgent API Key
-The most important step is to [
+The most important step is to [create an account](https://va.landing.ai/home) and obtain your [API key](https://va.landing.ai/settings/api-key).

 ### Other Prerequisites
 - Python version 3.9 or higher
@@ -82,9 +79,8 @@ The most important step is to [signup](https://va.landing.ai/agent) and obtain y
 ### Why do I need Anthropic and Google API Keys?
 VisionAgent uses models from Anthropic and Google to respond to prompts and generate code.

-When you run the web-based version of VisionAgent, the app uses the LandingAI API keys to access these models.

-When you run VisionAgent
+When you run VisionAgent, the app will need to use your API keys to access the Anthropic and Google models. This ensures that any projects you run with VisionAgent aren’t limited by the rate limits in place with the LandingAI accounts, and it also prevents many users from overloading the LandingAI rate limits.

 Anthropic and Google each have their own rate limits and paid tiers. Refer to their documentation and pricing to learn more.

@@ -271,5 +267,4 @@ with this code:
 ## Resources
 - [Discord](https://discord.com/invite/RVcW3j9RgR): Check out our community of VisionAgent users to share use cases and learn about updates.
 - [VisionAgent Library Docs](https://landing-ai.github.io/vision-agent/): Learn how to use this library.
-- [VisionAgent Web App Docs](https://support.landing.ai/docs/agentic-ai): Learn how to use the web-based version of VisionAgent.
 - [Video Tutorials](https://www.youtube.com/playlist?list=PLrKGAzovU85fvo22OnVtPl90mxBygIf79): Watch the latest video tutorials to see how VisionAgent is used in a variety of use cases.
{vision_agent-1.1.15.dist-info → vision_agent-1.1.17.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
 vision_agent/__init__.py,sha256=EAb4-f9iyuEYkBrX4ag1syM8Syx8118_t0R6_C34M9w,57
-vision_agent/.sim_tools/df.csv,sha256=
-vision_agent/.sim_tools/embs.npy,sha256=
+vision_agent/.sim_tools/df.csv,sha256=gheT5OXu68o0AfjV1623GzbD-T2csZ7GnkBbCMaVl8c,41188
+vision_agent/.sim_tools/embs.npy,sha256=OLj2rt4aBFze2HIf9bQ3yn0-_3RVPecrHWxm2CWvgn0,245888
 vision_agent/agent/README.md,sha256=3XSPG_VO7-6y6P8COvcgSSonWj5uvfgvfmOkBpfKK8Q,5527
 vision_agent/agent/__init__.py,sha256=_-nGLHhRTLViXxBSb9D4OwLTqk9HXKPEkTBkvK8c7OU,206
 vision_agent/agent/agent.py,sha256=o1Zuhl6h2R7uVwvUur0Aj38kak8U08plfeFWPst_ErM,1576
@@ -26,11 +26,11 @@ vision_agent/models/lmm_types.py,sha256=v04h-NjbczHOIN8UWa1vvO5-1BDuZ4JQhD2mge1c
 vision_agent/models/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
 vision_agent/sim/__init__.py,sha256=Aouz6HEPPTYcLxR5_0fTYCL1OvPKAH1RMWAF90QXAlA,135
 vision_agent/sim/sim.py,sha256=WQY_x9A4VT647qGDBScJ3R8_Iv0aoYLHTgwcQSCXwv4,10059
-vision_agent/tools/__init__.py,sha256=
+vision_agent/tools/__init__.py,sha256=WfynKGn0Zl2GPkyFhzA2YhGGC0Dtb1oei4Hk_GdSY1c,2476
 vision_agent/tools/meta_tools.py,sha256=9iJilpGYEiXW0nYPTYAWHa7l23wGN8IM5KbE7mWDOT0,6798
 vision_agent/tools/planner_tools.py,sha256=iQWtTgXdomn0IWrbmvXXM-y8Q_RSEOxyP04HIRLrgWI,19576
 vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
-vision_agent/tools/tools.py,sha256=
+vision_agent/tools/tools.py,sha256=lndSG8xrIWcs6Rpe1-Jq44niUDXQnWlYfGP2B1YjpI0,124216
 vision_agent/utils/__init__.py,sha256=mANUs_84VL-3gpZbXryvV2mWU623eWnRlJCSUHtMjuw,122
 vision_agent/utils/agent.py,sha256=2ifTP5QElItnr4YHOJR6L5P1PUzV0GhChTTqVxuVyQg,15153
 vision_agent/utils/exceptions.py,sha256=zis8smCbdEylBVZBTVfEUfAh7Rb7cWV3MSPambu6FsQ,1837
@@ -40,7 +40,7 @@ vision_agent/utils/tools.py,sha256=Days0dETPRQLSDamMKPnXFsc5g5IKX9QJcPPNmSHNdM,8
 vision_agent/utils/tools_doc.py,sha256=PKcXXbJktiuPi9q6Q1zXzFx24Dh229SNgWBDtZ2fQSQ,2730
 vision_agent/utils/video.py,sha256=rjsQ1sKKisaQ6AVjJz0zd_G4g-ovRweS_rs4JEhenoI,5340
 vision_agent/utils/video_tracking.py,sha256=DZLFpNCuzuPJQzbQoVNcp-m4dKxgiKdCNM5QTh_zURE,12245
-vision_agent-1.1.
-vision_agent-1.1.
-vision_agent-1.1.
-vision_agent-1.1.
+vision_agent-1.1.17.dist-info/METADATA,sha256=LDH3i8vb2g6aqoEuRSPHdigP1bmhBjxZTQ37-cD9RlA,12078
+vision_agent-1.1.17.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+vision_agent-1.1.17.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+vision_agent-1.1.17.dist-info/RECORD,,
{vision_agent-1.1.15.dist-info → vision_agent-1.1.17.dist-info}/WHEEL
File without changes
{vision_agent-1.1.15.dist-info → vision_agent-1.1.17.dist-info}/licenses/LICENSE
File without changes