vision-agent 0.2.185__py3-none-any.whl → 0.2.187__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- vision_agent/tools/tools.py +23 -15
- vision_agent/utils/video.py +9 -0
- {vision_agent-0.2.185.dist-info → vision_agent-0.2.187.dist-info}/METADATA +3 -3
- {vision_agent-0.2.185.dist-info → vision_agent-0.2.187.dist-info}/RECORD +6 -6
- {vision_agent-0.2.185.dist-info → vision_agent-0.2.187.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.185.dist-info → vision_agent-0.2.187.dist-info}/WHEEL +0 -0
vision_agent/tools/tools.py
CHANGED
@@ -1057,23 +1057,25 @@ def video_temporal_localization(
|
|
1057
1057
|
prompt: str,
|
1058
1058
|
frames: List[np.ndarray],
|
1059
1059
|
model: str = "qwen2vl",
|
1060
|
-
chunk_length: Optional[float] = None,
|
1061
|
-
chunk_length_seconds: Optional[float] = None,
|
1062
1060
|
chunk_length_frames: Optional[int] = 2,
|
1063
1061
|
) -> List[float]:
|
1064
|
-
"""'video_temporal_localization'
|
1065
|
-
|
1062
|
+
"""'video_temporal_localization' will run qwen2vl on each chunk_length_frames
|
1063
|
+
value selected for the video. It can detect multiple objects independently per
|
1064
|
+
chunk_length_frames given a text prompt such as a referring expression
|
1065
|
+
but does not track objects across frames.
|
1066
|
+
It returns a list of floats with a value of 1.0 if the objects are found in a given
|
1067
|
+
chunk_length_frames of the video.
|
1066
1068
|
|
1067
1069
|
Parameters:
|
1068
1070
|
prompt (str): The question about the video
|
1069
1071
|
frames (List[np.ndarray]): The reference frames used for the question
|
1070
|
-
model (str): The model to use for the inference. Valid values are
|
1071
|
-
|
1072
|
-
chunk_length_seconds (Optional[float]): alternative length for chunk in seconds
|
1072
|
+
model (str): The model to use for the inference. Valid values are
|
1073
|
+
'qwen2vl', 'gpt4o', 'internlm-xcomposer'
|
1073
1074
|
chunk_length_frames (Optional[int]): length of each chunk in frames
|
1074
1075
|
|
1075
1076
|
Returns:
|
1076
|
-
List[float]: A list of floats with a value of 1.0 if the
|
1077
|
+
List[float]: A list of floats with a value of 1.0 if the objects to be found
|
1078
|
+
are present in the chunk_length_frames of the video.
|
1077
1079
|
|
1078
1080
|
Example
|
1079
1081
|
-------
|
@@ -1088,10 +1090,6 @@ def video_temporal_localization(
|
|
1088
1090
|
"model": model,
|
1089
1091
|
"function_name": "video_temporal_localization",
|
1090
1092
|
}
|
1091
|
-
if chunk_length is not None:
|
1092
|
-
payload["chunk_length"] = chunk_length
|
1093
|
-
if chunk_length_seconds is not None:
|
1094
|
-
payload["chunk_length_seconds"] = chunk_length_seconds
|
1095
1093
|
if chunk_length_frames is not None:
|
1096
1094
|
payload["chunk_length_frames"] = chunk_length_frames
|
1097
1095
|
|
@@ -1790,9 +1788,8 @@ def flux_image_inpainting(
|
|
1790
1788
|
where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.
|
1791
1789
|
|
1792
1790
|
Returns:
|
1793
|
-
np.ndarray:
|
1794
|
-
|
1795
|
-
with values ranging from 0 to 255.
|
1791
|
+
np.ndarray: The generated image(s) as a numpy array in RGB format with values
|
1792
|
+
ranging from 0 to 255.
|
1796
1793
|
|
1797
1794
|
-------
|
1798
1795
|
Example:
|
@@ -1874,6 +1871,9 @@ def extract_frames_and_timestamps(
|
|
1874
1871
|
>>> extract_frames("path/to/video.mp4")
|
1875
1872
|
[{"frame": np.ndarray, "timestamp": 0.0}, ...]
|
1876
1873
|
"""
|
1874
|
+
if isinstance(fps, str):
|
1875
|
+
# fps could be a string when it's passed in from a web endpoint deployment
|
1876
|
+
fps = float(fps)
|
1877
1877
|
|
1878
1878
|
def reformat(
|
1879
1879
|
frames_and_timestamps: List[Tuple[np.ndarray, float]],
|
@@ -1937,6 +1937,7 @@ def save_json(data: Any, file_path: str) -> None:
|
|
1937
1937
|
return bool(obj)
|
1938
1938
|
return json.JSONEncoder.default(self, obj)
|
1939
1939
|
|
1940
|
+
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
|
1940
1941
|
with open(file_path, "w") as f:
|
1941
1942
|
json.dump(data, f, cls=NumpyEncoder)
|
1942
1943
|
|
@@ -1979,6 +1980,7 @@ def save_image(image: np.ndarray, file_path: str) -> None:
|
|
1979
1980
|
-------
|
1980
1981
|
>>> save_image(image)
|
1981
1982
|
"""
|
1983
|
+
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
|
1982
1984
|
from IPython.display import display
|
1983
1985
|
|
1984
1986
|
if not isinstance(image, np.ndarray) or (
|
@@ -2009,6 +2011,9 @@ def save_video(
|
|
2009
2011
|
>>> save_video(frames)
|
2010
2012
|
"/tmp/tmpvideo123.mp4"
|
2011
2013
|
"""
|
2014
|
+
if isinstance(fps, str):
|
2015
|
+
# fps could be a string when it's passed in from a web endpoint deployment
|
2016
|
+
fps = float(fps)
|
2012
2017
|
if fps <= 0:
|
2013
2018
|
raise ValueError(f"fps must be greater than 0 got {fps}")
|
2014
2019
|
|
@@ -2025,6 +2030,8 @@ def save_video(
|
|
2025
2030
|
output_video_path = tempfile.NamedTemporaryFile(
|
2026
2031
|
delete=False, suffix=".mp4"
|
2027
2032
|
).name
|
2033
|
+
else:
|
2034
|
+
Path(output_video_path).parent.mkdir(parents=True, exist_ok=True)
|
2028
2035
|
|
2029
2036
|
output_video_path = video_writer(frames, fps, output_video_path)
|
2030
2037
|
_save_video_to_result(output_video_path)
|
@@ -2351,6 +2358,7 @@ FUNCTION_TOOLS = [
|
|
2351
2358
|
closest_box_distance,
|
2352
2359
|
qwen2_vl_images_vqa,
|
2353
2360
|
qwen2_vl_video_vqa,
|
2361
|
+
video_temporal_localization,
|
2354
2362
|
]
|
2355
2363
|
|
2356
2364
|
UTIL_TOOLS = [
|
vision_agent/utils/video.py
CHANGED
@@ -58,6 +58,9 @@ def video_writer(
|
|
58
58
|
fps: float = _DEFAULT_INPUT_FPS,
|
59
59
|
filename: Optional[str] = None,
|
60
60
|
) -> str:
|
61
|
+
if isinstance(fps, str):
|
62
|
+
# fps could be a string when it's passed in from a web endpoint deployment
|
63
|
+
fps = float(fps)
|
61
64
|
if filename is None:
|
62
65
|
filename = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
|
63
66
|
container = av.open(filename, mode="w")
|
@@ -92,6 +95,9 @@ def frames_to_bytes(
|
|
92
95
|
fps: the frames per second of the video
|
93
96
|
file_ext: the file extension of the video file
|
94
97
|
"""
|
98
|
+
if isinstance(fps, str):
|
99
|
+
# fps could be a string when it's passed in from a web endpoint deployment
|
100
|
+
fps = float(fps)
|
95
101
|
with tempfile.NamedTemporaryFile(delete=True, suffix=file_ext) as temp_file:
|
96
102
|
video_writer(frames, fps, temp_file.name)
|
97
103
|
|
@@ -120,6 +126,9 @@ def extract_frames_from_video(
|
|
120
126
|
from the start of the video. E.g. 12.125 means 12.125 seconds from the start of
|
121
127
|
the video. The frames are sorted by the timestamp in ascending order.
|
122
128
|
"""
|
129
|
+
if isinstance(fps, str):
|
130
|
+
# fps could be a string when it's passed in from a web endpoint deployment
|
131
|
+
fps = float(fps)
|
123
132
|
|
124
133
|
cap = cv2.VideoCapture(video_uri)
|
125
134
|
orig_fps = cap.get(cv2.CAP_PROP_FPS)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 0.2.185
|
3
|
+
Version: 0.2.187
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -63,10 +63,10 @@ code to solve the task for them. Check out our discord for updates and roadmaps!
|
|
63
63
|
## Table of Contents
|
64
64
|
- [🚀Quick Start](#quick-start)
|
65
65
|
- [📚Documentation](#documentation)
|
66
|
-
- [🔍🤖VisionAgent](#
|
66
|
+
- [🔍🤖VisionAgent](#visionagent-basic-usage)
|
67
67
|
- [🛠️Tools](#tools)
|
68
68
|
- [🤖LMMs](#lmms)
|
69
|
-
- [💻🤖VisionAgent Coder](#
|
69
|
+
- [💻🤖VisionAgent Coder](#visionagent-coder)
|
70
70
|
- [🏗️Additional Backends](#additional-backends)
|
71
71
|
|
72
72
|
## Quick Start
|
@@ -20,7 +20,7 @@ vision_agent/tools/__init__.py,sha256=KVP4_6qxOb2lpFdQgQtyDfdkMLL1O6wVZNK19MXp-x
|
|
20
20
|
vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
|
21
21
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
22
22
|
vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
|
23
|
-
vision_agent/tools/tools.py,sha256=
|
23
|
+
vision_agent/tools/tools.py,sha256=rsvQ7cz2xiGiJZme8yb-r-omSarhtC0zapUSt3_pmuo,83541
|
24
24
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
25
25
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
26
26
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -28,8 +28,8 @@ vision_agent/utils/execute.py,sha256=2sIQn45llOENMyrKu3TPINVRLLbOvvZ6SVHFCB9MQUo
|
|
28
28
|
vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwdn6sk,11303
|
29
29
|
vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
|
30
30
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
31
|
-
vision_agent/utils/video.py,sha256=
|
32
|
-
vision_agent-0.2.185.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
33
|
-
vision_agent-0.2.
|
34
|
-
vision_agent-0.2.185.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
35
|
-
vision_agent-0.2.185.dist-info/RECORD,,
|
31
|
+
vision_agent/utils/video.py,sha256=tRcGp4vEnaDycigL1hBO9k0FBPtDH35fCQciVr9GqYI,6013
|
32
|
+
vision_agent-0.2.187.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
33
|
+
vision_agent-0.2.187.dist-info/METADATA,sha256=SYs-27G_7CqSSP8tNatzvnca9BTA-gupcRSVSezmxsw,18328
|
34
|
+
vision_agent-0.2.187.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
35
|
+
vision_agent-0.2.187.dist-info/RECORD,,
|
File without changes
|
File without changes
|