vision-agent 0.2.185__py3-none-any.whl → 0.2.187__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/tools.py +23 -15
- vision_agent/utils/video.py +9 -0
- {vision_agent-0.2.185.dist-info → vision_agent-0.2.187.dist-info}/METADATA +3 -3
- {vision_agent-0.2.185.dist-info → vision_agent-0.2.187.dist-info}/RECORD +6 -6
- {vision_agent-0.2.185.dist-info → vision_agent-0.2.187.dist-info}/LICENSE +0 -0
- {vision_agent-0.2.185.dist-info → vision_agent-0.2.187.dist-info}/WHEEL +0 -0
vision_agent/tools/tools.py
CHANGED
@@ -1057,23 +1057,25 @@ def video_temporal_localization(
|
|
1057
1057
|
prompt: str,
|
1058
1058
|
frames: List[np.ndarray],
|
1059
1059
|
model: str = "qwen2vl",
|
1060
|
-
chunk_length: Optional[float] = None,
|
1061
|
-
chunk_length_seconds: Optional[float] = None,
|
1062
1060
|
chunk_length_frames: Optional[int] = 2,
|
1063
1061
|
) -> List[float]:
|
1064
|
-
"""'video_temporal_localization'
|
1065
|
-
|
1062
|
+
"""'video_temporal_localization' will run qwen2vl on each chunk_length_frames
|
1063
|
+
value selected for the video. It can detect multiple objects independently per
|
1064
|
+
chunk_length_frames given a text prompt such as a referring expression
|
1065
|
+
but does not track objects across frames.
|
1066
|
+
It returns a list of floats with a value of 1.0 if the objects are found in a given
|
1067
|
+
chunk_length_frames of the video.
|
1066
1068
|
|
1067
1069
|
Parameters:
|
1068
1070
|
prompt (str): The question about the video
|
1069
1071
|
frames (List[np.ndarray]): The reference frames used for the question
|
1070
|
-
model (str): The model to use for the inference. Valid values are
|
1071
|
-
|
1072
|
-
chunk_length_seconds (Optional[float]): alternative length for chunk in seconds
|
1072
|
+
model (str): The model to use for the inference. Valid values are
|
1073
|
+
'qwen2vl', 'gpt4o', 'internlm-xcomposer'
|
1073
1074
|
chunk_length_frames (Optional[int]): length of each chunk in frames
|
1074
1075
|
|
1075
1076
|
Returns:
|
1076
|
-
List[float]: A list of floats with a value of 1.0 if the
|
1077
|
+
List[float]: A list of floats with a value of 1.0 if the objects to be found
|
1078
|
+
are present in the chunk_length_frames of the video.
|
1077
1079
|
|
1078
1080
|
Example
|
1079
1081
|
-------
|
@@ -1088,10 +1090,6 @@ def video_temporal_localization(
|
|
1088
1090
|
"model": model,
|
1089
1091
|
"function_name": "video_temporal_localization",
|
1090
1092
|
}
|
1091
|
-
if chunk_length is not None:
|
1092
|
-
payload["chunk_length"] = chunk_length
|
1093
|
-
if chunk_length_seconds is not None:
|
1094
|
-
payload["chunk_length_seconds"] = chunk_length_seconds
|
1095
1093
|
if chunk_length_frames is not None:
|
1096
1094
|
payload["chunk_length_frames"] = chunk_length_frames
|
1097
1095
|
|
@@ -1790,9 +1788,8 @@ def flux_image_inpainting(
|
|
1790
1788
|
where 1 indicates areas to be inpainted and 0 indicates areas to be preserved.
|
1791
1789
|
|
1792
1790
|
Returns:
|
1793
|
-
np.ndarray:
|
1794
|
-
|
1795
|
-
with values ranging from 0 to 255.
|
1791
|
+
np.ndarray: The generated image(s) as a numpy array in RGB format with values
|
1792
|
+
ranging from 0 to 255.
|
1796
1793
|
|
1797
1794
|
-------
|
1798
1795
|
Example:
|
@@ -1874,6 +1871,9 @@ def extract_frames_and_timestamps(
|
|
1874
1871
|
>>> extract_frames("path/to/video.mp4")
|
1875
1872
|
[{"frame": np.ndarray, "timestamp": 0.0}, ...]
|
1876
1873
|
"""
|
1874
|
+
if isinstance(fps, str):
|
1875
|
+
# fps could be a string when it's passed in from a web endpoint deployment
|
1876
|
+
fps = float(fps)
|
1877
1877
|
|
1878
1878
|
def reformat(
|
1879
1879
|
frames_and_timestamps: List[Tuple[np.ndarray, float]],
|
@@ -1937,6 +1937,7 @@ def save_json(data: Any, file_path: str) -> None:
|
|
1937
1937
|
return bool(obj)
|
1938
1938
|
return json.JSONEncoder.default(self, obj)
|
1939
1939
|
|
1940
|
+
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
|
1940
1941
|
with open(file_path, "w") as f:
|
1941
1942
|
json.dump(data, f, cls=NumpyEncoder)
|
1942
1943
|
|
@@ -1979,6 +1980,7 @@ def save_image(image: np.ndarray, file_path: str) -> None:
|
|
1979
1980
|
-------
|
1980
1981
|
>>> save_image(image)
|
1981
1982
|
"""
|
1983
|
+
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
|
1982
1984
|
from IPython.display import display
|
1983
1985
|
|
1984
1986
|
if not isinstance(image, np.ndarray) or (
|
@@ -2009,6 +2011,9 @@ def save_video(
|
|
2009
2011
|
>>> save_video(frames)
|
2010
2012
|
"/tmp/tmpvideo123.mp4"
|
2011
2013
|
"""
|
2014
|
+
if isinstance(fps, str):
|
2015
|
+
# fps could be a string when it's passed in from a web endpoint deployment
|
2016
|
+
fps = float(fps)
|
2012
2017
|
if fps <= 0:
|
2013
2018
|
raise ValueError(f"fps must be greater than 0 got {fps}")
|
2014
2019
|
|
@@ -2025,6 +2030,8 @@ def save_video(
|
|
2025
2030
|
output_video_path = tempfile.NamedTemporaryFile(
|
2026
2031
|
delete=False, suffix=".mp4"
|
2027
2032
|
).name
|
2033
|
+
else:
|
2034
|
+
Path(output_video_path).parent.mkdir(parents=True, exist_ok=True)
|
2028
2035
|
|
2029
2036
|
output_video_path = video_writer(frames, fps, output_video_path)
|
2030
2037
|
_save_video_to_result(output_video_path)
|
@@ -2351,6 +2358,7 @@ FUNCTION_TOOLS = [
|
|
2351
2358
|
closest_box_distance,
|
2352
2359
|
qwen2_vl_images_vqa,
|
2353
2360
|
qwen2_vl_video_vqa,
|
2361
|
+
video_temporal_localization,
|
2354
2362
|
]
|
2355
2363
|
|
2356
2364
|
UTIL_TOOLS = [
|
vision_agent/utils/video.py
CHANGED
@@ -58,6 +58,9 @@ def video_writer(
|
|
58
58
|
fps: float = _DEFAULT_INPUT_FPS,
|
59
59
|
filename: Optional[str] = None,
|
60
60
|
) -> str:
|
61
|
+
if isinstance(fps, str):
|
62
|
+
# fps could be a string when it's passed in from a web endpoint deployment
|
63
|
+
fps = float(fps)
|
61
64
|
if filename is None:
|
62
65
|
filename = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
|
63
66
|
container = av.open(filename, mode="w")
|
@@ -92,6 +95,9 @@ def frames_to_bytes(
|
|
92
95
|
fps: the frames per second of the video
|
93
96
|
file_ext: the file extension of the video file
|
94
97
|
"""
|
98
|
+
if isinstance(fps, str):
|
99
|
+
# fps could be a string when it's passed in from a web endpoint deployment
|
100
|
+
fps = float(fps)
|
95
101
|
with tempfile.NamedTemporaryFile(delete=True, suffix=file_ext) as temp_file:
|
96
102
|
video_writer(frames, fps, temp_file.name)
|
97
103
|
|
@@ -120,6 +126,9 @@ def extract_frames_from_video(
|
|
120
126
|
from the start of the video. E.g. 12.125 means 12.125 seconds from the start of
|
121
127
|
the video. The frames are sorted by the timestamp in ascending order.
|
122
128
|
"""
|
129
|
+
if isinstance(fps, str):
|
130
|
+
# fps could be a string when it's passed in from a web endpoint deployment
|
131
|
+
fps = float(fps)
|
123
132
|
|
124
133
|
cap = cv2.VideoCapture(video_uri)
|
125
134
|
orig_fps = cap.get(cv2.CAP_PROP_FPS)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 0.2.185
|
3
|
+
Version: 0.2.187
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -63,10 +63,10 @@ code to solve the task for them. Check out our discord for updates and roadmaps!
|
|
63
63
|
## Table of Contents
|
64
64
|
- [🚀Quick Start](#quick-start)
|
65
65
|
- [📚Documentation](#documentation)
|
66
|
-
- [🔍🤖VisionAgent](#
|
66
|
+
- [🔍🤖VisionAgent](#visionagent-basic-usage)
|
67
67
|
- [🛠️Tools](#tools)
|
68
68
|
- [🤖LMMs](#lmms)
|
69
|
-
- [💻🤖VisionAgent Coder](#
|
69
|
+
- [💻🤖VisionAgent Coder](#visionagent-coder)
|
70
70
|
- [🏗️Additional Backends](#additional-backends)
|
71
71
|
|
72
72
|
## Quick Start
|
@@ -20,7 +20,7 @@ vision_agent/tools/__init__.py,sha256=KVP4_6qxOb2lpFdQgQtyDfdkMLL1O6wVZNK19MXp-x
|
|
20
20
|
vision_agent/tools/meta_tools.py,sha256=by7TIbH7lsLIayX_Pe2mS1iw8aeLn2T8yqAo8SkB9Kg,32074
|
21
21
|
vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
|
22
22
|
vision_agent/tools/tool_utils.py,sha256=VPGqGJ2ZYEJA6AW7K9X7hQv6vRlMtAQcybE4izdToCw,8196
|
23
|
-
vision_agent/tools/tools.py,sha256=
|
23
|
+
vision_agent/tools/tools.py,sha256=rsvQ7cz2xiGiJZme8yb-r-omSarhtC0zapUSt3_pmuo,83541
|
24
24
|
vision_agent/tools/tools_types.py,sha256=8hYf2OZhI58gvf65KGaeGkt4EQ56nwLFqIQDPHioOBc,2339
|
25
25
|
vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
|
26
26
|
vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
|
@@ -28,8 +28,8 @@ vision_agent/utils/execute.py,sha256=2sIQn45llOENMyrKu3TPINVRLLbOvvZ6SVHFCB9MQUo
|
|
28
28
|
vision_agent/utils/image_utils.py,sha256=rm9GfXvD4JrjnqKrP_f2gfq4SzmqYC0IdC1kKwdn6sk,11303
|
29
29
|
vision_agent/utils/sim.py,sha256=ZuSS07TUXFGjipmiQoY8TKRmSes7XXCdtU9PI8PC1sw,5609
|
30
30
|
vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
|
31
|
-
vision_agent/utils/video.py,sha256=
|
32
|
-
vision_agent-0.2.185.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
33
|
-
vision_agent-0.2.
|
34
|
-
vision_agent-0.2.185.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
35
|
-
vision_agent-0.2.185.dist-info/RECORD,,
|
31
|
+
vision_agent/utils/video.py,sha256=tRcGp4vEnaDycigL1hBO9k0FBPtDH35fCQciVr9GqYI,6013
|
32
|
+
vision_agent-0.2.187.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
33
|
+
vision_agent-0.2.187.dist-info/METADATA,sha256=SYs-27G_7CqSSP8tNatzvnca9BTA-gupcRSVSezmxsw,18328
|
34
|
+
vision_agent-0.2.187.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
35
|
+
vision_agent-0.2.187.dist-info/RECORD,,
|
File without changes
|
File without changes
|