vision-agent 0.0.37__py3-none-any.whl → 0.0.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vision_agent/tools/tools.py +43 -0
- vision_agent/tools/video.py +176 -0
- {vision_agent-0.0.37.dist-info → vision_agent-0.0.39.dist-info}/METADATA +11 -2
- {vision_agent-0.0.37.dist-info → vision_agent-0.0.39.dist-info}/RECORD +6 -5
- {vision_agent-0.0.37.dist-info → vision_agent-0.0.39.dist-info}/LICENSE +0 -0
- {vision_agent-0.0.37.dist-info → vision_agent-0.0.39.dist-info}/WHEEL +0 -0
vision_agent/tools/tools.py
CHANGED
@@ -11,6 +11,7 @@ from PIL import Image
|
|
11
11
|
from PIL.Image import Image as ImageType
|
12
12
|
|
13
13
|
from vision_agent.image_utils import convert_to_b64, get_image_size
|
14
|
+
from vision_agent.tools.video import extract_frames_from_video
|
14
15
|
|
15
16
|
_LOGGER = logging.getLogger(__name__)
|
16
17
|
|
@@ -505,6 +506,47 @@ class Divide(Tool):
|
|
505
506
|
return round(input[0] / input[1], 2)
|
506
507
|
|
507
508
|
|
509
|
+
class ExtractFrames(Tool):
    r"""Tool that turns a video into a list of frame image files with timestamps."""

    name = "extract_frames_"
    description = "'extract_frames_' extract image frames from the input video, return a list of tuple (frame, timestamp), where the timestamp is the relative time in seconds of the frame occurred in the video, the frame is a local image file path that stores the frame."
    usage = {
        "required_parameters": [{"name": "video_uri", "type": "str"}],
        "examples": [
            {
                "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4",
                "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"},
            },
            {
                "scenario": "Can you extract the images from this video file? Video path: tests/data/test.mp4",
                "parameters": {"video_uri": "tests/data/test.mp4"},
            },
        ],
    }

    def __call__(self, video_uri: str) -> list[tuple[str, float]]:
        """Extract frames from a video and store each one in a temporary file.

        Parameters:
            video_uri: the path to the video file, or a URL that points to the video data

        Returns:
            a list of ``(path_to_frame, timestamp)`` tuples, in the order returned by
            ``extract_frames_from_video``. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...].
            The timestamp is the time in seconds from the start of the video, e.g. 12.125
            means 12.125 seconds from the start of the video.
        """
        extracted = extract_frames_from_video(video_uri)
        _LOGGER.info(
            f"Extracted {len(extracted)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks."
        )
        saved = []
        for image_array, timestamp in extracted:
            # delete=False: the file has to outlive this call so that downstream
            # tools can open it by path.
            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as handle:
                Image.fromarray(image_array).save(handle)
            saved.append((handle.name, timestamp))
        return saved
|
548
|
+
|
549
|
+
|
508
550
|
TOOLS = {
|
509
551
|
i: {"name": c.name, "description": c.description, "usage": c.usage, "class": c}
|
510
552
|
for i, c in enumerate(
|
@@ -520,6 +562,7 @@ TOOLS = {
|
|
520
562
|
Subtract,
|
521
563
|
Multiply,
|
522
564
|
Divide,
|
565
|
+
ExtractFrames,
|
523
566
|
]
|
524
567
|
)
|
525
568
|
if (hasattr(c, "name") and hasattr(c, "description") and hasattr(c, "usage"))
|
@@ -0,0 +1,176 @@
|
|
1
|
+
import logging
|
2
|
+
import math
|
3
|
+
import os
|
4
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
5
|
+
from typing import List, Tuple, cast
|
6
|
+
|
7
|
+
import cv2
|
8
|
+
import numpy as np
|
9
|
+
from moviepy.video.io.VideoFileClip import VideoFileClip
|
10
|
+
from tqdm import tqdm
|
11
|
+
|
12
|
+
_LOGGER = logging.getLogger(__name__)
|
13
|
+
# The maximum length of the clip to extract frames from, in seconds
|
14
|
+
_CLIP_LENGTH = 30.0
|
15
|
+
|
16
|
+
|
17
|
+
def extract_frames_from_video(
    video_uri: str, fps: int = 2, motion_detection_threshold: float = 0.06
) -> List[Tuple[np.ndarray, float]]:
    """Extract frames from a video

    The video is split into segments of at most ``_CLIP_LENGTH`` seconds, and the
    segments are processed in parallel by a process pool.

    Parameters:
        video_uri: the path to the video file or a video file url
        fps: the frame rate per second to extract the frames
        motion_detection_threshold: The threshold to detect motion between changes/frames.
            A value between 0-1, which represents the percentage change required for the frames to be considered in motion.
            For example, a lower value means more frames will be extracted.

    Returns:
        a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(frame1, 0.0), (frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order.

    Raises:
        ValueError: if the video reports a duration of zero or less, i.e. there
            is nothing to extract.
    """
    # Only the duration is needed in this process; every worker re-opens the
    # video itself, so the clip is released before the pool is started.
    with VideoFileClip(video_uri) as video:
        video_duration: float = video.duration

    # A zero duration would otherwise surface as an opaque
    # "range() arg 3 must not be zero" error further down.
    if video_duration <= 0:
        raise ValueError(f"No frames to extract from the input video: {video_uri}")

    num_workers = os.cpu_count()
    clip_length: float = min(video_duration, _CLIP_LENGTH)
    start_times = list(range(0, math.ceil(video_duration), math.ceil(clip_length)))
    last_index = len(start_times) - 1
    segment_args = [
        {
            "video_uri": video_uri,
            "start": start,
            # The final segment always runs to the real end of the video.
            "end": start + clip_length if i < last_index else video_duration,
            "fps": fps,
            "motion_detection_threshold": motion_detection_threshold,
        }
        for i, start in enumerate(start_times)
    ]

    tail_length = cast(float, segment_args[-1]["end"]) - cast(
        float, segment_args[-1]["start"]
    )
    if len(segment_args) > 1 and tail_length < 1:
        # If the last segment is less than 1s, merge it with the previous segment.
        # This is to avoid the failure of the last segment extraction.
        # A video that is itself shorter than 1s has no previous segment to merge
        # into, so its single segment is kept and processed as-is.
        segment_args[-2]["end"] = video_duration
        segment_args.pop()

    _LOGGER.info(
        f"""Created {len(segment_args)} segments from the input video {video_uri} of length {video_duration}s, with clip size: {clip_length}s and {num_workers} workers.
        Segments: {segment_args}
        """
    )
    frames = []
    with tqdm(total=len(segment_args)) as pbar:
        with ProcessPoolExecutor(max_workers=num_workers) as executor:
            futures = [
                executor.submit(_extract_frames_by_clip, **kwargs)  # type: ignore
                for kwargs in segment_args
            ]
            # Segments finish in arbitrary order; the result is sorted below.
            for future in as_completed(futures):
                frames.extend(future.result())
                pbar.update(1)
    frames.sort(key=lambda x: x[1])
    _LOGGER.info(f"Extracted {len(frames)} frames from video {video_uri}")
    return frames
|
81
|
+
|
82
|
+
|
83
|
+
def _extract_frames_by_clip(
    video_uri: str,
    start: int = 0,
    end: float = -1,
    fps: int = 2,
    motion_detection_threshold: float = 0.06,
) -> List[Tuple[np.ndarray, float]]:
    """Extract frames from a video clip with start and end time in seconds.

    Frames are sampled at ``fps``; a sampled frame is dropped when
    ``_similar_frame`` judges it similar to the last frame that was kept.

    Parameters:
        video_uri: the path to the video file or a video file url
        start: the start time (in seconds) of the clip to extract
        end: the end time (in seconds, up to millisecond level precision) of the clip to extract; any value <= 0 (e.g. the default -1) means "until the end of the video"
        fps: the frame rate to extract the frames
        motion_detection_threshold: the threshold to detect the motion between frames

    Returns:
        a list of ``(frame, timestamp)`` tuples, where the timestamp is the video
        reader position divided by the source FPS, rounded to 3 decimals.
    """
    with VideoFileClip(video_uri) as video:
        source_fps = video.fps
        if end <= 0:
            end = video.duration
        _LOGGER.info(
            f"Extracting frames from video {video_uri} ({video.duration}s) with start={start}s and end={end}s"
        )
        clip = video.subclip(start, end)
        processable_frames = int(clip.duration * fps)
        _LOGGER.info(
            f"Extracting frames from video clip of length {clip.duration}s with FPS={fps} and start_time={start}s. Total number of frames in clip: {processable_frames}"
        )
        frames = []
        total_count, skipped_count = 0, 0
        prev_processed_frame = None
        # The context manager makes sure the progress bar is closed, also when
        # decoding a frame raises.
        with tqdm(
            total=processable_frames, desc=f"Extracting frames from clip {start}-{end}"
        ) as pbar:
            for frame in clip.iter_frames(fps=fps, dtype="uint8"):
                curr_processed_frame = _preprocess_frame(frame)
                total_count += 1
                pbar.update(1)
                # Skip the frame if it is similar to the previous one
                if prev_processed_frame is not None and _similar_frame(
                    prev_processed_frame,
                    curr_processed_frame,
                    threshold=motion_detection_threshold,
                ):
                    skipped_count += 1
                    continue
                # The reference frame only advances when a frame is kept, so slow
                # drift is measured against the last kept frame.
                prev_processed_frame = curr_processed_frame
                # NOTE(review): presumably the reader position counts frames of the
                # source video, making this an absolute timestamp - confirm.
                ts = round(clip.reader.pos / source_fps, 3)
                frames.append((frame, ts))

        _LOGGER.info(
            f"""Finished!
                Frames extracted: {len(frames)}
                Extracted frame timestamp: {[f[1] for f in frames]}
                Total processed frames: {total_count}
                Skipped frames: {skipped_count}
                Scan FPS: {fps}
                Clip start time: {start}s, {clip.pos}
                Clip end time: {end}s
                Clip duration: {clip.duration}s
                Clip total frames: {clip.duration * source_fps}
                Video duration: {video.duration}s
                Video FPS: {video.fps}
                Video total frames: {video.reader.nframes}"""
        )
        return frames
|
149
|
+
|
150
|
+
|
151
|
+
def _preprocess_frame(frame: np.ndarray) -> np.ndarray:
    """Reduce a color frame to a blurred grayscale image for frame comparison.

    Parameters:
        frame: a color frame as produced by the clip's ``iter_frames`` call.

    Returns:
        the grayscale frame after a 5x5 Gaussian blur.
    """
    # Convert to grayscale. The frames handed to this function come from
    # moviepy, which decodes to RGB channel order, so the RGB conversion code is
    # the one that applies the luminance weights to the right channels.
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    # Blur so that pixel-level noise does not register as a change.
    return cv2.GaussianBlur(src=gray, ksize=(5, 5), sigmaX=0)
|
156
|
+
|
157
|
+
|
158
|
+
def _similar_frame(
    prev_frame: np.ndarray, curr_frame: np.ndarray, threshold: float
) -> bool:
    """Tell whether two preprocessed frames are similar.

    Parameters:
        prev_frame: the earlier frame.
        curr_frame: the later frame; its first two dimensions define the pixel count.
        threshold: similarity threshold, a value between 0-1, the percentage change that is considered a different frame.

    Returns:
        True when the fraction of changed pixels is below ``threshold``.
    """
    pixel_delta = cv2.absdiff(prev_frame, curr_frame)
    # A pixel only counts as changed when it differs by more than 20 (out of 255).
    _, changed_mask = cv2.threshold(pixel_delta, 20, 255, cv2.THRESH_BINARY)
    height, width = curr_frame.shape[0], curr_frame.shape[1]
    change_percentage = cv2.countNonZero(changed_mask) / (height * width)
    _LOGGER.debug(f"Image diff: {change_percentage}")
    return change_percentage < threshold
|
@@ -1,16 +1,19 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 0.0.37
|
3
|
+
Version: 0.0.39
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
7
|
-
Requires-Python: >=3.
|
7
|
+
Requires-Python: >=3.9,<3.12
|
8
8
|
Classifier: Programming Language :: Python :: 3
|
9
|
+
Classifier: Programming Language :: Python :: 3.9
|
9
10
|
Classifier: Programming Language :: Python :: 3.10
|
10
11
|
Classifier: Programming Language :: Python :: 3.11
|
11
12
|
Requires-Dist: faiss-cpu (>=1.0.0,<2.0.0)
|
13
|
+
Requires-Dist: moviepy (>=1.0.0,<2.0.0)
|
12
14
|
Requires-Dist: numpy (>=1.21.0,<2.0.0)
|
13
15
|
Requires-Dist: openai (>=1.0.0,<2.0.0)
|
16
|
+
Requires-Dist: opencv-python-headless (>=4.0.0,<5.0.0)
|
14
17
|
Requires-Dist: pandas (>=2.0.0,<3.0.0)
|
15
18
|
Requires-Dist: pillow (>=10.0.0,<11.0.0)
|
16
19
|
Requires-Dist: requests (>=2.0.0,<3.0.0)
|
@@ -42,6 +45,11 @@ aims to provide an in-seconds experience by allowing users to describe their pro
|
|
42
45
|
text and utilizing agent frameworks to solve the task for them. Check out our discord
|
43
46
|
for updates and roadmaps!
|
44
47
|
|
48
|
+
## Documentation
|
49
|
+
|
50
|
+
- [Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
|
51
|
+
|
52
|
+
|
45
53
|
## Getting Started
|
46
54
|
### Installation
|
47
55
|
To get started, you can install the library using pip:
|
@@ -124,6 +132,7 @@ you. For example:
|
|
124
132
|
| Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. |
|
125
133
|
| BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. |
|
126
134
|
| SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. |
|
135
|
+
| ExtractFrames | ExtractFrames extracts image frames from the input video. |
|
127
136
|
|
128
137
|
|
129
138
|
It also has a basic set of calculate tools such as add, subtract, multiply and divide.
|
@@ -18,8 +18,9 @@ vision_agent/lmm/__init__.py,sha256=I8mbeNUajTfWVNqLsuFQVOaNBDlkIhYp9DFU8H4kB7g,
|
|
18
18
|
vision_agent/lmm/lmm.py,sha256=ARcbgkcyP83TbVVoXI9B-gtG0gJuTaG_MjcUGbams4U,8052
|
19
19
|
vision_agent/tools/__init__.py,sha256=aX0pU3pXU1V0Cj9FzYCvdsX76TAglFMHx59kNhXHbPs,131
|
20
20
|
vision_agent/tools/prompts.py,sha256=9RBbyqlNlExsGKlJ89Jkph83DAEJ8PCVGaHoNbyN7TM,1416
|
21
|
-
vision_agent/tools/tools.py,sha256=
|
22
|
-
vision_agent
|
23
|
-
vision_agent-0.0.
|
24
|
-
vision_agent-0.0.
|
25
|
-
vision_agent-0.0.
|
21
|
+
vision_agent/tools/tools.py,sha256=2mmomPDbldXRpw3q5zAcazKJMjAGd0Jl9ak9JykHQYI,21211
|
22
|
+
vision_agent/tools/video.py,sha256=KV_Wcat7DDGxpHSaGBu7s4lj4crlYaUu4YKpCO_86k4,7440
|
23
|
+
vision_agent-0.0.39.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
24
|
+
vision_agent-0.0.39.dist-info/METADATA,sha256=_jugEQnOeNbLa3kSSo0zTn2bII3Rh5dfop9qyMWXPfw,5282
|
25
|
+
vision_agent-0.0.39.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
|
26
|
+
vision_agent-0.0.39.dist-info/RECORD,,
|
File without changes
|
File without changes
|