vision-agent 0.2.125__tar.gz → 0.2.127__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {vision_agent-0.2.125 → vision_agent-0.2.127}/PKG-INFO +3 -2
- {vision_agent-0.2.125 → vision_agent-0.2.127}/pyproject.toml +3 -4
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/agent/vision_agent_coder.py +1 -1
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/tools/tools.py +15 -14
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/utils/__init__.py +1 -1
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/utils/image_utils.py +0 -20
- vision_agent-0.2.127/vision_agent/utils/video.py +126 -0
- vision_agent-0.2.125/vision_agent/utils/video.py +0 -215
- {vision_agent-0.2.125 → vision_agent-0.2.127}/LICENSE +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/README.md +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/__init__.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/agent/__init__.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/agent/agent.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/agent/agent_utils.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/agent/vision_agent.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/agent/vision_agent_coder_prompts.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/agent/vision_agent_prompts.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/clients/__init__.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/clients/http.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/clients/landing_public_api.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/fonts/__init__.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/fonts/default_font_ch_en.ttf +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/lmm/__init__.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/lmm/lmm.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/lmm/types.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/tools/__init__.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/tools/meta_tools.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/tools/prompts.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/tools/tool_utils.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/tools/tools_types.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/utils/exceptions.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/utils/execute.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/utils/sim.py +0 -0
- {vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/utils/type_defs.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: vision-agent
|
3
|
-
Version: 0.2.125
|
3
|
+
Version: 0.2.127
|
4
4
|
Summary: Toolset for Vision Agent
|
5
5
|
Author: Landing AI
|
6
6
|
Author-email: dev@landing.ai
|
@@ -10,11 +10,12 @@ Classifier: Programming Language :: Python :: 3.9
|
|
10
10
|
Classifier: Programming Language :: Python :: 3.10
|
11
11
|
Classifier: Programming Language :: Python :: 3.11
|
12
12
|
Requires-Dist: anthropic (>=0.31.0,<0.32.0)
|
13
|
+
Requires-Dist: av (>=11.0.0,<12.0.0)
|
13
14
|
Requires-Dist: e2b (>=0.17.2a50,<0.18.0)
|
14
15
|
Requires-Dist: e2b-code-interpreter (==0.0.11a37)
|
16
|
+
Requires-Dist: eva-decord (>=0.6.1,<0.7.0)
|
15
17
|
Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
|
16
18
|
Requires-Dist: langsmith (>=0.1.58,<0.2.0)
|
17
|
-
Requires-Dist: moviepy (>=1.0.0,<2.0.0)
|
18
19
|
Requires-Dist: nbclient (>=0.10.0,<0.11.0)
|
19
20
|
Requires-Dist: nbformat (>=5.10.4,<6.0.0)
|
20
21
|
Requires-Dist: numpy (>=1.21.0,<2.0.0)
|
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|
4
4
|
|
5
5
|
[tool.poetry]
|
6
6
|
name = "vision-agent"
|
7
|
-
version = "0.2.125"
|
7
|
+
version = "0.2.127"
|
8
8
|
description = "Toolset for Vision Agent"
|
9
9
|
authors = ["Landing AI <dev@landing.ai>"]
|
10
10
|
readme = "README.md"
|
@@ -25,7 +25,6 @@ tqdm = ">=4.64.0,<5.0.0"
|
|
25
25
|
pandas = "2.*"
|
26
26
|
openai = "1.*"
|
27
27
|
typing_extensions = "4.*"
|
28
|
-
moviepy = "1.*"
|
29
28
|
opencv-python = "4.*"
|
30
29
|
tabulate = "^0.9.0"
|
31
30
|
pydantic-settings = "^2.2.1"
|
@@ -42,6 +41,8 @@ pillow-heif = "^0.16.0"
|
|
42
41
|
pytube = "15.0.0"
|
43
42
|
anthropic = "^0.31.0"
|
44
43
|
pydantic = "2.7.4"
|
44
|
+
eva-decord = "^0.6.1"
|
45
|
+
av = "^11.0.0"
|
45
46
|
|
46
47
|
[tool.poetry.group.dev.dependencies]
|
47
48
|
autoflake = "1.*"
|
@@ -100,10 +101,8 @@ show_error_codes = true
|
|
100
101
|
ignore_missing_imports = true
|
101
102
|
module = [
|
102
103
|
"cv2.*",
|
103
|
-
"faiss.*",
|
104
104
|
"openai.*",
|
105
105
|
"sentence_transformers.*",
|
106
|
-
"moviepy.*",
|
107
106
|
"e2b_code_interpreter.*",
|
108
107
|
"e2b.*"
|
109
108
|
]
|
@@ -173,7 +173,7 @@ def pick_plan(
|
|
173
173
|
|
174
174
|
if verbosity == 2:
|
175
175
|
_print_code("Initial code and tests:", code)
|
176
|
-
_LOGGER.info(f"Initial code execution result:\n{
|
176
|
+
_LOGGER.info(f"Initial code execution result:\n{tool_output_str}")
|
177
177
|
|
178
178
|
log_progress(
|
179
179
|
{
|
@@ -12,7 +12,6 @@ from uuid import UUID
|
|
12
12
|
import cv2
|
13
13
|
import numpy as np
|
14
14
|
import requests
|
15
|
-
from moviepy.editor import ImageSequenceClip
|
16
15
|
from PIL import Image, ImageDraw, ImageEnhance, ImageFont
|
17
16
|
from pillow_heif import register_heif_opener # type: ignore
|
18
17
|
from pytube import YouTube # type: ignore
|
@@ -35,7 +34,6 @@ from vision_agent.tools.tools_types import (
|
|
35
34
|
ODResponseData,
|
36
35
|
PromptTask,
|
37
36
|
)
|
38
|
-
from vision_agent.utils import extract_frames_from_video
|
39
37
|
from vision_agent.utils.exceptions import FineTuneModelIsNotReady
|
40
38
|
from vision_agent.utils.execute import FileSerializer, MimeType
|
41
39
|
from vision_agent.utils.image_utils import (
|
@@ -44,13 +42,17 @@ from vision_agent.utils.image_utils import (
|
|
44
42
|
convert_to_b64,
|
45
43
|
denormalize_bbox,
|
46
44
|
encode_image_bytes,
|
47
|
-
frames_to_bytes,
|
48
45
|
get_image_size,
|
49
46
|
normalize_bbox,
|
50
47
|
numpy_to_bytes,
|
51
48
|
rle_decode,
|
52
49
|
rle_decode_array,
|
53
50
|
)
|
51
|
+
from vision_agent.utils.video import (
|
52
|
+
extract_frames_from_video,
|
53
|
+
frames_to_bytes,
|
54
|
+
video_writer,
|
55
|
+
)
|
54
56
|
|
55
57
|
register_heif_opener()
|
56
58
|
|
@@ -1513,17 +1515,16 @@ def save_video(
|
|
1513
1515
|
"/tmp/tmpvideo123.mp4"
|
1514
1516
|
"""
|
1515
1517
|
if fps <= 0:
|
1516
|
-
|
1517
|
-
|
1518
|
-
|
1519
|
-
|
1520
|
-
|
1521
|
-
|
1522
|
-
|
1523
|
-
|
1524
|
-
|
1525
|
-
|
1526
|
-
return f.name
|
1518
|
+
raise ValueError(f"fps must be greater than 0 got {fps}")
|
1519
|
+
|
1520
|
+
if output_video_path is None:
|
1521
|
+
output_video_path = tempfile.NamedTemporaryFile(
|
1522
|
+
delete=False, suffix=".mp4"
|
1523
|
+
).name
|
1524
|
+
|
1525
|
+
output_video_path = video_writer(frames, fps, output_video_path)
|
1526
|
+
_save_video_to_result(output_video_path)
|
1527
|
+
return output_video_path
|
1527
1528
|
|
1528
1529
|
|
1529
1530
|
def _save_video_to_result(video_uri: str) -> None:
|
@@ -2,14 +2,12 @@
|
|
2
2
|
|
3
3
|
import base64
|
4
4
|
import io
|
5
|
-
import tempfile
|
6
5
|
from importlib import resources
|
7
6
|
from io import BytesIO
|
8
7
|
from pathlib import Path
|
9
8
|
from typing import Dict, List, Tuple, Union
|
10
9
|
|
11
10
|
import numpy as np
|
12
|
-
from moviepy.editor import ImageSequenceClip
|
13
11
|
from PIL import Image, ImageDraw, ImageFont
|
14
12
|
from PIL.Image import Image as ImageType
|
15
13
|
|
@@ -90,24 +88,6 @@ def rle_decode_array(rle: Dict[str, List[int]]) -> np.ndarray:
|
|
90
88
|
return binary_mask
|
91
89
|
|
92
90
|
|
93
|
-
def frames_to_bytes(
|
94
|
-
frames: List[np.ndarray], fps: float = 10, file_ext: str = "mp4"
|
95
|
-
) -> bytes:
|
96
|
-
r"""Convert a list of frames to a video file encoded into a byte string.
|
97
|
-
|
98
|
-
Parameters:
|
99
|
-
frames: the list of frames
|
100
|
-
fps: the frames per second of the video
|
101
|
-
file_ext: the file extension of the video file
|
102
|
-
"""
|
103
|
-
with tempfile.NamedTemporaryFile(delete=True) as temp_file:
|
104
|
-
clip = ImageSequenceClip(frames, fps=fps)
|
105
|
-
clip.write_videofile(temp_file.name + f".{file_ext}", fps=fps, codec="libx264")
|
106
|
-
with open(temp_file.name + f".{file_ext}", "rb") as f:
|
107
|
-
buffer_bytes = f.read()
|
108
|
-
return buffer_bytes
|
109
|
-
|
110
|
-
|
111
91
|
def b64_to_pil(b64_str: str) -> ImageType:
|
112
92
|
r"""Convert a base64 string to a PIL Image.
|
113
93
|
|
@@ -0,0 +1,126 @@
|
|
1
|
+
import base64
|
2
|
+
import logging
|
3
|
+
import tempfile
|
4
|
+
from functools import lru_cache
|
5
|
+
from typing import List, Optional, Tuple
|
6
|
+
|
7
|
+
import cv2
|
8
|
+
import av # type: ignore
|
9
|
+
import numpy as np
|
10
|
+
from decord import VideoReader # type: ignore
|
11
|
+
|
12
|
+
_LOGGER = logging.getLogger(__name__)
|
13
|
+
# The maximum length of the clip to extract frames from, in seconds
|
14
|
+
|
15
|
+
|
16
|
+
def play_video(video_base64: str) -> None:
|
17
|
+
"""Play a video file"""
|
18
|
+
video_data = base64.b64decode(video_base64)
|
19
|
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
|
20
|
+
temp_video.write(video_data)
|
21
|
+
temp_video_path = temp_video.name
|
22
|
+
|
23
|
+
cap = cv2.VideoCapture(temp_video_path)
|
24
|
+
if not cap.isOpened():
|
25
|
+
_LOGGER.error("Error: Could not open video.")
|
26
|
+
return
|
27
|
+
|
28
|
+
# Display the first frame and wait for any key press to start the video
|
29
|
+
ret, frame = cap.read()
|
30
|
+
if ret:
|
31
|
+
cv2.imshow("Video Player", frame)
|
32
|
+
_LOGGER.info(f"Press any key to start playing the video: {temp_video_path}")
|
33
|
+
cv2.waitKey(0) # Wait for any key press
|
34
|
+
|
35
|
+
while cap.isOpened():
|
36
|
+
ret, frame = cap.read()
|
37
|
+
if not ret:
|
38
|
+
break
|
39
|
+
cv2.imshow("Video Player", frame)
|
40
|
+
# Press 'q' to exit the video
|
41
|
+
if cv2.waitKey(200) & 0xFF == ord("q"):
|
42
|
+
break
|
43
|
+
cap.release()
|
44
|
+
cv2.destroyAllWindows()
|
45
|
+
|
46
|
+
|
47
|
+
def _resize_frame(frame: np.ndarray) -> np.ndarray:
|
48
|
+
height, width = frame.shape[:2]
|
49
|
+
new_width = width - (width % 2)
|
50
|
+
new_height = height - (height % 2)
|
51
|
+
return cv2.resize(frame, (new_width, new_height))
|
52
|
+
|
53
|
+
|
54
|
+
def video_writer(
|
55
|
+
frames: List[np.ndarray], fps: float = 1.0, filename: Optional[str] = None
|
56
|
+
) -> str:
|
57
|
+
if filename is None:
|
58
|
+
filename = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
|
59
|
+
container = av.open(filename, mode="w")
|
60
|
+
stream = container.add_stream("h264", rate=fps)
|
61
|
+
height, width = frames[0].shape[:2]
|
62
|
+
stream.height = height - (height % 2)
|
63
|
+
stream.width = width - (width % 2)
|
64
|
+
stream.pix_fmt = "yuv420p"
|
65
|
+
for frame in frames:
|
66
|
+
# Remove the alpha channel (convert RGBA to RGB)
|
67
|
+
frame_rgb = frame[:, :, :3]
|
68
|
+
# Resize the frame to make dimensions divisible by 2
|
69
|
+
frame_rgb = _resize_frame(frame_rgb)
|
70
|
+
av_frame = av.VideoFrame.from_ndarray(frame_rgb, format="rgb24")
|
71
|
+
for packet in stream.encode(av_frame):
|
72
|
+
container.mux(packet)
|
73
|
+
|
74
|
+
for packet in stream.encode():
|
75
|
+
container.mux(packet)
|
76
|
+
container.close()
|
77
|
+
return filename
|
78
|
+
|
79
|
+
|
80
|
+
def frames_to_bytes(
|
81
|
+
frames: List[np.ndarray], fps: float = 10, file_ext: str = ".mp4"
|
82
|
+
) -> bytes:
|
83
|
+
r"""Convert a list of frames to a video file encoded into a byte string.
|
84
|
+
|
85
|
+
Parameters:
|
86
|
+
frames: the list of frames
|
87
|
+
fps: the frames per second of the video
|
88
|
+
file_ext: the file extension of the video file
|
89
|
+
"""
|
90
|
+
with tempfile.NamedTemporaryFile(delete=True, suffix=file_ext) as temp_file:
|
91
|
+
video_writer(frames, fps, temp_file.name)
|
92
|
+
|
93
|
+
with open(temp_file.name, "rb") as f:
|
94
|
+
buffer_bytes = f.read()
|
95
|
+
return buffer_bytes
|
96
|
+
|
97
|
+
|
98
|
+
# WARNING: this cache is a little dangerous because if the underlying video
|
99
|
+
# contents change but the filename remains the same it will return the old file contents
|
100
|
+
# but for vision agent it's unlikely to change the file contents while keeping the
|
101
|
+
# same file name and the time savings are very large.
|
102
|
+
@lru_cache(maxsize=8)
|
103
|
+
def extract_frames_from_video(
|
104
|
+
video_uri: str, fps: float = 1.0
|
105
|
+
) -> List[Tuple[np.ndarray, float]]:
|
106
|
+
"""Extract frames from a video
|
107
|
+
|
108
|
+
Parameters:
|
109
|
+
video_uri (str): the path to the video file or a video file url
|
110
|
+
fps (float): the frame rate per second to extract the frames
|
111
|
+
|
112
|
+
Returns:
|
113
|
+
a list of tuples containing the extracted frame and the timestamp in seconds.
|
114
|
+
E.g. [(frame1, 0.0), (frame2, 0.5), ...]. The timestamp is the time in seconds
|
115
|
+
from the start of the video. E.g. 12.125 means 12.125 seconds from the start of
|
116
|
+
the video. The frames are sorted by the timestamp in ascending order.
|
117
|
+
"""
|
118
|
+
vr = VideoReader(video_uri)
|
119
|
+
orig_fps = vr.get_avg_fps()
|
120
|
+
if fps > orig_fps:
|
121
|
+
fps = orig_fps
|
122
|
+
|
123
|
+
s = orig_fps / fps
|
124
|
+
samples = [(int(i * s), int(i * s) / orig_fps) for i in range(int(len(vr) / s))]
|
125
|
+
frames = vr.get_batch([s[0] for s in samples]).asnumpy()
|
126
|
+
return [(frames[i, :, :, :], samples[i][1]) for i in range(len(samples))]
|
@@ -1,215 +0,0 @@
|
|
1
|
-
import base64
|
2
|
-
import logging
|
3
|
-
import math
|
4
|
-
import os
|
5
|
-
import tempfile
|
6
|
-
from concurrent.futures import ProcessPoolExecutor, as_completed
|
7
|
-
from typing import List, Tuple, cast
|
8
|
-
|
9
|
-
import cv2
|
10
|
-
import numpy as np
|
11
|
-
from moviepy.video.io.VideoFileClip import VideoFileClip
|
12
|
-
from tqdm import tqdm
|
13
|
-
|
14
|
-
_LOGGER = logging.getLogger(__name__)
|
15
|
-
# The maximum length of the clip to extract frames from, in seconds
|
16
|
-
_CLIP_LENGTH = 30.0
|
17
|
-
|
18
|
-
|
19
|
-
def play_video(video_base64: str) -> None:
|
20
|
-
"""Play a video file"""
|
21
|
-
video_data = base64.b64decode(video_base64)
|
22
|
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
|
23
|
-
temp_video.write(video_data)
|
24
|
-
temp_video_path = temp_video.name
|
25
|
-
|
26
|
-
cap = cv2.VideoCapture(temp_video_path)
|
27
|
-
if not cap.isOpened():
|
28
|
-
_LOGGER.error("Error: Could not open video.")
|
29
|
-
return
|
30
|
-
|
31
|
-
# Display the first frame and wait for any key press to start the video
|
32
|
-
ret, frame = cap.read()
|
33
|
-
if ret:
|
34
|
-
cv2.imshow("Video Player", frame)
|
35
|
-
_LOGGER.info(f"Press any key to start playing the video: {temp_video_path}")
|
36
|
-
cv2.waitKey(0) # Wait for any key press
|
37
|
-
|
38
|
-
while cap.isOpened():
|
39
|
-
ret, frame = cap.read()
|
40
|
-
if not ret:
|
41
|
-
break
|
42
|
-
cv2.imshow("Video Player", frame)
|
43
|
-
# Press 'q' to exit the video
|
44
|
-
if cv2.waitKey(200) & 0xFF == ord("q"):
|
45
|
-
break
|
46
|
-
cap.release()
|
47
|
-
cv2.destroyAllWindows()
|
48
|
-
|
49
|
-
|
50
|
-
def extract_frames_from_video(
|
51
|
-
video_uri: str, fps: float = 0.5, motion_detection_threshold: float = 0.0
|
52
|
-
) -> List[Tuple[np.ndarray, float]]:
|
53
|
-
"""Extract frames from a video
|
54
|
-
|
55
|
-
Parameters:
|
56
|
-
video_uri: the path to the video file or a video file url
|
57
|
-
fps: the frame rate per second to extract the frames
|
58
|
-
motion_detection_threshold: The threshold to detect motion between
|
59
|
-
changes/frames. A value between 0-1, which represents the percentage change
|
60
|
-
required for the frames to be considered in motion. For example, a lower
|
61
|
-
value means more frames will be extracted. A non-positive value will disable
|
62
|
-
motion detection and extract all frames.
|
63
|
-
|
64
|
-
Returns:
|
65
|
-
a list of tuples containing the extracted frame and the timestamp in seconds.
|
66
|
-
E.g. [(frame1, 0.0), (frame2, 0.5), ...]. The timestamp is the time in seconds
|
67
|
-
from the start of the video. E.g. 12.125 means 12.125 seconds from the start of
|
68
|
-
the video. The frames are sorted by the timestamp in ascending order.
|
69
|
-
"""
|
70
|
-
with VideoFileClip(video_uri) as video:
|
71
|
-
video_duration: float = video.duration
|
72
|
-
num_workers = os.cpu_count()
|
73
|
-
clip_length: float = min(video_duration, _CLIP_LENGTH)
|
74
|
-
start_times = list(range(0, math.ceil(video_duration), math.ceil(clip_length)))
|
75
|
-
assert start_times, f"No frames to extract from the input video: {video_uri}"
|
76
|
-
segment_args = [
|
77
|
-
{
|
78
|
-
"video_uri": video_uri,
|
79
|
-
"start": start,
|
80
|
-
"end": (
|
81
|
-
start + clip_length if i < len(start_times) - 1 else video_duration
|
82
|
-
),
|
83
|
-
"fps": fps,
|
84
|
-
"motion_detection_threshold": motion_detection_threshold,
|
85
|
-
}
|
86
|
-
for i, start in enumerate(start_times)
|
87
|
-
]
|
88
|
-
if (
|
89
|
-
cast(float, segment_args[-1]["end"])
|
90
|
-
- cast(float, segment_args[-1]["start"])
|
91
|
-
< 1
|
92
|
-
):
|
93
|
-
# If the last segment is less than 1s, merge it with the previous segment
|
94
|
-
# This is to avoid the failure of the last segment extraction
|
95
|
-
assert (
|
96
|
-
len(segment_args) > 1
|
97
|
-
), "Development bug - Expect at least 2 segments."
|
98
|
-
segment_args[-2]["end"] = video_duration
|
99
|
-
segment_args.pop(-1)
|
100
|
-
_LOGGER.info(
|
101
|
-
f"""Created {len(segment_args)} segments from the input video {video_uri} of length {video.duration}s, with clip size: {clip_length}s and {num_workers} workers.
|
102
|
-
Segments: {segment_args}
|
103
|
-
"""
|
104
|
-
)
|
105
|
-
frames = []
|
106
|
-
with tqdm(total=len(segment_args)) as pbar:
|
107
|
-
with ProcessPoolExecutor(max_workers=num_workers) as executor:
|
108
|
-
futures = [
|
109
|
-
executor.submit(_extract_frames_by_clip, **kwargs) # type: ignore
|
110
|
-
for kwargs in segment_args
|
111
|
-
]
|
112
|
-
for future in as_completed(futures):
|
113
|
-
result = future.result()
|
114
|
-
frames.extend(result)
|
115
|
-
pbar.update(1)
|
116
|
-
frames.sort(key=lambda x: x[1])
|
117
|
-
_LOGGER.info(f"Extracted {len(frames)} frames from video {video_uri}")
|
118
|
-
return frames
|
119
|
-
|
120
|
-
|
121
|
-
def _extract_frames_by_clip(
|
122
|
-
video_uri: str,
|
123
|
-
start: int = 0,
|
124
|
-
end: float = -1,
|
125
|
-
fps: int = 2,
|
126
|
-
motion_detection_threshold: float = 0.06,
|
127
|
-
) -> List[Tuple[np.ndarray, float]]:
|
128
|
-
"""Extract frames from a video clip with start and end time in seconds.
|
129
|
-
|
130
|
-
Parameters:
|
131
|
-
video_uri: the path to the video file or a video file url
|
132
|
-
start: the start time (in seconds) of the clip to extract
|
133
|
-
end: the end time (in seconds, up to millisecond level precision) of the clip to extract, if -1, extract the whole video
|
134
|
-
fps: the frame rate to extract the frames
|
135
|
-
motion_detection_threshold: the threshold to detect the motion between frames
|
136
|
-
"""
|
137
|
-
with VideoFileClip(video_uri) as video:
|
138
|
-
source_fps = video.fps
|
139
|
-
if end <= 0:
|
140
|
-
end = video.duration
|
141
|
-
_LOGGER.info(
|
142
|
-
f"Extracting frames from video {video_uri} ({video.duration}s) with start={start}s and end={end}s"
|
143
|
-
)
|
144
|
-
clip = video.subclip(start, end)
|
145
|
-
processable_frames = int(clip.duration * fps)
|
146
|
-
_LOGGER.info(
|
147
|
-
f"Extracting frames from video clip of length {clip.duration}s with FPS={fps} and start_time={start}s. Total number of frames in clip: {processable_frames}"
|
148
|
-
)
|
149
|
-
frames = []
|
150
|
-
total_count, skipped_count = 0, 0
|
151
|
-
prev_processed_frame = None
|
152
|
-
pbar = tqdm(
|
153
|
-
total=processable_frames, desc=f"Extracting frames from clip {start}-{end}"
|
154
|
-
)
|
155
|
-
for i, frame in enumerate(clip.iter_frames(fps=fps, dtype="uint8")):
|
156
|
-
total_count += 1
|
157
|
-
pbar.update(1)
|
158
|
-
if motion_detection_threshold > 0:
|
159
|
-
curr_processed_frame = _preprocess_frame(frame)
|
160
|
-
# Skip the frame if it is similar to the previous one
|
161
|
-
if prev_processed_frame is not None and _similar_frame(
|
162
|
-
prev_processed_frame,
|
163
|
-
curr_processed_frame,
|
164
|
-
threshold=motion_detection_threshold,
|
165
|
-
):
|
166
|
-
skipped_count += 1
|
167
|
-
continue
|
168
|
-
prev_processed_frame = curr_processed_frame
|
169
|
-
ts = round(clip.reader.pos / source_fps, 3)
|
170
|
-
frames.append((frame, ts))
|
171
|
-
|
172
|
-
_LOGGER.info(
|
173
|
-
f"""Finished!
|
174
|
-
Frames extracted: {len(frames)}
|
175
|
-
Extracted frame timestamp: {[f[1] for f in frames]}
|
176
|
-
Total processed frames: {total_count}
|
177
|
-
Skipped frames: {skipped_count}
|
178
|
-
Scan FPS: {fps}
|
179
|
-
Clip start time: {start}s, {clip.pos}
|
180
|
-
Clip end time: {end}s
|
181
|
-
Clip duration: {clip.duration}s
|
182
|
-
Clip total frames: {clip.duration * source_fps}
|
183
|
-
Video duration: {video.duration}s
|
184
|
-
Video FPS: {video.fps}
|
185
|
-
Video total frames: {video.reader.nframes}"""
|
186
|
-
)
|
187
|
-
return frames
|
188
|
-
|
189
|
-
|
190
|
-
def _preprocess_frame(frame: np.ndarray) -> np.ndarray:
|
191
|
-
# Convert to grayscale
|
192
|
-
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
|
193
|
-
frame = cv2.GaussianBlur(src=frame, ksize=(5, 5), sigmaX=0)
|
194
|
-
return frame
|
195
|
-
|
196
|
-
|
197
|
-
def _similar_frame(
|
198
|
-
prev_frame: np.ndarray, curr_frame: np.ndarray, threshold: float
|
199
|
-
) -> bool:
|
200
|
-
"""Detect two frames are similar or not
|
201
|
-
|
202
|
-
Parameters:
|
203
|
-
threshold: similarity threshold, a value between 0-1, the percentage change that is considered a different frame.
|
204
|
-
"""
|
205
|
-
# calculate difference and update previous frame TODO: don't assume the processed image is cached
|
206
|
-
diff_frame = cv2.absdiff(src1=prev_frame, src2=curr_frame)
|
207
|
-
# Only take different areas that are different enough (>20 / 255)
|
208
|
-
thresh_frame = cv2.threshold(
|
209
|
-
src=diff_frame, thresh=20, maxval=255, type=cv2.THRESH_BINARY
|
210
|
-
)[1]
|
211
|
-
change_percentage = cv2.countNonZero(thresh_frame) / (
|
212
|
-
curr_frame.shape[0] * curr_frame.shape[1]
|
213
|
-
)
|
214
|
-
_LOGGER.debug(f"Image diff: {change_percentage}")
|
215
|
-
return change_percentage < threshold
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{vision_agent-0.2.125 → vision_agent-0.2.127}/vision_agent/agent/vision_agent_coder_prompts.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|