vision-agent 0.0.37__py3-none-any.whl → 0.0.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,6 +11,7 @@ from PIL import Image
11
11
  from PIL.Image import Image as ImageType
12
12
 
13
13
  from vision_agent.image_utils import convert_to_b64, get_image_size
14
+ from vision_agent.tools.video import extract_frames_from_video
14
15
 
15
16
  _LOGGER = logging.getLogger(__name__)
16
17
 
@@ -505,6 +506,47 @@ class Divide(Tool):
505
506
  return round(input[0] / input[1], 2)
506
507
 
507
508
 
509
class ExtractFrames(Tool):
    r"""Tool that turns a video into a list of still frames stored on disk.

    The frames themselves come from ``extract_frames_from_video``; this tool
    writes each one to a temporary ``.jpg`` file so that downstream tools can
    consume plain file paths.
    """

    name = "extract_frames_"
    description = "'extract_frames_' extract image frames from the input video, return a list of tuple (frame, timestamp), where the timestamp is the relative time in seconds of the frame occurred in the video, the frame is a local image file path that stores the frame."
    usage = {
        "required_parameters": [{"name": "video_uri", "type": "str"}],
        "examples": [
            {
                "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4",
                "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"},
            },
            {
                "scenario": "Can you extract the images from this video file? Video path: tests/data/test.mp4",
                "parameters": {"video_uri": "tests/data/test.mp4"},
            },
        ],
    }

    def __call__(self, video_uri: str) -> list[tuple[str, float]]:
        """Extract frames from a video and persist them as temporary images.

        Parameters:
            video_uri: the path to the video file or a url that points to the video data

        Returns:
            a list of ``(path_to_frame, timestamp)`` tuples, in the order returned by
            ``extract_frames_from_video``. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...].
            The timestamp is the time in seconds from the start of the video.
            The image files are created with ``delete=False``, so they are not removed
            automatically; cleaning them up is left to the caller.
        """
        extracted = extract_frames_from_video(video_uri)
        saved_frames = []
        _LOGGER.info(
            f"Extracted {len(extracted)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks."
        )
        for image_array, timestamp in extracted:
            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as handle:
                Image.fromarray(image_array).save(handle)
            # The file name stays valid after the handle is closed (delete=False).
            saved_frames.append((handle.name, timestamp))
        return saved_frames
548
+
549
+
508
550
  TOOLS = {
509
551
  i: {"name": c.name, "description": c.description, "usage": c.usage, "class": c}
510
552
  for i, c in enumerate(
@@ -520,6 +562,7 @@ TOOLS = {
520
562
  Subtract,
521
563
  Multiply,
522
564
  Divide,
565
+ ExtractFrames,
523
566
  ]
524
567
  )
525
568
  if (hasattr(c, "name") and hasattr(c, "description") and hasattr(c, "usage"))
@@ -0,0 +1,176 @@
1
+ import logging
2
+ import math
3
+ import os
4
+ from concurrent.futures import ProcessPoolExecutor, as_completed
5
+ from typing import List, Tuple, cast
6
+
7
+ import cv2
8
+ import numpy as np
9
+ from moviepy.video.io.VideoFileClip import VideoFileClip
10
+ from tqdm import tqdm
11
+
12
+ _LOGGER = logging.getLogger(__name__)
13
+ # The maximum length of the clip to extract frames from, in seconds
14
+ _CLIP_LENGTH = 30.0
15
+
16
+
17
def extract_frames_from_video(
    video_uri: str, fps: int = 2, motion_detection_threshold: float = 0.06
) -> List[Tuple[np.ndarray, float]]:
    """Extract frames from a video, processing it in parallel segments.

    The video is split into segments of at most ``_CLIP_LENGTH`` seconds. Each
    segment is handed to ``_extract_frames_by_clip`` in a worker process, and
    the per-segment results are merged and sorted by timestamp.

    Parameters:
        video_uri: the path to the video file or a video file url
        fps: the frame rate per second to extract the frames
        motion_detection_threshold: The threshold to detect motion between changes/frames.
            A value between 0-1, which represents the percentage change required for the frames to be considered in motion.
            For example, a lower value means more frames will be extracted.

    Returns:
        a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(frame1, 0.0), (frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order.

    Raises:
        ValueError: if the video reports no (or a non-positive) duration.
    """
    # Only the duration is needed in the parent process; every worker opens its
    # own reader, so release this one before the pool is started.
    with VideoFileClip(video_uri) as video:
        video_duration: float = video.duration
    if not video_duration or video_duration <= 0:
        # Explicit check instead of `assert` (stripped under -O) and instead of
        # letting range() fail on a zero step further below.
        raise ValueError(f"No frames to extract from the input video: {video_uri}")

    num_workers = os.cpu_count()
    clip_length: float = min(video_duration, _CLIP_LENGTH)
    # video_duration > 0 guarantees a step >= 1 and at least one start time.
    start_times = list(range(0, math.ceil(video_duration), math.ceil(clip_length)))
    last_index = len(start_times) - 1
    segment_args = [
        {
            "video_uri": video_uri,
            "start": start,
            # The final segment always runs to the exact end of the video.
            "end": start + clip_length if i < last_index else video_duration,
            "fps": fps,
            "motion_detection_threshold": motion_detection_threshold,
        }
        for i, start in enumerate(start_times)
    ]
    last_segment_length = cast(float, segment_args[-1]["end"]) - cast(
        float, segment_args[-1]["start"]
    )
    if len(segment_args) > 1 and last_segment_length < 1:
        # If the last segment is less than 1s, merge it with the previous segment
        # This is to avoid the failure of the last segment extraction.
        # A video that is itself shorter than 1s has nothing to merge into and is
        # processed as a single segment.
        segment_args[-2]["end"] = video_duration
        segment_args.pop()
    _LOGGER.info(
        f"""Created {len(segment_args)} segments from the input video {video_uri} of length {video_duration}s, with clip size: {clip_length}s and {num_workers} workers.
        Segments: {segment_args}
        """
    )
    frames: List[Tuple[np.ndarray, float]] = []
    with tqdm(total=len(segment_args)) as pbar:
        with ProcessPoolExecutor(max_workers=num_workers) as executor:
            futures = [
                executor.submit(_extract_frames_by_clip, **kwargs)  # type: ignore
                for kwargs in segment_args
            ]
            # Segments finish in arbitrary order; the final sort restores
            # chronological order.
            for future in as_completed(futures):
                frames.extend(future.result())
                pbar.update(1)
    frames.sort(key=lambda x: x[1])
    _LOGGER.info(f"Extracted {len(frames)} frames from video {video_uri}")
    return frames
81
+
82
+
83
def _extract_frames_by_clip(
    video_uri: str,
    start: int = 0,
    end: float = -1,
    fps: int = 2,
    motion_detection_threshold: float = 0.06,
) -> List[Tuple[np.ndarray, float]]:
    """Extract frames from a video clip with start and end time in seconds.

    Frames are sampled at ``fps``. A sampled frame is dropped when it is
    considered similar (see ``_similar_frame``) to the most recently kept frame.

    Parameters:
        video_uri: the path to the video file or a video file url
        start: the start time (in seconds) of the clip to extract
        end: the end time (in seconds, up to millisecond level precision) of the clip to extract;
            any value <= 0 (default -1) means "until the end of the video"
        fps: the frame rate to extract the frames
        motion_detection_threshold: the threshold to detect the motion between frames

    Returns:
        a list of (frame, timestamp) tuples for the frames that were kept, in the
        order they were read. The timestamp is rounded to 3 decimal places.
    """
    with VideoFileClip(video_uri) as video:
        source_fps = video.fps
        if end <= 0:
            end = video.duration
        _LOGGER.info(
            f"Extracting frames from video {video_uri} ({video.duration}s) with start={start}s and end={end}s"
        )
        clip = video.subclip(start, end)
        processable_frames = int(clip.duration * fps)
        _LOGGER.info(
            f"Extracting frames from video clip of length {clip.duration}s with FPS={fps} and start_time={start}s. Total number of frames in clip: {processable_frames}"
        )
        frames = []
        total_count, skipped_count = 0, 0
        prev_processed_frame = None
        # Use the progress bar as a context manager so it is always closed,
        # even if frame decoding raises part-way through the clip.
        with tqdm(
            total=processable_frames, desc=f"Extracting frames from clip {start}-{end}"
        ) as pbar:
            for frame in clip.iter_frames(fps=fps, dtype="uint8"):
                curr_processed_frame = _preprocess_frame(frame)
                total_count += 1
                pbar.update(1)
                # Skip the frame if it is similar to the previous one
                if prev_processed_frame is not None and _similar_frame(
                    prev_processed_frame,
                    curr_processed_frame,
                    threshold=motion_detection_threshold,
                ):
                    skipped_count += 1
                    continue
                # Only kept frames become the new comparison baseline.
                prev_processed_frame = curr_processed_frame
                # NOTE(review): assumes reader.pos is the index of the last frame
                # read from the source video, so this is a source-relative time
                # rather than a clip-relative one - confirm against moviepy.
                ts = round(clip.reader.pos / source_fps, 3)
                frames.append((frame, ts))

        _LOGGER.info(
            f"""Finished!
            Frames extracted: {len(frames)}
            Extracted frame timestamp: {[f[1] for f in frames]}
            Total processed frames: {total_count}
            Skipped frames: {skipped_count}
            Scan FPS: {fps}
            Clip start time: {start}s, {clip.pos}
            Clip end time: {end}s
            Clip duration: {clip.duration}s
            Clip total frames: {clip.duration * source_fps}
            Video duration: {video.duration}s
            Video FPS: {video.fps}
            Video total frames: {video.reader.nframes}"""
        )
        return frames
149
+
150
+
151
def _preprocess_frame(frame: np.ndarray) -> np.ndarray:
    """Reduce a colour frame to a blurred grayscale image for motion comparison.

    Parameters:
        frame: a colour frame as produced by moviepy's ``iter_frames``, i.e. with
            channels in RGB order.

    Returns:
        the single-channel grayscale frame after a 5x5 Gaussian blur.
    """
    # moviepy yields RGB frames, so convert with the RGB code; the BGR code would
    # swap the red and blue luminance weights.
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    # Blur to suppress pixel-level noise before frames are diffed.
    frame = cv2.GaussianBlur(src=frame, ksize=(5, 5), sigmaX=0)
    return frame
156
+
157
+
158
def _similar_frame(
    prev_frame: np.ndarray, curr_frame: np.ndarray, threshold: float
) -> bool:
    """Tell whether two preprocessed frames are similar.

    Parameters:
        prev_frame: the reference frame to compare against.
        curr_frame: the frame being tested; its first two dimensions define the
            pixel count used to normalise the amount of change.
        threshold: similarity threshold, a value between 0-1, the percentage change that is considered a different frame.

    Returns:
        True when the fraction of noticeably changed pixels is strictly below
        ``threshold``, False otherwise.
    """
    # Per-pixel absolute difference between the two frames.
    pixel_delta = cv2.absdiff(src1=prev_frame, src2=curr_frame)
    # Keep only pixels whose difference is large enough (>20 / 255).
    _, changed_mask = cv2.threshold(
        src=pixel_delta, thresh=20, maxval=255, type=cv2.THRESH_BINARY
    )
    rows, cols = curr_frame.shape[0], curr_frame.shape[1]
    changed_pixels = cv2.countNonZero(changed_mask)
    change_percentage = changed_pixels / (rows * cols)
    _LOGGER.debug(f"Image diff: {change_percentage}")
    return change_percentage < threshold
@@ -1,16 +1,19 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.0.37
3
+ Version: 0.0.39
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
7
- Requires-Python: >=3.10,<3.12
7
+ Requires-Python: >=3.9,<3.12
8
8
  Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.9
9
10
  Classifier: Programming Language :: Python :: 3.10
10
11
  Classifier: Programming Language :: Python :: 3.11
11
12
  Requires-Dist: faiss-cpu (>=1.0.0,<2.0.0)
13
+ Requires-Dist: moviepy (>=1.0.0,<2.0.0)
12
14
  Requires-Dist: numpy (>=1.21.0,<2.0.0)
13
15
  Requires-Dist: openai (>=1.0.0,<2.0.0)
16
+ Requires-Dist: opencv-python-headless (>=4.0.0,<5.0.0)
14
17
  Requires-Dist: pandas (>=2.0.0,<3.0.0)
15
18
  Requires-Dist: pillow (>=10.0.0,<11.0.0)
16
19
  Requires-Dist: requests (>=2.0.0,<3.0.0)
@@ -42,6 +45,11 @@ aims to provide an in-seconds experience by allowing users to describe their pro
42
45
  text and utilizing agent frameworks to solve the task for them. Check out our discord
43
46
  for updates and roadmaps!
44
47
 
48
+ ## Documentation
49
+
50
+ - [Vision Agent Library Docs](https://landing-ai.github.io/vision-agent/)
51
+
52
+
45
53
  ## Getting Started
46
54
  ### Installation
47
55
  To get started, you can install the library using pip:
@@ -124,6 +132,7 @@ you. For example:
124
132
  | Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. |
125
133
  | BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. |
126
134
  | SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. |
135
+ | ExtractFrames | ExtractFrames extracts image frames from the input video. |
127
136
 
128
137
 
129
138
  It also has a basic set of calculate tools such as add, subtract, multiply and divide.
@@ -18,8 +18,9 @@ vision_agent/lmm/__init__.py,sha256=I8mbeNUajTfWVNqLsuFQVOaNBDlkIhYp9DFU8H4kB7g,
18
18
  vision_agent/lmm/lmm.py,sha256=ARcbgkcyP83TbVVoXI9B-gtG0gJuTaG_MjcUGbams4U,8052
19
19
  vision_agent/tools/__init__.py,sha256=aX0pU3pXU1V0Cj9FzYCvdsX76TAglFMHx59kNhXHbPs,131
20
20
  vision_agent/tools/prompts.py,sha256=9RBbyqlNlExsGKlJ89Jkph83DAEJ8PCVGaHoNbyN7TM,1416
21
- vision_agent/tools/tools.py,sha256=Vlb8H9qm4rA5HxGw5p-gJES6jgPIkfrtVlM7jcxw7d8,19141
22
- vision_agent-0.0.37.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
23
- vision_agent-0.0.37.dist-info/METADATA,sha256=Y9oIfWbRK-3EuNewrwK4WOnpHY2ca7FB8jDa5oucT5Y,4966
24
- vision_agent-0.0.37.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
25
- vision_agent-0.0.37.dist-info/RECORD,,
21
+ vision_agent/tools/tools.py,sha256=2mmomPDbldXRpw3q5zAcazKJMjAGd0Jl9ak9JykHQYI,21211
22
+ vision_agent/tools/video.py,sha256=KV_Wcat7DDGxpHSaGBu7s4lj4crlYaUu4YKpCO_86k4,7440
23
+ vision_agent-0.0.39.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
24
+ vision_agent-0.0.39.dist-info/METADATA,sha256=_jugEQnOeNbLa3kSSo0zTn2bII3Rh5dfop9qyMWXPfw,5282
25
+ vision_agent-0.0.39.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
26
+ vision_agent-0.0.39.dist-info/RECORD,,