vision-agent 0.2.124__py3-none-any.whl → 0.2.126__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -173,7 +173,7 @@ def pick_plan(
173
173
 
174
174
  if verbosity == 2:
175
175
  _print_code("Initial code and tests:", code)
176
- _LOGGER.info(f"Initial code execution result:\n{tool_output.text()}")
176
+ _LOGGER.info(f"Initial code execution result:\n{tool_output_str}")
177
177
 
178
178
  log_progress(
179
179
  {
@@ -12,7 +12,6 @@ from uuid import UUID
12
12
  import cv2
13
13
  import numpy as np
14
14
  import requests
15
- from moviepy.editor import ImageSequenceClip
16
15
  from PIL import Image, ImageDraw, ImageEnhance, ImageFont
17
16
  from pillow_heif import register_heif_opener # type: ignore
18
17
  from pytube import YouTube # type: ignore
@@ -35,7 +34,6 @@ from vision_agent.tools.tools_types import (
35
34
  ODResponseData,
36
35
  PromptTask,
37
36
  )
38
- from vision_agent.utils import extract_frames_from_video
39
37
  from vision_agent.utils.exceptions import FineTuneModelIsNotReady
40
38
  from vision_agent.utils.execute import FileSerializer, MimeType
41
39
  from vision_agent.utils.image_utils import (
@@ -44,13 +42,17 @@ from vision_agent.utils.image_utils import (
44
42
  convert_to_b64,
45
43
  denormalize_bbox,
46
44
  encode_image_bytes,
47
- frames_to_bytes,
48
45
  get_image_size,
49
46
  normalize_bbox,
50
47
  numpy_to_bytes,
51
48
  rle_decode,
52
49
  rle_decode_array,
53
50
  )
51
+ from vision_agent.utils.video import (
52
+ extract_frames_from_video,
53
+ frames_to_bytes,
54
+ video_writer,
55
+ )
54
56
 
55
57
  register_heif_opener()
56
58
 
@@ -1513,17 +1515,14 @@ def save_video(
1513
1515
  "/tmp/tmpvideo123.mp4"
1514
1516
  """
1515
1517
  if fps <= 0:
1516
- _LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
1517
- fps = 4
1518
- with ImageSequenceClip(frames, fps=fps) as video:
1519
- if output_video_path:
1520
- f = open(output_video_path, "wb")
1521
- else:
1522
- f = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) # type: ignore
1523
- video.write_videofile(f.name, codec="libx264")
1524
- f.close()
1525
- _save_video_to_result(f.name)
1526
- return f.name
1518
+ raise ValueError(f"fps must be greater than 0 got {fps}")
1519
+
1520
+ if output_video_path is None:
1521
+ output_video_path = tempfile.NamedTemporaryFile(delete=False).name
1522
+
1523
+ output_video_path = video_writer(frames, fps, output_video_path)
1524
+ _save_video_to_result(output_video_path)
1525
+ return output_video_path
1527
1526
 
1528
1527
 
1529
1528
  def _save_video_to_result(video_uri: str) -> None:
@@ -1820,7 +1819,6 @@ def overlay_counting_results(
1820
1819
 
1821
1820
  FUNCTION_TOOLS = [
1822
1821
  owl_v2,
1823
- extract_frames,
1824
1822
  ocr,
1825
1823
  clip,
1826
1824
  vit_image_classification,
@@ -1841,6 +1839,7 @@ FUNCTION_TOOLS = [
1841
1839
  ]
1842
1840
 
1843
1841
  UTIL_TOOLS = [
1842
+ extract_frames,
1844
1843
  save_json,
1845
1844
  load_image,
1846
1845
  save_image,
@@ -1856,7 +1855,7 @@ TOOLS = FUNCTION_TOOLS + UTIL_TOOLS
1856
1855
  TOOLS_DF = get_tools_df(TOOLS) # type: ignore
1857
1856
  TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS) # type: ignore
1858
1857
  TOOL_DOCSTRING = get_tool_documentation(TOOLS) # type: ignore
1859
- TOOLS_INFO = get_tools_info(TOOLS) # type: ignore
1858
+ TOOLS_INFO = get_tools_info(FUNCTION_TOOLS) # type: ignore
1860
1859
  UTILITIES_DOCSTRING = get_tool_documentation(
1861
1860
  [
1862
1861
  save_json,
@@ -7,4 +7,4 @@ from .execute import (
7
7
  Result,
8
8
  )
9
9
  from .sim import AzureSim, OllamaSim, Sim, load_sim, merge_sim
10
- from .video import extract_frames_from_video
10
+ from .video import extract_frames_from_video, video_writer
@@ -2,14 +2,12 @@
2
2
 
3
3
  import base64
4
4
  import io
5
- import tempfile
6
5
  from importlib import resources
7
6
  from io import BytesIO
8
7
  from pathlib import Path
9
8
  from typing import Dict, List, Tuple, Union
10
9
 
11
10
  import numpy as np
12
- from moviepy.editor import ImageSequenceClip
13
11
  from PIL import Image, ImageDraw, ImageFont
14
12
  from PIL.Image import Image as ImageType
15
13
 
@@ -90,24 +88,6 @@ def rle_decode_array(rle: Dict[str, List[int]]) -> np.ndarray:
90
88
  return binary_mask
91
89
 
92
90
 
93
- def frames_to_bytes(
94
- frames: List[np.ndarray], fps: float = 10, file_ext: str = "mp4"
95
- ) -> bytes:
96
- r"""Convert a list of frames to a video file encoded into a byte string.
97
-
98
- Parameters:
99
- frames: the list of frames
100
- fps: the frames per second of the video
101
- file_ext: the file extension of the video file
102
- """
103
- with tempfile.NamedTemporaryFile(delete=True) as temp_file:
104
- clip = ImageSequenceClip(frames, fps=fps)
105
- clip.write_videofile(temp_file.name + f".{file_ext}", fps=fps, codec="libx264")
106
- with open(temp_file.name + f".{file_ext}", "rb") as f:
107
- buffer_bytes = f.read()
108
- return buffer_bytes
109
-
110
-
111
91
  def b64_to_pil(b64_str: str) -> ImageType:
112
92
  r"""Convert a base64 string to a PIL Image.
113
93
 
@@ -1,19 +1,15 @@
1
1
  import base64
2
2
  import logging
3
- import math
4
- import os
5
3
  import tempfile
6
- from concurrent.futures import ProcessPoolExecutor, as_completed
7
- from typing import List, Tuple, cast
4
+ from functools import lru_cache
5
+ from typing import List, Optional, Tuple
8
6
 
9
7
  import cv2
10
8
  import numpy as np
11
- from moviepy.video.io.VideoFileClip import VideoFileClip
12
- from tqdm import tqdm
9
+ from decord import VideoReader # type: ignore
13
10
 
14
11
  _LOGGER = logging.getLogger(__name__)
15
12
  # The maximum length of the clip to extract frames from, in seconds
16
- _CLIP_LENGTH = 30.0
17
13
 
18
14
 
19
15
  def play_video(video_base64: str) -> None:
@@ -47,19 +43,52 @@ def play_video(video_base64: str) -> None:
47
43
  cv2.destroyAllWindows()
48
44
 
49
45
 
46
+ def video_writer(
47
+ frames: List[np.ndarray], fps: float = 1.0, filename: Optional[str] = None
48
+ ) -> str:
49
+ if filename is None:
50
+ filename = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
51
+
52
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v") # type: ignore
53
+ height, width = frames[0].shape[:2]
54
+ writer = cv2.VideoWriter(filename, fourcc, fps, (width, height))
55
+ for frame in frames:
56
+ writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
57
+ writer.release()
58
+ return filename
59
+
60
+
61
+ def frames_to_bytes(
62
+ frames: List[np.ndarray], fps: float = 10, file_ext: str = ".mp4"
63
+ ) -> bytes:
64
+ r"""Convert a list of frames to a video file encoded into a byte string.
65
+
66
+ Parameters:
67
+ frames: the list of frames
68
+ fps: the frames per second of the video
69
+ file_ext: the file extension of the video file
70
+ """
71
+ with tempfile.NamedTemporaryFile(delete=True, suffix=file_ext) as temp_file:
72
+ video_writer(frames, fps, temp_file.name)
73
+
74
+ with open(temp_file.name, "rb") as f:
75
+ buffer_bytes = f.read()
76
+ return buffer_bytes
77
+
78
+
79
+ # WARNING: this cache is a little dangerous because if the underlying video
80
+ # contents change but the filename remains the same it will return the old file contents
81
+ # but for vision agent it's unlikely to change the file contents while keeping the
82
+ # same file name and the time savings are very large.
83
+ @lru_cache(maxsize=8)
50
84
  def extract_frames_from_video(
51
- video_uri: str, fps: float = 0.5, motion_detection_threshold: float = 0.0
85
+ video_uri: str, fps: float = 1.0
52
86
  ) -> List[Tuple[np.ndarray, float]]:
53
87
  """Extract frames from a video
54
88
 
55
89
  Parameters:
56
- video_uri: the path to the video file or a video file url
57
- fps: the frame rate per second to extract the frames
58
- motion_detection_threshold: The threshold to detect motion between
59
- changes/frames. A value between 0-1, which represents the percentage change
60
- required for the frames to be considered in motion. For example, a lower
61
- value means more frames will be extracted. A non-positive value will disable
62
- motion detection and extract all frames.
90
+ video_uri (str): the path to the video file or a video file url
91
+ fps (float): the frame rate per second to extract the frames
63
92
 
64
93
  Returns:
65
94
  a list of tuples containing the extracted frame and the timestamp in seconds.
@@ -67,149 +96,12 @@ def extract_frames_from_video(
67
96
  from the start of the video. E.g. 12.125 means 12.125 seconds from the start of
68
97
  the video. The frames are sorted by the timestamp in ascending order.
69
98
  """
70
- with VideoFileClip(video_uri) as video:
71
- video_duration: float = video.duration
72
- num_workers = os.cpu_count()
73
- clip_length: float = min(video_duration, _CLIP_LENGTH)
74
- start_times = list(range(0, math.ceil(video_duration), math.ceil(clip_length)))
75
- assert start_times, f"No frames to extract from the input video: {video_uri}"
76
- segment_args = [
77
- {
78
- "video_uri": video_uri,
79
- "start": start,
80
- "end": (
81
- start + clip_length if i < len(start_times) - 1 else video_duration
82
- ),
83
- "fps": fps,
84
- "motion_detection_threshold": motion_detection_threshold,
85
- }
86
- for i, start in enumerate(start_times)
87
- ]
88
- if (
89
- cast(float, segment_args[-1]["end"])
90
- - cast(float, segment_args[-1]["start"])
91
- < 1
92
- ):
93
- # If the last segment is less than 1s, merge it with the previous segment
94
- # This is to avoid the failure of the last segment extraction
95
- assert (
96
- len(segment_args) > 1
97
- ), "Development bug - Expect at least 2 segments."
98
- segment_args[-2]["end"] = video_duration
99
- segment_args.pop(-1)
100
- _LOGGER.info(
101
- f"""Created {len(segment_args)} segments from the input video {video_uri} of length {video.duration}s, with clip size: {clip_length}s and {num_workers} workers.
102
- Segments: {segment_args}
103
- """
104
- )
105
- frames = []
106
- with tqdm(total=len(segment_args)) as pbar:
107
- with ProcessPoolExecutor(max_workers=num_workers) as executor:
108
- futures = [
109
- executor.submit(_extract_frames_by_clip, **kwargs) # type: ignore
110
- for kwargs in segment_args
111
- ]
112
- for future in as_completed(futures):
113
- result = future.result()
114
- frames.extend(result)
115
- pbar.update(1)
116
- frames.sort(key=lambda x: x[1])
117
- _LOGGER.info(f"Extracted {len(frames)} frames from video {video_uri}")
118
- return frames
119
-
120
-
121
- def _extract_frames_by_clip(
122
- video_uri: str,
123
- start: int = 0,
124
- end: float = -1,
125
- fps: int = 2,
126
- motion_detection_threshold: float = 0.06,
127
- ) -> List[Tuple[np.ndarray, float]]:
128
- """Extract frames from a video clip with start and end time in seconds.
129
-
130
- Parameters:
131
- video_uri: the path to the video file or a video file url
132
- start: the start time (in seconds) of the clip to extract
133
- end: the end time (in seconds, up to millisecond level precision) of the clip to extract, if -1, extract the whole video
134
- fps: the frame rate to extract the frames
135
- motion_detection_threshold: the threshold to detect the motion between frames
136
- """
137
- with VideoFileClip(video_uri) as video:
138
- source_fps = video.fps
139
- if end <= 0:
140
- end = video.duration
141
- _LOGGER.info(
142
- f"Extracting frames from video {video_uri} ({video.duration}s) with start={start}s and end={end}s"
143
- )
144
- clip = video.subclip(start, end)
145
- processable_frames = int(clip.duration * fps)
146
- _LOGGER.info(
147
- f"Extracting frames from video clip of length {clip.duration}s with FPS={fps} and start_time={start}s. Total number of frames in clip: {processable_frames}"
148
- )
149
- frames = []
150
- total_count, skipped_count = 0, 0
151
- prev_processed_frame = None
152
- pbar = tqdm(
153
- total=processable_frames, desc=f"Extracting frames from clip {start}-{end}"
154
- )
155
- for i, frame in enumerate(clip.iter_frames(fps=fps, dtype="uint8")):
156
- total_count += 1
157
- pbar.update(1)
158
- if motion_detection_threshold > 0:
159
- curr_processed_frame = _preprocess_frame(frame)
160
- # Skip the frame if it is similar to the previous one
161
- if prev_processed_frame is not None and _similar_frame(
162
- prev_processed_frame,
163
- curr_processed_frame,
164
- threshold=motion_detection_threshold,
165
- ):
166
- skipped_count += 1
167
- continue
168
- prev_processed_frame = curr_processed_frame
169
- ts = round(clip.reader.pos / source_fps, 3)
170
- frames.append((frame, ts))
171
-
172
- _LOGGER.info(
173
- f"""Finished!
174
- Frames extracted: {len(frames)}
175
- Extracted frame timestamp: {[f[1] for f in frames]}
176
- Total processed frames: {total_count}
177
- Skipped frames: {skipped_count}
178
- Scan FPS: {fps}
179
- Clip start time: {start}s, {clip.pos}
180
- Clip end time: {end}s
181
- Clip duration: {clip.duration}s
182
- Clip total frames: {clip.duration * source_fps}
183
- Video duration: {video.duration}s
184
- Video FPS: {video.fps}
185
- Video total frames: {video.reader.nframes}"""
186
- )
187
- return frames
188
-
189
-
190
- def _preprocess_frame(frame: np.ndarray) -> np.ndarray:
191
- # Convert to grayscale
192
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
193
- frame = cv2.GaussianBlur(src=frame, ksize=(5, 5), sigmaX=0)
194
- return frame
195
-
196
-
197
- def _similar_frame(
198
- prev_frame: np.ndarray, curr_frame: np.ndarray, threshold: float
199
- ) -> bool:
200
- """Detect two frames are similar or not
201
-
202
- Parameters:
203
- threshold: similarity threshold, a value between 0-1, the percentage change that is considered a different frame.
204
- """
205
- # calculate difference and update previous frame TODO: don't assume the processed image is cached
206
- diff_frame = cv2.absdiff(src1=prev_frame, src2=curr_frame)
207
- # Only take different areas that are different enough (>20 / 255)
208
- thresh_frame = cv2.threshold(
209
- src=diff_frame, thresh=20, maxval=255, type=cv2.THRESH_BINARY
210
- )[1]
211
- change_percentage = cv2.countNonZero(thresh_frame) / (
212
- curr_frame.shape[0] * curr_frame.shape[1]
213
- )
214
- _LOGGER.debug(f"Image diff: {change_percentage}")
215
- return change_percentage < threshold
99
+ vr = VideoReader(video_uri)
100
+ orig_fps = vr.get_avg_fps()
101
+ if fps > orig_fps:
102
+ fps = orig_fps
103
+
104
+ s = orig_fps / fps
105
+ samples = [(int(i * s), int(i * s) / orig_fps) for i in range(int(len(vr) / s))]
106
+ frames = vr.get_batch([s[0] for s in samples]).asnumpy()
107
+ return [(frames[i, :, :, :], samples[i][1]) for i in range(len(samples))]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vision-agent
3
- Version: 0.2.124
3
+ Version: 0.2.126
4
4
  Summary: Toolset for Vision Agent
5
5
  Author: Landing AI
6
6
  Author-email: dev@landing.ai
@@ -12,9 +12,9 @@ Classifier: Programming Language :: Python :: 3.11
12
12
  Requires-Dist: anthropic (>=0.31.0,<0.32.0)
13
13
  Requires-Dist: e2b (>=0.17.2a50,<0.18.0)
14
14
  Requires-Dist: e2b-code-interpreter (==0.0.11a37)
15
+ Requires-Dist: eva-decord (>=0.6.1,<0.7.0)
15
16
  Requires-Dist: ipykernel (>=6.29.4,<7.0.0)
16
17
  Requires-Dist: langsmith (>=0.1.58,<0.2.0)
17
- Requires-Dist: moviepy (>=1.0.0,<2.0.0)
18
18
  Requires-Dist: nbclient (>=0.10.0,<0.11.0)
19
19
  Requires-Dist: nbformat (>=5.10.4,<6.0.0)
20
20
  Requires-Dist: numpy (>=1.21.0,<2.0.0)
@@ -3,7 +3,7 @@ vision_agent/agent/__init__.py,sha256=FRwiux1FGvGccetyUCtY46KP01fQteqorm-JtFepov
3
3
  vision_agent/agent/agent.py,sha256=2cjIOxEuSJrqbfPXYoV0qER5ihXsPFCoEFJa4jpqan0,597
4
4
  vision_agent/agent/agent_utils.py,sha256=22LiPhkJlS5mVeo2dIi259pc2NgA7PGHRpcbnrtKo78,1930
5
5
  vision_agent/agent/vision_agent.py,sha256=WM1_o0VAQokAKlDr-0lpFxCRwUm_eFfFNWP-wSNjo7s,11180
6
- vision_agent/agent/vision_agent_coder.py,sha256=ujctkpmQkX2C6YXjlp7VLZFqSB00xwkGe-9swA8Gv8s,34240
6
+ vision_agent/agent/vision_agent_coder.py,sha256=_2QQd_nTGojkk2ZOiMevVCY6-eUA9q1QdCWH7-Noq4w,34237
7
7
  vision_agent/agent/vision_agent_coder_prompts.py,sha256=Rg7-Ih7oFgFbHFFno0EHpaZEgm0SYj_nTdqqdp21YLo,11246
8
8
  vision_agent/agent/vision_agent_prompts.py,sha256=K1nLo3XKQ-IqCom1TRwh3cMoGZNxNwEgZqf3uJ6eL18,7221
9
9
  vision_agent/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -18,16 +18,16 @@ vision_agent/tools/__init__.py,sha256=T8Hi5aHf4J2QJDoPRvu5fxbiqMpAY-1Gi2EFIhJbf3
18
18
  vision_agent/tools/meta_tools.py,sha256=KeGiw2OtY8ARpGbtWjoNAoO1dwevt7LbCupaJX61MkE,18929
19
19
  vision_agent/tools/prompts.py,sha256=V1z4YJLXZuUl_iZ5rY0M5hHc_2tmMEUKr0WocXKGt4E,1430
20
20
  vision_agent/tools/tool_utils.py,sha256=62NVlojPMf9MuJ-3yJEcrB3mzmOxN2HrNQzzjVa-FZg,7527
21
- vision_agent/tools/tools.py,sha256=xT-lDC3NCdltK0_CDTOOiU8B2YhlIdzFhuSbvRVFBI8,65545
21
+ vision_agent/tools/tools.py,sha256=Y6BTLFoueLtjId2qG06UyZwCQA_TTA6uFxPkxzhRI50,65396
22
22
  vision_agent/tools/tools_types.py,sha256=rLpCUODPY0yI65SLOTJOxfHFfqWM3WjOq-AYX25Chjk,2356
23
- vision_agent/utils/__init__.py,sha256=pWk0ktvR4aUEhuEIzSLM9kSgW4WDVqptdvOTeGLkJ6M,230
23
+ vision_agent/utils/__init__.py,sha256=7fMgbZiEwbNS0fBOS_hJI5PuEYBblw36zLi_UjUzvj4,244
24
24
  vision_agent/utils/exceptions.py,sha256=booSPSuoULF7OXRr_YbC4dtKt6gM_HyiFQHBuaW86C4,2052
25
25
  vision_agent/utils/execute.py,sha256=gc4R_0BKUrZyhiKvIxOpYuzQPYVWQEqxr3ANy1lJAw4,27037
26
- vision_agent/utils/image_utils.py,sha256=lhdvRWMbQmMMLTmJGI1dFjzNeQSLfPYJEsAkq5Ydj3Y,11476
26
+ vision_agent/utils/image_utils.py,sha256=zTTOJFOieMzwIquTFnW7T6ssx9o6XfoZ0Unqyk7GJrg,10746
27
27
  vision_agent/utils/sim.py,sha256=ebE9Cs00pVEDI1HMjAzUBk88tQQmc2U-yAzIDinnekU,5572
28
28
  vision_agent/utils/type_defs.py,sha256=BE12s3JNQy36QvauXHjwyeffVh5enfcvd4vTzSwvEZI,1384
29
- vision_agent/utils/video.py,sha256=rNmU9KEIkZB5-EztZNlUiKYN0mm_55A_2VGUM0QpqLA,8779
30
- vision_agent-0.2.124.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
- vision_agent-0.2.124.dist-info/METADATA,sha256=mDvhJytcxFZW_B18Vkn4egk4HJ8UHYl6YQhEJHQAbPk,12255
32
- vision_agent-0.2.124.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
33
- vision_agent-0.2.124.dist-info/RECORD,,
29
+ vision_agent/utils/video.py,sha256=oM3sdQVGGI3xwrCN2GKt9otzDb0SPW-JUo5SABxTVl4,3847
30
+ vision_agent-0.2.126.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
+ vision_agent-0.2.126.dist-info/METADATA,sha256=4O_OuQh5yhJ8unzNtfU4E_0RNykXxkbdjkiGPAXi9Ek,12258
32
+ vision_agent-0.2.126.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
33
+ vision_agent-0.2.126.dist-info/RECORD,,