camel-ai 0.2.23a0__py3-none-any.whl → 0.2.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



@@ -0,0 +1,407 @@
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+
+ import os
+ import tempfile
+ from pathlib import Path
+ from typing import List, Optional
+
+ from PIL import Image
+
+ from camel.logger import get_logger
+ from camel.messages import BaseMessage
+ from camel.models import BaseModelBackend, OpenAIAudioModels
+ from camel.toolkits.base import BaseToolkit
+ from camel.toolkits.function_tool import FunctionTool
+ from camel.utils import dependencies_required
+
+ from .video_download_toolkit import (
+     VideoDownloaderToolkit,
+     _capture_screenshot,
+ )
+
+ logger = get_logger(__name__)
+
+ VIDEO_QA_PROMPT = """
+ Analyze the provided video frames and corresponding audio transcription to \
+ answer the given question(s) thoroughly and accurately.
+
+ Instructions:
+ 1. Visual Analysis:
+ - Examine the video frames to identify visible entities.
+ - Differentiate objects, species, or features based on key attributes \
+ such as size, color, shape, texture, or behavior.
+ - Note significant groupings, interactions, or contextual patterns \
+ relevant to the analysis.
+
+ 2. Audio Integration:
+ - Use the audio transcription to complement or clarify your visual \
+ observations.
+ - Identify names, descriptions, or contextual hints in the \
+ transcription that help confirm or refine your visual analysis.
+
+ 3. Detailed Reasoning and Justification:
+ - Provide a brief explanation of how you identified and distinguished \
+ each species or object.
+ - Highlight specific features or contextual clues that informed \
+ your reasoning.
+
+ 4. Comprehensive Answer:
+ - Specify the total number of distinct species or object types \
+ identified in the video.
+ - Describe the defining characteristics and any supporting evidence \
+ from the video and transcription.
+
+ 5. Important Considerations:
+ - Pay close attention to subtle differences that could distinguish \
+ similar-looking species or objects
+ (e.g., juveniles vs. adults, closely related species).
+ - Provide concise yet complete explanations to ensure clarity.
+
+ **Audio Transcription:**
+ {audio_transcription}
+
+ **Question:**
+ {question}
+ """
+
+
+ class VideoAnalysisToolkit(BaseToolkit):
+     r"""A class for analysing videos with vision-language model.
+
+     Args:
+         download_directory (Optional[str], optional): The directory where the
+             video will be downloaded to. If not provided, video will be stored
+             in a temporary directory and will be cleaned up after use.
+             (default: :obj:`None`)
+         model (Optional[BaseModelBackend], optional): The model to use for
+             visual analysis. (default: :obj:`None`)
+         use_audio_transcription (bool, optional): Whether to enable audio
+             transcription using OpenAI's audio models. Requires a valid OpenAI
+             API key. When disabled, video analysis will be based solely on
+             visual content. (default: :obj:`False`)
+     """
+
+     @dependencies_required("ffmpeg", "scenedetect")
+     def __init__(
+         self,
+         download_directory: Optional[str] = None,
+         model: Optional[BaseModelBackend] = None,
+         use_audio_transcription: bool = False,
+     ) -> None:
+         self._cleanup = download_directory is None
+         self._temp_files: list[str] = []  # Track temporary files for cleanup
+         self._use_audio_transcription = use_audio_transcription
+
+         self._download_directory = Path(
+             download_directory or tempfile.mkdtemp()
+         ).resolve()
+
+         self.video_downloader_toolkit = VideoDownloaderToolkit(
+             download_directory=str(self._download_directory)
+         )
+
+         try:
+             self._download_directory.mkdir(parents=True, exist_ok=True)
+         except FileExistsError:
+             raise ValueError(
+                 f"{self._download_directory} is not a valid directory."
+             )
+         except OSError as e:
+             raise ValueError(
+                 f"Error creating directory {self._download_directory}: {e}"
+             )
+
+         logger.info(f"Video will be downloaded to {self._download_directory}")
+
+         self.vl_model = model
+         # Ensure ChatAgent is initialized with a model if provided
+         if self.vl_model:
+             # Import ChatAgent at runtime to avoid circular imports
+             from camel.agents import ChatAgent
+
+             self.vl_agent = ChatAgent(model=self.vl_model)
+         else:
+             # If no model is provided, use default model in ChatAgent
+             # Import ChatAgent at runtime to avoid circular imports
+             from camel.agents import ChatAgent
+
+             self.vl_agent = ChatAgent()
+             logger.warning(
+                 "No vision-language model provided. Using default model in"
+                 " ChatAgent."
+             )
+
+         # Initialize audio models only if audio transcription is enabled
+         self.audio_models = None
+         if self._use_audio_transcription:
+             try:
+                 self.audio_models = OpenAIAudioModels()
+             except Exception as e:
+                 logger.warning(
+                     f"Failed to initialize OpenAIAudioModels: {e}. "
+                     "Audio transcription will be disabled."
+                 )
+                 self._use_audio_transcription = False
+
+     def __del__(self):
+         r"""Clean up temporary directories and files when the object is
+         destroyed.
+         """
+         # Clean up temporary files
+         for temp_file in self._temp_files:
+             if os.path.exists(temp_file):
+                 try:
+                     os.remove(temp_file)
+                     logger.debug(f"Removed temporary file: {temp_file}")
+                 except OSError as e:
+                     logger.warning(
+                         f"Failed to remove temporary file {temp_file}: {e}"
+                     )
+
+         # Clean up temporary directory if needed
+         if self._cleanup and os.path.exists(self._download_directory):
+             try:
+                 import shutil
+
+                 shutil.rmtree(self._download_directory)
+                 logger.debug(
+                     f"Removed temporary directory: {self._download_directory}"
+                 )
+             except OSError as e:
+                 logger.warning(
+                     f"Failed to remove temporary directory"
+                     f" {self._download_directory}: {e}"
+                 )
+
+     def _extract_audio_from_video(
+         self, video_path: str, output_format: str = "mp3"
+     ) -> str:
+         r"""Extract audio from the video.
+
+         Args:
+             video_path (str): The path to the video file.
+             output_format (str): The format of the audio file to be saved.
+                 (default: :obj:`"mp3"`)
+
+         Returns:
+             str: The path to the audio file.
+         """
+         import ffmpeg
+
+         # Handle case where video file doesn't have an extension
+         base_path = os.path.splitext(video_path)[0]
+         output_path = f"{base_path}.{output_format}"
+
+         try:
+             (
+                 ffmpeg.input(video_path)
+                 .output(output_path, vn=None, acodec="libmp3lame")
+                 .run(quiet=True)
+             )
+             # Track the audio file for cleanup
+             self._temp_files.append(output_path)
+             return output_path
+         except ffmpeg.Error as e:
+             error_message = f"FFmpeg-Python failed: {e}"
+             logger.error(error_message)
+             raise RuntimeError(error_message)
+
+     def _transcribe_audio(self, audio_path: str) -> str:
+         r"""Transcribe the audio of the video."""
+         # Check if audio transcription is enabled and audio models are
+         # available
+         if not self._use_audio_transcription or self.audio_models is None:
+             logger.warning("Audio transcription is disabled or not available")
+             return "No audio transcription available."
+
+         try:
+             audio_transcript = self.audio_models.speech_to_text(audio_path)
+             if not audio_transcript:
+                 logger.warning("Audio transcription returned empty result")
+                 return "No audio transcription available."
+             return audio_transcript
+         except Exception as e:
+             logger.error(f"Audio transcription failed: {e}")
+             return "Audio transcription failed."
+
+     def _extract_keyframes(
+         self, video_path: str, num_frames: int, threshold: float = 25.0
+     ) -> List[Image.Image]:
+         r"""Extract keyframes from a video based on scene changes
+         and return them as PIL.Image.Image objects.
+
+         Args:
+             video_path (str): Path to the video file.
+             num_frames (int): Number of keyframes to extract.
+             threshold (float): The threshold value for scene change detection.
+
+         Returns:
+             list: A list of PIL.Image.Image objects representing
+                 the extracted keyframes.
+         """
+         from scenedetect import (  # type: ignore[import-untyped]
+             SceneManager,
+             VideoManager,
+         )
+         from scenedetect.detectors import (  # type: ignore[import-untyped]
+             ContentDetector,
+         )
+
+         if num_frames <= 0:
+             logger.warning(
+                 f"Invalid num_frames: {num_frames}, using default of 1"
+             )
+             num_frames = 1
+
+         video_manager = VideoManager([video_path])
+         scene_manager = SceneManager()
+         scene_manager.add_detector(ContentDetector(threshold=threshold))
+
+         video_manager.set_duration()
+         video_manager.start()
+         scene_manager.detect_scenes(video_manager)
+
+         scenes = scene_manager.get_scene_list()
+         keyframes: List[Image.Image] = []
+
+         # Handle case where no scenes are detected
+         if not scenes:
+             logger.warning(
+                 "No scenes detected in video, capturing frames at "
+                 "regular intervals"
+             )
+             import cv2
+
+             cap = cv2.VideoCapture(video_path)
+             total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+             fps = cap.get(cv2.CAP_PROP_FPS)
+             duration = total_frames / fps if fps > 0 else 0
+
+             if duration > 0 and total_frames > 0:
+                 # Extract frames at regular intervals
+                 interval = duration / min(num_frames, total_frames)
+                 for i in range(min(num_frames, total_frames)):
+                     time_sec = i * interval
+                     frame = _capture_screenshot(video_path, time_sec)
+                     keyframes.append(frame)
+
+             cap.release()
+         else:
+             # Extract frames from detected scenes
+             for start_time, _ in scenes:
+                 if len(keyframes) >= num_frames:
+                     break
+                 frame = _capture_screenshot(video_path, start_time)
+                 keyframes.append(frame)
+
+         if not keyframes:
+             logger.error("Failed to extract any keyframes from video")
+             raise ValueError("Failed to extract keyframes from video")
+
+         logger.info(f"Extracted {len(keyframes)} keyframes")
+         return keyframes
+
+     def ask_question_about_video(
+         self,
+         video_path: str,
+         question: str,
+         num_frames: int = 28,
+     ) -> str:
+         r"""Ask a question about the video.
+
+         Args:
+             video_path (str): The path to the video file.
+                 It can be a local file or a URL (such as Youtube website).
+             question (str): The question to ask about the video.
+             num_frames (int): The number of frames to extract from the video.
+                 To be adjusted based on the length of the video.
+                 (default: :obj:`28`)
+
+         Returns:
+             str: The answer to the question.
+         """
+         from urllib.parse import urlparse
+
+         if not question:
+             raise ValueError("Question cannot be empty")
+
+         if num_frames <= 0:
+             logger.warning(
+                 f"Invalid num_frames: {num_frames}, using default of 28"
+             )
+             num_frames = 28
+
+         parsed_url = urlparse(video_path)
+         is_url = all([parsed_url.scheme, parsed_url.netloc])
+
+         downloaded_video_path = None
+         try:
+             if is_url:
+                 downloaded_video_path = (
+                     self.video_downloader_toolkit.download_video(video_path)
+                 )
+                 if not downloaded_video_path or not os.path.exists(
+                     downloaded_video_path
+                 ):
+                     raise ValueError(
+                         f"Failed to download video from {video_path}"
+                     )
+                 video_path = downloaded_video_path
+
+             if not os.path.exists(video_path):
+                 raise FileNotFoundError(f"Video file not found: {video_path}")
+
+             audio_transcript = "No audio transcription available."
+             if self._use_audio_transcription:
+                 audio_path = self._extract_audio_from_video(video_path)
+                 audio_transcript = self._transcribe_audio(audio_path)
+
+             video_frames = self._extract_keyframes(video_path, num_frames)
+             prompt = VIDEO_QA_PROMPT.format(
+                 audio_transcription=audio_transcript,
+                 question=question,
+             )
+
+             msg = BaseMessage.make_user_message(
+                 role_name="User",
+                 content=prompt,
+                 image_list=video_frames,
+             )
+
+             response = self.vl_agent.step(msg)
+             if not response or not response.msgs:
+                 logger.error("Model returned empty response")
+                 return (
+                     "Failed to generate an answer. "
+                     "The model returned an empty response."
+                 )
+
+             answer = response.msgs[0].content
+             return answer
+
+         except Exception as e:
+             error_message = f"Error processing video: {e!s}"
+             logger.error(error_message)
+             return f"Error: {error_message}"
+
+     def get_tools(self) -> List[FunctionTool]:
+         r"""Returns a list of FunctionTool objects representing the
+         functions in the toolkit.
+
+         Returns:
+             List[FunctionTool]: A list of FunctionTool objects representing
+                 the functions in the toolkit.
+         """
+         return [FunctionTool(self.ask_question_about_video)]
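
For orientation, below is a minimal usage sketch of the new VideoAnalysisToolkit added in this file. It is not part of the diff: the module path, directory, URL, and question are illustrative assumptions (the relative import above only implies the file lives next to video_download_toolkit.py), and running it requires ffmpeg, scenedetect, and a configured model backend or default credentials.

# Usage sketch only; not part of the released diff.
# The import path, example URL, and question are assumptions.
from camel.toolkits.video_analysis_toolkit import VideoAnalysisToolkit

toolkit = VideoAnalysisToolkit(
    download_directory="./videos",      # omit to use a temporary directory
    use_audio_transcription=False,      # True requires a valid OpenAI API key
)

answer = toolkit.ask_question_about_video(
    video_path="https://www.youtube.com/watch?v=example",  # local path or URL
    question="How many distinct species appear in the video?",
    num_frames=28,
)
print(answer)
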
@@ -13,32 +13,19 @@
  # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========

  import io
- import logging
- import re
  import tempfile
  from pathlib import Path
  from typing import List, Optional
+ from urllib.parse import urlparse

  from PIL import Image

+ from camel.logger import get_logger
  from camel.toolkits.base import BaseToolkit
  from camel.toolkits.function_tool import FunctionTool
  from camel.utils import dependencies_required

- logger = logging.getLogger(__name__)
-
-
- def _standardize_url(url: str) -> str:
-     r"""Standardize the given URL."""
-     # Special case for YouTube embed URLs
-     if "youtube.com/embed/" in url:
-         match = re.search(r"embed/([a-zA-Z0-9_-]+)", url)
-         if match:
-             return f"https://www.youtube.com/watch?v={match.group(1)}"
-         else:
-             raise ValueError(f"Invalid YouTube URL: {url}")
-
-     return url
+ logger = get_logger(__name__)


  def _capture_screenshot(video_file: str, timestamp: float) -> Image.Image:
@@ -119,7 +106,7 @@ class VideoDownloaderToolkit(BaseToolkit):
          if self._cleanup:
              shutil.rmtree(self._download_directory, ignore_errors=True)

-     def _download_video(self, url: str) -> str:
+     def download_video(self, url: str) -> str:
          r"""Download the video and optionally split it into chunks.

          yt-dlp will detect if the video is downloaded automatically so there
@@ -149,18 +136,21 @@ class VideoDownloaderToolkit(BaseToolkit):

      def get_video_bytes(
          self,
-         video_url: str,
+         video_path: str,
      ) -> bytes:
-         r"""Download video by the URL, and return the content in bytes.
+         r"""Download video by the path, and return the content in bytes.

          Args:
-             video_url (str): The URL of the video to download.
+             video_path (str): The path to the video file.

          Returns:
              bytes: The video file content in bytes.
          """
-         url = _standardize_url(video_url)
-         video_file = self._download_video(url)
+         parsed_url = urlparse(video_path)
+         is_url = all([parsed_url.scheme, parsed_url.netloc])
+         if is_url:
+             video_path = self.download_video(video_path)
+         video_file = video_path

          with open(video_file, 'rb') as f:
              video_bytes = f.read()
@@ -168,7 +158,7 @@ class VideoDownloaderToolkit(BaseToolkit):
          return video_bytes

      def get_video_screenshots(
-         self, video_url: str, amount: int
+         self, video_path: str, amount: int
      ) -> List[Image.Image]:
          r"""Capture screenshots from the video at specified timestamps or by
          dividing the video into equal parts if an integer is provided.
@@ -182,8 +172,11 @@ class VideoDownloaderToolkit(BaseToolkit):
          """
          import ffmpeg

-         url = _standardize_url(video_url)
-         video_file = self._download_video(url)
+         parsed_url = urlparse(video_path)
+         is_url = all([parsed_url.scheme, parsed_url.netloc])
+         if is_url:
+             video_path = self.download_video(video_path)
+         video_file = video_path

          # Get the video length
          try:
@@ -208,6 +201,7 @@ class VideoDownloaderToolkit(BaseToolkit):
              the functions in the toolkit.
          """
          return [
+             FunctionTool(self.download_video),
              FunctionTool(self.get_video_bytes),
              FunctionTool(self.get_video_screenshots),
          ]
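
The hunks above also make the downloader's download_video method public and switch get_video_bytes / get_video_screenshots from URL-only arguments to a path-or-URL argument. A hedged sketch of how the updated API might be called (the directory and URL below are placeholders, not taken from the diff):

# Sketch of the updated VideoDownloaderToolkit API after this release;
# the download directory and URL are illustrative assumptions.
from camel.toolkits.video_download_toolkit import VideoDownloaderToolkit

downloader = VideoDownloaderToolkit(download_directory="./downloads")

local_path = downloader.download_video("https://www.youtube.com/watch?v=example")
video_bytes = downloader.get_video_bytes(local_path)           # path or URL accepted
frames = downloader.get_video_screenshots(local_path, amount=5)
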