massgen 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of massgen might be problematic. Click here for more details.

Files changed (82):
  1. massgen/__init__.py +1 -1
  2. massgen/agent_config.py +33 -7
  3. massgen/api_params_handler/_api_params_handler_base.py +3 -0
  4. massgen/api_params_handler/_chat_completions_api_params_handler.py +4 -0
  5. massgen/api_params_handler/_claude_api_params_handler.py +4 -0
  6. massgen/api_params_handler/_gemini_api_params_handler.py +4 -0
  7. massgen/api_params_handler/_response_api_params_handler.py +4 -0
  8. massgen/backend/azure_openai.py +9 -1
  9. massgen/backend/base.py +4 -0
  10. massgen/backend/base_with_custom_tool_and_mcp.py +25 -5
  11. massgen/backend/claude_code.py +9 -1
  12. massgen/backend/docs/permissions_and_context_files.md +2 -2
  13. massgen/backend/gemini.py +35 -6
  14. massgen/backend/gemini_utils.py +30 -0
  15. massgen/backend/response.py +2 -0
  16. massgen/chat_agent.py +9 -3
  17. massgen/cli.py +291 -43
  18. massgen/config_builder.py +163 -18
  19. massgen/configs/README.md +69 -14
  20. massgen/configs/debug/restart_test_controlled.yaml +60 -0
  21. massgen/configs/debug/restart_test_controlled_filesystem.yaml +73 -0
  22. massgen/configs/tools/code-execution/docker_with_sudo.yaml +35 -0
  23. massgen/configs/tools/custom_tools/computer_use_browser_example.yaml +56 -0
  24. massgen/configs/tools/custom_tools/computer_use_docker_example.yaml +65 -0
  25. massgen/configs/tools/custom_tools/computer_use_example.yaml +50 -0
  26. massgen/configs/tools/custom_tools/crawl4ai_example.yaml +55 -0
  27. massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_multi.yaml +61 -0
  28. massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_single.yaml +29 -0
  29. massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_multi.yaml +51 -0
  30. massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_single.yaml +33 -0
  31. massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_multi.yaml +55 -0
  32. massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_single.yaml +33 -0
  33. massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_multi.yaml +47 -0
  34. massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_single.yaml +29 -0
  35. massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml +33 -0
  36. massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml +34 -0
  37. massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml +33 -0
  38. massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml +34 -0
  39. massgen/configs/tools/custom_tools/multimodal_tools/youtube_video_analysis.yaml +59 -0
  40. massgen/docker/README.md +83 -0
  41. massgen/filesystem_manager/_code_execution_server.py +22 -7
  42. massgen/filesystem_manager/_docker_manager.py +21 -1
  43. massgen/filesystem_manager/_filesystem_manager.py +9 -0
  44. massgen/filesystem_manager/_path_permission_manager.py +148 -0
  45. massgen/filesystem_manager/_workspace_tools_server.py +0 -997
  46. massgen/formatter/_gemini_formatter.py +73 -0
  47. massgen/frontend/coordination_ui.py +175 -257
  48. massgen/frontend/displays/base_display.py +29 -0
  49. massgen/frontend/displays/rich_terminal_display.py +155 -9
  50. massgen/frontend/displays/simple_display.py +21 -0
  51. massgen/frontend/displays/terminal_display.py +22 -2
  52. massgen/logger_config.py +50 -6
  53. massgen/message_templates.py +283 -15
  54. massgen/orchestrator.py +335 -38
  55. massgen/tests/test_binary_file_blocking.py +274 -0
  56. massgen/tests/test_case_studies.md +12 -12
  57. massgen/tests/test_code_execution.py +178 -0
  58. massgen/tests/test_multimodal_size_limits.py +407 -0
  59. massgen/tests/test_orchestration_restart.py +204 -0
  60. massgen/tool/__init__.py +4 -0
  61. massgen/tool/_manager.py +7 -2
  62. massgen/tool/_multimodal_tools/image_to_image_generation.py +293 -0
  63. massgen/tool/_multimodal_tools/text_to_file_generation.py +455 -0
  64. massgen/tool/_multimodal_tools/text_to_image_generation.py +222 -0
  65. massgen/tool/_multimodal_tools/text_to_speech_continue_generation.py +226 -0
  66. massgen/tool/_multimodal_tools/text_to_speech_transcription_generation.py +217 -0
  67. massgen/tool/_multimodal_tools/text_to_video_generation.py +223 -0
  68. massgen/tool/_multimodal_tools/understand_audio.py +211 -0
  69. massgen/tool/_multimodal_tools/understand_file.py +555 -0
  70. massgen/tool/_multimodal_tools/understand_image.py +316 -0
  71. massgen/tool/_multimodal_tools/understand_video.py +340 -0
  72. massgen/tool/_web_tools/crawl4ai_tool.py +718 -0
  73. massgen/tool/docs/multimodal_tools.md +1368 -0
  74. massgen/tool/workflow_toolkits/__init__.py +26 -0
  75. massgen/tool/workflow_toolkits/post_evaluation.py +216 -0
  76. massgen/utils.py +1 -0
  77. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/METADATA +101 -69
  78. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/RECORD +82 -46
  79. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/WHEEL +0 -0
  80. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/entry_points.txt +0 -0
  81. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/licenses/LICENSE +0 -0
  82. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,316 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Understand and analyze images using OpenAI's gpt-4.1 API.
4
+ """
5
+
6
+ import base64
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+ from typing import List, Optional
11
+
12
+ from dotenv import load_dotenv
13
+ from openai import OpenAI
14
+
15
+ from massgen.logger_config import logger
16
+ from massgen.tool._result import ExecutionResult, TextContent
17
+
18
+
19
+ def _validate_path_access(path: Path, allowed_paths: Optional[List[Path]] = None) -> None:
20
+ """
21
+ Validate that a path is within allowed directories.
22
+
23
+ Args:
24
+ path: Path to validate
25
+ allowed_paths: List of allowed base paths (optional)
26
+
27
+ Raises:
28
+ ValueError: If path is not within allowed directories
29
+ """
30
+ if not allowed_paths:
31
+ return # No restrictions
32
+
33
+ for allowed_path in allowed_paths:
34
+ try:
35
+ path.relative_to(allowed_path)
36
+ return # Path is within this allowed directory
37
+ except ValueError:
38
+ continue
39
+
40
+ raise ValueError(f"Path not in allowed directories: {path}")
41
+
42
+
43
async def understand_image(
    image_path: str,
    prompt: str = "What's in this image? Please describe it in detail.",
    model: str = "gpt-4.1",
    allowed_paths: Optional[List[str]] = None,
    agent_cwd: Optional[str] = None,
) -> ExecutionResult:
    """
    Understand and analyze an image using OpenAI's gpt-4.1 API.

    The image is validated, downscaled/recompressed if it exceeds OpenAI
    Vision limits, then sent inline (base64 data URL) with the prompt.

    Args:
        image_path: Path to the image file (PNG/JPEG/JPG)
            - Relative path: Resolved relative to workspace
            - Absolute path: Must be within allowed directories
        prompt: Question or instruction about the image
        model: Model to use (default: "gpt-4.1")
        allowed_paths: List of allowed base paths for validation (optional)
        agent_cwd: Agent's current working directory (automatically injected)

    Returns:
        ExecutionResult whose JSON payload contains: success, operation
        ("understand_image"), image_path, prompt, model, and the model's
        response (or an error message on failure).

    Security:
        - Requires valid OpenAI API key (OPENAI_API_KEY)
        - Image file must exist, be readable, and be within allowed_paths
        - Only supports PNG, JPEG, and JPG formats
    """

    def _fail(error: str) -> ExecutionResult:
        # Every failure path returns the same JSON payload shape; build it in one place.
        payload = {"success": False, "operation": "understand_image", "error": error}
        return ExecutionResult(output_blocks=[TextContent(data=json.dumps(payload, indent=2))])

    try:
        # Convert allowed_paths from strings to Path objects.
        allowed_paths_list = [Path(p) for p in allowed_paths] if allowed_paths else None

        # Load environment variables: prefer the repo-root .env, fall back to default lookup.
        script_dir = Path(__file__).parent.parent.parent.parent
        env_path = script_dir / ".env"
        if env_path.exists():
            load_dotenv(env_path)
        else:
            load_dotenv()

        openai_api_key = os.getenv("OPENAI_API_KEY")
        if not openai_api_key:
            return _fail("OpenAI API key not found. Please set OPENAI_API_KEY in .env file or environment variable.")

        # Initialize OpenAI client.
        client = OpenAI(api_key=openai_api_key)

        # Resolve image path relative to the agent workspace when not absolute.
        base_dir = Path(agent_cwd) if agent_cwd else Path.cwd()
        if Path(image_path).is_absolute():
            img_path = Path(image_path).resolve()
        else:
            img_path = (base_dir / image_path).resolve()

        # Raises ValueError if outside allowed directories; caught by the outer handler.
        _validate_path_access(img_path, allowed_paths_list)

        if not img_path.exists():
            return _fail(f"Image file does not exist: {img_path}")

        if img_path.suffix.lower() not in [".png", ".jpg", ".jpeg"]:
            return _fail(f"Image must be PNG, JPEG, or JPG format: {img_path}")

        # Read the image, downscaling/recompressing if it exceeds API limits.
        try:
            # OpenAI Vision API limits:
            # - Up to 20MB per image
            # - High-resolution: 768px (short side) x 2000px (long side)
            file_size = img_path.stat().st_size
            max_size = 18 * 1024 * 1024  # 18MB (conservative buffer under OpenAI's 20MB limit)
            max_short_side = 768  # Maximum pixels for short side
            max_long_side = 2000  # Maximum pixels for long side

            try:
                import io

                from PIL import Image
            except ImportError:
                # PIL unavailable: we cannot resize, so only images already
                # under the size cap can be sent as-is.
                if file_size > max_size:
                    return _fail(
                        f"Image too large ({file_size/1024/1024:.1f}MB > {max_size/1024/1024:.0f}MB) and PIL not available for resizing. Install with: pip install pillow",
                    )
                image_data = img_path.read_bytes()
                base64_image = base64.b64encode(image_data).decode("utf-8")
                mime_type = "image/jpeg" if img_path.suffix.lower() in [".jpg", ".jpeg"] else "image/png"
                logger.info(f"Read image without dimension check (PIL not available): {img_path.name} ({file_size/1024/1024:.1f}MB)")
            else:
                # PIL available: check both file size and pixel dimensions.
                # Context manager ensures the image file handle is closed
                # promptly instead of lingering until garbage collection.
                with Image.open(img_path) as img:
                    original_width, original_height = img.size

                    short_side = min(original_width, original_height)
                    long_side = max(original_width, original_height)

                    # Decide whether a resize is required, and why (for logging).
                    needs_resize = False
                    resize_reason = []
                    if file_size > max_size:
                        needs_resize = True
                        resize_reason.append(f"file size {file_size/1024/1024:.1f}MB > {max_size/1024/1024:.0f}MB")
                    if short_side > max_short_side or long_side > max_long_side:
                        needs_resize = True
                        resize_reason.append(f"dimensions {original_width}x{original_height} exceed {max_short_side}x{max_long_side}")

                    if needs_resize:
                        # Pick the most restrictive of the size- and dimension-driven scale factors.
                        scale_factors = []
                        if file_size > max_size:
                            # Pixel count scales roughly with file size, so shrink by sqrt of the ratio.
                            size_scale = (max_size / file_size) ** 0.5 * 0.8  # 0.8 for safety margin
                            scale_factors.append(size_scale)
                        if short_side > max_short_side or long_side > max_long_side:
                            short_scale = max_short_side / short_side if short_side > max_short_side else 1.0
                            long_scale = max_long_side / long_side if long_side > max_long_side else 1.0
                            scale_factors.append(min(short_scale, long_scale) * 0.95)  # 0.95 for safety margin

                        scale_factor = min(scale_factors)
                        new_width = int(original_width * scale_factor)
                        new_height = int(original_height * scale_factor)

                        img_resized = img.resize((new_width, new_height), Image.Resampling.LANCZOS)

                        # Re-encode as JPEG for better compression.
                        img_byte_arr = io.BytesIO()
                        img_resized.convert("RGB").save(img_byte_arr, format="JPEG", quality=85, optimize=True)
                        image_data = img_byte_arr.getvalue()

                        base64_image = base64.b64encode(image_data).decode("utf-8")
                        mime_type = "image/jpeg"

                        logger.info(
                            f"Resized image ({', '.join(resize_reason)}): "
                            f"{original_width}x{original_height} ({file_size/1024/1024:.1f}MB) -> "
                            f"{new_width}x{new_height} ({len(image_data)/1024/1024:.1f}MB)",
                        )
                    else:
                        # No resize needed - read the original bytes unchanged.
                        image_data = img_path.read_bytes()
                        base64_image = base64.b64encode(image_data).decode("utf-8")
                        mime_type = "image/jpeg" if img_path.suffix.lower() in [".jpg", ".jpeg"] else "image/png"
                        logger.info(f"Image within limits: {original_width}x{original_height} ({file_size/1024/1024:.1f}MB)")

        except Exception as read_error:
            return _fail(f"Failed to read image file: {str(read_error)}")

        try:
            # Call OpenAI Responses API with the prompt and the inline image.
            response = client.responses.create(
                model=model,
                input=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "input_text", "text": prompt},
                            {
                                "type": "input_image",
                                "image_url": f"data:{mime_type};base64,{base64_image}",
                            },
                        ],
                    },
                ],
            )

            # Responses API exposes output_text; fall back to raw output otherwise.
            response_text = response.output_text if hasattr(response, "output_text") else str(response.output)

            result = {
                "success": True,
                "operation": "understand_image",
                "image_path": str(img_path),
                "prompt": prompt,
                "model": model,
                "response": response_text,
            }
            return ExecutionResult(
                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
            )

        except Exception as api_error:
            return _fail(f"OpenAI API error: {str(api_error)}")

    except Exception as e:
        return _fail(f"Failed to understand image: {str(e)}")
@@ -0,0 +1,340 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Understand and analyze videos by extracting key frames and using OpenAI's gpt-4.1 API.
4
+ """
5
+
6
+ import base64
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+ from typing import List, Optional
11
+
12
+ from dotenv import load_dotenv
13
+ from openai import OpenAI
14
+
15
+ from massgen.tool._result import ExecutionResult, TextContent
16
+
17
+
18
+ def _validate_path_access(path: Path, allowed_paths: Optional[List[Path]] = None) -> None:
19
+ """
20
+ Validate that a path is within allowed directories.
21
+
22
+ Args:
23
+ path: Path to validate
24
+ allowed_paths: List of allowed base paths (optional)
25
+
26
+ Raises:
27
+ ValueError: If path is not within allowed directories
28
+ """
29
+ if not allowed_paths:
30
+ return # No restrictions
31
+
32
+ for allowed_path in allowed_paths:
33
+ try:
34
+ path.relative_to(allowed_path)
35
+ return # Path is within this allowed directory
36
+ except ValueError:
37
+ continue
38
+
39
+ raise ValueError(f"Path not in allowed directories: {path}")
40
+
41
+
42
+ def _extract_key_frames(video_path: Path, num_frames: int = 8) -> List[str]:
43
+ """
44
+ Extract key frames from a video file and resize them to fit OpenAI Vision API limits.
45
+
46
+ Args:
47
+ video_path: Path to the video file
48
+ num_frames: Number of key frames to extract
49
+
50
+ Returns:
51
+ List of base64-encoded frame images (resized to fit 768px x 2000px limits)
52
+
53
+ Raises:
54
+ ImportError: If opencv-python is not installed
55
+ Exception: If frame extraction fails
56
+ """
57
+ try:
58
+ import cv2
59
+ except ImportError:
60
+ raise ImportError(
61
+ "opencv-python is required for video frame extraction. " "Please install it with: pip install opencv-python",
62
+ )
63
+
64
+ # OpenAI Vision API limits for images (same as understand_image)
65
+ max_short_side = 768 # Maximum pixels for short side
66
+ max_long_side = 2000 # Maximum pixels for long side
67
+
68
+ # Open the video file
69
+ video = cv2.VideoCapture(str(video_path))
70
+
71
+ if not video.isOpened():
72
+ raise Exception(f"Failed to open video file: {video_path}")
73
+
74
+ try:
75
+ # Get total number of frames
76
+ total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
77
+
78
+ if total_frames == 0:
79
+ raise Exception(f"Video file has no frames: {video_path}")
80
+
81
+ # Calculate frame indices to extract (evenly spaced)
82
+ frame_indices = []
83
+ if num_frames >= total_frames:
84
+ # If requesting more frames than available, use all frames
85
+ frame_indices = list(range(total_frames))
86
+ else:
87
+ # Extract evenly spaced frames
88
+ step = total_frames / num_frames
89
+ frame_indices = [int(i * step) for i in range(num_frames)]
90
+
91
+ # Extract frames
92
+ frames_base64 = []
93
+ for frame_idx in frame_indices:
94
+ # Set video position to the frame
95
+ video.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
96
+
97
+ # Read the frame
98
+ ret, frame = video.read()
99
+
100
+ if not ret:
101
+ continue
102
+
103
+ # Check and resize frame if needed to fit OpenAI Vision API limits
104
+ height, width = frame.shape[:2]
105
+ short_side = min(width, height)
106
+ long_side = max(width, height)
107
+
108
+ if short_side > max_short_side or long_side > max_long_side:
109
+ # Calculate scale factor to fit within dimension constraints
110
+ short_scale = max_short_side / short_side if short_side > max_short_side else 1.0
111
+ long_scale = max_long_side / long_side if long_side > max_long_side else 1.0
112
+ scale_factor = min(short_scale, long_scale) * 0.95 # 0.95 for safety margin
113
+
114
+ new_width = int(width * scale_factor)
115
+ new_height = int(height * scale_factor)
116
+
117
+ # Resize frame using LANCZOS (high quality)
118
+ frame = cv2.resize(frame, (new_width, new_height), interpolation=cv2.INTER_LANCZOS4)
119
+
120
+ # Encode frame to JPEG with quality=85 (same as understand_image)
121
+ encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 85]
122
+ ret, buffer = cv2.imencode(".jpg", frame, encode_param)
123
+
124
+ if not ret:
125
+ continue
126
+
127
+ # Convert to base64
128
+ frame_base64 = base64.b64encode(buffer).decode("utf-8")
129
+ frames_base64.append(frame_base64)
130
+
131
+ if not frames_base64:
132
+ raise Exception("Failed to extract any frames from video")
133
+
134
+ return frames_base64
135
+
136
+ finally:
137
+ # Release the video capture object
138
+ video.release()
139
+
140
+
141
async def understand_video(
    video_path: str,
    prompt: str = "What's happening in this video? Please describe the content, actions, and any important details you observe across these frames.",
    num_frames: int = 8,
    model: str = "gpt-4.1",
    allowed_paths: Optional[List[str]] = None,
    agent_cwd: Optional[str] = None,
) -> ExecutionResult:
    """
    Understand and analyze a video by extracting key frames and using OpenAI's gpt-4.1 API.

    Key frames are sampled evenly across the video and sent as inline images
    together with the prompt. Audio content is not analyzed; for audio, use
    the generate_text_with_input_audio tool.

    Args:
        video_path: Path to the video file (MP4, AVI, MOV, etc.)
            - Relative path: Resolved relative to workspace
            - Absolute path: Must be within allowed directories
        prompt: Question or instruction about the video (default: asks for general description)
        num_frames: Number of key frames to extract (default: 8, must be >= 1;
            recommended range 4-16 — more frames give more detail but cost more)
        model: Model to use (default: "gpt-4.1")
        allowed_paths: List of allowed base paths for validation (optional)
        agent_cwd: Agent's current working directory (automatically injected, optional)

    Returns:
        ExecutionResult whose JSON payload contains: success, operation
        ("understand_video"), video_path, num_frames_extracted, prompt, model,
        and the model's response (or an error message on failure).

    Security:
        - Requires valid OpenAI API key (OPENAI_API_KEY)
        - Requires opencv-python package for video processing
        - Video file must exist, be readable, and be within allowed_paths
    """

    def _fail(error: str) -> ExecutionResult:
        # Every failure path returns the same JSON payload shape; build it in one place.
        payload = {"success": False, "operation": "understand_video", "error": error}
        return ExecutionResult(output_blocks=[TextContent(data=json.dumps(payload, indent=2))])

    try:
        # Convert allowed_paths from strings to Path objects.
        allowed_paths_list = [Path(p) for p in allowed_paths] if allowed_paths else None

        # Load environment variables: prefer the repo-root .env, fall back to default lookup.
        script_dir = Path(__file__).parent.parent.parent.parent
        env_path = script_dir / ".env"
        if env_path.exists():
            load_dotenv(env_path)
        else:
            load_dotenv()

        openai_api_key = os.getenv("OPENAI_API_KEY")
        if not openai_api_key:
            return _fail("OpenAI API key not found. Please set OPENAI_API_KEY in .env file or environment variable.")

        # Initialize OpenAI client.
        client = OpenAI(api_key=openai_api_key)

        # Reject invalid frame counts up front with a structured error rather
        # than letting frame sampling fail with an opaque exception.
        if num_frames < 1:
            return _fail(f"num_frames must be at least 1, got {num_frames}")

        # Resolve video path relative to the agent workspace when not absolute.
        base_dir = Path(agent_cwd) if agent_cwd else Path.cwd()
        if Path(video_path).is_absolute():
            vid_path = Path(video_path).resolve()
        else:
            vid_path = (base_dir / video_path).resolve()

        # Raises ValueError if outside allowed directories; caught by the outer handler.
        _validate_path_access(vid_path, allowed_paths_list)

        if not vid_path.exists():
            return _fail(f"Video file does not exist: {vid_path}")

        # Cheap sanity check by extension before handing the file to OpenCV.
        video_extensions = [".mp4", ".avi", ".mov", ".mkv", ".flv", ".wmv", ".webm", ".m4v", ".mpg", ".mpeg"]
        if vid_path.suffix.lower() not in video_extensions:
            return _fail(f"File does not appear to be a video file: {vid_path}. Supported formats: {', '.join(video_extensions)}")

        # Extract key frames from the video (requires opencv-python).
        try:
            frames_base64 = _extract_key_frames(vid_path, num_frames)
        except ImportError as import_error:
            return _fail(str(import_error))
        except Exception as extract_error:
            return _fail(f"Failed to extract frames from video: {str(extract_error)}")

        # Build the multimodal message: the prompt followed by every frame as an inline image.
        content = [{"type": "input_text", "text": prompt}]
        for frame_base64 in frames_base64:
            content.append(
                {
                    "type": "input_image",
                    "image_url": f"data:image/jpeg;base64,{frame_base64}",
                },
            )

        try:
            # Call OpenAI Responses API for video understanding.
            response = client.responses.create(
                model=model,
                input=[
                    {
                        "role": "user",
                        "content": content,
                    },
                ],
            )

            # Responses API exposes output_text; fall back to raw output otherwise.
            response_text = response.output_text if hasattr(response, "output_text") else str(response.output)

            result = {
                "success": True,
                "operation": "understand_video",
                "video_path": str(vid_path),
                "num_frames_extracted": len(frames_base64),
                "prompt": prompt,
                "model": model,
                "response": response_text,
            }
            return ExecutionResult(
                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
            )

        except Exception as api_error:
            return _fail(f"OpenAI API error: {str(api_error)}")

    except Exception as e:
        return _fail(f"Failed to understand video: {str(e)}")