massgen-0.1.2-py3-none-any.whl → massgen-0.1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- massgen/__init__.py +1 -1
- massgen/agent_config.py +33 -7
- massgen/api_params_handler/_api_params_handler_base.py +3 -0
- massgen/backend/azure_openai.py +9 -1
- massgen/backend/base.py +4 -0
- massgen/backend/claude_code.py +9 -1
- massgen/backend/gemini.py +35 -6
- massgen/backend/gemini_utils.py +30 -0
- massgen/chat_agent.py +9 -3
- massgen/cli.py +291 -43
- massgen/config_builder.py +163 -18
- massgen/configs/README.md +52 -6
- massgen/configs/debug/restart_test_controlled.yaml +60 -0
- massgen/configs/debug/restart_test_controlled_filesystem.yaml +73 -0
- massgen/configs/tools/code-execution/docker_with_sudo.yaml +35 -0
- massgen/configs/tools/custom_tools/computer_use_browser_example.yaml +56 -0
- massgen/configs/tools/custom_tools/computer_use_docker_example.yaml +65 -0
- massgen/configs/tools/custom_tools/computer_use_example.yaml +50 -0
- massgen/configs/tools/custom_tools/crawl4ai_mcp_example.yaml +67 -0
- massgen/configs/tools/custom_tools/crawl4ai_multi_agent_example.yaml +68 -0
- massgen/configs/tools/custom_tools/multimodal_tools/playwright_with_img_understanding.yaml +98 -0
- massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml +33 -0
- massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml +34 -0
- massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml +33 -0
- massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml +34 -0
- massgen/configs/tools/custom_tools/multimodal_tools/understand_video_example.yaml +54 -0
- massgen/configs/tools/custom_tools/multimodal_tools/youtube_video_analysis.yaml +59 -0
- massgen/configs/tools/memory/README.md +199 -0
- massgen/configs/tools/memory/gpt5mini_gemini_context_window_management.yaml +131 -0
- massgen/configs/tools/memory/gpt5mini_gemini_no_persistent_memory.yaml +133 -0
- massgen/configs/tools/memory/test_context_window_management.py +286 -0
- massgen/configs/tools/multimodal/gpt5mini_gpt5nano_documentation_evolution.yaml +97 -0
- massgen/docker/README.md +83 -0
- massgen/filesystem_manager/_code_execution_server.py +22 -7
- massgen/filesystem_manager/_docker_manager.py +21 -1
- massgen/filesystem_manager/_filesystem_manager.py +8 -0
- massgen/filesystem_manager/_workspace_tools_server.py +0 -997
- massgen/formatter/_gemini_formatter.py +73 -0
- massgen/frontend/coordination_ui.py +175 -257
- massgen/frontend/displays/base_display.py +29 -0
- massgen/frontend/displays/rich_terminal_display.py +155 -9
- massgen/frontend/displays/simple_display.py +21 -0
- massgen/frontend/displays/terminal_display.py +22 -2
- massgen/logger_config.py +50 -6
- massgen/message_templates.py +123 -3
- massgen/orchestrator.py +319 -38
- massgen/tests/test_code_execution.py +178 -0
- massgen/tests/test_orchestration_restart.py +204 -0
- massgen/tool/__init__.py +4 -0
- massgen/tool/_multimodal_tools/understand_audio.py +193 -0
- massgen/tool/_multimodal_tools/understand_file.py +550 -0
- massgen/tool/_multimodal_tools/understand_image.py +212 -0
- massgen/tool/_multimodal_tools/understand_video.py +313 -0
- massgen/tool/docs/multimodal_tools.md +779 -0
- massgen/tool/workflow_toolkits/__init__.py +26 -0
- massgen/tool/workflow_toolkits/post_evaluation.py +216 -0
- massgen/utils.py +1 -0
- {massgen-0.1.2.dist-info → massgen-0.1.3.dist-info}/METADATA +8 -3
- {massgen-0.1.2.dist-info → massgen-0.1.3.dist-info}/RECORD +63 -36
- {massgen-0.1.2.dist-info → massgen-0.1.3.dist-info}/WHEEL +0 -0
- {massgen-0.1.2.dist-info → massgen-0.1.3.dist-info}/entry_points.txt +0 -0
- {massgen-0.1.2.dist-info → massgen-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {massgen-0.1.2.dist-info → massgen-0.1.3.dist-info}/top_level.txt +0 -0
massgen/tool/_multimodal_tools/understand_image.py (new file)

@@ -0,0 +1,212 @@
# -*- coding: utf-8 -*-
"""
Understand and analyze images using OpenAI's gpt-4.1 API.
"""

import base64
import json
import os
from pathlib import Path
from typing import List, Optional

from dotenv import load_dotenv
from openai import OpenAI

from massgen.tool._result import ExecutionResult, TextContent


def _validate_path_access(path: Path, allowed_paths: Optional[List[Path]] = None) -> None:
    """
    Validate that a path is within allowed directories.

    Args:
        path: Path to validate
        allowed_paths: List of allowed base paths (optional)

    Raises:
        ValueError: If path is not within allowed directories
    """
    if not allowed_paths:
        return  # No restrictions

    for allowed_path in allowed_paths:
        try:
            path.relative_to(allowed_path)
            return  # Path is within this allowed directory
        except ValueError:
            continue

    raise ValueError(f"Path not in allowed directories: {path}")


async def understand_image(
    image_path: str,
    prompt: str = "What's in this image? Please describe it in detail.",
    model: str = "gpt-4.1",
    allowed_paths: Optional[List[str]] = None,
) -> ExecutionResult:
    """
    Understand and analyze an image using OpenAI's gpt-4.1 API.

    This tool processes an image through OpenAI's gpt-4.1 API to extract insights,
    descriptions, or answer questions about the image content.

    Args:
        image_path: Path to the image file (PNG/JPEG/JPG)
            - Relative path: Resolved relative to workspace
            - Absolute path: Must be within allowed directories
        prompt: Question or instruction about the image (default: "What's in this image? Please describe it in detail.")
        model: Model to use (default: "gpt-4.1")
        allowed_paths: List of allowed base paths for validation (optional)

    Returns:
        ExecutionResult containing:
        - success: Whether operation succeeded
        - operation: "understand_image"
        - image_path: Path to the analyzed image
        - prompt: The prompt used
        - model: Model used for analysis
        - response: The model's understanding/description of the image

    Examples:
        understand_image("photo.jpg")
        → Returns detailed description of the image

        understand_image("chart.png", "What data is shown in this chart?")
        → Returns analysis of the chart data

        understand_image("screenshot.png", "What UI elements are visible in this screenshot?")
        → Returns description of UI elements

    Security:
        - Requires valid OpenAI API key
        - Image file must exist and be readable
        - Only supports PNG, JPEG, and JPG formats
    """
    try:
        # Convert allowed_paths from strings to Path objects
        allowed_paths_list = [Path(p) for p in allowed_paths] if allowed_paths else None

        # Load environment variables
        script_dir = Path(__file__).parent.parent.parent.parent
        env_path = script_dir / ".env"
        if env_path.exists():
            load_dotenv(env_path)
        else:
            load_dotenv()

        openai_api_key = os.getenv("OPENAI_API_KEY")

        if not openai_api_key:
            result = {
                "success": False,
                "operation": "understand_image",
                "error": "OpenAI API key not found. Please set OPENAI_API_KEY in .env file or environment variable.",
            }
            return ExecutionResult(
                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
            )

        # Initialize OpenAI client
        client = OpenAI(api_key=openai_api_key)

        # Resolve image path
        if Path(image_path).is_absolute():
            img_path = Path(image_path).resolve()
        else:
            img_path = (Path.cwd() / image_path).resolve()

        # Validate image path
        _validate_path_access(img_path, allowed_paths_list)

        if not img_path.exists():
            result = {
                "success": False,
                "operation": "understand_image",
                "error": f"Image file does not exist: {img_path}",
            }
            return ExecutionResult(
                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
            )

        # Check file format
        if img_path.suffix.lower() not in [".png", ".jpg", ".jpeg"]:
            result = {
                "success": False,
                "operation": "understand_image",
                "error": f"Image must be PNG, JPEG, or JPG format: {img_path}",
            }
            return ExecutionResult(
                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
            )

        # Read and encode image to base64
        try:
            with open(img_path, "rb") as image_file:
                image_data = image_file.read()
                base64_image = base64.b64encode(image_data).decode("utf-8")
        except Exception as read_error:
            result = {
                "success": False,
                "operation": "understand_image",
                "error": f"Failed to read image file: {str(read_error)}",
            }
            return ExecutionResult(
                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
            )

        # Determine MIME type
        mime_type = "image/jpeg" if img_path.suffix.lower() in [".jpg", ".jpeg"] else "image/png"

        try:
            # Call OpenAI API for image understanding
            response = client.responses.create(
                model=model,
                input=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "input_text", "text": prompt},
                            {
                                "type": "input_image",
                                "image_url": f"data:{mime_type};base64,{base64_image}",
                            },
                        ],
                    },
                ],
            )

            # Extract response text
            response_text = response.output_text if hasattr(response, "output_text") else str(response.output)

            result = {
                "success": True,
                "operation": "understand_image",
                "image_path": str(img_path),
                "prompt": prompt,
                "model": model,
                "response": response_text,
            }
            return ExecutionResult(
                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
            )

        except Exception as api_error:
            result = {
                "success": False,
                "operation": "understand_image",
                "error": f"OpenAI API error: {str(api_error)}",
            }
            return ExecutionResult(
                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
            )

    except Exception as e:
        result = {
            "success": False,
            "operation": "understand_image",
            "error": f"Failed to understand image: {str(e)}",
        }
        return ExecutionResult(
            output_blocks=[TextContent(data=json.dumps(result, indent=2))],
        )
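The new tool is an async coroutine that wraps its result as pretty-printed JSON inside a TextContent output block, so a caller needs an event loop plus a json.loads on the returned block. The following driver is a minimal sketch, not part of the package: it assumes the module is importable at the path shown in the file listing, that OPENAI_API_KEY is set, and that ExecutionResult and TextContent expose their constructor fields (output_blocks, data) as attributes.

# Hypothetical usage sketch for the new understand_image tool (not shipped in the wheel).
import asyncio
import json
from pathlib import Path

from massgen.tool._multimodal_tools.understand_image import understand_image


async def main() -> None:
    result = await understand_image(
        "chart.png",  # relative paths resolve against the current working directory
        prompt="What data is shown in this chart?",
        allowed_paths=[str(Path.cwd())],  # absolute base path so the resolved image path validates
    )
    # Assumption: output_blocks[0].data holds the JSON payload built by the tool.
    payload = json.loads(result.output_blocks[0].data)
    if payload["success"]:
        print(payload["response"])
    else:
        print("understand_image failed:", payload["error"])


if __name__ == "__main__":
    asyncio.run(main())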
massgen/tool/_multimodal_tools/understand_video.py (new file)

@@ -0,0 +1,313 @@
# -*- coding: utf-8 -*-
"""
Understand and analyze videos by extracting key frames and using OpenAI's gpt-4.1 API.
"""

import base64
import json
import os
from pathlib import Path
from typing import List, Optional

from dotenv import load_dotenv
from openai import OpenAI

from massgen.tool._result import ExecutionResult, TextContent


def _validate_path_access(path: Path, allowed_paths: Optional[List[Path]] = None) -> None:
    """
    Validate that a path is within allowed directories.

    Args:
        path: Path to validate
        allowed_paths: List of allowed base paths (optional)

    Raises:
        ValueError: If path is not within allowed directories
    """
    if not allowed_paths:
        return  # No restrictions

    for allowed_path in allowed_paths:
        try:
            path.relative_to(allowed_path)
            return  # Path is within this allowed directory
        except ValueError:
            continue

    raise ValueError(f"Path not in allowed directories: {path}")


def _extract_key_frames(video_path: Path, num_frames: int = 8) -> List[str]:
    """
    Extract key frames from a video file.

    Args:
        video_path: Path to the video file
        num_frames: Number of key frames to extract

    Returns:
        List of base64-encoded frame images

    Raises:
        ImportError: If opencv-python is not installed
        Exception: If frame extraction fails
    """
    try:
        import cv2
    except ImportError:
        raise ImportError(
            "opencv-python is required for video frame extraction. " "Please install it with: pip install opencv-python",
        )

    # Open the video file
    video = cv2.VideoCapture(str(video_path))

    if not video.isOpened():
        raise Exception(f"Failed to open video file: {video_path}")

    try:
        # Get total number of frames
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))

        if total_frames == 0:
            raise Exception(f"Video file has no frames: {video_path}")

        # Calculate frame indices to extract (evenly spaced)
        frame_indices = []
        if num_frames >= total_frames:
            # If requesting more frames than available, use all frames
            frame_indices = list(range(total_frames))
        else:
            # Extract evenly spaced frames
            step = total_frames / num_frames
            frame_indices = [int(i * step) for i in range(num_frames)]

        # Extract frames
        frames_base64 = []
        for frame_idx in frame_indices:
            # Set video position to the frame
            video.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)

            # Read the frame
            ret, frame = video.read()

            if not ret:
                continue

            # Encode frame to JPEG
            ret, buffer = cv2.imencode(".jpg", frame)

            if not ret:
                continue

            # Convert to base64
            frame_base64 = base64.b64encode(buffer).decode("utf-8")
            frames_base64.append(frame_base64)

        if not frames_base64:
            raise Exception("Failed to extract any frames from video")

        return frames_base64

    finally:
        # Release the video capture object
        video.release()


async def understand_video(
    video_path: str,
    prompt: str = "What's happening in this video? Please describe the content, actions, and any important details you observe across these frames.",
    num_frames: int = 8,
    model: str = "gpt-4.1",
    allowed_paths: Optional[List[str]] = None,
) -> ExecutionResult:
    """
    Understand and analyze a video by extracting key frames and using OpenAI's gpt-4.1 API.

    This tool extracts key frames from a video file and processes them through OpenAI's
    gpt-4.1 API to provide insights, descriptions, or answer questions about the video content.

    Args:
        video_path: Path to the video file (MP4, AVI, MOV, etc.)
            - Relative path: Resolved relative to workspace
            - Absolute path: Must be within allowed directories
        prompt: Question or instruction about the video (default: asks for general description)
        num_frames: Number of key frames to extract from the video (default: 8)
            - Higher values provide more detail but increase API costs
            - Recommended range: 4-16 frames
        model: Model to use (default: "gpt-4.1")
        allowed_paths: List of allowed base paths for validation (optional)

    Returns:
        ExecutionResult containing:
        - success: Whether operation succeeded
        - operation: "understand_video"
        - video_path: Path to the analyzed video
        - num_frames_extracted: Number of frames extracted
        - prompt: The prompt used
        - model: Model used for analysis
        - response: The model's understanding/description of the video

    Examples:
        understand_video("demo.mp4")
        → Returns detailed description of the video content

        understand_video("tutorial.mp4", "What steps are shown in this tutorial?")
        → Returns analysis of tutorial steps

        understand_video("meeting.mp4", "Summarize the key points discussed in this meeting", num_frames=12)
        → Returns meeting summary based on 12 key frames

        understand_video("sports.mp4", "What sport is being played and what are the key moments?")
        → Returns sports analysis

    Security:
        - Requires valid OpenAI API key
        - Requires opencv-python package for video processing
        - Video file must exist and be readable
        - Supports common video formats (MP4, AVI, MOV, MKV, etc.)

    Note:
        This tool extracts still frames from the video. Audio content is not analyzed.
        For audio analysis, use the generate_text_with_input_audio tool.
    """
    try:
        # Convert allowed_paths from strings to Path objects
        allowed_paths_list = [Path(p) for p in allowed_paths] if allowed_paths else None

        # Load environment variables
        script_dir = Path(__file__).parent.parent.parent.parent
        env_path = script_dir / ".env"
        if env_path.exists():
            load_dotenv(env_path)
        else:
            load_dotenv()

        openai_api_key = os.getenv("OPENAI_API_KEY")

        if not openai_api_key:
            result = {
                "success": False,
                "operation": "understand_video",
                "error": "OpenAI API key not found. Please set OPENAI_API_KEY in .env file or environment variable.",
            }
            return ExecutionResult(
                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
            )

        # Initialize OpenAI client
        client = OpenAI(api_key=openai_api_key)

        # Resolve video path
        if Path(video_path).is_absolute():
            vid_path = Path(video_path).resolve()
        else:
            vid_path = (Path.cwd() / video_path).resolve()

        # Validate video path
        _validate_path_access(vid_path, allowed_paths_list)

        if not vid_path.exists():
            result = {
                "success": False,
                "operation": "understand_video",
                "error": f"Video file does not exist: {vid_path}",
            }
            return ExecutionResult(
                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
            )

        # Check if file is likely a video (by extension)
        video_extensions = [".mp4", ".avi", ".mov", ".mkv", ".flv", ".wmv", ".webm", ".m4v", ".mpg", ".mpeg"]
        if vid_path.suffix.lower() not in video_extensions:
            result = {
                "success": False,
                "operation": "understand_video",
                "error": f"File does not appear to be a video file: {vid_path}. Supported formats: {', '.join(video_extensions)}",
            }
            return ExecutionResult(
                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
            )

        # Extract key frames from video
        try:
            frames_base64 = _extract_key_frames(vid_path, num_frames)
        except ImportError as import_error:
            result = {
                "success": False,
                "operation": "understand_video",
                "error": str(import_error),
            }
            return ExecutionResult(
                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
            )
        except Exception as extract_error:
            result = {
                "success": False,
                "operation": "understand_video",
                "error": f"Failed to extract frames from video: {str(extract_error)}",
            }
            return ExecutionResult(
                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
            )

        # Build content array with prompt and all frames
        content = [{"type": "input_text", "text": prompt}]

        for frame_base64 in frames_base64:
            content.append(
                {
                    "type": "input_image",
                    "image_url": f"data:image/jpeg;base64,{frame_base64}",
                },
            )

        try:
            # Call OpenAI API for video understanding
            response = client.responses.create(
                model=model,
                input=[
                    {
                        "role": "user",
                        "content": content,
                    },
                ],
            )

            # Extract response text
            response_text = response.output_text if hasattr(response, "output_text") else str(response.output)

            result = {
                "success": True,
                "operation": "understand_video",
                "video_path": str(vid_path),
                "num_frames_extracted": len(frames_base64),
                "prompt": prompt,
                "model": model,
                "response": response_text,
            }
            return ExecutionResult(
                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
            )

        except Exception as api_error:
            result = {
                "success": False,
                "operation": "understand_video",
                "error": f"OpenAI API error: {str(api_error)}",
            }
            return ExecutionResult(
                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
            )

    except Exception as e:
        result = {
            "success": False,
            "operation": "understand_video",
            "error": f"Failed to understand video: {str(e)}",
        }
        return ExecutionResult(
            output_blocks=[TextContent(data=json.dumps(result, indent=2))],
        )
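For the video tool, num_frames controls how many evenly spaced frames are sampled before a single Responses API call, and opencv-python must be installed for the extraction step. The driver below is a minimal sketch under the same assumptions as the previous one (importable module path, OPENAI_API_KEY set, constructor fields exposed as attributes); the video filename and prompt are taken from the docstring examples.

# Hypothetical usage sketch for the new understand_video tool (not shipped in the wheel).
import asyncio
import json
from pathlib import Path

from massgen.tool._multimodal_tools.understand_video import understand_video


async def main() -> None:
    result = await understand_video(
        "meeting.mp4",
        prompt="Summarize the key points discussed in this meeting",
        num_frames=12,  # more frames give more detail but cost more per API call
        allowed_paths=[str(Path.cwd())],  # absolute base path so the resolved video path validates
    )
    # Assumption: output_blocks[0].data holds the JSON payload built by the tool.
    payload = json.loads(result.output_blocks[0].data)
    print(payload["response"] if payload["success"] else payload["error"])


if __name__ == "__main__":
    asyncio.run(main())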