massgen 0.1.3-py3-none-any.whl → 0.1.4-py3-none-any.whl
- massgen/__init__.py +1 -1
- massgen/api_params_handler/_chat_completions_api_params_handler.py +4 -0
- massgen/api_params_handler/_claude_api_params_handler.py +4 -0
- massgen/api_params_handler/_gemini_api_params_handler.py +4 -0
- massgen/api_params_handler/_response_api_params_handler.py +4 -0
- massgen/backend/base_with_custom_tool_and_mcp.py +25 -5
- massgen/backend/docs/permissions_and_context_files.md +2 -2
- massgen/backend/response.py +2 -0
- massgen/configs/README.md +49 -40
- massgen/configs/tools/custom_tools/crawl4ai_example.yaml +55 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_multi.yaml +61 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_single.yaml +29 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_multi.yaml +51 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_single.yaml +33 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_multi.yaml +55 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_single.yaml +33 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_multi.yaml +47 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_single.yaml +29 -0
- massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml +1 -1
- massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml +1 -1
- massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml +1 -1
- massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml +1 -1
- massgen/configs/tools/custom_tools/multimodal_tools/youtube_video_analysis.yaml +1 -1
- massgen/filesystem_manager/_filesystem_manager.py +1 -0
- massgen/filesystem_manager/_path_permission_manager.py +148 -0
- massgen/message_templates.py +160 -12
- massgen/orchestrator.py +16 -0
- massgen/tests/test_binary_file_blocking.py +274 -0
- massgen/tests/test_case_studies.md +12 -12
- massgen/tests/test_multimodal_size_limits.py +407 -0
- massgen/tool/_manager.py +7 -2
- massgen/tool/_multimodal_tools/image_to_image_generation.py +293 -0
- massgen/tool/_multimodal_tools/text_to_file_generation.py +455 -0
- massgen/tool/_multimodal_tools/text_to_image_generation.py +222 -0
- massgen/tool/_multimodal_tools/text_to_speech_continue_generation.py +226 -0
- massgen/tool/_multimodal_tools/text_to_speech_transcription_generation.py +217 -0
- massgen/tool/_multimodal_tools/text_to_video_generation.py +223 -0
- massgen/tool/_multimodal_tools/understand_audio.py +19 -1
- massgen/tool/_multimodal_tools/understand_file.py +6 -1
- massgen/tool/_multimodal_tools/understand_image.py +112 -8
- massgen/tool/_multimodal_tools/understand_video.py +32 -5
- massgen/tool/_web_tools/crawl4ai_tool.py +718 -0
- massgen/tool/docs/multimodal_tools.md +589 -0
- {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/METADATA +96 -69
- {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/RECORD +49 -40
- massgen/configs/tools/custom_tools/crawl4ai_mcp_example.yaml +0 -67
- massgen/configs/tools/custom_tools/crawl4ai_multi_agent_example.yaml +0 -68
- massgen/configs/tools/custom_tools/multimodal_tools/playwright_with_img_understanding.yaml +0 -98
- massgen/configs/tools/custom_tools/multimodal_tools/understand_video_example.yaml +0 -54
- massgen/configs/tools/memory/README.md +0 -199
- massgen/configs/tools/memory/gpt5mini_gemini_context_window_management.yaml +0 -131
- massgen/configs/tools/memory/gpt5mini_gemini_no_persistent_memory.yaml +0 -133
- massgen/configs/tools/memory/test_context_window_management.py +0 -286
- massgen/configs/tools/multimodal/gpt5mini_gpt5nano_documentation_evolution.yaml +0 -97
- {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/WHEEL +0 -0
- {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/entry_points.txt +0 -0
- {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/top_level.txt +0 -0
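
All three new `_multimodal_tools` modules in the hunks below ship the same private `_validate_path_access` helper, which confines writes to a set of allowed base directories via `Path.relative_to` (the storage directory is `resolve()`d before the check). A minimal standalone sketch of that containment rule, using only `pathlib`; the workspace path here is hypothetical:

```python
from pathlib import Path


def is_within(path: Path, allowed: list[Path]) -> bool:
    """Containment check mirroring _validate_path_access in the hunks below."""
    for base in allowed:
        try:
            path.relative_to(base)  # raises ValueError if path is outside base
            return True
        except ValueError:
            continue
    return False


workspace = Path("/tmp/agent_workspace")  # hypothetical workspace root
print(is_within(workspace / "audio" / "clip.wav", [workspace]))  # True
print(is_within(Path("/etc/passwd"), [workspace]))               # False
```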
massgen/tool/_multimodal_tools/text_to_speech_continue_generation.py

```diff
@@ -0,0 +1,226 @@
+# -*- coding: utf-8 -*-
+"""
+Generate audio from text using OpenAI's gpt-4o-audio-preview model and store it in the workspace.
+"""
+
+import base64
+import json
+import os
+from datetime import datetime
+from pathlib import Path
+from typing import List, Optional
+
+from dotenv import load_dotenv
+from openai import OpenAI
+
+from massgen.tool._result import ExecutionResult, TextContent
+
+
+def _validate_path_access(path: Path, allowed_paths: Optional[List[Path]] = None) -> None:
+    """
+    Validate that a path is within allowed directories.
+
+    Args:
+        path: Path to validate
+        allowed_paths: List of allowed base paths (optional)
+        agent_cwd: Agent's current working directory (automatically injected)
+
+    Raises:
+        ValueError: If path is not within allowed directories
+    """
+    if not allowed_paths:
+        return  # No restrictions
+
+    for allowed_path in allowed_paths:
+        try:
+            path.relative_to(allowed_path)
+            return  # Path is within this allowed directory
+        except ValueError:
+            continue
+
+    raise ValueError(f"Path not in allowed directories: {path}")
+
+
+async def text_to_speech_continue_generation(
+    prompt: str,
+    model: str = "gpt-4o-audio-preview",
+    voice: str = "alloy",
+    audio_format: str = "wav",
+    storage_path: Optional[str] = None,
+    allowed_paths: Optional[List[str]] = None,
+    agent_cwd: Optional[str] = None,
+) -> ExecutionResult:
+    """
+    Generate audio from text using OpenAI's gpt-4o-audio-preview model and store it in the workspace.
+
+    This tool generates audio speech from text prompts using OpenAI's audio generation API
+    and saves the audio files to the workspace with automatic organization.
+
+    Args:
+        prompt: Text content to convert to audio speech
+        model: Model to use for generation (default: "gpt-4o-audio-preview")
+        voice: Voice to use for audio generation (default: "alloy")
+            Options: "alloy", "echo", "fable", "onyx", "nova", "shimmer"
+        audio_format: Audio format for output (default: "wav")
+            Options: "wav", "mp3", "opus", "aac", "flac"
+        storage_path: Directory path where to save the audio (optional)
+            - **IMPORTANT**: Must be a DIRECTORY path only, NOT a file path (e.g., "audio/generated" NOT "audio/output.wav")
+            - The filename is automatically generated from the prompt and timestamp
+            - Relative path: Resolved relative to agent's workspace (e.g., "audio/generated")
+            - Absolute path: Must be within allowed directories
+            - None/empty: Saves to agent's workspace root
+        allowed_paths: List of allowed base paths for validation (optional)
+        agent_cwd: Agent's current working directory (automatically injected)
+
+    Returns:
+        ExecutionResult containing:
+        - success: Whether operation succeeded
+        - operation: "generate_and_store_audio_no_input_audios"
+        - audio_file: Generated audio file with path and metadata
+        - model: Model used for generation
+        - prompt: The prompt used for generation
+        - voice: Voice used for generation
+        - format: Audio format used
+
+    Examples:
+        generate_and_store_audio_no_input_audios("Is a golden retriever a good family dog?")
+        → Generates and saves to: 20240115_143022_audio.wav
+
+        generate_and_store_audio_no_input_audios("Hello world", voice="nova", audio_format="mp3")
+        → Generates with nova voice and saves as: 20240115_143022_audio.mp3
+
+    Security:
+        - Requires valid OpenAI API key (automatically detected from .env or environment)
+        - Files are saved to specified path within workspace
+        - Path must be within allowed directories
+    """
+    try:
+        # Convert allowed_paths from strings to Path objects
+        allowed_paths_list = [Path(p) for p in allowed_paths] if allowed_paths else None
+
+        # Use agent_cwd if available, otherwise fall back to base_dir
+        base_dir = Path(agent_cwd) if agent_cwd else Path.cwd()
+
+        # Load environment variables
+        script_dir = Path(__file__).parent.parent.parent.parent
+        env_path = script_dir / ".env"
+        if env_path.exists():
+            load_dotenv(env_path)
+        else:
+            load_dotenv()
+
+        openai_api_key = os.getenv("OPENAI_API_KEY")
+
+        if not openai_api_key:
+            result = {
+                "success": False,
+                "operation": "generate_and_store_audio_no_input_audios",
+                "error": "OpenAI API key not found. Please set OPENAI_API_KEY in .env file or environment variable.",
+            }
+            return ExecutionResult(
+                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
+            )
+
+        # Initialize OpenAI client
+        client = OpenAI(api_key=openai_api_key)
+
+        # Determine storage directory
+        if storage_path:
+            if Path(storage_path).is_absolute():
+                storage_dir = Path(storage_path).resolve()
+            else:
+                storage_dir = (base_dir / storage_path).resolve()
+        else:
+            storage_dir = base_dir
+
+        # Validate storage directory is within allowed paths
+        _validate_path_access(storage_dir, allowed_paths_list)
+
+        # Create directory if it doesn't exist
+        storage_dir.mkdir(parents=True, exist_ok=True)
+
+        try:
+            # Generate audio using OpenAI API
+            completion = client.chat.completions.create(
+                model=model,
+                modalities=["text", "audio"],
+                audio={"voice": voice, "format": audio_format},
+                messages=[
+                    {
+                        "role": "user",
+                        "content": prompt,
+                    },
+                ],
+            )
+
+            # Check if audio data is available
+            if not completion.choices[0].message.audio or not completion.choices[0].message.audio.data:
+                result = {
+                    "success": False,
+                    "operation": "generate_and_store_audio_no_input_audios",
+                    "error": "No audio data received from API",
+                }
+                return ExecutionResult(
+                    output_blocks=[TextContent(data=json.dumps(result, indent=2))],
+                )
+
+            # Decode audio data from base64
+            audio_bytes = base64.b64decode(completion.choices[0].message.audio.data)
+
+            # Generate filename with timestamp
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+            # Clean prompt for filename (first 30 chars)
+            clean_prompt = "".join(c for c in prompt[:30] if c.isalnum() or c in (" ", "-", "_")).strip()
+            clean_prompt = clean_prompt.replace(" ", "_")
+
+            filename = f"{timestamp}_{clean_prompt}.{audio_format}"
+
+            # Full file path
+            file_path = storage_dir / filename
+
+            # Write audio to file
+            file_path.write_bytes(audio_bytes)
+            file_size = len(audio_bytes)
+
+            # Get text response if available
+            text_response = completion.choices[0].message.content if completion.choices[0].message.content else None
+
+            result = {
+                "success": True,
+                "operation": "generate_and_store_audio_no_input_audios",
+                "audio_file": {
+                    "file_path": str(file_path),
+                    "filename": filename,
+                    "size": file_size,
+                    "format": audio_format,
+                },
+                "model": model,
+                "prompt": prompt,
+                "voice": voice,
+                "format": audio_format,
+                "text_response": text_response,
+            }
+            return ExecutionResult(
+                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
+            )
+
+        except Exception as api_error:
+            result = {
+                "success": False,
+                "operation": "generate_and_store_audio_no_input_audios",
+                "error": f"OpenAI API error: {str(api_error)}",
+            }
+            return ExecutionResult(
+                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
+            )
+
+    except Exception as e:
+        result = {
+            "success": False,
+            "operation": "generate_and_store_audio_no_input_audios",
+            "error": f"Failed to generate or save audio: {str(e)}",
+        }
+        return ExecutionResult(
+            output_blocks=[TextContent(data=json.dumps(result, indent=2))],
+        )
```
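
A minimal invocation sketch for the coroutine above, assuming it is called outside the MassGen orchestrator (which normally injects `agent_cwd` and `allowed_paths`). The import path follows the file list; the assumption that `TextContent` exposes the `data` attribute it is constructed with is mine:

```python
import asyncio

from massgen.tool._multimodal_tools.text_to_speech_continue_generation import (
    text_to_speech_continue_generation,
)


async def main() -> None:
    # storage_path must be a directory; the filename is derived from the
    # prompt plus a timestamp. With agent_cwd omitted, the tool falls
    # back to the current working directory.
    result = await text_to_speech_continue_generation(
        "Is a golden retriever a good family dog?",
        voice="nova",
        audio_format="mp3",
        storage_path="audio",
    )
    print(result.output_blocks[0].data)  # JSON status payload


asyncio.run(main())
```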
massgen/tool/_multimodal_tools/text_to_speech_transcription_generation.py

```diff
@@ -0,0 +1,217 @@
+# -*- coding: utf-8 -*-
+"""
+Convert text (transcription) directly to speech using OpenAI's TTS API with streaming response.
+"""
+
+import json
+import os
+from datetime import datetime
+from pathlib import Path
+from typing import List, Optional
+
+from dotenv import load_dotenv
+from openai import OpenAI
+
+from massgen.tool._result import ExecutionResult, TextContent
+
+
+def _validate_path_access(path: Path, allowed_paths: Optional[List[Path]] = None) -> None:
+    """
+    Validate that a path is within allowed directories.
+
+    Args:
+        path: Path to validate
+        allowed_paths: List of allowed base paths (optional)
+        agent_cwd: Agent's current working directory (automatically injected)
+
+    Raises:
+        ValueError: If path is not within allowed directories
+    """
+    if not allowed_paths:
+        return  # No restrictions
+
+    for allowed_path in allowed_paths:
+        try:
+            path.relative_to(allowed_path)
+            return  # Path is within this allowed directory
+        except ValueError:
+            continue
+
+    raise ValueError(f"Path not in allowed directories: {path}")
+
+
+async def text_to_speech_transcription_generation(
+    input_text: str,
+    model: str = "gpt-4o-mini-tts",
+    voice: str = "alloy",
+    instructions: Optional[str] = None,
+    storage_path: Optional[str] = None,
+    audio_format: str = "mp3",
+    allowed_paths: Optional[List[str]] = None,
+    agent_cwd: Optional[str] = None,
+) -> ExecutionResult:
+    """
+    Convert text (transcription) directly to speech using OpenAI's TTS API with streaming response.
+
+    This tool converts text directly to speech audio using OpenAI's Text-to-Speech API,
+    designed specifically for converting transcriptions or any text content to spoken audio.
+    Uses streaming response for efficient file handling.
+
+    Args:
+        input_text: The text content to convert to speech (e.g., transcription text)
+        model: TTS model to use (default: "gpt-4o-mini-tts")
+            Options: "gpt-4o-mini-tts", "tts-1", "tts-1-hd"
+        voice: Voice to use for speech synthesis (default: "alloy")
+            Options: "alloy", "echo", "fable", "onyx", "nova", "shimmer", "coral", "sage"
+        instructions: Optional speaking instructions for tone and style (e.g., "Speak in a cheerful tone")
+        storage_path: Directory path where to save the audio file (optional)
+            - **IMPORTANT**: Must be a DIRECTORY path only, NOT a file path (e.g., "audio/speech" NOT "audio/speech.mp3")
+            - The filename is automatically generated from the text content and timestamp
+            - Relative path: Resolved relative to agent's workspace (e.g., "audio/speech")
+            - Absolute path: Must be within allowed directories
+            - None/empty: Saves to agent's workspace root
+        audio_format: Output audio format (default: "mp3")
+            Options: "mp3", "opus", "aac", "flac", "wav", "pcm"
+        allowed_paths: List of allowed base paths for validation (optional)
+        agent_cwd: Agent's current working directory (automatically injected)
+
+    Returns:
+        ExecutionResult containing:
+        - success: Whether operation succeeded
+        - operation: "convert_text_to_speech"
+        - audio_file: Generated audio file with path and metadata
+        - model: TTS model used
+        - voice: Voice used
+        - format: Audio format used
+        - text_length: Length of input text
+        - instructions: Speaking instructions if provided
+
+    Examples:
+        convert_text_to_speech("Hello world, this is a test.")
+        → Converts text to speech and saves as MP3
+
+        convert_text_to_speech(
+            "Today is a wonderful day to build something people love!",
+            voice="coral",
+            instructions="Speak in a cheerful and positive tone."
+        )
+        → Converts with specific voice and speaking instructions
+
+    Security:
+        - Requires valid OpenAI API key
+        - Files are saved to specified path within workspace
+        - Path must be within allowed directories
+    """
+    try:
+        # Convert allowed_paths from strings to Path objects
+        allowed_paths_list = [Path(p) for p in allowed_paths] if allowed_paths else None
+
+        # Use agent_cwd if available, otherwise fall back to base_dir
+        base_dir = Path(agent_cwd) if agent_cwd else Path.cwd()
+
+        # Load environment variables
+        script_dir = Path(__file__).parent.parent.parent.parent
+        env_path = script_dir / ".env"
+        if env_path.exists():
+            load_dotenv(env_path)
+        else:
+            load_dotenv()
+
+        openai_api_key = os.getenv("OPENAI_API_KEY")
+
+        if not openai_api_key:
+            result = {
+                "success": False,
+                "operation": "convert_text_to_speech",
+                "error": "OpenAI API key not found. Please set OPENAI_API_KEY in .env file or environment variable.",
+            }
+            return ExecutionResult(
+                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
+            )
+
+        # Initialize OpenAI client
+        client = OpenAI(api_key=openai_api_key)
+
+        # Determine storage directory
+        if storage_path:
+            if Path(storage_path).is_absolute():
+                storage_dir = Path(storage_path).resolve()
+            else:
+                storage_dir = (base_dir / storage_path).resolve()
+        else:
+            storage_dir = base_dir
+
+        # Validate storage directory is within allowed paths
+        _validate_path_access(storage_dir, allowed_paths_list)
+
+        # Create directory if it doesn't exist
+        storage_dir.mkdir(parents=True, exist_ok=True)
+
+        # Generate filename with timestamp
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+        # Clean text for filename (first 30 chars)
+        clean_text = "".join(c for c in input_text[:30] if c.isalnum() or c in (" ", "-", "_")).strip()
+        clean_text = clean_text.replace(" ", "_")
+
+        filename = f"speech_{timestamp}_{clean_text}.{audio_format}"
+        file_path = storage_dir / filename
+
+        try:
+            # Prepare request parameters
+            request_params = {
+                "model": model,
+                "voice": voice,
+                "input": input_text,
+            }
+
+            # Add instructions if provided (only for models that support it)
+            if instructions and model in ["gpt-4o-mini-tts"]:
+                request_params["instructions"] = instructions
+
+            # Use streaming response for efficient file handling
+            with client.audio.speech.with_streaming_response.create(**request_params) as response:
+                # Stream directly to file
+                response.stream_to_file(file_path)
+
+            # Get file size
+            file_size = file_path.stat().st_size
+
+            result = {
+                "success": True,
+                "operation": "convert_text_to_speech",
+                "audio_file": {
+                    "file_path": str(file_path),
+                    "filename": filename,
+                    "size": file_size,
+                    "format": audio_format,
+                },
+                "model": model,
+                "voice": voice,
+                "format": audio_format,
+                "text_length": len(input_text),
+                "instructions": instructions if instructions else None,
+            }
+            return ExecutionResult(
+                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
+            )
+
+        except Exception as api_error:
+            result = {
+                "success": False,
+                "operation": "convert_text_to_speech",
+                "error": f"OpenAI TTS API error: {str(api_error)}",
+            }
+            return ExecutionResult(
+                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
+            )
+
+    except Exception as e:
+        result = {
+            "success": False,
+            "operation": "convert_text_to_speech",
+            "error": f"Failed to convert text to speech: {str(e)}",
+        }
+        return ExecutionResult(
+            output_blocks=[TextContent(data=json.dumps(result, indent=2))],
+        )
```
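
A similar hedged sketch for the TTS coroutine above; note that the guard in the code only forwards `instructions` when `model` is `"gpt-4o-mini-tts"` (the default):

```python
import asyncio

from massgen.tool._multimodal_tools.text_to_speech_transcription_generation import (
    text_to_speech_transcription_generation,
)


async def main() -> None:
    # Streams the synthesized MP3 to <cwd>/audio/speech/speech_<ts>_<text>.mp3.
    result = await text_to_speech_transcription_generation(
        "Today is a wonderful day to build something people love!",
        voice="coral",
        instructions="Speak in a cheerful and positive tone.",
        storage_path="audio/speech",
    )
    print(result.output_blocks[0].data)  # JSON status payload


asyncio.run(main())
```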
massgen/tool/_multimodal_tools/text_to_video_generation.py

```diff
@@ -0,0 +1,223 @@
+# -*- coding: utf-8 -*-
+"""
+Generate a video from a text prompt using OpenAI's Sora-2 API.
+"""
+
+import json
+import os
+import time
+from datetime import datetime
+from pathlib import Path
+from typing import List, Optional
+
+from dotenv import load_dotenv
+from openai import OpenAI
+
+from massgen.tool._result import ExecutionResult, TextContent
+
+
+def _validate_path_access(path: Path, allowed_paths: Optional[List[Path]] = None) -> None:
+    """
+    Validate that a path is within allowed directories.
+
+    Args:
+        path: Path to validate
+        allowed_paths: List of allowed base paths (optional)
+        agent_cwd: Agent's current working directory (automatically injected)
+
+    Raises:
+        ValueError: If path is not within allowed directories
+    """
+    if not allowed_paths:
+        return  # No restrictions
+
+    for allowed_path in allowed_paths:
+        try:
+            path.relative_to(allowed_path)
+            return  # Path is within this allowed directory
+        except ValueError:
+            continue
+
+    raise ValueError(f"Path not in allowed directories: {path}")
+
+
+async def text_to_video_generation(
+    prompt: str,
+    model: str = "sora-2",
+    seconds: int = 4,
+    storage_path: Optional[str] = None,
+    allowed_paths: Optional[List[str]] = None,
+    agent_cwd: Optional[str] = None,
+) -> ExecutionResult:
+    """
+    Generate a video from a text prompt using OpenAI's Sora-2 API.
+
+    This tool generates a video based on a text prompt using OpenAI's Sora-2 API
+    and saves it to the workspace with automatic organization.
+
+    Args:
+        prompt: Text description for the video to generate
+        model: Model to use (default: "sora-2")
+        seconds: Video duration in seconds (default: 4)
+        storage_path: Directory path where to save the video (optional)
+            - **IMPORTANT**: Must be a DIRECTORY path only, NOT a file path (e.g., "videos/generated" NOT "videos/output.mp4")
+            - The filename is automatically generated from the prompt and timestamp
+            - Relative path: Resolved relative to agent's workspace (e.g., "videos/generated")
+            - Absolute path: Must be within allowed directories
+            - None/empty: Saves to agent's workspace root
+        allowed_paths: List of allowed base paths for validation (optional)
+        agent_cwd: Agent's current working directory (automatically injected)
+
+    Returns:
+        ExecutionResult containing:
+        - success: Whether operation succeeded
+        - operation: "generate_and_store_video_no_input_images"
+        - video_path: Path to the saved video file
+        - model: Model used for generation
+        - prompt: The prompt used
+        - duration: Time taken for generation in seconds
+
+    Examples:
+        generate_and_store_video_no_input_images("A cool cat on a motorcycle in the night")
+        → Generates a video and saves to workspace root
+
+        generate_and_store_video_no_input_images("Dancing robot", storage_path="videos/")
+        → Generates a video and saves to videos/ directory
+
+    Security:
+        - Requires valid OpenAI API key with Sora-2 access
+        - Files are saved to specified path within workspace
+    """
+    try:
+        # Convert allowed_paths from strings to Path objects
+        allowed_paths_list = [Path(p) for p in allowed_paths] if allowed_paths else None
+
+        # Use agent_cwd if available, otherwise fall back to base_dir
+        base_dir = Path(agent_cwd) if agent_cwd else Path.cwd()
+
+        # Load environment variables
+        script_dir = Path(__file__).parent.parent.parent.parent
+        env_path = script_dir / ".env"
+        if env_path.exists():
+            load_dotenv(env_path)
+        else:
+            load_dotenv()
+
+        openai_api_key = os.getenv("OPENAI_API_KEY")
+
+        if not openai_api_key:
+            result = {
+                "success": False,
+                "operation": "generate_and_store_video_no_input_images",
+                "error": "OpenAI API key not found. Please set OPENAI_API_KEY in .env file or environment variable.",
+            }
+            return ExecutionResult(
+                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
+            )
+
+        # Initialize OpenAI client
+        client = OpenAI(api_key=openai_api_key)
+
+        # Determine storage directory
+        if storage_path:
+            if Path(storage_path).is_absolute():
+                storage_dir = Path(storage_path).resolve()
+            else:
+                storage_dir = (base_dir / storage_path).resolve()
+        else:
+            storage_dir = base_dir
+
+        # Validate storage directory is within allowed paths
+        _validate_path_access(storage_dir, allowed_paths_list)
+
+        # Create directory if it doesn't exist
+        storage_dir.mkdir(parents=True, exist_ok=True)
+
+        try:
+            start_time = time.time()
+
+            # Start video generation (no print statements to avoid MCP JSON parsing issues)
+            video = client.videos.create(
+                model=model,
+                prompt=prompt,
+                seconds=str(seconds),
+            )
+
+            getattr(video, "progress", 0)
+
+            # Monitor progress (silently, no stdout writes)
+            while video.status in ("in_progress", "queued"):
+                # Refresh status
+                video = client.videos.retrieve(video.id)
+                getattr(video, "progress", 0)
+                time.sleep(2)
+
+            if video.status == "failed":
+                message = getattr(
+                    getattr(video, "error", None),
+                    "message",
+                    "Video generation failed",
+                )
+                result = {
+                    "success": False,
+                    "operation": "generate_and_store_video_no_input_images",
+                    "error": message,
+                }
+                return ExecutionResult(
+                    output_blocks=[TextContent(data=json.dumps(result, indent=2))],
+                )
+
+            # Download video content
+            content = client.videos.download_content(video.id, variant="video")
+
+            # Generate filename with timestamp
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            clean_prompt = "".join(c for c in prompt[:30] if c.isalnum() or c in (" ", "-", "_")).strip()
+            clean_prompt = clean_prompt.replace(" ", "_")
+            filename = f"{timestamp}_{clean_prompt}.mp4"
+
+            # Full file path
+            file_path = storage_dir / filename
+
+            # Write video to file
+            content.write_to_file(str(file_path))
+
+            # Calculate duration
+            duration = time.time() - start_time
+
+            # Get file size
+            file_size = file_path.stat().st_size
+
+            result = {
+                "success": True,
+                "operation": "generate_and_store_video_no_input_images",
+                "video_path": str(file_path),
+                "filename": filename,
+                "size": file_size,
+                "model": model,
+                "prompt": prompt,
+                "duration": duration,
+            }
+            return ExecutionResult(
+                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
+            )
+
+        except Exception as api_error:
+            result = {
+                "success": False,
+                "operation": "generate_and_store_video_no_input_images",
+                "error": f"OpenAI API error: {str(api_error)}",
+            }
+            return ExecutionResult(
+                output_blocks=[TextContent(data=json.dumps(result, indent=2))],
+            )
+
+    except Exception as e:
+        result = {
+            "success": False,
+            "operation": "generate_and_store_video_no_input_images",
+            "error": f"Failed to generate or save video: {str(e)}",
+        }
+        return ExecutionResult(
+            output_blocks=[TextContent(data=json.dumps(result, indent=2))],
+        )
```
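
And a sketch for the Sora-2 tool above. The coroutine blocks while it polls `client.videos.retrieve` every two seconds, so the call returns only once the MP4 has been downloaded (or generation has failed), with the elapsed time reported in the `duration` field:

```python
import asyncio

from massgen.tool._multimodal_tools.text_to_video_generation import (
    text_to_video_generation,
)


async def main() -> None:
    # Saves <timestamp>_<prompt>.mp4 under <cwd>/videos; polling happens
    # inside the tool, so this await covers the full generation time.
    result = await text_to_video_generation(
        "A cool cat on a motorcycle in the night",
        seconds=4,
        storage_path="videos",
    )
    print(result.output_blocks[0].data)  # JSON status payload


asyncio.run(main())
```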