massgen 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of massgen might be problematic. Click here for more details.

Files changed (82) hide show
  1. massgen/__init__.py +1 -1
  2. massgen/agent_config.py +33 -7
  3. massgen/api_params_handler/_api_params_handler_base.py +3 -0
  4. massgen/api_params_handler/_chat_completions_api_params_handler.py +4 -0
  5. massgen/api_params_handler/_claude_api_params_handler.py +4 -0
  6. massgen/api_params_handler/_gemini_api_params_handler.py +4 -0
  7. massgen/api_params_handler/_response_api_params_handler.py +4 -0
  8. massgen/backend/azure_openai.py +9 -1
  9. massgen/backend/base.py +4 -0
  10. massgen/backend/base_with_custom_tool_and_mcp.py +25 -5
  11. massgen/backend/claude_code.py +9 -1
  12. massgen/backend/docs/permissions_and_context_files.md +2 -2
  13. massgen/backend/gemini.py +35 -6
  14. massgen/backend/gemini_utils.py +30 -0
  15. massgen/backend/response.py +2 -0
  16. massgen/chat_agent.py +9 -3
  17. massgen/cli.py +291 -43
  18. massgen/config_builder.py +163 -18
  19. massgen/configs/README.md +69 -14
  20. massgen/configs/debug/restart_test_controlled.yaml +60 -0
  21. massgen/configs/debug/restart_test_controlled_filesystem.yaml +73 -0
  22. massgen/configs/tools/code-execution/docker_with_sudo.yaml +35 -0
  23. massgen/configs/tools/custom_tools/computer_use_browser_example.yaml +56 -0
  24. massgen/configs/tools/custom_tools/computer_use_docker_example.yaml +65 -0
  25. massgen/configs/tools/custom_tools/computer_use_example.yaml +50 -0
  26. massgen/configs/tools/custom_tools/crawl4ai_example.yaml +55 -0
  27. massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_multi.yaml +61 -0
  28. massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_single.yaml +29 -0
  29. massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_multi.yaml +51 -0
  30. massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_single.yaml +33 -0
  31. massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_multi.yaml +55 -0
  32. massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_single.yaml +33 -0
  33. massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_multi.yaml +47 -0
  34. massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_single.yaml +29 -0
  35. massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml +33 -0
  36. massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml +34 -0
  37. massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml +33 -0
  38. massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml +34 -0
  39. massgen/configs/tools/custom_tools/multimodal_tools/youtube_video_analysis.yaml +59 -0
  40. massgen/docker/README.md +83 -0
  41. massgen/filesystem_manager/_code_execution_server.py +22 -7
  42. massgen/filesystem_manager/_docker_manager.py +21 -1
  43. massgen/filesystem_manager/_filesystem_manager.py +9 -0
  44. massgen/filesystem_manager/_path_permission_manager.py +148 -0
  45. massgen/filesystem_manager/_workspace_tools_server.py +0 -997
  46. massgen/formatter/_gemini_formatter.py +73 -0
  47. massgen/frontend/coordination_ui.py +175 -257
  48. massgen/frontend/displays/base_display.py +29 -0
  49. massgen/frontend/displays/rich_terminal_display.py +155 -9
  50. massgen/frontend/displays/simple_display.py +21 -0
  51. massgen/frontend/displays/terminal_display.py +22 -2
  52. massgen/logger_config.py +50 -6
  53. massgen/message_templates.py +283 -15
  54. massgen/orchestrator.py +335 -38
  55. massgen/tests/test_binary_file_blocking.py +274 -0
  56. massgen/tests/test_case_studies.md +12 -12
  57. massgen/tests/test_code_execution.py +178 -0
  58. massgen/tests/test_multimodal_size_limits.py +407 -0
  59. massgen/tests/test_orchestration_restart.py +204 -0
  60. massgen/tool/__init__.py +4 -0
  61. massgen/tool/_manager.py +7 -2
  62. massgen/tool/_multimodal_tools/image_to_image_generation.py +293 -0
  63. massgen/tool/_multimodal_tools/text_to_file_generation.py +455 -0
  64. massgen/tool/_multimodal_tools/text_to_image_generation.py +222 -0
  65. massgen/tool/_multimodal_tools/text_to_speech_continue_generation.py +226 -0
  66. massgen/tool/_multimodal_tools/text_to_speech_transcription_generation.py +217 -0
  67. massgen/tool/_multimodal_tools/text_to_video_generation.py +223 -0
  68. massgen/tool/_multimodal_tools/understand_audio.py +211 -0
  69. massgen/tool/_multimodal_tools/understand_file.py +555 -0
  70. massgen/tool/_multimodal_tools/understand_image.py +316 -0
  71. massgen/tool/_multimodal_tools/understand_video.py +340 -0
  72. massgen/tool/_web_tools/crawl4ai_tool.py +718 -0
  73. massgen/tool/docs/multimodal_tools.md +1368 -0
  74. massgen/tool/workflow_toolkits/__init__.py +26 -0
  75. massgen/tool/workflow_toolkits/post_evaluation.py +216 -0
  76. massgen/utils.py +1 -0
  77. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/METADATA +101 -69
  78. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/RECORD +82 -46
  79. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/WHEEL +0 -0
  80. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/entry_points.txt +0 -0
  81. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/licenses/LICENSE +0 -0
  82. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1368 @@
1
+ # Multimodal Tools Guide
2
+
3
+ ## Overview
4
+
5
+ MassGen provides a comprehensive suite of multimodal tools that enable AI agents to analyze and understand various media types, including videos, images, audio files, and documents, and to generate new media such as images from text prompts. These tools leverage OpenAI's advanced multimodal APIs (gpt-4.1 and transcription services) to provide intelligent content analysis and generation capabilities.
6
+
7
+ ## Tool Categories
8
+
9
+ The multimodal understanding tools are organized into four main categories:
10
+
11
+ - **Video Understanding**: Extract key frames and analyze video content
12
+ - **Audio Understanding**: Transcribe and analyze audio files
13
+ - **Image Understanding**: Analyze and describe image content
14
+ - **File Understanding**: Process and understand text files and documents (PDF, DOCX, PPTX, XLSX)
15
+
16
+ ## Video Understanding Tool
17
+
18
+ ### understand_video
19
+
20
+ **What it does**: Extracts key frames from video files and uses OpenAI's gpt-4.1 API to analyze and understand video content. The tool samples frames evenly across the video timeline to provide comprehensive coverage of the video's content.
21
+
22
+ **Why use it**: Allows agents to understand video content without manually watching videos. Perfect for summarizing videos, extracting key information, analyzing tutorial steps, or answering specific questions about video content.
23
+
24
+ **Location**: `massgen.tool._multimodal_tools.understand_video`
25
+
26
+ #### Parameters
27
+
28
+ - `video_path` (required): Path to the video file
29
+ - Relative path: Resolved relative to workspace
30
+ - Absolute path: Must be within allowed directories
31
+ - Supported formats: MP4, AVI, MOV, MKV, FLV, WMV, WEBM, M4V, MPG, MPEG
32
+ - `prompt` (optional): Question or instruction about the video (default: "What's happening in this video? Please describe the content, actions, and any important details you observe across these frames.")
33
+ - `num_frames` (optional): Number of key frames to extract (default: 8)
34
+ - Higher values provide more detail but increase API costs
35
+ - Recommended range: 4-16 frames
36
+ - `model` (optional): OpenAI model to use (default: "gpt-4.1")
37
+ - `allowed_paths` (optional): List of allowed base paths for validation
38
+
39
+ #### Returns
40
+
41
+ ExecutionResult containing:
42
+ - `success`: Whether operation succeeded
43
+ - `operation`: "understand_video"
44
+ - `video_path`: Path to the analyzed video
45
+ - `num_frames_extracted`: Number of frames extracted
46
+ - `prompt`: The prompt used
47
+ - `model`: Model used for analysis
48
+ - `response`: The model's understanding/description of the video
49
+
50
+ #### Security Features
51
+
52
+ - Path validation to ensure access only to allowed directories
53
+ - Requires valid OpenAI API key
54
+ - File existence and format validation
55
+ - Automatic cleanup of video capture resources
56
+
57
+ #### Dependencies
58
+
59
+ Requires `opencv-python` package:
60
+ ```bash
61
+ pip install "opencv-python>=4.12.0.88"
62
+ ```
63
+
64
+ #### Examples
65
+
66
+ **Basic Video Analysis**:
67
+
68
+ ```python
69
+ from massgen.tool._multimodal_tools import understand_video
70
+
71
+ # Analyze a video with default prompt
72
+ result = await understand_video(video_path="demo.mp4")
73
+
74
+ # Output includes detailed description of the video
75
+ print(result.output_blocks[0].data)
76
+ # {
77
+ # "success": true,
78
+ # "operation": "understand_video",
79
+ # "video_path": "/path/to/demo.mp4",
80
+ # "num_frames_extracted": 8,
81
+ # "response": "The video shows..."
82
+ # }
83
+ ```
84
+
85
+ **Custom Prompt and Frame Count**:
86
+
87
+ ```python
88
+ # Ask specific questions about the video
89
+ result = await understand_video(
90
+ video_path="tutorial.mp4",
91
+ prompt="What steps are shown in this tutorial? List them in order.",
92
+ num_frames=12 # Extract more frames for detailed analysis
93
+ )
94
+ ```
95
+
96
+ **Meeting Summary**:
97
+
98
+ ```python
99
+ # Summarize a meeting recording
100
+ result = await understand_video(
101
+ video_path="meeting_recording.mp4",
102
+ prompt="Summarize the key points and decisions made in this meeting."
103
+ )
104
+ ```
105
+
106
+ **Configuration Example**:
107
+
108
+ ```yaml
109
+ # massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml
110
+ agents:
111
+ - id: "understand_video_tool"
112
+ backend:
113
+ type: "openai"
114
+ model: "gpt-5-nano"
115
+ cwd: "workspace1"
116
+ custom_tools:
117
+ - name: ["understand_video"]
118
+ category: "multimodal"
119
+ path: "massgen/tool/_multimodal_tools/understand_video.py"
120
+ function: ["understand_video"]
121
+ ```
122
+
123
+ **CLI Usage**:
124
+
125
+ ```bash
126
+ massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml "What's happening in this video?"
127
+ ```
128
+
129
+ #### Note
130
+
131
+ This tool extracts still frames from the video. Audio content is not analyzed. For audio analysis, use the `understand_audio` tool.
132
+
133
+ ---
134
+
135
+ ## Audio Understanding Tool
136
+
137
+ ### understand_audio
138
+
139
+ **What it does**: Transcribes audio files to text using OpenAI's Transcription API. Supports multiple audio file formats and can process multiple files in a single call.
140
+
141
+ **Why use it**: Enables agents to understand spoken content in audio files without manual listening. Ideal for transcribing interviews, meetings, podcasts, or any audio content that needs to be converted to text.
142
+
143
+ **Location**: `massgen.tool._multimodal_tools.understand_audio`
144
+
145
+ #### Parameters
146
+
147
+ - `audio_paths` (required): List of paths to input audio files
148
+ - Relative paths: Resolved relative to workspace
149
+ - Absolute paths: Must be within allowed directories
150
+ - Supported formats: WAV, MP3, M4A, MP4, OGG, FLAC, AAC, WMA, OPUS
151
+ - `model` (optional): Model to use (default: "gpt-4o-transcribe")
152
+ - `allowed_paths` (optional): List of allowed base paths for validation
153
+
154
+ #### Returns
155
+
156
+ ExecutionResult containing:
157
+ - `success`: Whether operation succeeded
158
+ - `operation`: "generate_text_with_input_audio"
159
+ - `transcriptions`: List of transcription results for each file
160
+ - Each contains `file` path and `transcription` text
161
+ - `audio_files`: List of paths to the input audio files
162
+ - `model`: Model used
163
+
164
+ #### Security Features
165
+
166
+ - Path validation for all audio files
167
+ - File existence and format validation
168
+ - Requires valid OpenAI API key
169
+ - Separate error handling for each file
170
+
171
+ #### Examples
172
+
173
+ **Single Audio File Transcription**:
174
+
175
+ ```python
176
+ from massgen.tool._multimodal_tools import understand_audio
177
+
178
+ # Transcribe a single audio file
179
+ result = await understand_audio(audio_paths=["recording.wav"])
180
+
181
+ # Output includes transcription
182
+ print(result.output_blocks[0].data)
183
+ # {
184
+ # "success": true,
185
+ # "operation": "generate_text_with_input_audio",
186
+ # "transcriptions": [
187
+ # {
188
+ # "file": "/path/to/recording.wav",
189
+ # "transcription": "Hello, this is a test recording..."
190
+ # }
191
+ # ],
192
+ # "audio_files": ["/path/to/recording.wav"],
193
+ # "model": "gpt-4o-transcribe"
194
+ # }
195
+ ```
196
+
197
+ **Multiple Audio Files**:
198
+
199
+ ```python
200
+ # Transcribe multiple audio files in one call
201
+ result = await understand_audio(
202
+ audio_paths=["interview1.mp3", "interview2.mp3", "interview3.mp3"]
203
+ )
204
+
205
+ # Each file is transcribed separately
206
+ for transcription in result["transcriptions"]:
207
+ print(f"File: {transcription['file']}")
208
+ print(f"Text: {transcription['transcription']}")
209
+ ```
210
+
211
+ **Configuration Example**:
212
+
213
+ ```yaml
214
+ # massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml
215
+ agents:
216
+ - id: "understand_audio_tool"
217
+ backend:
218
+ type: "openai"
219
+ model: "gpt-5-nano"
220
+ cwd: "workspace1"
221
+ custom_tools:
222
+ - name: ["understand_audio"]
223
+ category: "multimodal"
224
+ path: "massgen/tool/_multimodal_tools/understand_audio.py"
225
+ function: ["understand_audio"]
226
+ ```
227
+
228
+ **CLI Usage**:
229
+
230
+ ```bash
231
+ massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml "What is being said in this audio?"
232
+ ```
233
+
234
+ ---
235
+
236
+ ## Image Understanding Tool
237
+
238
+ ### understand_image
239
+
240
+ **What it does**: Analyzes images using OpenAI's gpt-4.1 API to provide descriptions, answer questions, or extract insights from image content.
241
+
242
+ **Why use it**: Allows agents to "see" and understand images. Perfect for analyzing charts, screenshots, photos, diagrams, or any visual content that needs interpretation.
243
+
244
+ **Location**: `massgen.tool._multimodal_tools.understand_image`
245
+
246
+ #### Parameters
247
+
248
+ - `image_path` (required): Path to the image file
249
+ - Relative path: Resolved relative to workspace
250
+ - Absolute path: Must be within allowed directories
251
+ - Supported formats: PNG, JPEG, JPG
252
+ - `prompt` (optional): Question or instruction about the image (default: "What's in this image? Please describe it in detail.")
253
+ - `model` (optional): Model to use (default: "gpt-4.1")
254
+ - `allowed_paths` (optional): List of allowed base paths for validation
255
+
256
+ #### Returns
257
+
258
+ ExecutionResult containing:
259
+ - `success`: Whether operation succeeded
260
+ - `operation`: "understand_image"
261
+ - `image_path`: Path to the analyzed image
262
+ - `prompt`: The prompt used
263
+ - `model`: Model used for analysis
264
+ - `response`: The model's understanding/description of the image
265
+
266
+ #### Security Features
267
+
268
+ - Path validation to ensure access only to allowed directories
269
+ - Requires valid OpenAI API key
270
+ - File format validation (PNG, JPEG, JPG only)
271
+ - Secure base64 encoding for API transmission
272
+
273
+ #### Examples
274
+
275
+ **Basic Image Description**:
276
+
277
+ ```python
278
+ from massgen.tool._multimodal_tools import understand_image
279
+
280
+ # Get a detailed description of an image
281
+ result = await understand_image(image_path="photo.jpg")
282
+
283
+ # Output includes image analysis
284
+ print(result.output_blocks[0].data)
285
+ # {
286
+ # "success": true,
287
+ # "operation": "understand_image",
288
+ # "image_path": "/path/to/photo.jpg",
289
+ # "response": "This image shows..."
290
+ # }
291
+ ```
292
+
293
+ **Chart Analysis**:
294
+
295
+ ```python
296
+ # Analyze a chart or graph
297
+ result = await understand_image(
298
+ image_path="sales_chart.png",
299
+ prompt="What data is shown in this chart? What are the key trends?"
300
+ )
301
+ ```
302
+
303
+ **Screenshot Analysis**:
304
+
305
+ ```python
306
+ # Analyze UI elements in a screenshot
307
+ result = await understand_image(
308
+ image_path="app_screenshot.png",
309
+ prompt="What UI elements are visible in this screenshot? Describe the layout and functionality."
310
+ )
311
+ ```
312
+
313
+ **Diagram Understanding**:
314
+
315
+ ```python
316
+ # Understand technical diagrams
317
+ result = await understand_image(
318
+ image_path="architecture_diagram.png",
319
+ prompt="Explain the system architecture shown in this diagram."
320
+ )
321
+ ```
322
+
323
+ **Configuration Example**:
324
+
325
+ ```yaml
326
+ # massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml
327
+ agents:
328
+ - id: "understand_image_tool"
329
+ backend:
330
+ type: "openai"
331
+ model: "gpt-5-nano"
332
+ cwd: "workspace1"
333
+ custom_tools:
334
+ - name: ["understand_image"]
335
+ category: "multimodal"
336
+ path: "massgen/tool/_multimodal_tools/understand_image.py"
337
+ function: ["understand_image"]
338
+ ```
339
+
340
+ **CLI Usage**:
341
+
342
+ ```bash
343
+ massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml "Describe this image in detail"
344
+ ```
345
+
346
+ ---
347
+
348
+ ## File Understanding Tool
349
+
350
+ ### understand_file
351
+
352
+ **What it does**: Reads and analyzes various file types (text files, PDF, DOCX, XLSX, PPTX) using OpenAI's gpt-4.1 API. Automatically extracts content from different document formats and processes it for analysis.
353
+
354
+ **Why use it**: Enables agents to understand document content without manual reading. Perfect for summarizing documents, extracting key information, answering questions about files, or analyzing structured data.
355
+
356
+ **Location**: `massgen.tool._multimodal_tools.understand_file`
357
+
358
+ #### Parameters
359
+
360
+ - `file_path` (required): Path to the file to analyze
361
+ - Relative path: Resolved relative to workspace
362
+ - Absolute path: Must be within allowed directories
363
+ - `prompt` (optional): Question or instruction about the file (default: "Please analyze this file and provide a comprehensive understanding of its content, purpose, and structure.")
364
+ - `model` (optional): Model to use (default: "gpt-4.1")
365
+ - `max_chars` (optional): Maximum number of characters to read/extract (default: 50000)
366
+ - Prevents processing extremely large files
367
+ - Applies to both text files and extracted content
368
+ - `allowed_paths` (optional): List of allowed base paths for validation
369
+
370
+ #### Returns
371
+
372
+ ExecutionResult containing:
373
+ - `success`: Whether operation succeeded
374
+ - `operation`: "understand_file"
375
+ - `file_path`: Path to the analyzed file
376
+ - `file_name`: Name of the file
377
+ - `file_type`: Extraction method used ("text", "pdf", "docx", "excel", "pptx")
378
+ - `file_size`: Size of the file in bytes
379
+ - `chars_read`: Number of characters read/extracted
380
+ - `truncated`: Whether content was truncated
381
+ - `prompt`: The prompt used
382
+ - `model`: Model used for analysis
383
+ - `response`: The model's understanding/analysis of the file
384
+
385
+ #### Security Features
386
+
387
+ - Path validation to ensure access only to allowed directories
388
+ - File existence and type validation
389
+ - Content size limits to prevent memory issues
390
+ - Requires valid OpenAI API key
391
+ - Blocks unsupported binary formats
392
+
393
+ #### Supported File Types
394
+
395
+ **Text Files** (read directly):
396
+ - Code: `.py`, `.js`, `.java`, `.cpp`, `.c`, `.go`, `.rs`, `.ts`, `.tsx`, `.jsx`, etc.
397
+ - Config and markup: `.md`, `.yaml`, `.yml`, `.json`, `.xml`, `.toml`, `.ini`, etc.
398
+ - Data: `.txt`, `.log`, `.csv`, `.tsv`, etc.
399
+
400
+ **Document Files** (require additional packages):
401
+ - PDF: `.pdf` (requires `PyPDF2`)
402
+ - Word: `.docx` (requires `python-docx`)
403
+ - Excel: `.xlsx` (requires `openpyxl`)
404
+ - PowerPoint: `.pptx` (requires `python-pptx`)
405
+
406
+ **Unsupported Formats**:
407
+ - Old Office formats (`.doc`, `.xls`, `.ppt`)
408
+ - Images (use `understand_image` instead)
409
+ - Videos (use `understand_video` instead)
410
+ - Audio (use `understand_audio` instead)
411
+ - Archives (`.zip`, `.tar`, `.gz`, etc.)
412
+ - Executables (`.exe`, `.dll`, `.so`, etc.)
413
+
414
+ #### Dependencies
415
+
416
+ For document processing:
417
+ ```bash
418
+ pip install "PyPDF2>=3.0.1" # For PDF files
419
+ pip install "python-docx>=1.2.0" # For DOCX files
420
+ pip install "openpyxl>=3.1.5" # For XLSX files
421
+ pip install "python-pptx>=1.0.2" # For PPTX files
422
+ ```
423
+
424
+ #### Examples
425
+
426
+ **Analyze Python Script**:
427
+
428
+ ```python
429
+ from massgen.tool._multimodal_tools import understand_file
430
+
431
+ # Analyze a Python script
432
+ result = await understand_file(
433
+ file_path="script.py",
434
+ prompt="Explain what this script does and how it works."
435
+ )
436
+
437
+ print(result.output_blocks[0].data)
438
+ # {
439
+ # "success": true,
440
+ # "operation": "understand_file",
441
+ # "file_type": "text",
442
+ # "response": "This Python script..."
443
+ # }
444
+ ```
445
+
446
+ **Summarize Documentation**:
447
+
448
+ ```python
449
+ # Summarize a README file
450
+ result = await understand_file(
451
+ file_path="README.md",
452
+ prompt="Summarize the key points of this documentation in 3-5 bullet points."
453
+ )
454
+ ```
455
+
456
+ **Analyze PDF Document**:
457
+
458
+ ```python
459
+ # Analyze a research paper
460
+ result = await understand_file(
461
+ file_path="research_paper.pdf",
462
+ prompt="What are the main findings and conclusions of this research paper?"
463
+ )
464
+ ```
465
+
466
+ **Process Word Document**:
467
+
468
+ ```python
469
+ # Summarize a business proposal
470
+ result = await understand_file(
471
+ file_path="proposal.docx",
472
+ prompt="Provide a summary of this business proposal including objectives, timeline, and budget."
473
+ )
474
+ ```
475
+
476
+ **Analyze Excel Spreadsheet**:
477
+
478
+ ```python
479
+ # Analyze data in spreadsheet
480
+ result = await understand_file(
481
+ file_path="sales_data.xlsx",
482
+ prompt="What patterns and trends can you identify in this sales data?"
483
+ )
484
+ ```
485
+
486
+ **Process PowerPoint Presentation**:
487
+
488
+ ```python
489
+ # Summarize presentation
490
+ result = await understand_file(
491
+ file_path="quarterly_review.pptx",
492
+ prompt="Summarize the key points from each slide of this presentation."
493
+ )
494
+ ```
495
+
496
+ **Handle Large Files**:
497
+
498
+ ```python
499
+ # Process large file with custom character limit
500
+ result = await understand_file(
501
+ file_path="large_document.pdf",
502
+ prompt="Summarize the introduction and conclusion sections.",
503
+ max_chars=100000 # Increase limit for larger files
504
+ )
505
+ ```
506
+
507
+ **Configuration Example**:
508
+
509
+ ```yaml
510
+ # massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml
511
+ agents:
512
+ - id: "understand_file_tool"
513
+ backend:
514
+ type: "openai"
515
+ model: "gpt-5-nano"
516
+ cwd: "workspace1"
517
+ custom_tools:
518
+ - name: ["understand_file"]
519
+ category: "multimodal"
520
+ path: "massgen/tool/_multimodal_tools/understand_file.py"
521
+ function: ["understand_file"]
522
+ ```
523
+
524
+ **CLI Usage**:
525
+
526
+ ```bash
527
+ massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml "Summarize the content of this PDF"
528
+ ```
529
+
530
+ ---
531
+
532
+ ## Setup and Configuration
533
+
534
+ ### Environment Setup
535
+
536
+ All multimodal tools require an OpenAI API key. Set it in your environment or `.env` file:
537
+
538
+ ```bash
539
+ # In your .env file or environment
540
+ OPENAI_API_KEY=your-api-key-here
541
+ ```
542
+
543
+ ### Installing Dependencies
544
+
545
+ Install all required dependencies:
546
+
547
+ ```bash
548
+ # For video understanding
549
+ pip install "opencv-python>=4.12.0.88"
550
+
551
+ # For document processing
552
+ pip install "PyPDF2>=3.0.1"
553
+ pip install "python-docx>=1.2.0"
554
+ pip install "openpyxl>=3.1.5"
555
+ pip install "python-pptx>=1.0.2"
556
+
557
+ # Or install all at once via pyproject.toml
558
+ uv sync
559
+ ```
560
+
561
+ ### Path Security
562
+
563
+ All tools implement path validation to ensure files are accessed only from allowed directories:
564
+
565
+ ```python
566
+ # Configure allowed paths in your agent configuration
567
+ allowed_paths = ["/path/to/workspace", "/path/to/data"]
568
+
569
+ # Tools will validate all file accesses
570
+ result = await understand_file(
571
+ file_path="document.pdf",
572
+ allowed_paths=allowed_paths
573
+ )
574
+ ```
575
+
576
+ ---
577
+
578
+ ## Best Practices
579
+
580
+ ### Video Analysis
581
+
582
+ 1. **Frame Selection**:
583
+ - Use 8 frames for general videos
584
+ - Use 12-16 frames for detailed tutorials or complex content
585
+ - Use 4-6 frames for short clips
586
+
587
+ 2. **Prompting**:
588
+ - Be specific about what you want to know
589
+ - Ask for step-by-step descriptions for tutorials
590
+ - Request timestamps or sequence information when relevant
591
+
592
+ ### Audio Transcription
593
+
594
+ 1. **File Quality**:
595
+ - Use high-quality audio files for best transcription results
596
+ - Ensure audio is clear and audible
597
+ - Consider splitting very long audio files
598
+
599
+ 2. **Batch Processing**:
600
+ - Process multiple related audio files in a single call
601
+ - Organize transcriptions by file for clarity
602
+
603
+ ### Image Analysis
604
+
605
+ 1. **Image Quality**:
606
+ - Use high-resolution images when possible
607
+ - Ensure images are clear and properly exposed
608
+ - Avoid heavily compressed images
609
+
610
+ 2. **Specific Prompts**:
611
+ - Ask targeted questions for specific information
612
+ - Request structured output (lists, tables) when appropriate
613
+ - Specify areas of focus for complex images
614
+
615
+ ### File Understanding
616
+
617
+ 1. **Content Size**:
618
+ - Adjust `max_chars` based on file size and needs
619
+ - For large files, focus prompts on specific sections
620
+ - Consider extracting specific pages or sections first
621
+
622
+ 2. **Document Types**:
623
+ - Use appropriate prompts for different document types
624
+ - For spreadsheets, specify which sheets or columns to focus on
625
+ - For presentations, ask for slide-by-slide summaries
626
+
627
+ ---
628
+
629
+ ## Error Handling
630
+
631
+ All tools return structured error messages in the ExecutionResult:
632
+
633
+ ```python
634
+ result = await understand_video(video_path="missing.mp4")
635
+
636
+ # Check for errors
637
+ if not result["success"]:
638
+ print(f"Error: {result['error']}")
639
+ # Error: Video file does not exist: /path/to/missing.mp4
640
+ ```
641
+
642
+ Common errors:
643
+ - Missing API key
644
+ - File not found
645
+ - Invalid file format
646
+ - Path access violation
647
+ - API errors
648
+ - Missing dependencies
649
+
650
+ ---
651
+
652
+ ## Performance Considerations
653
+
654
+ 1. **API Costs**:
655
+ - Video and image analysis incur higher API costs
656
+ - Limit frame count for videos to control costs
657
+ - Use appropriate `max_chars` limits for files
658
+
659
+ 2. **Processing Time**:
660
+ - Video processing time increases with frame count
661
+ - Large documents take longer to process
662
+ - Multiple audio files are processed sequentially
663
+
664
+ 3. **Resource Usage**:
665
+ - Video frame extraction requires memory
666
+ - Large files are read into memory
667
+ - Consider file size limits for production use
668
+
669
+ ---
670
+
671
+ ## Integration Examples
672
+
673
+ ### Using in Agent Workflows
674
+
675
+ ```python
676
+ # Example: Analyze a video and generate a report
677
+ async def analyze_video_content(video_path: str):
678
+ # Step 1: Understand the video
679
+ video_result = await understand_video(
680
+ video_path=video_path,
681
+ prompt="Describe the main content and key moments in this video."
682
+ )
683
+
684
+ # Step 2: Extract any text/captions from a screenshot
685
+ screenshot_result = await understand_image(
686
+ image_path="video_screenshot.png",
687
+ prompt="Extract any text visible in this image."
688
+ )
689
+
690
+ # Step 3: Transcribe audio
691
+ audio_result = await understand_audio(
692
+ audio_paths=["video_audio.mp3"]
693
+ )
694
+
695
+ # Step 4: Generate comprehensive report
696
+ report = {
697
+ "visual_content": video_result["response"],
698
+ "visible_text": screenshot_result["response"],
699
+ "audio_transcription": audio_result["transcriptions"][0]["transcription"]
700
+ }
701
+
702
+ return report
703
+ ```
704
+
705
+ ### Multi-Modal Document Analysis
706
+
707
+ ```python
708
+ # Example: Analyze a presentation with images
709
+ async def analyze_presentation(pptx_path: str, image_dir: str):
710
+ # Analyze presentation structure
711
+ pptx_result = await understand_file(
712
+ file_path=pptx_path,
713
+ prompt="List the main topic of each slide."
714
+ )
715
+
716
+ # Analyze individual slide images
717
+ image_results = []
718
+ for image_file in Path(image_dir).glob("*.png"):
719
+ result = await understand_image(
720
+ image_path=str(image_file),
721
+ prompt="Describe the content and any charts/diagrams in this slide."
722
+ )
723
+ image_results.append(result)
724
+
725
+ return {
726
+ "structure": pptx_result["response"],
727
+ "slide_visuals": [r["response"] for r in image_results]
728
+ }
729
+ ```
730
+
731
+ ---
732
+
733
+ ## Troubleshooting
734
+
735
+ ### OpenAI API Key Issues
736
+
737
+ ```
738
+ Error: OpenAI API key not found
739
+ ```
740
+
741
+ **Solution**: Set `OPENAI_API_KEY` in your `.env` file or environment variables.
742
+
743
+ ### Missing Dependencies
744
+
745
+ ```
746
+ Error: opencv-python is required for video frame extraction
747
+ ```
748
+
749
+ **Solution**: Install the required package:
750
+ ```bash
751
+ pip install "opencv-python>=4.12.0.88"
752
+ ```
753
+
754
+ ### Path Access Errors
755
+
756
+ ```
757
+ Error: Path not in allowed directories
758
+ ```
759
+
760
+ **Solution**: Ensure the file path is within the allowed directories or adjust the `allowed_paths` parameter.
761
+
762
+ ### File Format Errors
763
+
764
+ ```
765
+ Error: File does not appear to be a video file
766
+ ```
767
+
768
+ **Solution**: Check that the file has the correct extension and is a valid media file.
769
+
770
+ ---
771
+
772
+ ---
773
+
774
+ ## Image Generation Tools
775
+
776
+ ### text_to_image_generation
777
+
778
+ **What it does**: Generates images from text descriptions using OpenAI's gpt-4.1 API **WITHOUT ANY INPUT IMAGES**. Creates new images from scratch based solely on text prompts.
779
+
780
+ **Why use it**: Allows agents to create original visual content from descriptions. Perfect for generating illustrations, concept art, product visualizations, or any creative visual content.
781
+
782
+ **Location**: `massgen.tool._multimodal_tools.text_to_image_generation`
783
+
784
+ #### Parameters
785
+
786
+ - `prompt` (required): Text description of the image to generate
787
+ - Be specific and detailed for better results
788
+ - Include style, composition, lighting, and mood details
789
+ - `model` (optional): Model to use (default: "gpt-4.1")
790
+ - Options: "gpt-4.1"
791
+ - `storage_path` (optional): Directory in which to save the image
792
+ - **IMPORTANT**: Must be a DIRECTORY path only, NOT a file path
793
+ - Example: "images/generated" NOT "images/cat.png"
794
+ - Filename is automatically generated from prompt and timestamp
795
+ - Relative path: Resolved relative to agent's workspace
796
+ - Absolute path: Must be within allowed directories
797
+ - None/empty: Saves to agent's workspace root
798
+ - `allowed_paths` (optional): List of allowed base paths for validation
799
+
800
+ #### Returns
801
+
802
+ ExecutionResult containing:
803
+ - `success`: Whether operation succeeded
804
+ - `operation`: "generate_and_store_image_no_input_images"
805
+ - `note`: Note about operation
806
+ - `images`: List of generated images with file paths and metadata
807
+ - `model`: Model used for generation
808
+ - `prompt`: The prompt used for generation
809
+ - `total_images`: Total number of images generated and saved
810
+
811
+ #### Security Features
812
+
813
+ - Requires valid OpenAI API key
814
+ - Files are saved to specified path within workspace
815
+ - Path must be within allowed directories
816
+ - Automatic timestamp-based filename generation
817
+
818
+ #### Examples
819
+
820
+ **Basic Image Generation**:
821
+
822
+ ```python
823
+ from massgen.tool._multimodal_tools import text_to_image_generation
824
+
825
+ # Generate an image from a text description
826
+ result = await text_to_image_generation(prompt="a cat in space")
827
+
828
+ # Output includes file path and metadata
829
+ print(result.output_blocks[0].data)
830
+ # {
831
+ # "success": true,
832
+ # "operation": "generate_and_store_image_no_input_images",
833
+ # "images": [{
834
+ # "file_path": "/workspace/20240115_143022_a_cat_in_space.png",
835
+ # "filename": "20240115_143022_a_cat_in_space.png",
836
+ # "size": 125340
837
+ # }],
838
+ # "total_images": 1
839
+ # }
840
+ ```
841
+
842
+ **Custom Storage Path**:
843
+
844
+ ```python
845
+ # Generate with custom storage location
846
+ result = await text_to_image_generation(
847
+ prompt="sunset over mountains",
848
+ storage_path="art/landscapes"
849
+ )
850
+ ```
851
+
852
+ **Configuration Example**:
853
+
854
+ ```yaml
855
+ # massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_single.yaml
856
+ agents:
857
+ - id: "image_generator"
858
+ backend:
859
+ type: "openai"
860
+ model: "gpt-4o"
861
+ cwd: "workspace1"
862
+ enable_image_generation: true
863
+ custom_tools:
864
+ - name: ["text_to_image_generation"]
865
+ category: "multimodal"
866
+ path: "massgen/tool/_multimodal_tools/text_to_image_generation.py"
867
+ function: ["text_to_image_generation"]
868
+ ```
869
+
870
+ **CLI Usage**:
871
+
872
+ ```bash
873
+ massgen --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_single.yaml "Generate an image of a futuristic city at night"
874
+ ```
875
+
876
+ ---
877
+
878
+ ### image_to_image_generation
879
+
880
+ **What it does**: Creates variations based on one or more input images using OpenAI's GPT-4.1 API. Generates new images inspired by existing ones.
881
+
882
+ **Why use it**: Allows agents to create variations, mashups, or transformations of existing images. Perfect for style transfer, image editing, or creating variations of designs.
883
+
884
+ **Location**: `massgen.tool._multimodal_tools.image_to_image_generation`
885
+
886
+ #### Parameters
887
+
888
+ - `base_image_paths` (required): List of paths to base images
889
+ - Supported formats: PNG, JPEG (less than 4MB each)
890
+ - Relative paths: Resolved relative to workspace
891
+ - Absolute paths: Must be within allowed directories
892
+ - `prompt` (optional): Text description for the variation (default: "Create a variation of the provided images")
893
+ - `model` (optional): Model to use (default: "gpt-4.1")
894
+ - `storage_path` (optional): Directory in which to save the variations
895
+ - **IMPORTANT**: Must be a DIRECTORY path only
896
+ - Filename is automatically generated
897
+ - `allowed_paths` (optional): List of allowed base paths for validation
898
+
899
+ #### Returns
900
+
901
+ ExecutionResult containing:
902
+ - `success`: Whether operation succeeded
903
+ - `operation`: "generate_and_store_image_with_input_images"
904
+ - `note`: Note about usage
905
+ - `images`: List of generated images with file paths and metadata
906
+ - `model`: Model used for generation
907
+ - `prompt`: The prompt used
908
+ - `total_images`: Total number of images generated
909
+
910
+ #### Security Features
911
+
912
+ - Requires valid OpenAI API key
913
+ - Input images must be valid image files less than 4MB
914
+ - Files are saved to specified path within workspace
915
+ - Path validation for security
916
+
917
+ #### Examples
918
+
919
+ **Create Image Variation**:
920
+
921
+ ```python
922
+ from massgen.tool._multimodal_tools import image_to_image_generation
923
+
924
+ # Generate variation from a single image
925
+ result = await image_to_image_generation(
926
+ base_image_paths=["logo.png"],
927
+ prompt="Create a modern variation of this logo"
928
+ )
929
+ ```
930
+
931
+ **Combine Multiple Images**:
932
+
933
+ ```python
934
+ # Generate variation combining multiple images
935
+ result = await image_to_image_generation(
936
+ base_image_paths=["cat.png", "dog.png"],
937
+ prompt="Combine these animals into a single creature"
938
+ )
939
+ ```
940
+
941
+ ---
942
+
943
+ ## Video Generation Tools
944
+
945
+ ### text_to_video_generation
946
+
947
+ **What it does**: Generates videos from text descriptions using OpenAI's Sora-2 API. Creates high-quality video content from detailed scene descriptions.
948
+
949
+ **Why use it**: Allows agents to create video content from descriptions. Perfect for marketing content, concept visualization, educational videos, or social media content.
950
+
951
+ **Location**: `massgen.tool._multimodal_tools.text_to_video_generation`
952
+
953
+ #### Parameters
954
+
955
+ - `prompt` (required): Text description for the video to generate
956
+ - Include scene details, camera movements, lighting, atmosphere
957
+ - Be specific about actions, objects, and environment
958
+ - `model` (optional): Model to use (default: "sora-2")
959
+ - `seconds` (optional): Video duration in seconds (default: 4)
960
+ - Supported range: 4-20 seconds
961
+ - `storage_path` (optional): Directory in which to save the video
962
+ - **IMPORTANT**: Must be a DIRECTORY path only
963
+ - Filename is automatically generated from prompt and timestamp
964
+ - Relative path: Resolved relative to workspace
965
+ - Absolute path: Must be within allowed directories
966
+ - `allowed_paths` (optional): List of allowed base paths for validation
967
+
968
+ #### Returns
969
+
970
+ ExecutionResult containing:
971
+ - `success`: Whether operation succeeded
972
+ - `operation`: "generate_and_store_video_no_input_images"
973
+ - `video_path`: Path to the saved video file
974
+ - `filename`: Name of the generated file
975
+ - `size`: File size in bytes
976
+ - `model`: Model used for generation
977
+ - `prompt`: The prompt used
978
+ - `duration`: Time taken for generation in seconds
979
+
980
+ #### Security Features
981
+
982
+ - Requires valid OpenAI API key with Sora-2 access
983
+ - Files are saved to specified path within workspace
984
+ - Automatic video download and storage
985
+
986
+ #### Examples
987
+
988
+ **Basic Video Generation**:
989
+
990
+ ```python
991
+ from massgen.tool._multimodal_tools import text_to_video_generation
992
+
993
+ # Generate a 4-second video
994
+ result = await text_to_video_generation(
995
+ prompt="A cool cat on a motorcycle in the night"
996
+ )
997
+
998
+ # Output includes video path
999
+ print(result.output_blocks[0].data)
1000
+ # {
1001
+ # "success": true,
1002
+ # "operation": "generate_and_store_video_no_input_images",
1003
+ # "video_path": "/workspace/20240115_143022_a_cool_cat_on_motorcycle.mp4",
1004
+ # "size": 5242880,
1005
+ # "duration": 45.2
1006
+ # }
1007
+ ```
1008
+
1009
+ **Detailed Scene with Duration**:
1010
+
1011
+ ```python
1012
+ # Generate with detailed prompt and custom duration
1013
+ result = await text_to_video_generation(
1014
+ prompt="Neon-lit alley at night, light rain, slow push-in, cinematic lighting",
1015
+ seconds=10,
1016
+ storage_path="videos/cinematic"
1017
+ )
1018
+ ```
1019
+
1020
+ **Configuration Example**:
1021
+
1022
+ ```yaml
1023
+ # massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_single.yaml
1024
+ agents:
1025
+ - id: "video_generator"
1026
+ backend:
1027
+ type: "openai"
1028
+ model: "gpt-4o"
1029
+ cwd: "workspace1"
1030
+ enable_video_generation: true
1031
+ custom_tools:
1032
+ - name: ["text_to_video_generation"]
1033
+ category: "multimodal"
1034
+ path: "massgen/tool/_multimodal_tools/text_to_video_generation.py"
1035
+ function: ["text_to_video_generation"]
1036
+ ```
1037
+
1038
+ **CLI Usage**:
1039
+
1040
+ ```bash
1041
+ massgen --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_single.yaml "Generate a 4-second video of a neon-lit alley at night, light rain, slow push-in, cinematic."
1042
+ ```
1043
+
1044
+ ---
1045
+
1046
+ ## Audio/Speech Generation Tools
1047
+
1048
+ ### text_to_speech_continue_generation
1049
+
1050
+ **What it does**: Generates expressive speech from text using OpenAI's GPT-4o Audio Preview model. Creates natural-sounding speech with emotional expression and context awareness.
1051
+
1052
+ **Why use it**: Allows agents to generate expressive speech with emotional tone. Perfect for creating voice-overs, narrations, audiobooks, or any content requiring natural, emotional speech.
1053
+
1054
+ **Location**: `massgen.tool._multimodal_tools.text_to_speech_continue_generation`
1055
+
1056
+ #### Parameters
1057
+
1058
+ - `prompt` (required): Text content to convert to audio speech
1059
+ - `model` (optional): Model to use (default: "gpt-4o-audio-preview")
1060
+ - `voice` (optional): Voice to use (default: "alloy")
1061
+ - Options: "alloy", "echo", "fable", "onyx", "nova", "shimmer"
1062
+ - `audio_format` (optional): Audio format for output (default: "wav")
1063
+ - Options: "wav", "mp3", "opus", "aac", "flac"
1064
+ - `storage_path` (optional): Directory in which to save the audio
1065
+ - **IMPORTANT**: Must be a DIRECTORY path only
1066
+ - Filename is automatically generated from prompt and timestamp
1067
+ - `allowed_paths` (optional): List of allowed base paths for validation
1068
+
1069
+ #### Returns
1070
+
1071
+ ExecutionResult containing:
1072
+ - `success`: Whether operation succeeded
1073
+ - `operation`: "generate_and_store_audio_no_input_audios"
1074
+ - `audio_file`: Generated audio file with path and metadata
1075
+ - `model`: Model used for generation
1076
+ - `prompt`: The prompt used for generation
1077
+ - `voice`: Voice used for generation
1078
+ - `format`: Audio format used
1079
+
1080
+ #### Examples
1081
+
1082
+ **Expressive Speech**:
1083
+
1084
+ ```python
1085
+ from massgen.tool._multimodal_tools import text_to_speech_continue_generation
1086
+
1087
+ # Generate expressive speech
1088
+ result = await text_to_speech_continue_generation(
1089
+ prompt="I want you to tell me a very short introduction about Sherlock Holmes in one sentence, and I want you to use an emotional voice to read it out loud."
1090
+ )
1091
+ ```
1092
+
1093
+ **Custom Voice and Format**:
1094
+
1095
+ ```python
1096
+ # Generate with specific voice and format
1097
+ result = await text_to_speech_continue_generation(
1098
+ prompt="Hello world",
1099
+ voice="nova",
1100
+ audio_format="mp3",
1101
+ storage_path="audio/generated"
1102
+ )
1103
+ ```
1104
+
1105
+ ---
1106
+
1107
+ ### text_to_speech_transcription_generation
1108
+
1109
+ **What it does**: Converts text directly to speech using OpenAI's TTS API with streaming response. Provides fast, cost-effective text-to-speech conversion.
1110
+
1111
+ **Why use it**: Allows agents to quickly convert text to speech. Perfect for transcription conversion, simple voice-overs, or when expressive emotion is not required.
1112
+
1113
+ **Location**: `massgen.tool._multimodal_tools.text_to_speech_transcription_generation`
1114
+
1115
+ #### Parameters
1116
+
1117
+ - `input_text` (required): The text content to convert to speech
1118
+ - `model` (optional): TTS model to use (default: "gpt-4o-mini-tts")
1119
+ - Options: "gpt-4o-mini-tts", "tts-1", "tts-1-hd"
1120
+ - `voice` (optional): Voice to use (default: "alloy")
1121
+ - Options: "alloy", "echo", "fable", "onyx", "nova", "shimmer", "coral", "sage"
1122
+ - `instructions` (optional): Optional speaking instructions for tone and style
1123
+ - Example: "Speak in a cheerful tone"
1124
+ - `storage_path` (optional): Directory in which to save the audio file
1125
+ - **IMPORTANT**: Must be a DIRECTORY path only
1126
+ - `audio_format` (optional): Output audio format (default: "mp3")
1127
+ - Options: "mp3", "opus", "aac", "flac", "wav", "pcm"
1128
+ - `allowed_paths` (optional): List of allowed base paths for validation
1129
+
1130
+ #### Returns
1131
+
1132
+ ExecutionResult containing:
1133
+ - `success`: Whether operation succeeded
1134
+ - `operation`: "convert_text_to_speech"
1135
+ - `audio_file`: Generated audio file with path and metadata
1136
+ - `model`: TTS model used
1137
+ - `voice`: Voice used
1138
+ - `format`: Audio format used
1139
+ - `text_length`: Length of input text
1140
+ - `instructions`: Speaking instructions if provided
1141
+
1142
+ #### Examples
1143
+
1144
+ **Simple Text-to-Speech**:
1145
+
1146
+ ```python
1147
+ from massgen.tool._multimodal_tools import text_to_speech_transcription_generation
1148
+
1149
+ # Convert text to speech
1150
+ result = await text_to_speech_transcription_generation(
1151
+ input_text="Hello world, this is a test."
1152
+ )
1153
+ ```
1154
+
1155
+ **With Instructions**:
1156
+
1157
+ ```python
1158
+ # Convert with specific voice and instructions
1159
+ result = await text_to_speech_transcription_generation(
1160
+ input_text="Today is a wonderful day to build something people love!",
1161
+ voice="coral",
1162
+ instructions="Speak in a cheerful and positive tone."
1163
+ )
1164
+ ```
1165
+
1166
+ ---
1167
+
1168
+ ## File Generation Tools
1169
+
1170
+ ### text_to_file_generation
1171
+
1172
+ **What it does**: Generates text content using the OpenAI API and saves it in one of several file formats (TXT, MD, PDF, PPTX). Creates professional documents from text prompts.
1173
+
1174
+ **Why use it**: Allows agents to create formatted documents automatically. Perfect for generating reports, documentation, presentations, or any structured text content.
1175
+
1176
+ **Location**: `massgen.tool._multimodal_tools.text_to_file_generation`
1177
+
1178
+ #### Parameters
1179
+
1180
+ - `prompt` (required): Description of the content to generate
1181
+ - Be specific about structure, sections, and formatting
1182
+ - Example: "Write a technical report about AI"
1183
+ - `file_format` (optional): Output file format (default: "txt")
1184
+ - Options: "txt", "md", "pdf", "pptx"
1185
+ - `filename` (optional): Custom filename without extension
1186
+ - If not provided, generates from prompt and timestamp
1187
+ - `model` (optional): OpenAI model to use (default: "gpt-4o")
1188
+ - Options: "gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "gpt-3.5-turbo"
1189
+ - `storage_path` (optional): Directory in which to save the file
1190
+ - **IMPORTANT**: Must be a DIRECTORY path only
1191
+ - Filename is automatically generated from prompt or custom filename
1192
+ - Relative path: Resolved relative to workspace
1193
+ - Absolute path: Must be within allowed directories
1194
+ - `allowed_paths` (optional): List of allowed base paths for validation
1195
+
1196
+ #### Returns
1197
+
1198
+ ExecutionResult containing:
1199
+ - `success`: Whether operation succeeded
1200
+ - `operation`: "generate_and_store_file"
1201
+ - `file_path`: Path to the generated file
1202
+ - `filename`: Name of the generated file
1203
+ - `file_format`: Format of the generated file
1204
+ - `content_preview`: First 500 characters of generated content
1205
+ - `file_size`: Size of the generated file in bytes
1206
+ - `model`: Model used for generation
1207
+ - `prompt`: The prompt used
1208
+
1209
+ #### Security Features
1210
+
1211
+ - Requires valid OpenAI API key
1212
+ - Files are saved to specified path within workspace
1213
+ - Path must be within allowed directories
1214
+
1215
+ #### Dependencies
1216
+
1217
+ - PDF generation requires either the `reportlab` or the `fpdf2` library
1218
+ - PPTX generation requires `python-pptx` library
1219
+
1220
+ ```bash
1221
+ pip install reportlab # For PDF
1222
+ pip install python-pptx # For PPTX
1223
+ ```
1224
+
1225
+ #### Examples
1226
+
1227
+ **Generate Markdown Document**:
1228
+
1229
+ ```python
1230
+ from massgen.tool._multimodal_tools import text_to_file_generation
1231
+
1232
+ # Generate a markdown file
1233
+ result = await text_to_file_generation(
1234
+ prompt="Write a blog post about Python",
1235
+ file_format="md"
1236
+ )
1237
+ ```
1238
+
1239
+ **Generate PDF Report**:
1240
+
1241
+ ```python
1242
+ # Generate a PDF with custom filename
1243
+ result = await text_to_file_generation(
1244
+ prompt="Create a technical report on machine learning",
1245
+ file_format="pdf",
1246
+ filename="ml_report",
1247
+ storage_path="documents/reports"
1248
+ )
1249
+ ```
1250
+
1251
+ **Generate PowerPoint Presentation**:
1252
+
1253
+ ```python
1254
+ # Generate PPTX - structure prompt with slide titles (# or ##) and bullet points (-)
1255
+ result = await text_to_file_generation(
1256
+ prompt="""Create a presentation about AI trends:
1257
+
1258
+ # Introduction
1259
+ - Overview of AI landscape
1260
+ - Key developments in 2024
1261
+
1262
+ # Current Trends
1263
+ - Large Language Models
1264
+ - Multimodal AI
1265
+ - AI Safety
1266
+
1267
+ # Future Outlook
1268
+ - Predictions for 2025
1269
+ - Emerging technologies
1270
+ """,
1271
+ file_format="pptx",
1272
+ filename="ai_trends_presentation"
1273
+ )
1274
+ ```
1275
+
1276
+ **Configuration Example**:
1277
+
1278
+ ```yaml
1279
+ # massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_single.yaml
1280
+ agents:
1281
+ - id: "document_generator"
1282
+ backend:
1283
+ type: "openai"
1284
+ model: "gpt-4o"
1285
+ cwd: "workspace1"
1286
+ enable_file_generation: true
1287
+ custom_tools:
1288
+ - name: ["text_to_file_generation"]
1289
+ category: "multimodal"
1290
+ path: "massgen/tool/_multimodal_tools/text_to_file_generation.py"
1291
+ function: ["text_to_file_generation"]
1292
+ ```
1293
+
1294
+ **CLI Usage**:
1295
+
1296
+ ```bash
1297
+ massgen --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_single.yaml "Generate a technical report about LLMs and save as PDF"
1298
+ ```
1299
+
1300
+ #### Note
1301
+
1302
+ - For PPTX format, structure your prompt to include slide titles (using # or ##) and bullet points (using -)
1303
+ - The quality and format of generated content depends on the prompt
1304
+ - Longer content may consume more tokens
1305
+
1306
+ ---
1307
+
1308
+ ## Best Practices for Generation Tools
1309
+
1310
+ ### Image Generation
1311
+
1312
+ 1. **Prompt Quality**:
1313
+ - Be specific about style, composition, lighting, and mood
1314
+ - Include details about colors, perspective, and atmosphere
1315
+ - Use artistic terminology for better results
1316
+
1317
+ 2. **Cost Management**:
1318
+ - Image generation (GPT-4.1) is more expensive than standard API calls
1319
+ - Test prompts with understanding tools first
1320
+ - Use multi-agent workflows to refine prompts before generation
1321
+
1322
+ ### Video Generation
1323
+
1324
+ 1. **Prompt Structure**:
1325
+ - Include: setting, lighting, camera movements, atmosphere
1326
+ - Specify duration based on content complexity (4-20 seconds)
1327
+ - Use cinematic terminology (push-in, pull-out, pan, etc.)
1328
+
1329
+ 2. **Quality Verification**:
1330
+ - Combine with the `understand_video` tool for quality checks
1331
+ - Use multi-agent workflows for iterative refinement
1332
+
1333
+ ### Audio/Speech Generation
1334
+
1335
+ 1. **Voice Selection**:
1336
+ - Choose appropriate voice for content type
1337
+ - Use expressive model (gpt-4o-audio-preview) for emotional content
1338
+ - Use TTS model (gpt-4o-mini-tts) for simple conversions
1339
+
1340
+ 2. **Format Selection**:
1341
+ - WAV for highest quality
1342
+ - MP3 for balanced quality/size
1343
+ - OPUS for web streaming
1344
+
1345
+ ### Document Generation
1346
+
1347
+ 1. **Format Selection**:
1348
+ - TXT for simple content
1349
+ - MD for formatted documentation
1350
+ - PDF for professional documents
1351
+ - PPTX for presentations
1352
+
1353
+ 2. **Prompt Structure**:
1354
+ - Outline structure clearly
1355
+ - Specify sections and formatting
1356
+ - For PPTX, use markdown-style headers and bullets
1357
+
1358
+ ---
1359
+
1360
+ ## Additional Resources
1361
+
1362
+ - [OpenAI API Documentation](https://platform.openai.com/docs)
1363
+ - [OpenCV Documentation](https://docs.opencv.org/)
1364
+ - [PyPDF2 Documentation](https://pypdf2.readthedocs.io/)
1365
+ - [python-docx Documentation](https://python-docx.readthedocs.io/)
1366
+ - [openpyxl Documentation](https://openpyxl.readthedocs.io/)
1367
+ - [python-pptx Documentation](https://python-pptx.readthedocs.io/)
1368
+ - [reportlab Documentation](https://www.reportlab.com/documentation/)