massgen 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. massgen/__init__.py +1 -1
  2. massgen/agent_config.py +33 -7
  3. massgen/api_params_handler/_api_params_handler_base.py +3 -0
  4. massgen/api_params_handler/_chat_completions_api_params_handler.py +7 -1
  5. massgen/backend/azure_openai.py +9 -1
  6. massgen/backend/base.py +56 -0
  7. massgen/backend/base_with_custom_tool_and_mcp.py +4 -4
  8. massgen/backend/capabilities.py +6 -6
  9. massgen/backend/chat_completions.py +18 -11
  10. massgen/backend/claude_code.py +9 -1
  11. massgen/backend/gemini.py +71 -6
  12. massgen/backend/gemini_utils.py +30 -0
  13. massgen/backend/grok.py +39 -6
  14. massgen/backend/response.py +18 -11
  15. massgen/chat_agent.py +9 -3
  16. massgen/cli.py +319 -43
  17. massgen/config_builder.py +163 -18
  18. massgen/configs/README.md +78 -20
  19. massgen/configs/basic/multi/three_agents_default.yaml +2 -2
  20. massgen/configs/debug/restart_test_controlled.yaml +60 -0
  21. massgen/configs/debug/restart_test_controlled_filesystem.yaml +73 -0
  22. massgen/configs/tools/code-execution/docker_with_sudo.yaml +35 -0
  23. massgen/configs/tools/custom_tools/computer_use_browser_example.yaml +56 -0
  24. massgen/configs/tools/custom_tools/computer_use_docker_example.yaml +65 -0
  25. massgen/configs/tools/custom_tools/computer_use_example.yaml +50 -0
  26. massgen/configs/tools/custom_tools/crawl4ai_mcp_example.yaml +67 -0
  27. massgen/configs/tools/custom_tools/crawl4ai_multi_agent_example.yaml +68 -0
  28. massgen/configs/tools/custom_tools/multimodal_tools/playwright_with_img_understanding.yaml +98 -0
  29. massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml +33 -0
  30. massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml +34 -0
  31. massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml +33 -0
  32. massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml +34 -0
  33. massgen/configs/tools/custom_tools/multimodal_tools/understand_video_example.yaml +54 -0
  34. massgen/configs/tools/custom_tools/multimodal_tools/youtube_video_analysis.yaml +59 -0
  35. massgen/configs/tools/memory/README.md +199 -0
  36. massgen/configs/tools/memory/gpt5mini_gemini_context_window_management.yaml +131 -0
  37. massgen/configs/tools/memory/gpt5mini_gemini_no_persistent_memory.yaml +133 -0
  38. massgen/configs/tools/memory/test_context_window_management.py +286 -0
  39. massgen/configs/tools/multimodal/gpt5mini_gpt5nano_documentation_evolution.yaml +97 -0
  40. massgen/configs/tools/planning/five_agents_discord_mcp_planning_mode.yaml +7 -29
  41. massgen/configs/tools/planning/five_agents_filesystem_mcp_planning_mode.yaml +5 -6
  42. massgen/configs/tools/planning/five_agents_notion_mcp_planning_mode.yaml +4 -4
  43. massgen/configs/tools/planning/five_agents_twitter_mcp_planning_mode.yaml +4 -4
  44. massgen/configs/tools/planning/gpt5_mini_case_study_mcp_planning_mode.yaml +2 -2
  45. massgen/docker/README.md +83 -0
  46. massgen/filesystem_manager/_code_execution_server.py +22 -7
  47. massgen/filesystem_manager/_docker_manager.py +21 -1
  48. massgen/filesystem_manager/_filesystem_manager.py +8 -0
  49. massgen/filesystem_manager/_workspace_tools_server.py +0 -997
  50. massgen/formatter/_gemini_formatter.py +73 -0
  51. massgen/frontend/coordination_ui.py +175 -257
  52. massgen/frontend/displays/base_display.py +29 -0
  53. massgen/frontend/displays/rich_terminal_display.py +155 -9
  54. massgen/frontend/displays/simple_display.py +21 -0
  55. massgen/frontend/displays/terminal_display.py +22 -2
  56. massgen/logger_config.py +50 -6
  57. massgen/message_templates.py +123 -3
  58. massgen/orchestrator.py +652 -44
  59. massgen/tests/test_code_execution.py +178 -0
  60. massgen/tests/test_intelligent_planning_mode.py +643 -0
  61. massgen/tests/test_orchestration_restart.py +204 -0
  62. massgen/token_manager/token_manager.py +13 -4
  63. massgen/tool/__init__.py +4 -0
  64. massgen/tool/_multimodal_tools/understand_audio.py +193 -0
  65. massgen/tool/_multimodal_tools/understand_file.py +550 -0
  66. massgen/tool/_multimodal_tools/understand_image.py +212 -0
  67. massgen/tool/_multimodal_tools/understand_video.py +313 -0
  68. massgen/tool/docs/multimodal_tools.md +779 -0
  69. massgen/tool/workflow_toolkits/__init__.py +26 -0
  70. massgen/tool/workflow_toolkits/post_evaluation.py +216 -0
  71. massgen/utils.py +1 -0
  72. {massgen-0.1.1.dist-info → massgen-0.1.3.dist-info}/METADATA +57 -52
  73. {massgen-0.1.1.dist-info → massgen-0.1.3.dist-info}/RECORD +77 -49
  74. {massgen-0.1.1.dist-info → massgen-0.1.3.dist-info}/WHEEL +0 -0
  75. {massgen-0.1.1.dist-info → massgen-0.1.3.dist-info}/entry_points.txt +0 -0
  76. {massgen-0.1.1.dist-info → massgen-0.1.3.dist-info}/licenses/LICENSE +0 -0
  77. {massgen-0.1.1.dist-info → massgen-0.1.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,779 @@
# Multimodal Tools Guide

## Overview

MassGen provides a suite of multimodal understanding tools that enable AI agents to analyze various media types, including videos, images, audio files, and documents. These tools leverage OpenAI's multimodal APIs (gpt-4.1 and transcription services) for intelligent content analysis.

## Tool Categories

Multimodal tools are organized into four main categories:

- **Video Understanding**: Extract key frames and analyze video content
- **Audio Understanding**: Transcribe and analyze audio files
- **Image Understanding**: Analyze and describe image content
- **File Understanding**: Process and understand documents (PDF, DOCX, PPTX, XLSX)

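All four tools are exposed as async functions under `massgen.tool._multimodal_tools` (see the **Location** entry in each section below). A minimal import sketch, assuming that package layout:

```python
# The four multimodal tools; each is an async function that returns an ExecutionResult.
from massgen.tool._multimodal_tools import (
    understand_audio,
    understand_file,
    understand_image,
    understand_video,
)
```
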
## Video Understanding Tool

### understand_video

**What it does**: Extracts key frames from video files and uses OpenAI's gpt-4.1 API to analyze the content. The tool samples frames evenly across the video timeline so the whole clip is covered.

**Why use it**: Allows agents to understand video content without manually watching videos. Useful for summarizing videos, extracting key information, analyzing tutorial steps, or answering specific questions about video content.

**Location**: `massgen.tool._multimodal_tools.understand_video`

#### Parameters

- `video_path` (required): Path to the video file
  - Relative path: Resolved relative to workspace
  - Absolute path: Must be within allowed directories
  - Supported formats: MP4, AVI, MOV, MKV, FLV, WMV, WEBM, M4V, MPG, MPEG
- `prompt` (optional): Question or instruction about the video (default: "What's happening in this video? Please describe the content, actions, and any important details you observe across these frames.")
- `num_frames` (optional): Number of key frames to extract (default: 8)
  - Higher values provide more detail but increase API costs
  - Recommended range: 4-16 frames
- `model` (optional): OpenAI model to use (default: "gpt-4.1")
- `allowed_paths` (optional): List of allowed base paths for validation

#### Returns

ExecutionResult containing:
- `success`: Whether operation succeeded
- `operation`: "understand_video"
- `video_path`: Path to the analyzed video
- `num_frames_extracted`: Number of frames extracted
- `prompt`: The prompt used
- `model`: Model used for analysis
- `response`: The model's understanding/description of the video

#### Security Features

- Path validation to ensure access only to allowed directories
- Requires valid OpenAI API key
- File existence and format validation
- Automatic cleanup of video capture resources

#### Dependencies

Requires the `opencv-python` package:
```bash
pip install "opencv-python>=4.12.0.88"
```

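Frame positions are spread evenly over the clip, as noted above. A minimal sketch of that kind of even sampling with OpenCV (an illustration only, not the tool's exact implementation):

```python
import cv2

def sample_frame_indices(video_path: str, num_frames: int = 8) -> list[int]:
    """Pick num_frames frame indices spaced evenly across the video timeline."""
    cap = cv2.VideoCapture(video_path)
    try:
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total <= 0:
            return []
        step = total / num_frames
        # Take the middle frame of each of num_frames equal segments.
        return [min(int(step * i + step / 2), total - 1) for i in range(num_frames)]
    finally:
        cap.release()  # mirrors the "automatic cleanup" noted above
```
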
#### Examples

**Basic Video Analysis**:

```python
from massgen.tool._multimodal_tools import understand_video

# Analyze a video with default prompt
result = await understand_video(video_path="demo.mp4")

# Output includes detailed description of the video
print(result.output_blocks[0].data)
# {
#   "success": true,
#   "operation": "understand_video",
#   "video_path": "/path/to/demo.mp4",
#   "num_frames_extracted": 8,
#   "response": "The video shows..."
# }
```

**Custom Prompt and Frame Count**:

```python
# Ask specific questions about the video
result = await understand_video(
    video_path="tutorial.mp4",
    prompt="What steps are shown in this tutorial? List them in order.",
    num_frames=12,  # Extract more frames for detailed analysis
)
```

**Meeting Summary**:

```python
# Summarize a meeting recording
result = await understand_video(
    video_path="meeting_recording.mp4",
    prompt="Summarize the key points and decisions made in this meeting."
)
```

**Configuration Example**:

```yaml
# massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml
agents:
  - id: "understand_video_tool"
    backend:
      type: "openai"
      model: "gpt-5-nano"
      cwd: "workspace1"
      custom_tools:
        - name: ["understand_video"]
          category: "multimodal"
          path: "massgen/tool/_multimodal_tools/understand_video.py"
          function: ["understand_video"]
```

**CLI Usage**:

```bash
massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml "What's happening in this video?"
```

#### Note

This tool extracts still frames from the video. Audio content is not analyzed. For audio analysis, use the `understand_audio` tool.

---

## Audio Understanding Tool

### understand_audio

**What it does**: Transcribes audio files to text using OpenAI's Transcription API. Supports multiple audio file formats and can process multiple files in a single call.

**Why use it**: Enables agents to understand spoken content in audio files without manual listening. Ideal for transcribing interviews, meetings, podcasts, or any audio content that needs to be converted to text.

**Location**: `massgen.tool._multimodal_tools.understand_audio`

#### Parameters

- `audio_paths` (required): List of paths to input audio files
  - Relative paths: Resolved relative to workspace
  - Absolute paths: Must be within allowed directories
  - Supported formats: WAV, MP3, M4A, MP4, OGG, FLAC, AAC, WMA, OPUS
- `model` (optional): Model to use (default: "gpt-4o-transcribe")
- `allowed_paths` (optional): List of allowed base paths for validation

#### Returns

ExecutionResult containing:
- `success`: Whether operation succeeded
- `operation`: "generate_text_with_input_audio"
- `transcriptions`: List of transcription results for each file
  - Each contains `file` path and `transcription` text
- `audio_files`: List of paths to the input audio files
- `model`: Model used

#### Security Features

- Path validation for all audio files
- File existence and format validation
- Requires valid OpenAI API key
- Separate error handling for each file

#### Examples

**Single Audio File Transcription**:

```python
from massgen.tool._multimodal_tools import understand_audio

# Transcribe a single audio file
result = await understand_audio(audio_paths=["recording.wav"])

# Output includes transcription
print(result.output_blocks[0].data)
# {
#   "success": true,
#   "operation": "generate_text_with_input_audio",
#   "transcriptions": [
#     {
#       "file": "/path/to/recording.wav",
#       "transcription": "Hello, this is a test recording..."
#     }
#   ],
#   "audio_files": ["/path/to/recording.wav"],
#   "model": "gpt-4o-transcribe"
# }
```

**Multiple Audio Files**:

```python
# Transcribe multiple audio files in one call
result = await understand_audio(
    audio_paths=["interview1.mp3", "interview2.mp3", "interview3.mp3"]
)

# Each file is transcribed separately
data = result.output_blocks[0].data
for transcription in data["transcriptions"]:
    print(f"File: {transcription['file']}")
    print(f"Text: {transcription['transcription']}")
```

**Configuration Example**:

```yaml
# massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml
agents:
  - id: "understand_audio_tool"
    backend:
      type: "openai"
      model: "gpt-5-nano"
      cwd: "workspace1"
      custom_tools:
        - name: ["understand_audio"]
          category: "multimodal"
          path: "massgen/tool/_multimodal_tools/understand_audio.py"
          function: ["understand_audio"]
```

**CLI Usage**:

```bash
massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml "What is being said in this audio?"
```

---

## Image Understanding Tool

### understand_image

**What it does**: Analyzes images using OpenAI's gpt-4.1 API to provide descriptions, answer questions, or extract insights from image content.

**Why use it**: Allows agents to "see" and understand images. Useful for analyzing charts, screenshots, photos, diagrams, or any visual content that needs interpretation.

**Location**: `massgen.tool._multimodal_tools.understand_image`

#### Parameters

- `image_path` (required): Path to the image file
  - Relative path: Resolved relative to workspace
  - Absolute path: Must be within allowed directories
  - Supported formats: PNG, JPEG, JPG
- `prompt` (optional): Question or instruction about the image (default: "What's in this image? Please describe it in detail.")
- `model` (optional): Model to use (default: "gpt-4.1")
- `allowed_paths` (optional): List of allowed base paths for validation

#### Returns

ExecutionResult containing:
- `success`: Whether operation succeeded
- `operation`: "understand_image"
- `image_path`: Path to the analyzed image
- `prompt`: The prompt used
- `model`: Model used for analysis
- `response`: The model's understanding/description of the image

#### Security Features

- Path validation to ensure access only to allowed directories
- Requires valid OpenAI API key
- File format validation (PNG, JPEG, JPG only)
- Secure base64 encoding for API transmission

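The base64 transmission step mentioned above boils down to encoding the local image into a data URL that OpenAI's vision-capable APIs accept inline. A minimal sketch (illustrative only, not the tool's exact code):

```python
import base64
from pathlib import Path

def image_to_data_url(image_path: str) -> str:
    """Base64-encode a local PNG/JPEG so it can be sent inline to the API."""
    suffix = Path(image_path).suffix.lower().lstrip(".")
    mime = "image/png" if suffix == "png" else "image/jpeg"
    encoded = base64.b64encode(Path(image_path).read_bytes()).decode("utf-8")
    return f"data:{mime};base64,{encoded}"

# The resulting string can be passed as an image_url content part in a chat request.
```
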
#### Examples

**Basic Image Description**:

```python
from massgen.tool._multimodal_tools import understand_image

# Get a detailed description of an image
result = await understand_image(image_path="photo.jpg")

# Output includes image analysis
print(result.output_blocks[0].data)
# {
#   "success": true,
#   "operation": "understand_image",
#   "image_path": "/path/to/photo.jpg",
#   "response": "This image shows..."
# }
```

**Chart Analysis**:

```python
# Analyze a chart or graph
result = await understand_image(
    image_path="sales_chart.png",
    prompt="What data is shown in this chart? What are the key trends?"
)
```

**Screenshot Analysis**:

```python
# Analyze UI elements in a screenshot
result = await understand_image(
    image_path="app_screenshot.png",
    prompt="What UI elements are visible in this screenshot? Describe the layout and functionality."
)
```

**Diagram Understanding**:

```python
# Understand technical diagrams
result = await understand_image(
    image_path="architecture_diagram.png",
    prompt="Explain the system architecture shown in this diagram."
)
```

**Configuration Example**:

```yaml
# massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml
agents:
  - id: "understand_image_tool"
    backend:
      type: "openai"
      model: "gpt-5-nano"
      cwd: "workspace1"
      custom_tools:
        - name: ["understand_image"]
          category: "multimodal"
          path: "massgen/tool/_multimodal_tools/understand_image.py"
          function: ["understand_image"]
```

**CLI Usage**:

```bash
massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml "Describe this image in detail"
```

---

## File Understanding Tool

### understand_file

**What it does**: Reads and analyzes various file types (text files, PDF, DOCX, XLSX, PPTX) using OpenAI's gpt-4.1 API. Automatically extracts content from each document format before sending it for analysis.

**Why use it**: Enables agents to understand document content without manual reading. Useful for summarizing documents, extracting key information, answering questions about files, or analyzing structured data.

**Location**: `massgen.tool._multimodal_tools.understand_file`

#### Parameters

- `file_path` (required): Path to the file to analyze
  - Relative path: Resolved relative to workspace
  - Absolute path: Must be within allowed directories
- `prompt` (optional): Question or instruction about the file (default: "Please analyze this file and provide a comprehensive understanding of its content, purpose, and structure.")
- `model` (optional): Model to use (default: "gpt-4.1")
- `max_chars` (optional): Maximum number of characters to read/extract (default: 50000)
  - Prevents processing extremely large files
  - Applies to both text files and extracted content
- `allowed_paths` (optional): List of allowed base paths for validation

#### Returns

ExecutionResult containing:
- `success`: Whether operation succeeded
- `operation`: "understand_file"
- `file_path`: Path to the analyzed file
- `file_name`: Name of the file
- `file_type`: Extraction method used ("text", "pdf", "docx", "excel", "pptx")
- `file_size`: Size of the file in bytes
- `chars_read`: Number of characters read/extracted
- `truncated`: Whether content was truncated
- `prompt`: The prompt used
- `model`: Model used for analysis
- `response`: The model's understanding/analysis of the file

#### Security Features

- Path validation to ensure access only to allowed directories
- File existence and type validation
- Content size limits to prevent memory issues
- Requires valid OpenAI API key
- Blocks unsupported binary formats

#### Supported File Types

**Text Files** (read directly):
- Code: `.py`, `.js`, `.java`, `.cpp`, `.c`, `.go`, `.rs`, `.ts`, `.tsx`, `.jsx`, etc.
- Config: `.md`, `.yaml`, `.yml`, `.json`, `.xml`, `.toml`, `.ini`, etc.
- Data: `.txt`, `.log`, `.csv`, `.tsv`, etc.

**Document Files** (require additional packages):
- PDF: `.pdf` (requires `PyPDF2`)
- Word: `.docx` (requires `python-docx`)
- Excel: `.xlsx` (requires `openpyxl`)
- PowerPoint: `.pptx` (requires `python-pptx`)

**Unsupported Formats**:
- Old Office formats (`.doc`, `.xls`, `.ppt`)
- Images (use `understand_image` instead)
- Videos (use `understand_video` instead)
- Audio (use `understand_audio` instead)
- Archives (`.zip`, `.tar`, `.gz`, etc.)
- Executables (`.exe`, `.dll`, `.so`, etc.)

#### Dependencies

For document processing:
```bash
pip install "PyPDF2>=3.0.1"       # For PDF files
pip install "python-docx>=1.2.0"  # For DOCX files
pip install "openpyxl>=3.1.5"     # For XLSX files
pip install "python-pptx>=1.0.2"  # For PPTX files
```

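The extraction method reported in `file_type` is chosen from the file extension. A minimal sketch of that kind of dispatch, using the same libraries listed above (illustrative only; the package's actual logic may differ, and unsupported binary formats are rejected before this point):

```python
from pathlib import Path

# Maps document extensions to the file_type labels returned by understand_file.
EXTRACTORS = {
    ".pdf": "pdf",     # extracted with PyPDF2
    ".docx": "docx",   # extracted with python-docx
    ".xlsx": "excel",  # extracted with openpyxl
    ".pptx": "pptx",   # extracted with python-pptx
}

def detect_file_type(file_path: str) -> str:
    """Return the extraction method label for a path; plain text is the fallback."""
    return EXTRACTORS.get(Path(file_path).suffix.lower(), "text")
```
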
#### Examples

**Analyze Python Script**:

```python
from massgen.tool._multimodal_tools import understand_file

# Analyze a Python script
result = await understand_file(
    file_path="script.py",
    prompt="Explain what this script does and how it works."
)

print(result.output_blocks[0].data)
# {
#   "success": true,
#   "operation": "understand_file",
#   "file_type": "text",
#   "response": "This Python script..."
# }
```

**Summarize Documentation**:

```python
# Summarize a README file
result = await understand_file(
    file_path="README.md",
    prompt="Summarize the key points of this documentation in 3-5 bullet points."
)
```

**Analyze PDF Document**:

```python
# Analyze a research paper
result = await understand_file(
    file_path="research_paper.pdf",
    prompt="What are the main findings and conclusions of this research paper?"
)
```

**Process Word Document**:

```python
# Summarize a business proposal
result = await understand_file(
    file_path="proposal.docx",
    prompt="Provide a summary of this business proposal including objectives, timeline, and budget."
)
```

**Analyze Excel Spreadsheet**:

```python
# Analyze data in a spreadsheet
result = await understand_file(
    file_path="sales_data.xlsx",
    prompt="What patterns and trends can you identify in this sales data?"
)
```

**Process PowerPoint Presentation**:

```python
# Summarize a presentation
result = await understand_file(
    file_path="quarterly_review.pptx",
    prompt="Summarize the key points from each slide of this presentation."
)
```

**Handle Large Files**:

```python
# Process a large file with a custom character limit
result = await understand_file(
    file_path="large_document.pdf",
    prompt="Summarize the introduction and conclusion sections.",
    max_chars=100000  # Increase limit for larger files
)
```

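If a document is larger than `max_chars`, the returned data reports `truncated: true`. A small usage sketch (a caller-side pattern, not part of the package) that retries with a higher limit when that happens:

```python
from massgen.tool._multimodal_tools import understand_file

# First pass with the default 50,000-character budget.
result = await understand_file(file_path="large_document.pdf", prompt="Summarize this report.")
data = result.output_blocks[0].data

# Retry with a larger budget if content was cut off.
if data.get("success") and data.get("truncated"):
    result = await understand_file(
        file_path="large_document.pdf",
        prompt="Summarize this report.",
        max_chars=150_000,
    )
    data = result.output_blocks[0].data

print(data["response"])
```
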
**Configuration Example**:

```yaml
# massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml
agents:
  - id: "understand_file_tool"
    backend:
      type: "openai"
      model: "gpt-5-nano"
      cwd: "workspace1"
      custom_tools:
        - name: ["understand_file"]
          category: "multimodal"
          path: "massgen/tool/_multimodal_tools/understand_file.py"
          function: ["understand_file"]
```

**CLI Usage**:

```bash
massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml "Summarize the content of this PDF"
```

---

## Setup and Configuration

### Environment Setup

All multimodal tools require an OpenAI API key. Set it in your environment or `.env` file:

```bash
# In your .env file or environment
OPENAI_API_KEY=your-api-key-here
```

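If you keep the key in a `.env` file, it must be loaded into the process environment before the tools run. A minimal sketch, assuming the `python-dotenv` package is installed:

```python
import os

from dotenv import load_dotenv  # assumes python-dotenv is installed

load_dotenv()  # reads .env from the current directory into os.environ

if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError("OPENAI_API_KEY is not set; the multimodal tools will fail.")
```
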
### Installing Dependencies

Install all required dependencies:

```bash
# For video understanding
pip install "opencv-python>=4.12.0.88"

# For document processing
pip install "PyPDF2>=3.0.1"
pip install "python-docx>=1.2.0"
pip install "openpyxl>=3.1.5"
pip install "python-pptx>=1.0.2"

# Or install everything at once via pyproject.toml
uv sync
```

### Path Security

All tools implement path validation to ensure files are accessed only from allowed directories:

```python
# Configure allowed paths in your agent configuration
allowed_paths = ["/path/to/workspace", "/path/to/data"]

# Tools will validate all file accesses
result = await understand_file(
    file_path="document.pdf",
    allowed_paths=allowed_paths
)
```

---

## Best Practices

### Video Analysis

1. **Frame Selection**:
   - Use 8 frames for general videos
   - Use 12-16 frames for detailed tutorials or complex content
   - Use 4-6 frames for short clips

2. **Prompting**:
   - Be specific about what you want to know
   - Ask for step-by-step descriptions for tutorials
   - Request timestamps or sequence information when relevant

### Audio Transcription

1. **File Quality**:
   - Use high-quality audio files for best transcription results
   - Ensure audio is clear and audible
   - Consider splitting very long audio files

2. **Batch Processing**:
   - Process multiple related audio files in a single call
   - Organize transcriptions by file for clarity

### Image Analysis

1. **Image Quality**:
   - Use high-resolution images when possible
   - Ensure images are clear and properly exposed
   - Avoid heavily compressed images

2. **Specific Prompts**:
   - Ask targeted questions for specific information
   - Request structured output (lists, tables) when appropriate
   - Specify areas of focus for complex images

### File Understanding

1. **Content Size**:
   - Adjust `max_chars` based on file size and needs
   - For large files, focus prompts on specific sections
   - Consider extracting specific pages or sections first

2. **Document Types**:
   - Use appropriate prompts for different document types
   - For spreadsheets, specify which sheets or columns to focus on
   - For presentations, ask for slide-by-slide summaries

---

## Error Handling

All tools return structured error messages in the ExecutionResult:

```python
result = await understand_video(video_path="missing.mp4")

# Check for errors in the returned data
data = result.output_blocks[0].data
if not data["success"]:
    print(f"Error: {data['error']}")
    # Error: Video file does not exist: /path/to/missing.mp4
```

Common errors:
- Missing API key
- File not found
- Invalid file format
- Path access violation
- API errors
- Missing dependencies

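For agent code that should fail loudly rather than continue with a bad result, a small helper can wrap any of the tools. This is a hypothetical convenience, not part of the package:

```python
def require_success(result) -> dict:
    """Return the result's data dict, or raise if the tool reported failure."""
    data = result.output_blocks[0].data
    if not data.get("success"):
        raise RuntimeError(f"Multimodal tool failed: {data.get('error', 'unknown error')}")
    return data

# Usage:
# data = require_success(await understand_video(video_path="demo.mp4"))
```
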
---

## Performance Considerations

1. **API Costs**:
   - Video and image analysis incur higher API costs
   - Limit frame count for videos to control costs
   - Use appropriate `max_chars` limits for files

2. **Processing Time**:
   - Video processing time increases with frame count
   - Large documents take longer to process
   - Multiple audio files are processed sequentially (see the sketch after this list)

3. **Resource Usage**:
   - Video frame extraction requires memory
   - Large files are read into memory
   - Consider file size limits for production use

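Within a single `understand_audio` call, files are handled one after another. Callers who need more throughput can issue one call per file and run them concurrently; a minimal sketch using `asyncio.gather` (note that this raises the number of simultaneous API requests, so mind your rate limits):

```python
import asyncio

from massgen.tool._multimodal_tools import understand_audio

async def transcribe_concurrently(paths: list[str]) -> list[dict]:
    """Run one understand_audio call per file and wait for all of them."""
    results = await asyncio.gather(
        *(understand_audio(audio_paths=[p]) for p in paths)
    )
    return [r.output_blocks[0].data for r in results]

# asyncio.run(transcribe_concurrently(["interview1.mp3", "interview2.mp3"]))
```
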
---

## Integration Examples

### Using in Agent Workflows

```python
# Example: Analyze a video and generate a report
from massgen.tool._multimodal_tools import understand_audio, understand_image, understand_video

async def analyze_video_content(video_path: str):
    # Step 1: Understand the video
    video_result = await understand_video(
        video_path=video_path,
        prompt="Describe the main content and key moments in this video."
    )

    # Step 2: Extract any text/captions from a screenshot
    screenshot_result = await understand_image(
        image_path="video_screenshot.png",
        prompt="Extract any text visible in this image."
    )

    # Step 3: Transcribe audio
    audio_result = await understand_audio(
        audio_paths=["video_audio.mp3"]
    )

    # Step 4: Generate a comprehensive report from the returned data
    video_data = video_result.output_blocks[0].data
    screenshot_data = screenshot_result.output_blocks[0].data
    audio_data = audio_result.output_blocks[0].data

    report = {
        "visual_content": video_data["response"],
        "visible_text": screenshot_data["response"],
        "audio_transcription": audio_data["transcriptions"][0]["transcription"],
    }

    return report
```

### Multi-Modal Document Analysis

```python
# Example: Analyze a presentation with images
from pathlib import Path

from massgen.tool._multimodal_tools import understand_file, understand_image

async def analyze_presentation(pptx_path: str, image_dir: str):
    # Analyze presentation structure
    pptx_result = await understand_file(
        file_path=pptx_path,
        prompt="List the main topic of each slide."
    )

    # Analyze individual slide images (sorted for deterministic slide order)
    image_results = []
    for image_file in sorted(Path(image_dir).glob("*.png")):
        result = await understand_image(
            image_path=str(image_file),
            prompt="Describe the content and any charts/diagrams in this slide."
        )
        image_results.append(result.output_blocks[0].data)

    return {
        "structure": pptx_result.output_blocks[0].data["response"],
        "slide_visuals": [r["response"] for r in image_results],
    }
```

---

## Troubleshooting

### OpenAI API Key Issues

```
Error: OpenAI API key not found
```

**Solution**: Set `OPENAI_API_KEY` in your `.env` file or environment variables.

### Missing Dependencies

```
Error: opencv-python is required for video frame extraction
```

**Solution**: Install the required package:
```bash
pip install "opencv-python>=4.12.0.88"
```

### Path Access Errors

```
Error: Path not in allowed directories
```

**Solution**: Ensure the file path is within the allowed directories or adjust the `allowed_paths` parameter.

### File Format Errors

```
Error: File does not appear to be a video file
```

**Solution**: Check that the file has the correct extension and is a valid media file.

---

## Additional Resources

- [OpenAI API Documentation](https://platform.openai.com/docs)
- [OpenCV Documentation](https://docs.opencv.org/)
- [PyPDF2 Documentation](https://pypdf2.readthedocs.io/)
- [python-docx Documentation](https://python-docx.readthedocs.io/)
- [openpyxl Documentation](https://openpyxl.readthedocs.io/)
- [python-pptx Documentation](https://python-pptx.readthedocs.io/)