massgen 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of massgen might be problematic. Click here for more details.

Files changed (82) hide show
  1. massgen/__init__.py +1 -1
  2. massgen/agent_config.py +33 -7
  3. massgen/api_params_handler/_api_params_handler_base.py +3 -0
  4. massgen/api_params_handler/_chat_completions_api_params_handler.py +4 -0
  5. massgen/api_params_handler/_claude_api_params_handler.py +4 -0
  6. massgen/api_params_handler/_gemini_api_params_handler.py +4 -0
  7. massgen/api_params_handler/_response_api_params_handler.py +4 -0
  8. massgen/backend/azure_openai.py +9 -1
  9. massgen/backend/base.py +4 -0
  10. massgen/backend/base_with_custom_tool_and_mcp.py +25 -5
  11. massgen/backend/claude_code.py +9 -1
  12. massgen/backend/docs/permissions_and_context_files.md +2 -2
  13. massgen/backend/gemini.py +35 -6
  14. massgen/backend/gemini_utils.py +30 -0
  15. massgen/backend/response.py +2 -0
  16. massgen/chat_agent.py +9 -3
  17. massgen/cli.py +291 -43
  18. massgen/config_builder.py +163 -18
  19. massgen/configs/README.md +69 -14
  20. massgen/configs/debug/restart_test_controlled.yaml +60 -0
  21. massgen/configs/debug/restart_test_controlled_filesystem.yaml +73 -0
  22. massgen/configs/tools/code-execution/docker_with_sudo.yaml +35 -0
  23. massgen/configs/tools/custom_tools/computer_use_browser_example.yaml +56 -0
  24. massgen/configs/tools/custom_tools/computer_use_docker_example.yaml +65 -0
  25. massgen/configs/tools/custom_tools/computer_use_example.yaml +50 -0
  26. massgen/configs/tools/custom_tools/crawl4ai_example.yaml +55 -0
  27. massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_multi.yaml +61 -0
  28. massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_single.yaml +29 -0
  29. massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_multi.yaml +51 -0
  30. massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_single.yaml +33 -0
  31. massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_multi.yaml +55 -0
  32. massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_single.yaml +33 -0
  33. massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_multi.yaml +47 -0
  34. massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_single.yaml +29 -0
  35. massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml +33 -0
  36. massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml +34 -0
  37. massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml +33 -0
  38. massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml +34 -0
  39. massgen/configs/tools/custom_tools/multimodal_tools/youtube_video_analysis.yaml +59 -0
  40. massgen/docker/README.md +83 -0
  41. massgen/filesystem_manager/_code_execution_server.py +22 -7
  42. massgen/filesystem_manager/_docker_manager.py +21 -1
  43. massgen/filesystem_manager/_filesystem_manager.py +9 -0
  44. massgen/filesystem_manager/_path_permission_manager.py +148 -0
  45. massgen/filesystem_manager/_workspace_tools_server.py +0 -997
  46. massgen/formatter/_gemini_formatter.py +73 -0
  47. massgen/frontend/coordination_ui.py +175 -257
  48. massgen/frontend/displays/base_display.py +29 -0
  49. massgen/frontend/displays/rich_terminal_display.py +155 -9
  50. massgen/frontend/displays/simple_display.py +21 -0
  51. massgen/frontend/displays/terminal_display.py +22 -2
  52. massgen/logger_config.py +50 -6
  53. massgen/message_templates.py +283 -15
  54. massgen/orchestrator.py +335 -38
  55. massgen/tests/test_binary_file_blocking.py +274 -0
  56. massgen/tests/test_case_studies.md +12 -12
  57. massgen/tests/test_code_execution.py +178 -0
  58. massgen/tests/test_multimodal_size_limits.py +407 -0
  59. massgen/tests/test_orchestration_restart.py +204 -0
  60. massgen/tool/__init__.py +4 -0
  61. massgen/tool/_manager.py +7 -2
  62. massgen/tool/_multimodal_tools/image_to_image_generation.py +293 -0
  63. massgen/tool/_multimodal_tools/text_to_file_generation.py +455 -0
  64. massgen/tool/_multimodal_tools/text_to_image_generation.py +222 -0
  65. massgen/tool/_multimodal_tools/text_to_speech_continue_generation.py +226 -0
  66. massgen/tool/_multimodal_tools/text_to_speech_transcription_generation.py +217 -0
  67. massgen/tool/_multimodal_tools/text_to_video_generation.py +223 -0
  68. massgen/tool/_multimodal_tools/understand_audio.py +211 -0
  69. massgen/tool/_multimodal_tools/understand_file.py +555 -0
  70. massgen/tool/_multimodal_tools/understand_image.py +316 -0
  71. massgen/tool/_multimodal_tools/understand_video.py +340 -0
  72. massgen/tool/_web_tools/crawl4ai_tool.py +718 -0
  73. massgen/tool/docs/multimodal_tools.md +1368 -0
  74. massgen/tool/workflow_toolkits/__init__.py +26 -0
  75. massgen/tool/workflow_toolkits/post_evaluation.py +216 -0
  76. massgen/utils.py +1 -0
  77. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/METADATA +101 -69
  78. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/RECORD +82 -46
  79. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/WHEEL +0 -0
  80. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/entry_points.txt +0 -0
  81. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/licenses/LICENSE +0 -0
  82. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,65 @@
1
+ # MassGen Configuration: Computer Use - Docker Environment
2
+ #
3
+ # This configuration uses a Docker container for OS-level automation.
4
+ #
5
+ # Usage:
6
+ # massgen --config @examples/tools/custom_tools/computer_use_docker_example "Open calculator and compute 123 + 456"
7
+ #
8
+ # Prerequisites:
9
+ # 1. Set OPENAI_API_KEY in your .env file
10
+ # 2. Have Docker installed and running
11
+ # 3. Start the Docker container (see README for instructions)
12
+
13
+ agents:
14
+ - id: "docker_automation_agent"
15
+ backend:
16
+ type: "openai"
17
+ model: "computer-use-preview"
18
+ custom_tools:
19
+ - name: ["computer_use"]
20
+ category: "automation"
21
+ path: "massgen/tool/_computer_use/computer_use_tool.py"
22
+ function: ["computer_use"]
23
+ # Default parameters for Docker environment
24
+ default_params:
25
+ environment: "ubuntu"
26
+ display_width: 1280
27
+ display_height: 800
28
+ max_iterations: 25
29
+ include_reasoning: true
30
+ environment_config:
31
+ container_name: "cua-container"
32
+ display: ":99"
33
+
34
+ system_message: |
35
+ You are a computer automation specialist with access to the computer_use tool.
36
+
37
+ You can control a virtual Ubuntu environment running in Docker to:
38
+ - Launch and use desktop applications
39
+ - Perform file operations
40
+ - Execute system-level tasks
41
+ - Automate GUI interactions
42
+
43
+ The environment includes:
44
+ - Ubuntu 22.04 desktop (Xfce)
45
+ - Firefox browser
46
+ - Standard desktop applications
47
+ - X11 display system
48
+
49
+ When using the computer_use tool:
50
+ 1. Tasks are executed in a sandboxed Docker container
51
+ 2. You have full desktop access via xdotool commands
52
+ 3. Screenshots are captured after each action
53
+ 4. Be specific about coordinates when clicking
54
+
55
+ Best practices:
56
+ - Allow time for applications to launch
57
+ - Use wait actions between steps
58
+ - Verify GUI elements are visible before clicking
59
+ - Consider screen resolution (1280x800)
60
+
61
+ ui:
62
+ display_type: "detailed"
63
+ logging_enabled: true
64
+ show_screenshots: true
65
+ show_reasoning: true
@@ -0,0 +1,50 @@
1
+ # MassGen Configuration: Computer Use Tool Example
2
+ #
3
+ # This configuration demonstrates how to use the computer_use tool for automating
4
+ # browser and computer interactions using OpenAI's computer-use-preview model.
5
+ #
6
+ # Usage:
7
+ # massgen --config @examples/tools/custom_tools/computer_use_example "Search for Python documentation on Google"
8
+ #
9
+ # Prerequisites:
10
+ # 1. Set OPENAI_API_KEY in your .env file
11
+ # 2. For browser environment: pip install playwright && playwright install
12
+ # 3. For Docker environment: Have Docker installed and running
13
+
14
+ agents:
15
+ - id: "computer_use_agent"
16
+ backend:
17
+ type: "openai"
18
+ model: "gpt-4.1" # You can also use "computer-use-preview" for the main model
19
+ custom_tools:
20
+ - name: ["computer_use"]
21
+ category: "automation"
22
+ path: "massgen/tool/_computer_use/computer_use_tool.py"
23
+ function: ["computer_use"]
24
+
25
+ system_message: |
26
+ You are an AI assistant with access to computer automation capabilities.
27
+
28
+ The computer_use tool is available to you. This tool allows you to:
29
+ - Control a web browser (click, type, scroll, etc.)
30
+ - Automate computer tasks
31
+ - Search the web, fill forms, navigate websites
32
+ - Perform multi-step workflows
33
+
34
+ When a user asks you to perform a task that requires browser or computer interaction,
35
+ use the computer_use tool with a clear task description.
36
+
37
+ Important:
38
+ - Always provide clear, specific task descriptions
39
+ - The tool will execute actions step-by-step
40
+ - You will receive screenshots and action logs
41
+ - Safety checks may be triggered - acknowledge them when appropriate
42
+
43
+ Example usage:
44
+ - "Search for the latest AI news on Google"
45
+ - "Navigate to example.com and fill out the contact form"
46
+ - "Find Python documentation and save the URL"
47
+
48
+ ui:
49
+ display_type: "simple"
50
+ logging_enabled: true
@@ -0,0 +1,55 @@
1
+ # MassGen Configuration: Crawl4AI Web Scraping via Custom Tools
2
+ #
3
+ # Prerequisites:
4
+ # 1. Start crawl4ai Docker container (one-time setup):
5
+ # docker pull unclecode/crawl4ai:latest
6
+ # docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:latest
7
+ #
8
+ # 2. Verify container is running:
9
+ # docker ps | grep crawl4ai
10
+ #
11
+ # 3. Test REST API endpoint (optional):
12
+ # curl -X POST http://localhost:11235/md -H "Content-Type: application/json" -d '{"url": "https://example.com", "f": "fit"}'
13
+ #
14
+ # Usage:
15
+ # massgen --config @examples/tools/custom_tools/crawl4ai_example "Please search for the MassGen docs, take a screenshot of the website, and explain that screenshot"
16
+ #
17
+ # Available Tools (via Custom Tools):
18
+ # - crawl4ai_md: Generate markdown from web content
19
+ # - crawl4ai_html: Extract preprocessed HTML
20
+ # - crawl4ai_screenshot: Capture webpage screenshots
21
+ # - crawl4ai_pdf: Generate PDF documents
22
+ # - crawl4ai_execute_js: Run JavaScript on web pages
23
+ # - crawl4ai_crawl: Perform multi-URL crawling
24
+ # - crawl4ai_ask: Query the Crawl4AI library context
25
+ #
26
+ # Note: Multiple agents can use these tools concurrently.
27
+ # The server handles up to 5 concurrent crawls by default.
28
+
29
+ agents:
30
+ - id: "web_scraper_agent"
31
+ backend:
32
+ type: "openai" # Works with any backend: openai, gemini, claude_code, etc.
33
+ model: "gpt-5-mini"
34
+ cwd: "workspace1"
35
+
36
+ # Register crawl4ai custom tools
37
+ custom_tools:
38
+ - name: ["crawl4ai_md", "crawl4ai_html", "crawl4ai_screenshot", "crawl4ai_pdf", "crawl4ai_execute_js", "crawl4ai_crawl"]
39
+ category: "web_scraping"
40
+ path: "massgen/tool/_web_tools/crawl4ai_tool.py"
41
+ function: ["crawl4ai_md", "crawl4ai_html", "crawl4ai_screenshot", "crawl4ai_pdf", "crawl4ai_execute_js", "crawl4ai_crawl"]
42
+ - name: ["understand_image"]
43
+ category: "multimodal"
44
+ path: "massgen/tool/_multimodal_tools/understand_image.py"
45
+ function: ["understand_image"]
46
+
47
+ orchestrator:
48
+ snapshot_storage: "snapshots"
49
+ agent_temporary_workspace: "temp_workspaces"
50
+ coordination:
51
+ max_orchestration_restarts: 2 # Default is 0 (no restarts); a value of 2 allows 3 total attempts: initial + 2 restarts
52
+
53
+ ui:
54
+ display_type: "rich_terminal"
55
+ logging_enabled: true
@@ -0,0 +1,61 @@
1
+ # MassGen Configuration: Text to File Generation Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_multi.yaml "Please generate a comprehensive business presentation about Artificial Intelligence in Healthcare for our upcoming board meeting. The presentation should include the following slides: 1) Title slide with presentation title and date, 2) Executive Summary highlighting key findings, 3) Market Overview showing the current AI healthcare market size and growth trends, 4) Technology Applications including AI in diagnostics, drug discovery, and patient care, 5) Case Studies showcasing 3-4 successful implementations with metrics, 6) Competitive Landscape analyzing major players and their solutions, 7) Implementation Roadmap with timeline and milestones, 8) ROI Analysis with projected costs and benefits, 9) Risk Assessment and mitigation strategies, 10) Recommendations and next steps. Please make it professional with approximately 15-20 slides, use clear bullet points, include suggested visual elements for each slide, and save it as a PPTX file with a modern business layout."
4
+ agents:
5
+ - id: "text_to_file_generation_tool1"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-4o"
9
+ cwd: "workspace1"
10
+ enable_file_generation: true
11
+ custom_tools:
12
+ - name: ["text_to_file_generation"]
13
+ category: "multimodal"
14
+ path: "massgen/tool/_multimodal_tools/text_to_file_generation.py"
15
+ function: ["text_to_file_generation"]
16
+ - name: ["understand_file"]
17
+ category: "multimodal"
18
+ path: "massgen/tool/_multimodal_tools/understand_file.py"
19
+ function: ["understand_file"]
20
+ system_message: |
21
+ You are an AI assistant with access to text-to-file generation capabilities.
22
+
23
+ When generating PPTX presentations, format your content with:
24
+ - Use "# Title" or "## Title" for slide titles
25
+ - Use "---" to separate slides
26
+ - Use "- Item" for bullet points
27
+ - Use " - Subitem" for sub-bullets (two spaces indent)
28
+ - Structure content in a slide-friendly format with clear, concise points
29
+
30
+ - id: "text_to_file_generation_tool2"
31
+ backend:
32
+ type: "openai"
33
+ model: "gpt-4o"
34
+ cwd: "workspace2"
35
+ enable_file_generation: true
36
+ custom_tools:
37
+ - name: ["text_to_file_generation"]
38
+ category: "multimodal"
39
+ path: "massgen/tool/_multimodal_tools/text_to_file_generation.py"
40
+ function: ["text_to_file_generation"]
41
+ - name: ["understand_file"]
42
+ category: "multimodal"
43
+ path: "massgen/tool/_multimodal_tools/understand_file.py"
44
+ function: ["understand_file"]
45
+ system_message: |
46
+ You are an AI assistant with access to text-to-file generation capabilities.
47
+
48
+ When generating PPTX presentations, format your content with:
49
+ - Use "# Title" or "## Title" for slide titles
50
+ - Use "---" to separate slides
51
+ - Use "- Item" for bullet points
52
+ - Use " - Subitem" for sub-bullets (two spaces indent)
53
+ - Structure content in a slide-friendly format with clear, concise points
54
+
55
+ orchestrator:
56
+ snapshot_storage: "snapshots"
57
+ agent_temporary_workspace: "temp_workspaces"
58
+
59
+ ui:
60
+ display_type: "rich_terminal"
61
+ logging_enabled: true
@@ -0,0 +1,29 @@
1
+ # MassGen Configuration: Text to File Generation Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_single.yaml "Please generate a comprehensive technical report about the latest developments in Large Language Models (LLMs) and Generative AI. The report should include the following sections: 1) Executive Summary, 2) Introduction to LLMs and their architecture, 3) Recent breakthroughs in 2024-2025, 4) Applications in industry including healthcare, finance, and education, 5) Ethical considerations and limitations, 6) Future directions and research opportunities. Please make the report approximately 10-15 pages long with proper citations and references, and save it as a PDF file with a professional layout."
4
+ agents:
5
+ - id: "text_to_file_generation_tool"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-4o"
9
+ cwd: "workspace1"
10
+ enable_file_generation: true
11
+ custom_tools:
12
+ - name: ["text_to_file_generation"]
13
+ category: "multimodal"
14
+ path: "massgen/tool/_multimodal_tools/text_to_file_generation.py"
15
+ function: ["text_to_file_generation"]
16
+ - name: ["understand_file"]
17
+ category: "multimodal"
18
+ path: "massgen/tool/_multimodal_tools/understand_file.py"
19
+ function: ["understand_file"]
20
+ system_message: |
21
+ You are an AI assistant with access to text-to-file generation capabilities.
22
+
23
+ orchestrator:
24
+ snapshot_storage: "snapshots"
25
+ agent_temporary_workspace: "temp_workspaces"
26
+
27
+ ui:
28
+ display_type: "simple"
29
+ logging_enabled: true
@@ -0,0 +1,51 @@
1
+ # MassGen Configuration: Text to Image Generation Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_multi.yaml "Please generate an image of a cat in space."
4
+ agents:
5
+ - id: "text_to_image_generation_tool1"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-4o"
9
+ cwd: "workspace1"
10
+ enable_image_generation: true
11
+ custom_tools:
12
+ - name: ["text_to_image_generation"]
13
+ category: "multimodal"
14
+ path: "massgen/tool/_multimodal_tools/text_to_image_generation.py"
15
+ function: ["text_to_image_generation"]
16
+ - name: ["understand_image"]
17
+ category: "multimodal"
18
+ path: "massgen/tool/_multimodal_tools/understand_image.py"
19
+ function: ["understand_image"]
20
+ - name: ["image_to_image_generation"]
21
+ category: "multimodal"
22
+ path: "massgen/tool/_multimodal_tools/image_to_image_generation.py"
23
+ function: ["image_to_image_generation"]
24
+ system_message: |
25
+ You are an AI assistant with access to text-to-image generation capabilities.
26
+
27
+ - id: "text_to_image_generation_tool2"
28
+ backend:
29
+ type: "openai"
30
+ model: "gpt-4o"
31
+ cwd: "workspace2"
32
+ enable_image_generation: true
33
+ custom_tools:
34
+ - name: ["text_to_image_generation"]
35
+ category: "multimodal"
36
+ path: "massgen/tool/_multimodal_tools/text_to_image_generation.py"
37
+ function: ["text_to_image_generation"]
38
+ - name: ["understand_image"]
39
+ category: "multimodal"
40
+ path: "massgen/tool/_multimodal_tools/understand_image.py"
41
+ function: ["understand_image"]
42
+ system_message: |
43
+ You are an AI assistant with access to text-to-image generation capabilities.
44
+
45
+ orchestrator:
46
+ snapshot_storage: "snapshots"
47
+ agent_temporary_workspace: "temp_workspaces"
48
+
49
+ ui:
50
+ display_type: "rich_terminal"
51
+ logging_enabled: true
@@ -0,0 +1,33 @@
1
+ # MassGen Configuration: Text to Image Generation Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_single.yaml "Please generate an image of a cat in space."
4
+ agents:
5
+ - id: "text_to_image_generation_tool"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-4o"
9
+ cwd: "workspace1"
10
+ enable_image_generation: true
11
+ custom_tools:
12
+ - name: ["text_to_image_generation"]
13
+ category: "multimodal"
14
+ path: "massgen/tool/_multimodal_tools/text_to_image_generation.py"
15
+ function: ["text_to_image_generation"]
16
+ - name: ["understand_image"]
17
+ category: "multimodal"
18
+ path: "massgen/tool/_multimodal_tools/understand_image.py"
19
+ function: ["understand_image"]
20
+ - name: ["image_to_image_generation"]
21
+ category: "multimodal"
22
+ path: "massgen/tool/_multimodal_tools/image_to_image_generation.py"
23
+ function: ["image_to_image_generation"]
24
+ system_message: |
25
+ You are an AI assistant with access to text-to-image generation capabilities.
26
+
27
+ orchestrator:
28
+ snapshot_storage: "snapshots"
29
+ agent_temporary_workspace: "temp_workspaces"
30
+
31
+ ui:
32
+ display_type: "simple"
33
+ logging_enabled: true
@@ -0,0 +1,55 @@
1
+ # MassGen Configuration: Text to Speech Continue Generation Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_multi.yaml "I want you to tell me a very short introduction about Sherlock Holmes in one sentence, and I want you to use an emotional voice to read it out loud."
4
+ agents:
5
+ - id: "text_to_speech_continue_generation_tool1"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-4o"
9
+ cwd: "workspace1"
10
+ enable_audio_generation: true
11
+ custom_tools:
12
+ - name: ["text_to_speech_transcription_generation"]
13
+ category: "multimodal"
14
+ path: "massgen/tool/_multimodal_tools/text_to_speech_transcription_generation.py"
15
+ function: ["text_to_speech_transcription_generation"]
16
+ - name: ["understand_audio"]
17
+ category: "multimodal"
18
+ path: "massgen/tool/_multimodal_tools/understand_audio.py"
19
+ function: ["understand_audio"]
20
+ - name: ["text_to_speech_continue_generation"]
21
+ category: "multimodal"
22
+ path: "massgen/tool/_multimodal_tools/text_to_speech_continue_generation.py"
23
+ function: ["text_to_speech_continue_generation"]
24
+ system_message: |
25
+ You are an AI assistant with access to text-to-speech generation capabilities.
26
+
27
+ - id: "text_to_speech_continue_generation_tool2"
28
+ backend:
29
+ type: "openai"
30
+ model: "gpt-4o"
31
+ cwd: "workspace2"
32
+ enable_audio_generation: true
33
+ custom_tools:
34
+ - name: ["text_to_speech_transcription_generation"]
35
+ category: "multimodal"
36
+ path: "massgen/tool/_multimodal_tools/text_to_speech_transcription_generation.py"
37
+ function: ["text_to_speech_transcription_generation"]
38
+ - name: ["understand_audio"]
39
+ category: "multimodal"
40
+ path: "massgen/tool/_multimodal_tools/understand_audio.py"
41
+ function: ["understand_audio"]
42
+ - name: ["text_to_speech_continue_generation"]
43
+ category: "multimodal"
44
+ path: "massgen/tool/_multimodal_tools/text_to_speech_continue_generation.py"
45
+ function: ["text_to_speech_continue_generation"]
46
+ system_message: |
47
+ You are an AI assistant with access to text-to-speech generation capabilities.
48
+
49
+ orchestrator:
50
+ snapshot_storage: "snapshots"
51
+ agent_temporary_workspace: "temp_workspaces"
52
+
53
+ ui:
54
+ display_type: "rich_terminal"
55
+ logging_enabled: true
@@ -0,0 +1,33 @@
1
+ # MassGen Configuration: Text to Speech Continue Generation Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_single.yaml "I want you to tell me a very short introduction about Sherlock Holmes in one sentence, and I want you to use an emotional voice to read it out loud."
4
+ agents:
5
+ - id: "text_to_speech_continue_generation_tool"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-4o"
9
+ cwd: "workspace1"
10
+ enable_audio_generation: true
11
+ custom_tools:
12
+ - name: ["text_to_speech_transcription_generation"]
13
+ category: "multimodal"
14
+ path: "massgen/tool/_multimodal_tools/text_to_speech_transcription_generation.py"
15
+ function: ["text_to_speech_transcription_generation"]
16
+ - name: ["understand_audio"]
17
+ category: "multimodal"
18
+ path: "massgen/tool/_multimodal_tools/understand_audio.py"
19
+ function: ["understand_audio"]
20
+ - name: ["text_to_speech_continue_generation"]
21
+ category: "multimodal"
22
+ path: "massgen/tool/_multimodal_tools/text_to_speech_continue_generation.py"
23
+ function: ["text_to_speech_continue_generation"]
24
+ system_message: |
25
+ You are an AI assistant with access to text-to-speech generation capabilities.
26
+
27
+ orchestrator:
28
+ snapshot_storage: "snapshots"
29
+ agent_temporary_workspace: "temp_workspaces"
30
+
31
+ ui:
32
+ display_type: "simple"
33
+ logging_enabled: true
@@ -0,0 +1,47 @@
1
+ # MassGen Configuration: Text to Video Generation Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_multi.yaml "Generate a 4-second video of a neon-lit alley at night, light rain, slow push-in, cinematic."
4
+ agents:
5
+ - id: "text_to_video_generation_tool1"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-4o"
9
+ cwd: "workspace1"
10
+ enable_video_generation: true
11
+ custom_tools:
12
+ - name: ["understand_video"]
13
+ category: "multimodal"
14
+ path: "massgen/tool/_multimodal_tools/understand_video.py"
15
+ function: ["understand_video"]
16
+ - name: ["text_to_video_generation"]
17
+ category: "multimodal"
18
+ path: "massgen/tool/_multimodal_tools/text_to_video_generation.py"
19
+ function: ["text_to_video_generation"]
20
+ system_message: |
21
+ You are an AI assistant with access to text-to-video generation capabilities.
22
+
23
+ - id: "text_to_video_generation_tool2"
24
+ backend:
25
+ type: "openai"
26
+ model: "gpt-4o"
27
+ cwd: "workspace2"
28
+ enable_video_generation: true
29
+ custom_tools:
30
+ - name: ["understand_video"]
31
+ category: "multimodal"
32
+ path: "massgen/tool/_multimodal_tools/understand_video.py"
33
+ function: ["understand_video"]
34
+ - name: ["text_to_video_generation"]
35
+ category: "multimodal"
36
+ path: "massgen/tool/_multimodal_tools/text_to_video_generation.py"
37
+ function: ["text_to_video_generation"]
38
+ system_message: |
39
+ You are an AI assistant with access to text-to-video generation capabilities.
40
+
41
+ orchestrator:
42
+ snapshot_storage: "snapshots"
43
+ agent_temporary_workspace: "temp_workspaces"
44
+
45
+ ui:
46
+ display_type: "rich_terminal"
47
+ logging_enabled: true
@@ -0,0 +1,29 @@
1
+ # MassGen Configuration: Text to Video Generation Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_single.yaml "Generate a 4-second video of a neon-lit alley at night, light rain, slow push-in, cinematic."
4
+ agents:
5
+ - id: "text_to_video_generation_tool"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-4o"
9
+ cwd: "workspace1"
10
+ enable_video_generation: true
11
+ custom_tools:
12
+ - name: ["understand_video"]
13
+ category: "multimodal"
14
+ path: "massgen/tool/_multimodal_tools/understand_video.py"
15
+ function: ["understand_video"]
16
+ - name: ["text_to_video_generation"]
17
+ category: "multimodal"
18
+ path: "massgen/tool/_multimodal_tools/text_to_video_generation.py"
19
+ function: ["text_to_video_generation"]
20
+ system_message: |
21
+ You are an AI assistant with access to text-to-video generation capabilities.
22
+
23
+ orchestrator:
24
+ snapshot_storage: "snapshots"
25
+ agent_temporary_workspace: "temp_workspaces"
26
+
27
+ ui:
28
+ display_type: "simple"
29
+ logging_enabled: true
@@ -0,0 +1,33 @@
1
+ # MassGen Configuration: Understand Audio Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml "Please summarize the content in this audio."
4
+ agents:
5
+ - id: "understand_audio_tool"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-5-nano"
9
+ cwd: "workspace1"
10
+ custom_tools:
11
+ - name: ["understand_audio"]
12
+ category: "multimodal"
13
+ path: "massgen/tool/_multimodal_tools/understand_audio.py"
14
+ function: ["understand_audio"]
15
+ system_message: |
16
+ You are an AI assistant with access to audio transcription capabilities.
17
+
18
+ The understand_audio tool is available to transcribe audio files to text using OpenAI's Transcription API.
19
+ Do not output tool call syntax or function declarations. Focus on answering the user's question clearly.
20
+
21
+ When users ask about transcribing or understanding audio files, use the understand_audio tool to
22
+ process the audio and provide the transcription.
23
+
24
+ orchestrator:
25
+ snapshot_storage: "snapshots"
26
+ agent_temporary_workspace: "temp_workspaces"
27
+ context_paths:
28
+ - path: "massgen/configs/resources/v0.1.3-example/Sherlock_Holmes.mp3"
29
+ permission: "read"
30
+
31
+ ui:
32
+ display_type: "rich_terminal"
33
+ logging_enabled: true
@@ -0,0 +1,34 @@
1
+ # MassGen Configuration: Understand File Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml "Please summarize the content in this file."
4
+ agents:
5
+ - id: "understand_file_tool"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-5-nano"
9
+ cwd: "workspace1"
10
+ custom_tools:
11
+ - name: ["understand_file"]
12
+ category: "multimodal"
13
+ path: "massgen/tool/_multimodal_tools/understand_file.py"
14
+ function: ["understand_file"]
15
+ system_message: |
16
+ You are an AI assistant with access to file understanding capabilities.
17
+
18
+ The understand_file tool is available to analyze and understand file contents using OpenAI's gpt-4.1 API.
19
+ It supports text files, PDF, DOCX, XLSX, PPTX, and more.
20
+ Do not output tool call syntax or function declarations. Focus on answering the user's question clearly.
21
+
22
+ When users ask about analyzing or understanding files, use the understand_file tool to process
23
+ the file and provide detailed descriptions or answers to their questions.
24
+
25
+ orchestrator:
26
+ snapshot_storage: "snapshots"
27
+ agent_temporary_workspace: "temp_workspaces"
28
+ context_paths:
29
+ - path: "massgen/configs/resources/v0.1.3-example/TUMIX.pdf"
30
+ permission: "read"
31
+
32
+ ui:
33
+ display_type: "rich_terminal"
34
+ logging_enabled: true
@@ -0,0 +1,33 @@
1
+ # MassGen Configuration: Understand Image Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml "Please summarize the content in this image."
4
+ agents:
5
+ - id: "understand_image_tool"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-5-nano"
9
+ cwd: "workspace1"
10
+ custom_tools:
11
+ - name: ["understand_image"]
12
+ category: "multimodal"
13
+ path: "massgen/tool/_multimodal_tools/understand_image.py"
14
+ function: ["understand_image"]
15
+ system_message: |
16
+ You are an AI assistant with access to image understanding capabilities.
17
+
18
+ The understand_image tool is available to analyze and understand images using OpenAI's gpt-4.1 API.
19
+ Do not output tool call syntax or function declarations. Focus on answering the user's question clearly.
20
+
21
+ When users ask about analyzing or understanding images, use the understand_image tool to process
22
+ the image and provide detailed descriptions or answers to their questions.
23
+
24
+ orchestrator:
25
+ snapshot_storage: "snapshots"
26
+ agent_temporary_workspace: "temp_workspaces"
27
+ context_paths:
28
+ - path: "massgen/configs/resources/v0.1.3-example/multimodality.jpg"
29
+ permission: "read"
30
+
31
+ ui:
32
+ display_type: "rich_terminal"
33
+ logging_enabled: true
@@ -0,0 +1,34 @@
1
+ # MassGen Configuration: Understand Video Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml "What's happening in this video?"
4
+ agents:
5
+ - id: "understand_video_tool"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-5-nano"
9
+ cwd: "workspace1"
10
+ custom_tools:
11
+ - name: ["understand_video"]
12
+ category: "multimodal"
13
+ path: "massgen/tool/_multimodal_tools/understand_video.py"
14
+ function: ["understand_video"]
15
+ system_message: |
16
+ You are an AI assistant with access to video understanding capabilities.
17
+
18
+ The understand_video tool is available to analyze and understand videos by extracting key frames
19
+ and using OpenAI's gpt-4.1 API.
20
+ Do not output tool call syntax or function declarations. Focus on answering the user's question clearly.
21
+
22
+ When users ask about analyzing or understanding videos, use the understand_video tool to process
23
+ the video and provide detailed descriptions or answers to their questions.
24
+
25
+ orchestrator:
26
+ snapshot_storage: "snapshots"
27
+ agent_temporary_workspace: "temp_workspaces"
28
+ context_paths:
29
+ - path: "massgen/configs/resources/v0.1.3-example/oppenheimer_trailer_1920.mp4"
30
+ permission: "read"
31
+
32
+ ui:
33
+ display_type: "rich_terminal"
34
+ logging_enabled: true