massgen 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. massgen/__init__.py +1 -1
  2. massgen/agent_config.py +33 -7
  3. massgen/api_params_handler/_api_params_handler_base.py +3 -0
  4. massgen/api_params_handler/_chat_completions_api_params_handler.py +7 -1
  5. massgen/backend/azure_openai.py +9 -1
  6. massgen/backend/base.py +56 -0
  7. massgen/backend/base_with_custom_tool_and_mcp.py +4 -4
  8. massgen/backend/capabilities.py +6 -6
  9. massgen/backend/chat_completions.py +18 -11
  10. massgen/backend/claude_code.py +9 -1
  11. massgen/backend/gemini.py +71 -6
  12. massgen/backend/gemini_utils.py +30 -0
  13. massgen/backend/grok.py +39 -6
  14. massgen/backend/response.py +18 -11
  15. massgen/chat_agent.py +9 -3
  16. massgen/cli.py +319 -43
  17. massgen/config_builder.py +163 -18
  18. massgen/configs/README.md +78 -20
  19. massgen/configs/basic/multi/three_agents_default.yaml +2 -2
  20. massgen/configs/debug/restart_test_controlled.yaml +60 -0
  21. massgen/configs/debug/restart_test_controlled_filesystem.yaml +73 -0
  22. massgen/configs/tools/code-execution/docker_with_sudo.yaml +35 -0
  23. massgen/configs/tools/custom_tools/computer_use_browser_example.yaml +56 -0
  24. massgen/configs/tools/custom_tools/computer_use_docker_example.yaml +65 -0
  25. massgen/configs/tools/custom_tools/computer_use_example.yaml +50 -0
  26. massgen/configs/tools/custom_tools/crawl4ai_mcp_example.yaml +67 -0
  27. massgen/configs/tools/custom_tools/crawl4ai_multi_agent_example.yaml +68 -0
  28. massgen/configs/tools/custom_tools/multimodal_tools/playwright_with_img_understanding.yaml +98 -0
  29. massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml +33 -0
  30. massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml +34 -0
  31. massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml +33 -0
  32. massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml +34 -0
  33. massgen/configs/tools/custom_tools/multimodal_tools/understand_video_example.yaml +54 -0
  34. massgen/configs/tools/custom_tools/multimodal_tools/youtube_video_analysis.yaml +59 -0
  35. massgen/configs/tools/memory/README.md +199 -0
  36. massgen/configs/tools/memory/gpt5mini_gemini_context_window_management.yaml +131 -0
  37. massgen/configs/tools/memory/gpt5mini_gemini_no_persistent_memory.yaml +133 -0
  38. massgen/configs/tools/memory/test_context_window_management.py +286 -0
  39. massgen/configs/tools/multimodal/gpt5mini_gpt5nano_documentation_evolution.yaml +97 -0
  40. massgen/configs/tools/planning/five_agents_discord_mcp_planning_mode.yaml +7 -29
  41. massgen/configs/tools/planning/five_agents_filesystem_mcp_planning_mode.yaml +5 -6
  42. massgen/configs/tools/planning/five_agents_notion_mcp_planning_mode.yaml +4 -4
  43. massgen/configs/tools/planning/five_agents_twitter_mcp_planning_mode.yaml +4 -4
  44. massgen/configs/tools/planning/gpt5_mini_case_study_mcp_planning_mode.yaml +2 -2
  45. massgen/docker/README.md +83 -0
  46. massgen/filesystem_manager/_code_execution_server.py +22 -7
  47. massgen/filesystem_manager/_docker_manager.py +21 -1
  48. massgen/filesystem_manager/_filesystem_manager.py +8 -0
  49. massgen/filesystem_manager/_workspace_tools_server.py +0 -997
  50. massgen/formatter/_gemini_formatter.py +73 -0
  51. massgen/frontend/coordination_ui.py +175 -257
  52. massgen/frontend/displays/base_display.py +29 -0
  53. massgen/frontend/displays/rich_terminal_display.py +155 -9
  54. massgen/frontend/displays/simple_display.py +21 -0
  55. massgen/frontend/displays/terminal_display.py +22 -2
  56. massgen/logger_config.py +50 -6
  57. massgen/message_templates.py +123 -3
  58. massgen/orchestrator.py +652 -44
  59. massgen/tests/test_code_execution.py +178 -0
  60. massgen/tests/test_intelligent_planning_mode.py +643 -0
  61. massgen/tests/test_orchestration_restart.py +204 -0
  62. massgen/token_manager/token_manager.py +13 -4
  63. massgen/tool/__init__.py +4 -0
  64. massgen/tool/_multimodal_tools/understand_audio.py +193 -0
  65. massgen/tool/_multimodal_tools/understand_file.py +550 -0
  66. massgen/tool/_multimodal_tools/understand_image.py +212 -0
  67. massgen/tool/_multimodal_tools/understand_video.py +313 -0
  68. massgen/tool/docs/multimodal_tools.md +779 -0
  69. massgen/tool/workflow_toolkits/__init__.py +26 -0
  70. massgen/tool/workflow_toolkits/post_evaluation.py +216 -0
  71. massgen/utils.py +1 -0
  72. {massgen-0.1.1.dist-info → massgen-0.1.3.dist-info}/METADATA +57 -52
  73. {massgen-0.1.1.dist-info → massgen-0.1.3.dist-info}/RECORD +77 -49
  74. {massgen-0.1.1.dist-info → massgen-0.1.3.dist-info}/WHEEL +0 -0
  75. {massgen-0.1.1.dist-info → massgen-0.1.3.dist-info}/entry_points.txt +0 -0
  76. {massgen-0.1.1.dist-info → massgen-0.1.3.dist-info}/licenses/LICENSE +0 -0
  77. {massgen-0.1.1.dist-info → massgen-0.1.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,65 @@
1
+ # MassGen Configuration: Computer Use - Docker Environment
2
+ #
3
+ # This configuration uses a Docker container for OS-level automation.
4
+ #
5
+ # Usage:
6
+ # massgen --config @examples/tools/custom_tools/computer_use_docker_example "Open calculator and compute 123 + 456"
7
+ #
8
+ # Prerequisites:
9
+ # 1. Set OPENAI_API_KEY in your .env file
10
+ # 2. Have Docker installed and running
11
+ # 3. Start the Docker container (see README for instructions)
12
+
13
+ agents:
14
+ - id: "docker_automation_agent"
15
+ backend:
16
+ type: "openai"
17
+ model: "computer-use-preview"
18
+ custom_tools:
19
+ - name: ["computer_use"]
20
+ category: "automation"
21
+ path: "massgen/tool/_computer_use/computer_use_tool.py"
22
+ function: ["computer_use"]
23
+ # Default parameters for Docker environment
24
+ default_params:
25
+ environment: "ubuntu"
26
+ display_width: 1280
27
+ display_height: 800
28
+ max_iterations: 25
29
+ include_reasoning: true
30
+ environment_config:
31
+ container_name: "cua-container"
32
+ display: ":99"
33
+
34
+ system_message: |
35
+ You are a computer automation specialist with access to the computer_use tool.
36
+
37
+ You can control a virtual Ubuntu environment running in Docker to:
38
+ - Launch and use desktop applications
39
+ - Perform file operations
40
+ - Execute system-level tasks
41
+ - Automate GUI interactions
42
+
43
+ The environment includes:
44
+ - Ubuntu 22.04 desktop (Xfce)
45
+ - Firefox browser
46
+ - Standard desktop applications
47
+ - X11 display system
48
+
49
+ When using the computer_use tool:
50
+ 1. Tasks are executed in a sandboxed Docker container
51
+ 2. You have full desktop access via xdotool commands
52
+ 3. Screenshots are captured after each action
53
+ 4. Be specific about coordinates when clicking
54
+
55
+ Best practices:
56
+ - Allow time for applications to launch
57
+ - Use wait actions between steps
58
+ - Verify GUI elements are visible before clicking
59
+ - Consider screen resolution (1280x800)
60
+
61
+ ui:
62
+ display_type: "detailed"
63
+ logging_enabled: true
64
+ show_screenshots: true
65
+ show_reasoning: true
@@ -0,0 +1,50 @@
1
+ # MassGen Configuration: Computer Use Tool Example
2
+ #
3
+ # This configuration demonstrates how to use the computer_use tool for automating
4
+ # browser and computer interactions using OpenAI's computer-use-preview model.
5
+ #
6
+ # Usage:
7
+ # massgen --config @examples/tools/custom_tools/computer_use_example "Search for Python documentation on Google"
8
+ #
9
+ # Prerequisites:
10
+ # 1. Set OPENAI_API_KEY in your .env file
11
+ # 2. For browser environment: pip install playwright && playwright install
12
+ # 3. For Docker environment: Have Docker installed and running
13
+
14
+ agents:
15
+ - id: "computer_use_agent"
16
+ backend:
17
+ type: "openai"
18
+ model: "gpt-4.1" # You can also use "computer-use-preview" for the main model
19
+ custom_tools:
20
+ - name: ["computer_use"]
21
+ category: "automation"
22
+ path: "massgen/tool/_computer_use/computer_use_tool.py"
23
+ function: ["computer_use"]
24
+
25
+ system_message: |
26
+ You are an AI assistant with access to computer automation capabilities.
27
+
28
+ The computer_use tool is available to you. This tool allows you to:
29
+ - Control a web browser (click, type, scroll, etc.)
30
+ - Automate computer tasks
31
+ - Search the web, fill forms, navigate websites
32
+ - Perform multi-step workflows
33
+
34
+ When a user asks you to perform a task that requires browser or computer interaction,
35
+ use the computer_use tool with a clear task description.
36
+
37
+ Important:
38
+ - Always provide clear, specific task descriptions
39
+ - The tool will execute actions step-by-step
40
+ - You will receive screenshots and action logs
41
+ - Safety checks may be triggered - acknowledge them when appropriate
42
+
43
+ Example usage:
44
+ - "Search for the latest AI news on Google"
45
+ - "Navigate to example.com and fill out the contact form"
46
+ - "Find Python documentation and save the URL"
47
+
48
+ ui:
49
+ display_type: "simple"
50
+ logging_enabled: true
@@ -0,0 +1,67 @@
1
+ # MassGen Configuration: Crawl4AI Web Scraping via MCP
2
+ #
3
+ # Prerequisites:
4
+ # 1. Start crawl4ai Docker container (one-time setup):
5
+ # docker pull unclecode/crawl4ai:latest
6
+ # docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:latest
7
+ #
8
+ # 2. Verify container is running:
9
+ # docker ps | grep crawl4ai
10
+ #
11
+ # 3. Test MCP endpoint (optional):
12
+ # curl http://localhost:11235/mcp/schema
13
+ #
14
+ # Usage:
15
+ # massgen --config massgen/configs/tools/custom_tools/crawl4ai_mcp_example.yaml "Scrape https://example.com and summarize the content"
16
+ #
17
+ # Available Tools (via MCP):
18
+ # - md: Generate markdown from web content
19
+ # - html: Extract preprocessed HTML
20
+ # - screenshot: Capture webpage screenshots
21
+ # - pdf: Generate PDF documents
22
+ # - execute_js: Run JavaScript on web pages
23
+ # - crawl: Perform multi-URL crawling
24
+ # - ask: Query the Crawl4AI library context
25
+ #
26
+ # Note: Multiple agents can connect to the same crawl4ai container.
27
+ # The server handles up to 5 concurrent crawls by default.
28
+
29
+ orchestrator:
30
+ snapshot_storage: "snapshots"
31
+ agent_temporary_workspace: "temp_workspaces"
32
+
33
+ agents:
34
+ - id: "web_scraper_agent"
35
+ backend:
36
+ type: "claude_code"
37
+ model: "claude-sonnet-4-20250514"
38
+ cwd: "workspace1"
39
+
40
+ # Connect to crawl4ai MCP server
41
+ mcp_servers:
42
+ - name: "crawl4ai"
43
+ type: "sse" # Server-Sent Events transport
44
+ url: "http://localhost:11235/mcp/sse"
45
+
46
+ append_system_prompt: |
47
+ You are a web scraping specialist with access to the Crawl4AI toolset via MCP.
48
+
49
+ Available tools:
50
+ - md: Convert webpages to clean markdown (best for LLM consumption)
51
+ - html: Extract preprocessed HTML
52
+ - screenshot: Capture webpage as image
53
+ - pdf: Generate PDF from webpage
54
+ - execute_js: Run JavaScript on pages (for dynamic content)
55
+ - crawl: Scrape multiple URLs in parallel
56
+
57
+ When users ask to scrape, analyze, or extract web content:
58
+ 1. Use 'md' tool for text-based content (articles, docs, etc.)
59
+ 2. Use 'screenshot' for visual content or layout analysis
60
+ 3. Use 'execute_js' for JavaScript-heavy sites
61
+ 4. Use 'crawl' for multiple pages
62
+
63
+ Always provide clear summaries of scraped content.
64
+
65
+ ui:
66
+ display_type: "rich_terminal"
67
+ logging_enabled: true
@@ -0,0 +1,68 @@
1
+ # MassGen Configuration: Multi-Agent Web Research with Crawl4AI
2
+ #
3
+ # This example demonstrates multiple agents sharing a single crawl4ai MCP server
4
+ # for collaborative web research and analysis.
5
+ #
6
+ # Prerequisites:
7
+ # 1. Start crawl4ai Docker container:
8
+ # docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:latest
9
+ #
10
+ # Usage:
11
+ # massgen --config massgen/configs/tools/custom_tools/crawl4ai_multi_agent_example.yaml "Research AI safety best practices by scraping relevant documentation sites and academic papers"
12
+ #
13
+ # What happens:
14
+ # - Multiple agents can scrape different URLs concurrently
15
+ # - The crawl4ai server handles up to 5 parallel crawls
16
+ # - Agents collaborate and vote on the synthesized research
17
+
18
+ orchestrator:
19
+ snapshot_storage: "snapshots"
20
+ agent_temporary_workspace: "temp_workspaces"
21
+ voting_sensitivity: "balanced"
22
+ answer_novelty_requirement: "balanced"
23
+ max_new_answers_per_agent: 3
24
+
25
+ agents:
26
+ - id: "research_agent_1"
27
+ backend:
28
+ type: "openai"
29
+ model: "gpt-5-nano"
30
+ cwd: "workspace1"
31
+
32
+ # All agents connect to the same crawl4ai container
33
+ mcp_servers:
34
+ - name: "crawl4ai"
35
+ type: "sse"
36
+ url: "http://localhost:11235/mcp/sse"
37
+
38
+ enable_web_search: true
39
+
40
+ system_message: |
41
+ You are a research specialist focused on finding authoritative sources and extracting key insights.
42
+
43
+ Use the crawl4ai tools to scrape documentation, articles, and research papers.
44
+ Prioritize official documentation and academic sources.
45
+
46
+ - id: "research_agent_2"
47
+ backend:
48
+ type: "gemini"
49
+ model: "gemini-2.5-pro"
50
+ cwd: "workspace2"
51
+
52
+ # Connects to the SAME crawl4ai container as agent_1
53
+ mcp_servers:
54
+ - name: "crawl4ai"
55
+ type: "sse"
56
+ url: "http://localhost:11235/mcp/sse"
57
+
58
+ enable_web_search: true
59
+
60
+ system_message: |
61
+ You are a synthesis specialist who combines information from multiple sources.
62
+
63
+ Use crawl4ai to gather diverse perspectives and cross-reference information.
64
+ Focus on finding patterns and connections across sources.
65
+
66
+ ui:
67
+ display_type: "rich_terminal"
68
+ logging_enabled: true
@@ -0,0 +1,98 @@
1
+ # massgen --config massgen/configs/tools/custom_tools/multimodal_tools/playwright_with_img_understanding.yaml "Analyze docs.massgen.ai and tell me how to improve its design."
2
+
3
+ agents:
4
+ - id: agent_a
5
+ backend:
6
+ type: openai
7
+ model: gpt-5-codex
8
+ text:
9
+ verbosity: medium
10
+ reasoning:
11
+ effort: medium
12
+ summary: auto
13
+ cwd: workspace1
14
+ enable_mcp_command_line: true
15
+ command_line_execution_mode: docker
16
+ command_line_docker_network_mode: "bridge" # Enable network access (default: none)
17
+ enable_web_search: true
18
+ custom_tools:
19
+ - name: ["understand_image"]
20
+ category: "multimodal"
21
+ path: "massgen/tool/_multimodal_tools/understand_image.py"
22
+ function: ["understand_image"]
23
+ mcp_servers:
24
+ playwright:
25
+ type: "stdio"
26
+ command: "npx"
27
+ args: [
28
+ "@playwright/mcp@latest",
29
+ "--browser=chrome", # Use Chrome browser
30
+ "--caps=vision,pdf", # Enable vision and PDF capabilities
31
+ "--user-data-dir=${cwd}/playwright-profile", # Persistent browser profile within workspace
32
+ "--output-dir=${cwd}", # Save screenshots/PDFs directly to workspace
33
+ # "--save-trace" # Save Playwright traces for debugging
34
+ ]
35
+
36
+ - id: agent_b
37
+ backend:
38
+ type: claude_code
39
+ model: claude-sonnet-4-5-20250929
40
+ cwd: workspace2
41
+ enable_mcp_command_line: true
42
+ command_line_execution_mode: docker
43
+ command_line_docker_network_mode: "bridge" # Enable network access (default: none)
44
+ custom_tools:
45
+ - name: ["understand_image"]
46
+ category: "multimodal"
47
+ path: "massgen/tool/_multimodal_tools/understand_image.py"
48
+ function: ["understand_image"]
49
+ mcp_servers:
50
+ playwright:
51
+ type: "stdio"
52
+ command: "npx"
53
+ args: [
54
+ "@playwright/mcp@latest",
55
+ "--browser=chrome", # Use Chrome browser
56
+ "--caps=vision,pdf", # Enable vision and PDF capabilities
57
+ "--user-data-dir=${cwd}/playwright-profile", # Persistent browser profile within workspace
58
+ "--output-dir=${cwd}", # Save screenshots/PDFs directly to workspace
59
+ # "--save-trace" # Save Playwright traces for debugging
60
+ ]
61
+
62
+ - id: agent_c
63
+ backend:
64
+ type: chatcompletion
65
+ base_url: "https://openrouter.ai/api/v1"
66
+ model: qwen/qwen3-coder
67
+ cwd: workspace3
68
+ enable_mcp_command_line: true
69
+ command_line_execution_mode: docker
70
+ command_line_docker_network_mode: "bridge" # Enable network access (default: none)
71
+ custom_tools:
72
+ - name: ["understand_image"]
73
+ category: "multimodal"
74
+ path: "massgen/tool/_multimodal_tools/understand_image.py"
75
+ function: ["understand_image"]
76
+ mcp_servers:
77
+ playwright:
78
+ type: "stdio"
79
+ command: "npx"
80
+ args: [
81
+ "@playwright/mcp@latest",
82
+ "--browser=chrome", # Use Chrome browser
83
+ "--caps=vision,pdf", # Enable vision and PDF capabilities
84
+ "--user-data-dir=${cwd}/playwright-profile", # Persistent browser profile within workspace
85
+ "--output-dir=${cwd}", # Save screenshots/PDFs directly to workspace
86
+ # "--save-trace" # Save Playwright traces for debugging
87
+ ]
88
+
89
+ ui:
90
+ display_type: rich_terminal
91
+ logging_enabled: true
92
+ orchestrator:
93
+ snapshot_storage: snapshots
94
+ agent_temporary_workspace: temp_workspaces
95
+ session_storage: sessions
96
+ # voting_sensitivity: balanced
97
+ max_new_answers_per_agent: 5
98
+ # answer_novelty_requirement: balanced
@@ -0,0 +1,33 @@
1
+ # MassGen Configuration: Understand Audio Tool
2
+ # Usage:
3
+ # massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml "Please summarize the content in this audio."
4
+ agents:
5
+ - id: "understand_audio_tool"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-5-nano"
9
+ cwd: "workspace1"
10
+ custom_tools:
11
+ - name: ["understand_audio"]
12
+ category: "multimodal"
13
+ path: "massgen/tool/_multimodal_tools/understand_audio.py"
14
+ function: ["understand_audio"]
15
+ system_message: |
16
+ You are an AI assistant with access to audio transcription capabilities.
17
+
18
+ The understand_audio tool is available to transcribe audio files to text using OpenAI's Transcription API.
19
+ Do not output tool call syntax or function declarations. Focus on answering the user's question clearly.
20
+
21
+ When users ask about transcribing or understanding audio files, use the understand_audio tool to
22
+ process the audio and provide the transcription.
23
+
24
+ orchestrator:
25
+ snapshot_storage: "snapshots"
26
+ agent_temporary_workspace: "temp_workspaces"
27
+ context_paths:
28
+ - path: "massgen/configs/resources/v0.1.3-example/Sherlock_Holmes.mp3"
29
+ permission: "read"
30
+
31
+ ui:
32
+ display_type: "rich_terminal"
33
+ logging_enabled: true
@@ -0,0 +1,34 @@
1
+ # MassGen Configuration: Understand File Tool
2
+ # Usage:
3
+ # massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml "Please summarize the content in this file."
4
+ agents:
5
+ - id: "understand_file_tool"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-5-nano"
9
+ cwd: "workspace1"
10
+ custom_tools:
11
+ - name: ["understand_file"]
12
+ category: "multimodal"
13
+ path: "massgen/tool/_multimodal_tools/understand_file.py"
14
+ function: ["understand_file"]
15
+ system_message: |
16
+ You are an AI assistant with access to file understanding capabilities.
17
+
18
+ The understand_file tool is available to analyze and understand file contents using OpenAI's gpt-4.1 API.
19
+ It supports text files, PDF, DOCX, XLSX, PPTX, and more.
20
+ Do not output tool call syntax or function declarations. Focus on answering the user's question clearly.
21
+
22
+ When users ask about analyzing or understanding files, use the understand_file tool to process
23
+ the file and provide detailed descriptions or answers to their questions.
24
+
25
+ orchestrator:
26
+ snapshot_storage: "snapshots"
27
+ agent_temporary_workspace: "temp_workspaces"
28
+ context_paths:
29
+ - path: "massgen/configs/resources/v0.1.3-example/TUMIX.pdf"
30
+ permission: "read"
31
+
32
+ ui:
33
+ display_type: "rich_terminal"
34
+ logging_enabled: true
@@ -0,0 +1,33 @@
1
+ # MassGen Configuration: Understand Image Tool
2
+ # Usage:
3
+ # massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml "Please summarize the content in this image."
4
+ agents:
5
+ - id: "understand_image_tool"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-5-nano"
9
+ cwd: "workspace1"
10
+ custom_tools:
11
+ - name: ["understand_image"]
12
+ category: "multimodal"
13
+ path: "massgen/tool/_multimodal_tools/understand_image.py"
14
+ function: ["understand_image"]
15
+ system_message: |
16
+ You are an AI assistant with access to image understanding capabilities.
17
+
18
+ The understand_image tool is available to analyze and understand images using OpenAI's gpt-4.1 API.
19
+ Do not output tool call syntax or function declarations. Focus on answering the user's question clearly.
20
+
21
+ When users ask about analyzing or understanding images, use the understand_image tool to process
22
+ the image and provide detailed descriptions or answers to their questions.
23
+
24
+ orchestrator:
25
+ snapshot_storage: "snapshots"
26
+ agent_temporary_workspace: "temp_workspaces"
27
+ context_paths:
28
+ - path: "massgen/configs/resources/v0.1.3-example/multimodality.jpg"
29
+ permission: "read"
30
+
31
+ ui:
32
+ display_type: "rich_terminal"
33
+ logging_enabled: true
@@ -0,0 +1,34 @@
1
+ # MassGen Configuration: Understand Video Tool
2
+ # Usage:
3
+ # massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml "What's happening in this video?"
4
+ agents:
5
+ - id: "understand_video_tool"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-5-nano"
9
+ cwd: "workspace1"
10
+ custom_tools:
11
+ - name: ["understand_video"]
12
+ category: "multimodal"
13
+ path: "massgen/tool/_multimodal_tools/understand_video.py"
14
+ function: ["understand_video"]
15
+ system_message: |
16
+ You are an AI assistant with access to video understanding capabilities.
17
+
18
+ The understand_video tool is available to analyze and understand videos by extracting key frames
19
+ and using OpenAI's gpt-4.1 API.
20
+ Do not output tool call syntax or function declarations. Focus on answering the user's question clearly.
21
+
22
+ When users ask about analyzing or understanding videos, use the understand_video tool to process
23
+ the video and provide detailed descriptions or answers to their questions.
24
+
25
+ orchestrator:
26
+ snapshot_storage: "snapshots"
27
+ agent_temporary_workspace: "temp_workspaces"
28
+ context_paths:
29
+ - path: "massgen/configs/resources/v0.1.3-example/oppenheimer_trailer_1920.mp4"
30
+ permission: "read"
31
+
32
+ ui:
33
+ display_type: "rich_terminal"
34
+ logging_enabled: true
@@ -0,0 +1,54 @@
1
+ # MassGen Configuration: Understand Video Example
2
+ #
3
+ # Use Case: Analyze a specific video file using the understand_video tool
4
+ #
5
+ # This demonstrates analyzing a local video file directly, with no download step.
6
+ # The video file is provided as a context path for agents to analyze.
7
+ #
8
+ # Run with:
9
+ # uv run massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_video_example.yaml "What is shown in this video?"
10
+
11
+ agents:
12
+ - id: "agent_a"
13
+ backend:
14
+ type: "openai"
15
+ model: "gpt-5-mini"
16
+ text:
17
+ verbosity: "medium"
18
+ reasoning:
19
+ effort: "medium"
20
+ summary: "auto"
21
+ custom_tools:
22
+ - name: ["understand_video"]
23
+ category: "multimodal"
24
+ path: "massgen/tool/_multimodal_tools/understand_video.py"
25
+ function: ["understand_video"]
26
+ cwd: "workspace1"
27
+
28
+ - id: "agent_b"
29
+ backend:
30
+ type: "gemini"
31
+ model: "gemini-2.5-pro"
32
+ custom_tools:
33
+ - name: ["understand_video"]
34
+ category: "multimodal"
35
+ path: "massgen/tool/_multimodal_tools/understand_video.py"
36
+ function: ["understand_video"]
37
+ cwd: "workspace2"
38
+
39
+ orchestrator:
40
+ snapshot_storage: "snapshots"
41
+ agent_temporary_workspace: "temp_workspaces"
42
+ context_paths:
43
+ - path: "massgen/configs/resources/v0.1.3-example/case-study-videos/Dp2oldJJImw.mp4"
44
+ permission: "read"
45
+
46
+ ui:
47
+ display_type: "rich_terminal"
48
+ logging_enabled: true
49
+
50
+ # What happens:
51
+ # 1. Agents have read access to the video file
52
+ # 2. They can use understand_video tool to analyze it
53
+ # 3. The tool extracts 8 frames and analyzes them with GPT-4.1
54
+ # 4. Agents collaborate to provide comprehensive insights
@@ -0,0 +1,59 @@
1
+ # MassGen Configuration: YouTube Video Analysis with Multimodal Understanding
2
+ #
3
+ # Use Case: Download and analyze YouTube videos from MassGen case studies
4
+ #
5
+ # This demonstrates MassGen's self-evolution capabilities by having agents:
6
+ # 1. Read local case study documentation to discover video URLs
7
+ # 2. Download YouTube videos using yt-dlp via command-line execution
8
+ # 3. Analyze video content using the understand_video multimodal tool
9
+ # 4. Extract insights that could inform future feature development
10
+ #
11
+ # Run with:
12
+ # uv run massgen --config massgen/configs/tools/custom_tools/multimodal_tools/youtube_video_analysis.yaml "Download recent MassGen case study videos listed in the case study md files, analyze them, find out how to improve them and automate their creation."
13
+
14
+ agents:
15
+ - id: "agent_a"
16
+ backend:
17
+ type: "openai"
18
+ model: "gpt-5-mini"
19
+ text:
20
+ verbosity: "medium"
21
+ reasoning:
22
+ effort: "medium"
23
+ summary: "auto"
24
+ custom_tools:
25
+ - name: ["understand_video"]
26
+ category: "multimodal"
27
+ path: "massgen/tool/_multimodal_tools/understand_video.py"
28
+ function: ["understand_video"]
29
+ enable_mcp_command_line: true
30
+ command_line_execution_mode: docker
31
+ command_line_docker_enable_sudo: true
32
+ command_line_docker_network_mode: "bridge"
33
+ cwd: "workspace1"
34
+
35
+ - id: "agent_b"
36
+ backend:
37
+ type: "claude_code"
38
+ model: "claude-sonnet-4-5-20250929"
39
+ custom_tools:
40
+ - name: ["understand_video"]
41
+ category: "multimodal"
42
+ path: "massgen/tool/_multimodal_tools/understand_video.py"
43
+ function: ["understand_video"]
44
+ enable_mcp_command_line: true
45
+ command_line_execution_mode: docker
46
+ command_line_docker_enable_sudo: true
47
+ command_line_docker_network_mode: "bridge"
48
+ cwd: "workspace2"
49
+
50
+ orchestrator:
51
+ snapshot_storage: "snapshots"
52
+ agent_temporary_workspace: "temp_workspaces"
53
+ context_paths:
54
+ - path: "docs/case_studies"
55
+ permission: "read"
56
+
57
+ ui:
58
+ display_type: "rich_terminal"
59
+ logging_enabled: true