massgen 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- massgen/__init__.py +1 -1
- massgen/agent_config.py +33 -7
- massgen/api_params_handler/_api_params_handler_base.py +3 -0
- massgen/backend/azure_openai.py +9 -1
- massgen/backend/base.py +4 -0
- massgen/backend/claude_code.py +9 -1
- massgen/backend/gemini.py +35 -6
- massgen/backend/gemini_utils.py +30 -0
- massgen/chat_agent.py +9 -3
- massgen/cli.py +291 -43
- massgen/config_builder.py +163 -18
- massgen/configs/README.md +52 -6
- massgen/configs/debug/restart_test_controlled.yaml +60 -0
- massgen/configs/debug/restart_test_controlled_filesystem.yaml +73 -0
- massgen/configs/tools/code-execution/docker_with_sudo.yaml +35 -0
- massgen/configs/tools/custom_tools/computer_use_browser_example.yaml +56 -0
- massgen/configs/tools/custom_tools/computer_use_docker_example.yaml +65 -0
- massgen/configs/tools/custom_tools/computer_use_example.yaml +50 -0
- massgen/configs/tools/custom_tools/crawl4ai_mcp_example.yaml +67 -0
- massgen/configs/tools/custom_tools/crawl4ai_multi_agent_example.yaml +68 -0
- massgen/configs/tools/custom_tools/multimodal_tools/playwright_with_img_understanding.yaml +98 -0
- massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml +33 -0
- massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml +34 -0
- massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml +33 -0
- massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml +34 -0
- massgen/configs/tools/custom_tools/multimodal_tools/understand_video_example.yaml +54 -0
- massgen/configs/tools/custom_tools/multimodal_tools/youtube_video_analysis.yaml +59 -0
- massgen/configs/tools/memory/README.md +199 -0
- massgen/configs/tools/memory/gpt5mini_gemini_context_window_management.yaml +131 -0
- massgen/configs/tools/memory/gpt5mini_gemini_no_persistent_memory.yaml +133 -0
- massgen/configs/tools/memory/test_context_window_management.py +286 -0
- massgen/configs/tools/multimodal/gpt5mini_gpt5nano_documentation_evolution.yaml +97 -0
- massgen/docker/README.md +83 -0
- massgen/filesystem_manager/_code_execution_server.py +22 -7
- massgen/filesystem_manager/_docker_manager.py +21 -1
- massgen/filesystem_manager/_filesystem_manager.py +8 -0
- massgen/filesystem_manager/_workspace_tools_server.py +0 -997
- massgen/formatter/_gemini_formatter.py +73 -0
- massgen/frontend/coordination_ui.py +175 -257
- massgen/frontend/displays/base_display.py +29 -0
- massgen/frontend/displays/rich_terminal_display.py +155 -9
- massgen/frontend/displays/simple_display.py +21 -0
- massgen/frontend/displays/terminal_display.py +22 -2
- massgen/logger_config.py +50 -6
- massgen/message_templates.py +123 -3
- massgen/orchestrator.py +319 -38
- massgen/tests/test_code_execution.py +178 -0
- massgen/tests/test_orchestration_restart.py +204 -0
- massgen/tool/__init__.py +4 -0
- massgen/tool/_multimodal_tools/understand_audio.py +193 -0
- massgen/tool/_multimodal_tools/understand_file.py +550 -0
- massgen/tool/_multimodal_tools/understand_image.py +212 -0
- massgen/tool/_multimodal_tools/understand_video.py +313 -0
- massgen/tool/docs/multimodal_tools.md +779 -0
- massgen/tool/workflow_toolkits/__init__.py +26 -0
- massgen/tool/workflow_toolkits/post_evaluation.py +216 -0
- massgen/utils.py +1 -0
- {massgen-0.1.2.dist-info → massgen-0.1.3.dist-info}/METADATA +8 -3
- {massgen-0.1.2.dist-info → massgen-0.1.3.dist-info}/RECORD +63 -36
- {massgen-0.1.2.dist-info → massgen-0.1.3.dist-info}/WHEEL +0 -0
- {massgen-0.1.2.dist-info → massgen-0.1.3.dist-info}/entry_points.txt +0 -0
- {massgen-0.1.2.dist-info → massgen-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {massgen-0.1.2.dist-info → massgen-0.1.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# MassGen Configuration: Computer Use Tool Example
|
|
2
|
+
#
|
|
3
|
+
# This configuration demonstrates how to use the computer_use tool for automating
|
|
4
|
+
# browser and computer interactions using OpenAI's computer-use-preview model.
|
|
5
|
+
#
|
|
6
|
+
# Usage:
|
|
7
|
+
# massgen --config @examples/tools/custom_tools/computer_use_example "Search for Python documentation on Google"
|
|
8
|
+
#
|
|
9
|
+
# Prerequisites:
|
|
10
|
+
# 1. Set OPENAI_API_KEY in your .env file
|
|
11
|
+
# 2. For browser environment: pip install playwright && playwright install
|
|
12
|
+
# 3. For Docker environment: Have Docker installed and running
|
|
13
|
+
|
|
14
|
+
agents:
|
|
15
|
+
- id: "computer_use_agent"
|
|
16
|
+
backend:
|
|
17
|
+
type: "openai"
|
|
18
|
+
model: "gpt-4.1" # You can also use "computer-use-preview" for the main model
|
|
19
|
+
custom_tools:
|
|
20
|
+
- name: ["computer_use"]
|
|
21
|
+
category: "automation"
|
|
22
|
+
path: "massgen/tool/_computer_use/computer_use_tool.py"
|
|
23
|
+
function: ["computer_use"]
|
|
24
|
+
|
|
25
|
+
system_message: |
|
|
26
|
+
You are an AI assistant with access to computer automation capabilities.
|
|
27
|
+
|
|
28
|
+
The computer_use tool is available to you. This tool allows you to:
|
|
29
|
+
- Control a web browser (click, type, scroll, etc.)
|
|
30
|
+
- Automate computer tasks
|
|
31
|
+
- Search the web, fill forms, navigate websites
|
|
32
|
+
- Perform multi-step workflows
|
|
33
|
+
|
|
34
|
+
When a user asks you to perform a task that requires browser or computer interaction,
|
|
35
|
+
use the computer_use tool with a clear task description.
|
|
36
|
+
|
|
37
|
+
Important:
|
|
38
|
+
- Always provide clear, specific task descriptions
|
|
39
|
+
- The tool will execute actions step-by-step
|
|
40
|
+
- You will receive screenshots and action logs
|
|
41
|
+
- Safety checks may be triggered - acknowledge them when appropriate
|
|
42
|
+
|
|
43
|
+
Example usage:
|
|
44
|
+
- "Search for the latest AI news on Google"
|
|
45
|
+
- "Navigate to example.com and fill out the contact form"
|
|
46
|
+
- "Find Python documentation and save the URL"
|
|
47
|
+
|
|
48
|
+
ui:
|
|
49
|
+
display_type: "simple"
|
|
50
|
+
logging_enabled: true
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# MassGen Configuration: Crawl4AI Web Scraping via MCP
|
|
2
|
+
#
|
|
3
|
+
# Prerequisites:
|
|
4
|
+
# 1. Start crawl4ai Docker container (one-time setup):
|
|
5
|
+
# docker pull unclecode/crawl4ai:latest
|
|
6
|
+
# docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:latest
|
|
7
|
+
#
|
|
8
|
+
# 2. Verify container is running:
|
|
9
|
+
# docker ps | grep crawl4ai
|
|
10
|
+
#
|
|
11
|
+
# 3. Test MCP endpoint (optional):
|
|
12
|
+
# curl http://localhost:11235/mcp/schema
|
|
13
|
+
#
|
|
14
|
+
# Usage:
|
|
15
|
+
# massgen --config massgen/configs/tools/custom_tools/crawl4ai_mcp_example.yaml "Scrape https://example.com and summarize the content"
|
|
16
|
+
#
|
|
17
|
+
# Available Tools (via MCP):
|
|
18
|
+
# - md: Generate markdown from web content
|
|
19
|
+
# - html: Extract preprocessed HTML
|
|
20
|
+
# - screenshot: Capture webpage screenshots
|
|
21
|
+
# - pdf: Generate PDF documents
|
|
22
|
+
# - execute_js: Run JavaScript on web pages
|
|
23
|
+
# - crawl: Perform multi-URL crawling
|
|
24
|
+
# - ask: Query the Crawl4AI library context
|
|
25
|
+
#
|
|
26
|
+
# Note: Multiple agents can connect to the same crawl4ai container.
|
|
27
|
+
# The server handles up to 5 concurrent crawls by default.
|
|
28
|
+
|
|
29
|
+
orchestrator:
|
|
30
|
+
snapshot_storage: "snapshots"
|
|
31
|
+
agent_temporary_workspace: "temp_workspaces"
|
|
32
|
+
|
|
33
|
+
agents:
|
|
34
|
+
- id: "web_scraper_agent"
|
|
35
|
+
backend:
|
|
36
|
+
type: "claude_code"
|
|
37
|
+
model: "claude-sonnet-4-20250514"
|
|
38
|
+
cwd: "workspace1"
|
|
39
|
+
|
|
40
|
+
# Connect to crawl4ai MCP server
|
|
41
|
+
mcp_servers:
|
|
42
|
+
- name: "crawl4ai"
|
|
43
|
+
type: "sse" # Server-Sent Events transport
|
|
44
|
+
url: "http://localhost:11235/mcp/sse"
|
|
45
|
+
|
|
46
|
+
append_system_prompt: |
|
|
47
|
+
You are a web scraping specialist with access to the Crawl4AI toolset via MCP.
|
|
48
|
+
|
|
49
|
+
Available tools:
|
|
50
|
+
- md: Convert webpages to clean markdown (best for LLM consumption)
|
|
51
|
+
- html: Extract preprocessed HTML
|
|
52
|
+
- screenshot: Capture webpage as image
|
|
53
|
+
- pdf: Generate PDF from webpage
|
|
54
|
+
- execute_js: Run JavaScript on pages (for dynamic content)
|
|
55
|
+
- crawl: Scrape multiple URLs in parallel
|
|
56
|
+
|
|
57
|
+
When users ask to scrape, analyze, or extract web content:
|
|
58
|
+
1. Use 'md' tool for text-based content (articles, docs, etc.)
|
|
59
|
+
2. Use 'screenshot' for visual content or layout analysis
|
|
60
|
+
3. Use 'execute_js' for JavaScript-heavy sites
|
|
61
|
+
4. Use 'crawl' for multiple pages
|
|
62
|
+
|
|
63
|
+
Always provide clear summaries of scraped content.
|
|
64
|
+
|
|
65
|
+
ui:
|
|
66
|
+
display_type: "rich_terminal"
|
|
67
|
+
logging_enabled: true
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# MassGen Configuration: Multi-Agent Web Research with Crawl4AI
|
|
2
|
+
#
|
|
3
|
+
# This example demonstrates multiple agents sharing a single crawl4ai MCP server
|
|
4
|
+
# for collaborative web research and analysis.
|
|
5
|
+
#
|
|
6
|
+
# Prerequisites:
|
|
7
|
+
# 1. Start crawl4ai Docker container:
|
|
8
|
+
# docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:latest
|
|
9
|
+
#
|
|
10
|
+
# Usage:
|
|
11
|
+
# massgen --config massgen/configs/tools/custom_tools/crawl4ai_multi_agent_example.yaml "Research AI safety best practices by scraping relevant documentation sites and academic papers"
|
|
12
|
+
#
|
|
13
|
+
# What happens:
|
|
14
|
+
# - Multiple agents can scrape different URLs concurrently
|
|
15
|
+
# - The crawl4ai server handles up to 5 parallel crawls
|
|
16
|
+
# - Agents collaborate and vote on the synthesized research
|
|
17
|
+
|
|
18
|
+
orchestrator:
|
|
19
|
+
snapshot_storage: "snapshots"
|
|
20
|
+
agent_temporary_workspace: "temp_workspaces"
|
|
21
|
+
voting_sensitivity: "balanced"
|
|
22
|
+
answer_novelty_requirement: "balanced"
|
|
23
|
+
max_new_answers_per_agent: 3
|
|
24
|
+
|
|
25
|
+
agents:
|
|
26
|
+
- id: "research_agent_1"
|
|
27
|
+
backend:
|
|
28
|
+
type: "openai"
|
|
29
|
+
model: "gpt-5-nano"
|
|
30
|
+
cwd: "workspace1"
|
|
31
|
+
|
|
32
|
+
# All agents connect to the same crawl4ai container
|
|
33
|
+
mcp_servers:
|
|
34
|
+
- name: "crawl4ai"
|
|
35
|
+
type: "sse"
|
|
36
|
+
url: "http://localhost:11235/mcp/sse"
|
|
37
|
+
|
|
38
|
+
enable_web_search: true
|
|
39
|
+
|
|
40
|
+
system_message: |
|
|
41
|
+
You are a research specialist focused on finding authoritative sources and extracting key insights.
|
|
42
|
+
|
|
43
|
+
Use the crawl4ai tools to scrape documentation, articles, and research papers.
|
|
44
|
+
Prioritize official documentation and academic sources.
|
|
45
|
+
|
|
46
|
+
- id: "research_agent_2"
|
|
47
|
+
backend:
|
|
48
|
+
type: "gemini"
|
|
49
|
+
model: "gemini-2.5-pro"
|
|
50
|
+
cwd: "workspace2"
|
|
51
|
+
|
|
52
|
+
# Connects to the SAME crawl4ai container as agent_1
|
|
53
|
+
mcp_servers:
|
|
54
|
+
- name: "crawl4ai"
|
|
55
|
+
type: "sse"
|
|
56
|
+
url: "http://localhost:11235/mcp/sse"
|
|
57
|
+
|
|
58
|
+
enable_web_search: true
|
|
59
|
+
|
|
60
|
+
system_message: |
|
|
61
|
+
You are a synthesis specialist who combines information from multiple sources.
|
|
62
|
+
|
|
63
|
+
Use crawl4ai to gather diverse perspectives and cross-reference information.
|
|
64
|
+
Focus on finding patterns and connections across sources.
|
|
65
|
+
|
|
66
|
+
ui:
|
|
67
|
+
display_type: "rich_terminal"
|
|
68
|
+
logging_enabled: true
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# Usage: massgen --config massgen/configs/tools/custom_tools/multimodal_tools/playwright_with_img_understanding.yaml "Analyze docs.massgen.ai and tell me how to improve its design."
|
|
2
|
+
|
|
3
|
+
agents:
|
|
4
|
+
- id: agent_a
|
|
5
|
+
backend:
|
|
6
|
+
type: openai
|
|
7
|
+
model: gpt-5-codex
|
|
8
|
+
text:
|
|
9
|
+
verbosity: medium
|
|
10
|
+
reasoning:
|
|
11
|
+
effort: medium
|
|
12
|
+
summary: auto
|
|
13
|
+
cwd: workspace1
|
|
14
|
+
enable_mcp_command_line: true
|
|
15
|
+
command_line_execution_mode: docker
|
|
16
|
+
command_line_docker_network_mode: "bridge" # Enable network access (default: none)
|
|
17
|
+
enable_web_search: true
|
|
18
|
+
custom_tools:
|
|
19
|
+
- name: ["understand_image"]
|
|
20
|
+
category: "multimodal"
|
|
21
|
+
path: "massgen/tool/_multimodal_tools/understand_image.py"
|
|
22
|
+
function: ["understand_image"]
|
|
23
|
+
mcp_servers:
|
|
24
|
+
playwright:
|
|
25
|
+
type: "stdio"
|
|
26
|
+
command: "npx"
|
|
27
|
+
args: [
|
|
28
|
+
"@playwright/mcp@latest",
|
|
29
|
+
"--browser=chrome", # Use Chrome browser
|
|
30
|
+
"--caps=vision,pdf", # Enable vision and PDF capabilities
|
|
31
|
+
"--user-data-dir=${cwd}/playwright-profile", # Persistent browser profile within workspace
|
|
32
|
+
"--output-dir=${cwd}", # Save screenshots/PDFs directly to workspace
|
|
33
|
+
# "--save-trace" # Save Playwright traces for debugging
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
- id: agent_b
|
|
37
|
+
backend:
|
|
38
|
+
type: claude_code
|
|
39
|
+
model: claude-sonnet-4-5-20250929
|
|
40
|
+
cwd: workspace2
|
|
41
|
+
enable_mcp_command_line: true
|
|
42
|
+
command_line_execution_mode: docker
|
|
43
|
+
command_line_docker_network_mode: "bridge" # Enable network access (default: none)
|
|
44
|
+
custom_tools:
|
|
45
|
+
- name: ["understand_image"]
|
|
46
|
+
category: "multimodal"
|
|
47
|
+
path: "massgen/tool/_multimodal_tools/understand_image.py"
|
|
48
|
+
function: ["understand_image"]
|
|
49
|
+
mcp_servers:
|
|
50
|
+
playwright:
|
|
51
|
+
type: "stdio"
|
|
52
|
+
command: "npx"
|
|
53
|
+
args: [
|
|
54
|
+
"@playwright/mcp@latest",
|
|
55
|
+
"--browser=chrome", # Use Chrome browser
|
|
56
|
+
"--caps=vision,pdf", # Enable vision and PDF capabilities
|
|
57
|
+
"--user-data-dir=${cwd}/playwright-profile", # Persistent browser profile within workspace
|
|
58
|
+
"--output-dir=${cwd}", # Save screenshots/PDFs directly to workspace
|
|
59
|
+
# "--save-trace" # Save Playwright traces for debugging
|
|
60
|
+
]
|
|
61
|
+
|
|
62
|
+
- id: agent_c
|
|
63
|
+
backend:
|
|
64
|
+
type: chatcompletion
|
|
65
|
+
base_url: "https://openrouter.ai/api/v1"
|
|
66
|
+
model: qwen/qwen3-coder
|
|
67
|
+
cwd: workspace3
|
|
68
|
+
enable_mcp_command_line: true
|
|
69
|
+
command_line_execution_mode: docker
|
|
70
|
+
command_line_docker_network_mode: "bridge" # Enable network access (default: none)
|
|
71
|
+
custom_tools:
|
|
72
|
+
- name: ["understand_image"]
|
|
73
|
+
category: "multimodal"
|
|
74
|
+
path: "massgen/tool/_multimodal_tools/understand_image.py"
|
|
75
|
+
function: ["understand_image"]
|
|
76
|
+
mcp_servers:
|
|
77
|
+
playwright:
|
|
78
|
+
type: "stdio"
|
|
79
|
+
command: "npx"
|
|
80
|
+
args: [
|
|
81
|
+
"@playwright/mcp@latest",
|
|
82
|
+
"--browser=chrome", # Use Chrome browser
|
|
83
|
+
"--caps=vision,pdf", # Enable vision and PDF capabilities
|
|
84
|
+
"--user-data-dir=${cwd}/playwright-profile", # Persistent browser profile within workspace
|
|
85
|
+
"--output-dir=${cwd}", # Save screenshots/PDFs directly to workspace
|
|
86
|
+
# "--save-trace" # Save Playwright traces for debugging
|
|
87
|
+
]
|
|
88
|
+
|
|
89
|
+
ui:
|
|
90
|
+
display_type: rich_terminal
|
|
91
|
+
logging_enabled: true
|
|
92
|
+
orchestrator:
|
|
93
|
+
snapshot_storage: snapshots
|
|
94
|
+
agent_temporary_workspace: temp_workspaces
|
|
95
|
+
session_storage: sessions
|
|
96
|
+
# voting_sensitivity: balanced
|
|
97
|
+
max_new_answers_per_agent: 5
|
|
98
|
+
# answer_novelty_requirement: balanced
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# MassGen Configuration: Understand Audio Tool
|
|
2
|
+
# Usage:
|
|
3
|
+
# massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml "Please summarize the content in this audio."
|
|
4
|
+
agents:
|
|
5
|
+
- id: "understand_audio_tool"
|
|
6
|
+
backend:
|
|
7
|
+
type: "openai"
|
|
8
|
+
model: "gpt-5-nano"
|
|
9
|
+
cwd: "workspace1"
|
|
10
|
+
custom_tools:
|
|
11
|
+
- name: ["understand_audio"]
|
|
12
|
+
category: "multimodal"
|
|
13
|
+
path: "massgen/tool/_multimodal_tools/understand_audio.py"
|
|
14
|
+
function: ["understand_audio"]
|
|
15
|
+
system_message: |
|
|
16
|
+
You are an AI assistant with access to audio transcription capabilities.
|
|
17
|
+
|
|
18
|
+
The understand_audio tool is available to transcribe audio files to text using OpenAI's Transcription API.
|
|
19
|
+
Do not output tool call syntax or function declarations. Focus on answering the user's question clearly.
|
|
20
|
+
|
|
21
|
+
When users ask about transcribing or understanding audio files, use the understand_audio tool to
|
|
22
|
+
process the audio and provide the transcription.
|
|
23
|
+
|
|
24
|
+
orchestrator:
|
|
25
|
+
snapshot_storage: "snapshots"
|
|
26
|
+
agent_temporary_workspace: "temp_workspaces"
|
|
27
|
+
context_paths:
|
|
28
|
+
- path: "massgen/configs/resources/v0.1.3-example/Sherlock_Holmes.mp3"
|
|
29
|
+
permission: "read"
|
|
30
|
+
|
|
31
|
+
ui:
|
|
32
|
+
display_type: "rich_terminal"
|
|
33
|
+
logging_enabled: true
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# MassGen Configuration: Understand File Tool
|
|
2
|
+
# Usage:
|
|
3
|
+
# massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml "Please summarize the content in this file."
|
|
4
|
+
agents:
|
|
5
|
+
- id: "understand_file_tool"
|
|
6
|
+
backend:
|
|
7
|
+
type: "openai"
|
|
8
|
+
model: "gpt-5-nano"
|
|
9
|
+
cwd: "workspace1"
|
|
10
|
+
custom_tools:
|
|
11
|
+
- name: ["understand_file"]
|
|
12
|
+
category: "multimodal"
|
|
13
|
+
path: "massgen/tool/_multimodal_tools/understand_file.py"
|
|
14
|
+
function: ["understand_file"]
|
|
15
|
+
system_message: |
|
|
16
|
+
You are an AI assistant with access to file understanding capabilities.
|
|
17
|
+
|
|
18
|
+
The understand_file tool is available to analyze and understand file contents using OpenAI's gpt-4.1 API.
|
|
19
|
+
It supports text files, PDF, DOCX, XLSX, PPTX, and more.
|
|
20
|
+
Do not output tool call syntax or function declarations. Focus on answering the user's question clearly.
|
|
21
|
+
|
|
22
|
+
When users ask about analyzing or understanding files, use the understand_file tool to process
|
|
23
|
+
the file and provide detailed descriptions or answers to their questions.
|
|
24
|
+
|
|
25
|
+
orchestrator:
|
|
26
|
+
snapshot_storage: "snapshots"
|
|
27
|
+
agent_temporary_workspace: "temp_workspaces"
|
|
28
|
+
context_paths:
|
|
29
|
+
- path: "massgen/configs/resources/v0.1.3-example/TUMIX.pdf"
|
|
30
|
+
permission: "read"
|
|
31
|
+
|
|
32
|
+
ui:
|
|
33
|
+
display_type: "rich_terminal"
|
|
34
|
+
logging_enabled: true
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# MassGen Configuration: Understand Image Tool
|
|
2
|
+
# Usage:
|
|
3
|
+
# massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml "Please summarize the content in this image."
|
|
4
|
+
agents:
|
|
5
|
+
- id: "understand_image_tool"
|
|
6
|
+
backend:
|
|
7
|
+
type: "openai"
|
|
8
|
+
model: "gpt-5-nano"
|
|
9
|
+
cwd: "workspace1"
|
|
10
|
+
custom_tools:
|
|
11
|
+
- name: ["understand_image"]
|
|
12
|
+
category: "multimodal"
|
|
13
|
+
path: "massgen/tool/_multimodal_tools/understand_image.py"
|
|
14
|
+
function: ["understand_image"]
|
|
15
|
+
system_message: |
|
|
16
|
+
You are an AI assistant with access to image understanding capabilities.
|
|
17
|
+
|
|
18
|
+
The understand_image tool is available to analyze and understand images using OpenAI's gpt-4.1 API.
|
|
19
|
+
Do not output tool call syntax or function declarations. Focus on answering the user's question clearly.
|
|
20
|
+
|
|
21
|
+
When users ask about analyzing or understanding images, use the understand_image tool to process
|
|
22
|
+
the image and provide detailed descriptions or answers to their questions.
|
|
23
|
+
|
|
24
|
+
orchestrator:
|
|
25
|
+
snapshot_storage: "snapshots"
|
|
26
|
+
agent_temporary_workspace: "temp_workspaces"
|
|
27
|
+
context_paths:
|
|
28
|
+
- path: "massgen/configs/resources/v0.1.3-example/multimodality.jpg"
|
|
29
|
+
permission: "read"
|
|
30
|
+
|
|
31
|
+
ui:
|
|
32
|
+
display_type: "rich_terminal"
|
|
33
|
+
logging_enabled: true
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# MassGen Configuration: Understand Video Tool
|
|
2
|
+
# Usage:
|
|
3
|
+
# massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml "What's happening in this video?"
|
|
4
|
+
agents:
|
|
5
|
+
- id: "understand_video_tool"
|
|
6
|
+
backend:
|
|
7
|
+
type: "openai"
|
|
8
|
+
model: "gpt-5-nano"
|
|
9
|
+
cwd: "workspace1"
|
|
10
|
+
custom_tools:
|
|
11
|
+
- name: ["understand_video"]
|
|
12
|
+
category: "multimodal"
|
|
13
|
+
path: "massgen/tool/_multimodal_tools/understand_video.py"
|
|
14
|
+
function: ["understand_video"]
|
|
15
|
+
system_message: |
|
|
16
|
+
You are an AI assistant with access to video understanding capabilities.
|
|
17
|
+
|
|
18
|
+
The understand_video tool is available to analyze and understand videos by extracting key frames
|
|
19
|
+
and using OpenAI's gpt-4.1 API.
|
|
20
|
+
Do not output tool call syntax or function declarations. Focus on answering the user's question clearly.
|
|
21
|
+
|
|
22
|
+
When users ask about analyzing or understanding videos, use the understand_video tool to process
|
|
23
|
+
the video and provide detailed descriptions or answers to their questions.
|
|
24
|
+
|
|
25
|
+
orchestrator:
|
|
26
|
+
snapshot_storage: "snapshots"
|
|
27
|
+
agent_temporary_workspace: "temp_workspaces"
|
|
28
|
+
context_paths:
|
|
29
|
+
- path: "massgen/configs/resources/v0.1.3-example/oppenheimer_trailer_1920.mp4"
|
|
30
|
+
permission: "read"
|
|
31
|
+
|
|
32
|
+
ui:
|
|
33
|
+
display_type: "rich_terminal"
|
|
34
|
+
logging_enabled: true
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# MassGen Configuration: Understand Video Example
|
|
2
|
+
#
|
|
3
|
+
# Use Case: Analyze a specific video file using the understand_video tool
|
|
4
|
+
#
|
|
5
|
+
# This demonstrates direct video analysis without needing to download the video first.
|
|
6
|
+
# The video file is provided as a context path for agents to analyze.
|
|
7
|
+
#
|
|
8
|
+
# Run with:
|
|
9
|
+
# uv run massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_video_example.yaml "What is shown in this video?"
|
|
10
|
+
|
|
11
|
+
agents:
|
|
12
|
+
- id: "agent_a"
|
|
13
|
+
backend:
|
|
14
|
+
type: "openai"
|
|
15
|
+
model: "gpt-5-mini"
|
|
16
|
+
text:
|
|
17
|
+
verbosity: "medium"
|
|
18
|
+
reasoning:
|
|
19
|
+
effort: "medium"
|
|
20
|
+
summary: "auto"
|
|
21
|
+
custom_tools:
|
|
22
|
+
- name: ["understand_video"]
|
|
23
|
+
category: "multimodal"
|
|
24
|
+
path: "massgen/tool/_multimodal_tools/understand_video.py"
|
|
25
|
+
function: ["understand_video"]
|
|
26
|
+
cwd: "workspace1"
|
|
27
|
+
|
|
28
|
+
- id: "agent_b"
|
|
29
|
+
backend:
|
|
30
|
+
type: "gemini"
|
|
31
|
+
model: "gemini-2.5-pro"
|
|
32
|
+
custom_tools:
|
|
33
|
+
- name: ["understand_video"]
|
|
34
|
+
category: "multimodal"
|
|
35
|
+
path: "massgen/tool/_multimodal_tools/understand_video.py"
|
|
36
|
+
function: ["understand_video"]
|
|
37
|
+
cwd: "workspace2"
|
|
38
|
+
|
|
39
|
+
orchestrator:
|
|
40
|
+
snapshot_storage: "snapshots"
|
|
41
|
+
agent_temporary_workspace: "temp_workspaces"
|
|
42
|
+
context_paths:
|
|
43
|
+
- path: "massgen/configs/resources/v0.1.3-example/case-study-videos/Dp2oldJJImw.mp4"
|
|
44
|
+
permission: "read"
|
|
45
|
+
|
|
46
|
+
ui:
|
|
47
|
+
display_type: "rich_terminal"
|
|
48
|
+
logging_enabled: true
|
|
49
|
+
|
|
50
|
+
# What happens:
|
|
51
|
+
# 1. Agents have read access to the video file
|
|
52
|
+
# 2. They can use understand_video tool to analyze it
|
|
53
|
+
# 3. The tool extracts 8 frames and analyzes them with GPT-4.1
|
|
54
|
+
# 4. Agents collaborate to provide comprehensive insights
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# MassGen Configuration: YouTube Video Analysis with Multimodal Understanding
|
|
2
|
+
#
|
|
3
|
+
# Use Case: Download and analyze YouTube videos from MassGen case studies
|
|
4
|
+
#
|
|
5
|
+
# This demonstrates MassGen's self-evolution capabilities by having agents:
|
|
6
|
+
# 1. Read local case study documentation to discover video URLs
|
|
7
|
+
# 2. Download YouTube videos using yt-dlp via command-line execution
|
|
8
|
+
# 3. Analyze video content using the understand_video multimodal tool
|
|
9
|
+
# 4. Extract insights that could inform future feature development
|
|
10
|
+
#
|
|
11
|
+
# Run with:
|
|
12
|
+
# uv run massgen --config massgen/configs/tools/custom_tools/multimodal_tools/youtube_video_analysis.yaml "Download recent MassGen case study videos listed in the case study md files, analyze them, find out how to improve them and automate their creation."
|
|
13
|
+
|
|
14
|
+
agents:
|
|
15
|
+
- id: "agent_a"
|
|
16
|
+
backend:
|
|
17
|
+
type: "openai"
|
|
18
|
+
model: "gpt-5-mini"
|
|
19
|
+
text:
|
|
20
|
+
verbosity: "medium"
|
|
21
|
+
reasoning:
|
|
22
|
+
effort: "medium"
|
|
23
|
+
summary: "auto"
|
|
24
|
+
custom_tools:
|
|
25
|
+
- name: ["understand_video"]
|
|
26
|
+
category: "multimodal"
|
|
27
|
+
path: "massgen/tool/_multimodal_tools/understand_video.py"
|
|
28
|
+
function: ["understand_video"]
|
|
29
|
+
enable_mcp_command_line: true
|
|
30
|
+
command_line_execution_mode: docker
|
|
31
|
+
command_line_docker_enable_sudo: true
|
|
32
|
+
command_line_docker_network_mode: "bridge"
|
|
33
|
+
cwd: "workspace1"
|
|
34
|
+
|
|
35
|
+
- id: "agent_b"
|
|
36
|
+
backend:
|
|
37
|
+
type: "claude_code"
|
|
38
|
+
model: "claude-sonnet-4-5-20250929"
|
|
39
|
+
custom_tools:
|
|
40
|
+
- name: ["understand_video"]
|
|
41
|
+
category: "multimodal"
|
|
42
|
+
path: "massgen/tool/_multimodal_tools/understand_video.py"
|
|
43
|
+
function: ["understand_video"]
|
|
44
|
+
enable_mcp_command_line: true
|
|
45
|
+
command_line_execution_mode: docker
|
|
46
|
+
command_line_docker_enable_sudo: true
|
|
47
|
+
command_line_docker_network_mode: "bridge"
|
|
48
|
+
cwd: "workspace2"
|
|
49
|
+
|
|
50
|
+
orchestrator:
|
|
51
|
+
snapshot_storage: "snapshots"
|
|
52
|
+
agent_temporary_workspace: "temp_workspaces"
|
|
53
|
+
context_paths:
|
|
54
|
+
- path: "docs/case_studies"
|
|
55
|
+
permission: "read"
|
|
56
|
+
|
|
57
|
+
ui:
|
|
58
|
+
display_type: "rich_terminal"
|
|
59
|
+
logging_enabled: true
|