massgen 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of massgen might be problematic. Click here for more details.
- massgen/__init__.py +1 -1
- massgen/api_params_handler/_chat_completions_api_params_handler.py +4 -0
- massgen/api_params_handler/_claude_api_params_handler.py +4 -0
- massgen/api_params_handler/_gemini_api_params_handler.py +4 -0
- massgen/api_params_handler/_response_api_params_handler.py +4 -0
- massgen/backend/base_with_custom_tool_and_mcp.py +25 -5
- massgen/backend/docs/permissions_and_context_files.md +2 -2
- massgen/backend/response.py +2 -0
- massgen/configs/README.md +49 -40
- massgen/configs/tools/custom_tools/crawl4ai_example.yaml +55 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_multi.yaml +61 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_single.yaml +29 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_multi.yaml +51 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_single.yaml +33 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_multi.yaml +55 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_single.yaml +33 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_multi.yaml +47 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_single.yaml +29 -0
- massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml +1 -1
- massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml +1 -1
- massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml +1 -1
- massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml +1 -1
- massgen/configs/tools/custom_tools/multimodal_tools/youtube_video_analysis.yaml +1 -1
- massgen/filesystem_manager/_filesystem_manager.py +1 -0
- massgen/filesystem_manager/_path_permission_manager.py +148 -0
- massgen/message_templates.py +160 -12
- massgen/orchestrator.py +16 -0
- massgen/tests/test_binary_file_blocking.py +274 -0
- massgen/tests/test_case_studies.md +12 -12
- massgen/tests/test_multimodal_size_limits.py +407 -0
- massgen/tool/_manager.py +7 -2
- massgen/tool/_multimodal_tools/image_to_image_generation.py +293 -0
- massgen/tool/_multimodal_tools/text_to_file_generation.py +455 -0
- massgen/tool/_multimodal_tools/text_to_image_generation.py +222 -0
- massgen/tool/_multimodal_tools/text_to_speech_continue_generation.py +226 -0
- massgen/tool/_multimodal_tools/text_to_speech_transcription_generation.py +217 -0
- massgen/tool/_multimodal_tools/text_to_video_generation.py +223 -0
- massgen/tool/_multimodal_tools/understand_audio.py +19 -1
- massgen/tool/_multimodal_tools/understand_file.py +6 -1
- massgen/tool/_multimodal_tools/understand_image.py +112 -8
- massgen/tool/_multimodal_tools/understand_video.py +32 -5
- massgen/tool/_web_tools/crawl4ai_tool.py +718 -0
- massgen/tool/docs/multimodal_tools.md +589 -0
- {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/METADATA +96 -69
- {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/RECORD +49 -40
- massgen/configs/tools/custom_tools/crawl4ai_mcp_example.yaml +0 -67
- massgen/configs/tools/custom_tools/crawl4ai_multi_agent_example.yaml +0 -68
- massgen/configs/tools/custom_tools/multimodal_tools/playwright_with_img_understanding.yaml +0 -98
- massgen/configs/tools/custom_tools/multimodal_tools/understand_video_example.yaml +0 -54
- massgen/configs/tools/memory/README.md +0 -199
- massgen/configs/tools/memory/gpt5mini_gemini_context_window_management.yaml +0 -131
- massgen/configs/tools/memory/gpt5mini_gemini_no_persistent_memory.yaml +0 -133
- massgen/configs/tools/memory/test_context_window_management.py +0 -286
- massgen/configs/tools/multimodal/gpt5mini_gpt5nano_documentation_evolution.yaml +0 -97
- {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/WHEEL +0 -0
- {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/entry_points.txt +0 -0
- {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# MassGen Configuration: Text to Speech Continue Generation Tool
|
|
2
|
+
# Usage:
|
|
3
|
+
# uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_single.yaml "I want you to tell me a very short introduction about Sherlock Holmes in one sentence, and I want you to use an emotional voice to read it out loud."
|
|
4
|
+
agents:
|
|
5
|
+
- id: "text_to_speech_continue_generation_tool"
|
|
6
|
+
backend:
|
|
7
|
+
type: "openai"
|
|
8
|
+
model: "gpt-4o"
|
|
9
|
+
cwd: "workspace1"
|
|
10
|
+
enable_audio_generation: true
|
|
11
|
+
custom_tools:
|
|
12
|
+
- name: ["text_to_speech_transcription_generation"]
|
|
13
|
+
category: "multimodal"
|
|
14
|
+
path: "massgen/tool/_multimodal_tools/text_to_speech_transcription_generation.py"
|
|
15
|
+
function: ["text_to_speech_transcription_generation"]
|
|
16
|
+
- name: ["understand_audio"]
|
|
17
|
+
category: "multimodal"
|
|
18
|
+
path: "massgen/tool/_multimodal_tools/understand_audio.py"
|
|
19
|
+
function: ["understand_audio"]
|
|
20
|
+
- name: ["text_to_speech_continue_generation"]
|
|
21
|
+
category: "multimodal"
|
|
22
|
+
path: "massgen/tool/_multimodal_tools/text_to_speech_continue_generation.py"
|
|
23
|
+
function: ["text_to_speech_continue_generation"]
|
|
24
|
+
system_message: |
|
|
25
|
+
You are an AI assistant with access to text-to-speech generation capabilities.
|
|
26
|
+
|
|
27
|
+
orchestrator:
|
|
28
|
+
snapshot_storage: "snapshots"
|
|
29
|
+
agent_temporary_workspace: "temp_workspaces"
|
|
30
|
+
|
|
31
|
+
ui:
|
|
32
|
+
display_type: "simple"
|
|
33
|
+
logging_enabled: true
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# MassGen Configuration: Text to Video Generation Tool
|
|
2
|
+
# Usage:
|
|
3
|
+
# uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_multi.yaml "Generate a 4-second video of a neon-lit alley at night, light rain, slow push-in, cinematic."
|
|
4
|
+
agents:
|
|
5
|
+
- id: "text_to_video_generation_tool1"
|
|
6
|
+
backend:
|
|
7
|
+
type: "openai"
|
|
8
|
+
model: "gpt-4o"
|
|
9
|
+
cwd: "workspace1"
|
|
10
|
+
enable_video_generation: true
|
|
11
|
+
custom_tools:
|
|
12
|
+
- name: ["understand_video"]
|
|
13
|
+
category: "multimodal"
|
|
14
|
+
path: "massgen/tool/_multimodal_tools/understand_video.py"
|
|
15
|
+
function: ["understand_video"]
|
|
16
|
+
- name: ["text_to_video_generation"]
|
|
17
|
+
category: "multimodal"
|
|
18
|
+
path: "massgen/tool/_multimodal_tools/text_to_video_generation.py"
|
|
19
|
+
function: ["text_to_video_generation"]
|
|
20
|
+
system_message: |
|
|
21
|
+
You are an AI assistant with access to text-to-video generation capabilities.
|
|
22
|
+
|
|
23
|
+
- id: "text_to_video_generation_tool2"
|
|
24
|
+
backend:
|
|
25
|
+
type: "openai"
|
|
26
|
+
model: "gpt-4o"
|
|
27
|
+
cwd: "workspace2"
|
|
28
|
+
enable_video_generation: true
|
|
29
|
+
custom_tools:
|
|
30
|
+
- name: ["understand_video"]
|
|
31
|
+
category: "multimodal"
|
|
32
|
+
path: "massgen/tool/_multimodal_tools/understand_video.py"
|
|
33
|
+
function: ["understand_video"]
|
|
34
|
+
- name: ["text_to_video_generation"]
|
|
35
|
+
category: "multimodal"
|
|
36
|
+
path: "massgen/tool/_multimodal_tools/text_to_video_generation.py"
|
|
37
|
+
function: ["text_to_video_generation"]
|
|
38
|
+
system_message: |
|
|
39
|
+
You are an AI assistant with access to text-to-video generation capabilities.
|
|
40
|
+
|
|
41
|
+
orchestrator:
|
|
42
|
+
snapshot_storage: "snapshots"
|
|
43
|
+
agent_temporary_workspace: "temp_workspaces"
|
|
44
|
+
|
|
45
|
+
ui:
|
|
46
|
+
display_type: "rich_terminal"
|
|
47
|
+
logging_enabled: true
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# MassGen Configuration: Text to Video Generation Tool
|
|
2
|
+
# Usage:
|
|
3
|
+
# uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_single.yaml "Generate a 4-second video of a neon-lit alley at night, light rain, slow push-in, cinematic."
|
|
4
|
+
agents:
|
|
5
|
+
- id: "text_to_video_generation_tool"
|
|
6
|
+
backend:
|
|
7
|
+
type: "openai"
|
|
8
|
+
model: "gpt-4o"
|
|
9
|
+
cwd: "workspace1"
|
|
10
|
+
enable_video_generation: true
|
|
11
|
+
custom_tools:
|
|
12
|
+
- name: ["understand_video"]
|
|
13
|
+
category: "multimodal"
|
|
14
|
+
path: "massgen/tool/_multimodal_tools/understand_video.py"
|
|
15
|
+
function: ["understand_video"]
|
|
16
|
+
- name: ["text_to_video_generation"]
|
|
17
|
+
category: "multimodal"
|
|
18
|
+
path: "massgen/tool/_multimodal_tools/text_to_video_generation.py"
|
|
19
|
+
function: ["text_to_video_generation"]
|
|
20
|
+
system_message: |
|
|
21
|
+
You are an AI assistant with access to text-to-video generation capabilities.
|
|
22
|
+
|
|
23
|
+
orchestrator:
|
|
24
|
+
snapshot_storage: "snapshots"
|
|
25
|
+
agent_temporary_workspace: "temp_workspaces"
|
|
26
|
+
|
|
27
|
+
ui:
|
|
28
|
+
display_type: "simple"
|
|
29
|
+
logging_enabled: true
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# MassGen Configuration: Understand Audio Tool
|
|
2
2
|
# Usage:
|
|
3
|
-
# massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml "Please summarize the content in this audio."
|
|
3
|
+
# uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml "Please summarize the content in this audio."
|
|
4
4
|
agents:
|
|
5
5
|
- id: "understand_audio_tool"
|
|
6
6
|
backend:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# MassGen Configuration: Understand File Tool
|
|
2
2
|
# Usage:
|
|
3
|
-
# massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml "Please summarize the content in this file."
|
|
3
|
+
# uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml "Please summarize the content in this file."
|
|
4
4
|
agents:
|
|
5
5
|
- id: "understand_file_tool"
|
|
6
6
|
backend:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# MassGen Configuration: Understand Image Tool
|
|
2
2
|
# Usage:
|
|
3
|
-
# massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml "Please summarize the content in this image."
|
|
3
|
+
# uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml "Please summarize the content in this image."
|
|
4
4
|
agents:
|
|
5
5
|
- id: "understand_image_tool"
|
|
6
6
|
backend:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# MassGen Configuration: Understand Video Tool
|
|
2
2
|
# Usage:
|
|
3
|
-
# massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml "What's happening in this video?"
|
|
3
|
+
# uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml "What's happening in this video?"
|
|
4
4
|
agents:
|
|
5
5
|
- id: "understand_video_tool"
|
|
6
6
|
backend:
|
|
@@ -90,6 +90,68 @@ class PathPermissionManager:
|
|
|
90
90
|
"massgen_logs",
|
|
91
91
|
]
|
|
92
92
|
|
|
93
|
+
# Binary file extensions that should not be read by text-based tools
|
|
94
|
+
# These files should be handled by specialized tools (understand_image, understand_video, etc.)
|
|
95
|
+
BINARY_FILE_EXTENSIONS = {
|
|
96
|
+
# Images
|
|
97
|
+
".jpg",
|
|
98
|
+
".jpeg",
|
|
99
|
+
".png",
|
|
100
|
+
".gif",
|
|
101
|
+
".bmp",
|
|
102
|
+
".ico",
|
|
103
|
+
".svg",
|
|
104
|
+
".webp",
|
|
105
|
+
".tiff",
|
|
106
|
+
".tif",
|
|
107
|
+
# Videos
|
|
108
|
+
".mp4",
|
|
109
|
+
".avi",
|
|
110
|
+
".mov",
|
|
111
|
+
".mkv",
|
|
112
|
+
".flv",
|
|
113
|
+
".wmv",
|
|
114
|
+
".webm",
|
|
115
|
+
".m4v",
|
|
116
|
+
".mpg",
|
|
117
|
+
".mpeg",
|
|
118
|
+
# Audio
|
|
119
|
+
".mp3",
|
|
120
|
+
".wav",
|
|
121
|
+
".ogg",
|
|
122
|
+
".flac",
|
|
123
|
+
".aac",
|
|
124
|
+
".m4a",
|
|
125
|
+
".wma",
|
|
126
|
+
# Archives
|
|
127
|
+
".zip",
|
|
128
|
+
".tar",
|
|
129
|
+
".gz",
|
|
130
|
+
".bz2",
|
|
131
|
+
".7z",
|
|
132
|
+
".rar",
|
|
133
|
+
".xz",
|
|
134
|
+
# Executables and binaries
|
|
135
|
+
".exe",
|
|
136
|
+
".bin",
|
|
137
|
+
".dll",
|
|
138
|
+
".so",
|
|
139
|
+
".dylib",
|
|
140
|
+
".o",
|
|
141
|
+
".a",
|
|
142
|
+
".pyc",
|
|
143
|
+
".class",
|
|
144
|
+
".jar",
|
|
145
|
+
# Binary document formats (PDF and Office) - use the understand_file tool where the format is supported; see per-extension notes
|
|
146
|
+
".doc", # Old Word (not supported by understand_file)
|
|
147
|
+
".xls", # Old Excel (not supported by understand_file)
|
|
148
|
+
".ppt", # Old PowerPoint (not supported by understand_file)
|
|
149
|
+
".pdf", # PDF (supported by understand_file with PyPDF2)
|
|
150
|
+
".docx", # Word (supported by understand_file with python-docx)
|
|
151
|
+
".xlsx", # Excel (supported by understand_file with openpyxl)
|
|
152
|
+
".pptx", # PowerPoint (supported by understand_file with python-pptx)
|
|
153
|
+
}
|
|
154
|
+
|
|
93
155
|
def __init__(
|
|
94
156
|
self,
|
|
95
157
|
context_write_access_enabled: bool = False,
|
|
@@ -440,6 +502,12 @@ class PathPermissionManager:
|
|
|
440
502
|
- allowed: Whether the tool call should proceed
|
|
441
503
|
- reason: Explanation if blocked (None if allowed)
|
|
442
504
|
"""
|
|
505
|
+
# Check if read tool is trying to read binary files (images, videos, etc.)
|
|
506
|
+
if self._is_text_read_tool(tool_name):
|
|
507
|
+
binary_check_result = self._validate_binary_file_access(tool_name, tool_args)
|
|
508
|
+
if not binary_check_result[0]:
|
|
509
|
+
return binary_check_result
|
|
510
|
+
|
|
443
511
|
# Track read operations for read-before-delete enforcement
|
|
444
512
|
if self._is_read_tool(tool_name):
|
|
445
513
|
self._track_read_operation(tool_name, tool_args)
|
|
@@ -495,6 +563,33 @@ class PathPermissionManager:
|
|
|
495
563
|
|
|
496
564
|
return False
|
|
497
565
|
|
|
566
|
+
def _is_text_read_tool(self, tool_name: str) -> bool:
|
|
567
|
+
"""
|
|
568
|
+
Check if a tool is a text-based read operation that should not access binary files.
|
|
569
|
+
|
|
570
|
+
These tools are designed for reading text files and should be blocked from
|
|
571
|
+
reading binary files (images, videos, audio, etc.) to prevent context pollution.
|
|
572
|
+
|
|
573
|
+
Tools that read text file contents:
|
|
574
|
+
- Read: Claude Code read tool
|
|
575
|
+
- read_text_file: MCP filesystem read tool
|
|
576
|
+
- read_file: Generic read operations
|
|
577
|
+
"""
|
|
578
|
+
# Use lowercase for case-insensitive matching
|
|
579
|
+
tool_lower = tool_name.lower()
|
|
580
|
+
|
|
581
|
+
# Check if tool name contains any text read operation keywords
|
|
582
|
+
text_read_keywords = [
|
|
583
|
+
"read_text_file", # MCP filesystem: read_text_file
|
|
584
|
+
"read_file", # Generic read operations
|
|
585
|
+
]
|
|
586
|
+
|
|
587
|
+
# Also check for exact "Read" match (Claude Code tool)
|
|
588
|
+
if tool_name == "Read":
|
|
589
|
+
return True
|
|
590
|
+
|
|
591
|
+
return any(keyword in tool_lower for keyword in text_read_keywords)
|
|
592
|
+
|
|
498
593
|
def _is_read_tool(self, tool_name: str) -> bool:
|
|
499
594
|
"""
|
|
500
595
|
Check if a tool is a read operation that should be tracked.
|
|
@@ -518,6 +613,59 @@ class PathPermissionManager:
|
|
|
518
613
|
|
|
519
614
|
return any(keyword in tool_lower for keyword in read_keywords)
|
|
520
615
|
|
|
616
|
+
def _validate_binary_file_access(self, tool_name: str, tool_args: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
|
|
617
|
+
"""
|
|
618
|
+
Validate that text-based read tools are not trying to read binary files.
|
|
619
|
+
|
|
620
|
+
Binary files (images, videos, audio, etc.) should be handled by specialized tools
|
|
621
|
+
to prevent context pollution with binary data.
|
|
622
|
+
|
|
623
|
+
Args:
|
|
624
|
+
tool_name: Name of the tool being called
|
|
625
|
+
tool_args: Arguments passed to the tool
|
|
626
|
+
|
|
627
|
+
Returns:
|
|
628
|
+
Tuple of (allowed: bool, reason: Optional[str])
|
|
629
|
+
- allowed: False if trying to read binary file, True otherwise
|
|
630
|
+
- reason: Explanation if blocked (None if allowed)
|
|
631
|
+
"""
|
|
632
|
+
# Extract file path from arguments
|
|
633
|
+
file_path = self._extract_file_path(tool_args)
|
|
634
|
+
if not file_path:
|
|
635
|
+
# Can't determine path - allow (tool may not access files)
|
|
636
|
+
return (True, None)
|
|
637
|
+
|
|
638
|
+
# Resolve path
|
|
639
|
+
try:
|
|
640
|
+
file_path_str = self._resolve_path_against_workspace(file_path)
|
|
641
|
+
path = Path(file_path_str)
|
|
642
|
+
except Exception:
|
|
643
|
+
# If path resolution fails, allow (will fail elsewhere if invalid)
|
|
644
|
+
return (True, None)
|
|
645
|
+
|
|
646
|
+
# Check file extension
|
|
647
|
+
file_extension = path.suffix.lower()
|
|
648
|
+
if file_extension in self.BINARY_FILE_EXTENSIONS:
|
|
649
|
+
# Determine appropriate tool suggestion based on file type
|
|
650
|
+
if file_extension in {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".ico", ".svg", ".webp", ".tiff", ".tif"}:
|
|
651
|
+
suggestion = "For images, use understand_image tool"
|
|
652
|
+
elif file_extension in {".mp4", ".avi", ".mov", ".mkv", ".flv", ".wmv", ".webm", ".m4v", ".mpg", ".mpeg"}:
|
|
653
|
+
suggestion = "For videos, use understand_video tool"
|
|
654
|
+
elif file_extension in {".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".wma"}:
|
|
655
|
+
suggestion = "For audio files, use generate_text_with_input_audio tool"
|
|
656
|
+
elif file_extension in {".pdf"}:
|
|
657
|
+
suggestion = "For PDF files, use understand_file tool"
|
|
658
|
+
elif file_extension in {".docx", ".xlsx", ".pptx"}:
|
|
659
|
+
suggestion = "For Office documents, use understand_file tool"
|
|
660
|
+
else:
|
|
661
|
+
suggestion = "Use appropriate specialized tool for this file type"
|
|
662
|
+
|
|
663
|
+
reason = f"Cannot read binary file '{path.name}' with {tool_name}. {suggestion}."
|
|
664
|
+
logger.warning(f"[PathPermissionManager] Blocked {tool_name} from reading binary file: {path}")
|
|
665
|
+
return (False, reason)
|
|
666
|
+
|
|
667
|
+
return (True, None)
|
|
668
|
+
|
|
521
669
|
def _is_delete_tool(self, tool_name: str) -> bool:
|
|
522
670
|
"""
|
|
523
671
|
Check if a tool is a delete operation.
|
massgen/message_templates.py
CHANGED
|
@@ -302,6 +302,8 @@ IMPORTANT: You are responding to the latest message in an ongoing conversation.
|
|
|
302
302
|
original_system_message: Optional[str] = None,
|
|
303
303
|
enable_image_generation: bool = False,
|
|
304
304
|
enable_audio_generation: bool = False,
|
|
305
|
+
enable_file_generation: bool = False,
|
|
306
|
+
enable_video_generation: bool = False,
|
|
305
307
|
has_irreversible_actions: bool = False,
|
|
306
308
|
enable_command_execution: bool = False,
|
|
307
309
|
) -> str:
|
|
@@ -311,6 +313,8 @@ IMPORTANT: You are responding to the latest message in an ongoing conversation.
|
|
|
311
313
|
original_system_message: The agent's original system message to preserve
|
|
312
314
|
enable_image_generation: Whether image generation is enabled
|
|
313
315
|
enable_audio_generation: Whether audio generation is enabled
|
|
316
|
+
enable_file_generation: Whether file generation is enabled
|
|
317
|
+
enable_video_generation: Whether video generation is enabled
|
|
314
318
|
has_irreversible_actions: Whether agent has write access to context paths (requires actual file delivery)
|
|
315
319
|
enable_command_execution: Whether command execution is enabled for this agent
|
|
316
320
|
"""
|
|
@@ -335,21 +339,165 @@ Present the best possible coordinated answer by combining the strengths from all
|
|
|
335
339
|
# Add image generation instructions only if enabled
|
|
336
340
|
if enable_image_generation:
|
|
337
341
|
presentation_instructions += """For image generation tasks:
|
|
338
|
-
|
|
339
|
-
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
342
|
+
|
|
343
|
+
**MANDATORY WORKFLOW - You MUST follow these steps in order:**
|
|
344
|
+
|
|
345
|
+
Step 1: **Check for existing images (REQUIRED)**
|
|
346
|
+
- First, list all files in the Shared Reference directory (temp_workspaces) to find ALL images from EVERY agent
|
|
347
|
+
- Look for image files (.png, .jpg, .jpeg, .gif, .webp, etc.) in each agent's workspace subdirectory
|
|
348
|
+
|
|
349
|
+
Step 2: **Understand ALL existing images (REQUIRED if images exist)**
|
|
350
|
+
- For EACH image file you found, you MUST call the **understand_image** tool to extract its key visual elements, composition, style, and quality
|
|
351
|
+
- Do this for images from yourself AND from other agents - analyze ALL images found
|
|
352
|
+
- DO NOT skip this step even if you think you know the content
|
|
353
|
+
|
|
354
|
+
Step 3: **Synthesize and generate final image (REQUIRED)**
|
|
355
|
+
- If existing images were found and analyzed:
|
|
356
|
+
* Synthesize ALL image analyses into a single, detailed, combined prompt
|
|
357
|
+
* The combined prompt should capture the best visual elements, composition, style, and quality from all analyzed images
|
|
358
|
+
* Call **image_to_image_generation** with this synthesized prompt and ALL images to create the final unified image
|
|
359
|
+
- If NO existing images were found:
|
|
360
|
+
* Generate a new image based directly on the original task requirements
|
|
361
|
+
* Call **text_to_image_generation** with a prompt derived from the original task
|
|
362
|
+
|
|
363
|
+
Step 4: **Save and report (REQUIRED)**
|
|
364
|
+
- Save the final generated image in your workspace
|
|
365
|
+
- Report the saved path in your final answer
|
|
366
|
+
|
|
367
|
+
**CRITICAL**: You MUST complete Steps 1-4 in order. Do not skip checking for existing images. Do not skip calling
|
|
368
|
+
understand_image on found images. This is a mandatory synthesis workflow.
|
|
369
|
+
"""
|
|
370
|
+
# presentation_instructions += """For image generation tasks:
|
|
371
|
+
# - Extract image paths from the existing answer and resolve them in the shared reference.
|
|
372
|
+
# - Gather all agent-produced images (ignore non-existent files).
|
|
373
|
+
# - IMPORTANT: If you find ANY existing images (from yourself or other agents), you MUST call the understand_image tool
|
|
374
|
+
# to analyze EACH image and extract their key visual elements, composition, style, and quality.
|
|
375
|
+
# - IMPORTANT: Synthesize insights from all analyzed images into a detailed, combined prompt that captures the best elements.
|
|
376
|
+
# - IMPORTANT: Call text_to_image_generation with this synthesized prompt to generate the final image.
|
|
377
|
+
# - IMPORTANT: Save the final output in your workspace and output the saved path.
|
|
378
|
+
# - If no existing images are found, generate based on the original task requirements.
|
|
379
|
+
# """
|
|
343
380
|
# Add audio generation instructions only if enabled
|
|
344
381
|
if enable_audio_generation:
|
|
345
382
|
presentation_instructions += """For audio generation tasks:
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
-
|
|
351
|
-
-
|
|
352
|
-
|
|
383
|
+
|
|
384
|
+
**MANDATORY WORKFLOW - You MUST follow these steps in order:**
|
|
385
|
+
|
|
386
|
+
Step 1: **Check for existing audios (REQUIRED)**
|
|
387
|
+
- First, list all files in the Shared Reference directory (temp_workspaces) to find ALL audio files from EVERY agent
|
|
388
|
+
- Look for audio files (.mp3, .wav, .flac, etc.) in each agent's workspace subdirectory
|
|
389
|
+
|
|
390
|
+
Step 2: **Understand ALL existing audios (REQUIRED if audios exist)**
|
|
391
|
+
- For EACH audio file you found, you MUST call the **understand_audio** tool to extract its transcription
|
|
392
|
+
- Do this for audios from yourself AND from other agents - analyze ALL audios found
|
|
393
|
+
- DO NOT skip this step even if you think you know the content
|
|
394
|
+
|
|
395
|
+
Step 3: **Synthesize and generate final audio (REQUIRED)**
|
|
396
|
+
- If existing audios were found and analyzed:
|
|
397
|
+
* Synthesize ALL audio transcriptions into a single, detailed, combined transcription
|
|
398
|
+
* The combined transcription should capture the best content from all analyzed audios
|
|
399
|
+
* Call **text_to_speech_transcription_generation** with this synthesized transcription to create the final unified audio
|
|
400
|
+
- If NO existing audios were found:
|
|
401
|
+
* Generate a new audio based directly on the original task requirements
|
|
402
|
+
* Call **text_to_speech_transcription_generation** with a transcription derived from the original task
|
|
403
|
+
|
|
404
|
+
Step 4: **Save and report (REQUIRED)**
|
|
405
|
+
- Save the final generated audio in your workspace
|
|
406
|
+
- Report the saved path in your final answer
|
|
407
|
+
|
|
408
|
+
**CRITICAL**: You MUST complete Steps 1-4 in order. Do not skip checking for existing audios. Do not skip calling
|
|
409
|
+
understand_audio on found audios. This is a mandatory synthesis workflow.
|
|
410
|
+
"""
|
|
411
|
+
# presentation_instructions += """For audio generation tasks:
|
|
412
|
+
# - Extract audio paths from the existing answer and resolve them in the shared reference.
|
|
413
|
+
# - Gather ALL audio files produced by EVERY agent (ignore non-existent files).
|
|
414
|
+
# - IMPORTANT: If you find ANY existing audios (from yourself or other agents), you MUST call the **understand_audio** tool to extract each audio's transcription.
|
|
415
|
+
# - IMPORTANT: Synthesize transcriptions from all audios into a detailed, combined transcription.
|
|
416
|
+
# - IMPORTANT: You MUST call the **text_to_speech_transcription_generation** tool with this synthesized transcription to generate the final audio.
|
|
417
|
+
# - IMPORTANT: Save the final output in your workspace and output the saved path.
|
|
418
|
+
# - If no existing audios are found, generate based on the original task requirements.
|
|
419
|
+
# """
|
|
420
|
+
# Add file generation instructions only if enabled
|
|
421
|
+
if enable_file_generation:
|
|
422
|
+
presentation_instructions += """For file generation tasks:
|
|
423
|
+
|
|
424
|
+
**MANDATORY WORKFLOW - You MUST follow these steps in order:**
|
|
425
|
+
|
|
426
|
+
Step 1: **Check for existing files (REQUIRED)**
|
|
427
|
+
- First, list all files in the Shared Reference directory (temp_workspaces) to find ALL files from EVERY agent
|
|
428
|
+
- Look for files of the requested type in each agent's workspace subdirectory
|
|
429
|
+
|
|
430
|
+
Step 2: **Understand ALL existing files (REQUIRED if files exist)**
|
|
431
|
+
- For EACH file you found, you MUST call the **understand_file** tool to extract its content, structure, and key elements
|
|
432
|
+
- Do this for files from yourself AND from other agents - analyze ALL files found
|
|
433
|
+
- DO NOT skip this step even if you think you know the content
|
|
434
|
+
|
|
435
|
+
Step 3: **Synthesize and generate final file (REQUIRED)**
|
|
436
|
+
- If existing files were found and analyzed:
|
|
437
|
+
* Synthesize ALL file contents into a single, detailed, combined content
|
|
438
|
+
* The combined content should capture the best elements, structure, and information from all analyzed files
|
|
439
|
+
* Call **text_to_file_generation** with this synthesized content to generate the final unified file
|
|
440
|
+
- If NO existing files were found:
|
|
441
|
+
* Generate a new file based directly on the original task requirements
|
|
442
|
+
* Call **text_to_file_generation** with content derived from the original task
|
|
443
|
+
|
|
444
|
+
Step 4: **Save and report (REQUIRED)**
|
|
445
|
+
- Save the final generated file in your workspace
|
|
446
|
+
- Report the saved path in your final answer
|
|
447
|
+
|
|
448
|
+
**CRITICAL**: You MUST complete Steps 1-4 in order. Do not skip checking for existing files. Do not skip calling
|
|
449
|
+
understand_file on found files. This is a mandatory synthesis workflow.
|
|
450
|
+
"""
|
|
451
|
+
# presentation_instructions += """For file generation tasks:
|
|
452
|
+
# - Extract file paths from the existing answer and resolve them in the shared reference.
|
|
453
|
+
# - Gather ALL files produced by EVERY agent (ignore non-existent files).
|
|
454
|
+
# - IMPORTANT: If you find ANY existing files (from yourself or other agents), you MUST call the **understand_file** tool to extract each file's content.
|
|
455
|
+
# - IMPORTANT: Synthesize contents from all files into a detailed, combined content.
|
|
456
|
+
# - IMPORTANT: You MUST call the **text_to_file_generation** tool with this synthesized content to generate the final file.
|
|
457
|
+
# - IMPORTANT: Save the final output in your workspace and output the saved path.
|
|
458
|
+
# - If no existing files are found, generate based on the original task requirements.
|
|
459
|
+
# """
|
|
460
|
+
# Add video generation instructions only if enabled
|
|
461
|
+
if enable_video_generation:
|
|
462
|
+
presentation_instructions += """For video generation tasks:
|
|
463
|
+
|
|
464
|
+
**MANDATORY WORKFLOW - You MUST follow these steps in order:**
|
|
465
|
+
|
|
466
|
+
Step 1: **Check for existing videos (REQUIRED)**
|
|
467
|
+
- First, list all files in the Shared Reference directory (temp_workspaces) to find ALL videos from EVERY agent
|
|
468
|
+
- Look for video files (.mp4, .avi, .mov, etc.) in each agent's workspace subdirectory
|
|
469
|
+
|
|
470
|
+
Step 2: **Understand ALL existing videos (REQUIRED if videos exist)**
|
|
471
|
+
- For EACH video file you found, you MUST call the **understand_video** tool to extract its description, visual features, and
|
|
472
|
+
key elements
|
|
473
|
+
- Do this for videos from yourself AND from other agents - analyze ALL videos found
|
|
474
|
+
- DO NOT skip this step even if you think you know the content
|
|
475
|
+
|
|
476
|
+
Step 3: **Synthesize and generate final video (REQUIRED)**
|
|
477
|
+
- If existing videos were found and analyzed:
|
|
478
|
+
* Synthesize ALL video descriptions into a single, detailed, combined prompt
|
|
479
|
+
* The combined prompt should capture the best visual elements, composition, motion, and style from all analyzed videos
|
|
480
|
+
* Call **text_to_video_generation** with this synthesized prompt to create the final unified video
|
|
481
|
+
- If NO existing videos were found:
|
|
482
|
+
* Generate a new video based directly on the original task requirements
|
|
483
|
+
* Call **text_to_video_generation** with a prompt derived from the original task
|
|
484
|
+
|
|
485
|
+
Step 4: **Save and report (REQUIRED)**
|
|
486
|
+
- Save the final generated video in your workspace
|
|
487
|
+
- Report the saved path in your final answer
|
|
488
|
+
|
|
489
|
+
**CRITICAL**: You MUST complete Steps 1-4 in order. Do not skip checking for existing videos. Do not skip calling
|
|
490
|
+
understand_video on found videos. This is a mandatory synthesis workflow.
|
|
491
|
+
"""
|
|
492
|
+
# presentation_instructions += """For video generation tasks:
|
|
493
|
+
# - Extract video paths from the existing answer and resolve them in the shared reference.
|
|
494
|
+
# - Gather ALL videos produced by EVERY agent (ignore non-existent files).
|
|
495
|
+
# - IMPORTANT: If you find ANY existing videos (from yourself or other agents), you MUST call the **understand_video** tool to extract each video's description and key features.
|
|
496
|
+
# - IMPORTANT: Synthesize descriptions from all videos into a detailed, combined prompt capturing the best elements.
|
|
497
|
+
# - IMPORTANT: You MUST call the **text_to_video_generation** tool with this synthesized prompt to generate the final video.
|
|
498
|
+
# - IMPORTANT: Save the final output in your workspace and output the saved path.
|
|
499
|
+
# - If no existing videos are found, generate based on the original task requirements.
|
|
500
|
+
# """
|
|
353
501
|
|
|
354
502
|
# Add irreversible actions reminder if needed
|
|
355
503
|
# TODO: Integrate more general irreversible actions handling in future (i.e., not just for context file delivery)
|
massgen/orchestrator.py
CHANGED
|
@@ -2513,6 +2513,20 @@ INSTRUCTIONS FOR NEXT ATTEMPT:
|
|
|
2513
2513
|
elif hasattr(agent, "backend") and hasattr(agent.backend, "backend_params"):
|
|
2514
2514
|
enable_audio_generation = agent.backend.backend_params.get("enable_audio_generation", False)
|
|
2515
2515
|
|
|
2516
|
+
# Check if file generation is enabled for this agent
|
|
2517
|
+
enable_file_generation = False
|
|
2518
|
+
if hasattr(agent, "config") and agent.config:
|
|
2519
|
+
enable_file_generation = agent.config.backend_params.get("enable_file_generation", False)
|
|
2520
|
+
elif hasattr(agent, "backend") and hasattr(agent.backend, "backend_params"):
|
|
2521
|
+
enable_file_generation = agent.backend.backend_params.get("enable_file_generation", False)
|
|
2522
|
+
|
|
2523
|
+
# Check if video generation is enabled for this agent
|
|
2524
|
+
enable_video_generation = False
|
|
2525
|
+
if hasattr(agent, "config") and agent.config:
|
|
2526
|
+
enable_video_generation = agent.config.backend_params.get("enable_video_generation", False)
|
|
2527
|
+
elif hasattr(agent, "backend") and hasattr(agent.backend, "backend_params"):
|
|
2528
|
+
enable_video_generation = agent.backend.backend_params.get("enable_video_generation", False)
|
|
2529
|
+
|
|
2516
2530
|
# Check if agent has write access to context paths (requires file delivery)
|
|
2517
2531
|
has_irreversible_actions = False
|
|
2518
2532
|
if agent.backend.filesystem_manager:
|
|
@@ -2525,6 +2539,8 @@ INSTRUCTIONS FOR NEXT ATTEMPT:
|
|
|
2525
2539
|
agent_system_message,
|
|
2526
2540
|
enable_image_generation,
|
|
2527
2541
|
enable_audio_generation,
|
|
2542
|
+
enable_file_generation,
|
|
2543
|
+
enable_video_generation,
|
|
2528
2544
|
has_irreversible_actions,
|
|
2529
2545
|
enable_command_execution,
|
|
2530
2546
|
)
|