massgen 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of massgen might be problematic. Click here for more details.

Files changed (58) hide show
  1. massgen/__init__.py +1 -1
  2. massgen/api_params_handler/_chat_completions_api_params_handler.py +4 -0
  3. massgen/api_params_handler/_claude_api_params_handler.py +4 -0
  4. massgen/api_params_handler/_gemini_api_params_handler.py +4 -0
  5. massgen/api_params_handler/_response_api_params_handler.py +4 -0
  6. massgen/backend/base_with_custom_tool_and_mcp.py +25 -5
  7. massgen/backend/docs/permissions_and_context_files.md +2 -2
  8. massgen/backend/response.py +2 -0
  9. massgen/configs/README.md +49 -40
  10. massgen/configs/tools/custom_tools/crawl4ai_example.yaml +55 -0
  11. massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_multi.yaml +61 -0
  12. massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_single.yaml +29 -0
  13. massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_multi.yaml +51 -0
  14. massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_single.yaml +33 -0
  15. massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_multi.yaml +55 -0
  16. massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_single.yaml +33 -0
  17. massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_multi.yaml +47 -0
  18. massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_single.yaml +29 -0
  19. massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml +1 -1
  20. massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml +1 -1
  21. massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml +1 -1
  22. massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml +1 -1
  23. massgen/configs/tools/custom_tools/multimodal_tools/youtube_video_analysis.yaml +1 -1
  24. massgen/filesystem_manager/_filesystem_manager.py +1 -0
  25. massgen/filesystem_manager/_path_permission_manager.py +148 -0
  26. massgen/message_templates.py +160 -12
  27. massgen/orchestrator.py +16 -0
  28. massgen/tests/test_binary_file_blocking.py +274 -0
  29. massgen/tests/test_case_studies.md +12 -12
  30. massgen/tests/test_multimodal_size_limits.py +407 -0
  31. massgen/tool/_manager.py +7 -2
  32. massgen/tool/_multimodal_tools/image_to_image_generation.py +293 -0
  33. massgen/tool/_multimodal_tools/text_to_file_generation.py +455 -0
  34. massgen/tool/_multimodal_tools/text_to_image_generation.py +222 -0
  35. massgen/tool/_multimodal_tools/text_to_speech_continue_generation.py +226 -0
  36. massgen/tool/_multimodal_tools/text_to_speech_transcription_generation.py +217 -0
  37. massgen/tool/_multimodal_tools/text_to_video_generation.py +223 -0
  38. massgen/tool/_multimodal_tools/understand_audio.py +19 -1
  39. massgen/tool/_multimodal_tools/understand_file.py +6 -1
  40. massgen/tool/_multimodal_tools/understand_image.py +112 -8
  41. massgen/tool/_multimodal_tools/understand_video.py +32 -5
  42. massgen/tool/_web_tools/crawl4ai_tool.py +718 -0
  43. massgen/tool/docs/multimodal_tools.md +589 -0
  44. {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/METADATA +96 -69
  45. {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/RECORD +49 -40
  46. massgen/configs/tools/custom_tools/crawl4ai_mcp_example.yaml +0 -67
  47. massgen/configs/tools/custom_tools/crawl4ai_multi_agent_example.yaml +0 -68
  48. massgen/configs/tools/custom_tools/multimodal_tools/playwright_with_img_understanding.yaml +0 -98
  49. massgen/configs/tools/custom_tools/multimodal_tools/understand_video_example.yaml +0 -54
  50. massgen/configs/tools/memory/README.md +0 -199
  51. massgen/configs/tools/memory/gpt5mini_gemini_context_window_management.yaml +0 -131
  52. massgen/configs/tools/memory/gpt5mini_gemini_no_persistent_memory.yaml +0 -133
  53. massgen/configs/tools/memory/test_context_window_management.py +0 -286
  54. massgen/configs/tools/multimodal/gpt5mini_gpt5nano_documentation_evolution.yaml +0 -97
  55. {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/WHEEL +0 -0
  56. {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/entry_points.txt +0 -0
  57. {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/licenses/LICENSE +0 -0
  58. {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,33 @@
1
+ # MassGen Configuration: Text to Speech Continue Generation Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_single.yaml "I want you to give me a very short, one-sentence introduction to Sherlock Holmes, and I want you to read it out loud in an emotional voice."
4
+ agents:
5
+ - id: "text_to_speech_continue_generation_tool"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-4o"
9
+ cwd: "workspace1"
10
+ enable_audio_generation: true
11
+ custom_tools:
12
+ - name: ["text_to_speech_transcription_generation"]
13
+ category: "multimodal"
14
+ path: "massgen/tool/_multimodal_tools/text_to_speech_transcription_generation.py"
15
+ function: ["text_to_speech_transcription_generation"]
16
+ - name: ["understand_audio"]
17
+ category: "multimodal"
18
+ path: "massgen/tool/_multimodal_tools/understand_audio.py"
19
+ function: ["understand_audio"]
20
+ - name: ["text_to_speech_continue_generation"]
21
+ category: "multimodal"
22
+ path: "massgen/tool/_multimodal_tools/text_to_speech_continue_generation.py"
23
+ function: ["text_to_speech_continue_generation"]
24
+ system_message: |
25
+ You are an AI assistant with access to text-to-speech generation capabilities.
26
+
27
+ orchestrator:
28
+ snapshot_storage: "snapshots"
29
+ agent_temporary_workspace: "temp_workspaces"
30
+
31
+ ui:
32
+ display_type: "simple"
33
+ logging_enabled: true
@@ -0,0 +1,47 @@
1
+ # MassGen Configuration: Text to Video Generation Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_multi.yaml "Generate a 4-second video of a neon-lit alley at night: light rain, slow push-in, cinematic."
4
+ agents:
5
+ - id: "text_to_video_generation_tool1"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-4o"
9
+ cwd: "workspace1"
10
+ enable_video_generation: true
11
+ custom_tools:
12
+ - name: ["understand_video"]
13
+ category: "multimodal"
14
+ path: "massgen/tool/_multimodal_tools/understand_video.py"
15
+ function: ["understand_video"]
16
+ - name: ["text_to_video_generation"]
17
+ category: "multimodal"
18
+ path: "massgen/tool/_multimodal_tools/text_to_video_generation.py"
19
+ function: ["text_to_video_generation"]
20
+ system_message: |
21
+ You are an AI assistant with access to text-to-video generation capabilities.
22
+
23
+ - id: "text_to_video_generation_tool2"
24
+ backend:
25
+ type: "openai"
26
+ model: "gpt-4o"
27
+ cwd: "workspace2"
28
+ enable_video_generation: true
29
+ custom_tools:
30
+ - name: ["understand_video"]
31
+ category: "multimodal"
32
+ path: "massgen/tool/_multimodal_tools/understand_video.py"
33
+ function: ["understand_video"]
34
+ - name: ["text_to_video_generation"]
35
+ category: "multimodal"
36
+ path: "massgen/tool/_multimodal_tools/text_to_video_generation.py"
37
+ function: ["text_to_video_generation"]
38
+ system_message: |
39
+ You are an AI assistant with access to text-to-video generation capabilities.
40
+
41
+ orchestrator:
42
+ snapshot_storage: "snapshots"
43
+ agent_temporary_workspace: "temp_workspaces"
44
+
45
+ ui:
46
+ display_type: "rich_terminal"
47
+ logging_enabled: true
@@ -0,0 +1,29 @@
1
+ # MassGen Configuration: Text to Video Generation Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_single.yaml "Generate a 4-second video of a neon-lit alley at night: light rain, slow push-in, cinematic."
4
+ agents:
5
+ - id: "text_to_video_generation_tool"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-4o"
9
+ cwd: "workspace1"
10
+ enable_video_generation: true
11
+ custom_tools:
12
+ - name: ["understand_video"]
13
+ category: "multimodal"
14
+ path: "massgen/tool/_multimodal_tools/understand_video.py"
15
+ function: ["understand_video"]
16
+ - name: ["text_to_video_generation"]
17
+ category: "multimodal"
18
+ path: "massgen/tool/_multimodal_tools/text_to_video_generation.py"
19
+ function: ["text_to_video_generation"]
20
+ system_message: |
21
+ You are an AI assistant with access to text-to-video generation capabilities.
22
+
23
+ orchestrator:
24
+ snapshot_storage: "snapshots"
25
+ agent_temporary_workspace: "temp_workspaces"
26
+
27
+ ui:
28
+ display_type: "simple"
29
+ logging_enabled: true
@@ -1,6 +1,6 @@
1
1
  # MassGen Configuration: Understand Audio Tool
2
2
  # Usage:
3
- # massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml "Please summarize the content in this audio."
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml "Please summarize the content in this audio."
4
4
  agents:
5
5
  - id: "understand_audio_tool"
6
6
  backend:
@@ -1,6 +1,6 @@
1
1
  # MassGen Configuration: Understand File Tool
2
2
  # Usage:
3
- # massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml "Please summarize the content in this file."
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml "Please summarize the content in this file."
4
4
  agents:
5
5
  - id: "understand_file_tool"
6
6
  backend:
@@ -1,6 +1,6 @@
1
1
  # MassGen Configuration: Understand Image Tool
2
2
  # Usage:
3
- # massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml "Please summarize the content in this image."
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml "Please summarize the content in this image."
4
4
  agents:
5
5
  - id: "understand_image_tool"
6
6
  backend:
@@ -1,6 +1,6 @@
1
1
  # MassGen Configuration: Understand Video Tool
2
2
  # Usage:
3
- # massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml "What's happening in this video?"
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml "What's happening in this video?"
4
4
  agents:
5
5
  - id: "understand_video_tool"
6
6
  backend:
@@ -51,7 +51,7 @@ orchestrator:
51
51
  snapshot_storage: "snapshots"
52
52
  agent_temporary_workspace: "temp_workspaces"
53
53
  context_paths:
54
- - path: "docs/case_studies"
54
+ - path: "docs/source/examples/case_studies"
55
55
  permission: "read"
56
56
 
57
57
  ui:
@@ -57,6 +57,7 @@ class FilesystemManager:
57
57
  command_line_docker_network_mode: str = "none",
58
58
  command_line_docker_enable_sudo: bool = False,
59
59
  enable_audio_generation: bool = False,
60
+ enable_file_generation: bool = False,
60
61
  ):
61
62
  """
62
63
  Initialize FilesystemManager.
@@ -90,6 +90,68 @@ class PathPermissionManager:
90
90
  "massgen_logs",
91
91
  ]
92
92
 
93
+ # Binary file extensions that should not be read by text-based tools
94
+ # These files should be handled by specialized tools (understand_image, understand_video, etc.)
95
+ BINARY_FILE_EXTENSIONS = {
96
+ # Images
97
+ ".jpg",
98
+ ".jpeg",
99
+ ".png",
100
+ ".gif",
101
+ ".bmp",
102
+ ".ico",
103
+ ".svg",
104
+ ".webp",
105
+ ".tiff",
106
+ ".tif",
107
+ # Videos
108
+ ".mp4",
109
+ ".avi",
110
+ ".mov",
111
+ ".mkv",
112
+ ".flv",
113
+ ".wmv",
114
+ ".webm",
115
+ ".m4v",
116
+ ".mpg",
117
+ ".mpeg",
118
+ # Audio
119
+ ".mp3",
120
+ ".wav",
121
+ ".ogg",
122
+ ".flac",
123
+ ".aac",
124
+ ".m4a",
125
+ ".wma",
126
+ # Archives
127
+ ".zip",
128
+ ".tar",
129
+ ".gz",
130
+ ".bz2",
131
+ ".7z",
132
+ ".rar",
133
+ ".xz",
134
+ # Executables and binaries
135
+ ".exe",
136
+ ".bin",
137
+ ".dll",
138
+ ".so",
139
+ ".dylib",
140
+ ".o",
141
+ ".a",
142
+ ".pyc",
143
+ ".class",
144
+ ".jar",
145
+ # Office documents (binary formats - use understand_file tool)
146
+ ".doc", # Old Word (not supported by understand_file)
147
+ ".xls", # Old Excel (not supported by understand_file)
148
+ ".ppt", # Old PowerPoint (not supported by understand_file)
149
+ ".pdf", # PDF (supported by understand_file with PyPDF2)
150
+ ".docx", # Word (supported by understand_file with python-docx)
151
+ ".xlsx", # Excel (supported by understand_file with openpyxl)
152
+ ".pptx", # PowerPoint (supported by understand_file with python-pptx)
153
+ }
154
+
93
155
  def __init__(
94
156
  self,
95
157
  context_write_access_enabled: bool = False,
@@ -440,6 +502,12 @@ class PathPermissionManager:
440
502
  - allowed: Whether the tool call should proceed
441
503
  - reason: Explanation if blocked (None if allowed)
442
504
  """
505
+ # Check if read tool is trying to read binary files (images, videos, etc.)
506
+ if self._is_text_read_tool(tool_name):
507
+ binary_check_result = self._validate_binary_file_access(tool_name, tool_args)
508
+ if not binary_check_result[0]:
509
+ return binary_check_result
510
+
443
511
  # Track read operations for read-before-delete enforcement
444
512
  if self._is_read_tool(tool_name):
445
513
  self._track_read_operation(tool_name, tool_args)
@@ -495,6 +563,33 @@ class PathPermissionManager:
495
563
 
496
564
  return False
497
565
 
566
+ def _is_text_read_tool(self, tool_name: str) -> bool:
567
+ """
568
+ Check if a tool is a text-based read operation that should not access binary files.
569
+
570
+ These tools are designed for reading text files and should be blocked from
571
+ reading binary files (images, videos, audio, etc.) to prevent context pollution.
572
+
573
+ Tools that read text file contents:
574
+ - Read: Claude Code read tool
575
+ - read_text_file: MCP filesystem read tool
576
+ - read_file: Generic read operations
577
+ """
578
+ # Use lowercase for case-insensitive matching
579
+ tool_lower = tool_name.lower()
580
+
581
+ # Check if tool name contains any text read operation keywords
582
+ text_read_keywords = [
583
+ "read_text_file", # MCP filesystem: read_text_file
584
+ "read_file", # Generic read operations
585
+ ]
586
+
587
+ # Also check for exact "Read" match (Claude Code tool)
588
+ if tool_name == "Read":
589
+ return True
590
+
591
+ return any(keyword in tool_lower for keyword in text_read_keywords)
592
+
498
593
  def _is_read_tool(self, tool_name: str) -> bool:
499
594
  """
500
595
  Check if a tool is a read operation that should be tracked.
@@ -518,6 +613,59 @@ class PathPermissionManager:
518
613
 
519
614
  return any(keyword in tool_lower for keyword in read_keywords)
520
615
 
616
+ def _validate_binary_file_access(self, tool_name: str, tool_args: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
617
+ """
618
+ Validate that text-based read tools are not trying to read binary files.
619
+
620
+ Binary files (images, videos, audio, etc.) should be handled by specialized tools
621
+ to prevent context pollution with binary data.
622
+
623
+ Args:
624
+ tool_name: Name of the tool being called
625
+ tool_args: Arguments passed to the tool
626
+
627
+ Returns:
628
+ Tuple of (allowed: bool, reason: Optional[str])
629
+ - allowed: False if trying to read binary file, True otherwise
630
+ - reason: Explanation if blocked (None if allowed)
631
+ """
632
+ # Extract file path from arguments
633
+ file_path = self._extract_file_path(tool_args)
634
+ if not file_path:
635
+ # Can't determine path - allow (tool may not access files)
636
+ return (True, None)
637
+
638
+ # Resolve path
639
+ try:
640
+ file_path_str = self._resolve_path_against_workspace(file_path)
641
+ path = Path(file_path_str)
642
+ except Exception:
643
+ # If path resolution fails, allow (will fail elsewhere if invalid)
644
+ return (True, None)
645
+
646
+ # Check file extension
647
+ file_extension = path.suffix.lower()
648
+ if file_extension in self.BINARY_FILE_EXTENSIONS:
649
+ # Determine appropriate tool suggestion based on file type
650
+ if file_extension in {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".ico", ".svg", ".webp", ".tiff", ".tif"}:
651
+ suggestion = "For images, use understand_image tool"
652
+ elif file_extension in {".mp4", ".avi", ".mov", ".mkv", ".flv", ".wmv", ".webm", ".m4v", ".mpg", ".mpeg"}:
653
+ suggestion = "For videos, use understand_video tool"
654
+ elif file_extension in {".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".wma"}:
655
+ suggestion = "For audio files, use generate_text_with_input_audio tool"
656
+ elif file_extension in {".pdf"}:
657
+ suggestion = "For PDF files, use understand_file tool"
658
+ elif file_extension in {".docx", ".xlsx", ".pptx"}:
659
+ suggestion = "For Office documents, use understand_file tool"
660
+ else:
661
+ suggestion = "Use appropriate specialized tool for this file type"
662
+
663
+ reason = f"Cannot read binary file '{path.name}' with {tool_name}. {suggestion}."
664
+ logger.warning(f"[PathPermissionManager] Blocked {tool_name} from reading binary file: {path}")
665
+ return (False, reason)
666
+
667
+ return (True, None)
668
+
521
669
  def _is_delete_tool(self, tool_name: str) -> bool:
522
670
  """
523
671
  Check if a tool is a delete operation.
@@ -302,6 +302,8 @@ IMPORTANT: You are responding to the latest message in an ongoing conversation.
302
302
  original_system_message: Optional[str] = None,
303
303
  enable_image_generation: bool = False,
304
304
  enable_audio_generation: bool = False,
305
+ enable_file_generation: bool = False,
306
+ enable_video_generation: bool = False,
305
307
  has_irreversible_actions: bool = False,
306
308
  enable_command_execution: bool = False,
307
309
  ) -> str:
@@ -311,6 +313,8 @@ IMPORTANT: You are responding to the latest message in an ongoing conversation.
311
313
  original_system_message: The agent's original system message to preserve
312
314
  enable_image_generation: Whether image generation is enabled
313
315
  enable_audio_generation: Whether audio generation is enabled
316
+ enable_file_generation: Whether file generation is enabled
317
+ enable_video_generation: Whether video generation is enabled
314
318
  has_irreversible_actions: Whether agent has write access to context paths (requires actual file delivery)
315
319
  enable_command_execution: Whether command execution is enabled for this agent
316
320
  """
@@ -335,21 +339,165 @@ Present the best possible coordinated answer by combining the strengths from all
335
339
  # Add image generation instructions only if enabled
336
340
  if enable_image_generation:
337
341
  presentation_instructions += """For image generation tasks:
338
- - Extract image paths from the existing answer and resolve them in the shared reference.
339
- - Gather all agent-produced images (ignore non-existent files).
340
- - MUST call the generate-image tool with these input images to synthesize one final image combining their strengths.
341
- - MUST save the final outputand output the saved path.
342
- """
342
+
343
+ **MANDATORY WORKFLOW - You MUST follow these steps in order:**
344
+
345
+ Step 1: **Check for existing images (REQUIRED)**
346
+ - First, list all files in the Shared Reference directory (temp_workspaces) to find ALL images from EVERY agent
347
+ - Look for image files (.png, .jpg, .jpeg, .gif, .webp, etc.) in each agent's workspace subdirectory
348
+
349
+ Step 2: **Understand ALL existing images (REQUIRED if images exist)**
350
+ - For EACH image file you found, you MUST call the **understand_image** tool to extract its key visual elements, composition, style, and quality
351
+ - Do this for images from yourself AND from other agents - analyze ALL images found
352
+ - DO NOT skip this step even if you think you know the content
353
+
354
+ Step 3: **Synthesize and generate final image (REQUIRED)**
355
+ - If existing images were found and analyzed:
356
+ * Synthesize ALL image analyses into a single, detailed, combined prompt
357
+ * The combined prompt should capture the best visual elements, composition, style, and quality from all analyzed images
358
+ * Call **image_to_image_generation** with this synthesized prompt and ALL images to create the final unified image
359
+ - If NO existing images were found:
360
+ * Generate a new image based directly on the original task requirements
361
+ * Call **text_to_image_generation** with a prompt derived from the original task
362
+
363
+ Step 4: **Save and report (REQUIRED)**
364
+ - Save the final generated image in your workspace
365
+ - Report the saved path in your final answer
366
+
367
+ **CRITICAL**: You MUST complete Steps 1-4 in order. Do not skip checking for existing images. Do not skip calling
368
+ understand_image on found images. This is a mandatory synthesis workflow.
369
+ """
370
+ # presentation_instructions += """For image generation tasks:
371
+ # - Extract image paths from the existing answer and resolve them in the shared reference.
372
+ # - Gather all agent-produced images (ignore non-existent files).
373
+ # - IMPORTANT: If you find ANY existing images (from yourself or other agents), you MUST call the understand_image tool
374
+ # to analyze EACH image and extract their key visual elements, composition, style, and quality.
375
+ # - IMPORTANT: Synthesize insights from all analyzed images into a detailed, combined prompt that captures the best elements.
376
+ # - IMPORTANT: Call text_to_image_generation with this synthesized prompt to generate the final image.
377
+ # - IMPORTANT: Save the final output in your workspace and output the saved path.
378
+ # - If no existing images are found, generate based on the original task requirements.
379
+ # """
343
380
  # Add audio generation instructions only if enabled
344
381
  if enable_audio_generation:
345
382
  presentation_instructions += """For audio generation tasks:
346
- - Extract audio paths from the existing answer and resolve them in the shared reference.
347
- - Gather ALL audio files produced by EVERY agent (ignore non-existent files).
348
- IMPORTANT: You MUST call the generate_text_with_input_audio tool to obtain transcriptions
349
- for EACH AND EVERY audio file from ALL agents - no audio should be skipped or overlooked.
350
- - MUST combine the strengths of all transcriptions into one final detailed transcription that captures the best elements from each.
351
- - MUST use the convert_text_to_audio tool to convert this final transcription to a new audio file and save it, then output the saved path.
352
- """
383
+
384
+ **MANDATORY WORKFLOW - You MUST follow these steps in order:**
385
+
386
+ Step 1: **Check for existing audios (REQUIRED)**
387
+ - First, list all files in the Shared Reference directory (temp_workspaces) to find ALL audio files from EVERY agent
388
+ - Look for audio files (.mp3, .wav, .flac, etc.) in each agent's workspace subdirectory
389
+
390
+ Step 2: **Understand ALL existing audios (REQUIRED if audios exist)**
391
+ - For EACH audio file you found, you MUST call the **understand_audio** tool to extract its transcription
392
+ - Do this for audios from yourself AND from other agents - analyze ALL audios found
393
+ - DO NOT skip this step even if you think you know the content
394
+
395
+ Step 3: **Synthesize and generate final audio (REQUIRED)**
396
+ - If existing audios were found and analyzed:
397
+ * Synthesize ALL audio transcriptions into a single, detailed, combined transcription
398
+ * The combined transcription should capture the best content from all analyzed audios
399
+ * Call **text_to_speech_transcription_generation** with this synthesized transcription to create the final unified audio
400
+ - If NO existing audios were found:
401
+ * Generate a new audio based directly on the original task requirements
402
+ * Call **text_to_speech_transcription_generation** with a transcription derived from the original task
403
+
404
+ Step 4: **Save and report (REQUIRED)**
405
+ - Save the final generated audio in your workspace
406
+ - Report the saved path in your final answer
407
+
408
+ **CRITICAL**: You MUST complete Steps 1-4 in order. Do not skip checking for existing audios. Do not skip calling
409
+ understand_audio on found audios. This is a mandatory synthesis workflow.
410
+ """
411
+ # presentation_instructions += """For audio generation tasks:
412
+ # - Extract audio paths from the existing answer and resolve them in the shared reference.
413
+ # - Gather ALL audio files produced by EVERY agent (ignore non-existent files).
414
+ # - IMPORTANT: If you find ANY existing audios (from yourself or other agents), you MUST call the **understand_audio** tool to extract each audio's transcription.
415
+ # - IMPORTANT: Synthesize transcriptions from all audios into a detailed, combined transcription.
416
+ # - IMPORTANT: You MUST call the **text_to_speech_transcription_generation** tool with this synthesized transcription to generate the final audio.
417
+ # - IMPORTANT: Save the final output in your workspace and output the saved path.
418
+ # - If no existing audios are found, generate based on the original task requirements.
419
+ # """
420
+ # Add file generation instructions only if enabled
421
+ if enable_file_generation:
422
+ presentation_instructions += """For file generation tasks:
423
+
424
+ **MANDATORY WORKFLOW - You MUST follow these steps in order:**
425
+
426
+ Step 1: **Check for existing files (REQUIRED)**
427
+ - First, list all files in the Shared Reference directory (temp_workspaces) to find ALL files from EVERY agent
428
+ - Look for files of the requested type in each agent's workspace subdirectory
429
+
430
+ Step 2: **Understand ALL existing files (REQUIRED if files exist)**
431
+ - For EACH file you found, you MUST call the **understand_file** tool to extract its content, structure, and key elements
432
+ - Do this for files from yourself AND from other agents - analyze ALL files found
433
+ - DO NOT skip this step even if you think you know the content
434
+
435
+ Step 3: **Synthesize and generate final file (REQUIRED)**
436
+ - If existing files were found and analyzed:
437
+ * Synthesize ALL file contents into a single, detailed, combined content
438
+ * The combined content should capture the best elements, structure, and information from all analyzed files
439
+ * Call **text_to_file_generation** with this synthesized content to generate the final unified file
440
+ - If NO existing files were found:
441
+ * Generate a new file based directly on the original task requirements
442
+ * Call **text_to_file_generation** with content derived from the original task
443
+
444
+ Step 4: **Save and report (REQUIRED)**
445
+ - Save the final generated file in your workspace
446
+ - Report the saved path in your final answer
447
+
448
+ **CRITICAL**: You MUST complete Steps 1-4 in order. Do not skip checking for existing files. Do not skip calling
449
+ understand_file on found files. This is a mandatory synthesis workflow.
450
+ """
451
+ # presentation_instructions += """For file generation tasks:
452
+ # - Extract file paths from the existing answer and resolve them in the shared reference.
453
+ # - Gather ALL files produced by EVERY agent (ignore non-existent files).
454
+ # - IMPORTANT: If you find ANY existing files (from yourself or other agents), you MUST call the **understand_file** tool to extract each file's content.
455
+ # - IMPORTANT: Synthesize contents from all files into a detailed, combined content.
456
+ # - IMPORTANT: You MUST call the **text_to_file_generation** tool with this synthesized content to generate the final file.
457
+ # - IMPORTANT: Save the final output in your workspace and output the saved path.
458
+ # - If no existing files are found, generate based on the original task requirements.
459
+ # """
460
+ # Add video generation instructions only if enabled
461
+ if enable_video_generation:
462
+ presentation_instructions += """For video generation tasks:
463
+
464
+ **MANDATORY WORKFLOW - You MUST follow these steps in order:**
465
+
466
+ Step 1: **Check for existing videos (REQUIRED)**
467
+ - First, list all files in the Shared Reference directory (temp_workspaces) to find ALL videos from EVERY agent
468
+ - Look for video files (.mp4, .avi, .mov, etc.) in each agent's workspace subdirectory
469
+
470
+ Step 2: **Understand ALL existing videos (REQUIRED if videos exist)**
471
+ - For EACH video file you found, you MUST call the **understand_video** tool to extract its description, visual features, and
472
+ key elements
473
+ - Do this for videos from yourself AND from other agents - analyze ALL videos found
474
+ - DO NOT skip this step even if you think you know the content
475
+
476
+ Step 3: **Synthesize and generate final video (REQUIRED)**
477
+ - If existing videos were found and analyzed:
478
+ * Synthesize ALL video descriptions into a single, detailed, combined prompt
479
+ * The combined prompt should capture the best visual elements, composition, motion, and style from all analyzed videos
480
+ * Call **text_to_video_generation** with this synthesized prompt to create the final unified video
481
+ - If NO existing videos were found:
482
+ * Generate a new video based directly on the original task requirements
483
+ * Call **text_to_video_generation** with a prompt derived from the original task
484
+
485
+ Step 4: **Save and report (REQUIRED)**
486
+ - Save the final generated video in your workspace
487
+ - Report the saved path in your final answer
488
+
489
+ **CRITICAL**: You MUST complete Steps 1-4 in order. Do not skip checking for existing videos. Do not skip calling
490
+ understand_video on found videos. This is a mandatory synthesis workflow.
491
+ """
492
+ # presentation_instructions += """For video generation tasks:
493
+ # - Extract video paths from the existing answer and resolve them in the shared reference.
494
+ # - Gather ALL videos produced by EVERY agent (ignore non-existent files).
495
+ # - IMPORTANT: If you find ANY existing videos (from yourself or other agents), you MUST call the **understand_video** tool to extract each video's description and key features.
496
+ # - IMPORTANT: Synthesize descriptions from all videos into a detailed, combined prompt capturing the best elements.
497
+ # - IMPORTANT: You MUST call the **text_to_video_generation** tool with this synthesized prompt to generate the final video.
498
+ # - IMPORTANT: Save the final output in your workspace and output the saved path.
499
+ # - If no existing videos are found, generate based on the original task requirements.
500
+ # """
353
501
 
354
502
  # Add irreversible actions reminder if needed
355
503
  # TODO: Integrate more general irreversible actions handling in future (i.e., not just for context file delivery)
massgen/orchestrator.py CHANGED
@@ -2513,6 +2513,20 @@ INSTRUCTIONS FOR NEXT ATTEMPT:
2513
2513
  elif hasattr(agent, "backend") and hasattr(agent.backend, "backend_params"):
2514
2514
  enable_audio_generation = agent.backend.backend_params.get("enable_audio_generation", False)
2515
2515
 
2516
+ # Check if file generation is enabled for this agent
2517
+ enable_file_generation = False
2518
+ if hasattr(agent, "config") and agent.config:
2519
+ enable_file_generation = agent.config.backend_params.get("enable_file_generation", False)
2520
+ elif hasattr(agent, "backend") and hasattr(agent.backend, "backend_params"):
2521
+ enable_file_generation = agent.backend.backend_params.get("enable_file_generation", False)
2522
+
2523
+ # Check if video generation is enabled for this agent
2524
+ enable_video_generation = False
2525
+ if hasattr(agent, "config") and agent.config:
2526
+ enable_video_generation = agent.config.backend_params.get("enable_video_generation", False)
2527
+ elif hasattr(agent, "backend") and hasattr(agent.backend, "backend_params"):
2528
+ enable_video_generation = agent.backend.backend_params.get("enable_video_generation", False)
2529
+
2516
2530
  # Check if agent has write access to context paths (requires file delivery)
2517
2531
  has_irreversible_actions = False
2518
2532
  if agent.backend.filesystem_manager:
@@ -2525,6 +2539,8 @@ INSTRUCTIONS FOR NEXT ATTEMPT:
2525
2539
  agent_system_message,
2526
2540
  enable_image_generation,
2527
2541
  enable_audio_generation,
2542
+ enable_file_generation,
2543
+ enable_video_generation,
2528
2544
  has_irreversible_actions,
2529
2545
  enable_command_execution,
2530
2546
  )