massgen 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of massgen might be problematic. Click here for more details.

Files changed (90) hide show
  1. massgen/__init__.py +1 -1
  2. massgen/api_params_handler/_chat_completions_api_params_handler.py +4 -0
  3. massgen/api_params_handler/_claude_api_params_handler.py +4 -0
  4. massgen/api_params_handler/_gemini_api_params_handler.py +4 -0
  5. massgen/api_params_handler/_response_api_params_handler.py +4 -0
  6. massgen/backend/base_with_custom_tool_and_mcp.py +25 -5
  7. massgen/backend/docs/permissions_and_context_files.md +2 -2
  8. massgen/backend/response.py +2 -0
  9. massgen/chat_agent.py +340 -20
  10. massgen/cli.py +326 -19
  11. massgen/configs/README.md +92 -41
  12. massgen/configs/memory/gpt5mini_gemini_baseline_research_to_implementation.yaml +94 -0
  13. massgen/configs/memory/gpt5mini_gemini_context_window_management.yaml +187 -0
  14. massgen/configs/memory/gpt5mini_gemini_research_to_implementation.yaml +127 -0
  15. massgen/configs/memory/gpt5mini_high_reasoning_gemini.yaml +107 -0
  16. massgen/configs/memory/single_agent_compression_test.yaml +64 -0
  17. massgen/configs/tools/custom_tools/crawl4ai_example.yaml +55 -0
  18. massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_multi.yaml +61 -0
  19. massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_single.yaml +29 -0
  20. massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_multi.yaml +51 -0
  21. massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_single.yaml +33 -0
  22. massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_multi.yaml +55 -0
  23. massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_single.yaml +33 -0
  24. massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_multi.yaml +47 -0
  25. massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_single.yaml +29 -0
  26. massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml +1 -1
  27. massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml +1 -1
  28. massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml +1 -1
  29. massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml +1 -1
  30. massgen/configs/tools/custom_tools/multimodal_tools/youtube_video_analysis.yaml +1 -1
  31. massgen/filesystem_manager/_filesystem_manager.py +1 -0
  32. massgen/filesystem_manager/_path_permission_manager.py +148 -0
  33. massgen/memory/README.md +277 -0
  34. massgen/memory/__init__.py +26 -0
  35. massgen/memory/_base.py +193 -0
  36. massgen/memory/_compression.py +237 -0
  37. massgen/memory/_context_monitor.py +211 -0
  38. massgen/memory/_conversation.py +255 -0
  39. massgen/memory/_fact_extraction_prompts.py +333 -0
  40. massgen/memory/_mem0_adapters.py +257 -0
  41. massgen/memory/_persistent.py +687 -0
  42. massgen/memory/docker-compose.qdrant.yml +36 -0
  43. massgen/memory/docs/DESIGN.md +388 -0
  44. massgen/memory/docs/QUICKSTART.md +409 -0
  45. massgen/memory/docs/SUMMARY.md +319 -0
  46. massgen/memory/docs/agent_use_memory.md +408 -0
  47. massgen/memory/docs/orchestrator_use_memory.md +586 -0
  48. massgen/memory/examples.py +237 -0
  49. massgen/message_templates.py +160 -12
  50. massgen/orchestrator.py +223 -7
  51. massgen/tests/memory/test_agent_compression.py +174 -0
  52. massgen/{configs/tools → tests}/memory/test_context_window_management.py +30 -30
  53. massgen/tests/memory/test_force_compression.py +154 -0
  54. massgen/tests/memory/test_simple_compression.py +147 -0
  55. massgen/tests/test_agent_memory.py +534 -0
  56. massgen/tests/test_binary_file_blocking.py +274 -0
  57. massgen/tests/test_case_studies.md +12 -12
  58. massgen/tests/test_conversation_memory.py +382 -0
  59. massgen/tests/test_multimodal_size_limits.py +407 -0
  60. massgen/tests/test_orchestrator_memory.py +620 -0
  61. massgen/tests/test_persistent_memory.py +435 -0
  62. massgen/token_manager/token_manager.py +6 -0
  63. massgen/tool/_manager.py +7 -2
  64. massgen/tool/_multimodal_tools/image_to_image_generation.py +293 -0
  65. massgen/tool/_multimodal_tools/text_to_file_generation.py +455 -0
  66. massgen/tool/_multimodal_tools/text_to_image_generation.py +222 -0
  67. massgen/tool/_multimodal_tools/text_to_speech_continue_generation.py +226 -0
  68. massgen/tool/_multimodal_tools/text_to_speech_transcription_generation.py +217 -0
  69. massgen/tool/_multimodal_tools/text_to_video_generation.py +223 -0
  70. massgen/tool/_multimodal_tools/understand_audio.py +19 -1
  71. massgen/tool/_multimodal_tools/understand_file.py +6 -1
  72. massgen/tool/_multimodal_tools/understand_image.py +112 -8
  73. massgen/tool/_multimodal_tools/understand_video.py +32 -5
  74. massgen/tool/_web_tools/crawl4ai_tool.py +718 -0
  75. massgen/tool/docs/multimodal_tools.md +589 -0
  76. massgen/tools/__init__.py +8 -0
  77. massgen/tools/_planning_mcp_server.py +520 -0
  78. massgen/tools/planning_dataclasses.py +434 -0
  79. {massgen-0.1.3.dist-info → massgen-0.1.5.dist-info}/METADATA +142 -82
  80. {massgen-0.1.3.dist-info → massgen-0.1.5.dist-info}/RECORD +84 -41
  81. massgen/configs/tools/custom_tools/crawl4ai_mcp_example.yaml +0 -67
  82. massgen/configs/tools/custom_tools/crawl4ai_multi_agent_example.yaml +0 -68
  83. massgen/configs/tools/memory/README.md +0 -199
  84. massgen/configs/tools/memory/gpt5mini_gemini_context_window_management.yaml +0 -131
  85. massgen/configs/tools/memory/gpt5mini_gemini_no_persistent_memory.yaml +0 -133
  86. massgen/configs/tools/multimodal/gpt5mini_gpt5nano_documentation_evolution.yaml +0 -97
  87. {massgen-0.1.3.dist-info → massgen-0.1.5.dist-info}/WHEEL +0 -0
  88. {massgen-0.1.3.dist-info → massgen-0.1.5.dist-info}/entry_points.txt +0 -0
  89. {massgen-0.1.3.dist-info → massgen-0.1.5.dist-info}/licenses/LICENSE +0 -0
  90. {massgen-0.1.3.dist-info → massgen-0.1.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,61 @@
1
+ # MassGen Configuration: Text to File Generation Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_multi.yaml "Please generate a comprehensive business presentation about Artificial Intelligence in Healthcare for our upcoming board meeting. The presentation should include the following slides: 1) Title slide with presentation title and date, 2) Executive Summary highlighting key findings, 3) Market Overview showing the current AI healthcare market size and growth trends, 4) Technology Applications including AI in diagnostics, drug discovery, and patient care, 5) Case Studies showcasing 3-4 successful implementations with metrics, 6) Competitive Landscape analyzing major players and their solutions, 7) Implementation Roadmap with timeline and milestones, 8) ROI Analysis with projected costs and benefits, 9) Risk Assessment and mitigation strategies, 10) Recommendations and next steps. Please make it professional with approximately 15-20 slides, use clear bullet points, include suggested visual elements for each slide, and save it as a PPTX file with a modern business layout."
4
+ agents:
5
+ - id: "text_to_file_generation_tool1"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-4o"
9
+ cwd: "workspace1"
10
+ enable_file_generation: true
11
+ custom_tools:
12
+ - name: ["text_to_file_generation"]
13
+ category: "multimodal"
14
+ path: "massgen/tool/_multimodal_tools/text_to_file_generation.py"
15
+ function: ["text_to_file_generation"]
16
+ - name: ["understand_file"]
17
+ category: "multimodal"
18
+ path: "massgen/tool/_multimodal_tools/understand_file.py"
19
+ function: ["understand_file"]
20
+ system_message: |
21
+ You are an AI assistant with access to text-to-file generation capabilities.
22
+
23
+ When generating PPTX presentations, format your content with:
24
+ - Use "# Title" or "## Title" for slide titles
25
+ - Use "---" to separate slides
26
+ - Use "- Item" for bullet points
27
+ - Use " - Subitem" for sub-bullets (two spaces indent)
28
+ - Structure content in a slide-friendly format with clear, concise points
29
+
30
+ - id: "text_to_file_generation_tool2"
31
+ backend:
32
+ type: "openai"
33
+ model: "gpt-4o"
34
+ cwd: "workspace2"
35
+ enable_file_generation: true
36
+ custom_tools:
37
+ - name: ["text_to_file_generation"]
38
+ category: "multimodal"
39
+ path: "massgen/tool/_multimodal_tools/text_to_file_generation.py"
40
+ function: ["text_to_file_generation"]
41
+ - name: ["understand_file"]
42
+ category: "multimodal"
43
+ path: "massgen/tool/_multimodal_tools/understand_file.py"
44
+ function: ["understand_file"]
45
+ system_message: |
46
+ You are an AI assistant with access to text-to-file generation capabilities.
47
+
48
+ When generating PPTX presentations, format your content with:
49
+ - Use "# Title" or "## Title" for slide titles
50
+ - Use "---" to separate slides
51
+ - Use "- Item" for bullet points
52
+ - Use " - Subitem" for sub-bullets (two spaces indent)
53
+ - Structure content in a slide-friendly format with clear, concise points
54
+
55
+ orchestrator:
56
+ snapshot_storage: "snapshots"
57
+ agent_temporary_workspace: "temp_workspaces"
58
+
59
+ ui:
60
+ display_type: "rich_terminal"
61
+ logging_enabled: true
@@ -0,0 +1,29 @@
1
+ # MassGen Configuration: Text to File Generation Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_single.yaml "Please generate a comprehensive technical report about the latest developments in Large Language Models (LLMs) and Generative AI. The report should include the following sections: 1) Executive Summary, 2) Introduction to LLMs and their architecture, 3) Recent breakthroughs in 2024-2025, 4) Applications in industry including healthcare, finance, and education, 5) Ethical considerations and limitations, 6) Future directions and research opportunities. Please make the report approximately 10-15 pages long with proper citations and references, and save it as a PDF file with a professional layout."
4
+ agents:
5
+ - id: "text_to_file_generation_tool"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-4o"
9
+ cwd: "workspace1"
10
+ enable_file_generation: true
11
+ custom_tools:
12
+ - name: ["text_to_file_generation"]
13
+ category: "multimodal"
14
+ path: "massgen/tool/_multimodal_tools/text_to_file_generation.py"
15
+ function: ["text_to_file_generation"]
16
+ - name: ["understand_file"]
17
+ category: "multimodal"
18
+ path: "massgen/tool/_multimodal_tools/understand_file.py"
19
+ function: ["understand_file"]
20
+ system_message: |
21
+ You are an AI assistant with access to text-to-file generation capabilities.
22
+
23
+ orchestrator:
24
+ snapshot_storage: "snapshots"
25
+ agent_temporary_workspace: "temp_workspaces"
26
+
27
+ ui:
28
+ display_type: "simple"
29
+ logging_enabled: true
@@ -0,0 +1,51 @@
1
+ # MassGen Configuration: Text to Image Generation Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_multi.yaml "Please generate an image of a cat in space."
4
+ agents:
5
+ - id: "text_to_image_generation_tool1"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-4o"
9
+ cwd: "workspace1"
10
+ enable_image_generation: true
11
+ custom_tools:
12
+ - name: ["text_to_image_generation"]
13
+ category: "multimodal"
14
+ path: "massgen/tool/_multimodal_tools/text_to_image_generation.py"
15
+ function: ["text_to_image_generation"]
16
+ - name: ["understand_image"]
17
+ category: "multimodal"
18
+ path: "massgen/tool/_multimodal_tools/understand_image.py"
19
+ function: ["understand_image"]
20
+ - name: ["image_to_image_generation"]
21
+ category: "multimodal"
22
+ path: "massgen/tool/_multimodal_tools/image_to_image_generation.py"
23
+ function: ["image_to_image_generation"]
24
+ system_message: |
25
+ You are an AI assistant with access to text-to-image generation capabilities.
26
+
27
+ - id: "text_to_image_generation_tool2"
28
+ backend:
29
+ type: "openai"
30
+ model: "gpt-4o"
31
+ cwd: "workspace2"
32
+ enable_image_generation: true
33
+ custom_tools:
34
+ - name: ["text_to_image_generation"]
35
+ category: "multimodal"
36
+ path: "massgen/tool/_multimodal_tools/text_to_image_generation.py"
37
+ function: ["text_to_image_generation"]
38
+ - name: ["understand_image"]
39
+ category: "multimodal"
40
+ path: "massgen/tool/_multimodal_tools/understand_image.py"
41
+ function: ["understand_image"]
42
+ system_message: |
43
+ You are an AI assistant with access to text-to-image generation capabilities.
44
+
45
+ orchestrator:
46
+ snapshot_storage: "snapshots"
47
+ agent_temporary_workspace: "temp_workspaces"
48
+
49
+ ui:
50
+ display_type: "rich_terminal"
51
+ logging_enabled: true
@@ -0,0 +1,33 @@
1
+ # MassGen Configuration: Text to Image Generation Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_single.yaml "Please generate an image of a cat in space."
4
+ agents:
5
+ - id: "text_to_image_generation_tool"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-4o"
9
+ cwd: "workspace1"
10
+ enable_image_generation: true
11
+ custom_tools:
12
+ - name: ["text_to_image_generation"]
13
+ category: "multimodal"
14
+ path: "massgen/tool/_multimodal_tools/text_to_image_generation.py"
15
+ function: ["text_to_image_generation"]
16
+ - name: ["understand_image"]
17
+ category: "multimodal"
18
+ path: "massgen/tool/_multimodal_tools/understand_image.py"
19
+ function: ["understand_image"]
20
+ - name: ["image_to_image_generation"]
21
+ category: "multimodal"
22
+ path: "massgen/tool/_multimodal_tools/image_to_image_generation.py"
23
+ function: ["image_to_image_generation"]
24
+ system_message: |
25
+ You are an AI assistant with access to text-to-image generation capabilities.
26
+
27
+ orchestrator:
28
+ snapshot_storage: "snapshots"
29
+ agent_temporary_workspace: "temp_workspaces"
30
+
31
+ ui:
32
+ display_type: "simple"
33
+ logging_enabled: true
@@ -0,0 +1,55 @@
1
+ # MassGen Configuration: Text to Speech Continue Generation Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_multi.yaml "I want you to give me a very short introduction to Sherlock Holmes in one sentence, and I want you to read it out loud in an emotional voice."
4
+ agents:
5
+ - id: "text_to_speech_continue_generation_tool1"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-4o"
9
+ cwd: "workspace1"
10
+ enable_audio_generation: true
11
+ custom_tools:
12
+ - name: ["text_to_speech_transcription_generation"]
13
+ category: "multimodal"
14
+ path: "massgen/tool/_multimodal_tools/text_to_speech_transcription_generation.py"
15
+ function: ["text_to_speech_transcription_generation"]
16
+ - name: ["understand_audio"]
17
+ category: "multimodal"
18
+ path: "massgen/tool/_multimodal_tools/understand_audio.py"
19
+ function: ["understand_audio"]
20
+ - name: ["text_to_speech_continue_generation"]
21
+ category: "multimodal"
22
+ path: "massgen/tool/_multimodal_tools/text_to_speech_continue_generation.py"
23
+ function: ["text_to_speech_continue_generation"]
24
+ system_message: |
25
+ You are an AI assistant with access to text-to-speech generation capabilities.
26
+
27
+ - id: "text_to_speech_continue_generation_tool2"
28
+ backend:
29
+ type: "openai"
30
+ model: "gpt-4o"
31
+ cwd: "workspace2"
32
+ enable_audio_generation: true
33
+ custom_tools:
34
+ - name: ["text_to_speech_transcription_generation"]
35
+ category: "multimodal"
36
+ path: "massgen/tool/_multimodal_tools/text_to_speech_transcription_generation.py"
37
+ function: ["text_to_speech_transcription_generation"]
38
+ - name: ["understand_audio"]
39
+ category: "multimodal"
40
+ path: "massgen/tool/_multimodal_tools/understand_audio.py"
41
+ function: ["understand_audio"]
42
+ - name: ["text_to_speech_continue_generation"]
43
+ category: "multimodal"
44
+ path: "massgen/tool/_multimodal_tools/text_to_speech_continue_generation.py"
45
+ function: ["text_to_speech_continue_generation"]
46
+ system_message: |
47
+ You are an AI assistant with access to text-to-speech generation capabilities.
48
+
49
+ orchestrator:
50
+ snapshot_storage: "snapshots"
51
+ agent_temporary_workspace: "temp_workspaces"
52
+
53
+ ui:
54
+ display_type: "rich_terminal"
55
+ logging_enabled: true
@@ -0,0 +1,33 @@
1
+ # MassGen Configuration: Text to Speech Continue Generation Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_single.yaml "I want you to give me a very short introduction to Sherlock Holmes in one sentence, and I want you to read it out loud in an emotional voice."
4
+ agents:
5
+ - id: "text_to_speech_continue_generation_tool"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-4o"
9
+ cwd: "workspace1"
10
+ enable_audio_generation: true
11
+ custom_tools:
12
+ - name: ["text_to_speech_transcription_generation"]
13
+ category: "multimodal"
14
+ path: "massgen/tool/_multimodal_tools/text_to_speech_transcription_generation.py"
15
+ function: ["text_to_speech_transcription_generation"]
16
+ - name: ["understand_audio"]
17
+ category: "multimodal"
18
+ path: "massgen/tool/_multimodal_tools/understand_audio.py"
19
+ function: ["understand_audio"]
20
+ - name: ["text_to_speech_continue_generation"]
21
+ category: "multimodal"
22
+ path: "massgen/tool/_multimodal_tools/text_to_speech_continue_generation.py"
23
+ function: ["text_to_speech_continue_generation"]
24
+ system_message: |
25
+ You are an AI assistant with access to text-to-speech generation capabilities.
26
+
27
+ orchestrator:
28
+ snapshot_storage: "snapshots"
29
+ agent_temporary_workspace: "temp_workspaces"
30
+
31
+ ui:
32
+ display_type: "simple"
33
+ logging_enabled: true
@@ -0,0 +1,47 @@
1
+ # MassGen Configuration: Text to Video Generation Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_multi.yaml "Generate a 4-second video of a neon-lit alley at night, light rain, slow push-in, cinematic."
4
+ agents:
5
+ - id: "text_to_video_generation_tool1"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-4o"
9
+ cwd: "workspace1"
10
+ enable_video_generation: true
11
+ custom_tools:
12
+ - name: ["understand_video"]
13
+ category: "multimodal"
14
+ path: "massgen/tool/_multimodal_tools/understand_video.py"
15
+ function: ["understand_video"]
16
+ - name: ["text_to_video_generation"]
17
+ category: "multimodal"
18
+ path: "massgen/tool/_multimodal_tools/text_to_video_generation.py"
19
+ function: ["text_to_video_generation"]
20
+ system_message: |
21
+ You are an AI assistant with access to text-to-video generation capabilities.
22
+
23
+ - id: "text_to_video_generation_tool2"
24
+ backend:
25
+ type: "openai"
26
+ model: "gpt-4o"
27
+ cwd: "workspace2"
28
+ enable_video_generation: true
29
+ custom_tools:
30
+ - name: ["understand_video"]
31
+ category: "multimodal"
32
+ path: "massgen/tool/_multimodal_tools/understand_video.py"
33
+ function: ["understand_video"]
34
+ - name: ["text_to_video_generation"]
35
+ category: "multimodal"
36
+ path: "massgen/tool/_multimodal_tools/text_to_video_generation.py"
37
+ function: ["text_to_video_generation"]
38
+ system_message: |
39
+ You are an AI assistant with access to text-to-video generation capabilities.
40
+
41
+ orchestrator:
42
+ snapshot_storage: "snapshots"
43
+ agent_temporary_workspace: "temp_workspaces"
44
+
45
+ ui:
46
+ display_type: "rich_terminal"
47
+ logging_enabled: true
@@ -0,0 +1,29 @@
1
+ # MassGen Configuration: Text to Video Generation Tool
2
+ # Usage:
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_single.yaml "Generate a 4-second video of a neon-lit alley at night, light rain, slow push-in, cinematic."
4
+ agents:
5
+ - id: "text_to_video_generation_tool"
6
+ backend:
7
+ type: "openai"
8
+ model: "gpt-4o"
9
+ cwd: "workspace1"
10
+ enable_video_generation: true
11
+ custom_tools:
12
+ - name: ["understand_video"]
13
+ category: "multimodal"
14
+ path: "massgen/tool/_multimodal_tools/understand_video.py"
15
+ function: ["understand_video"]
16
+ - name: ["text_to_video_generation"]
17
+ category: "multimodal"
18
+ path: "massgen/tool/_multimodal_tools/text_to_video_generation.py"
19
+ function: ["text_to_video_generation"]
20
+ system_message: |
21
+ You are an AI assistant with access to text-to-video generation capabilities.
22
+
23
+ orchestrator:
24
+ snapshot_storage: "snapshots"
25
+ agent_temporary_workspace: "temp_workspaces"
26
+
27
+ ui:
28
+ display_type: "simple"
29
+ logging_enabled: true
@@ -1,6 +1,6 @@
1
1
  # MassGen Configuration: Understand Audio Tool
2
2
  # Usage:
3
- # massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml "Please summarize the content in this audio."
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml "Please summarize the content in this audio."
4
4
  agents:
5
5
  - id: "understand_audio_tool"
6
6
  backend:
@@ -1,6 +1,6 @@
1
1
  # MassGen Configuration: Understand File Tool
2
2
  # Usage:
3
- # massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml "Please summarize the content in this file."
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml "Please summarize the content in this file."
4
4
  agents:
5
5
  - id: "understand_file_tool"
6
6
  backend:
@@ -1,6 +1,6 @@
1
1
  # MassGen Configuration: Understand Image Tool
2
2
  # Usage:
3
- # massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml "Please summarize the content in this image."
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml "Please summarize the content in this image."
4
4
  agents:
5
5
  - id: "understand_image_tool"
6
6
  backend:
@@ -1,6 +1,6 @@
1
1
  # MassGen Configuration: Understand Video Tool
2
2
  # Usage:
3
- # massgen --config massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml "What's happening in this video?"
3
+ # uv run python -m massgen.cli --config massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml "What's happening in this video?"
4
4
  agents:
5
5
  - id: "understand_video_tool"
6
6
  backend:
@@ -51,7 +51,7 @@ orchestrator:
51
51
  snapshot_storage: "snapshots"
52
52
  agent_temporary_workspace: "temp_workspaces"
53
53
  context_paths:
54
- - path: "docs/case_studies"
54
+ - path: "docs/source/examples/case_studies"
55
55
  permission: "read"
56
56
 
57
57
  ui:
@@ -57,6 +57,7 @@ class FilesystemManager:
57
57
  command_line_docker_network_mode: str = "none",
58
58
  command_line_docker_enable_sudo: bool = False,
59
59
  enable_audio_generation: bool = False,
60
+ enable_file_generation: bool = False,
60
61
  ):
61
62
  """
62
63
  Initialize FilesystemManager.
@@ -90,6 +90,68 @@ class PathPermissionManager:
90
90
  "massgen_logs",
91
91
  ]
92
92
 
93
+ # Binary file extensions that should not be read by text-based tools
94
+ # These files should be handled by specialized tools (understand_image, understand_video, etc.)
95
+ BINARY_FILE_EXTENSIONS = {
96
+ # Images
97
+ ".jpg",
98
+ ".jpeg",
99
+ ".png",
100
+ ".gif",
101
+ ".bmp",
102
+ ".ico",
103
+ ".svg",
104
+ ".webp",
105
+ ".tiff",
106
+ ".tif",
107
+ # Videos
108
+ ".mp4",
109
+ ".avi",
110
+ ".mov",
111
+ ".mkv",
112
+ ".flv",
113
+ ".wmv",
114
+ ".webm",
115
+ ".m4v",
116
+ ".mpg",
117
+ ".mpeg",
118
+ # Audio
119
+ ".mp3",
120
+ ".wav",
121
+ ".ogg",
122
+ ".flac",
123
+ ".aac",
124
+ ".m4a",
125
+ ".wma",
126
+ # Archives
127
+ ".zip",
128
+ ".tar",
129
+ ".gz",
130
+ ".bz2",
131
+ ".7z",
132
+ ".rar",
133
+ ".xz",
134
+ # Executables and binaries
135
+ ".exe",
136
+ ".bin",
137
+ ".dll",
138
+ ".so",
139
+ ".dylib",
140
+ ".o",
141
+ ".a",
142
+ ".pyc",
143
+ ".class",
144
+ ".jar",
145
+ # Office documents and PDFs (binary formats - use the understand_file tool where supported; see per-extension notes)
146
+ ".doc", # Old Word (not supported by understand_file)
147
+ ".xls", # Old Excel (not supported by understand_file)
148
+ ".ppt", # Old PowerPoint (not supported by understand_file)
149
+ ".pdf", # PDF (supported by understand_file with PyPDF2)
150
+ ".docx", # Word (supported by understand_file with python-docx)
151
+ ".xlsx", # Excel (supported by understand_file with openpyxl)
152
+ ".pptx", # PowerPoint (supported by understand_file with python-pptx)
153
+ }
154
+
93
155
  def __init__(
94
156
  self,
95
157
  context_write_access_enabled: bool = False,
@@ -440,6 +502,12 @@ class PathPermissionManager:
440
502
  - allowed: Whether the tool call should proceed
441
503
  - reason: Explanation if blocked (None if allowed)
442
504
  """
505
+ # Check if read tool is trying to read binary files (images, videos, etc.)
506
+ if self._is_text_read_tool(tool_name):
507
+ binary_check_result = self._validate_binary_file_access(tool_name, tool_args)
508
+ if not binary_check_result[0]:
509
+ return binary_check_result
510
+
443
511
  # Track read operations for read-before-delete enforcement
444
512
  if self._is_read_tool(tool_name):
445
513
  self._track_read_operation(tool_name, tool_args)
@@ -495,6 +563,33 @@ class PathPermissionManager:
495
563
 
496
564
  return False
497
565
 
566
+ def _is_text_read_tool(self, tool_name: str) -> bool:
567
+ """
568
+ Check if a tool is a text-based read operation that should not access binary files.
569
+
570
+ These tools are designed for reading text files and should be blocked from
571
+ reading binary files (images, videos, audio, etc.) to prevent context pollution.
572
+
573
+ Tools that read text file contents:
574
+ - Read: Claude Code read tool
575
+ - read_text_file: MCP filesystem read tool
576
+ - read_file: Generic read operations
577
+ """
578
+ # Use lowercase for case-insensitive matching
579
+ tool_lower = tool_name.lower()
580
+
581
+ # Check if tool name contains any text read operation keywords
582
+ text_read_keywords = [
583
+ "read_text_file", # MCP filesystem: read_text_file
584
+ "read_file", # Generic read operations
585
+ ]
586
+
587
+ # Also check for exact "Read" match (Claude Code tool)
588
+ if tool_name == "Read":
589
+ return True
590
+
591
+ return any(keyword in tool_lower for keyword in text_read_keywords)
592
+
498
593
  def _is_read_tool(self, tool_name: str) -> bool:
499
594
  """
500
595
  Check if a tool is a read operation that should be tracked.
@@ -518,6 +613,59 @@ class PathPermissionManager:
518
613
 
519
614
  return any(keyword in tool_lower for keyword in read_keywords)
520
615
 
616
+ def _validate_binary_file_access(self, tool_name: str, tool_args: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
617
+ """
618
+ Validate that text-based read tools are not trying to read binary files.
619
+
620
+ Binary files (images, videos, audio, etc.) should be handled by specialized tools
621
+ to prevent context pollution with binary data.
622
+
623
+ Args:
624
+ tool_name: Name of the tool being called
625
+ tool_args: Arguments passed to the tool
626
+
627
+ Returns:
628
+ Tuple of (allowed: bool, reason: Optional[str])
629
+ - allowed: False if trying to read binary file, True otherwise
630
+ - reason: Explanation if blocked (None if allowed)
631
+ """
632
+ # Extract file path from arguments
633
+ file_path = self._extract_file_path(tool_args)
634
+ if not file_path:
635
+ # Can't determine path - allow (tool may not access files)
636
+ return (True, None)
637
+
638
+ # Resolve path
639
+ try:
640
+ file_path_str = self._resolve_path_against_workspace(file_path)
641
+ path = Path(file_path_str)
642
+ except Exception:
643
+ # If path resolution fails, allow (will fail elsewhere if invalid)
644
+ return (True, None)
645
+
646
+ # Check file extension
647
+ file_extension = path.suffix.lower()
648
+ if file_extension in self.BINARY_FILE_EXTENSIONS:
649
+ # Determine appropriate tool suggestion based on file type
650
+ if file_extension in {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".ico", ".svg", ".webp", ".tiff", ".tif"}:
651
+ suggestion = "For images, use understand_image tool"
652
+ elif file_extension in {".mp4", ".avi", ".mov", ".mkv", ".flv", ".wmv", ".webm", ".m4v", ".mpg", ".mpeg"}:
653
+ suggestion = "For videos, use understand_video tool"
654
+ elif file_extension in {".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".wma"}:
655
+ suggestion = "For audio files, use generate_text_with_input_audio tool"
656
+ elif file_extension in {".pdf"}:
657
+ suggestion = "For PDF files, use understand_file tool"
658
+ elif file_extension in {".docx", ".xlsx", ".pptx"}:
659
+ suggestion = "For Office documents, use understand_file tool"
660
+ else:
661
+ suggestion = "Use appropriate specialized tool for this file type"
662
+
663
+ reason = f"Cannot read binary file '{path.name}' with {tool_name}. {suggestion}."
664
+ logger.warning(f"[PathPermissionManager] Blocked {tool_name} from reading binary file: {path}")
665
+ return (False, reason)
666
+
667
+ return (True, None)
668
+
521
669
  def _is_delete_tool(self, tool_name: str) -> bool:
522
670
  """
523
671
  Check if a tool is a delete operation.