massgen 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of massgen might be problematic. Click here for more details.
- massgen/__init__.py +1 -1
- massgen/agent_config.py +33 -7
- massgen/api_params_handler/_api_params_handler_base.py +3 -0
- massgen/api_params_handler/_chat_completions_api_params_handler.py +4 -0
- massgen/api_params_handler/_claude_api_params_handler.py +4 -0
- massgen/api_params_handler/_gemini_api_params_handler.py +4 -0
- massgen/api_params_handler/_response_api_params_handler.py +4 -0
- massgen/backend/azure_openai.py +9 -1
- massgen/backend/base.py +4 -0
- massgen/backend/base_with_custom_tool_and_mcp.py +25 -5
- massgen/backend/claude_code.py +9 -1
- massgen/backend/docs/permissions_and_context_files.md +2 -2
- massgen/backend/gemini.py +35 -6
- massgen/backend/gemini_utils.py +30 -0
- massgen/backend/response.py +2 -0
- massgen/chat_agent.py +9 -3
- massgen/cli.py +291 -43
- massgen/config_builder.py +163 -18
- massgen/configs/README.md +69 -14
- massgen/configs/debug/restart_test_controlled.yaml +60 -0
- massgen/configs/debug/restart_test_controlled_filesystem.yaml +73 -0
- massgen/configs/tools/code-execution/docker_with_sudo.yaml +35 -0
- massgen/configs/tools/custom_tools/computer_use_browser_example.yaml +56 -0
- massgen/configs/tools/custom_tools/computer_use_docker_example.yaml +65 -0
- massgen/configs/tools/custom_tools/computer_use_example.yaml +50 -0
- massgen/configs/tools/custom_tools/crawl4ai_example.yaml +55 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_multi.yaml +61 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_single.yaml +29 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_multi.yaml +51 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_single.yaml +33 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_multi.yaml +55 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_single.yaml +33 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_multi.yaml +47 -0
- massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_single.yaml +29 -0
- massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml +33 -0
- massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml +34 -0
- massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml +33 -0
- massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml +34 -0
- massgen/configs/tools/custom_tools/multimodal_tools/youtube_video_analysis.yaml +59 -0
- massgen/docker/README.md +83 -0
- massgen/filesystem_manager/_code_execution_server.py +22 -7
- massgen/filesystem_manager/_docker_manager.py +21 -1
- massgen/filesystem_manager/_filesystem_manager.py +9 -0
- massgen/filesystem_manager/_path_permission_manager.py +148 -0
- massgen/filesystem_manager/_workspace_tools_server.py +0 -997
- massgen/formatter/_gemini_formatter.py +73 -0
- massgen/frontend/coordination_ui.py +175 -257
- massgen/frontend/displays/base_display.py +29 -0
- massgen/frontend/displays/rich_terminal_display.py +155 -9
- massgen/frontend/displays/simple_display.py +21 -0
- massgen/frontend/displays/terminal_display.py +22 -2
- massgen/logger_config.py +50 -6
- massgen/message_templates.py +283 -15
- massgen/orchestrator.py +335 -38
- massgen/tests/test_binary_file_blocking.py +274 -0
- massgen/tests/test_case_studies.md +12 -12
- massgen/tests/test_code_execution.py +178 -0
- massgen/tests/test_multimodal_size_limits.py +407 -0
- massgen/tests/test_orchestration_restart.py +204 -0
- massgen/tool/__init__.py +4 -0
- massgen/tool/_manager.py +7 -2
- massgen/tool/_multimodal_tools/image_to_image_generation.py +293 -0
- massgen/tool/_multimodal_tools/text_to_file_generation.py +455 -0
- massgen/tool/_multimodal_tools/text_to_image_generation.py +222 -0
- massgen/tool/_multimodal_tools/text_to_speech_continue_generation.py +226 -0
- massgen/tool/_multimodal_tools/text_to_speech_transcription_generation.py +217 -0
- massgen/tool/_multimodal_tools/text_to_video_generation.py +223 -0
- massgen/tool/_multimodal_tools/understand_audio.py +211 -0
- massgen/tool/_multimodal_tools/understand_file.py +555 -0
- massgen/tool/_multimodal_tools/understand_image.py +316 -0
- massgen/tool/_multimodal_tools/understand_video.py +340 -0
- massgen/tool/_web_tools/crawl4ai_tool.py +718 -0
- massgen/tool/docs/multimodal_tools.md +1368 -0
- massgen/tool/workflow_toolkits/__init__.py +26 -0
- massgen/tool/workflow_toolkits/post_evaluation.py +216 -0
- massgen/utils.py +1 -0
- {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/METADATA +101 -69
- {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/RECORD +82 -46
- {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/WHEEL +0 -0
- {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/entry_points.txt +0 -0
- {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Tests for binary file blocking in PathPermissionManager.
|
|
5
|
+
|
|
6
|
+
These tests ensure that text-based read tools (Read, read_text_file, etc.)
|
|
7
|
+
are blocked from reading binary files (images, videos, audio, etc.) to prevent
|
|
8
|
+
context pollution with binary data.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import pytest
|
|
14
|
+
|
|
15
|
+
from massgen.filesystem_manager._base import Permission
|
|
16
|
+
from massgen.filesystem_manager._path_permission_manager import PathPermissionManager
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@pytest.fixture
def permission_manager():
    """Provide a PathPermissionManager with a single writable workspace.

    The manager is built with context write access disabled and
    read-before-delete enforcement enabled, then ``/tmp/test_workspace``
    (resolved) is registered with WRITE permission under the "workspace" label.
    """
    workspace_root = Path("/tmp/test_workspace").resolve()

    pm = PathPermissionManager(
        context_write_access_enabled=False,
        enforce_read_before_delete=True,
    )
    # Register the workspace that every test path in this module lives under.
    pm.add_path(workspace_root, Permission.WRITE, "workspace")
    return pm
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class TestBinaryFileBlocking:
    """Test suite for binary file blocking functionality.

    Every test calls ``permission_manager.pre_tool_use_hook(tool_name, tool_args)``
    and inspects the ``(allowed, reason)`` pair it returns. All paths live under
    ``/tmp/test_workspace``, the workspace registered by the ``permission_manager``
    fixture.
    """

    @staticmethod
    async def _try_read(permission_manager, file_path):
        """Ask the hook whether the ``Read`` tool may open *file_path*.

        Args:
            permission_manager: Manager supplied by the fixture.
            file_path: Path passed to the tool as its ``file_path`` argument.

        Returns:
            The ``(allowed, reason)`` pair produced by ``pre_tool_use_hook``.
        """
        return await permission_manager.pre_tool_use_hook("Read", {"file_path": file_path})

    async def _assert_extensions_blocked(self, permission_manager, stem, extensions):
        """Assert that ``Read`` is refused for ``<stem><ext>`` for every extension.

        Args:
            permission_manager: Manager supplied by the fixture.
            stem: File name without extension, placed under the test workspace.
            extensions: Iterable of extensions, each including the leading dot.
        """
        for ext in extensions:
            allowed, reason = await self._try_read(permission_manager, f"/tmp/test_workspace/{stem}{ext}")

            assert not allowed, f"Read should be blocked from reading {ext} files"
            # Name the extension so a failure inside the loop is attributable.
            assert reason is not None, f"Blocking {ext} files should come with a reason"

    @pytest.mark.asyncio
    async def test_block_read_image_with_read_tool(self, permission_manager):
        """Test that Read tool is blocked from reading image files."""
        allowed, reason = await self._try_read(permission_manager, "/tmp/test_workspace/photo.jpg")

        assert not allowed, "Read should be blocked from reading .jpg files"
        assert reason is not None
        assert "understand_image" in reason.lower()
        assert "photo.jpg" in reason

    @pytest.mark.asyncio
    async def test_block_read_text_file_image(self, permission_manager):
        """Test that read_text_file (MCP) is blocked from reading image files."""
        # The MCP tool passes the target as "path" rather than "file_path".
        tool_name = "mcp__filesystem__read_text_file"
        tool_args = {"path": "/tmp/test_workspace/diagram.png"}

        allowed, reason = await permission_manager.pre_tool_use_hook(tool_name, tool_args)

        assert not allowed, "read_text_file should be blocked from reading .png files"
        assert reason is not None
        assert "understand_image" in reason.lower()

    @pytest.mark.asyncio
    async def test_block_read_video(self, permission_manager):
        """Test that Read tool is blocked from reading video files."""
        allowed, reason = await self._try_read(permission_manager, "/tmp/test_workspace/demo.mp4")

        assert not allowed, "Read should be blocked from reading .mp4 files"
        assert reason is not None
        assert "understand_video" in reason.lower()

    @pytest.mark.asyncio
    async def test_block_read_audio(self, permission_manager):
        """Test that Read tool is blocked from reading audio files."""
        allowed, reason = await self._try_read(permission_manager, "/tmp/test_workspace/recording.mp3")

        assert not allowed, "Read should be blocked from reading .mp3 files"
        assert reason is not None
        assert "audio" in reason.lower()

    @pytest.mark.asyncio
    async def test_allow_read_text_file(self, permission_manager):
        """Test that Read tool is allowed to read text files."""
        allowed, reason = await self._try_read(permission_manager, "/tmp/test_workspace/document.txt")

        assert allowed, "Read should be allowed to read .txt files"
        assert reason is None

    @pytest.mark.asyncio
    async def test_allow_read_code_file(self, permission_manager):
        """Test that Read tool is allowed to read code files."""
        test_cases = [
            "script.py",
            "app.js",
            "component.tsx",
            "main.go",
            "app.rs",
        ]

        for filename in test_cases:
            allowed, reason = await self._try_read(permission_manager, f"/tmp/test_workspace/{filename}")

            assert allowed, f"Read should be allowed to read {filename}"
            assert reason is None, f"Allowed read of {filename} should not carry a reason"

    @pytest.mark.asyncio
    async def test_block_all_image_formats(self, permission_manager):
        """Test that all image formats are blocked."""
        image_extensions = [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".ico", ".svg", ".webp", ".tiff", ".tif"]

        await self._assert_extensions_blocked(permission_manager, "image", image_extensions)

    @pytest.mark.asyncio
    async def test_block_all_video_formats(self, permission_manager):
        """Test that all video formats are blocked."""
        video_extensions = [".mp4", ".avi", ".mov", ".mkv", ".flv", ".wmv", ".webm", ".m4v", ".mpg", ".mpeg"]

        await self._assert_extensions_blocked(permission_manager, "video", video_extensions)

    @pytest.mark.asyncio
    async def test_block_all_audio_formats(self, permission_manager):
        """Test that all audio formats are blocked."""
        audio_extensions = [".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".wma"]

        await self._assert_extensions_blocked(permission_manager, "audio", audio_extensions)

    @pytest.mark.asyncio
    async def test_block_archive_formats(self, permission_manager):
        """Test that archive formats are blocked."""
        archive_extensions = [".zip", ".tar", ".gz", ".bz2", ".7z", ".rar", ".xz"]

        await self._assert_extensions_blocked(permission_manager, "archive", archive_extensions)

    @pytest.mark.asyncio
    async def test_block_executable_formats(self, permission_manager):
        """Test that executable/binary formats are blocked."""
        binary_extensions = [".exe", ".bin", ".dll", ".so", ".dylib", ".o", ".a", ".pyc", ".class", ".jar"]

        await self._assert_extensions_blocked(permission_manager, "binary", binary_extensions)

    @pytest.mark.asyncio
    async def test_block_old_office_formats(self, permission_manager):
        """Test that old Office formats are blocked (use understand_file instead)."""
        old_office_extensions = [".doc", ".xls", ".ppt"]

        await self._assert_extensions_blocked(permission_manager, "document", old_office_extensions)

    @pytest.mark.asyncio
    async def test_block_office_formats(self, permission_manager):
        """Test that Office document formats are blocked from Read (must use understand_file).

        These are binary formats that should be handled by understand_file tool,
        which can properly extract text from them using specialized libraries.
        """
        office_extensions = [".pdf", ".docx", ".xlsx", ".pptx"]

        for ext in office_extensions:
            allowed, reason = await self._try_read(permission_manager, f"/tmp/test_workspace/document{ext}")

            assert not allowed, f"Read should be blocked from reading {ext} files (use understand_file)"
            assert reason is not None, f"Blocking {ext} files should come with a reason"
            assert "understand_file" in reason.lower()

    @pytest.mark.asyncio
    async def test_case_insensitive_extension_check(self, permission_manager):
        """Test that extension checking is case-insensitive."""
        test_cases = [
            "/tmp/test_workspace/PHOTO.JPG",
            "/tmp/test_workspace/Video.MP4",
            "/tmp/test_workspace/Audio.MP3",
            "/tmp/test_workspace/Image.PNG",
        ]

        for file_path in test_cases:
            allowed, reason = await self._try_read(permission_manager, file_path)

            assert not allowed, f"Read should be blocked from reading {file_path} (case-insensitive)"
            assert reason is not None, f"Blocking {file_path} should come with a reason"

    @pytest.mark.asyncio
    async def test_non_text_read_tools_not_affected(self, permission_manager):
        """Test that non-text-read tools are not affected by binary file blocking.

        Write/Edit may still be rejected for unrelated reasons (e.g. permissions),
        so a rejection alone is not a failure. A rejection only fails the test when
        its reason carries the suggestion that the Read tests in this class expect
        from the binary-file check for that file type.
        """
        # (tool, args, text a binary-file rejection would put in the reason)
        test_cases = [
            ("Write", {"file_path": "/tmp/test_workspace/image.jpg"}, "understand_image"),
            ("Edit", {"file_path": "/tmp/test_workspace/video.mp4"}, "understand_video"),
            ("Grep", {"pattern": "test"}, None),  # No file_path, should pass
        ]

        for tool_name, tool_args, binary_marker in test_cases:
            allowed, reason = await permission_manager.pre_tool_use_hook(tool_name, tool_args)

            # The hook must complete and give a plain boolean verdict.
            assert isinstance(allowed, bool)

            if allowed or binary_marker is None or reason is None:
                continue
            assert binary_marker not in reason.lower(), f"{tool_name} should not be rejected by the binary file check: {reason}"

    @pytest.mark.asyncio
    async def test_helpful_error_messages(self, permission_manager):
        """Test that error messages provide helpful suggestions for blocked binary files."""
        test_cases = [
            (".jpg", "understand_image"),
            (".mp4", "understand_video"),
            (".mp3", "audio"),
            (".pdf", "understand_file"),
            (".docx", "understand_file"),
        ]

        for ext, expected_suggestion in test_cases:
            allowed, reason = await self._try_read(permission_manager, f"/tmp/test_workspace/file{ext}")

            assert not allowed, f"File with {ext} extension should be blocked"
            assert reason is not None, f"Blocking {ext} files should come with a reason"
            assert expected_suggestion.lower() in reason.lower(), f"Error message should suggest {expected_suggestion} for {ext} files"
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
# MassGen Case Study Test Commands
|
|
2
2
|
|
|
3
|
-
This document contains commands to test all the case studies from `docs/case_studies/` using the three agents default configuration.
|
|
3
|
+
This document contains commands to test all the case studies from `docs/source/examples/case_studies/` using the three agents default configuration.
|
|
4
4
|
|
|
5
5
|
## Quick Commands
|
|
6
6
|
|
|
7
7
|
All tests use the `three_agents_default.yaml` configuration with:
|
|
8
8
|
- **Gemini 2.5 Flash** (web search enabled)
|
|
9
|
-
- **GPT-4o-mini** (web search + code interpreter)
|
|
9
|
+
- **GPT-4o-mini** (web search + code interpreter)
|
|
10
10
|
- **Grok 3 mini** (web search with citations)
|
|
11
11
|
|
|
12
12
|
### 1. Collaborative Creative Writing
|
|
@@ -17,8 +17,8 @@ python massgen/cli.py --config massgen/configs/three_agents_default.yaml "Write
|
|
|
17
17
|
# From tests directory:
|
|
18
18
|
python ../cli.py --config ../configs/three_agents_default.yaml "Write a short story about a robot who discovers music."
|
|
19
19
|
```
|
|
20
|
-
**Original:** gpt-4o, gemini-2.5-flash, grok-3-mini
|
|
21
|
-
**Current:** gemini2.5flash, 4omini, grok3mini with builtin tools
|
|
20
|
+
**Original:** gpt-4o, gemini-2.5-flash, grok-3-mini
|
|
21
|
+
**Current:** gemini2.5flash, 4omini, grok3mini with builtin tools
|
|
22
22
|
|
|
23
23
|
### 2. AI News Synthesis
|
|
24
24
|
```bash
|
|
@@ -28,8 +28,8 @@ python massgen/cli.py --config massgen/configs/three_agents_default.yaml "find b
|
|
|
28
28
|
# From tests directory:
|
|
29
29
|
python ../cli.py --config ../configs/three_agents_default.yaml "find big AI news this week"
|
|
30
30
|
```
|
|
31
|
-
**Original:** gpt-4.1, gemini-2.5-flash, grok-3-mini
|
|
32
|
-
**Current:** gemini2.5flash, 4omini, grok3mini with web search
|
|
31
|
+
**Original:** gpt-4.1, gemini-2.5-flash, grok-3-mini
|
|
32
|
+
**Current:** gemini2.5flash, 4omini, grok3mini with web search
|
|
33
33
|
|
|
34
34
|
### 3. Grok HLE Cost Estimation
|
|
35
35
|
```bash
|
|
@@ -39,8 +39,8 @@ python massgen/cli.py --config massgen/configs/three_agents_default.yaml "How mu
|
|
|
39
39
|
# From tests directory:
|
|
40
40
|
python ../cli.py --config ../configs/three_agents_default.yaml "How much does it cost to run HLE benchmark with Grok-4"
|
|
41
41
|
```
|
|
42
|
-
**Original:** gpt-4o, gemini-2.5-flash, grok-3-mini
|
|
43
|
-
**Current:** gemini2.5flash, 4omini, grok3mini with web search
|
|
42
|
+
**Original:** gpt-4o, gemini-2.5-flash, grok-3-mini
|
|
43
|
+
**Current:** gemini2.5flash, 4omini, grok3mini with web search
|
|
44
44
|
|
|
45
45
|
### 4. IMO 2025 Winner
|
|
46
46
|
```bash
|
|
@@ -50,8 +50,8 @@ python massgen/cli.py --config massgen/configs/three_agents_default.yaml "Which
|
|
|
50
50
|
# From tests directory:
|
|
51
51
|
python ../cli.py --config ../configs/three_agents_default.yaml "Which AI won IMO 2025?"
|
|
52
52
|
```
|
|
53
|
-
**Original:** gemini-2.5-flash, gpt-4.1 (2 agents)
|
|
54
|
-
**Current:** gemini2.5flash, 4omini, grok3mini (3 agents with web search)
|
|
53
|
+
**Original:** gemini-2.5-flash, gpt-4.1 (2 agents)
|
|
54
|
+
**Current:** gemini2.5flash, 4omini, grok3mini (3 agents with web search)
|
|
55
55
|
|
|
56
56
|
### 5. Stockholm Travel Guide
|
|
57
57
|
```bash
|
|
@@ -61,7 +61,7 @@ python massgen/cli.py --config massgen/configs/three_agents_default.yaml "what's
|
|
|
61
61
|
# From tests directory:
|
|
62
62
|
python ../cli.py --config ../configs/three_agents_default.yaml "what's best to do in Stockholm in October 2025"
|
|
63
63
|
```
|
|
64
|
-
**Original:** gemini-2.5-flash, gpt-4o (2 agents)
|
|
64
|
+
**Original:** gemini-2.5-flash, gpt-4o (2 agents)
|
|
65
65
|
**Current:** gemini2.5flash, 4omini, grok3mini with web search for current info
|
|
66
66
|
|
|
67
67
|
## Configuration Details
|
|
@@ -70,7 +70,7 @@ The `three_agents_default.yaml` configuration provides:
|
|
|
70
70
|
|
|
71
71
|
### Agent Capabilities
|
|
72
72
|
- **gemini2.5flash**: Gemini 2.5 Flash with web search
|
|
73
|
-
- **4omini**: GPT-4o-mini with web search + code interpreter
|
|
73
|
+
- **4omini**: GPT-4o-mini with web search + code interpreter
|
|
74
74
|
- **grok3mini**: Grok 3 mini with web search and citations
|
|
75
75
|
|
|
76
76
|
### UI Features
|
|
@@ -154,6 +154,112 @@ class TestCommandSanitization:
|
|
|
154
154
|
_sanitize_command(cmd)
|
|
155
155
|
|
|
156
156
|
|
|
157
|
+
class TestSudoSanitization:
    """Exercise ``_sanitize_command`` with the ``enable_sudo`` flag off and on."""

    @staticmethod
    def _sanitizer():
        """Import ``_sanitize_command`` at call time and return it."""
        from massgen.filesystem_manager._code_execution_server import _sanitize_command

        return _sanitize_command

    def test_sudo_blocked_by_default(self):
        """Commands containing sudo must raise ValueError when enable_sudo=False."""
        sanitize = self._sanitizer()

        rejected = (
            "sudo apt-get update",
            "sudo apt-get install -y ffmpeg",
            "sudo pip install tensorflow",
            "sudo npm install -g typescript",
            "sudo chmod 755 file.txt",
            "echo 'test' && sudo apt update",
        )

        for command in rejected:
            with pytest.raises(ValueError, match="sudo.*not allowed"):
                sanitize(command, enable_sudo=False)

    def test_sudo_allowed_when_enabled(self):
        """The same kind of sudo commands must pass when enable_sudo=True."""
        sanitize = self._sanitizer()

        accepted = (
            "sudo apt-get update",
            "sudo apt-get install -y ffmpeg",
            "sudo pip install tensorflow",
            "sudo npm install -g typescript",
            "sudo chown user:group file.txt",  # chown is expected to pass with sudo enabled
            "sudo chmod 755 file.txt",  # chmod is expected to pass with sudo enabled
        )

        # Any exception here fails the test; nothing is asserted on the return value.
        for command in accepted:
            sanitize(command, enable_sudo=True)

    def test_other_dangerous_patterns_still_blocked_with_sudo(self):
        """Destructive commands must raise ValueError even with enable_sudo=True."""
        sanitize = self._sanitizer()

        # Expected to be rejected no matter how enable_sudo is set.
        always_rejected = (
            "sudo rm -rf /",  # root deletion
            "rm -rf /",
            "dd if=/dev/zero of=/dev/sda",  # dd command
            "sudo dd if=/dev/zero of=/dev/sda",
            ":(){ :|:& };:",  # fork bomb
            "mv file /dev/null",
            "sudo mv file /dev/null",
            "echo test > /dev/sda1",  # writing to disk
        )

        for command in always_rejected:
            with pytest.raises(ValueError, match="dangerous|not allowed"):
                sanitize(command, enable_sudo=True)

    def test_su_chown_chmod_blocked_without_sudo_flag(self):
        """su, chown and chmod must raise ValueError when enable_sudo=False."""
        sanitize = self._sanitizer()

        rejected = (
            "su root",
            "su - postgres",
            "chown root:root file.txt",
            "chmod 777 file.txt",
            "chmod +x script.sh",
        )

        for command in rejected:
            with pytest.raises(ValueError, match="not allowed"):
                sanitize(command, enable_sudo=False)

    def test_su_chown_chmod_allowed_with_sudo_flag(self):
        """su, chown and chmod must pass when enable_sudo=True (Docker sudo mode)."""
        sanitize = self._sanitizer()

        # The flag is meant for Docker sudo mode, where these stay inside the container.
        accepted = (
            "su postgres",
            "chown user:group file.txt",
            "chmod 755 file.txt",
            "chmod +x script.sh",
        )

        # Any exception here fails the test.
        for command in accepted:
            sanitize(command, enable_sudo=True)

    def test_local_mode_blocks_sudo(self):
        """A sudo install command must raise ValueError in local (non-Docker) mode."""
        sanitize = self._sanitizer()

        # Local mode corresponds to enable_sudo=False.
        with pytest.raises(ValueError, match="sudo.*not allowed"):
            sanitize("sudo apt-get install malicious-package", enable_sudo=False)

    def test_docker_sudo_mode_allows_sudo(self):
        """A sudo install command must pass in Docker sudo mode."""
        sanitize = self._sanitizer()

        # Docker sudo mode corresponds to enable_sudo=True; the command runs
        # inside the container.
        sanitize("sudo apt-get install gh", enable_sudo=True)
|
|
261
|
+
|
|
262
|
+
|
|
157
263
|
class TestOutputHandling:
|
|
158
264
|
"""Test output capture and size limits."""
|
|
159
265
|
|
|
@@ -674,6 +780,78 @@ class TestDockerExecution:
|
|
|
674
780
|
# Cleanup
|
|
675
781
|
manager.cleanup("test_context")
|
|
676
782
|
|
|
783
|
+
@pytest.mark.docker
def test_docker_sudo_enabled_image_selection(self):
    """Check which image DockerManager reports for each sudo/image combination.

    Covers three constructions in order: sudo disabled with no image given,
    sudo enabled with no image given, and sudo enabled with an explicit
    custom image.
    """
    from massgen.filesystem_manager._docker_manager import DockerManager

    # (constructor kwargs, expected .image, expected .enable_sudo)
    scenarios = (
        # No image given, sudo off: the regular runtime image is expected.
        ({"enable_sudo": False}, "massgen/mcp-runtime:latest", False),
        # No image given, sudo on: the sudo variant is expected.
        ({"enable_sudo": True}, "massgen/mcp-runtime-sudo:latest", True),
        # Explicit image, sudo on: the explicit image is expected to be kept.
        (
            {"image": "my-custom-image:latest", "enable_sudo": True},
            "my-custom-image:latest",
            True,
        ),
    )

    for ctor_kwargs, expected_image, expected_sudo in scenarios:
        docker_manager = DockerManager(**ctor_kwargs)
        assert docker_manager.image == expected_image
        assert docker_manager.enable_sudo is expected_sudo
|
|
805
|
+
|
|
806
|
+
@pytest.mark.docker
def test_docker_sudo_functionality(self, tmp_path):
    """Test that sudo commands work in sudo-enabled container.

    The test is skipped when ``ensure_image_exists`` raises RuntimeError,
    which is treated as "sudo image not built". Once the container has been
    created, cleanup runs in a ``finally`` block so a failing assertion does
    not leave the ``test_sudo`` container behind.

    Args:
        tmp_path: pytest-provided temporary directory; the workspace handed
            to the container is created inside it.
    """
    from massgen.filesystem_manager._docker_manager import DockerManager

    # Skip if sudo image not built
    manager = DockerManager(enable_sudo=True)
    try:
        manager.ensure_image_exists()
    except RuntimeError:
        pytest.skip("Sudo Docker image not built. Run: bash massgen/docker/build.sh --sudo")

    workspace = tmp_path / "workspace_sudo"
    workspace.mkdir()

    # Create container with sudo enabled
    manager.create_container(
        agent_id="test_sudo",
        workspace_path=workspace,
    )

    try:
        # Test 1: Verify whoami returns 'massgen' (non-root user)
        result_whoami = manager.exec_command(
            agent_id="test_sudo",
            command="whoami",
        )
        assert result_whoami["success"] is True
        assert "massgen" in result_whoami["stdout"]

        # Test 2: Verify sudo whoami returns 'root' (sudo works)
        result_sudo_whoami = manager.exec_command(
            agent_id="test_sudo",
            command="sudo whoami",
        )
        assert result_sudo_whoami["success"] is True
        assert "root" in result_sudo_whoami["stdout"]

        # Test 3: Verify sudo apt-get update runs (package installation capability)
        result_apt = manager.exec_command(
            agent_id="test_sudo",
            command="sudo apt-get update",
            timeout=60,
        )
        # Only require that the command ran and reported an exit code; the
        # update itself may fail when the container has no network access.
        assert result_apt["exit_code"] is not None
    finally:
        # Cleanup, even when one of the assertions above failed.
        manager.cleanup("test_sudo")
|
|
854
|
+
|
|
677
855
|
|
|
678
856
|
if __name__ == "__main__":
|
|
679
857
|
pytest.main([__file__, "-v"])
|