massgen-0.1.2-py3-none-any.whl → massgen-0.1.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. massgen/__init__.py +1 -1
  2. massgen/agent_config.py +33 -7
  3. massgen/api_params_handler/_api_params_handler_base.py +3 -0
  4. massgen/backend/azure_openai.py +9 -1
  5. massgen/backend/base.py +4 -0
  6. massgen/backend/claude_code.py +9 -1
  7. massgen/backend/gemini.py +35 -6
  8. massgen/backend/gemini_utils.py +30 -0
  9. massgen/chat_agent.py +9 -3
  10. massgen/cli.py +291 -43
  11. massgen/config_builder.py +163 -18
  12. massgen/configs/README.md +52 -6
  13. massgen/configs/debug/restart_test_controlled.yaml +60 -0
  14. massgen/configs/debug/restart_test_controlled_filesystem.yaml +73 -0
  15. massgen/configs/tools/code-execution/docker_with_sudo.yaml +35 -0
  16. massgen/configs/tools/custom_tools/computer_use_browser_example.yaml +56 -0
  17. massgen/configs/tools/custom_tools/computer_use_docker_example.yaml +65 -0
  18. massgen/configs/tools/custom_tools/computer_use_example.yaml +50 -0
  19. massgen/configs/tools/custom_tools/crawl4ai_mcp_example.yaml +67 -0
  20. massgen/configs/tools/custom_tools/crawl4ai_multi_agent_example.yaml +68 -0
  21. massgen/configs/tools/custom_tools/multimodal_tools/playwright_with_img_understanding.yaml +98 -0
  22. massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml +33 -0
  23. massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml +34 -0
  24. massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml +33 -0
  25. massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml +34 -0
  26. massgen/configs/tools/custom_tools/multimodal_tools/understand_video_example.yaml +54 -0
  27. massgen/configs/tools/custom_tools/multimodal_tools/youtube_video_analysis.yaml +59 -0
  28. massgen/configs/tools/memory/README.md +199 -0
  29. massgen/configs/tools/memory/gpt5mini_gemini_context_window_management.yaml +131 -0
  30. massgen/configs/tools/memory/gpt5mini_gemini_no_persistent_memory.yaml +133 -0
  31. massgen/configs/tools/memory/test_context_window_management.py +286 -0
  32. massgen/configs/tools/multimodal/gpt5mini_gpt5nano_documentation_evolution.yaml +97 -0
  33. massgen/docker/README.md +83 -0
  34. massgen/filesystem_manager/_code_execution_server.py +22 -7
  35. massgen/filesystem_manager/_docker_manager.py +21 -1
  36. massgen/filesystem_manager/_filesystem_manager.py +8 -0
  37. massgen/filesystem_manager/_workspace_tools_server.py +0 -997
  38. massgen/formatter/_gemini_formatter.py +73 -0
  39. massgen/frontend/coordination_ui.py +175 -257
  40. massgen/frontend/displays/base_display.py +29 -0
  41. massgen/frontend/displays/rich_terminal_display.py +155 -9
  42. massgen/frontend/displays/simple_display.py +21 -0
  43. massgen/frontend/displays/terminal_display.py +22 -2
  44. massgen/logger_config.py +50 -6
  45. massgen/message_templates.py +123 -3
  46. massgen/orchestrator.py +319 -38
  47. massgen/tests/test_code_execution.py +178 -0
  48. massgen/tests/test_orchestration_restart.py +204 -0
  49. massgen/tool/__init__.py +4 -0
  50. massgen/tool/_multimodal_tools/understand_audio.py +193 -0
  51. massgen/tool/_multimodal_tools/understand_file.py +550 -0
  52. massgen/tool/_multimodal_tools/understand_image.py +212 -0
  53. massgen/tool/_multimodal_tools/understand_video.py +313 -0
  54. massgen/tool/docs/multimodal_tools.md +779 -0
  55. massgen/tool/workflow_toolkits/__init__.py +26 -0
  56. massgen/tool/workflow_toolkits/post_evaluation.py +216 -0
  57. massgen/utils.py +1 -0
  58. {massgen-0.1.2.dist-info → massgen-0.1.3.dist-info}/METADATA +8 -3
  59. {massgen-0.1.2.dist-info → massgen-0.1.3.dist-info}/RECORD +63 -36
  60. {massgen-0.1.2.dist-info → massgen-0.1.3.dist-info}/WHEEL +0 -0
  61. {massgen-0.1.2.dist-info → massgen-0.1.3.dist-info}/entry_points.txt +0 -0
  62. {massgen-0.1.2.dist-info → massgen-0.1.3.dist-info}/licenses/LICENSE +0 -0
  63. {massgen-0.1.2.dist-info → massgen-0.1.3.dist-info}/top_level.txt +0 -0
massgen/orchestrator.py CHANGED
@@ -44,7 +44,7 @@ from .logger_config import (
44
44
  )
45
45
  from .message_templates import MessageTemplates
46
46
  from .stream_chunk import ChunkType
47
- from .tool import get_workflow_tools
47
+ from .tool import get_post_evaluation_tools, get_workflow_tools
48
48
  from .utils import ActionType, AgentStatus, CoordinationStage
49
49
 
50
50
 
@@ -164,6 +164,14 @@ class Orchestrator(ChatAgent):
164
164
  self.is_orchestrator_timeout: bool = False
165
165
  self.timeout_reason: Optional[str] = None
166
166
 
167
+ # Restart feature state tracking
168
+ self.current_attempt: int = 0
169
+ max_restarts = self.config.coordination_config.max_orchestration_restarts
170
+ self.max_attempts: int = 1 + max_restarts
171
+ self.restart_pending: bool = False
172
+ self.restart_reason: Optional[str] = None
173
+ self.restart_instructions: Optional[str] = None
174
+
167
175
  # Coordination state tracking for cleanup
168
176
  self._active_streams: Dict = {}
169
177
  self._active_tasks: Dict = {}
@@ -264,6 +272,9 @@ class Orchestrator(ChatAgent):
264
272
  self.coordination_tracker.initialize_session(list(self.agents.keys()), self.current_task)
265
273
  self.workflow_phase = "coordinating"
266
274
 
275
+ # Reset restart_pending flag at start of coordination (will be set again if restart needed)
276
+ self.restart_pending = False
277
+
267
278
  # Clear agent workspaces for new turn (if this is a multi-turn conversation with history)
268
279
  if conversation_context and conversation_context.get("conversation_history"):
269
280
  self._clear_agent_workspaces()
@@ -651,7 +662,12 @@ Your answer:"""
651
662
  return {"has_irreversible": True, "blocked_tools": set()}
652
663
 
653
664
  async def _coordinate_agents_with_timeout(self, conversation_context: Optional[Dict[str, Any]] = None) -> AsyncGenerator[StreamChunk, None]:
654
- """Execute coordination with orchestrator-level timeout protection."""
665
+ """Execute coordination with orchestrator-level timeout protection.
666
+
667
+ When restart is needed, this method completes and returns control to CLI,
668
+ which will call coordinate() again (similar to multiturn pattern).
669
+ """
670
+ # Reset timing and state for this attempt
655
671
  self.coordination_start_time = time.time()
656
672
  self.total_tokens = 0
657
673
  self.is_orchestrator_timeout = False
@@ -659,13 +675,19 @@ Your answer:"""
659
675
 
660
676
  log_orchestrator_activity(
661
677
  self.orchestrator_id,
662
- "Starting coordination with timeout",
678
+ f"Starting coordination attempt {self.current_attempt + 1}/{self.max_attempts}",
663
679
  {
664
680
  "timeout_seconds": self.config.timeout_config.orchestrator_timeout_seconds,
665
681
  "agents": list(self.agents.keys()),
682
+ "has_restart_context": bool(self.restart_reason),
666
683
  },
667
684
  )
668
685
 
686
+ # Set log attempt for directory organization
687
+ from massgen.logger_config import set_log_attempt
688
+
689
+ set_log_attempt(self.current_attempt + 1)
690
+
669
691
  # Track active coordination state for cleanup
670
692
  self._active_streams = {}
671
693
  self._active_tasks = {}
@@ -699,6 +721,8 @@ Your answer:"""
699
721
  async for chunk in self._handle_orchestrator_timeout():
700
722
  yield chunk
701
723
 
724
+ # Exit here - if restart is needed, CLI will call coordinate() again
725
+
702
726
  async def _coordinate_agents(self, conversation_context: Optional[Dict[str, Any]] = None) -> AsyncGenerator[StreamChunk, None]:
703
727
  """Execute unified MassGen coordination workflow with real-time streaming."""
704
728
  log_coordination_step(
@@ -1666,10 +1690,16 @@ Your answer:"""
1666
1690
 
1667
1691
  # Extract command execution parameters
1668
1692
  enable_command_execution = False
1693
+ docker_mode = False
1694
+ enable_sudo = False
1669
1695
  if hasattr(agent, "config") and agent.config:
1670
1696
  enable_command_execution = agent.config.backend_params.get("enable_mcp_command_line", False)
1697
+ docker_mode = agent.config.backend_params.get("command_line_execution_mode", "local") == "docker"
1698
+ enable_sudo = agent.config.backend_params.get("command_line_docker_enable_sudo", False)
1671
1699
  elif hasattr(agent, "backend") and hasattr(agent.backend, "backend_params"):
1672
1700
  enable_command_execution = agent.backend.backend_params.get("enable_mcp_command_line", False)
1701
+ docker_mode = agent.backend.backend_params.get("command_line_execution_mode", "local") == "docker"
1702
+ enable_sudo = agent.backend.backend_params.get("command_line_docker_enable_sudo", False)
1673
1703
 
1674
1704
  filesystem_system_message = self.message_templates.filesystem_system_message(
1675
1705
  main_workspace=main_workspace,
@@ -1680,6 +1710,8 @@ Your answer:"""
1680
1710
  enable_image_generation=enable_image_generation,
1681
1711
  agent_answers=answers,
1682
1712
  enable_command_execution=enable_command_execution,
1713
+ docker_mode=docker_mode,
1714
+ enable_sudo=enable_sudo,
1683
1715
  )
1684
1716
  agent_system_message = f"{agent_system_message}\n\n{filesystem_system_message}" if agent_system_message else filesystem_system_message
1685
1717
 
@@ -1724,6 +1756,15 @@ Your answer:"""
1724
1756
  base_system_message=agent_system_message,
1725
1757
  )
1726
1758
 
1759
+ # Inject restart context if this is a restart attempt (like multi-turn context)
1760
+ if self.restart_reason and self.restart_instructions:
1761
+ restart_context = self.message_templates.format_restart_context(
1762
+ self.restart_reason,
1763
+ self.restart_instructions,
1764
+ )
1765
+ # Prepend restart context to user message
1766
+ conversation["user_message"] = restart_context + "\n\n" + conversation["user_message"]
1767
+
1727
1768
  # Track all the context used for this agent execution
1728
1769
  self.coordination_tracker.track_agent_context(
1729
1770
  agent_id,
@@ -2205,48 +2246,81 @@ Your answer:"""
2205
2246
  return ("error", str(e))
2206
2247
 
2207
2248
  async def _present_final_answer(self) -> AsyncGenerator[StreamChunk, None]:
2208
- """Present the final coordinated answer."""
2209
- log_stream_chunk("orchestrator", "content", "## 🎯 Final Coordinated Answer\n")
2210
- yield StreamChunk(type="content", content="## 🎯 Final Coordinated Answer\n")
2249
+ """Present the final coordinated answer with optional post-evaluation and restart loop."""
2211
2250
 
2212
2251
  # Select the best agent based on current state
2213
2252
  if not self._selected_agent:
2214
2253
  self._selected_agent = self._determine_final_agent_from_states()
2215
- if self._selected_agent:
2216
- log_stream_chunk(
2217
- "orchestrator",
2218
- "content",
2219
- f"🏆 Selected Agent: {self._selected_agent}\n",
2220
- )
2221
- yield StreamChunk(
2222
- type="content",
2223
- content=f"🏆 Selected Agent: {self._selected_agent}\n",
2224
- )
2225
-
2226
- if self._selected_agent and self._selected_agent in self.agent_states and self.agent_states[self._selected_agent].answer:
2227
- final_answer = self.agent_states[self._selected_agent].answer # NOTE: This is the raw answer from the winning agent, not the actual final answer.
2228
2254
 
2229
- # Add to conversation history
2230
- self.add_to_history("assistant", final_answer)
2231
-
2232
- log_stream_chunk("orchestrator", "content", f"🏆 Selected Agent: {self._selected_agent}\n")
2233
- yield StreamChunk(type="content", content=f"🏆 Selected Agent: {self._selected_agent}\n")
2234
- log_stream_chunk("orchestrator", "content", final_answer)
2235
- yield StreamChunk(type="content", content=final_answer)
2236
- log_stream_chunk(
2237
- "orchestrator",
2238
- "content",
2239
- f"\n\n---\n*Coordinated by {len(self.agents)} agents via MassGen framework*",
2240
- )
2241
- yield StreamChunk(
2242
- type="content",
2243
- content=f"\n\n---\n*Coordinated by {len(self.agents)} agents via MassGen framework*",
2244
- )
2245
- else:
2255
+ if not self._selected_agent:
2246
2256
  error_msg = "❌ Unable to provide coordinated answer - no successful agents"
2247
2257
  self.add_to_history("assistant", error_msg)
2248
2258
  log_stream_chunk("orchestrator", "error", error_msg)
2249
2259
  yield StreamChunk(type="content", content=error_msg)
2260
+ self.workflow_phase = "presenting"
2261
+ log_stream_chunk("orchestrator", "done", None)
2262
+ yield StreamChunk(type="done")
2263
+ return
2264
+
2265
+ # Get vote results for presentation
2266
+ vote_results = self._get_vote_results()
2267
+
2268
+ log_stream_chunk("orchestrator", "content", "## 🎯 Final Coordinated Answer\n")
2269
+ yield StreamChunk(type="content", content="## 🎯 Final Coordinated Answer\n")
2270
+
2271
+ # Stream final presentation from winning agent
2272
+ log_stream_chunk("orchestrator", "content", f"🏆 Selected Agent: {self._selected_agent}\n")
2273
+ yield StreamChunk(type="content", content=f"🏆 Selected Agent: {self._selected_agent}\n")
2274
+
2275
+ # Stream the final presentation (with full tool support)
2276
+ presentation_content = ""
2277
+ async for chunk in self.get_final_presentation(self._selected_agent, vote_results):
2278
+ if chunk.type == "content" and chunk.content:
2279
+ presentation_content += chunk.content
2280
+ yield chunk
2281
+
2282
+ # Check if post-evaluation should run
2283
+ # Skip post-evaluation on final attempt (user clarification #4)
2284
+ is_final_attempt = self.current_attempt >= (self.max_attempts - 1)
2285
+ should_evaluate = self.max_attempts > 1 and not is_final_attempt
2286
+
2287
+ if should_evaluate:
2288
+ # Run post-evaluation
2289
+ final_answer_to_evaluate = self._final_presentation_content or presentation_content
2290
+ async for chunk in self.post_evaluate_answer(self._selected_agent, final_answer_to_evaluate):
2291
+ yield chunk
2292
+
2293
+ # Check if restart was requested
2294
+ if self.restart_pending and self.current_attempt < (self.max_attempts - 1):
2295
+ # Show restart banner
2296
+ restart_banner = f"""
2297
+
2298
+ 🔄 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
2299
+ ORCHESTRATION RESTART (Attempt {self.current_attempt + 2}/{self.max_attempts})
2300
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
2301
+
2302
+ REASON:
2303
+ {self.restart_reason}
2304
+
2305
+ INSTRUCTIONS FOR NEXT ATTEMPT:
2306
+ {self.restart_instructions}
2307
+
2308
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
2309
+
2310
+ """
2311
+ log_stream_chunk("orchestrator", "status", restart_banner)
2312
+ yield StreamChunk(type="restart_banner", content=restart_banner, source="orchestrator")
2313
+
2314
+ # Reset state for restart (prepare for next coordinate() call)
2315
+ self.handle_restart()
2316
+
2317
+ # Don't add to history or set workflow phase - restart is pending
2318
+ # Exit here - CLI will detect restart_pending and call coordinate() again
2319
+ return
2320
+
2321
+ # No restart - add final answer to conversation history
2322
+ if self._final_presentation_content:
2323
+ self.add_to_history("assistant", self._final_presentation_content)
2250
2324
 
2251
2325
  # Update workflow phase
2252
2326
  self.workflow_phase = "presenting"
@@ -2422,10 +2496,16 @@ Your answer:"""
2422
2496
 
2423
2497
  # Extract command execution parameters
2424
2498
  enable_command_execution = False
2499
+ docker_mode = False
2500
+ enable_sudo = False
2425
2501
  if hasattr(agent, "config") and agent.config:
2426
2502
  enable_command_execution = agent.config.backend_params.get("enable_mcp_command_line", False)
2503
+ docker_mode = agent.config.backend_params.get("command_line_execution_mode", "local") == "docker"
2504
+ enable_sudo = agent.config.backend_params.get("command_line_docker_enable_sudo", False)
2427
2505
  elif hasattr(agent, "backend") and hasattr(agent.backend, "backend_params"):
2428
2506
  enable_command_execution = agent.backend.backend_params.get("enable_mcp_command_line", False)
2507
+ docker_mode = agent.backend.backend_params.get("command_line_execution_mode", "local") == "docker"
2508
+ enable_sudo = agent.backend.backend_params.get("command_line_docker_enable_sudo", False)
2429
2509
  # Check if audio generation is enabled for this agent
2430
2510
  enable_audio_generation = False
2431
2511
  if hasattr(agent, "config") and agent.config:
@@ -2483,6 +2563,8 @@ Your answer:"""
2483
2563
  enable_image_generation=enable_image_generation,
2484
2564
  agent_answers=all_answers,
2485
2565
  enable_command_execution=enable_command_execution,
2566
+ docker_mode=docker_mode,
2567
+ enable_sudo=enable_sudo,
2486
2568
  )
2487
2569
  + "\n\n## Instructions\n"
2488
2570
  + base_system_message
@@ -2674,6 +2756,204 @@ Your answer:"""
2674
2756
  # Save logs
2675
2757
  self.save_coordination_logs()
2676
2758
 
2759
+ # Don't yield done here - let _present_final_answer handle final done after post-evaluation
2760
+
2761
+ async def post_evaluate_answer(self, selected_agent_id: str, final_answer: str) -> AsyncGenerator[StreamChunk, None]:
2762
+ """Post-evaluation phase where winning agent evaluates its own answer.
2763
+
2764
+ The agent reviews the final answer and decides whether to submit or restart
2765
+ with specific improvement instructions.
2766
+
2767
+ Args:
2768
+ selected_agent_id: The agent that won the vote and presented the answer
2769
+ final_answer: The final answer that was presented
2770
+
2771
+ Yields:
2772
+ StreamChunk: Stream chunks from the evaluation process
2773
+ """
2774
+ if selected_agent_id not in self.agents:
2775
+ log_stream_chunk("orchestrator", "error", f"Selected agent {selected_agent_id} not found for post-evaluation")
2776
+ yield StreamChunk(type="error", error=f"Selected agent {selected_agent_id} not found")
2777
+ return
2778
+
2779
+ agent = self.agents[selected_agent_id]
2780
+
2781
+ # Use debug override on first attempt if configured
2782
+ eval_answer = final_answer
2783
+ if self.config.debug_final_answer and self.current_attempt == 0:
2784
+ eval_answer = self.config.debug_final_answer
2785
+ log_stream_chunk("orchestrator", "debug", f"Using debug override for post-evaluation: {self.config.debug_final_answer}")
2786
+ yield StreamChunk(
2787
+ type="debug",
2788
+ content=f"[DEBUG MODE] Overriding answer for evaluation: {self.config.debug_final_answer}",
2789
+ source="orchestrator",
2790
+ )
2791
+
2792
+ # Build evaluation message
2793
+ evaluation_content = f"""{self.message_templates.format_original_message(self.current_task or "Task")}
2794
+
2795
+ FINAL ANSWER TO EVALUATE:
2796
+ {eval_answer}
2797
+
2798
+ Review this answer carefully and determine if it fully addresses the original task. Use your available tools to verify claims and check files as needed.
2799
+ Then call either submit(confirmed=True) if the answer is satisfactory, or restart_orchestration(reason, instructions) if improvements are needed."""
2800
+
2801
+ # Get agent's configurable system message
2802
+ agent_system_message = agent.get_configurable_system_message()
2803
+
2804
+ # Build post-evaluation system message
2805
+ base_system_message = self.message_templates.post_evaluation_system_message(agent_system_message)
2806
+
2807
+ # Add filesystem context if available (same as final presentation)
2808
+ if agent.backend.filesystem_manager:
2809
+ main_workspace = str(agent.backend.filesystem_manager.get_current_workspace())
2810
+ temp_workspace = str(agent.backend.filesystem_manager.agent_temporary_workspace) if agent.backend.filesystem_manager.agent_temporary_workspace else None
2811
+ context_paths = agent.backend.filesystem_manager.path_permission_manager.get_context_paths() if agent.backend.filesystem_manager.path_permission_manager else []
2812
+ previous_turns_context = self._get_previous_turns_context_paths()
2813
+ current_turn_num = len(previous_turns_context) + 1 if previous_turns_context else 1
2814
+ turns_to_show = [t for t in previous_turns_context if t["turn"] < current_turn_num - 1]
2815
+ workspace_prepopulated = len(previous_turns_context) > 0
2816
+
2817
+ # Get all answers for context
2818
+ all_answers = {aid: s.answer for aid, s in self.agent_states.items() if s.answer}
2819
+
2820
+ base_system_message = (
2821
+ self.message_templates.filesystem_system_message(
2822
+ main_workspace=main_workspace,
2823
+ temp_workspace=temp_workspace,
2824
+ context_paths=context_paths,
2825
+ previous_turns=turns_to_show,
2826
+ workspace_prepopulated=workspace_prepopulated,
2827
+ enable_image_generation=False,
2828
+ agent_answers=all_answers,
2829
+ enable_command_execution=False,
2830
+ docker_mode=False,
2831
+ enable_sudo=False,
2832
+ )
2833
+ + "\n\n## Post-Evaluation Task\n"
2834
+ + base_system_message
2835
+ )
2836
+
2837
+ # Create evaluation messages
2838
+ evaluation_messages = [
2839
+ {"role": "system", "content": base_system_message},
2840
+ {"role": "user", "content": evaluation_content},
2841
+ ]
2842
+
2843
+ # Get post-evaluation tools
2844
+ api_format = "chat_completions" # Default format
2845
+ if hasattr(agent.backend, "api_format"):
2846
+ api_format = agent.backend.api_format
2847
+ post_eval_tools = get_post_evaluation_tools(api_format=api_format)
2848
+
2849
+ log_stream_chunk("orchestrator", "status", "🔍 Post-evaluation: Reviewing final answer\n")
2850
+ yield StreamChunk(type="status", content="🔍 Post-evaluation: Reviewing final answer\n", source="orchestrator")
2851
+
2852
+ # Stream evaluation with tools (with timeout protection)
2853
+ evaluation_complete = False
2854
+ tool_call_detected = False
2855
+
2856
+ try:
2857
+ timeout_seconds = self.config.timeout_config.orchestrator_timeout_seconds
2858
+ async with asyncio.timeout(timeout_seconds):
2859
+ async for chunk in agent.chat(messages=evaluation_messages, tools=post_eval_tools, reset_chat=True, current_stage=CoordinationStage.POST_EVALUATION):
2860
+ chunk_type = self._get_chunk_type_value(chunk)
2861
+
2862
+ if chunk_type == "content" and chunk.content:
2863
+ log_stream_chunk("orchestrator", "content", chunk.content, selected_agent_id)
2864
+ yield StreamChunk(type="content", content=chunk.content, source=selected_agent_id)
2865
+ elif chunk_type in ["reasoning", "reasoning_done", "reasoning_summary", "reasoning_summary_done"]:
2866
+ reasoning_chunk = StreamChunk(
2867
+ type=chunk_type,
2868
+ content=chunk.content,
2869
+ source=selected_agent_id,
2870
+ reasoning_delta=getattr(chunk, "reasoning_delta", None),
2871
+ reasoning_text=getattr(chunk, "reasoning_text", None),
2872
+ reasoning_summary_delta=getattr(chunk, "reasoning_summary_delta", None),
2873
+ reasoning_summary_text=getattr(chunk, "reasoning_summary_text", None),
2874
+ item_id=getattr(chunk, "item_id", None),
2875
+ content_index=getattr(chunk, "content_index", None),
2876
+ summary_index=getattr(chunk, "summary_index", None),
2877
+ )
2878
+ log_stream_chunk("orchestrator", chunk.type, chunk.content, selected_agent_id)
2879
+ yield reasoning_chunk
2880
+ elif chunk_type == "tool_calls":
2881
+ # Post-evaluation tool call detected
2882
+ tool_call_detected = True
2883
+ if hasattr(chunk, "tool_calls") and chunk.tool_calls:
2884
+ for tool_call in chunk.tool_calls:
2885
+ # Use backend's tool extraction (same as regular coordination)
2886
+ tool_name = agent.backend.extract_tool_name(tool_call)
2887
+ tool_args = agent.backend.extract_tool_arguments(tool_call)
2888
+
2889
+ if tool_name == "submit":
2890
+ log_stream_chunk("orchestrator", "status", "✅ Evaluation complete - answer approved\n")
2891
+ yield StreamChunk(type="status", content="✅ Evaluation complete - answer approved\n", source="orchestrator")
2892
+ evaluation_complete = True
2893
+ elif tool_name == "restart_orchestration":
2894
+ # Parse restart parameters from extracted args
2895
+ self.restart_reason = tool_args.get("reason", "No reason provided")
2896
+ self.restart_instructions = tool_args.get("instructions", "No instructions provided")
2897
+ self.restart_pending = True
2898
+
2899
+ log_stream_chunk("orchestrator", "status", "🔄 Restart requested\n")
2900
+ yield StreamChunk(type="status", content="🔄 Restart requested\n", source="orchestrator")
2901
+ evaluation_complete = True
2902
+ elif chunk_type == "done":
2903
+ log_stream_chunk("orchestrator", "done", None, selected_agent_id)
2904
+ yield StreamChunk(type="done", source=selected_agent_id)
2905
+ elif chunk_type == "error":
2906
+ log_stream_chunk("orchestrator", "error", chunk.error, selected_agent_id)
2907
+ yield StreamChunk(type="error", error=chunk.error, source=selected_agent_id)
2908
+ else:
2909
+ # Pass through other chunk types
2910
+ log_stream_chunk("orchestrator", chunk_type, getattr(chunk, "content", ""), selected_agent_id)
2911
+ yield StreamChunk(
2912
+ type=chunk_type,
2913
+ content=getattr(chunk, "content", ""),
2914
+ source=selected_agent_id,
2915
+ **{k: v for k, v in chunk.__dict__.items() if k not in ["type", "content", "source", "timestamp", "sequence_number"]},
2916
+ )
2917
+ except asyncio.TimeoutError:
2918
+ log_stream_chunk("orchestrator", "status", "⏱️ Post-evaluation timed out - auto-submitting answer\n")
2919
+ yield StreamChunk(type="status", content="⏱️ Post-evaluation timed out - auto-submitting answer\n", source="orchestrator")
2920
+ evaluation_complete = True
2921
+ # Don't set restart_pending - let it default to False (auto-submit)
2922
+ finally:
2923
+ # If no tool was called and evaluation didn't complete, auto-submit
2924
+ if not evaluation_complete and not tool_call_detected:
2925
+ log_stream_chunk("orchestrator", "status", "✅ Auto-submitting answer (no tool call detected)\n")
2926
+ yield StreamChunk(type="status", content="✅ Auto-submitting answer (no tool call detected)\n", source="orchestrator")
2927
+
2928
+ def handle_restart(self):
2929
+ """Reset orchestration state for restart attempt.
2930
+
2931
+ Clears agent states and coordination messages while preserving
2932
+ restart reason and instructions for the next attempt.
2933
+ """
2934
+ log_orchestrator_activity("handle_restart", f"Resetting state for restart attempt {self.current_attempt + 1}")
2935
+
2936
+ # Reset agent states
2937
+ for agent_id in self.agent_states:
2938
+ self.agent_states[agent_id] = AgentState()
2939
+
2940
+ # Clear coordination messages
2941
+ self._coordination_messages = []
2942
+ self._selected_agent = None
2943
+ self._final_presentation_content = None
2944
+
2945
+ # Reset coordination tracker for new attempt
2946
+ self.coordination_tracker = CoordinationTracker()
2947
+ self.coordination_tracker.initialize_session(list(self.agents.keys()))
2948
+
2949
+ # Reset workflow phase to idle so next coordinate() call starts fresh
2950
+ self.workflow_phase = "idle"
2951
+
2952
+ # Increment attempt counter
2953
+ self.current_attempt += 1
2954
+
2955
+ log_orchestrator_activity("handle_restart", f"State reset complete - starting attempt {self.current_attempt + 1}")
2956
+
2677
2957
  def _get_vote_results(self) -> Dict[str, Any]:
2678
2958
  """Get current vote results and statistics."""
2679
2959
  agent_answers = {aid: state.answer for aid, state in self.agent_states.items() if state.answer}
@@ -2867,8 +3147,9 @@ Your answer:"""
2867
3147
  """
2868
3148
  if self.config and hasattr(self.config, "get_configurable_system_message"):
2869
3149
  return self.config.get_configurable_system_message()
2870
- elif self.config and hasattr(self.config, "custom_system_instruction"):
2871
- return self.config.custom_system_instruction
3150
+ elif self.config and hasattr(self.config, "_custom_system_instruction"):
3151
+ # Access private attribute to avoid deprecation warning
3152
+ return self.config._custom_system_instruction
2872
3153
  elif self.config and self.config.backend_params:
2873
3154
  # Check for backend-specific system prompts
2874
3155
  backend_params = self.config.backend_params
massgen/tests/test_code_execution.py CHANGED
@@ -154,6 +154,112 @@ class TestCommandSanitization:
154
154
  _sanitize_command(cmd)
155
155
 
156
156
 
157
+ class TestSudoSanitization:
158
+ """Test sudo sanitization respects enable_sudo flag."""
159
+
160
+ def test_sudo_blocked_by_default(self):
161
+ """Test that sudo is blocked when enable_sudo=False (default)."""
162
+ from massgen.filesystem_manager._code_execution_server import _sanitize_command
163
+
164
+ sudo_commands = [
165
+ "sudo apt-get update",
166
+ "sudo apt-get install -y ffmpeg",
167
+ "sudo pip install tensorflow",
168
+ "sudo npm install -g typescript",
169
+ "sudo chmod 755 file.txt",
170
+ "echo 'test' && sudo apt update",
171
+ ]
172
+
173
+ for cmd in sudo_commands:
174
+ with pytest.raises(ValueError, match="sudo.*not allowed"):
175
+ _sanitize_command(cmd, enable_sudo=False)
176
+
177
+ def test_sudo_allowed_when_enabled(self):
178
+ """Test that sudo is allowed when enable_sudo=True."""
179
+ from massgen.filesystem_manager._code_execution_server import _sanitize_command
180
+
181
+ sudo_commands = [
182
+ "sudo apt-get update",
183
+ "sudo apt-get install -y ffmpeg",
184
+ "sudo pip install tensorflow",
185
+ "sudo npm install -g typescript",
186
+ "sudo chown user:group file.txt", # chown allowed with sudo enabled
187
+ "sudo chmod 755 file.txt", # chmod allowed with sudo enabled
188
+ ]
189
+
190
+ for cmd in sudo_commands:
191
+ # Should not raise when enable_sudo=True
192
+ _sanitize_command(cmd, enable_sudo=True)
193
+
194
+ def test_other_dangerous_patterns_still_blocked_with_sudo(self):
195
+ """Test that other dangerous patterns are still blocked even with sudo enabled."""
196
+ from massgen.filesystem_manager._code_execution_server import _sanitize_command
197
+
198
+ # These should ALWAYS be blocked, regardless of enable_sudo
199
+ dangerous_commands = [
200
+ "sudo rm -rf /", # Still blocked - root deletion
201
+ "rm -rf /", # Still blocked
202
+ "dd if=/dev/zero of=/dev/sda", # Still blocked - dd command
203
+ "sudo dd if=/dev/zero of=/dev/sda", # Still blocked
204
+ ":(){ :|:& };:", # Still blocked - fork bomb
205
+ "mv file /dev/null", # Still blocked
206
+ "sudo mv file /dev/null", # Still blocked
207
+ "echo test > /dev/sda1", # Still blocked - writing to disk
208
+ ]
209
+
210
+ for cmd in dangerous_commands:
211
+ with pytest.raises(ValueError, match="dangerous|not allowed"):
212
+ _sanitize_command(cmd, enable_sudo=True)
213
+
214
+ def test_su_chown_chmod_blocked_without_sudo_flag(self):
215
+ """Test that su, chown, chmod are blocked when enable_sudo=False."""
216
+ from massgen.filesystem_manager._code_execution_server import _sanitize_command
217
+
218
+ commands = [
219
+ "su root",
220
+ "su - postgres",
221
+ "chown root:root file.txt",
222
+ "chmod 777 file.txt",
223
+ "chmod +x script.sh",
224
+ ]
225
+
226
+ for cmd in commands:
227
+ with pytest.raises(ValueError, match="not allowed"):
228
+ _sanitize_command(cmd, enable_sudo=False)
229
+
230
+ def test_su_chown_chmod_allowed_with_sudo_flag(self):
231
+ """Test that su, chown, chmod are allowed when enable_sudo=True (Docker sudo mode)."""
232
+ from massgen.filesystem_manager._code_execution_server import _sanitize_command
233
+
234
+ # In Docker sudo mode, these are safe because they're confined to container
235
+ commands = [
236
+ "su postgres",
237
+ "chown user:group file.txt",
238
+ "chmod 755 file.txt",
239
+ "chmod +x script.sh",
240
+ ]
241
+
242
+ for cmd in commands:
243
+ # Should not raise when enable_sudo=True
244
+ _sanitize_command(cmd, enable_sudo=True)
245
+
246
+ def test_local_mode_blocks_sudo(self):
247
+ """Test that local mode (non-Docker) blocks sudo commands."""
248
+ from massgen.filesystem_manager._code_execution_server import _sanitize_command
249
+
250
+ # In local mode (enable_sudo=False), sudo should be blocked for safety
251
+ with pytest.raises(ValueError, match="sudo.*not allowed"):
252
+ _sanitize_command("sudo apt-get install malicious-package", enable_sudo=False)
253
+
254
+ def test_docker_sudo_mode_allows_sudo(self):
255
+ """Test that Docker sudo mode allows sudo commands."""
256
+ from massgen.filesystem_manager._code_execution_server import _sanitize_command
257
+
258
+ # In Docker mode with enable_sudo=True, sudo should be allowed
259
+ # (safe because it's inside container)
260
+ _sanitize_command("sudo apt-get install gh", enable_sudo=True)
261
+
262
+
157
263
  class TestOutputHandling:
158
264
  """Test output capture and size limits."""
159
265
 
@@ -674,6 +780,78 @@ class TestDockerExecution:
674
780
  # Cleanup
675
781
  manager.cleanup("test_context")
676
782
 
783
+ @pytest.mark.docker
784
+ def test_docker_sudo_enabled_image_selection(self):
785
+ """Test that enabling sudo automatically selects the sudo image variant."""
786
+ from massgen.filesystem_manager._docker_manager import DockerManager
787
+
788
+ # Test 1: Default image with sudo=False should use regular image
789
+ manager_no_sudo = DockerManager(enable_sudo=False)
790
+ assert manager_no_sudo.image == "massgen/mcp-runtime:latest"
791
+ assert manager_no_sudo.enable_sudo is False
792
+
793
+ # Test 2: Default image with sudo=True should auto-switch to sudo variant
794
+ manager_with_sudo = DockerManager(enable_sudo=True)
795
+ assert manager_with_sudo.image == "massgen/mcp-runtime-sudo:latest"
796
+ assert manager_with_sudo.enable_sudo is True
797
+
798
+ # Test 3: Custom image with sudo=True should keep custom image
799
+ manager_custom = DockerManager(
800
+ image="my-custom-image:latest",
801
+ enable_sudo=True,
802
+ )
803
+ assert manager_custom.image == "my-custom-image:latest"
804
+ assert manager_custom.enable_sudo is True
805
+
806
+ @pytest.mark.docker
807
+ def test_docker_sudo_functionality(self, tmp_path):
808
+ """Test that sudo commands work in sudo-enabled container."""
809
+ from massgen.filesystem_manager._docker_manager import DockerManager
810
+
811
+ # Skip if sudo image not built
812
+ manager = DockerManager(enable_sudo=True)
813
+ try:
814
+ manager.ensure_image_exists()
815
+ except RuntimeError:
816
+ pytest.skip("Sudo Docker image not built. Run: bash massgen/docker/build.sh --sudo")
817
+
818
+ workspace = tmp_path / "workspace_sudo"
819
+ workspace.mkdir()
820
+
821
+ # Create container with sudo enabled
822
+ manager.create_container(
823
+ agent_id="test_sudo",
824
+ workspace_path=workspace,
825
+ )
826
+
827
+ # Test 1: Verify whoami returns 'massgen' (non-root user)
828
+ result_whoami = manager.exec_command(
829
+ agent_id="test_sudo",
830
+ command="whoami",
831
+ )
832
+ assert result_whoami["success"] is True
833
+ assert "massgen" in result_whoami["stdout"]
834
+
835
+ # Test 2: Verify sudo whoami returns 'root' (sudo works)
836
+ result_sudo_whoami = manager.exec_command(
837
+ agent_id="test_sudo",
838
+ command="sudo whoami",
839
+ )
840
+ assert result_sudo_whoami["success"] is True
841
+ assert "root" in result_sudo_whoami["stdout"]
842
+
843
+ # Test 3: Verify sudo apt-get update works (package installation capability)
844
+ result_apt = manager.exec_command(
845
+ agent_id="test_sudo",
846
+ command="sudo apt-get update",
847
+ timeout=60,
848
+ )
849
+ # This should succeed in sudo image (may fail in network=none, but command should run)
850
+ assert result_apt["exit_code"] is not None
851
+
852
+ # Cleanup
853
+ manager.cleanup("test_sudo")
854
+
677
855
 
678
856
  if __name__ == "__main__":
679
857
  pytest.main([__file__, "-v"])