@agentunion/kite 1.3.2 → 1.4.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (78)
  1. package/CHANGELOG.md +200 -0
  2. package/cli.js +76 -0
  3. package/extensions/agents/assistant/entry.py +111 -1
  4. package/extensions/agents/assistant/server.py +263 -215
  5. package/extensions/channels/acp_channel/entry.py +111 -1
  6. package/extensions/channels/acp_channel/module.md +23 -22
  7. package/extensions/channels/acp_channel/server.py +263 -215
  8. package/extensions/event_hub_bench/entry.py +107 -1
  9. package/extensions/services/backup/entry.py +299 -21
  10. package/extensions/services/backup/module.md +24 -22
  11. package/extensions/services/model_service/entry.py +145 -19
  12. package/extensions/services/model_service/module.md +21 -22
  13. package/extensions/services/watchdog/entry.py +188 -25
  14. package/extensions/services/watchdog/monitor.py +144 -34
  15. package/extensions/services/web/WEBSOCKET_STATUS.md +143 -0
  16. package/extensions/services/web/config_example.py +35 -0
  17. package/extensions/services/web/config_loader.py +110 -0
  18. package/extensions/services/web/entry.py +114 -26
  19. package/extensions/services/web/module.md +35 -24
  20. package/extensions/services/web/pairing.py +250 -0
  21. package/extensions/services/web/pairing_codes.jsonl +16 -0
  22. package/extensions/services/web/relay.py +643 -0
  23. package/extensions/services/web/relay_config.json5 +67 -0
  24. package/extensions/services/web/routes/routes_management_ws.py +127 -0
  25. package/extensions/services/web/routes/routes_rpc.py +89 -0
  26. package/extensions/services/web/routes/routes_test.py +61 -0
  27. package/extensions/services/web/routes/schemas.py +0 -22
  28. package/extensions/services/web/server.py +421 -98
  29. package/extensions/services/web/static/css/style.css +67 -28
  30. package/extensions/services/web/static/index.html +234 -44
  31. package/extensions/services/web/static/js/app.js +1335 -48
  32. package/extensions/services/web/static/js/kernel-client-example.js +161 -0
  33. package/extensions/services/web/static/js/kernel-client.js +383 -0
  34. package/extensions/services/web/static/js/registry-tests.js +558 -0
  35. package/extensions/services/web/static/js/token-manager.js +175 -0
  36. package/extensions/services/web/static/pairing.html +248 -0
  37. package/extensions/services/web/static/test_registry.html +262 -0
  38. package/extensions/services/web/web_config.json5 +29 -0
  39. package/kernel/entry.py +120 -32
  40. package/kernel/event_hub.py +141 -16
  41. package/kernel/module.md +36 -33
  42. package/kernel/registry_store.py +48 -15
  43. package/kernel/rpc_router.py +120 -53
  44. package/kernel/server.py +219 -12
  45. package/kite_cli/__init__.py +3 -0
  46. package/kite_cli/__main__.py +5 -0
  47. package/kite_cli/commands/__init__.py +1 -0
  48. package/kite_cli/commands/clean.py +101 -0
  49. package/kite_cli/commands/doctor.py +35 -0
  50. package/kite_cli/commands/history.py +111 -0
  51. package/kite_cli/commands/info.py +96 -0
  52. package/kite_cli/commands/install.py +313 -0
  53. package/kite_cli/commands/list.py +143 -0
  54. package/kite_cli/commands/log.py +81 -0
  55. package/kite_cli/commands/rollback.py +88 -0
  56. package/kite_cli/commands/search.py +73 -0
  57. package/kite_cli/commands/uninstall.py +85 -0
  58. package/kite_cli/commands/update.py +118 -0
  59. package/kite_cli/core/__init__.py +1 -0
  60. package/kite_cli/core/checker.py +142 -0
  61. package/kite_cli/core/dependency.py +229 -0
  62. package/kite_cli/core/downloader.py +209 -0
  63. package/kite_cli/core/install_info.py +40 -0
  64. package/kite_cli/core/tool_installer.py +397 -0
  65. package/kite_cli/core/validator.py +78 -0
  66. package/kite_cli/main.py +289 -0
  67. package/kite_cli/utils/__init__.py +1 -0
  68. package/kite_cli/utils/i18n.py +252 -0
  69. package/kite_cli/utils/interactive.py +63 -0
  70. package/kite_cli/utils/operation_log.py +77 -0
  71. package/kite_cli/utils/paths.py +34 -0
  72. package/kite_cli/utils/version.py +308 -0
  73. package/launcher/entry.py +819 -158
  74. package/launcher/logging_setup.py +104 -0
  75. package/launcher/module.md +37 -37
  76. package/package.json +2 -1
  77. package/scripts/plan_manager.py +315 -0
  78. package/extensions/services/web/routes/routes_modules.py +0 -249
package/launcher/entry.py CHANGED
@@ -29,10 +29,21 @@ from .process_manager import ProcessManager
29
29
  IS_WINDOWS = sys.platform == "win32"
30
30
 
31
31
  # Shutdown timeout constants (seconds)
32
- SHUTDOWN_TIMEOUT_NON_GRACEFUL = 5 # Non-graceful modules or no ack response
33
- SHUTDOWN_TIMEOUT_PARTIAL = 3 # Graceful module ack'd but no ready
34
- SHUTDOWN_TIMEOUT_READY = 1 # Graceful module sent ready (cleanup done)
35
- SHUTDOWN_TIMEOUT_BULK = 3 # Bulk stop_all() safety net
32
+
33
+ # 不支持优雅关闭
34
+ SHUTDOWN_TIMEOUT_NON_GRACEFUL = 0.3 # SIGTERM 后等待时间
35
+
36
+ # 支持优雅关闭 - 等待响应
37
+ SHUTDOWN_TIMEOUT_ACK = 3.0 # 等待 shutdown.ack
38
+ SHUTDOWN_TIMEOUT_EXITING = 3.0 # 等待 module.exiting
39
+
40
+ # 清理超时(从 exiting 事件获取)
41
+ CLEANUP_TIMEOUT_DEFAULT = 5.0 # 默认清理时间
42
+ CLEANUP_TIMEOUT_MIN = 0.0 # 最小清理时间
43
+ CLEANUP_TIMEOUT_MAX = 30.0 # 最大清理时间
44
+
45
+ # 批量关闭安全网
46
+ SHUTDOWN_TIMEOUT_BULK = 3.0
36
47
 
37
48
  # Core module names that are started in Phase 1 (not Phase 2)
38
49
  CORE_MODULE_NAMES = {"kernel"}
@@ -101,6 +112,9 @@ class Launcher:
101
112
  # System-wide shutdown flag: prevents Watchdog restart during shutdown
102
113
  self._system_shutting_down = False
103
114
 
115
+ # 模块退出状态跟踪(防止 stopped 事件重复发送)
116
+ self._module_states: dict[str, dict] = {}
117
+
104
118
  # Kite stdout message waiters: {waiter_key: (threading.Event, data_dict)}
105
119
  # Used by ProcessManager stdout callback (cross-thread)
106
120
  self._msg_waiters: dict[str, tuple[threading.Event, dict]] = {}
@@ -117,6 +131,9 @@ class Launcher:
117
131
  pass
118
132
  os.environ["KITE_INSTANCE_SUFFIX"] = suffix
119
133
 
134
+ # Record launcher startup
135
+ self._record_launcher_startup()
136
+
120
137
  @staticmethod
121
138
  def _fmt_elapsed(seconds: float) -> str:
122
139
  """Format elapsed seconds: <1s → 'NNNms', >=1s → 'N.Ns', >=10s → 'NNs'."""
@@ -597,6 +614,11 @@ class Launcher:
597
614
  print(f"[launcher] Kernel 重连失败 {max_retries} 次,退出")
598
615
  sys.exit(1)
599
616
  print(f"[launcher] Kernel 连接错误: {e}, {retry_delay:.1f}s 后重试 ({attempt}/{max_retries})")
617
+ if attempt == 5:
618
+ print(f"\033[33m[launcher] 提示: 已连续 {attempt} 次无法连接 Kernel (端口 {self.kernel_port})")
619
+ if self.kernel_port < 1024:
620
+ print(f"[launcher] ⚠ 端口 {self.kernel_port} 异常偏低,可能是 Kernel 端口绑定失败或配置错误")
621
+ print(f"[launcher] 请检查: 1) Kernel 进程是否存活 2) kernel/module.md 中 preferred_port 配置是否正确\033[0m")
600
622
  self._ws = None
601
623
  if self._thread_shutdown.is_set():
602
624
  return
@@ -608,7 +630,7 @@ class Launcher:
608
630
  launcher_token = self._module_tokens.get("launcher", "")
609
631
  ws_url = f"ws://127.0.0.1:{self.kernel_port}/ws?token={launcher_token}&id=launcher"
610
632
  t_ws_connect = time.monotonic()
611
- async with websockets.connect(ws_url, open_timeout=3, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
633
+ async with websockets.connect(ws_url, open_timeout=3, ping_interval=20, ping_timeout=20, close_timeout=10) as ws:
612
634
  self._ws = ws
613
635
  _ws_s = time.monotonic() - t_ws_connect
614
636
  print(f"[launcher] 已连接到 Kernel ({self._fmt_elapsed(_ws_s)})")
@@ -632,10 +654,38 @@ class Launcher:
632
654
  await self._rpc_call(ws, "registry.register", {
633
655
  "module_id": "launcher",
634
656
  "module_type": "infrastructure",
657
+ "tools": {
658
+ "rpc": {
659
+ "launcher": {
660
+ "list_modules": {"method": "list_modules", "description": "列出所有模块"},
661
+ "start_module": {"method": "start_module", "description": "启动模块"},
662
+ "stop_module": {"method": "stop_module", "description": "停止模块"},
663
+ "restart_module": {"method": "restart_module", "description": "重启模块"},
664
+ "restart_launcher": {"method": "restart_launcher", "description": "重启 Launcher"},
665
+ "rescan": {"method": "rescan", "description": "重新扫描模块"},
666
+ "shutdown": {"method": "shutdown", "description": "关闭系统"},
667
+ },
668
+ "module": {
669
+ "config": {
670
+ "get": {"method": "get_module_config", "description": "获取模块配置"},
671
+ "update": {"method": "update_module_config", "description": "更新模块配置"},
672
+ "reset": {"method": "reset_module_config", "description": "恢复默认配置"},
673
+ }
674
+ }
675
+ }
676
+ },
635
677
  "events_publish": {
636
- "module.started": {},
637
- "module.stopped": {},
638
- "module.state_changed": {},
678
+ "system": {
679
+ "ready": {"description": "系统启动完成"}
680
+ },
681
+ "module": {
682
+ "starting": {"description": "模块启动中"},
683
+ "started": {"description": "模块已启动"},
684
+ "ready": {"description": "模块就绪"},
685
+ "stopped": {"description": "模块已停止"},
686
+ "exiting": {"description": "模块退出中"},
687
+ "shutdown": {"description": "模块关闭"}
688
+ }
639
689
  },
640
690
  "events_subscribe": [">"],
641
691
  })
@@ -658,7 +708,14 @@ class Launcher:
658
708
  raise
659
709
 
660
710
  async def _ws_receiver(self, ws):
661
- """Receive loop: classify incoming messages."""
711
+ """Receive loop: classify incoming messages.
712
+
713
+ CRITICAL: RPC 死锁防范
714
+ - 入站 RPC 请求必须用 create_task() 异步执行,不可 await
715
+ - 原因:如果 handler 内部调用 rpc_call() 发出站请求,出站响应需要本接收循环来分发
716
+ - 如果接收循环被 await handler 阻塞,出站响应永远收不到 → 超时死锁
717
+ - 事件通知和 RPC 响应可以同步处理(它们不会反向调用 rpc_call)
718
+ """
662
719
  try:
663
720
  async for raw in ws:
664
721
  try:
@@ -676,7 +733,8 @@ class Launcher:
676
733
  await self._handle_event_notification(msg)
677
734
  elif has_method and has_id:
678
735
  # Incoming RPC request (forwarded by Kernel)
679
- await self._handle_rpc_request(ws, msg)
736
+ # Run in background so receiver loop continues processing responses
737
+ asyncio.create_task(self._handle_rpc_request(ws, msg))
680
738
  elif has_id and (has_result or has_error):
681
739
  # RPC response (to our own call)
682
740
  self._handle_rpc_response(msg)
@@ -728,6 +786,7 @@ class Launcher:
728
786
  # Trigger event waiters
729
787
  module_id = data.get("module_id", "")
730
788
  waiter_key = f"{event}:{module_id}"
789
+
731
790
  waiter = self._event_waiters.get(waiter_key)
732
791
  if waiter:
733
792
  waiter[1].update(data)
@@ -742,6 +801,42 @@ class Launcher:
742
801
  ready_waiter[1]["_exited"] = True
743
802
  ready_waiter[0].set()
744
803
 
804
+ # 处理主动退出场景(没有 shutdown 的情况)
805
+ if module_id not in self._module_states:
806
+ self._init_module_state(module_id)
807
+ state = self._module_states[module_id]
808
+
809
+ if not state.get("shutdown_sent"):
810
+ # 主动退出:记录信息
811
+ if not state.get("exiting_received"):
812
+ state["exiting_received"] = True
813
+ state["reason"] = data.get("reason", "active_exit")
814
+ state["restart"] = data.get("restart", False)
815
+ cleanup_timeout = data.get("cleanup_timeout", CLEANUP_TIMEOUT_DEFAULT)
816
+ cleanup_timeout = max(CLEANUP_TIMEOUT_MIN, min(cleanup_timeout, CLEANUP_TIMEOUT_MAX))
817
+ state["cleanup_timeout"] = cleanup_timeout
818
+
819
+ # 启动清理超时任务
820
+ async def cleanup_timeout_handler():
821
+ await asyncio.sleep(state["cleanup_timeout"])
822
+ if not state.get("stopped_sent"):
823
+ state["stopped_sent"] = True
824
+ self._kill_process(module_id)
825
+
826
+ # 发送 stopped 事件
827
+ await self._publish_event("module.stopped", {
828
+ "module_id": module_id,
829
+ "exit_code": -1, # 超时强制终止,退出码未知
830
+ "exit_type": "timeout",
831
+ "reason": state.get("reason", "cleanup_timeout"),
832
+ "restart": state.get("restart", False),
833
+ "ready_received": False,
834
+ })
835
+
836
+ self._log_lifecycle("stopped", module_id, reason=state["reason"])
837
+
838
+ state["cleanup_task"] = asyncio.create_task(cleanup_timeout_handler())
839
+
745
840
  # module.crash → print red crash summary
746
841
  if event == "module.crash" and module_id:
747
842
  RED = "\033[91m"
@@ -756,6 +851,43 @@ class Launcher:
756
851
  )
757
852
  print(f"[launcher] 崩溃日志: {crash_log}")
758
853
 
854
+ # pairing.status → handle all pairing flow events
855
+ if event == "pairing.status":
856
+ GREEN = "\033[92m"
857
+ RED = "\033[91m"
858
+ RESET = "\033[0m"
859
+
860
+ step = data.get("step", "")
861
+ success = data.get("success", True)
862
+
863
+ if step == "code_generated":
864
+ code = data.get("code", "")
865
+ expires_in = data.get("expires_in", 300)
866
+ if code:
867
+ print(f"[launcher] {GREEN}配对码: {code}{RESET}")
868
+ print(f"[launcher] {GREEN}有效期: {expires_in} 秒{RESET}")
869
+ print(f"[launcher] {GREEN}访问 Web 界面时使用此配对码进行配对{RESET}")
870
+
871
+ elif step == "pairing":
872
+ if success:
873
+ print(f"[launcher] {GREEN}正在配对...{RESET}")
874
+ else:
875
+ reason = data.get("reason", "Unknown error")
876
+ print(f"[launcher] {RED}✗ 配对失败: {reason}{RESET}")
877
+
878
+ elif step == "completed":
879
+ if success:
880
+ module_id = data.get("module_id", "")
881
+ role = data.get("role", "")
882
+ print(f"[launcher] {GREEN}✓ 配对成功!{RESET}")
883
+ print(f"[launcher] {GREEN} 模块 ID: {module_id}{RESET}")
884
+ print(f"[launcher] {GREEN} 角色: {role}{RESET}")
885
+ else:
886
+ reason = data.get("reason", "Unknown error")
887
+ print(f"[launcher] {RED}✗ 配对失败: {reason}{RESET}")
888
+
889
+ return
890
+
759
891
  # Only log system events (module.*, watchdog.*) to avoid flooding
760
892
  if not (event.startswith("module.") or event.startswith("watchdog.")):
761
893
  return
@@ -780,12 +912,16 @@ class Launcher:
780
912
  params = msg.get("params", {})
781
913
 
782
914
  handlers = {
783
- "list_modules": self._rpc_list_modules,
784
- "start_module": self._rpc_start_module,
785
- "stop_module": self._rpc_stop_module,
786
- "restart_module": self._rpc_restart_module,
787
- "rescan": self._rpc_rescan,
788
- "shutdown": self._rpc_shutdown,
915
+ "list_modules": self._rpc_list_modules,
916
+ "start_module": self._rpc_start_module,
917
+ "stop_module": self._rpc_stop_module,
918
+ "restart_module": self._rpc_restart_module,
919
+ "restart_launcher": self._rpc_restart_launcher,
920
+ "rescan": self._rpc_rescan,
921
+ "shutdown": self._rpc_shutdown,
922
+ "get_module_config": self._rpc_get_module_config,
923
+ "update_module_config": self._rpc_update_module_config,
924
+ "reset_module_config": self._rpc_reset_module_config,
789
925
  }
790
926
  handler = handlers.get(method)
791
927
  if handler:
@@ -815,11 +951,14 @@ class Launcher:
815
951
  "name": name,
816
952
  "display_name": info.display_name,
817
953
  "type": info.type,
818
- "config_state": info.state,
954
+ "state": info.state, # 改名为 state(与 /api/modules 一致)
955
+ "version": info.version,
956
+ "runtime": info.runtime,
957
+ "preferred_port": info.preferred_port,
958
+ "monitor": info.monitor,
819
959
  "desired_state": self._desired_states.get(name, "stopped"),
820
960
  "actual_state": f"running({rec.pid})" if running and rec else "stopped",
821
961
  "pid": rec.pid if running and rec else None,
822
- "monitor": info.monitor,
823
962
  })
824
963
  return {"modules": result}
825
964
 
@@ -911,30 +1050,243 @@ class Launcher:
911
1050
  self._request_shutdown(f"RPC shutdown request: {reason}")
912
1051
  return {"status": "shutting_down", "reason": reason}
913
1052
 
1053
+ async def _rpc_get_module_config(self, params: dict) -> dict:
1054
+ """获取指定模块的配置(通用降级方案)"""
1055
+ import re
1056
+ import yaml
1057
+ from pathlib import Path
1058
+
1059
+ module_name = params.get("module_name")
1060
+ if not module_name:
1061
+ raise ValueError("module_name required")
1062
+
1063
+ # 查找模块信息
1064
+ info = self.modules.get(module_name)
1065
+ if not info:
1066
+ raise RuntimeError(f"Module '{module_name}' not found")
1067
+
1068
+ # 读取 module.md
1069
+ md_path = Path(info.module_dir) / "module.md"
1070
+ if not md_path.exists():
1071
+ raise RuntimeError(f"module.md not found for '{module_name}'")
1072
+
1073
+ text = md_path.read_text(encoding="utf-8")
1074
+ m = re.match(r'^---\s*\n(.*?)\n---\s*\n?(.*)', text, re.DOTALL)
1075
+ if not m:
1076
+ frontmatter = {}
1077
+ else:
1078
+ frontmatter = yaml.safe_load(m.group(1)) or {}
1079
+
1080
+ # 读取 config.yaml(如果存在)
1081
+ config_path = Path(info.module_dir) / "config.yaml"
1082
+ config = None
1083
+ if config_path.exists():
1084
+ config = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
1085
+
1086
+ return {
1087
+ "name": frontmatter.get("name", module_name),
1088
+ "display_name": frontmatter.get("display_name", ""),
1089
+ "type": frontmatter.get("type", ""),
1090
+ "state": frontmatter.get("state", "enabled"),
1091
+ "version": frontmatter.get("version", ""),
1092
+ "runtime": frontmatter.get("runtime", ""),
1093
+ "entry": frontmatter.get("entry", ""),
1094
+ "preferred_port": frontmatter.get("preferred_port"),
1095
+ "advertise_ip": frontmatter.get("advertise_ip"),
1096
+ "monitor": frontmatter.get("monitor"),
1097
+ "events": frontmatter.get("events"),
1098
+ "subscriptions": frontmatter.get("subscriptions"),
1099
+ "depends_on": frontmatter.get("depends_on"),
1100
+ "source_path": str(info.module_dir), # 添加模块路径
1101
+ "has_config": config is not None,
1102
+ "config": config,
1103
+ }
1104
+
1105
+ async def _rpc_update_module_config(self, params: dict) -> dict:
1106
+ """更新指定模块的配置(通用降级方案)"""
1107
+ import yaml
1108
+ from pathlib import Path
1109
+
1110
+ module_name = params.get("module_name")
1111
+ metadata = params.get("metadata", {})
1112
+ config = params.get("config", {})
1113
+
1114
+ if not module_name:
1115
+ raise ValueError("module_name required")
1116
+
1117
+ info = self.modules.get(module_name)
1118
+ if not info:
1119
+ raise RuntimeError(f"Module '{module_name}' not found")
1120
+
1121
+ md_path = Path(info.module_dir) / "module.md"
1122
+ if not md_path.exists():
1123
+ raise RuntimeError(f"module.md not found for '{module_name}'")
1124
+
1125
+ # 更新 module.md frontmatter
1126
+ if metadata:
1127
+ frontmatter, body = _parse_frontmatter(md_path.read_text(encoding="utf-8"))
1128
+ for key, value in metadata.items():
1129
+ frontmatter[key] = value
1130
+ fm_str = yaml.dump(frontmatter, allow_unicode=True, sort_keys=False, default_flow_style=False).rstrip()
1131
+ content = f"---\n{fm_str}\n---\n{body}"
1132
+ md_path.write_text(content, encoding="utf-8")
1133
+
1134
+ # 更新 config.yaml
1135
+ if config:
1136
+ config_path = Path(info.module_dir) / "config.yaml"
1137
+ existing = {}
1138
+ if config_path.exists():
1139
+ existing = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
1140
+ # Deep merge
1141
+ self._deep_merge(existing, config)
1142
+ config_path.write_text(
1143
+ yaml.dump(existing, allow_unicode=True, sort_keys=False, default_flow_style=False),
1144
+ encoding="utf-8"
1145
+ )
1146
+
1147
+ # 重新扫描以更新缓存
1148
+ await self._rescan_modules()
1149
+
1150
+ # 返回更新后的配置
1151
+ return await self._rpc_get_module_config({"module_name": module_name})
1152
+
1153
+ async def _rpc_reset_module_config(self, params: dict) -> dict:
1154
+ """恢复指定模块的默认配置(通用降级方案)"""
1155
+ import yaml
1156
+ from pathlib import Path
1157
+
1158
+ module_name = params.get("module_name")
1159
+ fields = params.get("fields", [])
1160
+ reset_all = params.get("all", False)
1161
+
1162
+ if not module_name:
1163
+ raise ValueError("module_name required")
1164
+
1165
+ info = self.modules.get(module_name)
1166
+ if not info:
1167
+ raise RuntimeError(f"Module '{module_name}' not found")
1168
+
1169
+ md_path = Path(info.module_dir) / "module.md"
1170
+ if not md_path.exists():
1171
+ raise RuntimeError(f"module.md not found for '{module_name}'")
1172
+
1173
+ # 默认值定义(通用)
1174
+ defaults = {
1175
+ "state": "enabled",
1176
+ "monitor": True,
1177
+ }
1178
+
1179
+ frontmatter, body = _parse_frontmatter(md_path.read_text(encoding="utf-8"))
1180
+
1181
+ if reset_all:
1182
+ for key, value in defaults.items():
1183
+ frontmatter[key] = value
1184
+ else:
1185
+ for field in fields:
1186
+ if field in defaults:
1187
+ frontmatter[field] = defaults[field]
1188
+ elif field == "preferred_port":
1189
+ frontmatter.pop(field, None) # 恢复为 null
1190
+ elif field == "advertise_ip":
1191
+ frontmatter[field] = "127.0.0.1"
1192
+
1193
+ fm_str = yaml.dump(frontmatter, allow_unicode=True, sort_keys=False, default_flow_style=False).rstrip()
1194
+ content = f"---\n{fm_str}\n---\n{body}"
1195
+ md_path.write_text(content, encoding="utf-8")
1196
+
1197
+ # 重新扫描以更新缓存
1198
+ await self._rescan_modules()
1199
+
1200
+ return await self._rpc_get_module_config({"module_name": module_name})
1201
+
1202
+ @staticmethod
1203
+ def _deep_merge(base: dict, overlay: dict) -> dict:
1204
+ """递归合并字典"""
1205
+ for k, v in overlay.items():
1206
+ if k in base and isinstance(base[k], dict) and isinstance(v, dict):
1207
+ Launcher._deep_merge(base[k], v)
1208
+ else:
1209
+ base[k] = v
1210
+ return base
1211
+
1212
+
1213
+ async def _rpc_restart_launcher(self, params: dict) -> dict:
1214
+ """Restart Launcher process via Watchdog.
1215
+
1216
+ Simply notify watchdog and exit. Watchdog will start a new instance.
1217
+
1218
+ Args:
1219
+ params: {
1220
+ "reason": str (optional) - Restart reason
1221
+ }
1222
+
1223
+ Returns:
1224
+ {"status": "restarting", "reason": str}
1225
+ or {"error": "watchdog offline"}
1226
+ """
1227
+ reason = params.get("reason", "user_request")
1228
+ O = "\033[33m" # orange/yellow
1229
+ R = "\033[0m" # reset
1230
+ print(f"{O}[launcher] 收到 Launcher 重启请求{R}")
1231
+ print(f"[launcher] 原因: {reason}")
1232
+
1233
+ # Check if watchdog is running
1234
+ watchdog_running = self.process_manager.is_running("watchdog")
1235
+ print(f"[launcher] 检查 watchdog 状态: {'running' if watchdog_running else 'stopped'}")
1236
+
1237
+ if not watchdog_running:
1238
+ error_msg = "watchdog 未运行, 无法重启"
1239
+ print(f"[launcher] ❌ {error_msg}")
1240
+ return {"error": error_msg}
1241
+
1242
+ print(f"[launcher] ✓ watchdog 状态正常,准备重启流程")
1243
+
1244
+ # Schedule restart in background (don't block RPC response)
1245
+ async def _do_restart():
1246
+ await asyncio.sleep(0.3) # 确保 RPC 响应已发送
1247
+
1248
+ print(f"[launcher] 发送 module.exiting 事件给 watchdog...")
1249
+
1250
+ # Collect startup info for watchdog to restart with same environment
1251
+ startup_info = {
1252
+ "python": sys.executable,
1253
+ "argv": sys.argv,
1254
+ "cwd": os.getcwd(),
1255
+ "env": dict(os.environ), # 所有环境变量
1256
+ }
1257
+
1258
+ # Notify watchdog: this is a planned restart, not a crash
1259
+ await self._publish_event("module.exiting", {
1260
+ "module_id": "launcher",
1261
+ "action": "restart_launcher",
1262
+ "reason": reason,
1263
+ "startup_info": startup_info,
1264
+ })
1265
+
1266
+ print(f"[launcher] 已通知 watchdog 计划内重启")
1267
+ print(f"[launcher] 退出进程,等待 watchdog 重启")
1268
+ print(f"[launcher] 原因: {reason}")
1269
+
1270
+ os._exit(0)
1271
+
1272
+ asyncio.create_task(_do_restart())
1273
+
1274
+ return {"status": "restarting", "reason": reason}
1275
+
914
1276
  # ── Event publishing via RPC ──
915
1277
 
916
1278
  async def _publish_event(self, event_type: str, data: dict):
917
1279
  """Publish an event via RPC event.publish through Kernel WS."""
918
1280
  if not self._ws:
919
1281
  return
920
- msg = json.dumps({
921
- "jsonrpc": "2.0",
922
- "id": str(uuid.uuid4()),
923
- "method": "event.publish",
924
- "params": {
1282
+ try:
1283
+ await self._rpc_call(self._ws, "event.publish", {
925
1284
  "event_id": str(uuid.uuid4()),
926
1285
  "event": event_type,
927
1286
  "data": data,
928
- },
929
- })
930
-
931
- async def _send():
932
- try:
933
- await self._ws.send(msg)
934
- except Exception as e:
935
- print(f"[launcher] 发布事件失败: {e}")
936
-
937
- asyncio.create_task(_send())
1287
+ }, timeout=2.0)
1288
+ except Exception as e:
1289
+ print(f"[launcher] 发布事件失败 ({event_type}): {e}")
938
1290
 
939
1291
  async def _wait_event(self, event_type: str, module_id: str, timeout: float) -> dict | None:
940
1292
  """Wait for a specific event from a module. Returns data dict or None on timeout."""
@@ -956,90 +1308,299 @@ class Launcher:
956
1308
  finally:
957
1309
  self._event_waiters.pop(key, None)
958
1310
 
959
- async def _graceful_stop(self, name: str, reason: str = "stop_requested", timeout: float = 10):
960
- """Graceful shutdown: check capability → send event → wait ack → wait ready → kill.
961
- Modules that did not declare graceful_shutdown in module.ready are terminated directly.
962
- """
963
- self._log_lifecycle("stopping", name, reason=reason)
1311
+ # ── 退出机制辅助方法 ──
1312
+
1313
+ def _init_module_state(self, name: str):
1314
+ """初始化模块状态跟踪字典"""
1315
+ self._module_states[name] = {
1316
+ "shutdown_sent": False,
1317
+ "ack_received": False,
1318
+ "exiting_received": False,
1319
+ "ready_received": False,
1320
+ "stopped_sent": False,
1321
+ "exit_type": None, # "graceful" | "non_graceful" | "active"
1322
+ "reason": None,
1323
+ "restart": None,
1324
+ "cleanup_timeout": None,
1325
+ "cleanup_task": None,
1326
+ }
964
1327
 
965
- if not self._graceful_modules.get(name):
966
- self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
967
- self._log_lifecycle("stopped", name, reason=reason)
968
- await self._publish_event("module.stopped", {
969
- "module_id": name,
970
- "graceful_shutdown": False,
971
- })
1328
+ def _kill_process(self, name: str):
1329
+ """统一的进程杀死方法"""
1330
+ record = self.process_manager.get_record(name)
1331
+ if record and record.proc and record.proc.poll() is None:
1332
+ print(f"[launcher] 强制终止 {name} (PID {record.proc.pid})")
1333
+ self.process_manager.kill_process(name)
1334
+ elif record:
1335
+ # 进程已经退出,只是清理记录
1336
+ pass
1337
+ else:
1338
+ # 没有记录,可能已经被清理
1339
+ pass
1340
+
1341
+ def _determine_exit_type(self, name: str) -> str:
1342
+ """判断退出类型: graceful | non_graceful | active"""
1343
+ state = self._module_states.get(name, {})
1344
+ if state.get("exiting_received"):
1345
+ return "graceful"
1346
+ elif state.get("shutdown_sent"):
1347
+ return "non_graceful"
1348
+ else:
1349
+ return "active"
1350
+
1351
+ def _resolve_reason(self, name: str) -> str:
1352
+ """解析最终原因(优先级:exiting > shutdown > 默认)"""
1353
+ state = self._module_states.get(name, {})
1354
+ if state.get("reason"):
1355
+ return state["reason"]
1356
+ return "unknown"
1357
+
1358
+ def _resolve_restart(self, name: str) -> bool:
1359
+ """解析重启决策(优先级:exiting > shutdown > 默认)"""
1360
+ state = self._module_states.get(name, {})
1361
+ if state.get("restart") is not None:
1362
+ return state["restart"]
1363
+ # 默认:主动退出不重启,被动关闭看 desired_state
1364
+ if self._determine_exit_type(name) == "active":
1365
+ return False
1366
+ return self._desired_states.get(name) == "running"
1367
+
1368
+ async def _send_stopped_event(self, name: str, exit_code: int):
1369
+ """发送 module.stopped 事件(防重复)"""
1370
+ state = self._module_states.get(name, {})
1371
+ if state.get("stopped_sent"):
972
1372
  return
973
1373
 
974
- # Register waiters BEFORE sending shutdown event
975
- ack_key = f"module.shutdown.ack:{name}"
976
- ack_evt = asyncio.Event()
977
- ack_data = {}
978
- self._event_waiters[ack_key] = (ack_evt, ack_data)
1374
+ # 立即设置标记(防止竞态条件)
1375
+ if name in self._module_states:
1376
+ self._module_states[name]["stopped_sent"] = True
979
1377
 
980
- ready_key = f"module.shutdown.ready:{name}"
981
- ready_evt = asyncio.Event()
982
- ready_data = {}
983
- self._event_waiters[ready_key] = (ready_evt, ready_data)
1378
+ exit_type = self._determine_exit_type(name)
1379
+ reason = self._resolve_reason(name)
1380
+ restart = self._resolve_restart(name)
984
1381
 
985
- await self._publish_event("module.shutdown", {
986
- "module_id": name, "reason": reason, "timeout": timeout,
1382
+ await self._publish_event("module.stopped", {
1383
+ "module_id": name,
1384
+ "exit_code": exit_code,
1385
+ "exit_type": exit_type,
1386
+ "reason": reason,
1387
+ "restart": restart,
1388
+ "ready_received": state.get("ready_received", False),
987
1389
  })
988
1390
 
989
- # Wait for ack
1391
+ # ── 优雅关闭 ──
1392
+
1393
+ async def _graceful_stop(self, name: str, reason: str = "stop_requested", timeout: float = 10):
1394
+ """优雅关闭单个模块:
1395
+ 1. 初始化状态跟踪
1396
+ 2. 非优雅模块直接 SIGTERM
1397
+ 3. 优雅模块:发送 shutdown → 等待 ack → 等待 exiting → 启动清理超时 → 杀死
1398
+ """
990
1399
  try:
991
- await asyncio.wait_for(ack_evt.wait(), timeout=3)
992
- ack = ack_data
993
- except asyncio.TimeoutError:
994
- ack = None
995
- finally:
996
- self._event_waiters.pop(ack_key, None)
1400
+ self._log_lifecycle("stopping", name, reason=reason)
997
1401
 
998
- if not ack:
999
- self._event_waiters.pop(ready_key, None)
1000
- self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
1001
- await self._publish_event("module.stopped", {
1402
+ # 初始化状态
1403
+ self._init_module_state(name)
1404
+ state = self._module_states[name]
1405
+
1406
+ # 非优雅模块:直接 SIGTERM
1407
+ if not self._graceful_modules.get(name):
1408
+ state["shutdown_sent"] = True # 标记:Launcher 主动关闭
1409
+ state["stopped_sent"] = True # 防重复标记
1410
+ state["reason"] = reason
1411
+ state["restart"] = self._desired_states.get(name) == "running"
1412
+
1413
+ self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
1414
+
1415
+ # 发送 stopped 事件
1416
+ await self._publish_event("module.stopped", {
1417
+ "module_id": name,
1418
+ "exit_code": 0,
1419
+ "exit_type": "non_graceful",
1420
+ "reason": reason,
1421
+ "restart": state["restart"],
1422
+ "ready_received": False,
1423
+ })
1424
+
1425
+ self._log_lifecycle("stopped", name, reason=reason)
1426
+ return
1427
+
1428
+ # 优雅模块:提前注册所有三个 waiter(ack、exiting、ready)
1429
+ # 这样可以避免事件到达时 waiter 还没注册的竞争条件
1430
+ ack_key = f"module.shutdown.ack:{name}"
1431
+ ack_evt = asyncio.Event()
1432
+ ack_data = {}
1433
+ self._event_waiters[ack_key] = (ack_evt, ack_data)
1434
+
1435
+ exiting_key = f"module.exiting:{name}"
1436
+ exiting_evt = asyncio.Event()
1437
+ exiting_data = {}
1438
+ self._event_waiters[exiting_key] = (exiting_evt, exiting_data)
1439
+
1440
+ ready_key = f"module.shutdown.ready:{name}"
1441
+ ready_evt = asyncio.Event()
1442
+ ready_data = {}
1443
+ self._event_waiters[ready_key] = (ready_evt, ready_data)
1444
+
1445
+ # 发送 shutdown 事件
1446
+ state["shutdown_sent"] = True
1447
+ state["reason"] = reason
1448
+ state["restart"] = self._desired_states.get(name) == "running"
1449
+
1450
+ await self._publish_event("module.shutdown", {
1002
1451
  "module_id": name,
1003
- "graceful_shutdown": self._graceful_modules.get(name, False),
1452
+ "reason": reason,
1453
+ "timeout": timeout,
1454
+ "restart": state["restart"],
1004
1455
  })
1005
- return
1006
1456
 
1007
- estimated = min(ack.get("estimated_cleanup", timeout), timeout)
1457
+ # 等待 ack
1458
+ try:
1459
+ await asyncio.wait_for(ack_evt.wait(), timeout=SHUTDOWN_TIMEOUT_ACK)
1460
+ state["ack_received"] = True
1461
+ except asyncio.TimeoutError:
1462
+ pass
1463
+ finally:
1464
+ self._event_waiters.pop(ack_key, None)
1008
1465
 
1009
- # Wait for ready
1010
- try:
1011
- await asyncio.wait_for(ready_evt.wait(), timeout=estimated)
1012
- ready = ready_data
1013
- except asyncio.TimeoutError:
1014
- ready = None
1015
- finally:
1016
- self._event_waiters.pop(ready_key, None)
1017
- if ready:
1018
- self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_READY)
1019
- else:
1020
- self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
1466
+ if not state["ack_received"]:
1467
+ # 没有 ack,直接杀死
1468
+ self._event_waiters.pop(exiting_key, None)
1469
+ self._event_waiters.pop(ready_key, None)
1470
+ state["stopped_sent"] = True
1471
+ self._kill_process(name)
1021
1472
 
1022
- self._log_lifecycle("stopped", name, reason=reason)
1023
- await self._publish_event("module.stopped", {
1024
- "module_id": name,
1025
- "graceful_shutdown": self._graceful_modules.get(name, False),
1026
- })
1473
+ # 发送 stopped 事件
1474
+ await self._publish_event("module.stopped", {
1475
+ "module_id": name,
1476
+ "exit_code": -1, # 未收到 ack,退出码未知
1477
+ "exit_type": "timeout",
1478
+ "reason": state["reason"],
1479
+ "restart": state["restart"],
1480
+ "ready_received": False,
1481
+ })
1482
+
1483
+ self._log_lifecycle("stopped", name, reason=reason)
1484
+ return
1485
+
1486
+ # 等待 exiting 事件
1487
+ try:
1488
+ await asyncio.wait_for(exiting_evt.wait(), timeout=SHUTDOWN_TIMEOUT_EXITING)
1489
+ state["exiting_received"] = True
1490
+ # 从 exiting 事件中提取信息
1491
+ if exiting_data.get("reason"):
1492
+ state["reason"] = exiting_data["reason"]
1493
+ if "restart" in exiting_data:
1494
+ state["restart"] = exiting_data["restart"]
1495
+ cleanup_timeout = exiting_data.get("cleanup_timeout", CLEANUP_TIMEOUT_DEFAULT)
1496
+ cleanup_timeout = max(CLEANUP_TIMEOUT_MIN, min(cleanup_timeout, CLEANUP_TIMEOUT_MAX))
1497
+ state["cleanup_timeout"] = cleanup_timeout
1498
+ except asyncio.TimeoutError:
1499
+ pass
1500
+ finally:
1501
+ self._event_waiters.pop(exiting_key, None)
1502
+
1503
+ if not state["exiting_received"]:
1504
+ # 没有 exiting,直接杀死
1505
+ self._event_waiters.pop(ready_key, None)
1506
+ state["stopped_sent"] = True
1507
+ self._kill_process(name)
1508
+
1509
+ # 发送 stopped 事件
1510
+ await self._publish_event("module.stopped", {
1511
+ "module_id": name,
1512
+ "exit_code": -1, # 未收到 exiting,退出码未知
1513
+ "exit_type": "timeout",
1514
+ "reason": state["reason"],
1515
+ "restart": state["restart"],
1516
+ "ready_received": False,
1517
+ })
1518
+
1519
+ self._log_lifecycle("stopped", name, reason=state["reason"])
1520
+ return
1521
+
1522
+ # ready waiter 已经在前面注册好了,直接启动清理超时任务
1523
+ # 启动清理超时任务(兜底机制)
1524
+ async def cleanup_timeout_handler():
1525
+ await asyncio.sleep(state["cleanup_timeout"])
1526
+ if not state.get("stopped_sent"):
1527
+ print(f"[launcher] {name} 清理超时 ({state['cleanup_timeout']}s),强制终止")
1528
+ state["stopped_sent"] = True
1529
+ self._kill_process(name)
1530
+
1531
+ # 发送 stopped 事件
1532
+ await self._publish_event("module.stopped", {
1533
+ "module_id": name,
1534
+ "exit_code": -1, # 清理超时,退出码未知
1535
+ "exit_type": "timeout",
1536
+ "reason": state["reason"],
1537
+ "restart": state["restart"],
1538
+ "ready_received": False,
1539
+ })
1540
+
1541
+ self._log_lifecycle("stopped", name, reason=state["reason"])
1542
+
1543
+ state["cleanup_task"] = asyncio.create_task(cleanup_timeout_handler())
1544
+
1545
+ # 等待 ready 事件(主路径)
1546
+ try:
1547
+ await asyncio.wait_for(ready_evt.wait(), timeout=state["cleanup_timeout"])
1548
+ state["ready_received"] = True
1549
+ print(f"[launcher] {name} 清理完成,准备退出")
1550
+ except asyncio.TimeoutError:
1551
+ # 超时由 cleanup_timeout_handler 处理
1552
+ pass
1553
+ finally:
1554
+ self._event_waiters.pop(ready_key, None)
1555
+
1556
+ # 取消清理超时任务(如果 ready 先到达)
1557
+ if state.get("ready_received") and state["cleanup_task"] and not state["cleanup_task"].done():
1558
+ state["cleanup_task"].cancel()
1559
+
1560
+ # 如果收到 ready,立即杀死进程
1561
+ if state.get("ready_received") and not state.get("stopped_sent"):
1562
+ state["stopped_sent"] = True
1563
+ self._kill_process(name)
1564
+
1565
+ # 发送 stopped 事件
1566
+ await self._publish_event("module.stopped", {
1567
+ "module_id": name,
1568
+ "exit_code": 0, # 正常退出
1569
+ "exit_type": "graceful",
1570
+ "reason": state["reason"],
1571
+ "restart": state["restart"],
1572
+ "ready_received": True,
1573
+ })
1574
+
1575
+ self._log_lifecycle("stopped", name, reason=state["reason"])
1576
+
1577
+ except Exception as e:
1578
+ # 优雅关闭出错,强制终止进程
1579
+ print(f"[launcher] 优雅关闭出错: {e}")
1580
+ if not state.get("stopped_sent"):
1581
+ state["stopped_sent"] = True
1582
+ self._kill_process(name)
1583
+ # 清理所有 waiters
1584
+ self._event_waiters.pop(f"module.shutdown.ack:{name}", None)
1585
+ self._event_waiters.pop(f"module.exiting:{name}", None)
1586
+ self._event_waiters.pop(f"module.shutdown.ready:{name}", None)
1027
1587
 
1028
1588
  async def _graceful_shutdown_all(self):
1029
- """Shut down all modules. Order:
1030
- 1. Send module.exiting for Launcher itself (so Watchdog knows it's intentional)
1031
- 2. Send shutdown to graceful modules (excl. Kernel) — let them start cleanup
1032
- 3. Terminate non-graceful modules (fast, runs during graceful cleanup)
1033
- 4. Wait for graceful modules to exit (process monitoring)
1034
- 5. Shut down Kernel last (keeps event routing alive throughout)
1589
+ """全量优雅退出:三阶段关闭
1590
+
1591
+ Phase 1: 先关闭 Watchdog(防止它监控到其他模块退出后触发重启)
1592
+ Phase 2: 关闭其他所有模块(除 Kernel)
1593
+ Phase 3: 最后关闭 Kernel(保证事件路由畅通)
1035
1594
  """
1036
1595
  self._system_shutting_down = True
1037
1596
 
1038
- # Send module.exiting for Launcher before anything else
1597
+ # 发送 Launcher 自己的 exiting 事件
1039
1598
  await self._publish_event("module.exiting", {
1040
1599
  "module_id": "launcher",
1600
+ "type": "active",
1041
1601
  "reason": "system_shutdown",
1042
1602
  "action": "none",
1603
+ "timeout": 0,
1043
1604
  })
1044
1605
 
1045
1606
  running = [n for n in self.modules if self.process_manager.is_running(n)]
@@ -1047,88 +1608,144 @@ class Launcher:
1047
1608
  for cn in CORE_MODULE_NAMES:
1048
1609
  if self.process_manager.is_running(cn) and cn not in running:
1049
1610
  running.append(cn)
1611
+
1050
1612
  if not running:
1051
1613
  print("[launcher] 没有运行中的模块需要关闭")
1052
1614
  return
1053
1615
 
1054
- graceful = [n for n in running if self._graceful_modules.get(n)]
1055
- non_graceful = [n for n in running if not self._graceful_modules.get(n)]
1616
+ # 分组:Watchdog、Kernel、其他模块
1617
+ watchdog_running = WATCHDOG_MODULE_NAME in running
1618
+ kernel_running = "kernel" in running
1619
+ other_modules = [n for n in running if n not in (WATCHDOG_MODULE_NAME, "kernel")]
1620
+
1621
+ graceful_others = [n for n in other_modules if self._graceful_modules.get(n)]
1622
+ non_graceful_others = [n for n in other_modules if not self._graceful_modules.get(n)]
1623
+
1624
+ print(f"[launcher] 正在关闭 {len(running)} 个模块(三阶段)")
1625
+
1626
+ # ═══════════════════════════════════════════════════════════
1627
+ # Phase 1: 先关闭 Watchdog(防止重启其他模块)
1628
+ # ═══════════════════════════════════════════════════════════
1629
+ if watchdog_running and self.process_manager.is_running(WATCHDOG_MODULE_NAME):
1630
+ print(f"[launcher] Phase 1: 通知 Watchdog 退出(防止重启其他模块)")
1631
+
1632
+ if self._graceful_modules.get(WATCHDOG_MODULE_NAME):
1633
+ # Watchdog 支持优雅退出
1634
+ self._init_module_state(WATCHDOG_MODULE_NAME)
1635
+ state = self._module_states[WATCHDOG_MODULE_NAME]
1636
+ state["shutdown_sent"] = True
1637
+ state["reason"] = "system_shutdown"
1638
+ state["restart"] = False
1639
+ self._log_lifecycle("stopping", WATCHDOG_MODULE_NAME, reason="system_shutdown")
1640
+
1641
+ await self._publish_event("module.shutdown", {
1642
+ "module_id": WATCHDOG_MODULE_NAME,
1643
+ "reason": "system_shutdown",
1644
+ "timeout": 5,
1645
+ "restart": False,
1646
+ })
1647
+
1648
+ # 等待 0.2 秒确保事件送达(不需要等待进程退出)
1649
+ await asyncio.sleep(0.2)
1650
+ print(f"[launcher] Watchdog shutdown 事件已发送")
1651
+ else:
1652
+ # 直接终止
1653
+ self._init_module_state(WATCHDOG_MODULE_NAME)
1654
+ state = self._module_states[WATCHDOG_MODULE_NAME]
1655
+ state["shutdown_sent"] = True
1656
+ state["stopped_sent"] = True
1657
+ state["reason"] = "system_shutdown"
1658
+ state["restart"] = False
1659
+ self._log_lifecycle("stopping", WATCHDOG_MODULE_NAME, reason="system_shutdown")
1056
1660
 
1057
- # Defer Kernel — it must stay alive to route shutdown events
1058
- kernel_deferred = "kernel" in graceful
1059
- graceful_batch = [n for n in graceful if n != "kernel"] if kernel_deferred else graceful
1661
+ self.process_manager.stop_module(WATCHDOG_MODULE_NAME, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
1060
1662
 
1061
- print(f"[launcher] 正在关闭 {len(running)} 个模块: {', '.join(running)}")
1663
+ await self._publish_event("module.stopped", {
1664
+ "module_id": WATCHDOG_MODULE_NAME,
1665
+ "exit_code": 0,
1666
+ "exit_type": "non_graceful",
1667
+ "reason": "system_shutdown",
1668
+ "restart": False,
1669
+ "ready_received": False,
1670
+ })
1062
1671
 
1063
- # Phase 1: Notify graceful modules first (they start cleanup immediately)
1064
- for name in graceful_batch:
1672
+ self._log_lifecycle("stopped", WATCHDOG_MODULE_NAME, reason="system_shutdown")
1673
+
1674
+ # ═══════════════════════════════════════════════════════════
1675
+ # Phase 2: 关闭其他所有模块(除 Kernel)
1676
+ # ═══════════════════════════════════════════════════════════
1677
+ if graceful_others or non_graceful_others:
1678
+ print(f"[launcher] Phase 2: 关闭其他模块({len(graceful_others)} 优雅 + {len(non_graceful_others)} 非优雅)")
1679
+
1680
+ # 通知优雅模块
1681
+ for name in graceful_others:
1682
+ self._init_module_state(name)
1683
+ state = self._module_states[name]
1684
+ state["shutdown_sent"] = True
1685
+ state["reason"] = "system_shutdown"
1686
+ state["restart"] = False
1065
1687
  self._log_lifecycle("stopping", name, reason="system_shutdown")
1066
1688
  await self._publish_event("module.shutdown", {
1067
- "module_id": name, "reason": "system_shutdown", "timeout": 5,
1689
+ "module_id": name,
1690
+ "reason": "system_shutdown",
1691
+ "timeout": 5,
1692
+ "restart": False,
1068
1693
  })
1069
1694
 
1070
- # Phase 2: While graceful modules are cleaning up, terminate non-graceful ones
1071
- if non_graceful:
1072
- print(f"[launcher] 直接终止 {len(non_graceful)} 个不支持优雅退出的模块: {', '.join(non_graceful)}")
1073
- for name in non_graceful:
1695
+ # 终止非优雅模块
1696
+ for name in non_graceful_others:
1697
+ self._init_module_state(name)
1698
+ state = self._module_states[name]
1699
+ state["shutdown_sent"] = True
1700
+ state["stopped_sent"] = True
1701
+ state["reason"] = "system_shutdown"
1702
+ state["restart"] = False
1074
1703
  self._log_lifecycle("stopping", name, reason="system_shutdown")
1075
- self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
1704
+
1705
+ self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
1706
+
1707
+ await self._publish_event("module.stopped", {
1708
+ "module_id": name,
1709
+ "exit_code": 0,
1710
+ "exit_type": "non_graceful",
1711
+ "reason": "system_shutdown",
1712
+ "restart": False,
1713
+ "ready_received": False,
1714
+ })
1715
+
1076
1716
  self._log_lifecycle("stopped", name, reason="system_shutdown")
1077
1717
 
1078
- # Phase 3: Wait for graceful modules to exit (process monitoring)
1079
- if graceful_batch:
1718
+ # 等待优雅模块退出(包括 Watchdog)
1719
+ all_graceful = graceful_others + ([WATCHDOG_MODULE_NAME] if watchdog_running and self._graceful_modules.get(WATCHDOG_MODULE_NAME) else [])
1720
+ if all_graceful:
1080
1721
  deadline = time.time() + 5
1081
1722
  while time.time() < deadline:
1082
- still_running = [n for n in graceful_batch if self.process_manager.is_running(n)]
1723
+ still_running = [n for n in all_graceful if self.process_manager.is_running(n)]
1083
1724
  if not still_running:
1084
- print("[launcher] 所有优雅退出模块已自行退出")
1725
+ print("[launcher] 所有其他模块已退出")
1085
1726
  break
1086
1727
  remaining = max(0, deadline - time.time())
1087
1728
  print(f"[launcher] 等待 {len(still_running)} 个模块退出 ({remaining:.0f}s): {', '.join(still_running)}")
1088
1729
  await asyncio.sleep(1)
1089
- # Force kill survivors
1090
- for name in graceful_batch:
1730
+
1731
+ # 强杀未退出的
1732
+ for name in all_graceful:
1091
1733
  if self.process_manager.is_running(name):
1092
- self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
1093
- self._log_lifecycle("stopped", name, reason="system_shutdown")
1734
+ print(f"[launcher] {name} 超时,强制终止")
1735
+ self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
1736
+ self._log_lifecycle("stopped", name, reason="system_shutdown_timeout")
1094
1737
 
1095
- # Phase 4: All other modules exited — now shut down Kernel
1096
- if kernel_deferred and self.process_manager.is_running("kernel"):
1097
- self._log_lifecycle("stopping", "kernel", reason="system_shutdown")
1098
- print("[launcher] 正在关闭 Kernel...")
1738
+ # ═══════════════════════════════════════════════════════════
1739
+ # Phase 3: 最后关闭 Kernel(使用标准优雅退出流程)
1740
+ # ═══════════════════════════════════════════════════════════
1741
+ if kernel_running and self.process_manager.is_running("kernel"):
1742
+ print("[launcher] Phase 3: 关闭 Kernel(所有其他模块已退出)")
1099
1743
 
1100
- # Call kernel.shutdown RPC (not event)
1101
- rpc_sent = False
1102
- try:
1103
- if self._ws:
1104
- await self._rpc_call(self._ws, "kernel.shutdown", {})
1105
- print("[launcher] Kernel shutdown RPC 已发送")
1106
- rpc_sent = True
1107
- else:
1108
- print("[launcher] WebSocket 未连接,跳过 RPC 调用")
1109
- except Exception as e:
1110
- print(f"[launcher] Kernel shutdown RPC 失败: {e}")
1111
-
1112
- # Wait for kernel to exit
1113
- if rpc_sent:
1114
- # RPC sent: wait up to 5s for graceful exit
1115
- proc = self.process_manager._processes.get("kernel")
1116
- if proc:
1117
- try:
1118
- loop = asyncio.get_event_loop()
1119
- await asyncio.wait_for(
1120
- loop.run_in_executor(None, proc.wait),
1121
- timeout=5
1122
- )
1123
- print("[launcher] Kernel 已退出")
1124
- except asyncio.TimeoutError:
1125
- print("[launcher] Kernel 5秒内未退出,强制停止")
1126
- self.process_manager.stop_module("kernel", timeout=SHUTDOWN_TIMEOUT_PARTIAL)
1127
- else:
1128
- # No RPC (WS not connected): use shorter timeout for terminate
1129
- self.process_manager.stop_module("kernel", timeout=2)
1744
+ # 明确标记不重启
1745
+ self._desired_states["kernel"] = "stopped"
1130
1746
 
1131
- self._log_lifecycle("stopped", "kernel", reason="system_shutdown")
1747
+ # 使用标准优雅退出流程(内含等待 ack → exiting → ready → kill 完整逻辑)
1748
+ await self._graceful_stop("kernel", reason="system_shutdown", timeout=5)
1132
1749
 
1133
1750
  # Final safety net
1134
1751
  try:
@@ -1282,7 +1899,7 @@ class Launcher:
1282
1899
  # Call Kernel RPC to generate tokens
1283
1900
  try:
1284
1901
  result = await self._rpc_call(self._ws, "kernel.generate_tokens", {"modules": module_names})
1285
- if result.get("result", {}).get("ok"):
1902
+ if "result" in result:
1286
1903
  tokens = result["result"].get("tokens", {})
1287
1904
  self._module_tokens.update(tokens)
1288
1905
  print(f"[launcher] Kernel 已生成 {len(tokens)} 个模块令牌")
@@ -1297,7 +1914,7 @@ class Launcher:
1297
1914
  return
1298
1915
  try:
1299
1916
  result = await self._rpc_call(self._ws, "kernel.register_tokens", tokens)
1300
- if result.get("result", {}).get("ok"):
1917
+ if "result" in result:
1301
1918
  print(f"[launcher] 已注册 {len(tokens)} 个模块令牌")
1302
1919
  elif "error" in result:
1303
1920
  print(f"[launcher] 警告: 令牌注册失败: {result['error'].get('message', '')}")
@@ -1381,10 +1998,19 @@ class Launcher:
1381
1998
  if rc != 0:
1382
1999
  self._print_module_crash_summary(name)
1383
2000
  self._log_lifecycle("exited", name, exit_code=rc)
1384
- await self._publish_event("module.stopped", {
1385
- "module_id": name, "exit_code": rc,
1386
- "graceful_shutdown": self._graceful_modules.get(name, False),
1387
- })
2001
+
2002
+ # 检查是否已发送 stopped 事件
2003
+ state = self._module_states.get(name, {})
2004
+ if not state.get("stopped_sent"):
2005
+ # 取消清理超时任务(如果有)
2006
+ if state.get("cleanup_task"):
2007
+ state["cleanup_task"].cancel()
2008
+ # 发送 stopped 事件
2009
+ await self._send_stopped_event(name, rc)
2010
+
2011
+ # 无论是否发送,都清理状态(防止内存泄漏)
2012
+ self._module_states.pop(name, None)
2013
+
1388
2014
  info = self.modules.get(name)
1389
2015
 
1390
2016
  # 1) Core module crash → full restart
@@ -1691,7 +2317,18 @@ class Launcher:
1691
2317
  debug_flag = " [DEBUG]" if os.environ.get("KITE_DEBUG") == "1" else ""
1692
2318
  lines.append(f"{G} 当前实例: #{inst_num} 后缀: {suffix_display} PID: {os.getpid()}{debug_flag}{R}")
1693
2319
  lines.append(f"{G} 实例目录: {inst_dir}{R}")
1694
- lines.append(f"{G} 工作目录: {cwd}{R}")
2320
+
2321
+ # Check for abnormal working directory
2322
+ cwd_lower = cwd.lower()
2323
+ is_abnormal_cwd = (
2324
+ "windowsapps" in cwd_lower or
2325
+ "appdata\\local\\temp" in cwd_lower or
2326
+ not os.path.exists(os.path.join(cwd, "main.py"))
2327
+ )
2328
+ if is_abnormal_cwd:
2329
+ lines.append(f"\033[91m 工作目录: {cwd} ⚠️ 异常路径{R}")
2330
+ else:
2331
+ lines.append(f"{G} 工作目录: {cwd}{R}")
1695
2332
  if len(instances) > 1:
1696
2333
  lines.append(f"{G} 所有实例:{R}")
1697
2334
  for i in instances:
@@ -1760,6 +2397,30 @@ class Launcher:
1760
2397
  except Exception:
1761
2398
  pass
1762
2399
 
2400
+ def _record_launcher_startup(self):
2401
+ """Record launcher startup information to lifecycle.jsonl."""
2402
+ import sys
2403
+ from datetime import datetime, timezone
2404
+
2405
+ record = {
2406
+ "ts": datetime.now(timezone.utc).isoformat(),
2407
+ "event": "launcher_startup",
2408
+ "module": "launcher",
2409
+ "pid": os.getpid(),
2410
+ "cwd": os.getcwd(),
2411
+ "argv": sys.argv,
2412
+ "instance_dir": os.environ.get("KITE_INSTANCE_DIR", ""),
2413
+ "instance_suffix": self.process_manager.instance_suffix,
2414
+ "python": sys.executable,
2415
+ }
2416
+
2417
+ try:
2418
+ os.makedirs(os.path.dirname(self._lifecycle_log), exist_ok=True)
2419
+ with open(self._lifecycle_log, "a", encoding="utf-8") as f:
2420
+ f.write(json.dumps(record, ensure_ascii=False) + "\n")
2421
+ except Exception:
2422
+ pass
2423
+
1763
2424
 
1764
2425
 
1765
2426
  def _update_module_md_state(module_dir: str, new_state: str):