@agentunion/kite 1.3.1 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. package/CHANGELOG.md +287 -1
  2. package/cli.js +76 -0
  3. package/extensions/agents/assistant/entry.py +111 -1
  4. package/extensions/agents/assistant/server.py +263 -197
  5. package/extensions/channels/acp_channel/entry.py +111 -1
  6. package/extensions/channels/acp_channel/module.md +23 -22
  7. package/extensions/channels/acp_channel/server.py +263 -197
  8. package/extensions/event_hub_bench/entry.py +107 -1
  9. package/extensions/services/backup/entry.py +408 -72
  10. package/extensions/services/backup/module.md +24 -22
  11. package/extensions/services/model_service/entry.py +255 -71
  12. package/extensions/services/model_service/module.md +21 -22
  13. package/extensions/services/watchdog/entry.py +344 -90
  14. package/extensions/services/watchdog/monitor.py +237 -21
  15. package/extensions/services/web/WEBSOCKET_STATUS.md +143 -0
  16. package/extensions/services/web/config_example.py +35 -0
  17. package/extensions/services/web/config_loader.py +110 -0
  18. package/extensions/services/web/entry.py +114 -26
  19. package/extensions/services/web/module.md +35 -24
  20. package/extensions/services/web/pairing.py +250 -0
  21. package/extensions/services/web/pairing_codes.jsonl +16 -0
  22. package/extensions/services/web/relay.py +643 -0
  23. package/extensions/services/web/relay_config.json5 +67 -0
  24. package/extensions/services/web/routes/routes_management_ws.py +127 -0
  25. package/extensions/services/web/routes/routes_rpc.py +89 -0
  26. package/extensions/services/web/routes/routes_test.py +61 -0
  27. package/extensions/services/web/server.py +445 -99
  28. package/extensions/services/web/static/css/style.css +138 -2
  29. package/extensions/services/web/static/index.html +295 -2
  30. package/extensions/services/web/static/js/app.js +1579 -5
  31. package/extensions/services/web/static/js/kernel-client-example.js +161 -0
  32. package/extensions/services/web/static/js/kernel-client.js +383 -0
  33. package/extensions/services/web/static/js/registry-tests.js +558 -0
  34. package/extensions/services/web/static/js/token-manager.js +175 -0
  35. package/extensions/services/web/static/pairing.html +248 -0
  36. package/extensions/services/web/static/test_registry.html +262 -0
  37. package/extensions/services/web/web_config.json5 +29 -0
  38. package/kernel/entry.py +120 -32
  39. package/kernel/event_hub.py +159 -16
  40. package/kernel/module.md +36 -33
  41. package/kernel/registry_store.py +70 -20
  42. package/kernel/rpc_router.py +134 -57
  43. package/kernel/server.py +292 -15
  44. package/kite_cli/__init__.py +3 -0
  45. package/kite_cli/__main__.py +5 -0
  46. package/kite_cli/commands/__init__.py +1 -0
  47. package/kite_cli/commands/clean.py +101 -0
  48. package/kite_cli/commands/doctor.py +35 -0
  49. package/kite_cli/commands/history.py +111 -0
  50. package/kite_cli/commands/info.py +96 -0
  51. package/kite_cli/commands/install.py +313 -0
  52. package/kite_cli/commands/list.py +143 -0
  53. package/kite_cli/commands/log.py +81 -0
  54. package/kite_cli/commands/rollback.py +88 -0
  55. package/kite_cli/commands/search.py +73 -0
  56. package/kite_cli/commands/uninstall.py +85 -0
  57. package/kite_cli/commands/update.py +118 -0
  58. package/kite_cli/core/__init__.py +1 -0
  59. package/kite_cli/core/checker.py +142 -0
  60. package/kite_cli/core/dependency.py +229 -0
  61. package/kite_cli/core/downloader.py +209 -0
  62. package/kite_cli/core/install_info.py +40 -0
  63. package/kite_cli/core/tool_installer.py +397 -0
  64. package/kite_cli/core/validator.py +78 -0
  65. package/kite_cli/main.py +289 -0
  66. package/kite_cli/utils/__init__.py +1 -0
  67. package/kite_cli/utils/i18n.py +252 -0
  68. package/kite_cli/utils/interactive.py +63 -0
  69. package/kite_cli/utils/operation_log.py +77 -0
  70. package/kite_cli/utils/paths.py +34 -0
  71. package/kite_cli/utils/version.py +308 -0
  72. package/launcher/count_lines.py +34 -0
  73. package/launcher/entry.py +905 -166
  74. package/launcher/logging_setup.py +104 -0
  75. package/launcher/module.md +37 -37
  76. package/launcher/process_manager.py +12 -1
  77. package/package.json +2 -1
  78. package/scripts/plan_manager.py +315 -0
package/launcher/entry.py CHANGED
@@ -29,10 +29,21 @@ from .process_manager import ProcessManager
29
29
  IS_WINDOWS = sys.platform == "win32"
30
30
 
31
31
  # Shutdown timeout constants (seconds)
32
- SHUTDOWN_TIMEOUT_NON_GRACEFUL = 5 # Non-graceful modules or no ack response
33
- SHUTDOWN_TIMEOUT_PARTIAL = 3 # Graceful module ack'd but no ready
34
- SHUTDOWN_TIMEOUT_READY = 1 # Graceful module sent ready (cleanup done)
35
- SHUTDOWN_TIMEOUT_BULK = 3 # Bulk stop_all() safety net
32
+
33
+ # 不支持优雅关闭
34
+ SHUTDOWN_TIMEOUT_NON_GRACEFUL = 0.3 # SIGTERM 后等待时间
35
+
36
+ # 支持优雅关闭 - 等待响应
37
+ SHUTDOWN_TIMEOUT_ACK = 3.0 # 等待 shutdown.ack
38
+ SHUTDOWN_TIMEOUT_EXITING = 3.0 # 等待 module.exiting
39
+
40
+ # 清理超时(从 exiting 事件获取)
41
+ CLEANUP_TIMEOUT_DEFAULT = 5.0 # 默认清理时间
42
+ CLEANUP_TIMEOUT_MIN = 0.0 # 最小清理时间
43
+ CLEANUP_TIMEOUT_MAX = 30.0 # 最大清理时间
44
+
45
+ # 批量关闭安全网
46
+ SHUTDOWN_TIMEOUT_BULK = 3.0
36
47
 
37
48
  # Core module names that are started in Phase 1 (not Phase 2)
38
49
  CORE_MODULE_NAMES = {"kernel"}
@@ -101,6 +112,9 @@ class Launcher:
101
112
  # System-wide shutdown flag: prevents Watchdog restart during shutdown
102
113
  self._system_shutting_down = False
103
114
 
115
+ # 模块退出状态跟踪(防止 stopped 事件重复发送)
116
+ self._module_states: dict[str, dict] = {}
117
+
104
118
  # Kite stdout message waiters: {waiter_key: (threading.Event, data_dict)}
105
119
  # Used by ProcessManager stdout callback (cross-thread)
106
120
  self._msg_waiters: dict[str, tuple[threading.Event, dict]] = {}
@@ -117,6 +131,9 @@ class Launcher:
117
131
  pass
118
132
  os.environ["KITE_INSTANCE_SUFFIX"] = suffix
119
133
 
134
+ # Record launcher startup
135
+ self._record_launcher_startup()
136
+
120
137
  @staticmethod
121
138
  def _fmt_elapsed(seconds: float) -> str:
122
139
  """Format elapsed seconds: <1s → 'NNNms', >=1s → 'N.Ns', >=10s → 'NNs'."""
@@ -303,6 +320,21 @@ class Launcher:
303
320
  ch = msvcrt.getch()
304
321
  if ch == b'\x1b': # ESC - force exit immediately
305
322
  print("[launcher] ESC 强制退出")
323
+ # Send module.exiting before exit (best effort)
324
+ try:
325
+ if self._ws and self._loop:
326
+ import concurrent.futures
327
+ fut = asyncio.run_coroutine_threadsafe(
328
+ self._publish_event("module.exiting", {
329
+ "module_id": "launcher",
330
+ "reason": "ESC exit",
331
+ "action": "none",
332
+ }),
333
+ self._loop,
334
+ )
335
+ fut.result(timeout=1) # Wait up to 1s
336
+ except Exception:
337
+ pass
306
338
  os._exit(0)
307
339
  elif ch in (b'q', b'Q'): # q/Q - graceful shutdown
308
340
  self._request_shutdown("收到退出请求,正在关闭...")
@@ -316,7 +348,7 @@ class Launcher:
316
348
  """Full 2-phase startup sequence, then monitor loop."""
317
349
  self._loop = asyncio.get_running_loop()
318
350
  self._ws_connected = asyncio.Event() # Create event in async context
319
- t_start = time.monotonic()
351
+ self._t_start = time.monotonic() # Store for launcher ready_time calculation
320
352
  self._start_unix = time.time()
321
353
  phase_times = {}
322
354
  G = "\033[32m"
@@ -396,7 +428,7 @@ class Launcher:
396
428
  )
397
429
 
398
430
  # ── Startup report ──
399
- total_time = time.monotonic() - t_start
431
+ total_time = time.monotonic() - self._t_start
400
432
  await self._print_startup_report(total_time, phase_times,
401
433
  global_instances=global_instances,
402
434
  cleaned_stats=cleaned_stats)
@@ -557,24 +589,48 @@ class Launcher:
557
589
  # ── Kernel WebSocket connection (JSON-RPC 2.0) ──
558
590
 
559
591
  async def _ws_loop(self):
560
- """Connect to Kernel, reconnect on failure."""
592
+ """Connect to Kernel, reconnect on failure with exponential backoff."""
593
+ retry_delay = 0.3
594
+ max_delay = 5.0
595
+ max_retries = 10
596
+ attempt = 0
561
597
  while not self._thread_shutdown.is_set():
562
598
  try:
563
599
  await self._ws_connect()
600
+ retry_delay = 0.3 # Reset on successful connection
601
+ attempt = 0
564
602
  except asyncio.CancelledError:
565
603
  return
566
604
  except Exception as e:
567
605
  if not self._system_shutting_down:
568
- print(f"[launcher] Kernel 连接错误: {e}")
606
+ attempt += 1
607
+ # Check for auth failure (don't retry)
608
+ if hasattr(e, 'rcvd') and e.rcvd is not None:
609
+ code = e.rcvd.code if hasattr(e.rcvd, 'code') else 0
610
+ if code in (4001, 4003):
611
+ print(f"[launcher] Kernel 认证失败 (code {code}),退出")
612
+ sys.exit(1)
613
+ if attempt >= max_retries:
614
+ print(f"[launcher] Kernel 重连失败 {max_retries} 次,退出")
615
+ sys.exit(1)
616
+ print(f"[launcher] Kernel 连接错误: {e}, {retry_delay:.1f}s 后重试 ({attempt}/{max_retries})")
617
+ if attempt == 5:
618
+ print(f"\033[33m[launcher] 提示: 已连续 {attempt} 次无法连接 Kernel (端口 {self.kernel_port})")
619
+ if self.kernel_port < 1024:
620
+ print(f"[launcher] ⚠ 端口 {self.kernel_port} 异常偏低,可能是 Kernel 端口绑定失败或配置错误")
621
+ print(f"[launcher] 请检查: 1) Kernel 进程是否存活 2) kernel/module.md 中 preferred_port 配置是否正确\033[0m")
569
622
  self._ws = None
570
- await asyncio.sleep(5)
623
+ if self._thread_shutdown.is_set():
624
+ return
625
+ await asyncio.sleep(retry_delay)
626
+ retry_delay = min(retry_delay * 2, max_delay)
571
627
 
572
628
  async def _ws_connect(self):
573
629
  """Single WebSocket session with JSON-RPC 2.0 protocol."""
574
630
  launcher_token = self._module_tokens.get("launcher", "")
575
631
  ws_url = f"ws://127.0.0.1:{self.kernel_port}/ws?token={launcher_token}&id=launcher"
576
632
  t_ws_connect = time.monotonic()
577
- async with websockets.connect(ws_url, open_timeout=3, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
633
+ async with websockets.connect(ws_url, open_timeout=3, ping_interval=20, ping_timeout=20, close_timeout=10) as ws:
578
634
  self._ws = ws
579
635
  _ws_s = time.monotonic() - t_ws_connect
580
636
  print(f"[launcher] 已连接到 Kernel ({self._fmt_elapsed(_ws_s)})")
@@ -598,15 +654,49 @@ class Launcher:
598
654
  await self._rpc_call(ws, "registry.register", {
599
655
  "module_id": "launcher",
600
656
  "module_type": "infrastructure",
657
+ "tools": {
658
+ "rpc": {
659
+ "launcher": {
660
+ "list_modules": {"method": "list_modules", "description": "列出所有模块"},
661
+ "start_module": {"method": "start_module", "description": "启动模块"},
662
+ "stop_module": {"method": "stop_module", "description": "停止模块"},
663
+ "restart_module": {"method": "restart_module", "description": "重启模块"},
664
+ "restart_launcher": {"method": "restart_launcher", "description": "重启 Launcher"},
665
+ "rescan": {"method": "rescan", "description": "重新扫描模块"},
666
+ "shutdown": {"method": "shutdown", "description": "关闭系统"},
667
+ },
668
+ "module": {
669
+ "config": {
670
+ "get": {"method": "get_module_config", "description": "获取模块配置"},
671
+ "update": {"method": "update_module_config", "description": "更新模块配置"},
672
+ "reset": {"method": "reset_module_config", "description": "恢复默认配置"},
673
+ }
674
+ }
675
+ }
676
+ },
601
677
  "events_publish": {
602
- "module.started": {},
603
- "module.stopped": {},
604
- "module.state_changed": {},
678
+ "system": {
679
+ "ready": {"description": "系统启动完成"}
680
+ },
681
+ "module": {
682
+ "starting": {"description": "模块启动中"},
683
+ "started": {"description": "模块已启动"},
684
+ "ready": {"description": "模块就绪"},
685
+ "stopped": {"description": "模块已停止"},
686
+ "exiting": {"description": "模块退出中"},
687
+ "shutdown": {"description": "模块关闭"}
688
+ }
605
689
  },
606
690
  "events_subscribe": [">"],
607
691
  })
608
692
  print("[launcher] 已注册到 Kernel")
609
693
 
694
+ # Publish module.ready for Launcher itself (every reconnect)
695
+ await self._publish_event("module.ready", {
696
+ "module_id": "launcher",
697
+ "graceful_shutdown": True,
698
+ })
699
+
610
700
  # Signal that connection is ready (after subscription and registration)
611
701
  if self._ws_connected:
612
702
  self._ws_connected.set()
@@ -618,7 +708,14 @@ class Launcher:
618
708
  raise
619
709
 
620
710
  async def _ws_receiver(self, ws):
621
- """Receive loop: classify incoming messages."""
711
+ """Receive loop: classify incoming messages.
712
+
713
+ CRITICAL: RPC 死锁防范
714
+ - 入站 RPC 请求必须用 create_task() 异步执行,不可 await
715
+ - 原因:如果 handler 内部调用 rpc_call() 发出站请求,出站响应需要本接收循环来分发
716
+ - 如果接收循环被 await handler 阻塞,出站响应永远收不到 → 超时死锁
717
+ - 事件通知和 RPC 响应可以同步处理(它们不会反向调用 rpc_call)
718
+ """
622
719
  try:
623
720
  async for raw in ws:
624
721
  try:
@@ -636,7 +733,8 @@ class Launcher:
636
733
  await self._handle_event_notification(msg)
637
734
  elif has_method and has_id:
638
735
  # Incoming RPC request (forwarded by Kernel)
639
- await self._handle_rpc_request(ws, msg)
736
+ # Run in background so receiver loop continues processing responses
737
+ asyncio.create_task(self._handle_rpc_request(ws, msg))
640
738
  elif has_id and (has_result or has_error):
641
739
  # RPC response (to our own call)
642
740
  self._handle_rpc_response(msg)
@@ -688,6 +786,7 @@ class Launcher:
688
786
  # Trigger event waiters
689
787
  module_id = data.get("module_id", "")
690
788
  waiter_key = f"{event}:{module_id}"
789
+
691
790
  waiter = self._event_waiters.get(waiter_key)
692
791
  if waiter:
693
792
  waiter[1].update(data)
@@ -702,6 +801,42 @@ class Launcher:
702
801
  ready_waiter[1]["_exited"] = True
703
802
  ready_waiter[0].set()
704
803
 
804
+ # 处理主动退出场景(没有 shutdown 的情况)
805
+ if module_id not in self._module_states:
806
+ self._init_module_state(module_id)
807
+ state = self._module_states[module_id]
808
+
809
+ if not state.get("shutdown_sent"):
810
+ # 主动退出:记录信息
811
+ if not state.get("exiting_received"):
812
+ state["exiting_received"] = True
813
+ state["reason"] = data.get("reason", "active_exit")
814
+ state["restart"] = data.get("restart", False)
815
+ cleanup_timeout = data.get("cleanup_timeout", CLEANUP_TIMEOUT_DEFAULT)
816
+ cleanup_timeout = max(CLEANUP_TIMEOUT_MIN, min(cleanup_timeout, CLEANUP_TIMEOUT_MAX))
817
+ state["cleanup_timeout"] = cleanup_timeout
818
+
819
+ # 启动清理超时任务
820
+ async def cleanup_timeout_handler():
821
+ await asyncio.sleep(state["cleanup_timeout"])
822
+ if not state.get("stopped_sent"):
823
+ state["stopped_sent"] = True
824
+ self._kill_process(module_id)
825
+
826
+ # 发送 stopped 事件
827
+ await self._publish_event("module.stopped", {
828
+ "module_id": module_id,
829
+ "exit_code": -1, # 超时强制终止,退出码未知
830
+ "exit_type": "timeout",
831
+ "reason": state.get("reason", "cleanup_timeout"),
832
+ "restart": state.get("restart", False),
833
+ "ready_received": False,
834
+ })
835
+
836
+ self._log_lifecycle("stopped", module_id, reason=state["reason"])
837
+
838
+ state["cleanup_task"] = asyncio.create_task(cleanup_timeout_handler())
839
+
705
840
  # module.crash → print red crash summary
706
841
  if event == "module.crash" and module_id:
707
842
  RED = "\033[91m"
@@ -716,6 +851,43 @@ class Launcher:
716
851
  )
717
852
  print(f"[launcher] 崩溃日志: {crash_log}")
718
853
 
854
+ # pairing.status → handle all pairing flow events
855
+ if event == "pairing.status":
856
+ GREEN = "\033[92m"
857
+ RED = "\033[91m"
858
+ RESET = "\033[0m"
859
+
860
+ step = data.get("step", "")
861
+ success = data.get("success", True)
862
+
863
+ if step == "code_generated":
864
+ code = data.get("code", "")
865
+ expires_in = data.get("expires_in", 300)
866
+ if code:
867
+ print(f"[launcher] {GREEN}配对码: {code}{RESET}")
868
+ print(f"[launcher] {GREEN}有效期: {expires_in} 秒{RESET}")
869
+ print(f"[launcher] {GREEN}访问 Web 界面时使用此配对码进行配对{RESET}")
870
+
871
+ elif step == "pairing":
872
+ if success:
873
+ print(f"[launcher] {GREEN}正在配对...{RESET}")
874
+ else:
875
+ reason = data.get("reason", "Unknown error")
876
+ print(f"[launcher] {RED}✗ 配对失败: {reason}{RESET}")
877
+
878
+ elif step == "completed":
879
+ if success:
880
+ module_id = data.get("module_id", "")
881
+ role = data.get("role", "")
882
+ print(f"[launcher] {GREEN}✓ 配对成功!{RESET}")
883
+ print(f"[launcher] {GREEN} 模块 ID: {module_id}{RESET}")
884
+ print(f"[launcher] {GREEN} 角色: {role}{RESET}")
885
+ else:
886
+ reason = data.get("reason", "Unknown error")
887
+ print(f"[launcher] {RED}✗ 配对失败: {reason}{RESET}")
888
+
889
+ return
890
+
719
891
  # Only log system events (module.*, watchdog.*) to avoid flooding
720
892
  if not (event.startswith("module.") or event.startswith("watchdog.")):
721
893
  return
@@ -740,12 +912,16 @@ class Launcher:
740
912
  params = msg.get("params", {})
741
913
 
742
914
  handlers = {
743
- "list_modules": self._rpc_list_modules,
744
- "start_module": self._rpc_start_module,
745
- "stop_module": self._rpc_stop_module,
746
- "restart_module": self._rpc_restart_module,
747
- "rescan": self._rpc_rescan,
748
- "shutdown": self._rpc_shutdown,
915
+ "list_modules": self._rpc_list_modules,
916
+ "start_module": self._rpc_start_module,
917
+ "stop_module": self._rpc_stop_module,
918
+ "restart_module": self._rpc_restart_module,
919
+ "restart_launcher": self._rpc_restart_launcher,
920
+ "rescan": self._rpc_rescan,
921
+ "shutdown": self._rpc_shutdown,
922
+ "get_module_config": self._rpc_get_module_config,
923
+ "update_module_config": self._rpc_update_module_config,
924
+ "reset_module_config": self._rpc_reset_module_config,
749
925
  }
750
926
  handler = handlers.get(method)
751
927
  if handler:
@@ -775,11 +951,14 @@ class Launcher:
775
951
  "name": name,
776
952
  "display_name": info.display_name,
777
953
  "type": info.type,
778
- "config_state": info.state,
954
+ "state": info.state, # 改名为 state(与 /api/modules 一致)
955
+ "version": info.version,
956
+ "runtime": info.runtime,
957
+ "preferred_port": info.preferred_port,
958
+ "monitor": info.monitor,
779
959
  "desired_state": self._desired_states.get(name, "stopped"),
780
960
  "actual_state": f"running({rec.pid})" if running and rec else "stopped",
781
961
  "pid": rec.pid if running and rec else None,
782
- "monitor": info.monitor,
783
962
  })
784
963
  return {"modules": result}
785
964
 
@@ -871,37 +1050,256 @@ class Launcher:
871
1050
  self._request_shutdown(f"RPC shutdown request: {reason}")
872
1051
  return {"status": "shutting_down", "reason": reason}
873
1052
 
1053
+ async def _rpc_get_module_config(self, params: dict) -> dict:
1054
+ """获取指定模块的配置(通用降级方案)"""
1055
+ import re
1056
+ import yaml
1057
+ from pathlib import Path
1058
+
1059
+ module_name = params.get("module_name")
1060
+ if not module_name:
1061
+ raise ValueError("module_name required")
1062
+
1063
+ # 查找模块信息
1064
+ info = self.modules.get(module_name)
1065
+ if not info:
1066
+ raise RuntimeError(f"Module '{module_name}' not found")
1067
+
1068
+ # 读取 module.md
1069
+ md_path = Path(info.module_dir) / "module.md"
1070
+ if not md_path.exists():
1071
+ raise RuntimeError(f"module.md not found for '{module_name}'")
1072
+
1073
+ text = md_path.read_text(encoding="utf-8")
1074
+ m = re.match(r'^---\s*\n(.*?)\n---\s*\n?(.*)', text, re.DOTALL)
1075
+ if not m:
1076
+ frontmatter = {}
1077
+ else:
1078
+ frontmatter = yaml.safe_load(m.group(1)) or {}
1079
+
1080
+ # 读取 config.yaml(如果存在)
1081
+ config_path = Path(info.module_dir) / "config.yaml"
1082
+ config = None
1083
+ if config_path.exists():
1084
+ config = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
1085
+
1086
+ return {
1087
+ "name": frontmatter.get("name", module_name),
1088
+ "display_name": frontmatter.get("display_name", ""),
1089
+ "type": frontmatter.get("type", ""),
1090
+ "state": frontmatter.get("state", "enabled"),
1091
+ "version": frontmatter.get("version", ""),
1092
+ "runtime": frontmatter.get("runtime", ""),
1093
+ "entry": frontmatter.get("entry", ""),
1094
+ "preferred_port": frontmatter.get("preferred_port"),
1095
+ "advertise_ip": frontmatter.get("advertise_ip"),
1096
+ "monitor": frontmatter.get("monitor"),
1097
+ "events": frontmatter.get("events"),
1098
+ "subscriptions": frontmatter.get("subscriptions"),
1099
+ "depends_on": frontmatter.get("depends_on"),
1100
+ "source_path": str(info.module_dir), # 添加模块路径
1101
+ "has_config": config is not None,
1102
+ "config": config,
1103
+ }
1104
+
1105
+ async def _rpc_update_module_config(self, params: dict) -> dict:
1106
+ """更新指定模块的配置(通用降级方案)"""
1107
+ import yaml
1108
+ from pathlib import Path
1109
+
1110
+ module_name = params.get("module_name")
1111
+ metadata = params.get("metadata", {})
1112
+ config = params.get("config", {})
1113
+
1114
+ if not module_name:
1115
+ raise ValueError("module_name required")
1116
+
1117
+ info = self.modules.get(module_name)
1118
+ if not info:
1119
+ raise RuntimeError(f"Module '{module_name}' not found")
1120
+
1121
+ md_path = Path(info.module_dir) / "module.md"
1122
+ if not md_path.exists():
1123
+ raise RuntimeError(f"module.md not found for '{module_name}'")
1124
+
1125
+ # 更新 module.md frontmatter
1126
+ if metadata:
1127
+ frontmatter, body = _parse_frontmatter(md_path.read_text(encoding="utf-8"))
1128
+ for key, value in metadata.items():
1129
+ frontmatter[key] = value
1130
+ fm_str = yaml.dump(frontmatter, allow_unicode=True, sort_keys=False, default_flow_style=False).rstrip()
1131
+ content = f"---\n{fm_str}\n---\n{body}"
1132
+ md_path.write_text(content, encoding="utf-8")
1133
+
1134
+ # 更新 config.yaml
1135
+ if config:
1136
+ config_path = Path(info.module_dir) / "config.yaml"
1137
+ existing = {}
1138
+ if config_path.exists():
1139
+ existing = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
1140
+ # Deep merge
1141
+ self._deep_merge(existing, config)
1142
+ config_path.write_text(
1143
+ yaml.dump(existing, allow_unicode=True, sort_keys=False, default_flow_style=False),
1144
+ encoding="utf-8"
1145
+ )
1146
+
1147
+ # 重新扫描以更新缓存
1148
+ await self._rescan_modules()
1149
+
1150
+ # 返回更新后的配置
1151
+ return await self._rpc_get_module_config({"module_name": module_name})
1152
+
1153
async def _rpc_reset_module_config(self, params: dict) -> dict:
    """Restore default values in a module's module.md frontmatter (generic fallback).

    Args:
        params: RPC parameters.
            ``module_name`` (required): module to reset.
            ``fields`` (optional list): names of the fields to reset.
            ``all`` (optional bool): reset every field that has a generic
            default (``state`` and ``monitor``); ``fields`` is then ignored.

    Returns:
        The refreshed configuration, as produced by ``_rpc_get_module_config``.

    Raises:
        ValueError: ``module_name`` is missing or empty.
        RuntimeError: the module is not in ``self.modules`` or has no module.md.
    """
    import yaml
    from pathlib import Path

    module_name = params.get("module_name")
    requested_fields = params.get("fields", [])
    reset_everything = params.get("all", False)

    if not module_name:
        raise ValueError("module_name required")

    module_info = self.modules.get(module_name)
    if not module_info:
        raise RuntimeError(f"Module '{module_name}' not found")

    manifest_path = Path(module_info.module_dir) / "module.md"
    if not manifest_path.exists():
        raise RuntimeError(f"module.md not found for '{module_name}'")

    # Generic defaults shared by every module.
    generic_defaults = {
        "state": "enabled",
        "monitor": True,
    }

    frontmatter, body = _parse_frontmatter(manifest_path.read_text(encoding="utf-8"))

    if reset_everything:
        for key in generic_defaults:
            frontmatter[key] = generic_defaults[key]
    else:
        for field_name in requested_fields:
            if field_name in generic_defaults:
                frontmatter[field_name] = generic_defaults[field_name]
                continue
            if field_name == "preferred_port":
                # Dropping the key restores the "unset" (null) state.
                frontmatter.pop(field_name, None)
                continue
            if field_name == "advertise_ip":
                frontmatter[field_name] = "127.0.0.1"

    serialized = yaml.dump(
        frontmatter,
        allow_unicode=True,
        sort_keys=False,
        default_flow_style=False,
    ).rstrip()
    manifest_path.write_text(f"---\n{serialized}\n---\n{body}", encoding="utf-8")

    # Rescan so the cached module records reflect the rewritten file.
    await self._rescan_modules()

    return await self._rpc_get_module_config({"module_name": module_name})
1201
+
1202
+ @staticmethod
1203
+ def _deep_merge(base: dict, overlay: dict) -> dict:
1204
+ """递归合并字典"""
1205
+ for k, v in overlay.items():
1206
+ if k in base and isinstance(base[k], dict) and isinstance(v, dict):
1207
+ Launcher._deep_merge(base[k], v)
1208
+ else:
1209
+ base[k] = v
1210
+ return base
1211
+
1212
+
1213
+ async def _rpc_restart_launcher(self, params: dict) -> dict:
1214
+ """Restart Launcher process via Watchdog.
1215
+
1216
+ Simply notify watchdog and exit. Watchdog will start a new instance.
1217
+
1218
+ Args:
1219
+ params: {
1220
+ "reason": str (optional) - Restart reason
1221
+ }
1222
+
1223
+ Returns:
1224
+ {"status": "restarting", "reason": str}
1225
+ or {"error": "watchdog offline"}
1226
+ """
1227
+ reason = params.get("reason", "user_request")
1228
+ O = "\033[33m" # orange/yellow
1229
+ R = "\033[0m" # reset
1230
+ print(f"{O}[launcher] 收到 Launcher 重启请求{R}")
1231
+ print(f"[launcher] 原因: {reason}")
1232
+
1233
+ # Check if watchdog is running
1234
+ watchdog_running = self.process_manager.is_running("watchdog")
1235
+ print(f"[launcher] 检查 watchdog 状态: {'running' if watchdog_running else 'stopped'}")
1236
+
1237
+ if not watchdog_running:
1238
+ error_msg = "watchdog 未运行, 无法重启"
1239
+ print(f"[launcher] ❌ {error_msg}")
1240
+ return {"error": error_msg}
1241
+
1242
+ print(f"[launcher] ✓ watchdog 状态正常,准备重启流程")
1243
+
1244
+ # Schedule restart in background (don't block RPC response)
1245
+ async def _do_restart():
1246
+ await asyncio.sleep(0.3) # 确保 RPC 响应已发送
1247
+
1248
+ print(f"[launcher] 发送 module.exiting 事件给 watchdog...")
1249
+
1250
+ # Collect startup info for watchdog to restart with same environment
1251
+ startup_info = {
1252
+ "python": sys.executable,
1253
+ "argv": sys.argv,
1254
+ "cwd": os.getcwd(),
1255
+ "env": dict(os.environ), # 所有环境变量
1256
+ }
1257
+
1258
+ # Notify watchdog: this is a planned restart, not a crash
1259
+ await self._publish_event("module.exiting", {
1260
+ "module_id": "launcher",
1261
+ "action": "restart_launcher",
1262
+ "reason": reason,
1263
+ "startup_info": startup_info,
1264
+ })
1265
+
1266
+ print(f"[launcher] 已通知 watchdog 计划内重启")
1267
+ print(f"[launcher] 退出进程,等待 watchdog 重启")
1268
+ print(f"[launcher] 原因: {reason}")
1269
+
1270
+ os._exit(0)
1271
+
1272
+ asyncio.create_task(_do_restart())
1273
+
1274
+ return {"status": "restarting", "reason": reason}
1275
+
874
1276
  # ── Event publishing via RPC ──
875
1277
 
876
1278
  async def _publish_event(self, event_type: str, data: dict):
877
1279
  """Publish an event via RPC event.publish through Kernel WS."""
878
1280
  if not self._ws:
879
1281
  return
880
- msg = json.dumps({
881
- "jsonrpc": "2.0",
882
- "id": str(uuid.uuid4()),
883
- "method": "event.publish",
884
- "params": {
1282
+ try:
1283
+ await self._rpc_call(self._ws, "event.publish", {
885
1284
  "event_id": str(uuid.uuid4()),
886
1285
  "event": event_type,
887
1286
  "data": data,
888
- },
889
- })
890
-
891
- async def _send():
892
- try:
893
- await self._ws.send(msg)
894
- except Exception as e:
895
- print(f"[launcher] 发布事件失败: {e}")
896
-
897
- asyncio.create_task(_send())
1287
+ }, timeout=2.0)
1288
+ except Exception as e:
1289
+ print(f"[launcher] 发布事件失败 ({event_type}): {e}")
898
1290
 
899
1291
  async def _wait_event(self, event_type: str, module_id: str, timeout: float) -> dict | None:
900
1292
  """Wait for a specific event from a module. Returns data dict or None on timeout."""
901
1293
  key = f"{event_type}:{module_id}"
902
- evt = asyncio.Event()
903
- data = {}
904
- self._event_waiters[key] = (evt, data)
1294
+ # Reuse existing waiter if one was pre-registered (e.g. in _ws_connect)
1295
+ # This prevents a race where the event arrives before this method is called
1296
+ existing = self._event_waiters.get(key)
1297
+ if existing:
1298
+ evt, data = existing
1299
+ else:
1300
+ evt = asyncio.Event()
1301
+ data = {}
1302
+ self._event_waiters[key] = (evt, data)
905
1303
  try:
906
1304
  await asyncio.wait_for(evt.wait(), timeout=timeout)
907
1305
  return data
@@ -910,170 +1308,444 @@ class Launcher:
910
1308
  finally:
911
1309
  self._event_waiters.pop(key, None)
912
1310
 
913
- async def _graceful_stop(self, name: str, reason: str = "stop_requested", timeout: float = 10):
914
- """Graceful shutdown: check capability → send event → wait ack → wait ready → kill.
915
- Modules that did not declare graceful_shutdown in module.ready are terminated directly.
916
- """
917
- self._log_lifecycle("stopping", name, reason=reason)
1311
+ # ── 退出机制辅助方法 ──
1312
+
1313
+ def _init_module_state(self, name: str):
1314
+ """初始化模块状态跟踪字典"""
1315
+ self._module_states[name] = {
1316
+ "shutdown_sent": False,
1317
+ "ack_received": False,
1318
+ "exiting_received": False,
1319
+ "ready_received": False,
1320
+ "stopped_sent": False,
1321
+ "exit_type": None, # "graceful" | "non_graceful" | "active"
1322
+ "reason": None,
1323
+ "restart": None,
1324
+ "cleanup_timeout": None,
1325
+ "cleanup_task": None,
1326
+ }
918
1327
 
919
- if not self._graceful_modules.get(name):
920
- self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
921
- self._log_lifecycle("stopped", name, reason=reason)
922
- await self._publish_event("module.stopped", {
923
- "module_id": name,
924
- "graceful_shutdown": False,
925
- })
1328
+ def _kill_process(self, name: str):
1329
+ """统一的进程杀死方法"""
1330
+ record = self.process_manager.get_record(name)
1331
+ if record and record.proc and record.proc.poll() is None:
1332
+ print(f"[launcher] 强制终止 {name} (PID {record.proc.pid})")
1333
+ self.process_manager.kill_process(name)
1334
+ elif record:
1335
+ # 进程已经退出,只是清理记录
1336
+ pass
1337
+ else:
1338
+ # 没有记录,可能已经被清理
1339
+ pass
1340
+
1341
+ def _determine_exit_type(self, name: str) -> str:
1342
+ """判断退出类型: graceful | non_graceful | active"""
1343
+ state = self._module_states.get(name, {})
1344
+ if state.get("exiting_received"):
1345
+ return "graceful"
1346
+ elif state.get("shutdown_sent"):
1347
+ return "non_graceful"
1348
+ else:
1349
+ return "active"
1350
+
1351
+ def _resolve_reason(self, name: str) -> str:
1352
+ """解析最终原因(优先级:exiting > shutdown > 默认)"""
1353
+ state = self._module_states.get(name, {})
1354
+ if state.get("reason"):
1355
+ return state["reason"]
1356
+ return "unknown"
1357
+
1358
+ def _resolve_restart(self, name: str) -> bool:
1359
+ """解析重启决策(优先级:exiting > shutdown > 默认)"""
1360
+ state = self._module_states.get(name, {})
1361
+ if state.get("restart") is not None:
1362
+ return state["restart"]
1363
+ # 默认:主动退出不重启,被动关闭看 desired_state
1364
+ if self._determine_exit_type(name) == "active":
1365
+ return False
1366
+ return self._desired_states.get(name) == "running"
1367
+
1368
+ async def _send_stopped_event(self, name: str, exit_code: int):
1369
+ """发送 module.stopped 事件(防重复)"""
1370
+ state = self._module_states.get(name, {})
1371
+ if state.get("stopped_sent"):
926
1372
  return
927
1373
 
928
- # Register waiters BEFORE sending shutdown event
929
- ack_key = f"module.shutdown.ack:{name}"
930
- ack_evt = asyncio.Event()
931
- ack_data = {}
932
- self._event_waiters[ack_key] = (ack_evt, ack_data)
1374
+ # 立即设置标记(防止竞态条件)
1375
+ if name in self._module_states:
1376
+ self._module_states[name]["stopped_sent"] = True
933
1377
 
934
- ready_key = f"module.shutdown.ready:{name}"
935
- ready_evt = asyncio.Event()
936
- ready_data = {}
937
- self._event_waiters[ready_key] = (ready_evt, ready_data)
1378
+ exit_type = self._determine_exit_type(name)
1379
+ reason = self._resolve_reason(name)
1380
+ restart = self._resolve_restart(name)
938
1381
 
939
- await self._publish_event("module.shutdown", {
940
- "module_id": name, "reason": reason, "timeout": timeout,
1382
+ await self._publish_event("module.stopped", {
1383
+ "module_id": name,
1384
+ "exit_code": exit_code,
1385
+ "exit_type": exit_type,
1386
+ "reason": reason,
1387
+ "restart": restart,
1388
+ "ready_received": state.get("ready_received", False),
941
1389
  })
942
1390
 
943
- # Wait for ack
1391
+ # ── 优雅关闭 ──
1392
+
1393
+ async def _graceful_stop(self, name: str, reason: str = "stop_requested", timeout: float = 10):
1394
+ """优雅关闭单个模块:
1395
+ 1. 初始化状态跟踪
1396
+ 2. 非优雅模块直接 SIGTERM
1397
+ 3. 优雅模块:发送 shutdown → 等待 ack → 等待 exiting → 启动清理超时 → 杀死
1398
+ """
944
1399
  try:
945
- await asyncio.wait_for(ack_evt.wait(), timeout=3)
946
- ack = ack_data
947
- except asyncio.TimeoutError:
948
- ack = None
949
- finally:
950
- self._event_waiters.pop(ack_key, None)
1400
+ self._log_lifecycle("stopping", name, reason=reason)
951
1401
 
952
- if not ack:
953
- self._event_waiters.pop(ready_key, None)
954
- self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
955
- await self._publish_event("module.stopped", {
1402
+ # 初始化状态
1403
+ self._init_module_state(name)
1404
+ state = self._module_states[name]
1405
+
1406
+ # 非优雅模块:直接 SIGTERM
1407
+ if not self._graceful_modules.get(name):
1408
+ state["shutdown_sent"] = True # 标记:Launcher 主动关闭
1409
+ state["stopped_sent"] = True # 防重复标记
1410
+ state["reason"] = reason
1411
+ state["restart"] = self._desired_states.get(name) == "running"
1412
+
1413
+ self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
1414
+
1415
+ # 发送 stopped 事件
1416
+ await self._publish_event("module.stopped", {
1417
+ "module_id": name,
1418
+ "exit_code": 0,
1419
+ "exit_type": "non_graceful",
1420
+ "reason": reason,
1421
+ "restart": state["restart"],
1422
+ "ready_received": False,
1423
+ })
1424
+
1425
+ self._log_lifecycle("stopped", name, reason=reason)
1426
+ return
1427
+
1428
+ # 优雅模块:提前注册所有三个 waiter(ack、exiting、ready)
1429
+ # 这样可以避免事件到达时 waiter 还没注册的竞争条件
1430
+ ack_key = f"module.shutdown.ack:{name}"
1431
+ ack_evt = asyncio.Event()
1432
+ ack_data = {}
1433
+ self._event_waiters[ack_key] = (ack_evt, ack_data)
1434
+
1435
+ exiting_key = f"module.exiting:{name}"
1436
+ exiting_evt = asyncio.Event()
1437
+ exiting_data = {}
1438
+ self._event_waiters[exiting_key] = (exiting_evt, exiting_data)
1439
+
1440
+ ready_key = f"module.shutdown.ready:{name}"
1441
+ ready_evt = asyncio.Event()
1442
+ ready_data = {}
1443
+ self._event_waiters[ready_key] = (ready_evt, ready_data)
1444
+
1445
+ # 发送 shutdown 事件
1446
+ state["shutdown_sent"] = True
1447
+ state["reason"] = reason
1448
+ state["restart"] = self._desired_states.get(name) == "running"
1449
+
1450
+ await self._publish_event("module.shutdown", {
956
1451
  "module_id": name,
957
- "graceful_shutdown": self._graceful_modules.get(name, False),
1452
+ "reason": reason,
1453
+ "timeout": timeout,
1454
+ "restart": state["restart"],
958
1455
  })
959
- return
960
1456
 
961
- estimated = min(ack.get("estimated_cleanup", timeout), timeout)
1457
+ # 等待 ack
1458
+ try:
1459
+ await asyncio.wait_for(ack_evt.wait(), timeout=SHUTDOWN_TIMEOUT_ACK)
1460
+ state["ack_received"] = True
1461
+ except asyncio.TimeoutError:
1462
+ pass
1463
+ finally:
1464
+ self._event_waiters.pop(ack_key, None)
962
1465
 
963
- # Wait for ready
964
- try:
965
- await asyncio.wait_for(ready_evt.wait(), timeout=estimated)
966
- ready = ready_data
967
- except asyncio.TimeoutError:
968
- ready = None
969
- finally:
970
- self._event_waiters.pop(ready_key, None)
971
- if ready:
972
- self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_READY)
973
- else:
974
- self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
1466
+ if not state["ack_received"]:
1467
+ # 没有 ack,直接杀死
1468
+ self._event_waiters.pop(exiting_key, None)
1469
+ self._event_waiters.pop(ready_key, None)
1470
+ state["stopped_sent"] = True
1471
+ self._kill_process(name)
975
1472
 
976
- self._log_lifecycle("stopped", name, reason=reason)
977
- await self._publish_event("module.stopped", {
978
- "module_id": name,
979
- "graceful_shutdown": self._graceful_modules.get(name, False),
980
- })
1473
+ # 发送 stopped 事件
1474
+ await self._publish_event("module.stopped", {
1475
+ "module_id": name,
1476
+ "exit_code": -1, # 未收到 ack,退出码未知
1477
+ "exit_type": "timeout",
1478
+ "reason": state["reason"],
1479
+ "restart": state["restart"],
1480
+ "ready_received": False,
1481
+ })
1482
+
1483
+ self._log_lifecycle("stopped", name, reason=reason)
1484
+ return
1485
+
1486
+ # 等待 exiting 事件
1487
+ try:
1488
+ await asyncio.wait_for(exiting_evt.wait(), timeout=SHUTDOWN_TIMEOUT_EXITING)
1489
+ state["exiting_received"] = True
1490
+ # 从 exiting 事件中提取信息
1491
+ if exiting_data.get("reason"):
1492
+ state["reason"] = exiting_data["reason"]
1493
+ if "restart" in exiting_data:
1494
+ state["restart"] = exiting_data["restart"]
1495
+ cleanup_timeout = exiting_data.get("cleanup_timeout", CLEANUP_TIMEOUT_DEFAULT)
1496
+ cleanup_timeout = max(CLEANUP_TIMEOUT_MIN, min(cleanup_timeout, CLEANUP_TIMEOUT_MAX))
1497
+ state["cleanup_timeout"] = cleanup_timeout
1498
+ except asyncio.TimeoutError:
1499
+ pass
1500
+ finally:
1501
+ self._event_waiters.pop(exiting_key, None)
1502
+
1503
+ if not state["exiting_received"]:
1504
+ # 没有 exiting,直接杀死
1505
+ self._event_waiters.pop(ready_key, None)
1506
+ state["stopped_sent"] = True
1507
+ self._kill_process(name)
1508
+
1509
+ # 发送 stopped 事件
1510
+ await self._publish_event("module.stopped", {
1511
+ "module_id": name,
1512
+ "exit_code": -1, # 未收到 exiting,退出码未知
1513
+ "exit_type": "timeout",
1514
+ "reason": state["reason"],
1515
+ "restart": state["restart"],
1516
+ "ready_received": False,
1517
+ })
1518
+
1519
+ self._log_lifecycle("stopped", name, reason=state["reason"])
1520
+ return
1521
+
1522
+ # ready waiter 已经在前面注册好了,直接启动清理超时任务
1523
+ # 启动清理超时任务(兜底机制)
1524
+ async def cleanup_timeout_handler():
1525
+ await asyncio.sleep(state["cleanup_timeout"])
1526
+ if not state.get("stopped_sent"):
1527
+ print(f"[launcher] {name} 清理超时 ({state['cleanup_timeout']}s),强制终止")
1528
+ state["stopped_sent"] = True
1529
+ self._kill_process(name)
1530
+
1531
+ # 发送 stopped 事件
1532
+ await self._publish_event("module.stopped", {
1533
+ "module_id": name,
1534
+ "exit_code": -1, # 清理超时,退出码未知
1535
+ "exit_type": "timeout",
1536
+ "reason": state["reason"],
1537
+ "restart": state["restart"],
1538
+ "ready_received": False,
1539
+ })
1540
+
1541
+ self._log_lifecycle("stopped", name, reason=state["reason"])
1542
+
1543
+ state["cleanup_task"] = asyncio.create_task(cleanup_timeout_handler())
1544
+
1545
+ # 等待 ready 事件(主路径)
1546
+ try:
1547
+ await asyncio.wait_for(ready_evt.wait(), timeout=state["cleanup_timeout"])
1548
+ state["ready_received"] = True
1549
+ print(f"[launcher] {name} 清理完成,准备退出")
1550
+ except asyncio.TimeoutError:
1551
+ # 超时由 cleanup_timeout_handler 处理
1552
+ pass
1553
+ finally:
1554
+ self._event_waiters.pop(ready_key, None)
1555
+
1556
+ # 取消清理超时任务(如果 ready 先到达)
1557
+ if state.get("ready_received") and state["cleanup_task"] and not state["cleanup_task"].done():
1558
+ state["cleanup_task"].cancel()
1559
+
1560
+ # 如果收到 ready,立即杀死进程
1561
+ if state.get("ready_received") and not state.get("stopped_sent"):
1562
+ state["stopped_sent"] = True
1563
+ self._kill_process(name)
1564
+
1565
+ # 发送 stopped 事件
1566
+ await self._publish_event("module.stopped", {
1567
+ "module_id": name,
1568
+ "exit_code": 0, # 正常退出
1569
+ "exit_type": "graceful",
1570
+ "reason": state["reason"],
1571
+ "restart": state["restart"],
1572
+ "ready_received": True,
1573
+ })
1574
+
1575
+ self._log_lifecycle("stopped", name, reason=state["reason"])
1576
+
1577
+ except Exception as e:
1578
+ # 优雅关闭出错,强制终止进程
1579
+ print(f"[launcher] 优雅关闭出错: {e}")
1580
+ if not state.get("stopped_sent"):
1581
+ state["stopped_sent"] = True
1582
+ self._kill_process(name)
1583
+ # 清理所有 waiters
1584
+ self._event_waiters.pop(f"module.shutdown.ack:{name}", None)
1585
+ self._event_waiters.pop(f"module.exiting:{name}", None)
1586
+ self._event_waiters.pop(f"module.shutdown.ready:{name}", None)
981
1587
 
982
1588
  async def _graceful_shutdown_all(self):
983
- """Shut down all modules. Order:
984
- 1. Send shutdown to graceful modules (excl. Kernel) — let them start cleanup
985
- 2. Terminate non-graceful modules (fast, runs during graceful cleanup)
986
- 3. Wait for graceful modules to exit (process monitoring)
987
- 4. Shut down Kernel last (keeps event routing alive throughout)
1589
+ """全量优雅退出:三阶段关闭
1590
+
1591
+ Phase 1: 先关闭 Watchdog(防止它监控到其他模块退出后触发重启)
1592
+ Phase 2: 关闭其他所有模块(除 Kernel)
1593
+ Phase 3: 最后关闭 Kernel(保证事件路由畅通)
988
1594
  """
989
1595
  self._system_shutting_down = True
1596
+
1597
+ # 发送 Launcher 自己的 exiting 事件
1598
+ await self._publish_event("module.exiting", {
1599
+ "module_id": "launcher",
1600
+ "type": "active",
1601
+ "reason": "system_shutdown",
1602
+ "action": "none",
1603
+ "timeout": 0,
1604
+ })
1605
+
990
1606
  running = [n for n in self.modules if self.process_manager.is_running(n)]
991
1607
  # Also check core modules
992
1608
  for cn in CORE_MODULE_NAMES:
993
1609
  if self.process_manager.is_running(cn) and cn not in running:
994
1610
  running.append(cn)
1611
+
995
1612
  if not running:
996
1613
  print("[launcher] 没有运行中的模块需要关闭")
997
1614
  return
998
1615
 
999
- graceful = [n for n in running if self._graceful_modules.get(n)]
1000
- non_graceful = [n for n in running if not self._graceful_modules.get(n)]
1616
+ # 分组:Watchdog、Kernel、其他模块
1617
+ watchdog_running = WATCHDOG_MODULE_NAME in running
1618
+ kernel_running = "kernel" in running
1619
+ other_modules = [n for n in running if n not in (WATCHDOG_MODULE_NAME, "kernel")]
1620
+
1621
+ graceful_others = [n for n in other_modules if self._graceful_modules.get(n)]
1622
+ non_graceful_others = [n for n in other_modules if not self._graceful_modules.get(n)]
1623
+
1624
+ print(f"[launcher] 正在关闭 {len(running)} 个模块(三阶段)")
1625
+
1626
+ # ═══════════════════════════════════════════════════════════
1627
+ # Phase 1: 先关闭 Watchdog(防止重启其他模块)
1628
+ # ═══════════════════════════════════════════════════════════
1629
+ if watchdog_running and self.process_manager.is_running(WATCHDOG_MODULE_NAME):
1630
+ print(f"[launcher] Phase 1: 通知 Watchdog 退出(防止重启其他模块)")
1631
+
1632
+ if self._graceful_modules.get(WATCHDOG_MODULE_NAME):
1633
+ # Watchdog 支持优雅退出
1634
+ self._init_module_state(WATCHDOG_MODULE_NAME)
1635
+ state = self._module_states[WATCHDOG_MODULE_NAME]
1636
+ state["shutdown_sent"] = True
1637
+ state["reason"] = "system_shutdown"
1638
+ state["restart"] = False
1639
+ self._log_lifecycle("stopping", WATCHDOG_MODULE_NAME, reason="system_shutdown")
1640
+
1641
+ await self._publish_event("module.shutdown", {
1642
+ "module_id": WATCHDOG_MODULE_NAME,
1643
+ "reason": "system_shutdown",
1644
+ "timeout": 5,
1645
+ "restart": False,
1646
+ })
1647
+
1648
+ # 等待 0.2 秒确保事件送达(不需要等待进程退出)
1649
+ await asyncio.sleep(0.2)
1650
+ print(f"[launcher] Watchdog shutdown 事件已发送")
1651
+ else:
1652
+ # 直接终止
1653
+ self._init_module_state(WATCHDOG_MODULE_NAME)
1654
+ state = self._module_states[WATCHDOG_MODULE_NAME]
1655
+ state["shutdown_sent"] = True
1656
+ state["stopped_sent"] = True
1657
+ state["reason"] = "system_shutdown"
1658
+ state["restart"] = False
1659
+ self._log_lifecycle("stopping", WATCHDOG_MODULE_NAME, reason="system_shutdown")
1001
1660
 
1002
- # Defer Kernel — it must stay alive to route shutdown events
1003
- kernel_deferred = "kernel" in graceful
1004
- graceful_batch = [n for n in graceful if n != "kernel"] if kernel_deferred else graceful
1661
+ self.process_manager.stop_module(WATCHDOG_MODULE_NAME, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
1005
1662
 
1006
- print(f"[launcher] 正在关闭 {len(running)} 个模块: {', '.join(running)}")
1663
+ await self._publish_event("module.stopped", {
1664
+ "module_id": WATCHDOG_MODULE_NAME,
1665
+ "exit_code": 0,
1666
+ "exit_type": "non_graceful",
1667
+ "reason": "system_shutdown",
1668
+ "restart": False,
1669
+ "ready_received": False,
1670
+ })
1007
1671
 
1008
- # Phase 1: Notify graceful modules first (they start cleanup immediately)
1009
- for name in graceful_batch:
1672
+ self._log_lifecycle("stopped", WATCHDOG_MODULE_NAME, reason="system_shutdown")
1673
+
1674
+ # ═══════════════════════════════════════════════════════════
1675
+ # Phase 2: 关闭其他所有模块(除 Kernel)
1676
+ # ═══════════════════════════════════════════════════════════
1677
+ if graceful_others or non_graceful_others:
1678
+ print(f"[launcher] Phase 2: 关闭其他模块({len(graceful_others)} 优雅 + {len(non_graceful_others)} 非优雅)")
1679
+
1680
+ # 通知优雅模块
1681
+ for name in graceful_others:
1682
+ self._init_module_state(name)
1683
+ state = self._module_states[name]
1684
+ state["shutdown_sent"] = True
1685
+ state["reason"] = "system_shutdown"
1686
+ state["restart"] = False
1010
1687
  self._log_lifecycle("stopping", name, reason="system_shutdown")
1011
1688
  await self._publish_event("module.shutdown", {
1012
- "module_id": name, "reason": "system_shutdown", "timeout": 5,
1689
+ "module_id": name,
1690
+ "reason": "system_shutdown",
1691
+ "timeout": 5,
1692
+ "restart": False,
1013
1693
  })
1014
1694
 
1015
- # Phase 2: While graceful modules are cleaning up, terminate non-graceful ones
1016
- if non_graceful:
1017
- print(f"[launcher] 直接终止 {len(non_graceful)} 个不支持优雅退出的模块: {', '.join(non_graceful)}")
1018
- for name in non_graceful:
1695
+ # 终止非优雅模块
1696
+ for name in non_graceful_others:
1697
+ self._init_module_state(name)
1698
+ state = self._module_states[name]
1699
+ state["shutdown_sent"] = True
1700
+ state["stopped_sent"] = True
1701
+ state["reason"] = "system_shutdown"
1702
+ state["restart"] = False
1019
1703
  self._log_lifecycle("stopping", name, reason="system_shutdown")
1020
- self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
1704
+
1705
+ self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
1706
+
1707
+ await self._publish_event("module.stopped", {
1708
+ "module_id": name,
1709
+ "exit_code": 0,
1710
+ "exit_type": "non_graceful",
1711
+ "reason": "system_shutdown",
1712
+ "restart": False,
1713
+ "ready_received": False,
1714
+ })
1715
+
1021
1716
  self._log_lifecycle("stopped", name, reason="system_shutdown")
1022
1717
 
1023
- # Phase 3: Wait for graceful modules to exit (process monitoring)
1024
- if graceful_batch:
1718
+ # 等待优雅模块退出(包括 Watchdog)
1719
+ all_graceful = graceful_others + ([WATCHDOG_MODULE_NAME] if watchdog_running and self._graceful_modules.get(WATCHDOG_MODULE_NAME) else [])
1720
+ if all_graceful:
1025
1721
  deadline = time.time() + 5
1026
1722
  while time.time() < deadline:
1027
- still_running = [n for n in graceful_batch if self.process_manager.is_running(n)]
1723
+ still_running = [n for n in all_graceful if self.process_manager.is_running(n)]
1028
1724
  if not still_running:
1029
- print("[launcher] 所有优雅退出模块已自行退出")
1725
+ print("[launcher] 所有其他模块已退出")
1030
1726
  break
1031
1727
  remaining = max(0, deadline - time.time())
1032
1728
  print(f"[launcher] 等待 {len(still_running)} 个模块退出 ({remaining:.0f}s): {', '.join(still_running)}")
1033
1729
  await asyncio.sleep(1)
1034
- # Force kill survivors
1035
- for name in graceful_batch:
1730
+
1731
+ # 强杀未退出的
1732
+ for name in all_graceful:
1036
1733
  if self.process_manager.is_running(name):
1037
- self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
1038
- self._log_lifecycle("stopped", name, reason="system_shutdown")
1734
+ print(f"[launcher] {name} 超时,强制终止")
1735
+ self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
1736
+ self._log_lifecycle("stopped", name, reason="system_shutdown_timeout")
1039
1737
 
1040
- # Phase 4: All other modules exited — now shut down Kernel
1041
- if kernel_deferred and self.process_manager.is_running("kernel"):
1042
- self._log_lifecycle("stopping", "kernel", reason="system_shutdown")
1043
- print("[launcher] 正在关闭 Kernel...")
1738
+ # ═══════════════════════════════════════════════════════════
1739
+ # Phase 3: 最后关闭 Kernel(使用标准优雅退出流程)
1740
+ # ═══════════════════════════════════════════════════════════
1741
+ if kernel_running and self.process_manager.is_running("kernel"):
1742
+ print("[launcher] Phase 3: 关闭 Kernel(所有其他模块已退出)")
1044
1743
 
1045
- # Call kernel.shutdown RPC (not event)
1046
- rpc_sent = False
1047
- try:
1048
- if self._ws:
1049
- await self._rpc_call(self._ws, "kernel.shutdown", {})
1050
- print("[launcher] Kernel shutdown RPC 已发送")
1051
- rpc_sent = True
1052
- else:
1053
- print("[launcher] WebSocket 未连接,跳过 RPC 调用")
1054
- except Exception as e:
1055
- print(f"[launcher] Kernel shutdown RPC 失败: {e}")
1056
-
1057
- # Wait for kernel to exit
1058
- if rpc_sent:
1059
- # RPC sent: wait up to 5s for graceful exit
1060
- proc = self.process_manager._processes.get("kernel")
1061
- if proc:
1062
- try:
1063
- loop = asyncio.get_event_loop()
1064
- await asyncio.wait_for(
1065
- loop.run_in_executor(None, proc.wait),
1066
- timeout=5
1067
- )
1068
- print("[launcher] Kernel 已退出")
1069
- except asyncio.TimeoutError:
1070
- print("[launcher] Kernel 5秒内未退出,强制停止")
1071
- self.process_manager.stop_module("kernel", timeout=SHUTDOWN_TIMEOUT_PARTIAL)
1072
- else:
1073
- # No RPC (WS not connected): use shorter timeout for terminate
1074
- self.process_manager.stop_module("kernel", timeout=2)
1744
+ # 明确标记不重启
1745
+ self._desired_states["kernel"] = "stopped"
1075
1746
 
1076
- self._log_lifecycle("stopped", "kernel", reason="system_shutdown")
1747
+ # 使用标准优雅退出流程(内含等待 ack → exiting → ready → kill 完整逻辑)
1748
+ await self._graceful_stop("kernel", reason="system_shutdown", timeout=5)
1077
1749
 
1078
1750
  # Final safety net
1079
1751
  try:
@@ -1227,7 +1899,7 @@ class Launcher:
1227
1899
  # Call Kernel RPC to generate tokens
1228
1900
  try:
1229
1901
  result = await self._rpc_call(self._ws, "kernel.generate_tokens", {"modules": module_names})
1230
- if result.get("result", {}).get("ok"):
1902
+ if "result" in result:
1231
1903
  tokens = result["result"].get("tokens", {})
1232
1904
  self._module_tokens.update(tokens)
1233
1905
  print(f"[launcher] Kernel 已生成 {len(tokens)} 个模块令牌")
@@ -1242,7 +1914,7 @@ class Launcher:
1242
1914
  return
1243
1915
  try:
1244
1916
  result = await self._rpc_call(self._ws, "kernel.register_tokens", tokens)
1245
- if result.get("result", {}).get("ok"):
1917
+ if "result" in result:
1246
1918
  print(f"[launcher] 已注册 {len(tokens)} 个模块令牌")
1247
1919
  elif "error" in result:
1248
1920
  print(f"[launcher] 警告: 令牌注册失败: {result['error'].get('message', '')}")
@@ -1326,10 +1998,19 @@ class Launcher:
1326
1998
  if rc != 0:
1327
1999
  self._print_module_crash_summary(name)
1328
2000
  self._log_lifecycle("exited", name, exit_code=rc)
1329
- await self._publish_event("module.stopped", {
1330
- "module_id": name, "exit_code": rc,
1331
- "graceful_shutdown": self._graceful_modules.get(name, False),
1332
- })
2001
+
2002
+ # 检查是否已发送 stopped 事件
2003
+ state = self._module_states.get(name, {})
2004
+ if not state.get("stopped_sent"):
2005
+ # 取消清理超时任务(如果有)
2006
+ if state.get("cleanup_task"):
2007
+ state["cleanup_task"].cancel()
2008
+ # 发送 stopped 事件
2009
+ await self._send_stopped_event(name, rc)
2010
+
2011
+ # 无论是否发送,都清理状态(防止内存泄漏)
2012
+ self._module_states.pop(name, None)
2013
+
1333
2014
  info = self.modules.get(name)
1334
2015
 
1335
2016
  # 1) Core module crash → full restart
@@ -1457,6 +2138,22 @@ class Launcher:
1457
2138
  running = []
1458
2139
  exited = []
1459
2140
  stopped = []
2141
+
2142
+ # Add Launcher itself to running list
2143
+ from types import SimpleNamespace
2144
+ launcher_info = SimpleNamespace(
2145
+ display_name="Launcher",
2146
+ type="infrastructure",
2147
+ )
2148
+ launcher_rec = SimpleNamespace(
2149
+ pid=os.getpid(),
2150
+ started_at=self._start_unix,
2151
+ )
2152
+ running.append(("launcher", launcher_info, launcher_rec))
2153
+ # Launcher is ready immediately (ready_time = 0)
2154
+ if "launcher" not in self._ready_times:
2155
+ self._ready_times["launcher"] = 0.0
2156
+
1460
2157
  for name, info in self.modules.items():
1461
2158
  rec = self.process_manager.get_record(name)
1462
2159
  is_running = self.process_manager.is_running(name)
@@ -1527,9 +2224,16 @@ class Launcher:
1527
2224
  label = info.display_name or name
1528
2225
  ready_t = self._ready_times.get(name)
1529
2226
  time_str = f"{ready_t:.2f}s" if ready_t is not None else "—"
2227
+
2228
+ # Calculate elapsed from start
1530
2229
  if ready_t is not None and hasattr(self, '_start_unix'):
1531
- elapsed_from_start = (rec.started_at + ready_t) - self._start_unix
1532
- es_str = f"{elapsed_from_start:.2f}s"
2230
+ if name == "launcher":
2231
+ # Launcher: ready_t is already relative to _start_unix
2232
+ es_str = f"{ready_t:.2f}s"
2233
+ else:
2234
+ # Other modules: rec.started_at is unix timestamp
2235
+ elapsed_from_start = (rec.started_at + ready_t) - self._start_unix
2236
+ es_str = f"{elapsed_from_start:.2f}s"
1533
2237
  else:
1534
2238
  es_str = "—"
1535
2239
 
@@ -1613,7 +2317,18 @@ class Launcher:
1613
2317
  debug_flag = " [DEBUG]" if os.environ.get("KITE_DEBUG") == "1" else ""
1614
2318
  lines.append(f"{G} 当前实例: #{inst_num} 后缀: {suffix_display} PID: {os.getpid()}{debug_flag}{R}")
1615
2319
  lines.append(f"{G} 实例目录: {inst_dir}{R}")
1616
- lines.append(f"{G} 工作目录: {cwd}{R}")
2320
+
2321
+ # Check for abnormal working directory
2322
+ cwd_lower = cwd.lower()
2323
+ is_abnormal_cwd = (
2324
+ "windowsapps" in cwd_lower or
2325
+ "appdata\\local\\temp" in cwd_lower or
2326
+ not os.path.exists(os.path.join(cwd, "main.py"))
2327
+ )
2328
+ if is_abnormal_cwd:
2329
+ lines.append(f"\033[91m 工作目录: {cwd} ⚠️ 异常路径{R}")
2330
+ else:
2331
+ lines.append(f"{G} 工作目录: {cwd}{R}")
1617
2332
  if len(instances) > 1:
1618
2333
  lines.append(f"{G} 所有实例:{R}")
1619
2334
  for i in instances:
@@ -1682,6 +2397,30 @@ class Launcher:
1682
2397
  except Exception:
1683
2398
  pass
1684
2399
 
2400
+ def _record_launcher_startup(self):
2401
+ """Record launcher startup information to lifecycle.jsonl."""
2402
+ import sys
2403
+ from datetime import datetime, timezone
2404
+
2405
+ record = {
2406
+ "ts": datetime.now(timezone.utc).isoformat(),
2407
+ "event": "launcher_startup",
2408
+ "module": "launcher",
2409
+ "pid": os.getpid(),
2410
+ "cwd": os.getcwd(),
2411
+ "argv": sys.argv,
2412
+ "instance_dir": os.environ.get("KITE_INSTANCE_DIR", ""),
2413
+ "instance_suffix": self.process_manager.instance_suffix,
2414
+ "python": sys.executable,
2415
+ }
2416
+
2417
+ try:
2418
+ os.makedirs(os.path.dirname(self._lifecycle_log), exist_ok=True)
2419
+ with open(self._lifecycle_log, "a", encoding="utf-8") as f:
2420
+ f.write(json.dumps(record, ensure_ascii=False) + "\n")
2421
+ except Exception:
2422
+ pass
2423
+
1685
2424
 
1686
2425
 
1687
2426
  def _update_module_md_state(module_dir: str, new_state: str):