@agentunion/kite 1.3.2 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +200 -0
- package/cli.js +76 -0
- package/extensions/agents/assistant/entry.py +111 -1
- package/extensions/agents/assistant/server.py +263 -215
- package/extensions/channels/acp_channel/entry.py +111 -1
- package/extensions/channels/acp_channel/module.md +23 -22
- package/extensions/channels/acp_channel/server.py +263 -215
- package/extensions/event_hub_bench/entry.py +107 -1
- package/extensions/services/backup/entry.py +299 -21
- package/extensions/services/backup/module.md +24 -22
- package/extensions/services/model_service/entry.py +145 -19
- package/extensions/services/model_service/module.md +21 -22
- package/extensions/services/watchdog/entry.py +188 -25
- package/extensions/services/watchdog/monitor.py +144 -34
- package/extensions/services/web/WEBSOCKET_STATUS.md +143 -0
- package/extensions/services/web/config_example.py +35 -0
- package/extensions/services/web/config_loader.py +110 -0
- package/extensions/services/web/entry.py +114 -26
- package/extensions/services/web/module.md +35 -24
- package/extensions/services/web/pairing.py +250 -0
- package/extensions/services/web/pairing_codes.jsonl +16 -0
- package/extensions/services/web/relay.py +643 -0
- package/extensions/services/web/relay_config.json5 +67 -0
- package/extensions/services/web/routes/routes_management_ws.py +127 -0
- package/extensions/services/web/routes/routes_rpc.py +89 -0
- package/extensions/services/web/routes/routes_test.py +61 -0
- package/extensions/services/web/routes/schemas.py +0 -22
- package/extensions/services/web/server.py +421 -98
- package/extensions/services/web/static/css/style.css +67 -28
- package/extensions/services/web/static/index.html +234 -44
- package/extensions/services/web/static/js/app.js +1335 -48
- package/extensions/services/web/static/js/kernel-client-example.js +161 -0
- package/extensions/services/web/static/js/kernel-client.js +383 -0
- package/extensions/services/web/static/js/registry-tests.js +558 -0
- package/extensions/services/web/static/js/token-manager.js +175 -0
- package/extensions/services/web/static/pairing.html +248 -0
- package/extensions/services/web/static/test_registry.html +262 -0
- package/extensions/services/web/web_config.json5 +29 -0
- package/kernel/entry.py +120 -32
- package/kernel/event_hub.py +141 -16
- package/kernel/module.md +36 -33
- package/kernel/registry_store.py +48 -15
- package/kernel/rpc_router.py +120 -53
- package/kernel/server.py +219 -12
- package/kite_cli/__init__.py +3 -0
- package/kite_cli/__main__.py +5 -0
- package/kite_cli/commands/__init__.py +1 -0
- package/kite_cli/commands/clean.py +101 -0
- package/kite_cli/commands/doctor.py +35 -0
- package/kite_cli/commands/history.py +111 -0
- package/kite_cli/commands/info.py +96 -0
- package/kite_cli/commands/install.py +313 -0
- package/kite_cli/commands/list.py +143 -0
- package/kite_cli/commands/log.py +81 -0
- package/kite_cli/commands/rollback.py +88 -0
- package/kite_cli/commands/search.py +73 -0
- package/kite_cli/commands/uninstall.py +85 -0
- package/kite_cli/commands/update.py +118 -0
- package/kite_cli/core/__init__.py +1 -0
- package/kite_cli/core/checker.py +142 -0
- package/kite_cli/core/dependency.py +229 -0
- package/kite_cli/core/downloader.py +209 -0
- package/kite_cli/core/install_info.py +40 -0
- package/kite_cli/core/tool_installer.py +397 -0
- package/kite_cli/core/validator.py +78 -0
- package/kite_cli/main.py +289 -0
- package/kite_cli/utils/__init__.py +1 -0
- package/kite_cli/utils/i18n.py +252 -0
- package/kite_cli/utils/interactive.py +63 -0
- package/kite_cli/utils/operation_log.py +77 -0
- package/kite_cli/utils/paths.py +34 -0
- package/kite_cli/utils/version.py +308 -0
- package/launcher/entry.py +819 -158
- package/launcher/logging_setup.py +104 -0
- package/launcher/module.md +37 -37
- package/package.json +2 -1
- package/scripts/plan_manager.py +315 -0
- package/extensions/services/web/routes/routes_modules.py +0 -249
package/launcher/entry.py
CHANGED
|
@@ -29,10 +29,21 @@ from .process_manager import ProcessManager
|
|
|
29
29
|
IS_WINDOWS = sys.platform == "win32"
|
|
30
30
|
|
|
31
31
|
# Shutdown timeout constants (seconds)
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
32
|
+
|
|
33
|
+
# 不支持优雅关闭
|
|
34
|
+
SHUTDOWN_TIMEOUT_NON_GRACEFUL = 0.3 # SIGTERM 后等待时间
|
|
35
|
+
|
|
36
|
+
# 支持优雅关闭 - 等待响应
|
|
37
|
+
SHUTDOWN_TIMEOUT_ACK = 3.0 # 等待 shutdown.ack
|
|
38
|
+
SHUTDOWN_TIMEOUT_EXITING = 3.0 # 等待 module.exiting
|
|
39
|
+
|
|
40
|
+
# 清理超时(从 exiting 事件获取)
|
|
41
|
+
CLEANUP_TIMEOUT_DEFAULT = 5.0 # 默认清理时间
|
|
42
|
+
CLEANUP_TIMEOUT_MIN = 0.0 # 最小清理时间
|
|
43
|
+
CLEANUP_TIMEOUT_MAX = 30.0 # 最大清理时间
|
|
44
|
+
|
|
45
|
+
# 批量关闭安全网
|
|
46
|
+
SHUTDOWN_TIMEOUT_BULK = 3.0
|
|
36
47
|
|
|
37
48
|
# Core module names that are started in Phase 1 (not Phase 2)
|
|
38
49
|
CORE_MODULE_NAMES = {"kernel"}
|
|
@@ -101,6 +112,9 @@ class Launcher:
|
|
|
101
112
|
# System-wide shutdown flag: prevents Watchdog restart during shutdown
|
|
102
113
|
self._system_shutting_down = False
|
|
103
114
|
|
|
115
|
+
# 模块退出状态跟踪(防止 stopped 事件重复发送)
|
|
116
|
+
self._module_states: dict[str, dict] = {}
|
|
117
|
+
|
|
104
118
|
# Kite stdout message waiters: {waiter_key: (threading.Event, data_dict)}
|
|
105
119
|
# Used by ProcessManager stdout callback (cross-thread)
|
|
106
120
|
self._msg_waiters: dict[str, tuple[threading.Event, dict]] = {}
|
|
@@ -117,6 +131,9 @@ class Launcher:
|
|
|
117
131
|
pass
|
|
118
132
|
os.environ["KITE_INSTANCE_SUFFIX"] = suffix
|
|
119
133
|
|
|
134
|
+
# Record launcher startup
|
|
135
|
+
self._record_launcher_startup()
|
|
136
|
+
|
|
120
137
|
@staticmethod
|
|
121
138
|
def _fmt_elapsed(seconds: float) -> str:
|
|
122
139
|
"""Format elapsed seconds: <1s → 'NNNms', >=1s → 'N.Ns', >=10s → 'NNs'."""
|
|
@@ -597,6 +614,11 @@ class Launcher:
|
|
|
597
614
|
print(f"[launcher] Kernel 重连失败 {max_retries} 次,退出")
|
|
598
615
|
sys.exit(1)
|
|
599
616
|
print(f"[launcher] Kernel 连接错误: {e}, {retry_delay:.1f}s 后重试 ({attempt}/{max_retries})")
|
|
617
|
+
if attempt == 5:
|
|
618
|
+
print(f"\033[33m[launcher] 提示: 已连续 {attempt} 次无法连接 Kernel (端口 {self.kernel_port})")
|
|
619
|
+
if self.kernel_port < 1024:
|
|
620
|
+
print(f"[launcher] ⚠ 端口 {self.kernel_port} 异常偏低,可能是 Kernel 端口绑定失败或配置错误")
|
|
621
|
+
print(f"[launcher] 请检查: 1) Kernel 进程是否存活 2) kernel/module.md 中 preferred_port 配置是否正确\033[0m")
|
|
600
622
|
self._ws = None
|
|
601
623
|
if self._thread_shutdown.is_set():
|
|
602
624
|
return
|
|
@@ -608,7 +630,7 @@ class Launcher:
|
|
|
608
630
|
launcher_token = self._module_tokens.get("launcher", "")
|
|
609
631
|
ws_url = f"ws://127.0.0.1:{self.kernel_port}/ws?token={launcher_token}&id=launcher"
|
|
610
632
|
t_ws_connect = time.monotonic()
|
|
611
|
-
async with websockets.connect(ws_url, open_timeout=3, ping_interval=
|
|
633
|
+
async with websockets.connect(ws_url, open_timeout=3, ping_interval=20, ping_timeout=20, close_timeout=10) as ws:
|
|
612
634
|
self._ws = ws
|
|
613
635
|
_ws_s = time.monotonic() - t_ws_connect
|
|
614
636
|
print(f"[launcher] 已连接到 Kernel ({self._fmt_elapsed(_ws_s)})")
|
|
@@ -632,10 +654,38 @@ class Launcher:
|
|
|
632
654
|
await self._rpc_call(ws, "registry.register", {
|
|
633
655
|
"module_id": "launcher",
|
|
634
656
|
"module_type": "infrastructure",
|
|
657
|
+
"tools": {
|
|
658
|
+
"rpc": {
|
|
659
|
+
"launcher": {
|
|
660
|
+
"list_modules": {"method": "list_modules", "description": "列出所有模块"},
|
|
661
|
+
"start_module": {"method": "start_module", "description": "启动模块"},
|
|
662
|
+
"stop_module": {"method": "stop_module", "description": "停止模块"},
|
|
663
|
+
"restart_module": {"method": "restart_module", "description": "重启模块"},
|
|
664
|
+
"restart_launcher": {"method": "restart_launcher", "description": "重启 Launcher"},
|
|
665
|
+
"rescan": {"method": "rescan", "description": "重新扫描模块"},
|
|
666
|
+
"shutdown": {"method": "shutdown", "description": "关闭系统"},
|
|
667
|
+
},
|
|
668
|
+
"module": {
|
|
669
|
+
"config": {
|
|
670
|
+
"get": {"method": "get_module_config", "description": "获取模块配置"},
|
|
671
|
+
"update": {"method": "update_module_config", "description": "更新模块配置"},
|
|
672
|
+
"reset": {"method": "reset_module_config", "description": "恢复默认配置"},
|
|
673
|
+
}
|
|
674
|
+
}
|
|
675
|
+
}
|
|
676
|
+
},
|
|
635
677
|
"events_publish": {
|
|
636
|
-
"
|
|
637
|
-
|
|
638
|
-
|
|
678
|
+
"system": {
|
|
679
|
+
"ready": {"description": "系统启动完成"}
|
|
680
|
+
},
|
|
681
|
+
"module": {
|
|
682
|
+
"starting": {"description": "模块启动中"},
|
|
683
|
+
"started": {"description": "模块已启动"},
|
|
684
|
+
"ready": {"description": "模块就绪"},
|
|
685
|
+
"stopped": {"description": "模块已停止"},
|
|
686
|
+
"exiting": {"description": "模块退出中"},
|
|
687
|
+
"shutdown": {"description": "模块关闭"}
|
|
688
|
+
}
|
|
639
689
|
},
|
|
640
690
|
"events_subscribe": [">"],
|
|
641
691
|
})
|
|
@@ -658,7 +708,14 @@ class Launcher:
|
|
|
658
708
|
raise
|
|
659
709
|
|
|
660
710
|
async def _ws_receiver(self, ws):
|
|
661
|
-
"""Receive loop: classify incoming messages.
|
|
711
|
+
"""Receive loop: classify incoming messages.
|
|
712
|
+
|
|
713
|
+
CRITICAL: RPC 死锁防范
|
|
714
|
+
- 入站 RPC 请求必须用 create_task() 异步执行,不可 await
|
|
715
|
+
- 原因:如果 handler 内部调用 rpc_call() 发出站请求,出站响应需要本接收循环来分发
|
|
716
|
+
- 如果接收循环被 await handler 阻塞,出站响应永远收不到 → 超时死锁
|
|
717
|
+
- 事件通知和 RPC 响应可以同步处理(它们不会反向调用 rpc_call)
|
|
718
|
+
"""
|
|
662
719
|
try:
|
|
663
720
|
async for raw in ws:
|
|
664
721
|
try:
|
|
@@ -676,7 +733,8 @@ class Launcher:
|
|
|
676
733
|
await self._handle_event_notification(msg)
|
|
677
734
|
elif has_method and has_id:
|
|
678
735
|
# Incoming RPC request (forwarded by Kernel)
|
|
679
|
-
|
|
736
|
+
# Run in background so receiver loop continues processing responses
|
|
737
|
+
asyncio.create_task(self._handle_rpc_request(ws, msg))
|
|
680
738
|
elif has_id and (has_result or has_error):
|
|
681
739
|
# RPC response (to our own call)
|
|
682
740
|
self._handle_rpc_response(msg)
|
|
@@ -728,6 +786,7 @@ class Launcher:
|
|
|
728
786
|
# Trigger event waiters
|
|
729
787
|
module_id = data.get("module_id", "")
|
|
730
788
|
waiter_key = f"{event}:{module_id}"
|
|
789
|
+
|
|
731
790
|
waiter = self._event_waiters.get(waiter_key)
|
|
732
791
|
if waiter:
|
|
733
792
|
waiter[1].update(data)
|
|
@@ -742,6 +801,42 @@ class Launcher:
|
|
|
742
801
|
ready_waiter[1]["_exited"] = True
|
|
743
802
|
ready_waiter[0].set()
|
|
744
803
|
|
|
804
|
+
# 处理主动退出场景(没有 shutdown 的情况)
|
|
805
|
+
if module_id not in self._module_states:
|
|
806
|
+
self._init_module_state(module_id)
|
|
807
|
+
state = self._module_states[module_id]
|
|
808
|
+
|
|
809
|
+
if not state.get("shutdown_sent"):
|
|
810
|
+
# 主动退出:记录信息
|
|
811
|
+
if not state.get("exiting_received"):
|
|
812
|
+
state["exiting_received"] = True
|
|
813
|
+
state["reason"] = data.get("reason", "active_exit")
|
|
814
|
+
state["restart"] = data.get("restart", False)
|
|
815
|
+
cleanup_timeout = data.get("cleanup_timeout", CLEANUP_TIMEOUT_DEFAULT)
|
|
816
|
+
cleanup_timeout = max(CLEANUP_TIMEOUT_MIN, min(cleanup_timeout, CLEANUP_TIMEOUT_MAX))
|
|
817
|
+
state["cleanup_timeout"] = cleanup_timeout
|
|
818
|
+
|
|
819
|
+
# 启动清理超时任务
|
|
820
|
+
async def cleanup_timeout_handler():
|
|
821
|
+
await asyncio.sleep(state["cleanup_timeout"])
|
|
822
|
+
if not state.get("stopped_sent"):
|
|
823
|
+
state["stopped_sent"] = True
|
|
824
|
+
self._kill_process(module_id)
|
|
825
|
+
|
|
826
|
+
# 发送 stopped 事件
|
|
827
|
+
await self._publish_event("module.stopped", {
|
|
828
|
+
"module_id": module_id,
|
|
829
|
+
"exit_code": -1, # 超时强制终止,退出码未知
|
|
830
|
+
"exit_type": "timeout",
|
|
831
|
+
"reason": state.get("reason", "cleanup_timeout"),
|
|
832
|
+
"restart": state.get("restart", False),
|
|
833
|
+
"ready_received": False,
|
|
834
|
+
})
|
|
835
|
+
|
|
836
|
+
self._log_lifecycle("stopped", module_id, reason=state["reason"])
|
|
837
|
+
|
|
838
|
+
state["cleanup_task"] = asyncio.create_task(cleanup_timeout_handler())
|
|
839
|
+
|
|
745
840
|
# module.crash → print red crash summary
|
|
746
841
|
if event == "module.crash" and module_id:
|
|
747
842
|
RED = "\033[91m"
|
|
@@ -756,6 +851,43 @@ class Launcher:
|
|
|
756
851
|
)
|
|
757
852
|
print(f"[launcher] 崩溃日志: {crash_log}")
|
|
758
853
|
|
|
854
|
+
# pairing.status → handle all pairing flow events
|
|
855
|
+
if event == "pairing.status":
|
|
856
|
+
GREEN = "\033[92m"
|
|
857
|
+
RED = "\033[91m"
|
|
858
|
+
RESET = "\033[0m"
|
|
859
|
+
|
|
860
|
+
step = data.get("step", "")
|
|
861
|
+
success = data.get("success", True)
|
|
862
|
+
|
|
863
|
+
if step == "code_generated":
|
|
864
|
+
code = data.get("code", "")
|
|
865
|
+
expires_in = data.get("expires_in", 300)
|
|
866
|
+
if code:
|
|
867
|
+
print(f"[launcher] {GREEN}配对码: {code}{RESET}")
|
|
868
|
+
print(f"[launcher] {GREEN}有效期: {expires_in} 秒{RESET}")
|
|
869
|
+
print(f"[launcher] {GREEN}访问 Web 界面时使用此配对码进行配对{RESET}")
|
|
870
|
+
|
|
871
|
+
elif step == "pairing":
|
|
872
|
+
if success:
|
|
873
|
+
print(f"[launcher] {GREEN}正在配对...{RESET}")
|
|
874
|
+
else:
|
|
875
|
+
reason = data.get("reason", "Unknown error")
|
|
876
|
+
print(f"[launcher] {RED}✗ 配对失败: {reason}{RESET}")
|
|
877
|
+
|
|
878
|
+
elif step == "completed":
|
|
879
|
+
if success:
|
|
880
|
+
module_id = data.get("module_id", "")
|
|
881
|
+
role = data.get("role", "")
|
|
882
|
+
print(f"[launcher] {GREEN}✓ 配对成功!{RESET}")
|
|
883
|
+
print(f"[launcher] {GREEN} 模块 ID: {module_id}{RESET}")
|
|
884
|
+
print(f"[launcher] {GREEN} 角色: {role}{RESET}")
|
|
885
|
+
else:
|
|
886
|
+
reason = data.get("reason", "Unknown error")
|
|
887
|
+
print(f"[launcher] {RED}✗ 配对失败: {reason}{RESET}")
|
|
888
|
+
|
|
889
|
+
return
|
|
890
|
+
|
|
759
891
|
# Only log system events (module.*, watchdog.*) to avoid flooding
|
|
760
892
|
if not (event.startswith("module.") or event.startswith("watchdog.")):
|
|
761
893
|
return
|
|
@@ -780,12 +912,16 @@ class Launcher:
|
|
|
780
912
|
params = msg.get("params", {})
|
|
781
913
|
|
|
782
914
|
handlers = {
|
|
783
|
-
"list_modules":
|
|
784
|
-
"start_module":
|
|
785
|
-
"stop_module":
|
|
786
|
-
"restart_module":
|
|
787
|
-
"
|
|
788
|
-
"
|
|
915
|
+
"list_modules": self._rpc_list_modules,
|
|
916
|
+
"start_module": self._rpc_start_module,
|
|
917
|
+
"stop_module": self._rpc_stop_module,
|
|
918
|
+
"restart_module": self._rpc_restart_module,
|
|
919
|
+
"restart_launcher": self._rpc_restart_launcher,
|
|
920
|
+
"rescan": self._rpc_rescan,
|
|
921
|
+
"shutdown": self._rpc_shutdown,
|
|
922
|
+
"get_module_config": self._rpc_get_module_config,
|
|
923
|
+
"update_module_config": self._rpc_update_module_config,
|
|
924
|
+
"reset_module_config": self._rpc_reset_module_config,
|
|
789
925
|
}
|
|
790
926
|
handler = handlers.get(method)
|
|
791
927
|
if handler:
|
|
@@ -815,11 +951,14 @@ class Launcher:
|
|
|
815
951
|
"name": name,
|
|
816
952
|
"display_name": info.display_name,
|
|
817
953
|
"type": info.type,
|
|
818
|
-
"
|
|
954
|
+
"state": info.state, # 改名为 state(与 /api/modules 一致)
|
|
955
|
+
"version": info.version,
|
|
956
|
+
"runtime": info.runtime,
|
|
957
|
+
"preferred_port": info.preferred_port,
|
|
958
|
+
"monitor": info.monitor,
|
|
819
959
|
"desired_state": self._desired_states.get(name, "stopped"),
|
|
820
960
|
"actual_state": f"running({rec.pid})" if running and rec else "stopped",
|
|
821
961
|
"pid": rec.pid if running and rec else None,
|
|
822
|
-
"monitor": info.monitor,
|
|
823
962
|
})
|
|
824
963
|
return {"modules": result}
|
|
825
964
|
|
|
@@ -911,30 +1050,243 @@ class Launcher:
|
|
|
911
1050
|
self._request_shutdown(f"RPC shutdown request: {reason}")
|
|
912
1051
|
return {"status": "shutting_down", "reason": reason}
|
|
913
1052
|
|
|
1053
|
+
async def _rpc_get_module_config(self, params: dict) -> dict:
|
|
1054
|
+
"""获取指定模块的配置(通用降级方案)"""
|
|
1055
|
+
import re
|
|
1056
|
+
import yaml
|
|
1057
|
+
from pathlib import Path
|
|
1058
|
+
|
|
1059
|
+
module_name = params.get("module_name")
|
|
1060
|
+
if not module_name:
|
|
1061
|
+
raise ValueError("module_name required")
|
|
1062
|
+
|
|
1063
|
+
# 查找模块信息
|
|
1064
|
+
info = self.modules.get(module_name)
|
|
1065
|
+
if not info:
|
|
1066
|
+
raise RuntimeError(f"Module '{module_name}' not found")
|
|
1067
|
+
|
|
1068
|
+
# 读取 module.md
|
|
1069
|
+
md_path = Path(info.module_dir) / "module.md"
|
|
1070
|
+
if not md_path.exists():
|
|
1071
|
+
raise RuntimeError(f"module.md not found for '{module_name}'")
|
|
1072
|
+
|
|
1073
|
+
text = md_path.read_text(encoding="utf-8")
|
|
1074
|
+
m = re.match(r'^---\s*\n(.*?)\n---\s*\n?(.*)', text, re.DOTALL)
|
|
1075
|
+
if not m:
|
|
1076
|
+
frontmatter = {}
|
|
1077
|
+
else:
|
|
1078
|
+
frontmatter = yaml.safe_load(m.group(1)) or {}
|
|
1079
|
+
|
|
1080
|
+
# 读取 config.yaml(如果存在)
|
|
1081
|
+
config_path = Path(info.module_dir) / "config.yaml"
|
|
1082
|
+
config = None
|
|
1083
|
+
if config_path.exists():
|
|
1084
|
+
config = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
|
|
1085
|
+
|
|
1086
|
+
return {
|
|
1087
|
+
"name": frontmatter.get("name", module_name),
|
|
1088
|
+
"display_name": frontmatter.get("display_name", ""),
|
|
1089
|
+
"type": frontmatter.get("type", ""),
|
|
1090
|
+
"state": frontmatter.get("state", "enabled"),
|
|
1091
|
+
"version": frontmatter.get("version", ""),
|
|
1092
|
+
"runtime": frontmatter.get("runtime", ""),
|
|
1093
|
+
"entry": frontmatter.get("entry", ""),
|
|
1094
|
+
"preferred_port": frontmatter.get("preferred_port"),
|
|
1095
|
+
"advertise_ip": frontmatter.get("advertise_ip"),
|
|
1096
|
+
"monitor": frontmatter.get("monitor"),
|
|
1097
|
+
"events": frontmatter.get("events"),
|
|
1098
|
+
"subscriptions": frontmatter.get("subscriptions"),
|
|
1099
|
+
"depends_on": frontmatter.get("depends_on"),
|
|
1100
|
+
"source_path": str(info.module_dir), # 添加模块路径
|
|
1101
|
+
"has_config": config is not None,
|
|
1102
|
+
"config": config,
|
|
1103
|
+
}
|
|
1104
|
+
|
|
1105
|
+
async def _rpc_update_module_config(self, params: dict) -> dict:
|
|
1106
|
+
"""更新指定模块的配置(通用降级方案)"""
|
|
1107
|
+
import yaml
|
|
1108
|
+
from pathlib import Path
|
|
1109
|
+
|
|
1110
|
+
module_name = params.get("module_name")
|
|
1111
|
+
metadata = params.get("metadata", {})
|
|
1112
|
+
config = params.get("config", {})
|
|
1113
|
+
|
|
1114
|
+
if not module_name:
|
|
1115
|
+
raise ValueError("module_name required")
|
|
1116
|
+
|
|
1117
|
+
info = self.modules.get(module_name)
|
|
1118
|
+
if not info:
|
|
1119
|
+
raise RuntimeError(f"Module '{module_name}' not found")
|
|
1120
|
+
|
|
1121
|
+
md_path = Path(info.module_dir) / "module.md"
|
|
1122
|
+
if not md_path.exists():
|
|
1123
|
+
raise RuntimeError(f"module.md not found for '{module_name}'")
|
|
1124
|
+
|
|
1125
|
+
# 更新 module.md frontmatter
|
|
1126
|
+
if metadata:
|
|
1127
|
+
frontmatter, body = _parse_frontmatter(md_path.read_text(encoding="utf-8"))
|
|
1128
|
+
for key, value in metadata.items():
|
|
1129
|
+
frontmatter[key] = value
|
|
1130
|
+
fm_str = yaml.dump(frontmatter, allow_unicode=True, sort_keys=False, default_flow_style=False).rstrip()
|
|
1131
|
+
content = f"---\n{fm_str}\n---\n{body}"
|
|
1132
|
+
md_path.write_text(content, encoding="utf-8")
|
|
1133
|
+
|
|
1134
|
+
# 更新 config.yaml
|
|
1135
|
+
if config:
|
|
1136
|
+
config_path = Path(info.module_dir) / "config.yaml"
|
|
1137
|
+
existing = {}
|
|
1138
|
+
if config_path.exists():
|
|
1139
|
+
existing = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
|
|
1140
|
+
# Deep merge
|
|
1141
|
+
self._deep_merge(existing, config)
|
|
1142
|
+
config_path.write_text(
|
|
1143
|
+
yaml.dump(existing, allow_unicode=True, sort_keys=False, default_flow_style=False),
|
|
1144
|
+
encoding="utf-8"
|
|
1145
|
+
)
|
|
1146
|
+
|
|
1147
|
+
# 重新扫描以更新缓存
|
|
1148
|
+
await self._rescan_modules()
|
|
1149
|
+
|
|
1150
|
+
# 返回更新后的配置
|
|
1151
|
+
return await self._rpc_get_module_config({"module_name": module_name})
|
|
1152
|
+
|
|
1153
|
+
async def _rpc_reset_module_config(self, params: dict) -> dict:
|
|
1154
|
+
"""恢复指定模块的默认配置(通用降级方案)"""
|
|
1155
|
+
import yaml
|
|
1156
|
+
from pathlib import Path
|
|
1157
|
+
|
|
1158
|
+
module_name = params.get("module_name")
|
|
1159
|
+
fields = params.get("fields", [])
|
|
1160
|
+
reset_all = params.get("all", False)
|
|
1161
|
+
|
|
1162
|
+
if not module_name:
|
|
1163
|
+
raise ValueError("module_name required")
|
|
1164
|
+
|
|
1165
|
+
info = self.modules.get(module_name)
|
|
1166
|
+
if not info:
|
|
1167
|
+
raise RuntimeError(f"Module '{module_name}' not found")
|
|
1168
|
+
|
|
1169
|
+
md_path = Path(info.module_dir) / "module.md"
|
|
1170
|
+
if not md_path.exists():
|
|
1171
|
+
raise RuntimeError(f"module.md not found for '{module_name}'")
|
|
1172
|
+
|
|
1173
|
+
# 默认值定义(通用)
|
|
1174
|
+
defaults = {
|
|
1175
|
+
"state": "enabled",
|
|
1176
|
+
"monitor": True,
|
|
1177
|
+
}
|
|
1178
|
+
|
|
1179
|
+
frontmatter, body = _parse_frontmatter(md_path.read_text(encoding="utf-8"))
|
|
1180
|
+
|
|
1181
|
+
if reset_all:
|
|
1182
|
+
for key, value in defaults.items():
|
|
1183
|
+
frontmatter[key] = value
|
|
1184
|
+
else:
|
|
1185
|
+
for field in fields:
|
|
1186
|
+
if field in defaults:
|
|
1187
|
+
frontmatter[field] = defaults[field]
|
|
1188
|
+
elif field == "preferred_port":
|
|
1189
|
+
frontmatter.pop(field, None) # 恢复为 null
|
|
1190
|
+
elif field == "advertise_ip":
|
|
1191
|
+
frontmatter[field] = "127.0.0.1"
|
|
1192
|
+
|
|
1193
|
+
fm_str = yaml.dump(frontmatter, allow_unicode=True, sort_keys=False, default_flow_style=False).rstrip()
|
|
1194
|
+
content = f"---\n{fm_str}\n---\n{body}"
|
|
1195
|
+
md_path.write_text(content, encoding="utf-8")
|
|
1196
|
+
|
|
1197
|
+
# 重新扫描以更新缓存
|
|
1198
|
+
await self._rescan_modules()
|
|
1199
|
+
|
|
1200
|
+
return await self._rpc_get_module_config({"module_name": module_name})
|
|
1201
|
+
|
|
1202
|
+
@staticmethod
|
|
1203
|
+
def _deep_merge(base: dict, overlay: dict) -> dict:
|
|
1204
|
+
"""递归合并字典"""
|
|
1205
|
+
for k, v in overlay.items():
|
|
1206
|
+
if k in base and isinstance(base[k], dict) and isinstance(v, dict):
|
|
1207
|
+
Launcher._deep_merge(base[k], v)
|
|
1208
|
+
else:
|
|
1209
|
+
base[k] = v
|
|
1210
|
+
return base
|
|
1211
|
+
|
|
1212
|
+
|
|
1213
|
+
async def _rpc_restart_launcher(self, params: dict) -> dict:
|
|
1214
|
+
"""Restart Launcher process via Watchdog.
|
|
1215
|
+
|
|
1216
|
+
Simply notify watchdog and exit. Watchdog will start a new instance.
|
|
1217
|
+
|
|
1218
|
+
Args:
|
|
1219
|
+
params: {
|
|
1220
|
+
"reason": str (optional) - Restart reason
|
|
1221
|
+
}
|
|
1222
|
+
|
|
1223
|
+
Returns:
|
|
1224
|
+
{"status": "restarting", "reason": str}
|
|
1225
|
+
or {"error": "watchdog offline"}
|
|
1226
|
+
"""
|
|
1227
|
+
reason = params.get("reason", "user_request")
|
|
1228
|
+
O = "\033[33m" # orange/yellow
|
|
1229
|
+
R = "\033[0m" # reset
|
|
1230
|
+
print(f"{O}[launcher] 收到 Launcher 重启请求{R}")
|
|
1231
|
+
print(f"[launcher] 原因: {reason}")
|
|
1232
|
+
|
|
1233
|
+
# Check if watchdog is running
|
|
1234
|
+
watchdog_running = self.process_manager.is_running("watchdog")
|
|
1235
|
+
print(f"[launcher] 检查 watchdog 状态: {'running' if watchdog_running else 'stopped'}")
|
|
1236
|
+
|
|
1237
|
+
if not watchdog_running:
|
|
1238
|
+
error_msg = "watchdog 未运行, 无法重启"
|
|
1239
|
+
print(f"[launcher] ❌ {error_msg}")
|
|
1240
|
+
return {"error": error_msg}
|
|
1241
|
+
|
|
1242
|
+
print(f"[launcher] ✓ watchdog 状态正常,准备重启流程")
|
|
1243
|
+
|
|
1244
|
+
# Schedule restart in background (don't block RPC response)
|
|
1245
|
+
async def _do_restart():
|
|
1246
|
+
await asyncio.sleep(0.3) # 确保 RPC 响应已发送
|
|
1247
|
+
|
|
1248
|
+
print(f"[launcher] 发送 module.exiting 事件给 watchdog...")
|
|
1249
|
+
|
|
1250
|
+
# Collect startup info for watchdog to restart with same environment
|
|
1251
|
+
startup_info = {
|
|
1252
|
+
"python": sys.executable,
|
|
1253
|
+
"argv": sys.argv,
|
|
1254
|
+
"cwd": os.getcwd(),
|
|
1255
|
+
"env": dict(os.environ), # 所有环境变量
|
|
1256
|
+
}
|
|
1257
|
+
|
|
1258
|
+
# Notify watchdog: this is a planned restart, not a crash
|
|
1259
|
+
await self._publish_event("module.exiting", {
|
|
1260
|
+
"module_id": "launcher",
|
|
1261
|
+
"action": "restart_launcher",
|
|
1262
|
+
"reason": reason,
|
|
1263
|
+
"startup_info": startup_info,
|
|
1264
|
+
})
|
|
1265
|
+
|
|
1266
|
+
print(f"[launcher] 已通知 watchdog 计划内重启")
|
|
1267
|
+
print(f"[launcher] 退出进程,等待 watchdog 重启")
|
|
1268
|
+
print(f"[launcher] 原因: {reason}")
|
|
1269
|
+
|
|
1270
|
+
os._exit(0)
|
|
1271
|
+
|
|
1272
|
+
asyncio.create_task(_do_restart())
|
|
1273
|
+
|
|
1274
|
+
return {"status": "restarting", "reason": reason}
|
|
1275
|
+
|
|
914
1276
|
# ── Event publishing via RPC ──
|
|
915
1277
|
|
|
916
1278
|
async def _publish_event(self, event_type: str, data: dict):
|
|
917
1279
|
"""Publish an event via RPC event.publish through Kernel WS."""
|
|
918
1280
|
if not self._ws:
|
|
919
1281
|
return
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
"id": str(uuid.uuid4()),
|
|
923
|
-
"method": "event.publish",
|
|
924
|
-
"params": {
|
|
1282
|
+
try:
|
|
1283
|
+
await self._rpc_call(self._ws, "event.publish", {
|
|
925
1284
|
"event_id": str(uuid.uuid4()),
|
|
926
1285
|
"event": event_type,
|
|
927
1286
|
"data": data,
|
|
928
|
-
},
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
async def _send():
|
|
932
|
-
try:
|
|
933
|
-
await self._ws.send(msg)
|
|
934
|
-
except Exception as e:
|
|
935
|
-
print(f"[launcher] 发布事件失败: {e}")
|
|
936
|
-
|
|
937
|
-
asyncio.create_task(_send())
|
|
1287
|
+
}, timeout=2.0)
|
|
1288
|
+
except Exception as e:
|
|
1289
|
+
print(f"[launcher] 发布事件失败 ({event_type}): {e}")
|
|
938
1290
|
|
|
939
1291
|
async def _wait_event(self, event_type: str, module_id: str, timeout: float) -> dict | None:
|
|
940
1292
|
"""Wait for a specific event from a module. Returns data dict or None on timeout."""
|
|
@@ -956,90 +1308,299 @@ class Launcher:
|
|
|
956
1308
|
finally:
|
|
957
1309
|
self._event_waiters.pop(key, None)
|
|
958
1310
|
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
"""
|
|
963
|
-
self.
|
|
1311
|
+
# ── 退出机制辅助方法 ──
|
|
1312
|
+
|
|
1313
|
+
def _init_module_state(self, name: str):
|
|
1314
|
+
"""初始化模块状态跟踪字典"""
|
|
1315
|
+
self._module_states[name] = {
|
|
1316
|
+
"shutdown_sent": False,
|
|
1317
|
+
"ack_received": False,
|
|
1318
|
+
"exiting_received": False,
|
|
1319
|
+
"ready_received": False,
|
|
1320
|
+
"stopped_sent": False,
|
|
1321
|
+
"exit_type": None, # "graceful" | "non_graceful" | "active"
|
|
1322
|
+
"reason": None,
|
|
1323
|
+
"restart": None,
|
|
1324
|
+
"cleanup_timeout": None,
|
|
1325
|
+
"cleanup_task": None,
|
|
1326
|
+
}
|
|
964
1327
|
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
1328
|
+
def _kill_process(self, name: str):
|
|
1329
|
+
"""统一的进程杀死方法"""
|
|
1330
|
+
record = self.process_manager.get_record(name)
|
|
1331
|
+
if record and record.proc and record.proc.poll() is None:
|
|
1332
|
+
print(f"[launcher] 强制终止 {name} (PID {record.proc.pid})")
|
|
1333
|
+
self.process_manager.kill_process(name)
|
|
1334
|
+
elif record:
|
|
1335
|
+
# 进程已经退出,只是清理记录
|
|
1336
|
+
pass
|
|
1337
|
+
else:
|
|
1338
|
+
# 没有记录,可能已经被清理
|
|
1339
|
+
pass
|
|
1340
|
+
|
|
1341
|
+
def _determine_exit_type(self, name: str) -> str:
|
|
1342
|
+
"""判断退出类型: graceful | non_graceful | active"""
|
|
1343
|
+
state = self._module_states.get(name, {})
|
|
1344
|
+
if state.get("exiting_received"):
|
|
1345
|
+
return "graceful"
|
|
1346
|
+
elif state.get("shutdown_sent"):
|
|
1347
|
+
return "non_graceful"
|
|
1348
|
+
else:
|
|
1349
|
+
return "active"
|
|
1350
|
+
|
|
1351
|
+
def _resolve_reason(self, name: str) -> str:
|
|
1352
|
+
"""解析最终原因(优先级:exiting > shutdown > 默认)"""
|
|
1353
|
+
state = self._module_states.get(name, {})
|
|
1354
|
+
if state.get("reason"):
|
|
1355
|
+
return state["reason"]
|
|
1356
|
+
return "unknown"
|
|
1357
|
+
|
|
1358
|
+
def _resolve_restart(self, name: str) -> bool:
|
|
1359
|
+
"""解析重启决策(优先级:exiting > shutdown > 默认)"""
|
|
1360
|
+
state = self._module_states.get(name, {})
|
|
1361
|
+
if state.get("restart") is not None:
|
|
1362
|
+
return state["restart"]
|
|
1363
|
+
# 默认:主动退出不重启,被动关闭看 desired_state
|
|
1364
|
+
if self._determine_exit_type(name) == "active":
|
|
1365
|
+
return False
|
|
1366
|
+
return self._desired_states.get(name) == "running"
|
|
1367
|
+
|
|
1368
|
+
async def _send_stopped_event(self, name: str, exit_code: int):
|
|
1369
|
+
"""发送 module.stopped 事件(防重复)"""
|
|
1370
|
+
state = self._module_states.get(name, {})
|
|
1371
|
+
if state.get("stopped_sent"):
|
|
972
1372
|
return
|
|
973
1373
|
|
|
974
|
-
#
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
ack_data = {}
|
|
978
|
-
self._event_waiters[ack_key] = (ack_evt, ack_data)
|
|
1374
|
+
# 立即设置标记(防止竞态条件)
|
|
1375
|
+
if name in self._module_states:
|
|
1376
|
+
self._module_states[name]["stopped_sent"] = True
|
|
979
1377
|
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
self._event_waiters[ready_key] = (ready_evt, ready_data)
|
|
1378
|
+
exit_type = self._determine_exit_type(name)
|
|
1379
|
+
reason = self._resolve_reason(name)
|
|
1380
|
+
restart = self._resolve_restart(name)
|
|
984
1381
|
|
|
985
|
-
await self._publish_event("module.
|
|
986
|
-
"module_id": name,
|
|
1382
|
+
await self._publish_event("module.stopped", {
|
|
1383
|
+
"module_id": name,
|
|
1384
|
+
"exit_code": exit_code,
|
|
1385
|
+
"exit_type": exit_type,
|
|
1386
|
+
"reason": reason,
|
|
1387
|
+
"restart": restart,
|
|
1388
|
+
"ready_received": state.get("ready_received", False),
|
|
987
1389
|
})
|
|
988
1390
|
|
|
989
|
-
|
|
1391
|
+
# ── 优雅关闭 ──
|
|
1392
|
+
|
|
1393
|
+
async def _graceful_stop(self, name: str, reason: str = "stop_requested", timeout: float = 10):
    """Stop a single module, using the shutdown handshake when it supports one.

    Flow:
      1. Initialise the module's state entry.
      2. Modules not flagged in ``self._graceful_modules`` are stopped directly
         through ``process_manager.stop_module``.
      3. Graceful modules: publish ``module.shutdown`` -> wait for the ack ->
         wait for ``module.exiting`` -> wait for ``module.shutdown.ready``
         (bounded by the cleanup timeout) -> kill the process.
         A missing ack or exiting event kills the process right away.

    Every path that terminates the process sets ``stopped_sent`` (the
    duplicate-suppression flag) and publishes ``module.stopped``.

    Args:
        name: Module identifier.
        reason: Reason stored in the state and sent with the events; a
            ``reason`` carried by the module's exiting event replaces it.
        timeout: Forwarded in the ``module.shutdown`` payload. The waits in
            this method use the SHUTDOWN_TIMEOUT_* constants and the cleanup
            timeout, not this value.

    Exceptions raised along the way are not propagated: the process is
    force-killed, ``module.stopped`` is published with ``exit_type`` set to
    ``"error"``, and all waiters registered here are removed.
    """
    ack_key = f"module.shutdown.ack:{name}"
    exiting_key = f"module.exiting:{name}"
    ready_key = f"module.shutdown.ready:{name}"
    # Bound before the try block so the error handler can always consult it,
    # even when the failure happens before the state entry exists.
    state = None

    async def report_stopped(exit_code, exit_type, ready_received):
        """Publish module.stopped from the current state and log the transition."""
        await self._publish_event("module.stopped", {
            "module_id": name,
            "exit_code": exit_code,
            "exit_type": exit_type,
            "reason": state["reason"],
            "restart": state["restart"],
            "ready_received": ready_received,
        })
        self._log_lifecycle("stopped", name, reason=state["reason"])

    async def kill_and_report(exit_code, exit_type, ready_received):
        """Set the dedup flag (before any await), force-kill, then report."""
        state["stopped_sent"] = True
        self._kill_process(name)
        await report_stopped(exit_code, exit_type, ready_received)

    try:
        self._log_lifecycle("stopping", name, reason=reason)

        # Initialise state tracking
        self._init_module_state(name)
        state = self._module_states[name]

        # Non-graceful module: stop it directly, no handshake
        if not self._graceful_modules.get(name):
            state["shutdown_sent"] = True   # the launcher initiated this stop
            state["stopped_sent"] = True    # duplicate-suppression flag
            state["reason"] = reason
            state["restart"] = self._desired_states.get(name) == "running"

            self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)

            # exit_code 0 is reported unconditionally on this path; the real
            # exit status is not read here.
            await report_stopped(0, "non_graceful", False)
            return

        # Graceful module: register all three waiters (ack, exiting, ready)
        # up front so an event cannot arrive before its waiter exists.
        ack_evt = asyncio.Event()
        exiting_evt = asyncio.Event()
        ready_evt = asyncio.Event()
        exiting_data = {}
        self._event_waiters[ack_key] = (ack_evt, {})
        self._event_waiters[exiting_key] = (exiting_evt, exiting_data)
        self._event_waiters[ready_key] = (ready_evt, {})

        # Send the shutdown request
        state["shutdown_sent"] = True
        state["reason"] = reason
        state["restart"] = self._desired_states.get(name) == "running"

        await self._publish_event("module.shutdown", {
            "module_id": name,
            "reason": reason,
            "timeout": timeout,
            "restart": state["restart"],
        })

        # Wait for the ack
        try:
            await asyncio.wait_for(ack_evt.wait(), timeout=SHUTDOWN_TIMEOUT_ACK)
            state["ack_received"] = True
        except asyncio.TimeoutError:
            pass
        finally:
            self._event_waiters.pop(ack_key, None)

        if not state["ack_received"]:
            # No ack: kill immediately; the exit code is unknown (-1)
            self._event_waiters.pop(exiting_key, None)
            self._event_waiters.pop(ready_key, None)
            await kill_and_report(-1, "timeout", False)
            return

        # Wait for the exiting event
        try:
            await asyncio.wait_for(exiting_evt.wait(), timeout=SHUTDOWN_TIMEOUT_EXITING)
            state["exiting_received"] = True
            # Values supplied by the module override the launcher's own
            if exiting_data.get("reason"):
                state["reason"] = exiting_data["reason"]
            if "restart" in exiting_data:
                state["restart"] = exiting_data["restart"]
            requested = exiting_data.get("cleanup_timeout", CLEANUP_TIMEOUT_DEFAULT)
            # The value comes from the module's event payload; anything that
            # is not a real number would make min()/max() raise, so fall back
            # to the default instead of aborting the whole handshake.
            if isinstance(requested, bool) or not isinstance(requested, (int, float)):
                requested = CLEANUP_TIMEOUT_DEFAULT
            state["cleanup_timeout"] = max(
                CLEANUP_TIMEOUT_MIN, min(requested, CLEANUP_TIMEOUT_MAX)
            )
        except asyncio.TimeoutError:
            pass
        finally:
            self._event_waiters.pop(exiting_key, None)

        if not state["exiting_received"]:
            # No exiting event: kill immediately; the exit code is unknown (-1)
            self._event_waiters.pop(ready_key, None)
            await kill_and_report(-1, "timeout", False)
            return

        # Fallback: force-kill once the cleanup timeout elapses if nothing
        # else has reported the module as stopped by then.
        async def cleanup_timeout_handler():
            await asyncio.sleep(state["cleanup_timeout"])
            if not state.get("stopped_sent"):
                print(f"[launcher] {name} 清理超时 ({state['cleanup_timeout']}s),强制终止")
                await kill_and_report(-1, "timeout", False)

        state["cleanup_task"] = asyncio.create_task(cleanup_timeout_handler())

        # Main path: wait for the ready event (its waiter is already registered)
        try:
            await asyncio.wait_for(ready_evt.wait(), timeout=state["cleanup_timeout"])
            state["ready_received"] = True
            print(f"[launcher] {name} 清理完成,准备退出")
        except asyncio.TimeoutError:
            # The timeout itself is handled by cleanup_timeout_handler
            pass
        finally:
            self._event_waiters.pop(ready_key, None)

        # Ready arrived first: the fallback task is no longer needed
        if state.get("ready_received") and state["cleanup_task"] and not state["cleanup_task"].done():
            state["cleanup_task"].cancel()

        # Ready received: kill the process now and report a graceful exit
        if state.get("ready_received") and not state.get("stopped_sent"):
            await kill_and_report(0, "graceful", True)

    except Exception as e:
        # The graceful stop failed: force-terminate the process
        print(f"[launcher] 优雅关闭出错: {e}")
        if state is None:
            # Failed before the state was bound; use whatever is tracked.
            state = self._module_states.get(name, {})

        # The fallback task must not fire after this handler has dealt with it
        pending = state.get("cleanup_task")
        if pending and not pending.done():
            pending.cancel()

        if not state.get("stopped_sent"):
            state["stopped_sent"] = True
            self._kill_process(name)
            # With stopped_sent set nobody else will report this stop, so
            # publish it here. Guarded: the original failure may well have
            # been the event channel itself.
            final_reason = state.get("reason") or reason
            restart = state.get("restart")
            if restart is None:
                restart = self._desired_states.get(name) == "running"
            try:
                await self._publish_event("module.stopped", {
                    "module_id": name,
                    "exit_code": -1,  # killed after an error, exit code unknown
                    "exit_type": "error",
                    "reason": final_reason,
                    "restart": restart,
                    "ready_received": False,
                })
                self._log_lifecycle("stopped", name, reason=final_reason)
            except Exception as report_err:
                print(f"[launcher] {name} 停止事件发送失败: {report_err}")
    finally:
        # Remove every waiter registered here, including when the coroutine
        # is cancelled (CancelledError is not an Exception subclass).
        self._event_waiters.pop(ack_key, None)
        self._event_waiters.pop(exiting_key, None)
        self._event_waiters.pop(ready_key, None)
|
|
1027
1587
|
|
|
1028
1588
|
async def _graceful_shutdown_all(self):
|
|
1029
|
-
"""
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
5. Shut down Kernel last (keeps event routing alive throughout)
|
|
1589
|
+
"""全量优雅退出:三阶段关闭
|
|
1590
|
+
|
|
1591
|
+
Phase 1: 先关闭 Watchdog(防止它监控到其他模块退出后触发重启)
|
|
1592
|
+
Phase 2: 关闭其他所有模块(除 Kernel)
|
|
1593
|
+
Phase 3: 最后关闭 Kernel(保证事件路由畅通)
|
|
1035
1594
|
"""
|
|
1036
1595
|
self._system_shutting_down = True
|
|
1037
1596
|
|
|
1038
|
-
#
|
|
1597
|
+
# 发送 Launcher 自己的 exiting 事件
|
|
1039
1598
|
await self._publish_event("module.exiting", {
|
|
1040
1599
|
"module_id": "launcher",
|
|
1600
|
+
"type": "active",
|
|
1041
1601
|
"reason": "system_shutdown",
|
|
1042
1602
|
"action": "none",
|
|
1603
|
+
"timeout": 0,
|
|
1043
1604
|
})
|
|
1044
1605
|
|
|
1045
1606
|
running = [n for n in self.modules if self.process_manager.is_running(n)]
|
|
@@ -1047,88 +1608,144 @@ class Launcher:
|
|
|
1047
1608
|
for cn in CORE_MODULE_NAMES:
|
|
1048
1609
|
if self.process_manager.is_running(cn) and cn not in running:
|
|
1049
1610
|
running.append(cn)
|
|
1611
|
+
|
|
1050
1612
|
if not running:
|
|
1051
1613
|
print("[launcher] 没有运行中的模块需要关闭")
|
|
1052
1614
|
return
|
|
1053
1615
|
|
|
1054
|
-
|
|
1055
|
-
|
|
1616
|
+
# 分组:Watchdog、Kernel、其他模块
|
|
1617
|
+
watchdog_running = WATCHDOG_MODULE_NAME in running
|
|
1618
|
+
kernel_running = "kernel" in running
|
|
1619
|
+
other_modules = [n for n in running if n not in (WATCHDOG_MODULE_NAME, "kernel")]
|
|
1620
|
+
|
|
1621
|
+
graceful_others = [n for n in other_modules if self._graceful_modules.get(n)]
|
|
1622
|
+
non_graceful_others = [n for n in other_modules if not self._graceful_modules.get(n)]
|
|
1623
|
+
|
|
1624
|
+
print(f"[launcher] 正在关闭 {len(running)} 个模块(三阶段)")
|
|
1625
|
+
|
|
1626
|
+
# ═══════════════════════════════════════════════════════════
|
|
1627
|
+
# Phase 1: 先关闭 Watchdog(防止重启其他模块)
|
|
1628
|
+
# ═══════════════════════════════════════════════════════════
|
|
1629
|
+
if watchdog_running and self.process_manager.is_running(WATCHDOG_MODULE_NAME):
|
|
1630
|
+
print(f"[launcher] Phase 1: 通知 Watchdog 退出(防止重启其他模块)")
|
|
1631
|
+
|
|
1632
|
+
if self._graceful_modules.get(WATCHDOG_MODULE_NAME):
|
|
1633
|
+
# Watchdog 支持优雅退出
|
|
1634
|
+
self._init_module_state(WATCHDOG_MODULE_NAME)
|
|
1635
|
+
state = self._module_states[WATCHDOG_MODULE_NAME]
|
|
1636
|
+
state["shutdown_sent"] = True
|
|
1637
|
+
state["reason"] = "system_shutdown"
|
|
1638
|
+
state["restart"] = False
|
|
1639
|
+
self._log_lifecycle("stopping", WATCHDOG_MODULE_NAME, reason="system_shutdown")
|
|
1640
|
+
|
|
1641
|
+
await self._publish_event("module.shutdown", {
|
|
1642
|
+
"module_id": WATCHDOG_MODULE_NAME,
|
|
1643
|
+
"reason": "system_shutdown",
|
|
1644
|
+
"timeout": 5,
|
|
1645
|
+
"restart": False,
|
|
1646
|
+
})
|
|
1647
|
+
|
|
1648
|
+
# 等待 0.2 秒确保事件送达(不需要等待进程退出)
|
|
1649
|
+
await asyncio.sleep(0.2)
|
|
1650
|
+
print(f"[launcher] Watchdog shutdown 事件已发送")
|
|
1651
|
+
else:
|
|
1652
|
+
# 直接终止
|
|
1653
|
+
self._init_module_state(WATCHDOG_MODULE_NAME)
|
|
1654
|
+
state = self._module_states[WATCHDOG_MODULE_NAME]
|
|
1655
|
+
state["shutdown_sent"] = True
|
|
1656
|
+
state["stopped_sent"] = True
|
|
1657
|
+
state["reason"] = "system_shutdown"
|
|
1658
|
+
state["restart"] = False
|
|
1659
|
+
self._log_lifecycle("stopping", WATCHDOG_MODULE_NAME, reason="system_shutdown")
|
|
1056
1660
|
|
|
1057
|
-
|
|
1058
|
-
kernel_deferred = "kernel" in graceful
|
|
1059
|
-
graceful_batch = [n for n in graceful if n != "kernel"] if kernel_deferred else graceful
|
|
1661
|
+
self.process_manager.stop_module(WATCHDOG_MODULE_NAME, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
|
|
1060
1662
|
|
|
1061
|
-
|
|
1663
|
+
await self._publish_event("module.stopped", {
|
|
1664
|
+
"module_id": WATCHDOG_MODULE_NAME,
|
|
1665
|
+
"exit_code": 0,
|
|
1666
|
+
"exit_type": "non_graceful",
|
|
1667
|
+
"reason": "system_shutdown",
|
|
1668
|
+
"restart": False,
|
|
1669
|
+
"ready_received": False,
|
|
1670
|
+
})
|
|
1062
1671
|
|
|
1063
|
-
|
|
1064
|
-
|
|
1672
|
+
self._log_lifecycle("stopped", WATCHDOG_MODULE_NAME, reason="system_shutdown")
|
|
1673
|
+
|
|
1674
|
+
# ═══════════════════════════════════════════════════════════
|
|
1675
|
+
# Phase 2: 关闭其他所有模块(除 Kernel)
|
|
1676
|
+
# ═══════════════════════════════════════════════════════════
|
|
1677
|
+
if graceful_others or non_graceful_others:
|
|
1678
|
+
print(f"[launcher] Phase 2: 关闭其他模块({len(graceful_others)} 优雅 + {len(non_graceful_others)} 非优雅)")
|
|
1679
|
+
|
|
1680
|
+
# 通知优雅模块
|
|
1681
|
+
for name in graceful_others:
|
|
1682
|
+
self._init_module_state(name)
|
|
1683
|
+
state = self._module_states[name]
|
|
1684
|
+
state["shutdown_sent"] = True
|
|
1685
|
+
state["reason"] = "system_shutdown"
|
|
1686
|
+
state["restart"] = False
|
|
1065
1687
|
self._log_lifecycle("stopping", name, reason="system_shutdown")
|
|
1066
1688
|
await self._publish_event("module.shutdown", {
|
|
1067
|
-
"module_id": name,
|
|
1689
|
+
"module_id": name,
|
|
1690
|
+
"reason": "system_shutdown",
|
|
1691
|
+
"timeout": 5,
|
|
1692
|
+
"restart": False,
|
|
1068
1693
|
})
|
|
1069
1694
|
|
|
1070
|
-
#
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1695
|
+
# 终止非优雅模块
|
|
1696
|
+
for name in non_graceful_others:
|
|
1697
|
+
self._init_module_state(name)
|
|
1698
|
+
state = self._module_states[name]
|
|
1699
|
+
state["shutdown_sent"] = True
|
|
1700
|
+
state["stopped_sent"] = True
|
|
1701
|
+
state["reason"] = "system_shutdown"
|
|
1702
|
+
state["restart"] = False
|
|
1074
1703
|
self._log_lifecycle("stopping", name, reason="system_shutdown")
|
|
1075
|
-
|
|
1704
|
+
|
|
1705
|
+
self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
|
|
1706
|
+
|
|
1707
|
+
await self._publish_event("module.stopped", {
|
|
1708
|
+
"module_id": name,
|
|
1709
|
+
"exit_code": 0,
|
|
1710
|
+
"exit_type": "non_graceful",
|
|
1711
|
+
"reason": "system_shutdown",
|
|
1712
|
+
"restart": False,
|
|
1713
|
+
"ready_received": False,
|
|
1714
|
+
})
|
|
1715
|
+
|
|
1076
1716
|
self._log_lifecycle("stopped", name, reason="system_shutdown")
|
|
1077
1717
|
|
|
1078
|
-
#
|
|
1079
|
-
if
|
|
1718
|
+
# 等待优雅模块退出(包括 Watchdog)
|
|
1719
|
+
all_graceful = graceful_others + ([WATCHDOG_MODULE_NAME] if watchdog_running and self._graceful_modules.get(WATCHDOG_MODULE_NAME) else [])
|
|
1720
|
+
if all_graceful:
|
|
1080
1721
|
deadline = time.time() + 5
|
|
1081
1722
|
while time.time() < deadline:
|
|
1082
|
-
still_running = [n for n in
|
|
1723
|
+
still_running = [n for n in all_graceful if self.process_manager.is_running(n)]
|
|
1083
1724
|
if not still_running:
|
|
1084
|
-
print("[launcher]
|
|
1725
|
+
print("[launcher] 所有其他模块已退出")
|
|
1085
1726
|
break
|
|
1086
1727
|
remaining = max(0, deadline - time.time())
|
|
1087
1728
|
print(f"[launcher] 等待 {len(still_running)} 个模块退出 ({remaining:.0f}s): {', '.join(still_running)}")
|
|
1088
1729
|
await asyncio.sleep(1)
|
|
1089
|
-
|
|
1090
|
-
|
|
1730
|
+
|
|
1731
|
+
# 强杀未退出的
|
|
1732
|
+
for name in all_graceful:
|
|
1091
1733
|
if self.process_manager.is_running(name):
|
|
1092
|
-
|
|
1093
|
-
self.
|
|
1734
|
+
print(f"[launcher] {name} 超时,强制终止")
|
|
1735
|
+
self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
|
|
1736
|
+
self._log_lifecycle("stopped", name, reason="system_shutdown_timeout")
|
|
1094
1737
|
|
|
1095
|
-
#
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1738
|
+
# ═══════════════════════════════════════════════════════════
|
|
1739
|
+
# Phase 3: 最后关闭 Kernel(使用标准优雅退出流程)
|
|
1740
|
+
# ═══════════════════════════════════════════════════════════
|
|
1741
|
+
if kernel_running and self.process_manager.is_running("kernel"):
|
|
1742
|
+
print("[launcher] Phase 3: 关闭 Kernel(所有其他模块已退出)")
|
|
1099
1743
|
|
|
1100
|
-
#
|
|
1101
|
-
|
|
1102
|
-
try:
|
|
1103
|
-
if self._ws:
|
|
1104
|
-
await self._rpc_call(self._ws, "kernel.shutdown", {})
|
|
1105
|
-
print("[launcher] Kernel shutdown RPC 已发送")
|
|
1106
|
-
rpc_sent = True
|
|
1107
|
-
else:
|
|
1108
|
-
print("[launcher] WebSocket 未连接,跳过 RPC 调用")
|
|
1109
|
-
except Exception as e:
|
|
1110
|
-
print(f"[launcher] Kernel shutdown RPC 失败: {e}")
|
|
1111
|
-
|
|
1112
|
-
# Wait for kernel to exit
|
|
1113
|
-
if rpc_sent:
|
|
1114
|
-
# RPC sent: wait up to 5s for graceful exit
|
|
1115
|
-
proc = self.process_manager._processes.get("kernel")
|
|
1116
|
-
if proc:
|
|
1117
|
-
try:
|
|
1118
|
-
loop = asyncio.get_event_loop()
|
|
1119
|
-
await asyncio.wait_for(
|
|
1120
|
-
loop.run_in_executor(None, proc.wait),
|
|
1121
|
-
timeout=5
|
|
1122
|
-
)
|
|
1123
|
-
print("[launcher] Kernel 已退出")
|
|
1124
|
-
except asyncio.TimeoutError:
|
|
1125
|
-
print("[launcher] Kernel 5秒内未退出,强制停止")
|
|
1126
|
-
self.process_manager.stop_module("kernel", timeout=SHUTDOWN_TIMEOUT_PARTIAL)
|
|
1127
|
-
else:
|
|
1128
|
-
# No RPC (WS not connected): use shorter timeout for terminate
|
|
1129
|
-
self.process_manager.stop_module("kernel", timeout=2)
|
|
1744
|
+
# 明确标记不重启
|
|
1745
|
+
self._desired_states["kernel"] = "stopped"
|
|
1130
1746
|
|
|
1131
|
-
|
|
1747
|
+
# 使用标准优雅退出流程(内含等待 ack → exiting → ready → kill 完整逻辑)
|
|
1748
|
+
await self._graceful_stop("kernel", reason="system_shutdown", timeout=5)
|
|
1132
1749
|
|
|
1133
1750
|
# Final safety net
|
|
1134
1751
|
try:
|
|
@@ -1282,7 +1899,7 @@ class Launcher:
|
|
|
1282
1899
|
# Call Kernel RPC to generate tokens
|
|
1283
1900
|
try:
|
|
1284
1901
|
result = await self._rpc_call(self._ws, "kernel.generate_tokens", {"modules": module_names})
|
|
1285
|
-
if
|
|
1902
|
+
if "result" in result:
|
|
1286
1903
|
tokens = result["result"].get("tokens", {})
|
|
1287
1904
|
self._module_tokens.update(tokens)
|
|
1288
1905
|
print(f"[launcher] Kernel 已生成 {len(tokens)} 个模块令牌")
|
|
@@ -1297,7 +1914,7 @@ class Launcher:
|
|
|
1297
1914
|
return
|
|
1298
1915
|
try:
|
|
1299
1916
|
result = await self._rpc_call(self._ws, "kernel.register_tokens", tokens)
|
|
1300
|
-
if
|
|
1917
|
+
if "result" in result:
|
|
1301
1918
|
print(f"[launcher] 已注册 {len(tokens)} 个模块令牌")
|
|
1302
1919
|
elif "error" in result:
|
|
1303
1920
|
print(f"[launcher] 警告: 令牌注册失败: {result['error'].get('message', '')}")
|
|
@@ -1381,10 +1998,19 @@ class Launcher:
|
|
|
1381
1998
|
if rc != 0:
|
|
1382
1999
|
self._print_module_crash_summary(name)
|
|
1383
2000
|
self._log_lifecycle("exited", name, exit_code=rc)
|
|
1384
|
-
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
2001
|
+
|
|
2002
|
+
# 检查是否已发送 stopped 事件
|
|
2003
|
+
state = self._module_states.get(name, {})
|
|
2004
|
+
if not state.get("stopped_sent"):
|
|
2005
|
+
# 取消清理超时任务(如果有)
|
|
2006
|
+
if state.get("cleanup_task"):
|
|
2007
|
+
state["cleanup_task"].cancel()
|
|
2008
|
+
# 发送 stopped 事件
|
|
2009
|
+
await self._send_stopped_event(name, rc)
|
|
2010
|
+
|
|
2011
|
+
# 无论是否发送,都清理状态(防止内存泄漏)
|
|
2012
|
+
self._module_states.pop(name, None)
|
|
2013
|
+
|
|
1388
2014
|
info = self.modules.get(name)
|
|
1389
2015
|
|
|
1390
2016
|
# 1) Core module crash → full restart
|
|
@@ -1691,7 +2317,18 @@ class Launcher:
|
|
|
1691
2317
|
debug_flag = " [DEBUG]" if os.environ.get("KITE_DEBUG") == "1" else ""
|
|
1692
2318
|
lines.append(f"{G} 当前实例: #{inst_num} 后缀: {suffix_display} PID: {os.getpid()}{debug_flag}{R}")
|
|
1693
2319
|
lines.append(f"{G} 实例目录: {inst_dir}{R}")
|
|
1694
|
-
|
|
2320
|
+
|
|
2321
|
+
# Check for abnormal working directory
|
|
2322
|
+
cwd_lower = cwd.lower()
|
|
2323
|
+
is_abnormal_cwd = (
|
|
2324
|
+
"windowsapps" in cwd_lower or
|
|
2325
|
+
"appdata\\local\\temp" in cwd_lower or
|
|
2326
|
+
not os.path.exists(os.path.join(cwd, "main.py"))
|
|
2327
|
+
)
|
|
2328
|
+
if is_abnormal_cwd:
|
|
2329
|
+
lines.append(f"\033[91m 工作目录: {cwd} ⚠️ 异常路径{R}")
|
|
2330
|
+
else:
|
|
2331
|
+
lines.append(f"{G} 工作目录: {cwd}{R}")
|
|
1695
2332
|
if len(instances) > 1:
|
|
1696
2333
|
lines.append(f"{G} 所有实例:{R}")
|
|
1697
2334
|
for i in instances:
|
|
@@ -1760,6 +2397,30 @@ class Launcher:
|
|
|
1760
2397
|
except Exception:
|
|
1761
2398
|
pass
|
|
1762
2399
|
|
|
2400
|
+
def _record_launcher_startup(self):
    """Append a ``launcher_startup`` record to the lifecycle JSONL log.

    The record holds a UTC timestamp, the launcher's PID, working directory,
    argv, the ``KITE_INSTANCE_DIR`` environment value, the process manager's
    instance suffix and the Python executable path. It is written as one JSON
    line to ``self._lifecycle_log``; the parent directory is created if needed.

    Writing is best-effort: any failure while creating the directory,
    serialising or writing is ignored.
    """
    import sys
    from datetime import datetime, timezone

    record = {
        "ts": datetime.now(timezone.utc).isoformat(),
        "event": "launcher_startup",
        "module": "launcher",
        "pid": os.getpid(),
        "cwd": os.getcwd(),
        "argv": sys.argv,
        "instance_dir": os.environ.get("KITE_INSTANCE_DIR", ""),
        "instance_suffix": self.process_manager.instance_suffix,
        "python": sys.executable,
    }

    try:
        log_dir = os.path.dirname(self._lifecycle_log)
        # dirname is "" for a bare file name and os.makedirs("") raises, which
        # used to drop the record silently; only create a real directory.
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        with open(self._lifecycle_log, "a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
    except Exception:
        # Deliberately swallowed: a failed diagnostic write is not allowed to
        # interfere with launcher startup.
        pass
|
|
2423
|
+
|
|
1763
2424
|
|
|
1764
2425
|
|
|
1765
2426
|
def _update_module_md_state(module_dir: str, new_state: str):
|