@agentunion/kite 1.3.1 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +287 -1
- package/cli.js +76 -0
- package/extensions/agents/assistant/entry.py +111 -1
- package/extensions/agents/assistant/server.py +263 -197
- package/extensions/channels/acp_channel/entry.py +111 -1
- package/extensions/channels/acp_channel/module.md +23 -22
- package/extensions/channels/acp_channel/server.py +263 -197
- package/extensions/event_hub_bench/entry.py +107 -1
- package/extensions/services/backup/entry.py +408 -72
- package/extensions/services/backup/module.md +24 -22
- package/extensions/services/model_service/entry.py +255 -71
- package/extensions/services/model_service/module.md +21 -22
- package/extensions/services/watchdog/entry.py +344 -90
- package/extensions/services/watchdog/monitor.py +237 -21
- package/extensions/services/web/WEBSOCKET_STATUS.md +143 -0
- package/extensions/services/web/config_example.py +35 -0
- package/extensions/services/web/config_loader.py +110 -0
- package/extensions/services/web/entry.py +114 -26
- package/extensions/services/web/module.md +35 -24
- package/extensions/services/web/pairing.py +250 -0
- package/extensions/services/web/pairing_codes.jsonl +16 -0
- package/extensions/services/web/relay.py +643 -0
- package/extensions/services/web/relay_config.json5 +67 -0
- package/extensions/services/web/routes/routes_management_ws.py +127 -0
- package/extensions/services/web/routes/routes_rpc.py +89 -0
- package/extensions/services/web/routes/routes_test.py +61 -0
- package/extensions/services/web/server.py +445 -99
- package/extensions/services/web/static/css/style.css +138 -2
- package/extensions/services/web/static/index.html +295 -2
- package/extensions/services/web/static/js/app.js +1579 -5
- package/extensions/services/web/static/js/kernel-client-example.js +161 -0
- package/extensions/services/web/static/js/kernel-client.js +383 -0
- package/extensions/services/web/static/js/registry-tests.js +558 -0
- package/extensions/services/web/static/js/token-manager.js +175 -0
- package/extensions/services/web/static/pairing.html +248 -0
- package/extensions/services/web/static/test_registry.html +262 -0
- package/extensions/services/web/web_config.json5 +29 -0
- package/kernel/entry.py +120 -32
- package/kernel/event_hub.py +159 -16
- package/kernel/module.md +36 -33
- package/kernel/registry_store.py +70 -20
- package/kernel/rpc_router.py +134 -57
- package/kernel/server.py +292 -15
- package/kite_cli/__init__.py +3 -0
- package/kite_cli/__main__.py +5 -0
- package/kite_cli/commands/__init__.py +1 -0
- package/kite_cli/commands/clean.py +101 -0
- package/kite_cli/commands/doctor.py +35 -0
- package/kite_cli/commands/history.py +111 -0
- package/kite_cli/commands/info.py +96 -0
- package/kite_cli/commands/install.py +313 -0
- package/kite_cli/commands/list.py +143 -0
- package/kite_cli/commands/log.py +81 -0
- package/kite_cli/commands/rollback.py +88 -0
- package/kite_cli/commands/search.py +73 -0
- package/kite_cli/commands/uninstall.py +85 -0
- package/kite_cli/commands/update.py +118 -0
- package/kite_cli/core/__init__.py +1 -0
- package/kite_cli/core/checker.py +142 -0
- package/kite_cli/core/dependency.py +229 -0
- package/kite_cli/core/downloader.py +209 -0
- package/kite_cli/core/install_info.py +40 -0
- package/kite_cli/core/tool_installer.py +397 -0
- package/kite_cli/core/validator.py +78 -0
- package/kite_cli/main.py +289 -0
- package/kite_cli/utils/__init__.py +1 -0
- package/kite_cli/utils/i18n.py +252 -0
- package/kite_cli/utils/interactive.py +63 -0
- package/kite_cli/utils/operation_log.py +77 -0
- package/kite_cli/utils/paths.py +34 -0
- package/kite_cli/utils/version.py +308 -0
- package/launcher/count_lines.py +34 -0
- package/launcher/entry.py +905 -166
- package/launcher/logging_setup.py +104 -0
- package/launcher/module.md +37 -37
- package/launcher/process_manager.py +12 -1
- package/package.json +2 -1
- package/scripts/plan_manager.py +315 -0
package/launcher/entry.py
CHANGED
|
@@ -29,10 +29,21 @@ from .process_manager import ProcessManager
|
|
|
29
29
|
IS_WINDOWS = sys.platform == "win32"
|
|
30
30
|
|
|
31
31
|
# Shutdown timeout constants (seconds)
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
32
|
+
|
|
33
|
+
# 不支持优雅关闭
|
|
34
|
+
SHUTDOWN_TIMEOUT_NON_GRACEFUL = 0.3 # SIGTERM 后等待时间
|
|
35
|
+
|
|
36
|
+
# 支持优雅关闭 - 等待响应
|
|
37
|
+
SHUTDOWN_TIMEOUT_ACK = 3.0 # 等待 shutdown.ack
|
|
38
|
+
SHUTDOWN_TIMEOUT_EXITING = 3.0 # 等待 module.exiting
|
|
39
|
+
|
|
40
|
+
# 清理超时(从 exiting 事件获取)
|
|
41
|
+
CLEANUP_TIMEOUT_DEFAULT = 5.0 # 默认清理时间
|
|
42
|
+
CLEANUP_TIMEOUT_MIN = 0.0 # 最小清理时间
|
|
43
|
+
CLEANUP_TIMEOUT_MAX = 30.0 # 最大清理时间
|
|
44
|
+
|
|
45
|
+
# 批量关闭安全网
|
|
46
|
+
SHUTDOWN_TIMEOUT_BULK = 3.0
|
|
36
47
|
|
|
37
48
|
# Core module names that are started in Phase 1 (not Phase 2)
|
|
38
49
|
CORE_MODULE_NAMES = {"kernel"}
|
|
@@ -101,6 +112,9 @@ class Launcher:
|
|
|
101
112
|
# System-wide shutdown flag: prevents Watchdog restart during shutdown
|
|
102
113
|
self._system_shutting_down = False
|
|
103
114
|
|
|
115
|
+
# 模块退出状态跟踪(防止 stopped 事件重复发送)
|
|
116
|
+
self._module_states: dict[str, dict] = {}
|
|
117
|
+
|
|
104
118
|
# Kite stdout message waiters: {waiter_key: (threading.Event, data_dict)}
|
|
105
119
|
# Used by ProcessManager stdout callback (cross-thread)
|
|
106
120
|
self._msg_waiters: dict[str, tuple[threading.Event, dict]] = {}
|
|
@@ -117,6 +131,9 @@ class Launcher:
|
|
|
117
131
|
pass
|
|
118
132
|
os.environ["KITE_INSTANCE_SUFFIX"] = suffix
|
|
119
133
|
|
|
134
|
+
# Record launcher startup
|
|
135
|
+
self._record_launcher_startup()
|
|
136
|
+
|
|
120
137
|
@staticmethod
|
|
121
138
|
def _fmt_elapsed(seconds: float) -> str:
|
|
122
139
|
"""Format elapsed seconds: <1s → 'NNNms', >=1s → 'N.Ns', >=10s → 'NNs'."""
|
|
@@ -303,6 +320,21 @@ class Launcher:
|
|
|
303
320
|
ch = msvcrt.getch()
|
|
304
321
|
if ch == b'\x1b': # ESC - force exit immediately
|
|
305
322
|
print("[launcher] ESC 强制退出")
|
|
323
|
+
# Send module.exiting before exit (best effort)
|
|
324
|
+
try:
|
|
325
|
+
if self._ws and self._loop:
|
|
326
|
+
import concurrent.futures
|
|
327
|
+
fut = asyncio.run_coroutine_threadsafe(
|
|
328
|
+
self._publish_event("module.exiting", {
|
|
329
|
+
"module_id": "launcher",
|
|
330
|
+
"reason": "ESC exit",
|
|
331
|
+
"action": "none",
|
|
332
|
+
}),
|
|
333
|
+
self._loop,
|
|
334
|
+
)
|
|
335
|
+
fut.result(timeout=1) # Wait up to 1s
|
|
336
|
+
except Exception:
|
|
337
|
+
pass
|
|
306
338
|
os._exit(0)
|
|
307
339
|
elif ch in (b'q', b'Q'): # q/Q - graceful shutdown
|
|
308
340
|
self._request_shutdown("收到退出请求,正在关闭...")
|
|
@@ -316,7 +348,7 @@ class Launcher:
|
|
|
316
348
|
"""Full 2-phase startup sequence, then monitor loop."""
|
|
317
349
|
self._loop = asyncio.get_running_loop()
|
|
318
350
|
self._ws_connected = asyncio.Event() # Create event in async context
|
|
319
|
-
|
|
351
|
+
self._t_start = time.monotonic() # Store for launcher ready_time calculation
|
|
320
352
|
self._start_unix = time.time()
|
|
321
353
|
phase_times = {}
|
|
322
354
|
G = "\033[32m"
|
|
@@ -396,7 +428,7 @@ class Launcher:
|
|
|
396
428
|
)
|
|
397
429
|
|
|
398
430
|
# ── Startup report ──
|
|
399
|
-
total_time = time.monotonic() -
|
|
431
|
+
total_time = time.monotonic() - self._t_start
|
|
400
432
|
await self._print_startup_report(total_time, phase_times,
|
|
401
433
|
global_instances=global_instances,
|
|
402
434
|
cleaned_stats=cleaned_stats)
|
|
@@ -557,24 +589,48 @@ class Launcher:
|
|
|
557
589
|
# ── Kernel WebSocket connection (JSON-RPC 2.0) ──
|
|
558
590
|
|
|
559
591
|
async def _ws_loop(self):
|
|
560
|
-
"""Connect to Kernel, reconnect on failure."""
|
|
592
|
+
"""Connect to Kernel, reconnect on failure with exponential backoff."""
|
|
593
|
+
retry_delay = 0.3
|
|
594
|
+
max_delay = 5.0
|
|
595
|
+
max_retries = 10
|
|
596
|
+
attempt = 0
|
|
561
597
|
while not self._thread_shutdown.is_set():
|
|
562
598
|
try:
|
|
563
599
|
await self._ws_connect()
|
|
600
|
+
retry_delay = 0.3 # Reset on successful connection
|
|
601
|
+
attempt = 0
|
|
564
602
|
except asyncio.CancelledError:
|
|
565
603
|
return
|
|
566
604
|
except Exception as e:
|
|
567
605
|
if not self._system_shutting_down:
|
|
568
|
-
|
|
606
|
+
attempt += 1
|
|
607
|
+
# Check for auth failure (don't retry)
|
|
608
|
+
if hasattr(e, 'rcvd') and e.rcvd is not None:
|
|
609
|
+
code = e.rcvd.code if hasattr(e.rcvd, 'code') else 0
|
|
610
|
+
if code in (4001, 4003):
|
|
611
|
+
print(f"[launcher] Kernel 认证失败 (code {code}),退出")
|
|
612
|
+
sys.exit(1)
|
|
613
|
+
if attempt >= max_retries:
|
|
614
|
+
print(f"[launcher] Kernel 重连失败 {max_retries} 次,退出")
|
|
615
|
+
sys.exit(1)
|
|
616
|
+
print(f"[launcher] Kernel 连接错误: {e}, {retry_delay:.1f}s 后重试 ({attempt}/{max_retries})")
|
|
617
|
+
if attempt == 5:
|
|
618
|
+
print(f"\033[33m[launcher] 提示: 已连续 {attempt} 次无法连接 Kernel (端口 {self.kernel_port})")
|
|
619
|
+
if self.kernel_port < 1024:
|
|
620
|
+
print(f"[launcher] ⚠ 端口 {self.kernel_port} 异常偏低,可能是 Kernel 端口绑定失败或配置错误")
|
|
621
|
+
print(f"[launcher] 请检查: 1) Kernel 进程是否存活 2) kernel/module.md 中 preferred_port 配置是否正确\033[0m")
|
|
569
622
|
self._ws = None
|
|
570
|
-
|
|
623
|
+
if self._thread_shutdown.is_set():
|
|
624
|
+
return
|
|
625
|
+
await asyncio.sleep(retry_delay)
|
|
626
|
+
retry_delay = min(retry_delay * 2, max_delay)
|
|
571
627
|
|
|
572
628
|
async def _ws_connect(self):
|
|
573
629
|
"""Single WebSocket session with JSON-RPC 2.0 protocol."""
|
|
574
630
|
launcher_token = self._module_tokens.get("launcher", "")
|
|
575
631
|
ws_url = f"ws://127.0.0.1:{self.kernel_port}/ws?token={launcher_token}&id=launcher"
|
|
576
632
|
t_ws_connect = time.monotonic()
|
|
577
|
-
async with websockets.connect(ws_url, open_timeout=3, ping_interval=
|
|
633
|
+
async with websockets.connect(ws_url, open_timeout=3, ping_interval=20, ping_timeout=20, close_timeout=10) as ws:
|
|
578
634
|
self._ws = ws
|
|
579
635
|
_ws_s = time.monotonic() - t_ws_connect
|
|
580
636
|
print(f"[launcher] 已连接到 Kernel ({self._fmt_elapsed(_ws_s)})")
|
|
@@ -598,15 +654,49 @@ class Launcher:
|
|
|
598
654
|
await self._rpc_call(ws, "registry.register", {
|
|
599
655
|
"module_id": "launcher",
|
|
600
656
|
"module_type": "infrastructure",
|
|
657
|
+
"tools": {
|
|
658
|
+
"rpc": {
|
|
659
|
+
"launcher": {
|
|
660
|
+
"list_modules": {"method": "list_modules", "description": "列出所有模块"},
|
|
661
|
+
"start_module": {"method": "start_module", "description": "启动模块"},
|
|
662
|
+
"stop_module": {"method": "stop_module", "description": "停止模块"},
|
|
663
|
+
"restart_module": {"method": "restart_module", "description": "重启模块"},
|
|
664
|
+
"restart_launcher": {"method": "restart_launcher", "description": "重启 Launcher"},
|
|
665
|
+
"rescan": {"method": "rescan", "description": "重新扫描模块"},
|
|
666
|
+
"shutdown": {"method": "shutdown", "description": "关闭系统"},
|
|
667
|
+
},
|
|
668
|
+
"module": {
|
|
669
|
+
"config": {
|
|
670
|
+
"get": {"method": "get_module_config", "description": "获取模块配置"},
|
|
671
|
+
"update": {"method": "update_module_config", "description": "更新模块配置"},
|
|
672
|
+
"reset": {"method": "reset_module_config", "description": "恢复默认配置"},
|
|
673
|
+
}
|
|
674
|
+
}
|
|
675
|
+
}
|
|
676
|
+
},
|
|
601
677
|
"events_publish": {
|
|
602
|
-
"
|
|
603
|
-
|
|
604
|
-
|
|
678
|
+
"system": {
|
|
679
|
+
"ready": {"description": "系统启动完成"}
|
|
680
|
+
},
|
|
681
|
+
"module": {
|
|
682
|
+
"starting": {"description": "模块启动中"},
|
|
683
|
+
"started": {"description": "模块已启动"},
|
|
684
|
+
"ready": {"description": "模块就绪"},
|
|
685
|
+
"stopped": {"description": "模块已停止"},
|
|
686
|
+
"exiting": {"description": "模块退出中"},
|
|
687
|
+
"shutdown": {"description": "模块关闭"}
|
|
688
|
+
}
|
|
605
689
|
},
|
|
606
690
|
"events_subscribe": [">"],
|
|
607
691
|
})
|
|
608
692
|
print("[launcher] 已注册到 Kernel")
|
|
609
693
|
|
|
694
|
+
# Publish module.ready for Launcher itself (every reconnect)
|
|
695
|
+
await self._publish_event("module.ready", {
|
|
696
|
+
"module_id": "launcher",
|
|
697
|
+
"graceful_shutdown": True,
|
|
698
|
+
})
|
|
699
|
+
|
|
610
700
|
# Signal that connection is ready (after subscription and registration)
|
|
611
701
|
if self._ws_connected:
|
|
612
702
|
self._ws_connected.set()
|
|
@@ -618,7 +708,14 @@ class Launcher:
|
|
|
618
708
|
raise
|
|
619
709
|
|
|
620
710
|
async def _ws_receiver(self, ws):
|
|
621
|
-
"""Receive loop: classify incoming messages.
|
|
711
|
+
"""Receive loop: classify incoming messages.
|
|
712
|
+
|
|
713
|
+
CRITICAL: RPC 死锁防范
|
|
714
|
+
- 入站 RPC 请求必须用 create_task() 异步执行,不可 await
|
|
715
|
+
- 原因:如果 handler 内部调用 rpc_call() 发出站请求,出站响应需要本接收循环来分发
|
|
716
|
+
- 如果接收循环被 await handler 阻塞,出站响应永远收不到 → 超时死锁
|
|
717
|
+
- 事件通知和 RPC 响应可以同步处理(它们不会反向调用 rpc_call)
|
|
718
|
+
"""
|
|
622
719
|
try:
|
|
623
720
|
async for raw in ws:
|
|
624
721
|
try:
|
|
@@ -636,7 +733,8 @@ class Launcher:
|
|
|
636
733
|
await self._handle_event_notification(msg)
|
|
637
734
|
elif has_method and has_id:
|
|
638
735
|
# Incoming RPC request (forwarded by Kernel)
|
|
639
|
-
|
|
736
|
+
# Run in background so receiver loop continues processing responses
|
|
737
|
+
asyncio.create_task(self._handle_rpc_request(ws, msg))
|
|
640
738
|
elif has_id and (has_result or has_error):
|
|
641
739
|
# RPC response (to our own call)
|
|
642
740
|
self._handle_rpc_response(msg)
|
|
@@ -688,6 +786,7 @@ class Launcher:
|
|
|
688
786
|
# Trigger event waiters
|
|
689
787
|
module_id = data.get("module_id", "")
|
|
690
788
|
waiter_key = f"{event}:{module_id}"
|
|
789
|
+
|
|
691
790
|
waiter = self._event_waiters.get(waiter_key)
|
|
692
791
|
if waiter:
|
|
693
792
|
waiter[1].update(data)
|
|
@@ -702,6 +801,42 @@ class Launcher:
|
|
|
702
801
|
ready_waiter[1]["_exited"] = True
|
|
703
802
|
ready_waiter[0].set()
|
|
704
803
|
|
|
804
|
+
# 处理主动退出场景(没有 shutdown 的情况)
|
|
805
|
+
if module_id not in self._module_states:
|
|
806
|
+
self._init_module_state(module_id)
|
|
807
|
+
state = self._module_states[module_id]
|
|
808
|
+
|
|
809
|
+
if not state.get("shutdown_sent"):
|
|
810
|
+
# 主动退出:记录信息
|
|
811
|
+
if not state.get("exiting_received"):
|
|
812
|
+
state["exiting_received"] = True
|
|
813
|
+
state["reason"] = data.get("reason", "active_exit")
|
|
814
|
+
state["restart"] = data.get("restart", False)
|
|
815
|
+
cleanup_timeout = data.get("cleanup_timeout", CLEANUP_TIMEOUT_DEFAULT)
|
|
816
|
+
cleanup_timeout = max(CLEANUP_TIMEOUT_MIN, min(cleanup_timeout, CLEANUP_TIMEOUT_MAX))
|
|
817
|
+
state["cleanup_timeout"] = cleanup_timeout
|
|
818
|
+
|
|
819
|
+
# 启动清理超时任务
|
|
820
|
+
async def cleanup_timeout_handler():
|
|
821
|
+
await asyncio.sleep(state["cleanup_timeout"])
|
|
822
|
+
if not state.get("stopped_sent"):
|
|
823
|
+
state["stopped_sent"] = True
|
|
824
|
+
self._kill_process(module_id)
|
|
825
|
+
|
|
826
|
+
# 发送 stopped 事件
|
|
827
|
+
await self._publish_event("module.stopped", {
|
|
828
|
+
"module_id": module_id,
|
|
829
|
+
"exit_code": -1, # 超时强制终止,退出码未知
|
|
830
|
+
"exit_type": "timeout",
|
|
831
|
+
"reason": state.get("reason", "cleanup_timeout"),
|
|
832
|
+
"restart": state.get("restart", False),
|
|
833
|
+
"ready_received": False,
|
|
834
|
+
})
|
|
835
|
+
|
|
836
|
+
self._log_lifecycle("stopped", module_id, reason=state["reason"])
|
|
837
|
+
|
|
838
|
+
state["cleanup_task"] = asyncio.create_task(cleanup_timeout_handler())
|
|
839
|
+
|
|
705
840
|
# module.crash → print red crash summary
|
|
706
841
|
if event == "module.crash" and module_id:
|
|
707
842
|
RED = "\033[91m"
|
|
@@ -716,6 +851,43 @@ class Launcher:
|
|
|
716
851
|
)
|
|
717
852
|
print(f"[launcher] 崩溃日志: {crash_log}")
|
|
718
853
|
|
|
854
|
+
# pairing.status → handle all pairing flow events
|
|
855
|
+
if event == "pairing.status":
|
|
856
|
+
GREEN = "\033[92m"
|
|
857
|
+
RED = "\033[91m"
|
|
858
|
+
RESET = "\033[0m"
|
|
859
|
+
|
|
860
|
+
step = data.get("step", "")
|
|
861
|
+
success = data.get("success", True)
|
|
862
|
+
|
|
863
|
+
if step == "code_generated":
|
|
864
|
+
code = data.get("code", "")
|
|
865
|
+
expires_in = data.get("expires_in", 300)
|
|
866
|
+
if code:
|
|
867
|
+
print(f"[launcher] {GREEN}配对码: {code}{RESET}")
|
|
868
|
+
print(f"[launcher] {GREEN}有效期: {expires_in} 秒{RESET}")
|
|
869
|
+
print(f"[launcher] {GREEN}访问 Web 界面时使用此配对码进行配对{RESET}")
|
|
870
|
+
|
|
871
|
+
elif step == "pairing":
|
|
872
|
+
if success:
|
|
873
|
+
print(f"[launcher] {GREEN}正在配对...{RESET}")
|
|
874
|
+
else:
|
|
875
|
+
reason = data.get("reason", "Unknown error")
|
|
876
|
+
print(f"[launcher] {RED}✗ 配对失败: {reason}{RESET}")
|
|
877
|
+
|
|
878
|
+
elif step == "completed":
|
|
879
|
+
if success:
|
|
880
|
+
module_id = data.get("module_id", "")
|
|
881
|
+
role = data.get("role", "")
|
|
882
|
+
print(f"[launcher] {GREEN}✓ 配对成功!{RESET}")
|
|
883
|
+
print(f"[launcher] {GREEN} 模块 ID: {module_id}{RESET}")
|
|
884
|
+
print(f"[launcher] {GREEN} 角色: {role}{RESET}")
|
|
885
|
+
else:
|
|
886
|
+
reason = data.get("reason", "Unknown error")
|
|
887
|
+
print(f"[launcher] {RED}✗ 配对失败: {reason}{RESET}")
|
|
888
|
+
|
|
889
|
+
return
|
|
890
|
+
|
|
719
891
|
# Only log system events (module.*, watchdog.*) to avoid flooding
|
|
720
892
|
if not (event.startswith("module.") or event.startswith("watchdog.")):
|
|
721
893
|
return
|
|
@@ -740,12 +912,16 @@ class Launcher:
|
|
|
740
912
|
params = msg.get("params", {})
|
|
741
913
|
|
|
742
914
|
handlers = {
|
|
743
|
-
"list_modules":
|
|
744
|
-
"start_module":
|
|
745
|
-
"stop_module":
|
|
746
|
-
"restart_module":
|
|
747
|
-
"
|
|
748
|
-
"
|
|
915
|
+
"list_modules": self._rpc_list_modules,
|
|
916
|
+
"start_module": self._rpc_start_module,
|
|
917
|
+
"stop_module": self._rpc_stop_module,
|
|
918
|
+
"restart_module": self._rpc_restart_module,
|
|
919
|
+
"restart_launcher": self._rpc_restart_launcher,
|
|
920
|
+
"rescan": self._rpc_rescan,
|
|
921
|
+
"shutdown": self._rpc_shutdown,
|
|
922
|
+
"get_module_config": self._rpc_get_module_config,
|
|
923
|
+
"update_module_config": self._rpc_update_module_config,
|
|
924
|
+
"reset_module_config": self._rpc_reset_module_config,
|
|
749
925
|
}
|
|
750
926
|
handler = handlers.get(method)
|
|
751
927
|
if handler:
|
|
@@ -775,11 +951,14 @@ class Launcher:
|
|
|
775
951
|
"name": name,
|
|
776
952
|
"display_name": info.display_name,
|
|
777
953
|
"type": info.type,
|
|
778
|
-
"
|
|
954
|
+
"state": info.state, # 改名为 state(与 /api/modules 一致)
|
|
955
|
+
"version": info.version,
|
|
956
|
+
"runtime": info.runtime,
|
|
957
|
+
"preferred_port": info.preferred_port,
|
|
958
|
+
"monitor": info.monitor,
|
|
779
959
|
"desired_state": self._desired_states.get(name, "stopped"),
|
|
780
960
|
"actual_state": f"running({rec.pid})" if running and rec else "stopped",
|
|
781
961
|
"pid": rec.pid if running and rec else None,
|
|
782
|
-
"monitor": info.monitor,
|
|
783
962
|
})
|
|
784
963
|
return {"modules": result}
|
|
785
964
|
|
|
@@ -871,37 +1050,256 @@ class Launcher:
|
|
|
871
1050
|
self._request_shutdown(f"RPC shutdown request: {reason}")
|
|
872
1051
|
return {"status": "shutting_down", "reason": reason}
|
|
873
1052
|
|
|
1053
|
+
async def _rpc_get_module_config(self, params: dict) -> dict:
|
|
1054
|
+
"""获取指定模块的配置(通用降级方案)"""
|
|
1055
|
+
import re
|
|
1056
|
+
import yaml
|
|
1057
|
+
from pathlib import Path
|
|
1058
|
+
|
|
1059
|
+
module_name = params.get("module_name")
|
|
1060
|
+
if not module_name:
|
|
1061
|
+
raise ValueError("module_name required")
|
|
1062
|
+
|
|
1063
|
+
# 查找模块信息
|
|
1064
|
+
info = self.modules.get(module_name)
|
|
1065
|
+
if not info:
|
|
1066
|
+
raise RuntimeError(f"Module '{module_name}' not found")
|
|
1067
|
+
|
|
1068
|
+
# 读取 module.md
|
|
1069
|
+
md_path = Path(info.module_dir) / "module.md"
|
|
1070
|
+
if not md_path.exists():
|
|
1071
|
+
raise RuntimeError(f"module.md not found for '{module_name}'")
|
|
1072
|
+
|
|
1073
|
+
text = md_path.read_text(encoding="utf-8")
|
|
1074
|
+
m = re.match(r'^---\s*\n(.*?)\n---\s*\n?(.*)', text, re.DOTALL)
|
|
1075
|
+
if not m:
|
|
1076
|
+
frontmatter = {}
|
|
1077
|
+
else:
|
|
1078
|
+
frontmatter = yaml.safe_load(m.group(1)) or {}
|
|
1079
|
+
|
|
1080
|
+
# 读取 config.yaml(如果存在)
|
|
1081
|
+
config_path = Path(info.module_dir) / "config.yaml"
|
|
1082
|
+
config = None
|
|
1083
|
+
if config_path.exists():
|
|
1084
|
+
config = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
|
|
1085
|
+
|
|
1086
|
+
return {
|
|
1087
|
+
"name": frontmatter.get("name", module_name),
|
|
1088
|
+
"display_name": frontmatter.get("display_name", ""),
|
|
1089
|
+
"type": frontmatter.get("type", ""),
|
|
1090
|
+
"state": frontmatter.get("state", "enabled"),
|
|
1091
|
+
"version": frontmatter.get("version", ""),
|
|
1092
|
+
"runtime": frontmatter.get("runtime", ""),
|
|
1093
|
+
"entry": frontmatter.get("entry", ""),
|
|
1094
|
+
"preferred_port": frontmatter.get("preferred_port"),
|
|
1095
|
+
"advertise_ip": frontmatter.get("advertise_ip"),
|
|
1096
|
+
"monitor": frontmatter.get("monitor"),
|
|
1097
|
+
"events": frontmatter.get("events"),
|
|
1098
|
+
"subscriptions": frontmatter.get("subscriptions"),
|
|
1099
|
+
"depends_on": frontmatter.get("depends_on"),
|
|
1100
|
+
"source_path": str(info.module_dir), # 添加模块路径
|
|
1101
|
+
"has_config": config is not None,
|
|
1102
|
+
"config": config,
|
|
1103
|
+
}
|
|
1104
|
+
|
|
1105
|
+
async def _rpc_update_module_config(self, params: dict) -> dict:
|
|
1106
|
+
"""更新指定模块的配置(通用降级方案)"""
|
|
1107
|
+
import yaml
|
|
1108
|
+
from pathlib import Path
|
|
1109
|
+
|
|
1110
|
+
module_name = params.get("module_name")
|
|
1111
|
+
metadata = params.get("metadata", {})
|
|
1112
|
+
config = params.get("config", {})
|
|
1113
|
+
|
|
1114
|
+
if not module_name:
|
|
1115
|
+
raise ValueError("module_name required")
|
|
1116
|
+
|
|
1117
|
+
info = self.modules.get(module_name)
|
|
1118
|
+
if not info:
|
|
1119
|
+
raise RuntimeError(f"Module '{module_name}' not found")
|
|
1120
|
+
|
|
1121
|
+
md_path = Path(info.module_dir) / "module.md"
|
|
1122
|
+
if not md_path.exists():
|
|
1123
|
+
raise RuntimeError(f"module.md not found for '{module_name}'")
|
|
1124
|
+
|
|
1125
|
+
# 更新 module.md frontmatter
|
|
1126
|
+
if metadata:
|
|
1127
|
+
frontmatter, body = _parse_frontmatter(md_path.read_text(encoding="utf-8"))
|
|
1128
|
+
for key, value in metadata.items():
|
|
1129
|
+
frontmatter[key] = value
|
|
1130
|
+
fm_str = yaml.dump(frontmatter, allow_unicode=True, sort_keys=False, default_flow_style=False).rstrip()
|
|
1131
|
+
content = f"---\n{fm_str}\n---\n{body}"
|
|
1132
|
+
md_path.write_text(content, encoding="utf-8")
|
|
1133
|
+
|
|
1134
|
+
# 更新 config.yaml
|
|
1135
|
+
if config:
|
|
1136
|
+
config_path = Path(info.module_dir) / "config.yaml"
|
|
1137
|
+
existing = {}
|
|
1138
|
+
if config_path.exists():
|
|
1139
|
+
existing = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
|
|
1140
|
+
# Deep merge
|
|
1141
|
+
self._deep_merge(existing, config)
|
|
1142
|
+
config_path.write_text(
|
|
1143
|
+
yaml.dump(existing, allow_unicode=True, sort_keys=False, default_flow_style=False),
|
|
1144
|
+
encoding="utf-8"
|
|
1145
|
+
)
|
|
1146
|
+
|
|
1147
|
+
# 重新扫描以更新缓存
|
|
1148
|
+
await self._rescan_modules()
|
|
1149
|
+
|
|
1150
|
+
# 返回更新后的配置
|
|
1151
|
+
return await self._rpc_get_module_config({"module_name": module_name})
|
|
1152
|
+
|
|
1153
|
+
async def _rpc_reset_module_config(self, params: dict) -> dict:
|
|
1154
|
+
"""恢复指定模块的默认配置(通用降级方案)"""
|
|
1155
|
+
import yaml
|
|
1156
|
+
from pathlib import Path
|
|
1157
|
+
|
|
1158
|
+
module_name = params.get("module_name")
|
|
1159
|
+
fields = params.get("fields", [])
|
|
1160
|
+
reset_all = params.get("all", False)
|
|
1161
|
+
|
|
1162
|
+
if not module_name:
|
|
1163
|
+
raise ValueError("module_name required")
|
|
1164
|
+
|
|
1165
|
+
info = self.modules.get(module_name)
|
|
1166
|
+
if not info:
|
|
1167
|
+
raise RuntimeError(f"Module '{module_name}' not found")
|
|
1168
|
+
|
|
1169
|
+
md_path = Path(info.module_dir) / "module.md"
|
|
1170
|
+
if not md_path.exists():
|
|
1171
|
+
raise RuntimeError(f"module.md not found for '{module_name}'")
|
|
1172
|
+
|
|
1173
|
+
# 默认值定义(通用)
|
|
1174
|
+
defaults = {
|
|
1175
|
+
"state": "enabled",
|
|
1176
|
+
"monitor": True,
|
|
1177
|
+
}
|
|
1178
|
+
|
|
1179
|
+
frontmatter, body = _parse_frontmatter(md_path.read_text(encoding="utf-8"))
|
|
1180
|
+
|
|
1181
|
+
if reset_all:
|
|
1182
|
+
for key, value in defaults.items():
|
|
1183
|
+
frontmatter[key] = value
|
|
1184
|
+
else:
|
|
1185
|
+
for field in fields:
|
|
1186
|
+
if field in defaults:
|
|
1187
|
+
frontmatter[field] = defaults[field]
|
|
1188
|
+
elif field == "preferred_port":
|
|
1189
|
+
frontmatter.pop(field, None) # 恢复为 null
|
|
1190
|
+
elif field == "advertise_ip":
|
|
1191
|
+
frontmatter[field] = "127.0.0.1"
|
|
1192
|
+
|
|
1193
|
+
fm_str = yaml.dump(frontmatter, allow_unicode=True, sort_keys=False, default_flow_style=False).rstrip()
|
|
1194
|
+
content = f"---\n{fm_str}\n---\n{body}"
|
|
1195
|
+
md_path.write_text(content, encoding="utf-8")
|
|
1196
|
+
|
|
1197
|
+
# 重新扫描以更新缓存
|
|
1198
|
+
await self._rescan_modules()
|
|
1199
|
+
|
|
1200
|
+
return await self._rpc_get_module_config({"module_name": module_name})
|
|
1201
|
+
|
|
1202
|
+
@staticmethod
|
|
1203
|
+
def _deep_merge(base: dict, overlay: dict) -> dict:
|
|
1204
|
+
"""递归合并字典"""
|
|
1205
|
+
for k, v in overlay.items():
|
|
1206
|
+
if k in base and isinstance(base[k], dict) and isinstance(v, dict):
|
|
1207
|
+
Launcher._deep_merge(base[k], v)
|
|
1208
|
+
else:
|
|
1209
|
+
base[k] = v
|
|
1210
|
+
return base
|
|
1211
|
+
|
|
1212
|
+
|
|
1213
|
+
async def _rpc_restart_launcher(self, params: dict) -> dict:
|
|
1214
|
+
"""Restart Launcher process via Watchdog.
|
|
1215
|
+
|
|
1216
|
+
Simply notify watchdog and exit. Watchdog will start a new instance.
|
|
1217
|
+
|
|
1218
|
+
Args:
|
|
1219
|
+
params: {
|
|
1220
|
+
"reason": str (optional) - Restart reason
|
|
1221
|
+
}
|
|
1222
|
+
|
|
1223
|
+
Returns:
|
|
1224
|
+
{"status": "restarting", "reason": str}
|
|
1225
|
+
or {"error": "watchdog offline"}
|
|
1226
|
+
"""
|
|
1227
|
+
reason = params.get("reason", "user_request")
|
|
1228
|
+
O = "\033[33m" # orange/yellow
|
|
1229
|
+
R = "\033[0m" # reset
|
|
1230
|
+
print(f"{O}[launcher] 收到 Launcher 重启请求{R}")
|
|
1231
|
+
print(f"[launcher] 原因: {reason}")
|
|
1232
|
+
|
|
1233
|
+
# Check if watchdog is running
|
|
1234
|
+
watchdog_running = self.process_manager.is_running("watchdog")
|
|
1235
|
+
print(f"[launcher] 检查 watchdog 状态: {'running' if watchdog_running else 'stopped'}")
|
|
1236
|
+
|
|
1237
|
+
if not watchdog_running:
|
|
1238
|
+
error_msg = "watchdog 未运行, 无法重启"
|
|
1239
|
+
print(f"[launcher] ❌ {error_msg}")
|
|
1240
|
+
return {"error": error_msg}
|
|
1241
|
+
|
|
1242
|
+
print(f"[launcher] ✓ watchdog 状态正常,准备重启流程")
|
|
1243
|
+
|
|
1244
|
+
# Schedule restart in background (don't block RPC response)
|
|
1245
|
+
async def _do_restart():
|
|
1246
|
+
await asyncio.sleep(0.3) # 确保 RPC 响应已发送
|
|
1247
|
+
|
|
1248
|
+
print(f"[launcher] 发送 module.exiting 事件给 watchdog...")
|
|
1249
|
+
|
|
1250
|
+
# Collect startup info for watchdog to restart with same environment
|
|
1251
|
+
startup_info = {
|
|
1252
|
+
"python": sys.executable,
|
|
1253
|
+
"argv": sys.argv,
|
|
1254
|
+
"cwd": os.getcwd(),
|
|
1255
|
+
"env": dict(os.environ), # 所有环境变量
|
|
1256
|
+
}
|
|
1257
|
+
|
|
1258
|
+
# Notify watchdog: this is a planned restart, not a crash
|
|
1259
|
+
await self._publish_event("module.exiting", {
|
|
1260
|
+
"module_id": "launcher",
|
|
1261
|
+
"action": "restart_launcher",
|
|
1262
|
+
"reason": reason,
|
|
1263
|
+
"startup_info": startup_info,
|
|
1264
|
+
})
|
|
1265
|
+
|
|
1266
|
+
print(f"[launcher] 已通知 watchdog 计划内重启")
|
|
1267
|
+
print(f"[launcher] 退出进程,等待 watchdog 重启")
|
|
1268
|
+
print(f"[launcher] 原因: {reason}")
|
|
1269
|
+
|
|
1270
|
+
os._exit(0)
|
|
1271
|
+
|
|
1272
|
+
asyncio.create_task(_do_restart())
|
|
1273
|
+
|
|
1274
|
+
return {"status": "restarting", "reason": reason}
|
|
1275
|
+
|
|
874
1276
|
# ── Event publishing via RPC ──
|
|
875
1277
|
|
|
876
1278
|
async def _publish_event(self, event_type: str, data: dict):
|
|
877
1279
|
"""Publish an event via RPC event.publish through Kernel WS."""
|
|
878
1280
|
if not self._ws:
|
|
879
1281
|
return
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
"id": str(uuid.uuid4()),
|
|
883
|
-
"method": "event.publish",
|
|
884
|
-
"params": {
|
|
1282
|
+
try:
|
|
1283
|
+
await self._rpc_call(self._ws, "event.publish", {
|
|
885
1284
|
"event_id": str(uuid.uuid4()),
|
|
886
1285
|
"event": event_type,
|
|
887
1286
|
"data": data,
|
|
888
|
-
},
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
async def _send():
|
|
892
|
-
try:
|
|
893
|
-
await self._ws.send(msg)
|
|
894
|
-
except Exception as e:
|
|
895
|
-
print(f"[launcher] 发布事件失败: {e}")
|
|
896
|
-
|
|
897
|
-
asyncio.create_task(_send())
|
|
1287
|
+
}, timeout=2.0)
|
|
1288
|
+
except Exception as e:
|
|
1289
|
+
print(f"[launcher] 发布事件失败 ({event_type}): {e}")
|
|
898
1290
|
|
|
899
1291
|
async def _wait_event(self, event_type: str, module_id: str, timeout: float) -> dict | None:
|
|
900
1292
|
"""Wait for a specific event from a module. Returns data dict or None on timeout."""
|
|
901
1293
|
key = f"{event_type}:{module_id}"
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
self._event_waiters
|
|
1294
|
+
# Reuse existing waiter if one was pre-registered (e.g. in _ws_connect)
|
|
1295
|
+
# This prevents a race where the event arrives before this method is called
|
|
1296
|
+
existing = self._event_waiters.get(key)
|
|
1297
|
+
if existing:
|
|
1298
|
+
evt, data = existing
|
|
1299
|
+
else:
|
|
1300
|
+
evt = asyncio.Event()
|
|
1301
|
+
data = {}
|
|
1302
|
+
self._event_waiters[key] = (evt, data)
|
|
905
1303
|
try:
|
|
906
1304
|
await asyncio.wait_for(evt.wait(), timeout=timeout)
|
|
907
1305
|
return data
|
|
@@ -910,170 +1308,444 @@ class Launcher:
|
|
|
910
1308
|
finally:
|
|
911
1309
|
self._event_waiters.pop(key, None)
|
|
912
1310
|
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
"""
|
|
917
|
-
self.
|
|
1311
|
+
# ── 退出机制辅助方法 ──
|
|
1312
|
+
|
|
1313
|
+
def _init_module_state(self, name: str):
|
|
1314
|
+
"""初始化模块状态跟踪字典"""
|
|
1315
|
+
self._module_states[name] = {
|
|
1316
|
+
"shutdown_sent": False,
|
|
1317
|
+
"ack_received": False,
|
|
1318
|
+
"exiting_received": False,
|
|
1319
|
+
"ready_received": False,
|
|
1320
|
+
"stopped_sent": False,
|
|
1321
|
+
"exit_type": None, # "graceful" | "non_graceful" | "active"
|
|
1322
|
+
"reason": None,
|
|
1323
|
+
"restart": None,
|
|
1324
|
+
"cleanup_timeout": None,
|
|
1325
|
+
"cleanup_task": None,
|
|
1326
|
+
}
|
|
918
1327
|
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
1328
|
+
def _kill_process(self, name: str):
|
|
1329
|
+
"""统一的进程杀死方法"""
|
|
1330
|
+
record = self.process_manager.get_record(name)
|
|
1331
|
+
if record and record.proc and record.proc.poll() is None:
|
|
1332
|
+
print(f"[launcher] 强制终止 {name} (PID {record.proc.pid})")
|
|
1333
|
+
self.process_manager.kill_process(name)
|
|
1334
|
+
elif record:
|
|
1335
|
+
# 进程已经退出,只是清理记录
|
|
1336
|
+
pass
|
|
1337
|
+
else:
|
|
1338
|
+
# 没有记录,可能已经被清理
|
|
1339
|
+
pass
|
|
1340
|
+
|
|
1341
|
+
def _determine_exit_type(self, name: str) -> str:
|
|
1342
|
+
"""判断退出类型: graceful | non_graceful | active"""
|
|
1343
|
+
state = self._module_states.get(name, {})
|
|
1344
|
+
if state.get("exiting_received"):
|
|
1345
|
+
return "graceful"
|
|
1346
|
+
elif state.get("shutdown_sent"):
|
|
1347
|
+
return "non_graceful"
|
|
1348
|
+
else:
|
|
1349
|
+
return "active"
|
|
1350
|
+
|
|
1351
|
+
def _resolve_reason(self, name: str) -> str:
    """Return the exit reason recorded for module ``name``.

    Falls back to ``"unknown"`` when no state is tracked for the module or
    the stored reason is empty.
    """
    recorded = self._module_states.get(name, {}).get("reason")
    return recorded if recorded else "unknown"
|
|
1357
|
+
|
|
1358
|
+
def _resolve_restart(self, name: str) -> bool:
    """Decide whether module ``name`` should be restarted after it exits.

    A restart decision already stored in the module's state wins. Without
    one, a module that exited on its own ("active" exit type) is not
    restarted; otherwise the module is restarted only when its desired
    state is ``"running"``.
    """
    explicit = self._module_states.get(name, {}).get("restart")
    if explicit is not None:
        return explicit
    # No stored decision: apply the default policy.
    left_on_its_own = self._determine_exit_type(name) == "active"
    if left_on_its_own:
        return False
    return self._desired_states.get(name) == "running"
|
|
1367
|
+
|
|
1368
|
+
async def _send_stopped_event(self, name: str, exit_code: int):
    """Publish ``module.stopped`` for module ``name`` at most once.

    If the module's tracked state already has ``stopped_sent`` set, nothing
    is published. Otherwise the flag is set first (when the module has a
    tracked state) and the event is published with the exit type, reason
    and restart decision resolved from that state.

    Args:
        name: Module identifier, sent as ``module_id``.
        exit_code: Process exit code to report.
    """
    state = self._module_states.get(name, {})
    already_reported = state.get("stopped_sent")
    if already_reported:
        return

    # Flip the flag before the first await so a concurrent caller sees it.
    if name in self._module_states:
        self._module_states[name]["stopped_sent"] = True

    payload = {
        "module_id": name,
        "exit_code": exit_code,
        "exit_type": self._determine_exit_type(name),
        "reason": self._resolve_reason(name),
        "restart": self._resolve_restart(name),
        "ready_received": state.get("ready_received", False),
    }
    await self._publish_event("module.stopped", payload)
|
|
942
1390
|
|
|
943
|
-
|
|
1391
|
+
# ── 优雅关闭 ──
|
|
1392
|
+
|
|
1393
|
+
async def _graceful_stop(self, name: str, reason: str = "stop_requested", timeout: float = 10):
    """Stop a single module, using the cooperative handshake when supported.

    Flow:
      1. Reset the module's shutdown-tracking state.
      2. Modules not flagged in ``self._graceful_modules`` are stopped
         directly through ``process_manager.stop_module``.
      3. Graceful modules receive ``module.shutdown``; the launcher then
         waits in turn for the ack, exiting and ready events, each with its
         own timeout. A missing event ends the wait and the process is
         force-killed with exit type ``"timeout"``; a ready event leads to
         a kill reported as ``"graceful"``.

    The cleanup deadline is enforced inline (no detached timer task is
    created, so ``state["cleanup_task"]`` stays ``None``). This guarantees
    that when this coroutine returns, the kill and the ``module.stopped``
    event for the timeout case have already happened.

    Args:
        name: Module identifier.
        reason: Reason recorded in the state and sent in the events.
        timeout: Value forwarded to the module in the ``module.shutdown``
            payload; the launcher's own waits use the module-level
            ``SHUTDOWN_TIMEOUT_*`` / ``CLEANUP_TIMEOUT_*`` constants.
    """
    # Bind ``state`` before ``try`` so the error handler can always use it.
    self._init_module_state(name)
    state = self._module_states[name]

    ack_key = f"module.shutdown.ack:{name}"
    exiting_key = f"module.exiting:{name}"
    ready_key = f"module.shutdown.ready:{name}"

    async def arrived(evt, limit) -> bool:
        """Return True if ``evt`` is set within ``limit`` seconds."""
        try:
            await asyncio.wait_for(evt.wait(), timeout=limit)
        except asyncio.TimeoutError:
            return False
        return True

    async def finish(exit_code, exit_type, ready_received, terminate):
        """End the process, publish ``module.stopped`` and log, at most once."""
        if state.get("stopped_sent"):
            # Someone else (e.g. the process monitor) already reported the
            # stop while we were waiting; do not publish a duplicate.
            return
        state["stopped_sent"] = True
        terminate()
        await self._publish_event("module.stopped", {
            "module_id": name,
            "exit_code": exit_code,
            "exit_type": exit_type,
            "reason": state["reason"],
            "restart": state["restart"],
            "ready_received": ready_received,
        })
        self._log_lifecycle("stopped", name, reason=state["reason"])

    def force_kill():
        self._kill_process(name)

    def plain_stop():
        self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)

    try:
        self._log_lifecycle("stopping", name, reason=reason)

        # The launcher initiated this stop; record why and the restart intent.
        state["shutdown_sent"] = True
        state["reason"] = reason
        state["restart"] = self._desired_states.get(name) == "running"

        # Non-graceful module: terminate directly, exit code reported as 0.
        if not self._graceful_modules.get(name):
            await finish(0, "non_graceful", False, plain_stop)
            return

        # Register all three waiters before publishing so that no reply can
        # arrive before its waiter exists.
        ack_evt = asyncio.Event()
        exiting_evt = asyncio.Event()
        ready_evt = asyncio.Event()
        exiting_data = {}
        self._event_waiters[ack_key] = (ack_evt, {})
        self._event_waiters[exiting_key] = (exiting_evt, exiting_data)
        self._event_waiters[ready_key] = (ready_evt, {})

        await self._publish_event("module.shutdown", {
            "module_id": name,
            "reason": reason,
            "timeout": timeout,
            "restart": state["restart"],
        })

        # Step 1: ack.
        state["ack_received"] = await arrived(ack_evt, SHUTDOWN_TIMEOUT_ACK)
        self._event_waiters.pop(ack_key, None)
        if not state["ack_received"]:
            # No ack: kill right away; the exit code is unknown (-1).
            await finish(-1, "timeout", False, force_kill)
            return

        # Step 2: exiting.
        state["exiting_received"] = await arrived(exiting_evt, SHUTDOWN_TIMEOUT_EXITING)
        self._event_waiters.pop(exiting_key, None)
        if not state["exiting_received"]:
            # No exiting event: kill right away; the exit code is unknown (-1).
            await finish(-1, "timeout", False, force_kill)
            return

        # Values supplied by the module in its exiting event take precedence.
        if exiting_data.get("reason"):
            state["reason"] = exiting_data["reason"]
        if "restart" in exiting_data:
            state["restart"] = exiting_data["restart"]
        requested = exiting_data.get("cleanup_timeout")
        if isinstance(requested, bool) or not isinstance(requested, (int, float)):
            # Missing or malformed value: fall back instead of raising.
            requested = CLEANUP_TIMEOUT_DEFAULT
        state["cleanup_timeout"] = max(
            CLEANUP_TIMEOUT_MIN, min(requested, CLEANUP_TIMEOUT_MAX)
        )

        # Step 3: ready, bounded by the clamped cleanup timeout.
        state["ready_received"] = await arrived(ready_evt, state["cleanup_timeout"])
        self._event_waiters.pop(ready_key, None)

        if state["ready_received"]:
            print(f"[launcher] {name} 清理完成,准备退出")
            await finish(0, "graceful", True, force_kill)
        elif not state.get("stopped_sent"):
            print(f"[launcher] {name} 清理超时 ({state['cleanup_timeout']}s),强制终止")
            await finish(-1, "timeout", False, force_kill)

    except Exception as e:
        # The handshake itself failed: fall back to killing the process.
        print(f"[launcher] 优雅关闭出错: {e}")
        if not state.get("stopped_sent"):
            state["stopped_sent"] = True
            self._kill_process(name)
    finally:
        # Drop any waiter still registered, including on cancellation.
        for key in (ack_key, exiting_key, ready_key):
            self._event_waiters.pop(key, None)
|
|
981
1587
|
|
|
982
1588
|
async def _graceful_shutdown_all(self):
|
|
983
|
-
"""
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
1589
|
+
"""全量优雅退出:三阶段关闭
|
|
1590
|
+
|
|
1591
|
+
Phase 1: 先关闭 Watchdog(防止它监控到其他模块退出后触发重启)
|
|
1592
|
+
Phase 2: 关闭其他所有模块(除 Kernel)
|
|
1593
|
+
Phase 3: 最后关闭 Kernel(保证事件路由畅通)
|
|
988
1594
|
"""
|
|
989
1595
|
self._system_shutting_down = True
|
|
1596
|
+
|
|
1597
|
+
# 发送 Launcher 自己的 exiting 事件
|
|
1598
|
+
await self._publish_event("module.exiting", {
|
|
1599
|
+
"module_id": "launcher",
|
|
1600
|
+
"type": "active",
|
|
1601
|
+
"reason": "system_shutdown",
|
|
1602
|
+
"action": "none",
|
|
1603
|
+
"timeout": 0,
|
|
1604
|
+
})
|
|
1605
|
+
|
|
990
1606
|
running = [n for n in self.modules if self.process_manager.is_running(n)]
|
|
991
1607
|
# Also check core modules
|
|
992
1608
|
for cn in CORE_MODULE_NAMES:
|
|
993
1609
|
if self.process_manager.is_running(cn) and cn not in running:
|
|
994
1610
|
running.append(cn)
|
|
1611
|
+
|
|
995
1612
|
if not running:
|
|
996
1613
|
print("[launcher] 没有运行中的模块需要关闭")
|
|
997
1614
|
return
|
|
998
1615
|
|
|
999
|
-
|
|
1000
|
-
|
|
1616
|
+
# 分组:Watchdog、Kernel、其他模块
|
|
1617
|
+
watchdog_running = WATCHDOG_MODULE_NAME in running
|
|
1618
|
+
kernel_running = "kernel" in running
|
|
1619
|
+
other_modules = [n for n in running if n not in (WATCHDOG_MODULE_NAME, "kernel")]
|
|
1620
|
+
|
|
1621
|
+
graceful_others = [n for n in other_modules if self._graceful_modules.get(n)]
|
|
1622
|
+
non_graceful_others = [n for n in other_modules if not self._graceful_modules.get(n)]
|
|
1623
|
+
|
|
1624
|
+
print(f"[launcher] 正在关闭 {len(running)} 个模块(三阶段)")
|
|
1625
|
+
|
|
1626
|
+
# ═══════════════════════════════════════════════════════════
|
|
1627
|
+
# Phase 1: 先关闭 Watchdog(防止重启其他模块)
|
|
1628
|
+
# ═══════════════════════════════════════════════════════════
|
|
1629
|
+
if watchdog_running and self.process_manager.is_running(WATCHDOG_MODULE_NAME):
|
|
1630
|
+
print(f"[launcher] Phase 1: 通知 Watchdog 退出(防止重启其他模块)")
|
|
1631
|
+
|
|
1632
|
+
if self._graceful_modules.get(WATCHDOG_MODULE_NAME):
|
|
1633
|
+
# Watchdog 支持优雅退出
|
|
1634
|
+
self._init_module_state(WATCHDOG_MODULE_NAME)
|
|
1635
|
+
state = self._module_states[WATCHDOG_MODULE_NAME]
|
|
1636
|
+
state["shutdown_sent"] = True
|
|
1637
|
+
state["reason"] = "system_shutdown"
|
|
1638
|
+
state["restart"] = False
|
|
1639
|
+
self._log_lifecycle("stopping", WATCHDOG_MODULE_NAME, reason="system_shutdown")
|
|
1640
|
+
|
|
1641
|
+
await self._publish_event("module.shutdown", {
|
|
1642
|
+
"module_id": WATCHDOG_MODULE_NAME,
|
|
1643
|
+
"reason": "system_shutdown",
|
|
1644
|
+
"timeout": 5,
|
|
1645
|
+
"restart": False,
|
|
1646
|
+
})
|
|
1647
|
+
|
|
1648
|
+
# 等待 0.2 秒确保事件送达(不需要等待进程退出)
|
|
1649
|
+
await asyncio.sleep(0.2)
|
|
1650
|
+
print(f"[launcher] Watchdog shutdown 事件已发送")
|
|
1651
|
+
else:
|
|
1652
|
+
# 直接终止
|
|
1653
|
+
self._init_module_state(WATCHDOG_MODULE_NAME)
|
|
1654
|
+
state = self._module_states[WATCHDOG_MODULE_NAME]
|
|
1655
|
+
state["shutdown_sent"] = True
|
|
1656
|
+
state["stopped_sent"] = True
|
|
1657
|
+
state["reason"] = "system_shutdown"
|
|
1658
|
+
state["restart"] = False
|
|
1659
|
+
self._log_lifecycle("stopping", WATCHDOG_MODULE_NAME, reason="system_shutdown")
|
|
1001
1660
|
|
|
1002
|
-
|
|
1003
|
-
kernel_deferred = "kernel" in graceful
|
|
1004
|
-
graceful_batch = [n for n in graceful if n != "kernel"] if kernel_deferred else graceful
|
|
1661
|
+
self.process_manager.stop_module(WATCHDOG_MODULE_NAME, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
|
|
1005
1662
|
|
|
1006
|
-
|
|
1663
|
+
await self._publish_event("module.stopped", {
|
|
1664
|
+
"module_id": WATCHDOG_MODULE_NAME,
|
|
1665
|
+
"exit_code": 0,
|
|
1666
|
+
"exit_type": "non_graceful",
|
|
1667
|
+
"reason": "system_shutdown",
|
|
1668
|
+
"restart": False,
|
|
1669
|
+
"ready_received": False,
|
|
1670
|
+
})
|
|
1007
1671
|
|
|
1008
|
-
|
|
1009
|
-
|
|
1672
|
+
self._log_lifecycle("stopped", WATCHDOG_MODULE_NAME, reason="system_shutdown")
|
|
1673
|
+
|
|
1674
|
+
# ═══════════════════════════════════════════════════════════
|
|
1675
|
+
# Phase 2: 关闭其他所有模块(除 Kernel)
|
|
1676
|
+
# ═══════════════════════════════════════════════════════════
|
|
1677
|
+
if graceful_others or non_graceful_others:
|
|
1678
|
+
print(f"[launcher] Phase 2: 关闭其他模块({len(graceful_others)} 优雅 + {len(non_graceful_others)} 非优雅)")
|
|
1679
|
+
|
|
1680
|
+
# 通知优雅模块
|
|
1681
|
+
for name in graceful_others:
|
|
1682
|
+
self._init_module_state(name)
|
|
1683
|
+
state = self._module_states[name]
|
|
1684
|
+
state["shutdown_sent"] = True
|
|
1685
|
+
state["reason"] = "system_shutdown"
|
|
1686
|
+
state["restart"] = False
|
|
1010
1687
|
self._log_lifecycle("stopping", name, reason="system_shutdown")
|
|
1011
1688
|
await self._publish_event("module.shutdown", {
|
|
1012
|
-
"module_id": name,
|
|
1689
|
+
"module_id": name,
|
|
1690
|
+
"reason": "system_shutdown",
|
|
1691
|
+
"timeout": 5,
|
|
1692
|
+
"restart": False,
|
|
1013
1693
|
})
|
|
1014
1694
|
|
|
1015
|
-
#
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1695
|
+
# 终止非优雅模块
|
|
1696
|
+
for name in non_graceful_others:
|
|
1697
|
+
self._init_module_state(name)
|
|
1698
|
+
state = self._module_states[name]
|
|
1699
|
+
state["shutdown_sent"] = True
|
|
1700
|
+
state["stopped_sent"] = True
|
|
1701
|
+
state["reason"] = "system_shutdown"
|
|
1702
|
+
state["restart"] = False
|
|
1019
1703
|
self._log_lifecycle("stopping", name, reason="system_shutdown")
|
|
1020
|
-
|
|
1704
|
+
|
|
1705
|
+
self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
|
|
1706
|
+
|
|
1707
|
+
await self._publish_event("module.stopped", {
|
|
1708
|
+
"module_id": name,
|
|
1709
|
+
"exit_code": 0,
|
|
1710
|
+
"exit_type": "non_graceful",
|
|
1711
|
+
"reason": "system_shutdown",
|
|
1712
|
+
"restart": False,
|
|
1713
|
+
"ready_received": False,
|
|
1714
|
+
})
|
|
1715
|
+
|
|
1021
1716
|
self._log_lifecycle("stopped", name, reason="system_shutdown")
|
|
1022
1717
|
|
|
1023
|
-
#
|
|
1024
|
-
if
|
|
1718
|
+
# 等待优雅模块退出(包括 Watchdog)
|
|
1719
|
+
all_graceful = graceful_others + ([WATCHDOG_MODULE_NAME] if watchdog_running and self._graceful_modules.get(WATCHDOG_MODULE_NAME) else [])
|
|
1720
|
+
if all_graceful:
|
|
1025
1721
|
deadline = time.time() + 5
|
|
1026
1722
|
while time.time() < deadline:
|
|
1027
|
-
still_running = [n for n in
|
|
1723
|
+
still_running = [n for n in all_graceful if self.process_manager.is_running(n)]
|
|
1028
1724
|
if not still_running:
|
|
1029
|
-
print("[launcher]
|
|
1725
|
+
print("[launcher] 所有其他模块已退出")
|
|
1030
1726
|
break
|
|
1031
1727
|
remaining = max(0, deadline - time.time())
|
|
1032
1728
|
print(f"[launcher] 等待 {len(still_running)} 个模块退出 ({remaining:.0f}s): {', '.join(still_running)}")
|
|
1033
1729
|
await asyncio.sleep(1)
|
|
1034
|
-
|
|
1035
|
-
|
|
1730
|
+
|
|
1731
|
+
# 强杀未退出的
|
|
1732
|
+
for name in all_graceful:
|
|
1036
1733
|
if self.process_manager.is_running(name):
|
|
1037
|
-
|
|
1038
|
-
self.
|
|
1734
|
+
print(f"[launcher] {name} 超时,强制终止")
|
|
1735
|
+
self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
|
|
1736
|
+
self._log_lifecycle("stopped", name, reason="system_shutdown_timeout")
|
|
1039
1737
|
|
|
1040
|
-
#
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1738
|
+
# ═══════════════════════════════════════════════════════════
|
|
1739
|
+
# Phase 3: 最后关闭 Kernel(使用标准优雅退出流程)
|
|
1740
|
+
# ═══════════════════════════════════════════════════════════
|
|
1741
|
+
if kernel_running and self.process_manager.is_running("kernel"):
|
|
1742
|
+
print("[launcher] Phase 3: 关闭 Kernel(所有其他模块已退出)")
|
|
1044
1743
|
|
|
1045
|
-
#
|
|
1046
|
-
|
|
1047
|
-
try:
|
|
1048
|
-
if self._ws:
|
|
1049
|
-
await self._rpc_call(self._ws, "kernel.shutdown", {})
|
|
1050
|
-
print("[launcher] Kernel shutdown RPC 已发送")
|
|
1051
|
-
rpc_sent = True
|
|
1052
|
-
else:
|
|
1053
|
-
print("[launcher] WebSocket 未连接,跳过 RPC 调用")
|
|
1054
|
-
except Exception as e:
|
|
1055
|
-
print(f"[launcher] Kernel shutdown RPC 失败: {e}")
|
|
1056
|
-
|
|
1057
|
-
# Wait for kernel to exit
|
|
1058
|
-
if rpc_sent:
|
|
1059
|
-
# RPC sent: wait up to 5s for graceful exit
|
|
1060
|
-
proc = self.process_manager._processes.get("kernel")
|
|
1061
|
-
if proc:
|
|
1062
|
-
try:
|
|
1063
|
-
loop = asyncio.get_event_loop()
|
|
1064
|
-
await asyncio.wait_for(
|
|
1065
|
-
loop.run_in_executor(None, proc.wait),
|
|
1066
|
-
timeout=5
|
|
1067
|
-
)
|
|
1068
|
-
print("[launcher] Kernel 已退出")
|
|
1069
|
-
except asyncio.TimeoutError:
|
|
1070
|
-
print("[launcher] Kernel 5秒内未退出,强制停止")
|
|
1071
|
-
self.process_manager.stop_module("kernel", timeout=SHUTDOWN_TIMEOUT_PARTIAL)
|
|
1072
|
-
else:
|
|
1073
|
-
# No RPC (WS not connected): use shorter timeout for terminate
|
|
1074
|
-
self.process_manager.stop_module("kernel", timeout=2)
|
|
1744
|
+
# 明确标记不重启
|
|
1745
|
+
self._desired_states["kernel"] = "stopped"
|
|
1075
1746
|
|
|
1076
|
-
|
|
1747
|
+
# 使用标准优雅退出流程(内含等待 ack → exiting → ready → kill 完整逻辑)
|
|
1748
|
+
await self._graceful_stop("kernel", reason="system_shutdown", timeout=5)
|
|
1077
1749
|
|
|
1078
1750
|
# Final safety net
|
|
1079
1751
|
try:
|
|
@@ -1227,7 +1899,7 @@ class Launcher:
|
|
|
1227
1899
|
# Call Kernel RPC to generate tokens
|
|
1228
1900
|
try:
|
|
1229
1901
|
result = await self._rpc_call(self._ws, "kernel.generate_tokens", {"modules": module_names})
|
|
1230
|
-
if
|
|
1902
|
+
if "result" in result:
|
|
1231
1903
|
tokens = result["result"].get("tokens", {})
|
|
1232
1904
|
self._module_tokens.update(tokens)
|
|
1233
1905
|
print(f"[launcher] Kernel 已生成 {len(tokens)} 个模块令牌")
|
|
@@ -1242,7 +1914,7 @@ class Launcher:
|
|
|
1242
1914
|
return
|
|
1243
1915
|
try:
|
|
1244
1916
|
result = await self._rpc_call(self._ws, "kernel.register_tokens", tokens)
|
|
1245
|
-
if
|
|
1917
|
+
if "result" in result:
|
|
1246
1918
|
print(f"[launcher] 已注册 {len(tokens)} 个模块令牌")
|
|
1247
1919
|
elif "error" in result:
|
|
1248
1920
|
print(f"[launcher] 警告: 令牌注册失败: {result['error'].get('message', '')}")
|
|
@@ -1326,10 +1998,19 @@ class Launcher:
|
|
|
1326
1998
|
if rc != 0:
|
|
1327
1999
|
self._print_module_crash_summary(name)
|
|
1328
2000
|
self._log_lifecycle("exited", name, exit_code=rc)
|
|
1329
|
-
|
|
1330
|
-
|
|
1331
|
-
|
|
1332
|
-
|
|
2001
|
+
|
|
2002
|
+
# 检查是否已发送 stopped 事件
|
|
2003
|
+
state = self._module_states.get(name, {})
|
|
2004
|
+
if not state.get("stopped_sent"):
|
|
2005
|
+
# 取消清理超时任务(如果有)
|
|
2006
|
+
if state.get("cleanup_task"):
|
|
2007
|
+
state["cleanup_task"].cancel()
|
|
2008
|
+
# 发送 stopped 事件
|
|
2009
|
+
await self._send_stopped_event(name, rc)
|
|
2010
|
+
|
|
2011
|
+
# 无论是否发送,都清理状态(防止内存泄漏)
|
|
2012
|
+
self._module_states.pop(name, None)
|
|
2013
|
+
|
|
1333
2014
|
info = self.modules.get(name)
|
|
1334
2015
|
|
|
1335
2016
|
# 1) Core module crash → full restart
|
|
@@ -1457,6 +2138,22 @@ class Launcher:
|
|
|
1457
2138
|
running = []
|
|
1458
2139
|
exited = []
|
|
1459
2140
|
stopped = []
|
|
2141
|
+
|
|
2142
|
+
# Add Launcher itself to running list
|
|
2143
|
+
from types import SimpleNamespace
|
|
2144
|
+
launcher_info = SimpleNamespace(
|
|
2145
|
+
display_name="Launcher",
|
|
2146
|
+
type="infrastructure",
|
|
2147
|
+
)
|
|
2148
|
+
launcher_rec = SimpleNamespace(
|
|
2149
|
+
pid=os.getpid(),
|
|
2150
|
+
started_at=self._start_unix,
|
|
2151
|
+
)
|
|
2152
|
+
running.append(("launcher", launcher_info, launcher_rec))
|
|
2153
|
+
# Launcher is ready immediately (ready_time = 0)
|
|
2154
|
+
if "launcher" not in self._ready_times:
|
|
2155
|
+
self._ready_times["launcher"] = 0.0
|
|
2156
|
+
|
|
1460
2157
|
for name, info in self.modules.items():
|
|
1461
2158
|
rec = self.process_manager.get_record(name)
|
|
1462
2159
|
is_running = self.process_manager.is_running(name)
|
|
@@ -1527,9 +2224,16 @@ class Launcher:
|
|
|
1527
2224
|
label = info.display_name or name
|
|
1528
2225
|
ready_t = self._ready_times.get(name)
|
|
1529
2226
|
time_str = f"{ready_t:.2f}s" if ready_t is not None else "—"
|
|
2227
|
+
|
|
2228
|
+
# Calculate elapsed from start
|
|
1530
2229
|
if ready_t is not None and hasattr(self, '_start_unix'):
|
|
1531
|
-
|
|
1532
|
-
|
|
2230
|
+
if name == "launcher":
|
|
2231
|
+
# Launcher: ready_t is already relative to _start_unix
|
|
2232
|
+
es_str = f"{ready_t:.2f}s"
|
|
2233
|
+
else:
|
|
2234
|
+
# Other modules: rec.started_at is unix timestamp
|
|
2235
|
+
elapsed_from_start = (rec.started_at + ready_t) - self._start_unix
|
|
2236
|
+
es_str = f"{elapsed_from_start:.2f}s"
|
|
1533
2237
|
else:
|
|
1534
2238
|
es_str = "—"
|
|
1535
2239
|
|
|
@@ -1613,7 +2317,18 @@ class Launcher:
|
|
|
1613
2317
|
debug_flag = " [DEBUG]" if os.environ.get("KITE_DEBUG") == "1" else ""
|
|
1614
2318
|
lines.append(f"{G} 当前实例: #{inst_num} 后缀: {suffix_display} PID: {os.getpid()}{debug_flag}{R}")
|
|
1615
2319
|
lines.append(f"{G} 实例目录: {inst_dir}{R}")
|
|
1616
|
-
|
|
2320
|
+
|
|
2321
|
+
# Check for abnormal working directory
|
|
2322
|
+
cwd_lower = cwd.lower()
|
|
2323
|
+
is_abnormal_cwd = (
|
|
2324
|
+
"windowsapps" in cwd_lower or
|
|
2325
|
+
"appdata\\local\\temp" in cwd_lower or
|
|
2326
|
+
not os.path.exists(os.path.join(cwd, "main.py"))
|
|
2327
|
+
)
|
|
2328
|
+
if is_abnormal_cwd:
|
|
2329
|
+
lines.append(f"\033[91m 工作目录: {cwd} ⚠️ 异常路径{R}")
|
|
2330
|
+
else:
|
|
2331
|
+
lines.append(f"{G} 工作目录: {cwd}{R}")
|
|
1617
2332
|
if len(instances) > 1:
|
|
1618
2333
|
lines.append(f"{G} 所有实例:{R}")
|
|
1619
2334
|
for i in instances:
|
|
@@ -1682,6 +2397,30 @@ class Launcher:
|
|
|
1682
2397
|
except Exception:
|
|
1683
2398
|
pass
|
|
1684
2399
|
|
|
2400
|
+
def _record_launcher_startup(self):
    """Append a ``launcher_startup`` record to the lifecycle JSONL log.

    The record holds a UTC timestamp, the launcher PID, the working
    directory, ``sys.argv``, the ``KITE_INSTANCE_DIR`` environment value,
    the process manager's instance suffix and the Python executable path.
    Writing is best-effort: any failure is swallowed so that logging can
    never prevent the launcher from starting.
    """
    import sys
    from datetime import datetime, timezone

    record = {
        "ts": datetime.now(timezone.utc).isoformat(),
        "event": "launcher_startup",
        "module": "launcher",
        "pid": os.getpid(),
        "cwd": os.getcwd(),
        "argv": sys.argv,
        "instance_dir": os.environ.get("KITE_INSTANCE_DIR", ""),
        "instance_suffix": self.process_manager.instance_suffix,
        "python": sys.executable,
    }

    try:
        log_dir = os.path.dirname(self._lifecycle_log)
        # A bare filename has an empty dirname, and os.makedirs("") raises;
        # in that case the file lives in the current directory and no
        # directory needs to be created.
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        with open(self._lifecycle_log, "a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
    except Exception:
        # Deliberately best-effort: the startup record is diagnostic only.
        pass
|
|
2423
|
+
|
|
1685
2424
|
|
|
1686
2425
|
|
|
1687
2426
|
def _update_module_md_state(module_dir: str, new_state: str):
|