@agentunion/kite 1.0.7 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +208 -0
- package/README.md +48 -0
- package/cli.js +1 -1
- package/extensions/agents/__init__.py +1 -0
- package/extensions/agents/assistant/__init__.py +1 -0
- package/extensions/agents/assistant/entry.py +329 -0
- package/extensions/agents/assistant/module.md +22 -0
- package/extensions/agents/assistant/server.py +197 -0
- package/extensions/channels/__init__.py +1 -0
- package/extensions/channels/acp_channel/__init__.py +1 -0
- package/extensions/channels/acp_channel/entry.py +329 -0
- package/extensions/channels/acp_channel/module.md +22 -0
- package/extensions/channels/acp_channel/server.py +197 -0
- package/extensions/event_hub_bench/entry.py +624 -379
- package/extensions/event_hub_bench/module.md +2 -1
- package/extensions/services/backup/__init__.py +1 -0
- package/extensions/services/backup/entry.py +508 -0
- package/extensions/services/backup/module.md +22 -0
- package/extensions/services/model_service/__init__.py +1 -0
- package/extensions/services/model_service/entry.py +508 -0
- package/extensions/services/model_service/module.md +22 -0
- package/extensions/services/watchdog/entry.py +468 -102
- package/extensions/services/watchdog/module.md +3 -0
- package/extensions/services/watchdog/monitor.py +170 -69
- package/extensions/services/web/__init__.py +1 -0
- package/extensions/services/web/config.yaml +149 -0
- package/extensions/services/web/entry.py +390 -0
- package/extensions/services/web/module.md +24 -0
- package/extensions/services/web/routes/__init__.py +1 -0
- package/extensions/services/web/routes/routes_call.py +189 -0
- package/extensions/services/web/routes/routes_config.py +512 -0
- package/extensions/services/web/routes/routes_contacts.py +98 -0
- package/extensions/services/web/routes/routes_devlog.py +99 -0
- package/extensions/services/web/routes/routes_phone.py +81 -0
- package/extensions/services/web/routes/routes_sms.py +48 -0
- package/extensions/services/web/routes/routes_stats.py +17 -0
- package/extensions/services/web/routes/routes_voicechat.py +554 -0
- package/extensions/services/web/routes/schemas.py +216 -0
- package/extensions/services/web/server.py +375 -0
- package/extensions/services/web/static/css/style.css +1064 -0
- package/extensions/services/web/static/index.html +1445 -0
- package/extensions/services/web/static/js/app.js +4671 -0
- package/extensions/services/web/vendor/__init__.py +1 -0
- package/extensions/services/web/vendor/bluetooth/audio.py +348 -0
- package/extensions/services/web/vendor/bluetooth/contacts.py +251 -0
- package/extensions/services/web/vendor/bluetooth/manager.py +395 -0
- package/extensions/services/web/vendor/bluetooth/sms.py +290 -0
- package/extensions/services/web/vendor/bluetooth/telephony.py +274 -0
- package/extensions/services/web/vendor/config.py +139 -0
- package/extensions/services/web/vendor/conversation/asr.py +936 -0
- package/extensions/services/web/vendor/conversation/engine.py +548 -0
- package/extensions/services/web/vendor/conversation/llm.py +534 -0
- package/extensions/services/web/vendor/conversation/mcp_tools.py +190 -0
- package/extensions/services/web/vendor/conversation/tts.py +322 -0
- package/extensions/services/web/vendor/conversation/vad.py +138 -0
- package/extensions/services/web/vendor/storage/__init__.py +1 -0
- package/extensions/services/web/vendor/storage/identity.py +312 -0
- package/extensions/services/web/vendor/storage/store.py +507 -0
- package/extensions/services/web/vendor/task/manager.py +864 -0
- package/extensions/services/web/vendor/task/models.py +45 -0
- package/extensions/services/web/vendor/task/webhook.py +263 -0
- package/extensions/services/web/vendor/tools/registry.py +321 -0
- package/kernel/__init__.py +0 -0
- package/kernel/entry.py +407 -0
- package/{core/event_hub/hub.py → kernel/event_hub.py} +62 -74
- package/kernel/module.md +33 -0
- package/{core/registry/store.py → kernel/registry_store.py} +23 -8
- package/kernel/rpc_router.py +388 -0
- package/kernel/server.py +267 -0
- package/launcher/__init__.py +10 -0
- package/launcher/__main__.py +6 -0
- package/launcher/count_lines.py +258 -0
- package/launcher/entry.py +1778 -0
- package/launcher/logging_setup.py +289 -0
- package/{core/launcher → launcher}/module_scanner.py +11 -6
- package/launcher/process_manager.py +880 -0
- package/main.py +11 -210
- package/package.json +6 -9
- package/__init__.py +0 -1
- package/__main__.py +0 -15
- package/core/event_hub/BENCHMARK.md +0 -94
- package/core/event_hub/bench.py +0 -459
- package/core/event_hub/bench_extreme.py +0 -308
- package/core/event_hub/bench_perf.py +0 -350
- package/core/event_hub/entry.py +0 -157
- package/core/event_hub/module.md +0 -20
- package/core/event_hub/server.py +0 -206
- package/core/launcher/entry.py +0 -1158
- package/core/launcher/process_manager.py +0 -470
- package/core/registry/entry.py +0 -110
- package/core/registry/module.md +0 -30
- package/core/registry/server.py +0 -289
- package/extensions/services/watchdog/server.py +0 -167
- /package/{core → extensions/services/web/vendor/bluetooth}/__init__.py +0 -0
- /package/{core/event_hub → extensions/services/web/vendor/conversation}/__init__.py +0 -0
- /package/{core/launcher → extensions/services/web/vendor/task}/__init__.py +0 -0
- /package/{core/registry → extensions/services/web/vendor/tools}/__init__.py +0 -0
- /package/{core/event_hub → kernel}/dedup.py +0 -0
- /package/{core/event_hub → kernel}/router.py +0 -0
- /package/{core/launcher → launcher}/module.md +0 -0
|
@@ -0,0 +1,1778 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Launcher — the core of Kite. Manages module lifecycle, monitors processes.
|
|
3
|
+
|
|
4
|
+
Thread model:
|
|
5
|
+
- Main thread: asyncio event loop (process management + monitor loop)
|
|
6
|
+
- stdout threads: one daemon thread per child process (ProcessManager)
|
|
7
|
+
- (Windows) keyboard listener thread: polls for 'q' key
|
|
8
|
+
|
|
9
|
+
2-Phase startup:
|
|
10
|
+
Phase 1: Start Kernel → wait port → connect WS → register self → module.ready
|
|
11
|
+
Phase 2: start remaining enabled modules in topo order (each connects to Kernel WS)
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import asyncio
|
|
15
|
+
import json
|
|
16
|
+
import os
|
|
17
|
+
import secrets
|
|
18
|
+
import signal
|
|
19
|
+
import sys
|
|
20
|
+
import threading
|
|
21
|
+
import time
|
|
22
|
+
import uuid
|
|
23
|
+
|
|
24
|
+
import websockets
|
|
25
|
+
|
|
26
|
+
from .module_scanner import ModuleScanner, ModuleInfo, LaunchConfig, _parse_frontmatter
|
|
27
|
+
from .process_manager import ProcessManager
|
|
28
|
+
|
|
29
|
+
# True on native Windows (used to pick console-control vs POSIX signal handling).
IS_WINDOWS = sys.platform == "win32"

# Shutdown timeout constants (seconds)
SHUTDOWN_TIMEOUT_NON_GRACEFUL = 5  # Non-graceful modules or no ack response
SHUTDOWN_TIMEOUT_PARTIAL = 3  # Graceful module ack'd but no ready
SHUTDOWN_TIMEOUT_READY = 1  # Graceful module sent ready (cleanup done)
SHUTDOWN_TIMEOUT_BULK = 3  # Bulk stop_all() safety net

# Core module names that are started in Phase 1 (not Phase 2)
CORE_MODULE_NAMES = {"kernel"}

# Name of the watchdog module, started separately in Phase 1.5.
WATCHDOG_MODULE_NAME = "watchdog"
+
class Launcher:
|
|
44
|
+
"""Kite system entry point. Starts Kernel, manages modules."""
|
|
45
|
+
|
|
46
|
+
    def __init__(self, kite_token: str):
        """Initialize launcher state.

        Side effects: sets KITE_INSTANCE, KITE_MODULE_DATA and
        KITE_INSTANCE_SUFFIX environment variables, resolves the instance
        workspace directory, and truncates the lifecycle log file.

        Args:
            kite_token: Root token handed to ProcessManager for child modules.
        """
        self.kite_token = kite_token
        # Instance id is simply the launcher PID, exported for child processes.
        self.instance_id = str(os.getpid())
        os.environ["KITE_INSTANCE"] = self.instance_id

        # Resolve instance workspace (must happen before ProcessManager init)
        self._resolve_instance_dir()
        os.environ["KITE_MODULE_DATA"] = os.path.join(
            os.environ["KITE_INSTANCE_DIR"], "launcher",
        )

        self.process_manager = ProcessManager(
            kite_token, self.instance_id,
            on_kite_message=self._on_kite_message,
        )
        self.module_scanner = ModuleScanner(
            discovery=self._load_discovery(),
        )

        self.kernel_port: int = 0  # Filled in during Phase 1 from Kernel stdout
        self.modules: dict[str, ModuleInfo] = {}
        self._shutdown_event = asyncio.Event()
        self._thread_shutdown = threading.Event()
        self._shutdown_complete = threading.Event()  # Set when normal shutdown finishes
        self._module_tokens: dict[str, str] = {}  # module_name -> per-module token

        # Three-layer state model: desired_state per module
        self._desired_states: dict[str, str] = {}  # module_name -> "running" | "stopped"

        # Kernel WebSocket client
        self._ws: object | None = None
        self._ws_task: asyncio.Task | None = None
        self._ws_connected: asyncio.Event | None = None  # Created in _async_main, set when WS ready
        self._loop: asyncio.AbstractEventLoop | None = None

        # JSON-RPC 2.0 infrastructure
        self._rpc_waiters: dict[str, asyncio.Event] = {}  # rpc_id -> Event
        self._rpc_results: dict[str, dict] = {}  # rpc_id -> response dict

        # Event waiters: {event_key: (asyncio.Event, data_dict)}
        self._event_waiters: dict[str, tuple[asyncio.Event, dict]] = {}

        # Module ready times: module_name -> seconds from start to ready
        self._ready_times: dict[str, float] = {}

        # Shutdown timing
        self._shutdown_start_time: float = 0.0

        # Module exit reasons: module_name -> reason string (for modules that sent module.exiting)
        self._exit_reasons: dict[str, str] = {}

        # Graceful shutdown capability: module_name -> True if module declared support
        # Kernel defaults to True (it starts before Watchdog can observe)
        self._graceful_modules: dict[str, bool] = {"kernel": True}

        # System-wide shutdown flag: prevents Watchdog restart during shutdown
        self._system_shutting_down = False

        # Kite stdout message waiters: {waiter_key: (threading.Event, data_dict)}
        # Used by ProcessManager stdout callback (cross-thread)
        self._msg_waiters: dict[str, tuple[threading.Event, dict]] = {}

        suffix = self.process_manager.instance_suffix
        state_dir = os.path.join(os.environ["KITE_INSTANCE_DIR"], "launcher", "state")
        os.makedirs(state_dir, exist_ok=True)
        self._lifecycle_log = os.path.join(state_dir, f"lifecycle{suffix}.jsonl")
        # Clear lifecycle log on startup (like latest.log); best-effort only.
        try:
            with open(self._lifecycle_log, "w", encoding="utf-8") as f:
                pass
        except Exception:
            pass
        os.environ["KITE_INSTANCE_SUFFIX"] = suffix
@staticmethod
|
|
121
|
+
def _fmt_elapsed(seconds: float) -> str:
|
|
122
|
+
"""Format elapsed seconds: <1s → 'NNNms', >=1s → 'N.Ns', >=10s → 'NNs'."""
|
|
123
|
+
if seconds < 1:
|
|
124
|
+
return f"{seconds * 1000:.0f}ms"
|
|
125
|
+
if seconds < 10:
|
|
126
|
+
return f"{seconds:.1f}s"
|
|
127
|
+
return f"{seconds:.0f}s"
|
|
128
|
+
|
|
129
|
+
# ── Instance workspace resolution ──
|
|
130
|
+
|
|
131
|
+
    @staticmethod
    def _resolve_instance_dir():
        """Resolve KITE_INSTANCE_DIR from KITE_WORKSPACE + KITE_CWD.

        Algorithm: take CWD basename, find matching dir in workspace via .cwd file,
        or create new one. Sets KITE_INSTANCE_DIR env var.

        Collision handling: if the basename slot exists but its .cwd file does
        not match, try 'name~1', 'name~2', ... until a matching or empty slot
        is found.
        """
        if os.environ.get("KITE_INSTANCE_DIR"):
            return  # already set (e.g. by tests or parent)

        cwd = os.environ.get("KITE_CWD", os.getcwd())
        workspace = os.environ.get("KITE_WORKSPACE", "")
        if not workspace:
            # Default workspace lives under the user's home directory.
            home = os.environ.get("HOME") or os.environ.get("USERPROFILE") or os.path.expanduser("~")
            workspace = os.path.join(home, ".kite", "workspace")
            os.environ["KITE_WORKSPACE"] = workspace

        basename = os.path.basename(cwd.rstrip(os.sep)) or "default"
        suffix = 0

        while True:
            name = basename if suffix == 0 else f"{basename}~{suffix}"
            candidate = os.path.join(workspace, name)
            cwd_file = os.path.join(candidate, ".cwd")

            if not os.path.exists(candidate):
                # Empty slot — create new workspace and record the owning CWD.
                os.makedirs(candidate, exist_ok=True)
                with open(cwd_file, "w", encoding="utf-8") as f:
                    f.write(cwd)
                os.environ["KITE_INSTANCE_DIR"] = candidate
                return

            if os.path.isfile(cwd_file):
                # Existing slot — claim it only if it was created for this CWD.
                try:
                    with open(cwd_file, "r", encoding="utf-8") as f:
                        if f.read().strip() == cwd:
                            os.environ["KITE_INSTANCE_DIR"] = candidate
                            return
                except Exception:
                    pass  # unreadable .cwd — treat as a non-match and keep probing

            suffix += 1
|
+
# ── Kite stdout message callback ──
|
|
175
|
+
|
|
176
|
+
def _on_kite_message(self, module_name: str, msg: dict):
|
|
177
|
+
"""Called by ProcessManager stdout reader thread when a kite message is detected.
|
|
178
|
+
Thread-safe: only touches _msg_waiters (dict + threading.Event).
|
|
179
|
+
"""
|
|
180
|
+
kite_type = msg.get("kite", "")
|
|
181
|
+
key = f"{module_name}:{kite_type}"
|
|
182
|
+
waiter = self._msg_waiters.get(key)
|
|
183
|
+
if waiter:
|
|
184
|
+
waiter[1].update(msg)
|
|
185
|
+
waiter[0].set()
|
|
186
|
+
|
|
187
|
+
    async def _wait_kite_message(self, module_name: str, kite_type: str,
                                 timeout: float) -> dict | None:
        """Wait for a kite stdout message from a module. Returns msg dict or None on timeout.

        Registers a (threading.Event, dict) waiter under '<module>:<type>' that
        _on_kite_message fills from the stdout reader thread, then blocks in an
        executor so the event loop stays free.
        Checks shutdown flag every 0.5s so Ctrl+C is responsive even during Phase 1-2 waits.
        """
        key = f"{module_name}:{kite_type}"
        evt = threading.Event()
        data = {}
        self._msg_waiters[key] = (evt, data)
        shutdown = self._thread_shutdown
        try:
            def _wait():
                # Poll in 0.5s slices so a shutdown request can abort the wait.
                deadline = time.monotonic() + timeout
                while time.monotonic() < deadline:
                    if evt.wait(timeout=0.5):
                        return True
                    if shutdown.is_set():
                        return False
                return False
            got = await asyncio.get_running_loop().run_in_executor(None, _wait)
            return data if got else None
        finally:
            # Always unregister so stale waiters don't accumulate.
            self._msg_waiters.pop(key, None)
211
|
+
# ── Public entry ──
|
|
212
|
+
|
|
213
|
+
    def run(self):
        """Synchronous entry point. Sets up signals, runs the async main loop.

        Dumps the KITE_* environment for diagnostics, installs the
        platform-appropriate exit hooks, then drives _async_main to
        completion. _final_cleanup always runs, even on errors.
        """
        print("[launcher] ── 环境 ──")
        for key in sorted(k for k in os.environ if k.startswith("KITE_")):
            print(f"[launcher] {key} = {os.environ[key]}")
        print(f"[launcher] PID = {os.getpid()}")
        print(f"[launcher] PYTHON = {sys.executable}")
        print(f"[launcher] PLATFORM = {sys.platform}")

        if IS_WINDOWS:
            self._setup_windows_exit()
        else:
            self._setup_unix_signals()

        try:
            asyncio.run(self._async_main())
        except KeyboardInterrupt:
            pass
        except RuntimeError as e:
            # Don't print "启动失败" if user requested shutdown
            if not self._thread_shutdown.is_set():
                print(f"[launcher] 启动失败: {e}")
        finally:
            self._final_cleanup()
238
|
+
    def _request_shutdown(self, reason: str = ""):
        """Request graceful shutdown. Thread-safe — can be called from signal handler or any thread.

        Idempotent: a second call is a no-op. Also arms a daemon watchdog
        thread that force-exits the process if shutdown has not completed
        within 10 seconds.
        """
        if self._thread_shutdown.is_set():
            return  # already shutting down
        print(f"[launcher] {reason or '收到关闭请求'}")
        self._shutdown_start_time = time.monotonic()  # Record shutdown start time
        self._thread_shutdown.set()
        # Wake up asyncio event loop immediately (so _monitor_loop / wait_for exits)
        loop = self._loop
        if loop and not loop.is_closed():
            try:
                loop.call_soon_threadsafe(self._shutdown_event.set)
            except RuntimeError:
                pass  # loop already shutting down — nothing to wake
        # Safety net: force exit after 10s only if normal shutdown hasn't completed
        def _force():
            if self._shutdown_complete.wait(timeout=10):
                return  # Normal shutdown completed — no need to force
            # Best-effort listing of still-running modules for the error message.
            try:
                pm = self.process_manager
                still = [n for n in pm._processes if pm.is_running(n)]
            except Exception:
                still = []
            if still:
                print(f"\033[91m[launcher] 关闭超时,以下模块仍在运行: {', '.join(still)},强制退出\033[0m")
            else:
                print("\033[91m[launcher] 关闭超时,强制退出\033[0m")
            os._exit(1)
        threading.Thread(target=_force, daemon=True).start()
268
|
+
def _setup_unix_signals(self):
|
|
269
|
+
"""Register SIGTERM/SIGINT handlers on Linux/macOS."""
|
|
270
|
+
def _handler(signum, frame):
|
|
271
|
+
self._request_shutdown(f"收到信号 {signum},正在关闭...")
|
|
272
|
+
signal.signal(signal.SIGTERM, _handler)
|
|
273
|
+
signal.signal(signal.SIGINT, _handler)
|
|
274
|
+
|
|
275
|
+
    def _setup_windows_exit(self):
        """SetConsoleCtrlHandler for Ctrl+C + daemon thread for 'q' key.

        Why not signal.signal(SIGINT)?
        Python's signal delivery requires the main thread to be executing bytecode.
        When the main thread is blocked in C code (asyncio ProactorEventLoop →
        GetQueuedCompletionStatus), SIGINT is never delivered.
        SetConsoleCtrlHandler runs its callback in a separate OS thread, so it
        always works regardless of what the main thread is doing.
        """
        import ctypes

        @ctypes.WINFUNCTYPE(ctypes.c_int, ctypes.c_uint)
        def _ctrl_handler(ctrl_type):
            if ctrl_type in (0, 1):  # CTRL_C_EVENT, CTRL_BREAK_EVENT
                self._request_shutdown("收到 Ctrl+C,正在关闭...")
                return 1  # handled — prevent default (which kills the process)
            return 0

        # prevent GC of the C callback (ctypes does not keep it alive)
        self._ctrl_handler_ref = _ctrl_handler
        ctypes.windll.kernel32.SetConsoleCtrlHandler(_ctrl_handler, 1)

        # 'q' key: handle via msvcrt polling in a daemon thread
        def _listen():
            import msvcrt
            while not self._thread_shutdown.is_set():
                if msvcrt.kbhit():
                    ch = msvcrt.getch()
                    if ch == b'\x1b':  # ESC - force exit immediately
                        print("[launcher] ESC 强制退出")
                        os._exit(0)
                    elif ch in (b'q', b'Q'):  # q/Q - graceful shutdown
                        self._request_shutdown("收到退出请求,正在关闭...")
                        return
                time.sleep(0.1)
        threading.Thread(target=_listen, daemon=True).start()
|
313
|
+
# ── Async main (2-Phase startup) ──
|
|
314
|
+
|
|
315
|
+
    async def _async_main(self):
        """Full 2-phase startup sequence, then monitor loop.

        Phase 1 starts the Kernel; Phase 1.5 the Watchdog (if enabled);
        Phase 2 the remaining enabled modules. After startup it prints a
        report, publishes 'system.ready' and enters the monitor loop.
        The finally-block always attempts a graceful shutdown of everything.
        """
        self._loop = asyncio.get_running_loop()
        self._ws_connected = asyncio.Event()  # Create event in async context
        t_start = time.monotonic()
        self._start_unix = time.time()
        phase_times = {}
        G = "\033[32m"  # green
        R = "\033[0m"   # reset

        # Validate core modules exist
        self._validate_core_modules()

        # Cleanup leftovers from previous instances (current instance dir)
        local_cleaned = self.process_manager.cleanup_leftovers()

        # Cross-directory leftover cleanup (background, non-blocking)
        self._global_cleanup_task = asyncio.ensure_future(
            asyncio.get_running_loop().run_in_executor(
                None, self.process_manager.cleanup_global_leftovers
            )
        )

        try:
            # Phase 1: Start Kernel + connect WS
            t0 = time.monotonic()
            await self._phase1_start_kernel()
            elapsed_p1 = time.monotonic() - t0
            phase_times["Phase 1: Kernel"] = elapsed_p1
            print(f"{G}[launcher] ✓ Phase 1 完成: Kernel 已就绪 ({elapsed_p1:.2f}s){R}")
            if self._shutdown_event.is_set(): return

            # Initialize desired_state from config_state
            for name, info in self.modules.items():
                if info.state == "enabled":
                    self._desired_states[name] = "running"
                else:  # manual, disabled
                    self._desired_states[name] = "stopped"
            # Core modules are already running
            for cn in CORE_MODULE_NAMES:
                self._desired_states[cn] = "running"

            # Phase 1.5: Watchdog (started before the rest so it can observe them)
            watchdog_info = self.modules.get(WATCHDOG_MODULE_NAME)
            if watchdog_info and self._desired_states.get(WATCHDOG_MODULE_NAME) == "running":
                t0 = time.monotonic()
                print(f"[launcher] Phase 1.5: 启动 Watchdog...")
                await self._start_one_module(watchdog_info)
                elapsed = time.monotonic() - t0
                print(f"{G}[launcher] ✓ Phase 1.5 完成: Watchdog ({elapsed:.2f}s){R}")
                if self._shutdown_event.is_set(): return

            # Phase 2: Start remaining enabled modules
            t0 = time.monotonic()
            await self._phase2_start_modules()
            elapsed = time.monotonic() - t0
            phase_times["Phase 2: Extensions"] = elapsed
            print(f"{G}[launcher] ✓ Phase 2 完成: 扩展模块已启动 ({elapsed:.2f}s){R}")
            if self._shutdown_event.is_set(): return

            # Post-startup
            self.process_manager.persist_records()

            # Wait for global leftover cleanup to finish (non-blocking with timeout)
            global_cleaned = {}
            if hasattr(self, '_global_cleanup_task'):
                try:
                    global_cleaned = await asyncio.wait_for(self._global_cleanup_task, timeout=5) or {}
                except asyncio.TimeoutError:
                    print("[launcher] 警告: 全局遗留清理超时 (5s),跳过")
                except Exception as e:
                    print(f"[launcher] 警告: 全局遗留清理出错: {e}")
            # Merge local + global cleanup stats
            cleaned_stats: dict[str, int] = {}
            for d in (local_cleaned, global_cleaned):
                for k, v in d.items():
                    cleaned_stats[k] = cleaned_stats.get(k, 0) + v

            # Global instance scan (via executor to avoid blocking)
            global_instances = await asyncio.get_running_loop().run_in_executor(
                None, self.process_manager.get_global_instances
            )

            # ── Startup report ──
            total_time = time.monotonic() - t_start
            await self._print_startup_report(total_time, phase_times,
                                             global_instances=global_instances,
                                             cleaned_stats=cleaned_stats)
            # Notify all modules that system startup is complete
            await self._publish_event("system.ready", {
                "startup_time": round(total_time, 2),
            })

            print("[launcher] 进入监控循环 (按 Ctrl+C 或 'q' 优雅退出,ESC 强制退出)")
            await self._monitor_loop()
        finally:
            try:
                await self._graceful_shutdown_all()
            except Exception as e:
                print(f"[launcher] 优雅关闭出错: {e}")
416
|
+
# ── Phase 1: Start Kernel ──
|
|
417
|
+
|
|
418
|
+
    async def _phase1_start_kernel(self):
        """Start Kernel process, connect WS, register self, wait for module.ready.

        Flow:
        1. Start Kernel subprocess
        2. Wait Kernel stdout port → set KITE_KERNEL_PORT env
        3. Scan modules + connect WS + generate tokens (parallel)
        4. Wait module.ready event from Kernel

        Raises:
            RuntimeError: Kernel fails to start, doesn't report a port/token
                within 6s, or the user interrupts startup.
        """
        t_kernel = time.monotonic()

        # ── Step 1: Start Kernel process ──
        kernel_dir = os.path.join(os.environ["KITE_PROJECT"], "kernel")
        kernel_info = ModuleInfo(
            name="kernel",
            display_name="Kernel",
            type="infrastructure",
            state="enabled",
            runtime="python",
            entry="entry.py",
            module_dir=kernel_dir,
        )
        # Kernel does NOT receive boot_info via stdin
        self._log_lifecycle("starting", "kernel")
        ok = self.process_manager.start_module(kernel_info, boot_info=None)
        if not ok:
            self._log_lifecycle("start_failed", "kernel")
            raise RuntimeError("启动 Kernel 失败")

        print(f"[launcher] Kernel 进程已启动,等待 Kernel 端口...")

        # Persist immediately after starting core processes
        self.process_manager.persist_records()

        # ── Step 2: Wait for Kernel port + launcher_token ──
        msg = await self._wait_kite_message("kernel", "port", timeout=6)
        if self._thread_shutdown.is_set():
            # User requested shutdown during startup
            raise RuntimeError("启动被用户中断")
        if not msg or not msg.get("port") or not msg.get("token"):
            raise RuntimeError("致命错误: Kernel 在 6s 内未报告端口和 token")
        self.kernel_port = int(msg["port"])
        launcher_token = msg["token"]
        self._module_tokens["launcher"] = launcher_token
        _wait_s = time.monotonic() - t_kernel
        print(f"[launcher] Kernel 端口: {self.kernel_port} (等待 {self._fmt_elapsed(_wait_s)})")

        # ── Step 3: Set env (but don't send kernel_port to modules yet) ──
        os.environ["KITE_KERNEL_PORT"] = str(self.kernel_port)

        # ── Step 4: Scan modules + connect WS + generate tokens (parallel) ──
        async def _scan_and_generate_tokens():
            # Discover all modules on disk, then mint their per-module tokens.
            t_scan = time.monotonic()
            self.modules = self.module_scanner.scan()
            for name, info in self.modules.items():
                self._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
            _scan_s = time.monotonic() - t_scan
            print(f"[launcher] 发现 {len(self.modules)} 个模块: {', '.join(self.modules.keys()) or '(无)'} (扫描 {self._fmt_elapsed(_scan_s)})")
            # Generate tokens via Kernel RPC (after WS connection is ready)
            t_gen = time.monotonic()
            await self._generate_module_tokens()
            _gen_s = time.monotonic() - t_gen
            print(f"[launcher] 令牌生成完成 ({self._fmt_elapsed(_gen_s)})")

        async def _connect_kernel_ws():
            t_ws = time.monotonic()
            self._ws_task = asyncio.create_task(self._ws_loop())
            # Wait for WebSocket connection to be established and ready
            try:
                await asyncio.wait_for(self._ws_connected.wait(), timeout=5)
            except asyncio.TimeoutError:
                print("[launcher] 警告: WebSocket 连接超时")
                return

            # Now wait for Kernel module.ready event
            # (waiter is registered inside _ws_connect before _ws_receiver starts)
            ready = await self._wait_event("module.ready", "kernel", timeout=15)
            if ready:
                self._graceful_modules["kernel"] = bool(ready.get("graceful_shutdown"))
                print("[launcher] Kernel 已就绪")
            else:
                print("\033[91m[launcher] 警告: Kernel 在 15s 内未发送 module.ready\033[0m")
            self._ready_times["kernel"] = time.monotonic() - t_ws

        await asyncio.gather(
            _scan_and_generate_tokens(),
            _connect_kernel_ws(),
        )
        if self._shutdown_event.is_set():
            return

        self._log_lifecycle("started", "kernel")
        await self._publish_event("module.started", {"module_id": "kernel"})
        self.process_manager.close_stdio("kernel")

        # Store kernel_info in modules dict if not already present (from scan)
        if "kernel" not in self.modules:
            self.modules["kernel"] = kernel_info
|
+
# ── Phase 2: Start remaining modules ──
|
|
518
|
+
|
|
519
|
+
async def _phase2_start_modules(self):
|
|
520
|
+
"""Start enabled modules (excluding core) in dependency order."""
|
|
521
|
+
to_start = [m for m in self.modules.values()
|
|
522
|
+
if self._desired_states.get(m.name) == "running"
|
|
523
|
+
and m.name not in CORE_MODULE_NAMES
|
|
524
|
+
and m.name != WATCHDOG_MODULE_NAME]
|
|
525
|
+
if not to_start:
|
|
526
|
+
print("[launcher] 没有额外模块需要启动")
|
|
527
|
+
return
|
|
528
|
+
|
|
529
|
+
# Auto-start manual modules if depended upon
|
|
530
|
+
needed = set(m.name for m in to_start)
|
|
531
|
+
for m in list(to_start):
|
|
532
|
+
for dep in m.depends_on:
|
|
533
|
+
if dep not in needed and dep not in CORE_MODULE_NAMES:
|
|
534
|
+
dep_info = self.modules.get(dep)
|
|
535
|
+
if dep_info and dep_info.state != "disabled":
|
|
536
|
+
needed.add(dep)
|
|
537
|
+
to_start.append(dep_info)
|
|
538
|
+
self._desired_states[dep] = "running"
|
|
539
|
+
print(f"[launcher] 自动启动 '{dep}' (被依赖)")
|
|
540
|
+
elif dep_info and dep_info.state == "disabled":
|
|
541
|
+
print(f"[launcher] 错误: '{m.name}' 依赖已禁用的模块 '{dep}'")
|
|
542
|
+
|
|
543
|
+
try:
|
|
544
|
+
layers = self._topo_layers(to_start)
|
|
545
|
+
except RuntimeError as e:
|
|
546
|
+
print(f"[launcher] 错误: {e}")
|
|
547
|
+
return
|
|
548
|
+
|
|
549
|
+
total = sum(len(layer) for layer in layers)
|
|
550
|
+
print(f"[launcher] 正在启动 {total} 个模块...")
|
|
551
|
+
for layer in layers:
|
|
552
|
+
if len(layer) == 1:
|
|
553
|
+
await self._start_one_module(layer[0])
|
|
554
|
+
else:
|
|
555
|
+
await asyncio.gather(*(self._start_one_module(info) for info in layer))
|
|
556
|
+
|
|
557
|
+
# ── Kernel WebSocket connection (JSON-RPC 2.0) ──
|
|
558
|
+
|
|
559
|
+
    async def _ws_loop(self):
        """Connect to Kernel, reconnect on failure.

        Runs until shutdown is requested or the task is cancelled. Each
        session failure (or close) resets the cached socket and waits 5s
        before retrying, so a dead Kernel is not hammered.
        """
        while not self._thread_shutdown.is_set():
            try:
                await self._ws_connect()
            except asyncio.CancelledError:
                return
            except Exception as e:
                # Suppress noise while the system is intentionally going down.
                if not self._system_shutting_down:
                    print(f"[launcher] Kernel 连接错误: {e}")
            self._ws = None
            await asyncio.sleep(5)  # throttle reconnect attempts
|
572
|
+
    async def _ws_connect(self):
        """Single WebSocket session with JSON-RPC 2.0 protocol.

        Connects with the launcher's own token, starts the receive loop,
        subscribes to all events, registers the launcher in the Registry,
        then blocks until the connection closes. The ordering below is
        deliberate — see the inline comments about race avoidance.
        """
        launcher_token = self._module_tokens.get("launcher", "")
        ws_url = f"ws://127.0.0.1:{self.kernel_port}/ws?token={launcher_token}&id=launcher"
        t_ws_connect = time.monotonic()
        async with websockets.connect(ws_url, open_timeout=3, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
            self._ws = ws
            _ws_s = time.monotonic() - t_ws_connect
            print(f"[launcher] 已连接到 Kernel ({self._fmt_elapsed(_ws_s)})")

            # Start receive loop in background task BEFORE making any RPC calls
            # This prevents deadlock where RPC waits for response but receive loop hasn't started
            receiver_task = asyncio.create_task(self._ws_receiver(ws))

            try:
                # Register kernel module.ready waiter BEFORE subscribing to events
                # This prevents race condition where event arrives before waiter is registered
                ready_key = "module.ready:kernel"
                ready_evt = asyncio.Event()
                ready_data = {}
                self._event_waiters[ready_key] = (ready_evt, ready_data)

                # Subscribe to all events
                await self._rpc_call(ws, "event.subscribe", {"events": [">"]})

                # Register Launcher itself in the Registry
                await self._rpc_call(ws, "registry.register", {
                    "module_id": "launcher",
                    "module_type": "infrastructure",
                    "events_publish": {
                        "module.started": {},
                        "module.stopped": {},
                        "module.state_changed": {},
                    },
                    "events_subscribe": [">"],
                })
                print("[launcher] 已注册到 Kernel")

                # Signal that connection is ready (after subscription and registration)
                if self._ws_connected:
                    self._ws_connected.set()

                # Wait for receiver task to complete (connection closed)
                await receiver_task
            except asyncio.CancelledError:
                # Propagate cancellation but make sure the receiver dies too.
                receiver_task.cancel()
                raise
|
+
|
|
620
|
+
async def _ws_receiver(self, ws):
|
|
621
|
+
"""Receive loop: classify incoming messages."""
|
|
622
|
+
try:
|
|
623
|
+
async for raw in ws:
|
|
624
|
+
try:
|
|
625
|
+
msg = json.loads(raw)
|
|
626
|
+
except (json.JSONDecodeError, TypeError):
|
|
627
|
+
continue
|
|
628
|
+
try:
|
|
629
|
+
has_method = "method" in msg
|
|
630
|
+
has_id = "id" in msg
|
|
631
|
+
has_result = "result" in msg
|
|
632
|
+
has_error = "error" in msg
|
|
633
|
+
|
|
634
|
+
if has_method and not has_id:
|
|
635
|
+
# Event Notification (no id)
|
|
636
|
+
await self._handle_event_notification(msg)
|
|
637
|
+
elif has_method and has_id:
|
|
638
|
+
# Incoming RPC request (forwarded by Kernel)
|
|
639
|
+
await self._handle_rpc_request(ws, msg)
|
|
640
|
+
elif has_id and (has_result or has_error):
|
|
641
|
+
# RPC response (to our own call)
|
|
642
|
+
self._handle_rpc_response(msg)
|
|
643
|
+
except Exception as e:
|
|
644
|
+
print(f"[launcher] 消息处理异常(已忽略): {e}")
|
|
645
|
+
except asyncio.CancelledError:
|
|
646
|
+
pass
|
|
647
|
+
|
|
648
|
+
# ── JSON-RPC 2.0 infrastructure ──
|
|
649
|
+
|
|
650
|
+
async def _rpc_call(self, ws, method: str, params: dict = None, timeout: float = 5) -> dict:
|
|
651
|
+
"""Send a JSON-RPC 2.0 request and await the response."""
|
|
652
|
+
rpc_id = str(uuid.uuid4())
|
|
653
|
+
msg = {"jsonrpc": "2.0", "id": rpc_id, "method": method}
|
|
654
|
+
if params:
|
|
655
|
+
msg["params"] = params
|
|
656
|
+
|
|
657
|
+
evt = asyncio.Event()
|
|
658
|
+
self._rpc_waiters[rpc_id] = evt
|
|
659
|
+
self._rpc_results[rpc_id] = {}
|
|
660
|
+
|
|
661
|
+
try:
|
|
662
|
+
await ws.send(json.dumps(msg))
|
|
663
|
+
await asyncio.wait_for(evt.wait(), timeout=timeout)
|
|
664
|
+
return self._rpc_results.get(rpc_id, {})
|
|
665
|
+
except asyncio.TimeoutError:
|
|
666
|
+
print(f"[launcher] RPC 超时: {method}")
|
|
667
|
+
return {"error": {"code": -32002, "message": f"RPC timeout: {method}"}}
|
|
668
|
+
finally:
|
|
669
|
+
self._rpc_waiters.pop(rpc_id, None)
|
|
670
|
+
self._rpc_results.pop(rpc_id, None)
|
|
671
|
+
|
|
672
|
+
def _handle_rpc_response(self, msg: dict):
|
|
673
|
+
"""Match an incoming RPC response to a pending waiter."""
|
|
674
|
+
rpc_id = msg.get("id", "")
|
|
675
|
+
waiter = self._rpc_waiters.get(rpc_id)
|
|
676
|
+
if waiter:
|
|
677
|
+
self._rpc_results[rpc_id] = msg
|
|
678
|
+
waiter.set()
|
|
679
|
+
|
|
680
|
+
async def _handle_event_notification(self, msg: dict):
    """Handle an event notification (JSON-RPC 2.0 Notification, method='event').

    Responsibilities, in order:
      1. Wake any registered waiter for "<event>:<module_id>".
      2. Treat module.exiting as terminating a pending module.ready wait
         (marks the waiter data with "_exited").
      3. Print a red crash summary on module.crash.
      4. Echo system events (module.*, watchdog.*) to the console with
         delivery latency; everything else is dropped to avoid flooding.
    """
    params = msg.get("params", {})
    source = params.get("source", "unknown")
    event = params.get("event", "")
    # Defensive: only accept dict payloads; anything else becomes {}.
    data = params.get("data") if isinstance(params.get("data"), dict) else {}
    ts = params.get("timestamp", "")

    # Trigger event waiters keyed by "<event>:<module_id>".
    module_id = data.get("module_id", "")
    waiter_key = f"{event}:{module_id}"
    waiter = self._event_waiters.get(waiter_key)
    if waiter:
        # waiter is (asyncio.Event, dict): fill the shared dict, then wake.
        waiter[1].update(data)
        waiter[0].set()

    # module.exiting also wakes a pending module.ready waiter, so a start
    # sequence is not left waiting for a module that decided to quit.
    if event == "module.exiting" and module_id:
        ready_key = f"module.ready:{module_id}"
        ready_waiter = self._event_waiters.get(ready_key)
        if ready_waiter:
            ready_waiter[1].update(data)
            ready_waiter[1]["_exited"] = True
            ready_waiter[0].set()

    # module.crash → print a red crash summary plus the crash-log path.
    if event == "module.crash" and module_id:
        RED = "\033[91m"
        RESET = "\033[0m"
        exc_type = data.get("exception_type", "Unknown")
        preview = data.get("traceback_preview", "")
        print(f"[launcher] {RED}模块 '{module_id}' 崩溃: {exc_type} — {preview}{RESET}")
        _suffix = os.environ.get("KITE_INSTANCE_SUFFIX", "")
        crash_log = os.path.join(
            os.environ.get("KITE_INSTANCE_DIR", ""),
            module_id, "log", f"crashes{_suffix}.jsonl"
        )
        print(f"[launcher] 崩溃日志: {crash_log}")

    # Only log system events (module.*, watchdog.*) to avoid flooding.
    if not (event.startswith("module.") or event.startswith("watchdog.")):
        return
    latency_str = ""
    if ts:
        try:
            from datetime import datetime, timezone
            sent = datetime.fromisoformat(ts)
            # Assumes the timestamp is a timezone-aware ISO-8601 string; a
            # naive one makes the aware/naive subtraction raise, and we fall
            # back to the raw slice below — TODO confirm producer format.
            delay_ms = (datetime.now(timezone.utc) - sent).total_seconds() * 1000
            latency_str = f" ({delay_ms:.1f}ms)"
            local_ts = sent.astimezone().strftime("%H:%M:%S")
        except Exception:
            # Fallback: slice HH:MM:SS out of the ISO string if long enough.
            local_ts = ts[11:19] if len(ts) >= 19 else ts
        print(f"[{source}] {local_ts} {event}{latency_str}: {json.dumps(data, ensure_ascii=False)}")
    else:
        print(f"[{source}] {event}: {json.dumps(data, ensure_ascii=False)}")
|
|
735
|
+
|
|
736
|
+
async def _handle_rpc_request(self, ws, msg: dict):
|
|
737
|
+
"""Handle an incoming RPC request forwarded by Kernel (launcher.* methods)."""
|
|
738
|
+
rpc_id = msg.get("id", "")
|
|
739
|
+
method = msg.get("method", "")
|
|
740
|
+
params = msg.get("params", {})
|
|
741
|
+
|
|
742
|
+
handlers = {
|
|
743
|
+
"list_modules": self._rpc_list_modules,
|
|
744
|
+
"start_module": self._rpc_start_module,
|
|
745
|
+
"stop_module": self._rpc_stop_module,
|
|
746
|
+
"restart_module": self._rpc_restart_module,
|
|
747
|
+
"rescan": self._rpc_rescan,
|
|
748
|
+
"shutdown": self._rpc_shutdown,
|
|
749
|
+
}
|
|
750
|
+
handler = handlers.get(method)
|
|
751
|
+
if handler:
|
|
752
|
+
try:
|
|
753
|
+
result = await handler(params)
|
|
754
|
+
await ws.send(json.dumps({"jsonrpc": "2.0", "id": rpc_id, "result": result}))
|
|
755
|
+
except Exception as e:
|
|
756
|
+
await ws.send(json.dumps({
|
|
757
|
+
"jsonrpc": "2.0", "id": rpc_id,
|
|
758
|
+
"error": {"code": -32603, "message": str(e)},
|
|
759
|
+
}))
|
|
760
|
+
else:
|
|
761
|
+
await ws.send(json.dumps({
|
|
762
|
+
"jsonrpc": "2.0", "id": rpc_id,
|
|
763
|
+
"error": {"code": -32601, "message": f"Method not found: {method}"},
|
|
764
|
+
}))
|
|
765
|
+
|
|
766
|
+
# ── Launcher RPC method handlers ──
|
|
767
|
+
|
|
768
|
+
async def _rpc_list_modules(self, params: dict) -> dict:
|
|
769
|
+
"""List all modules and their current status."""
|
|
770
|
+
result = []
|
|
771
|
+
for name, info in self.modules.items():
|
|
772
|
+
running = self.process_manager.is_running(name)
|
|
773
|
+
rec = self.process_manager.get_record(name)
|
|
774
|
+
result.append({
|
|
775
|
+
"name": name,
|
|
776
|
+
"display_name": info.display_name,
|
|
777
|
+
"type": info.type,
|
|
778
|
+
"config_state": info.state,
|
|
779
|
+
"desired_state": self._desired_states.get(name, "stopped"),
|
|
780
|
+
"actual_state": f"running({rec.pid})" if running and rec else "stopped",
|
|
781
|
+
"pid": rec.pid if running and rec else None,
|
|
782
|
+
"monitor": info.monitor,
|
|
783
|
+
})
|
|
784
|
+
return {"modules": result}
|
|
785
|
+
|
|
786
|
+
async def _rpc_start_module(self, params: dict) -> dict:
    """RPC handler: start a module by name.

    Mints and registers an auth token on first start, spawns the process,
    records the desired state and publishes module.started.

    Raises:
        RuntimeError: unknown module, disabled module, or start failure
        (surfaced to the caller as JSON-RPC error -32603).
    """
    name = params.get("name", "")
    info = self.modules.get(name)
    if not info:
        raise RuntimeError(f"Module '{name}' not found")
    if info.state == "disabled":
        raise RuntimeError(f"Module '{name}' is disabled")

    # First start: mint a token and push it to Kernel BEFORE the module
    # tries to authenticate with it.
    if name not in self._module_tokens:
        self._module_tokens[name] = secrets.token_hex(32)
        await self._register_new_tokens({name: self._module_tokens[name]})

    token = self._module_tokens[name]
    boot_info = {"token": token}
    ok = self.process_manager.start_module(info, boot_info=boot_info)
    if ok:
        self._desired_states[name] = "running"
        # Persist immediately so the new PID survives a launcher crash.
        self.process_manager.persist_records()
        rec = self.process_manager.get_record(name)
        self._log_lifecycle("started", name, pid=rec.pid if rec else None, via="rpc")
        await self._publish_event("module.started", {"module_id": name})
        return {"status": "started", "name": name}
    self._log_lifecycle("start_failed", name, via="rpc")
    raise RuntimeError(f"Failed to start '{name}'")
|
|
811
|
+
|
|
812
|
+
async def _rpc_stop_module(self, params: dict) -> dict:
|
|
813
|
+
"""Stop a module with graceful shutdown."""
|
|
814
|
+
name = params.get("name", "")
|
|
815
|
+
info = self.modules.get(name)
|
|
816
|
+
if not info:
|
|
817
|
+
raise RuntimeError(f"Module '{name}' not found")
|
|
818
|
+
reason = params.get("reason", "stop_requested")
|
|
819
|
+
self._desired_states[name] = "stopped"
|
|
820
|
+
await self._graceful_stop(name, reason)
|
|
821
|
+
self.process_manager.persist_records()
|
|
822
|
+
return {"status": "stopped", "name": name}
|
|
823
|
+
|
|
824
|
+
async def _rpc_restart_module(self, params: dict) -> dict:
    """RPC handler: restart a module (graceful stop → token rotation → start).

    Unlike _rpc_start_module, the auth token is ALWAYS regenerated, so the
    old process's credentials cannot be replayed.
    NOTE(review): the start half duplicates _rpc_start_module almost
    verbatim — candidate for a shared helper.

    Raises:
        RuntimeError: unknown/disabled module or start failure.
    """
    name = params.get("name", "")
    info = self.modules.get(name)
    if not info:
        raise RuntimeError(f"Module '{name}' not found")
    if info.state == "disabled":
        raise RuntimeError(f"Module '{name}' is disabled")
    reason = params.get("reason", "restart")
    await self._graceful_stop(name, reason)
    # Rotate the token and push it to Kernel before the new process starts.
    self._module_tokens[name] = secrets.token_hex(32)
    await self._register_new_tokens({name: self._module_tokens[name]})
    token = self._module_tokens[name]
    boot_info = {"token": token}
    ok = self.process_manager.start_module(info, boot_info=boot_info)
    if ok:
        self._desired_states[name] = "running"
        self.process_manager.persist_records()
        rec = self.process_manager.get_record(name)
        self._log_lifecycle("started", name, pid=rec.pid if rec else None, via="rpc_restart")
        await self._publish_event("module.started", {"module_id": name})
        return {"status": "restarted", "name": name}
    self._log_lifecycle("start_failed", name, via="rpc_restart")
    raise RuntimeError(f"Failed to restart '{name}'")
|
|
848
|
+
|
|
849
|
+
async def _rpc_rescan(self, params: dict) -> dict:
    """RPC handler: rescan module directories for added/removed modules.

    Newly discovered modules get a desired state derived from their config
    state plus a freshly minted token registered with Kernel.  Removed
    modules are only reported — their processes are not touched here.

    Returns:
        {"added": [...], "removed": [...], "total": <module count>}
    """
    old_names = set(self.modules.keys())
    self.modules = self.module_scanner.scan()
    new_names = set(self.modules.keys())
    added = list(new_names - old_names)
    removed = list(old_names - new_names)
    for name in added:
        info = self.modules[name]
        self._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
        # Enabled modules are expected to run; everything else stays stopped.
        self._desired_states[name] = "running" if info.state == "enabled" else "stopped"
    if added:
        # Mint tokens for the newcomers and register them with Kernel in one RPC.
        new_tokens = {}
        for name in added:
            self._module_tokens[name] = secrets.token_hex(32)
            new_tokens[name] = self._module_tokens[name]
        await self._register_new_tokens(new_tokens)
    return {"added": added, "removed": removed, "total": len(self.modules)}
|
|
867
|
+
|
|
868
|
+
async def _rpc_shutdown(self, params: dict) -> dict:
|
|
869
|
+
"""Shutdown the entire Kite system."""
|
|
870
|
+
reason = params.get("reason", "rpc_request")
|
|
871
|
+
self._request_shutdown(f"RPC shutdown request: {reason}")
|
|
872
|
+
return {"status": "shutting_down", "reason": reason}
|
|
873
|
+
|
|
874
|
+
# ── Event publishing via RPC ──
|
|
875
|
+
|
|
876
|
+
async def _publish_event(self, event_type: str, data: dict):
|
|
877
|
+
"""Publish an event via RPC event.publish through Kernel WS."""
|
|
878
|
+
if not self._ws:
|
|
879
|
+
return
|
|
880
|
+
msg = json.dumps({
|
|
881
|
+
"jsonrpc": "2.0",
|
|
882
|
+
"id": str(uuid.uuid4()),
|
|
883
|
+
"method": "event.publish",
|
|
884
|
+
"params": {
|
|
885
|
+
"event_id": str(uuid.uuid4()),
|
|
886
|
+
"event": event_type,
|
|
887
|
+
"data": data,
|
|
888
|
+
},
|
|
889
|
+
})
|
|
890
|
+
|
|
891
|
+
async def _send():
|
|
892
|
+
try:
|
|
893
|
+
await self._ws.send(msg)
|
|
894
|
+
except Exception as e:
|
|
895
|
+
print(f"[launcher] 发布事件失败: {e}")
|
|
896
|
+
|
|
897
|
+
asyncio.create_task(_send())
|
|
898
|
+
|
|
899
|
+
async def _wait_event(self, event_type: str, module_id: str, timeout: float) -> dict | None:
|
|
900
|
+
"""Wait for a specific event from a module. Returns data dict or None on timeout."""
|
|
901
|
+
key = f"{event_type}:{module_id}"
|
|
902
|
+
evt = asyncio.Event()
|
|
903
|
+
data = {}
|
|
904
|
+
self._event_waiters[key] = (evt, data)
|
|
905
|
+
try:
|
|
906
|
+
await asyncio.wait_for(evt.wait(), timeout=timeout)
|
|
907
|
+
return data
|
|
908
|
+
except asyncio.TimeoutError:
|
|
909
|
+
return None
|
|
910
|
+
finally:
|
|
911
|
+
self._event_waiters.pop(key, None)
|
|
912
|
+
|
|
913
|
+
async def _graceful_stop(self, name: str, reason: str = "stop_requested", timeout: float = 10):
    """Gracefully stop one module.

    Protocol: check capability → publish module.shutdown → wait for
    module.shutdown.ack (3s) → wait for module.shutdown.ready (module's own
    estimate, capped at *timeout*) → terminate the process → publish
    module.stopped.

    Modules that did not declare graceful_shutdown in module.ready are
    terminated directly.

    Args:
        name: module id known to the process manager.
        reason: free-text reason forwarded to the module and lifecycle log.
        timeout: upper bound (seconds) on the module's own cleanup phase.
    """
    self._log_lifecycle("stopping", name, reason=reason)

    # Fast path: module never declared graceful shutdown → just terminate.
    if not self._graceful_modules.get(name):
        self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
        self._log_lifecycle("stopped", name, reason=reason)
        await self._publish_event("module.stopped", {
            "module_id": name,
            "graceful_shutdown": False,
        })
        return

    # Register BOTH waiters BEFORE publishing the shutdown event, so neither
    # the ack nor the ready event can slip past us.
    ack_key = f"module.shutdown.ack:{name}"
    ack_evt = asyncio.Event()
    ack_data = {}
    self._event_waiters[ack_key] = (ack_evt, ack_data)

    ready_key = f"module.shutdown.ready:{name}"
    ready_evt = asyncio.Event()
    ready_data = {}
    self._event_waiters[ready_key] = (ready_evt, ready_data)

    await self._publish_event("module.shutdown", {
        "module_id": name, "reason": reason, "timeout": timeout,
    })

    # Wait up to 3s for the module to acknowledge the shutdown request.
    try:
        await asyncio.wait_for(ack_evt.wait(), timeout=3)
        ack = ack_data
    except asyncio.TimeoutError:
        ack = None
    finally:
        self._event_waiters.pop(ack_key, None)

    # NOTE(review): an ack event carrying an empty data dict is falsy and is
    # handled like a timeout — confirm that is intended.
    if not ack:
        self._event_waiters.pop(ready_key, None)
        self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
        await self._publish_event("module.stopped", {
            "module_id": name,
            "graceful_shutdown": self._graceful_modules.get(name, False),
        })
        return

    # Cap the cleanup wait at the module's own estimate, never above timeout.
    estimated = min(ack.get("estimated_cleanup", timeout), timeout)

    # Wait for the module to report its cleanup finished.
    try:
        await asyncio.wait_for(ready_evt.wait(), timeout=estimated)
        ready = ready_data
    except asyncio.TimeoutError:
        ready = None
    finally:
        self._event_waiters.pop(ready_key, None)
    if ready:
        # Clean handshake completed → short terminate grace.
        self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_READY)
    else:
        # Acked but never reported ready → allow a longer terminate window.
        self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)

    self._log_lifecycle("stopped", name, reason=reason)
    await self._publish_event("module.stopped", {
        "module_id": name,
        "graceful_shutdown": self._graceful_modules.get(name, False),
    })
|
|
981
|
+
|
|
982
|
+
async def _graceful_shutdown_all(self):
    """Shut down every running module, Kernel last.

    Order:
      1. Send module.shutdown to graceful modules (excl. Kernel) — they
         start cleanup immediately.
      2. Terminate non-graceful modules (fast; overlaps with 1's cleanup).
      3. Wait (≤5s, polling 1s) for graceful modules to exit on their own,
         then force-kill survivors.
      4. Shut down Kernel last so event routing stays alive throughout.
    """
    self._system_shutting_down = True
    running = [n for n in self.modules if self.process_manager.is_running(n)]
    # Core modules may be tracked outside self.modules; include them too.
    for cn in CORE_MODULE_NAMES:
        if self.process_manager.is_running(cn) and cn not in running:
            running.append(cn)
    if not running:
        print("[launcher] 没有运行中的模块需要关闭")
        return

    graceful = [n for n in running if self._graceful_modules.get(n)]
    non_graceful = [n for n in running if not self._graceful_modules.get(n)]

    # Defer Kernel — it must stay alive to route the shutdown events below.
    kernel_deferred = "kernel" in graceful
    graceful_batch = [n for n in graceful if n != "kernel"] if kernel_deferred else graceful

    print(f"[launcher] 正在关闭 {len(running)} 个模块: {', '.join(running)}")

    # Phase 1: notify graceful modules first (they begin cleanup immediately).
    for name in graceful_batch:
        self._log_lifecycle("stopping", name, reason="system_shutdown")
        await self._publish_event("module.shutdown", {
            "module_id": name, "reason": "system_shutdown", "timeout": 5,
        })

    # Phase 2: while graceful modules clean up, terminate non-graceful ones.
    if non_graceful:
        print(f"[launcher] 直接终止 {len(non_graceful)} 个不支持优雅退出的模块: {', '.join(non_graceful)}")
        for name in non_graceful:
            self._log_lifecycle("stopping", name, reason="system_shutdown")
            self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
            self._log_lifecycle("stopped", name, reason="system_shutdown")

    # Phase 3: poll once per second (≤5s) for graceful modules to exit.
    if graceful_batch:
        deadline = time.time() + 5
        while time.time() < deadline:
            still_running = [n for n in graceful_batch if self.process_manager.is_running(n)]
            if not still_running:
                print("[launcher] 所有优雅退出模块已自行退出")
                break
            remaining = max(0, deadline - time.time())
            print(f"[launcher] 等待 {len(still_running)} 个模块退出 ({remaining:.0f}s): {', '.join(still_running)}")
            await asyncio.sleep(1)
        # Force kill survivors; log "stopped" for every graceful module.
        for name in graceful_batch:
            if self.process_manager.is_running(name):
                self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
            self._log_lifecycle("stopped", name, reason="system_shutdown")

    # Phase 4: all other modules exited — now take down Kernel itself.
    if kernel_deferred and self.process_manager.is_running("kernel"):
        self._log_lifecycle("stopping", "kernel", reason="system_shutdown")
        print("[launcher] 正在关闭 Kernel...")

        # Kernel is stopped via the kernel.shutdown RPC (not an event).
        rpc_sent = False
        try:
            if self._ws:
                await self._rpc_call(self._ws, "kernel.shutdown", {})
                print("[launcher] Kernel shutdown RPC 已发送")
                rpc_sent = True
            else:
                print("[launcher] WebSocket 未连接,跳过 RPC 调用")
        except Exception as e:
            print(f"[launcher] Kernel shutdown RPC 失败: {e}")

        # Wait for the kernel process to exit.
        if rpc_sent:
            # RPC delivered: allow up to 5s for a clean exit.
            proc = self.process_manager._processes.get("kernel")
            if proc:
                try:
                    loop = asyncio.get_event_loop()
                    # proc.wait() blocks, so run it in the default executor.
                    await asyncio.wait_for(
                        loop.run_in_executor(None, proc.wait),
                        timeout=5
                    )
                    print("[launcher] Kernel 已退出")
                except asyncio.TimeoutError:
                    print("[launcher] Kernel 5秒内未退出,强制停止")
                    self.process_manager.stop_module("kernel", timeout=SHUTDOWN_TIMEOUT_PARTIAL)
        else:
            # No RPC (WS not connected): terminate with a short timeout.
            self.process_manager.stop_module("kernel", timeout=2)

        self._log_lifecycle("stopped", "kernel", reason="system_shutdown")

    # Final safety net: sweep anything still alive.
    try:
        self.process_manager.stop_all(timeout=SHUTDOWN_TIMEOUT_BULK)
    except Exception as e:
        print(f"[launcher] stop_all 出错: {e}")
|
|
1083
|
+
|
|
1084
|
+
# ── Module startup ──
|
|
1085
|
+
|
|
1086
|
+
def _topo_sort(self, modules: list[ModuleInfo]) -> list[ModuleInfo]:
|
|
1087
|
+
"""Topological sort by depends_on. Raises RuntimeError on cycle."""
|
|
1088
|
+
name_map = {m.name: m for m in modules}
|
|
1089
|
+
visited = set()
|
|
1090
|
+
in_stack = set()
|
|
1091
|
+
order = []
|
|
1092
|
+
|
|
1093
|
+
def visit(name):
|
|
1094
|
+
if name in in_stack:
|
|
1095
|
+
raise RuntimeError(f"Circular dependency detected involving '{name}'")
|
|
1096
|
+
if name in visited:
|
|
1097
|
+
return
|
|
1098
|
+
in_stack.add(name)
|
|
1099
|
+
info = name_map.get(name)
|
|
1100
|
+
if info:
|
|
1101
|
+
for dep in info.depends_on:
|
|
1102
|
+
visit(dep)
|
|
1103
|
+
in_stack.remove(name)
|
|
1104
|
+
visited.add(name)
|
|
1105
|
+
if info:
|
|
1106
|
+
order.append(info)
|
|
1107
|
+
|
|
1108
|
+
for m in modules:
|
|
1109
|
+
visit(m.name)
|
|
1110
|
+
return order
|
|
1111
|
+
|
|
1112
|
+
def _topo_layers(self, modules: list[ModuleInfo]) -> list[list[ModuleInfo]]:
|
|
1113
|
+
"""Topological sort into layers. Modules in the same layer have no
|
|
1114
|
+
inter-dependencies and can be started in parallel."""
|
|
1115
|
+
name_map = {m.name: m for m in modules}
|
|
1116
|
+
all_names = set(name_map.keys())
|
|
1117
|
+
|
|
1118
|
+
# Compute depth (longest path from root) for each module
|
|
1119
|
+
depth: dict[str, int] = {}
|
|
1120
|
+
in_stack: set[str] = set()
|
|
1121
|
+
|
|
1122
|
+
def get_depth(name: str) -> int:
|
|
1123
|
+
if name in depth:
|
|
1124
|
+
return depth[name]
|
|
1125
|
+
if name in in_stack:
|
|
1126
|
+
raise RuntimeError(f"Circular dependency detected involving '{name}'")
|
|
1127
|
+
in_stack.add(name)
|
|
1128
|
+
info = name_map.get(name)
|
|
1129
|
+
d = 0
|
|
1130
|
+
if info:
|
|
1131
|
+
for dep in info.depends_on:
|
|
1132
|
+
if dep in all_names:
|
|
1133
|
+
d = max(d, get_depth(dep) + 1)
|
|
1134
|
+
in_stack.remove(name)
|
|
1135
|
+
depth[name] = d
|
|
1136
|
+
return d
|
|
1137
|
+
|
|
1138
|
+
for name in all_names:
|
|
1139
|
+
get_depth(name)
|
|
1140
|
+
|
|
1141
|
+
# Group by depth
|
|
1142
|
+
max_depth = max(depth.values()) if depth else 0
|
|
1143
|
+
layers: list[list[ModuleInfo]] = [[] for _ in range(max_depth + 1)]
|
|
1144
|
+
for name, d in depth.items():
|
|
1145
|
+
layers[d].append(name_map[name])
|
|
1146
|
+
return layers
|
|
1147
|
+
|
|
1148
|
+
async def _start_one_module(self, info: ModuleInfo):
    """Start a single module and wait for it to come up.

    Sequence: publish module.starting → spawn process → pipe kernel_port via
    stdin → wait for module.ready (or module.exiting) → publish
    module.started → close the child's stdio.
    """
    self._log_lifecycle("starting", info.name)
    await self._publish_event("module.starting", {"module_id": info.name})

    token = self._module_tokens.get(info.name, "")
    boot_info = {"token": token}
    t0 = time.monotonic()
    ok = self.process_manager.start_module(info, boot_info=boot_info)
    if not ok:
        self._log_lifecycle("start_failed", info.name)
        return

    # Register the waiter BEFORE sending kernel_port, so the module cannot
    # connect and emit module.ready before the waiter exists.
    ready_key = f"module.ready:{info.name}"
    ready_evt = asyncio.Event()
    ready_data = {}
    self._event_waiters[ready_key] = (ready_evt, ready_data)

    # Send kernel_port via stdin so the module can connect to Kernel WS.
    self.process_manager.write_stdin(info.name, {
        "kite": "kernel_port",
        "kernel_port": self.kernel_port,
    })

    # Persist immediately after starting so the PID is recorded.
    self.process_manager.persist_records()

    # Wait for module.ready or module.exiting, whichever comes first
    # (_handle_event_notification sets "_exited" for the latter).
    timeout = info.launch.timeout
    try:
        await asyncio.wait_for(ready_evt.wait(), timeout=timeout)
        ready = ready_data
    except asyncio.TimeoutError:
        ready = None
    finally:
        self._event_waiters.pop(ready_key, None)

    elapsed = time.monotonic() - t0
    if ready and ready.get("_exited"):
        # Module sent module.exiting before ready — it chose to quit.
        reason = ready.get("reason", "unknown")
        self._exit_reasons[info.name] = reason
        print(f"[launcher] 模块 '{info.name}' 主动退出: {reason} ({elapsed:.2f}s)")
    elif ready:
        # Remember whether this module supports the graceful-shutdown
        # handshake; _graceful_stop consults this later.
        self._graceful_modules[info.name] = bool(ready.get("graceful_shutdown"))
        self._ready_times[info.name] = elapsed
        print(f"[launcher] 模块 '{info.name}' 已就绪 ({elapsed:.2f}s)")
    else:
        print(f"\033[91m[launcher] 警告: '{info.name}' 在 {timeout}s 内未发送 module.ready\033[0m")

    # NOTE(review): module.started is published (and stdio closed) even when
    # the module timed out or exited early — confirm this is intended.
    rec = self.process_manager.get_record(info.name)
    self._log_lifecycle("started", info.name, pid=rec.pid if rec else None)
    await self._publish_event("module.started", {"module_id": info.name})
    self.process_manager.close_stdio(info.name)
|
|
1204
|
+
|
|
1205
|
+
async def _register_module_tokens(self):
    """Generate per-module tokens and register the mapping to Kernel via RPC.

    NOTE(review): the body consists solely of this docstring and a comment —
    it looks like a vestigial stub superseded by _generate_module_tokens /
    _register_new_tokens.  Confirm there are no callers and remove if dead.
    """
    # Include all scanned modules
|
|
1208
|
+
async def _generate_module_tokens(self):
    """Ask Kernel to mint auth tokens for all scanned modules via RPC.

    Only modules without an existing token are requested.  Requires the
    WebSocket session to be established (waits up to 5s on _ws_connected);
    all failure modes are logged and swallowed — callers get no error.
    """
    # Collect module names that still need tokens.
    module_names = [name for name in self.modules if name not in self._module_tokens]

    if not module_names:
        return

    # Wait for the WebSocket connection to be ready.
    if self._ws_connected:
        try:
            await asyncio.wait_for(self._ws_connected.wait(), timeout=5)
        except asyncio.TimeoutError:
            print(f"[launcher] 警告: WebSocket 未就绪,无法生成令牌")
            return
    else:
        print(f"[launcher] 警告: _ws_connected 未初始化")
        return

    # Call Kernel RPC to generate the tokens and merge them locally.
    try:
        result = await self._rpc_call(self._ws, "kernel.generate_tokens", {"modules": module_names})
        if result.get("result", {}).get("ok"):
            tokens = result["result"].get("tokens", {})
            self._module_tokens.update(tokens)
            print(f"[launcher] Kernel 已生成 {len(tokens)} 个模块令牌")
        elif "error" in result:
            print(f"[launcher] 警告: 令牌生成失败: {result['error'].get('message', '')}")
    except Exception as e:
        print(f"[launcher] 警告: 生成模块令牌失败: {e}")
|
|
1238
|
+
|
|
1239
|
+
async def _register_new_tokens(self, tokens: dict):
|
|
1240
|
+
"""Register new token mapping to Kernel via RPC kernel.register_tokens."""
|
|
1241
|
+
if not self._ws or not tokens:
|
|
1242
|
+
return
|
|
1243
|
+
try:
|
|
1244
|
+
result = await self._rpc_call(self._ws, "kernel.register_tokens", tokens)
|
|
1245
|
+
if result.get("result", {}).get("ok"):
|
|
1246
|
+
print(f"[launcher] 已注册 {len(tokens)} 个模块令牌")
|
|
1247
|
+
elif "error" in result:
|
|
1248
|
+
print(f"[launcher] 警告: 令牌注册失败: {result['error'].get('message', '')}")
|
|
1249
|
+
except Exception as e:
|
|
1250
|
+
print(f"[launcher] 警告: 注册模块令牌失败: {e}")
|
|
1251
|
+
|
|
1252
|
+
# ── Validation ──
|
|
1253
|
+
|
|
1254
|
+
def _validate_core_modules(self):
    """Abort the process (exit code 1) unless the 'kernel' core module is valid.

    Checks that <KITE_PROJECT>/kernel exists, contains a module.md, and that
    the module.md carries parseable frontmatter.  Raises KeyError if the
    KITE_PROJECT environment variable is unset.
    """
    project_root = os.environ["KITE_PROJECT"]
    mod_dir = os.path.join(project_root, "kernel")
    md_path = os.path.join(mod_dir, "module.md")
    if not os.path.isdir(mod_dir):
        print(f"[launcher] 致命: 核心模块 'kernel' 目录未找到: {mod_dir}")
        sys.exit(1)
    if not os.path.isfile(md_path):
        print(f"[launcher] 致命: 核心模块 'kernel' 缺少 module.md: {md_path}")
        sys.exit(1)
    try:
        with open(md_path, "r", encoding="utf-8") as f:
            fm = _parse_frontmatter(f.read())
            # SystemExit is not an Exception subclass, so this exit is not
            # swallowed by the handler below.
            if not fm:
                print(f"[launcher] 致命: 核心模块 'kernel' module.md 没有有效的 frontmatter")
                sys.exit(1)
    except Exception as e:
        print(f"[launcher] 致命: 核心模块 'kernel' module.md 解析错误: {e}")
        sys.exit(1)
|
|
1274
|
+
|
|
1275
|
+
# ── Module crash summary ──
|
|
1276
|
+
|
|
1277
|
+
def _print_module_crash_summary(self, name: str):
|
|
1278
|
+
"""Read module's crashes.jsonl last record and print red summary to console.
|
|
1279
|
+
Complement to module.crash event — reliable even if event was never sent."""
|
|
1280
|
+
RED = "\033[91m"
|
|
1281
|
+
RESET = "\033[0m"
|
|
1282
|
+
_suffix = os.environ.get("KITE_INSTANCE_SUFFIX", "")
|
|
1283
|
+
crash_log = os.path.join(
|
|
1284
|
+
os.environ.get("KITE_INSTANCE_DIR", ""), name, "log", f"crashes{_suffix}.jsonl"
|
|
1285
|
+
)
|
|
1286
|
+
if not os.path.isfile(crash_log):
|
|
1287
|
+
return
|
|
1288
|
+
try:
|
|
1289
|
+
with open(crash_log, "rb") as f:
|
|
1290
|
+
f.seek(0, 2)
|
|
1291
|
+
size = f.tell()
|
|
1292
|
+
if size == 0:
|
|
1293
|
+
return
|
|
1294
|
+
f.seek(max(0, size - 4096))
|
|
1295
|
+
lines = f.read().decode("utf-8").strip().split("\n")
|
|
1296
|
+
last = json.loads(lines[-1])
|
|
1297
|
+
exc_type = last.get("exception_type", "Unknown")
|
|
1298
|
+
ctx = last.get("context", {})
|
|
1299
|
+
file_name = ctx.get("file", "unknown")
|
|
1300
|
+
line_no = ctx.get("line", "?")
|
|
1301
|
+
print(f"[launcher] {RED}崩溃: "
|
|
1302
|
+
f"{exc_type} in {file_name}:{line_no}{RESET}")
|
|
1303
|
+
print(f"[launcher] 崩溃日志: {crash_log}")
|
|
1304
|
+
except Exception:
|
|
1305
|
+
pass
|
|
1306
|
+
|
|
1307
|
+
# ── Monitor loop ──
|
|
1308
|
+
|
|
1309
|
+
    async def _monitor_loop(self):
        """Check child processes every second. Handle crashes.

        Uses _shutdown_event (asyncio.Event) so Ctrl+C wakes us immediately.

        Responsibility split:
        - Core module crash → full restart (Launcher handles)
        - Watchdog crash → Launcher restarts directly (up to 3 times)
        - Other module exit → publish module.stopped event only; Watchdog decides restart
        """
        WATCHDOG_MAX_FAIL = 3
        watchdog_fail_count = 0

        while not self._shutdown_event.is_set():
            # check_exited() yields (name, return_code) for every child that
            # terminated since the previous poll.
            exited = self.process_manager.check_exited()

            for name, rc in exited:
                print(f"[launcher] 模块 '{name}' 退出,返回码 {rc}")
                if rc != 0:
                    # Non-zero exit: surface the last crashes.jsonl record.
                    self._print_module_crash_summary(name)
                self._log_lifecycle("exited", name, exit_code=rc)
                # Always announce the stop; subscribers (Watchdog) decide
                # whether a restart is warranted for non-core modules.
                await self._publish_event("module.stopped", {
                    "module_id": name, "exit_code": rc,
                    "graceful_shutdown": self._graceful_modules.get(name, False),
                })
                info = self.modules.get(name)

                # 1) Core module crash → full restart
                # _full_restart() re-enters _monitor_loop on success, so this
                # frame must return (not continue) to avoid two live loops.
                if name in CORE_MODULE_NAMES or (info and info.is_core()):
                    print(f"[launcher] 严重: 核心模块 '{name}' 崩溃,正在全部重启...")
                    self._log_lifecycle("core_crash", name, exit_code=rc)
                    await self._full_restart()
                    return

                # 2) Watchdog crash → Launcher restarts directly
                # (the Watchdog cannot restart itself)
                if name == WATCHDOG_MODULE_NAME:
                    if self._system_shutting_down:
                        print(f"[launcher] Watchdog 退出(系统关闭中),跳过重启")
                        continue
                    watchdog_fail_count += 1
                    # NOTE(review): the fail counter is never reset after a
                    # successful restart, so 3 crashes over the whole process
                    # lifetime exhaust the budget — confirm this is intended.
                    if watchdog_fail_count <= WATCHDOG_MAX_FAIL and info:
                        print(f"[launcher] Watchdog 崩溃,正在重启 (第 {watchdog_fail_count}/{WATCHDOG_MAX_FAIL} 次)...")
                        await self._start_one_module(info)
                    else:
                        self._desired_states[name] = "stopped"
                        self._log_lifecycle("failed", name, reason=f"exceeded {WATCHDOG_MAX_FAIL} retries")
                        print(f"[launcher] Watchdog 失败 {WATCHDOG_MAX_FAIL} 次,已放弃")
                    continue

                # 3) Other modules → event already published above; Watchdog decides restart
                # (no restart logic here — Watchdog handles it via module.stopped event)

            if exited:
                # Keep the on-disk records in sync for cleanup_leftovers().
                self.process_manager.persist_records()

            # Wait 1s but wake immediately on shutdown signal
            try:
                await asyncio.wait_for(self._shutdown_event.wait(), timeout=1)
                return  # shutdown requested
            except asyncio.TimeoutError:
                pass
|
1369
|
+
|
|
1370
|
+
    async def _full_restart(self):
        """Stop all modules, regenerate tokens, re-run Phase 1-2.

        Invoked by _monitor_loop when a core module crashes. On success this
        method re-enters _monitor_loop itself; the caller returns right after
        awaiting us, so monitoring continues inside this call frame.
        NOTE(review): each full restart adds one level of recursion depth —
        fine for rare core crashes, but worth confirming a crash loop cannot
        trigger this repeatedly.
        """
        print("[launcher] 全量重启: 正在停止所有模块...")

        # Persist records before shutdown so cleanup_leftovers can find survivors
        self.process_manager.persist_records()

        # Disconnect Kernel WS and drop all pending RPC bookkeeping — the old
        # Kernel process is about to be stopped.
        if self._ws_task:
            self._ws_task.cancel()
            self._ws_task = None
        self._ws = None
        self._rpc_waiters.clear()
        self._rpc_results.clear()

        await self._graceful_shutdown_all()

        # Cleanup any leftover processes that survived graceful shutdown.
        self.process_manager.cleanup_leftovers()

        self._module_tokens.clear()

        # Regenerate kite_token; the process manager hands it to new children.
        self.kite_token = secrets.token_hex(32)
        self.process_manager.kite_token = self.kite_token

        print("[launcher] 全量重启: 重新执行 Phase 1-2...")
        try:
            await self._phase1_start_kernel()
            await self._phase2_start_modules()
            self.process_manager.persist_records()
            print("[launcher] 全量重启完成,恢复监控循环")
            # Resume monitoring in this frame (see docstring).
            await self._monitor_loop()
        except Exception as e:
            # A failed restart is reported but not retried here.
            print(f"[launcher] 全量重启失败: {e}")
|
1405
|
+
|
|
1406
|
+
# ── Shutdown ──
|
|
1407
|
+
|
|
1408
|
+
def _final_cleanup(self):
|
|
1409
|
+
"""Called on exit — stop all processes, clear records."""
|
|
1410
|
+
try:
|
|
1411
|
+
print("[launcher] 正在执行最终清理...")
|
|
1412
|
+
|
|
1413
|
+
if self._ws_task:
|
|
1414
|
+
self._ws_task.cancel()
|
|
1415
|
+
|
|
1416
|
+
# Note: _graceful_shutdown_all() already called stop_all() in _async_main finally block.
|
|
1417
|
+
# This is just a safety check — should normally find nothing.
|
|
1418
|
+
remaining = [n for n in self.process_manager._processes
|
|
1419
|
+
if self.process_manager.is_running(n)]
|
|
1420
|
+
if remaining:
|
|
1421
|
+
print(f"[launcher] 警告: 仍有残留进程 (不应出现): {', '.join(remaining)}")
|
|
1422
|
+
self.process_manager.stop_all(timeout=SHUTDOWN_TIMEOUT_BULK)
|
|
1423
|
+
else:
|
|
1424
|
+
print("[launcher] 无残留进程")
|
|
1425
|
+
|
|
1426
|
+
# Clear instance runtime files
|
|
1427
|
+
try:
|
|
1428
|
+
os.remove(self.process_manager.records_path)
|
|
1429
|
+
except OSError:
|
|
1430
|
+
pass
|
|
1431
|
+
except Exception as e:
|
|
1432
|
+
print(f"[launcher] 最终清理出错: {e}")
|
|
1433
|
+
finally:
|
|
1434
|
+
# Signal the safety-net thread that normal shutdown has completed
|
|
1435
|
+
self._shutdown_complete.set()
|
|
1436
|
+
|
|
1437
|
+
# Calculate and display shutdown time
|
|
1438
|
+
if self._shutdown_start_time > 0:
|
|
1439
|
+
shutdown_elapsed = time.monotonic() - self._shutdown_start_time
|
|
1440
|
+
print(f"[launcher] 再见。(退出耗时: {shutdown_elapsed:.2f}s)")
|
|
1441
|
+
else:
|
|
1442
|
+
print("[launcher] 再见。")
|
|
1443
|
+
|
|
1444
|
+
if IS_WINDOWS:
|
|
1445
|
+
os._exit(0)
|
|
1446
|
+
|
|
1447
|
+
# ── Startup report ──
|
|
1448
|
+
|
|
1449
|
+
async def _print_startup_report(self, total_time: float, phase_times: dict[str, float], *,
|
|
1450
|
+
global_instances=None, cleaned_stats: dict[str, int] | None = None):
|
|
1451
|
+
"""Print a green startup summary with module list and timing."""
|
|
1452
|
+
G = "\033[32m" # green
|
|
1453
|
+
Y = "\033[33m" # yellow
|
|
1454
|
+
R = "\033[0m" # reset
|
|
1455
|
+
B = "\033[1;32m" # bold green
|
|
1456
|
+
|
|
1457
|
+
running = []
|
|
1458
|
+
exited = []
|
|
1459
|
+
stopped = []
|
|
1460
|
+
for name, info in self.modules.items():
|
|
1461
|
+
rec = self.process_manager.get_record(name)
|
|
1462
|
+
is_running = self.process_manager.is_running(name)
|
|
1463
|
+
if is_running and rec:
|
|
1464
|
+
running.append((name, info, rec))
|
|
1465
|
+
elif self._desired_states.get(name) == "running" and not is_running:
|
|
1466
|
+
# Was started but already exited (e.g. module.exiting)
|
|
1467
|
+
exited.append((name, info))
|
|
1468
|
+
else:
|
|
1469
|
+
stopped.append((name, info))
|
|
1470
|
+
|
|
1471
|
+
# Calculate kernel startup time (Phase 1)
|
|
1472
|
+
kernel_time = phase_times.get("Phase 1: Kernel", 0)
|
|
1473
|
+
|
|
1474
|
+
lines = [
|
|
1475
|
+
"",
|
|
1476
|
+
f"{B}{'=' * 60}",
|
|
1477
|
+
f" Kite 内核启动完成 耗时 {kernel_time:.2f}s",
|
|
1478
|
+
f" Kite 全部模块启动完成 总耗时 {total_time:.2f}s",
|
|
1479
|
+
f"{'=' * 60}{R}",
|
|
1480
|
+
]
|
|
1481
|
+
|
|
1482
|
+
# Phase breakdown
|
|
1483
|
+
lines.append(f"{G} 阶段耗时:{R}")
|
|
1484
|
+
|
|
1485
|
+
# Kernel modules section
|
|
1486
|
+
lines.append(f"{G} 内核模块:{R}")
|
|
1487
|
+
if "Phase 1: Kernel" in phase_times:
|
|
1488
|
+
elapsed = phase_times["Phase 1: Kernel"]
|
|
1489
|
+
lines.append(f"{G} {'Phase 1: Kernel':<26s} {elapsed:>6.2f}s{R}")
|
|
1490
|
+
|
|
1491
|
+
# Extension modules section
|
|
1492
|
+
lines.append(f"{G} 扩展模块:{R}")
|
|
1493
|
+
if "Phase 2: Extensions" in phase_times:
|
|
1494
|
+
elapsed = phase_times["Phase 2: Extensions"]
|
|
1495
|
+
lines.append(f"{G} {'Phase 2: Extensions':<26s} {elapsed:>6.2f}s{R}")
|
|
1496
|
+
|
|
1497
|
+
# Sort running modules by ready time
|
|
1498
|
+
running_sorted = sorted(running, key=lambda x: self._ready_times.get(x[0], float('inf')))
|
|
1499
|
+
|
|
1500
|
+
# Running modules with ready time and elapsed from Kite start
|
|
1501
|
+
DIM = "\033[90m"
|
|
1502
|
+
lines.append(f"{G} 运行中 ({len(running)}):{R}")
|
|
1503
|
+
|
|
1504
|
+
# CJK-aware display width helpers
|
|
1505
|
+
def _dw(s):
|
|
1506
|
+
"""Display width: CJK chars count as 2, others as 1."""
|
|
1507
|
+
w = 0
|
|
1508
|
+
for c in str(s):
|
|
1509
|
+
w += 2 if '\u4e00' <= c <= '\u9fff' or '\u3000' <= c <= '\u303f' or '\uff00' <= c <= '\uffef' else 1
|
|
1510
|
+
return w
|
|
1511
|
+
|
|
1512
|
+
def _rpad(s, width):
|
|
1513
|
+
"""Left-align s in a field of given display width."""
|
|
1514
|
+
return str(s) + ' ' * max(0, width - _dw(s))
|
|
1515
|
+
|
|
1516
|
+
def _lpad(s, width):
|
|
1517
|
+
"""Right-align s in a field of given display width."""
|
|
1518
|
+
return ' ' * max(0, width - _dw(s)) + str(s)
|
|
1519
|
+
|
|
1520
|
+
# Column definitions: (header, align, min_width)
|
|
1521
|
+
headers = ['模块', 'PID', '启动耗时', '进程启动时长', '类型']
|
|
1522
|
+
aligns = ['left', 'right', 'right', 'right', 'left'] # alignment per column
|
|
1523
|
+
|
|
1524
|
+
# Build data rows first to calculate column widths
|
|
1525
|
+
rows = []
|
|
1526
|
+
for name, info, rec in running_sorted:
|
|
1527
|
+
label = info.display_name or name
|
|
1528
|
+
ready_t = self._ready_times.get(name)
|
|
1529
|
+
time_str = f"{ready_t:.2f}s" if ready_t is not None else "—"
|
|
1530
|
+
if ready_t is not None and hasattr(self, '_start_unix'):
|
|
1531
|
+
elapsed_from_start = (rec.started_at + ready_t) - self._start_unix
|
|
1532
|
+
es_str = f"{elapsed_from_start:.2f}s"
|
|
1533
|
+
else:
|
|
1534
|
+
es_str = "—"
|
|
1535
|
+
|
|
1536
|
+
# Check if module timed out (ready_t >= 15s for kernel, >= timeout for others)
|
|
1537
|
+
is_timeout = False
|
|
1538
|
+
if ready_t is not None:
|
|
1539
|
+
if name == "kernel" and ready_t >= 15:
|
|
1540
|
+
is_timeout = True
|
|
1541
|
+
elif name != "kernel" and ready_t >= 15: # Default timeout for other modules
|
|
1542
|
+
is_timeout = True
|
|
1543
|
+
|
|
1544
|
+
rows.append([label, str(rec.pid), time_str, es_str, f"[{info.type}]", is_timeout])
|
|
1545
|
+
|
|
1546
|
+
# Calculate column widths: max of header and all data display widths
|
|
1547
|
+
col_widths = [_dw(h) for h in headers]
|
|
1548
|
+
for row in rows:
|
|
1549
|
+
for i, cell in enumerate(row[:5]): # Only first 5 columns (exclude is_timeout flag)
|
|
1550
|
+
col_widths[i] = max(col_widths[i], _dw(cell))
|
|
1551
|
+
|
|
1552
|
+
# Render header
|
|
1553
|
+
hdr_parts = []
|
|
1554
|
+
for i, h in enumerate(headers):
|
|
1555
|
+
if aligns[i] == 'left':
|
|
1556
|
+
hdr_parts.append(_rpad(h, col_widths[i]))
|
|
1557
|
+
else:
|
|
1558
|
+
hdr_parts.append(_lpad(h, col_widths[i]))
|
|
1559
|
+
lines.append(f"{DIM} {' '.join(hdr_parts)}{R}")
|
|
1560
|
+
|
|
1561
|
+
# Render data rows
|
|
1562
|
+
RED = "\033[91m"
|
|
1563
|
+
for row in rows:
|
|
1564
|
+
is_timeout = row[5] # Last element is the timeout flag
|
|
1565
|
+
parts = []
|
|
1566
|
+
for i, cell in enumerate(row[:5]): # Only first 5 columns
|
|
1567
|
+
if aligns[i] == 'left':
|
|
1568
|
+
parts.append(_rpad(cell, col_widths[i]))
|
|
1569
|
+
else:
|
|
1570
|
+
parts.append(_lpad(cell, col_widths[i]))
|
|
1571
|
+
if is_timeout:
|
|
1572
|
+
lines.append(f"{RED} ✓ {' '.join(parts)}{R}")
|
|
1573
|
+
else:
|
|
1574
|
+
lines.append(f"{G} ✓ {' '.join(parts)}{R}")
|
|
1575
|
+
|
|
1576
|
+
# Exited modules (started but already quit)
|
|
1577
|
+
if exited:
|
|
1578
|
+
lines.append(f"{Y} 已退出 ({len(exited)}):{R}")
|
|
1579
|
+
for name, info in exited:
|
|
1580
|
+
label = info.display_name or name
|
|
1581
|
+
reason = self._exit_reasons.get(name, "")
|
|
1582
|
+
reason_str = f": {reason}" if reason else ""
|
|
1583
|
+
lines.append(f"{Y} ↗ {label:<20s} (主动退出{reason_str}){R}")
|
|
1584
|
+
|
|
1585
|
+
# Stopped modules
|
|
1586
|
+
if stopped:
|
|
1587
|
+
lines.append(f"{G} 未启动 ({len(stopped)}):{R}")
|
|
1588
|
+
for name, info in stopped:
|
|
1589
|
+
label = info.display_name or name
|
|
1590
|
+
lines.append(f"{G} - {label:<20s} ({info.state}){R}")
|
|
1591
|
+
|
|
1592
|
+
lines.append(f"{G} Kernel WS: ws://127.0.0.1:{self.kernel_port}/ws 实例: {self.instance_id}{R}")
|
|
1593
|
+
|
|
1594
|
+
# Query Kernel for web module's api_endpoint via RPC
|
|
1595
|
+
web_url = ""
|
|
1596
|
+
if self._ws:
|
|
1597
|
+
try:
|
|
1598
|
+
resp = await self._rpc_call(self._ws, "registry.get", {"path": "web.api_endpoint"}, timeout=3)
|
|
1599
|
+
val = resp.get("result", {}).get("value")
|
|
1600
|
+
if val and isinstance(val, str):
|
|
1601
|
+
web_url = val.replace("://127.0.0.1:", "://localhost:")
|
|
1602
|
+
except Exception:
|
|
1603
|
+
pass
|
|
1604
|
+
if web_url:
|
|
1605
|
+
lines.append(f"{B} Web 管理后台: {web_url}{R}")
|
|
1606
|
+
|
|
1607
|
+
# Instance info
|
|
1608
|
+
instances = self.process_manager.get_alive_instances()
|
|
1609
|
+
inst_num = self.process_manager.instance_num
|
|
1610
|
+
suffix_display = self.process_manager.instance_suffix or "(无)"
|
|
1611
|
+
inst_dir = os.environ.get("KITE_INSTANCE_DIR", "")
|
|
1612
|
+
cwd = os.environ.get("KITE_CWD", "")
|
|
1613
|
+
debug_flag = " [DEBUG]" if os.environ.get("KITE_DEBUG") == "1" else ""
|
|
1614
|
+
lines.append(f"{G} 当前实例: #{inst_num} 后缀: {suffix_display} PID: {os.getpid()}{debug_flag}{R}")
|
|
1615
|
+
lines.append(f"{G} 实例目录: {inst_dir}{R}")
|
|
1616
|
+
lines.append(f"{G} 工作目录: {cwd}{R}")
|
|
1617
|
+
if len(instances) > 1:
|
|
1618
|
+
lines.append(f"{G} 所有实例:{R}")
|
|
1619
|
+
for i in instances:
|
|
1620
|
+
s = "" if i["num"] == 1 else f"~{i['num']}"
|
|
1621
|
+
debug_tag = " [DEBUG]" if i.get("debug", False) else ""
|
|
1622
|
+
current_tag = " (当前)" if i["is_self"] else ""
|
|
1623
|
+
lines.append(f"{G} #{i['num']} PID {i['launcher_pid']} "
|
|
1624
|
+
f"模块数 {i['module_count']} (processes{s}.json){debug_tag}{current_tag}{R}")
|
|
1625
|
+
|
|
1626
|
+
# Cross-directory instances from other projects
|
|
1627
|
+
if global_instances:
|
|
1628
|
+
my_inst_basename = os.path.basename(os.environ.get("KITE_INSTANCE_DIR", ""))
|
|
1629
|
+
other_instances = [i for i in global_instances
|
|
1630
|
+
if not i["is_self"] and i["instance_dir"] != my_inst_basename]
|
|
1631
|
+
if other_instances:
|
|
1632
|
+
lines.append(f"{G} 其他项目实例:{R}")
|
|
1633
|
+
for i in other_instances:
|
|
1634
|
+
debug_tag = " [DEBUG]" if i.get("debug", False) else ""
|
|
1635
|
+
cwd_display = f" {i['cwd']}" if i["cwd"] else ""
|
|
1636
|
+
lines.append(
|
|
1637
|
+
f"{G} {i['instance_dir']:<20s} "
|
|
1638
|
+
f"#{i['num']} PID {i['launcher_pid']} "
|
|
1639
|
+
f"模块数 {i['module_count']}"
|
|
1640
|
+
f"{cwd_display}{debug_tag}{R}"
|
|
1641
|
+
)
|
|
1642
|
+
|
|
1643
|
+
if cleaned_stats:
|
|
1644
|
+
total = sum(cleaned_stats.values())
|
|
1645
|
+
if len(cleaned_stats) == 1:
|
|
1646
|
+
inst, count = next(iter(cleaned_stats.items()))
|
|
1647
|
+
lines.append(f"{Y} 已清理残留进程: {inst} ({count} 个){R}")
|
|
1648
|
+
else:
|
|
1649
|
+
lines.append(f"{Y} 已清理残留进程 (共 {total} 个):{R}")
|
|
1650
|
+
for inst, count in cleaned_stats.items():
|
|
1651
|
+
lines.append(f"{Y} {inst}: {count} 个{R}")
|
|
1652
|
+
|
|
1653
|
+
lines.append(f"{B}{'=' * 60}{R}")
|
|
1654
|
+
lines.append("")
|
|
1655
|
+
|
|
1656
|
+
print("\n".join(lines))
|
|
1657
|
+
|
|
1658
|
+
# ── Utilities ──
|
|
1659
|
+
|
|
1660
|
+
def _load_discovery(self) -> dict | None:
|
|
1661
|
+
"""Read discovery config from launcher's own module.md."""
|
|
1662
|
+
md_path = os.path.join(os.environ["KITE_PROJECT"], "launcher", "module.md")
|
|
1663
|
+
try:
|
|
1664
|
+
with open(md_path, "r", encoding="utf-8") as f:
|
|
1665
|
+
fm = _parse_frontmatter(f.read())
|
|
1666
|
+
discovery = fm.get("discovery")
|
|
1667
|
+
if isinstance(discovery, dict) and discovery:
|
|
1668
|
+
return discovery
|
|
1669
|
+
except Exception as e:
|
|
1670
|
+
print(f"[launcher] 警告: 读取发现配置失败: {e}")
|
|
1671
|
+
return None
|
|
1672
|
+
|
|
1673
|
+
def _log_lifecycle(self, event: str, module: str, **extra):
|
|
1674
|
+
"""Append one JSONL line to lifecycle.jsonl."""
|
|
1675
|
+
from datetime import datetime, timezone
|
|
1676
|
+
record = {"ts": datetime.now(timezone.utc).isoformat(), "event": event, "module": module}
|
|
1677
|
+
record.update(extra)
|
|
1678
|
+
try:
|
|
1679
|
+
os.makedirs(os.path.dirname(self._lifecycle_log), exist_ok=True)
|
|
1680
|
+
with open(self._lifecycle_log, "a", encoding="utf-8") as f:
|
|
1681
|
+
f.write(json.dumps(record, ensure_ascii=False) + "\n")
|
|
1682
|
+
except Exception:
|
|
1683
|
+
pass
|
|
1684
|
+
|
|
1685
|
+
|
|
1686
|
+
|
|
1687
|
+
def _update_module_md_state(module_dir: str, new_state: str):
|
|
1688
|
+
"""Update the state field in a module's module.md frontmatter."""
|
|
1689
|
+
import re
|
|
1690
|
+
md_path = os.path.join(module_dir, "module.md")
|
|
1691
|
+
if not os.path.isfile(md_path):
|
|
1692
|
+
return
|
|
1693
|
+
|
|
1694
|
+
try:
|
|
1695
|
+
with open(md_path, "r", encoding="utf-8") as f:
|
|
1696
|
+
content = f.read()
|
|
1697
|
+
|
|
1698
|
+
updated = re.sub(
|
|
1699
|
+
r'^(state:\s*)(\S+)',
|
|
1700
|
+
rf'\g<1>{new_state}',
|
|
1701
|
+
content,
|
|
1702
|
+
count=1,
|
|
1703
|
+
flags=re.MULTILINE,
|
|
1704
|
+
)
|
|
1705
|
+
|
|
1706
|
+
with open(md_path, "w", encoding="utf-8") as f:
|
|
1707
|
+
f.write(updated)
|
|
1708
|
+
except Exception as e:
|
|
1709
|
+
print(f"[launcher] 警告: 更新 module.md 状态失败: {e}")
|
|
1710
|
+
|
|
1711
|
+
|
|
1712
|
+
def start_launcher():
    """Entry point called from main.py: prepare the environment, then run the launcher.

    Sets KITE_* environment defaults, parses the --debug flag, wires up
    timestamped logging and crash hooks, then blocks in Launcher.run()
    until shutdown. Exits with status 1 on an unhandled exception.
    """
    # Load .env (development convenience); dotenv is optional.
    try:
        from dotenv import load_dotenv
        load_dotenv()
    except ImportError:
        pass

    # Project root is two directory levels above this file.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # Base directory for persistent Kite data under the user's home.
    home = os.environ.get("HOME") or os.environ.get("USERPROFILE") or os.path.expanduser("~")
    kite_home = os.path.join(home, ".kite")

    # Fill KITE_* defaults without clobbering explicit non-empty settings.
    defaults = {
        "KITE_PROJECT": project_root,
        "KITE_CWD": os.getcwd(),
        "KITE_WORKSPACE": os.path.join(kite_home, "workspace"),
        "KITE_DATA": os.path.join(kite_home, "data"),
        "KITE_MODULES": os.path.join(kite_home, "modules"),
        "KITE_REPO": os.path.join(kite_home, "repo"),
        "KITE_ENV": "development",
    }
    for key, value in defaults.items():
        if not os.environ.get(key):
            os.environ[key] = value

    # --debug flag: record it in the environment, hide it from later parsers.
    if "--debug" in sys.argv:
        os.environ["KITE_DEBUG"] = "1"
        sys.argv.remove("--debug")

    # Logging must be wired up before anything prints with timestamps.
    from .logging_setup import (
        setup_timestamped_print,
        init_log_files,
        setup_exception_hooks,
        reset_time_baseline,
        write_crash_handled
    )
    setup_timestamped_print()
    reset_time_baseline()

    print("[launcher] Kite 启动中...")

    # Create the launcher with a fresh per-run auth token.
    launcher = Launcher(kite_token=secrets.token_hex(32))
    print("[launcher] 启动器实例已创建")

    # Initialize log files (KITE_MODULE_DATA is now set)
    init_log_files()
    setup_exception_hooks()

    log_dir = os.path.join(os.environ.get("KITE_MODULE_DATA", ""), "log")
    suffix = launcher.process_manager.instance_suffix
    latest_log = os.path.join(log_dir, f"latest{suffix}.log")
    print(f"[launcher] 日志: {latest_log}")

    try:
        launcher.run()
    except Exception as e:
        # Persist the crash record, then exit non-zero.
        write_crash_handled(type(e), e, e.__traceback__)
        sys.exit(1)
|