@agentunion/kite 1.0.7 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. package/CHANGELOG.md +208 -0
  2. package/README.md +48 -0
  3. package/cli.js +1 -1
  4. package/extensions/agents/__init__.py +1 -0
  5. package/extensions/agents/assistant/__init__.py +1 -0
  6. package/extensions/agents/assistant/entry.py +329 -0
  7. package/extensions/agents/assistant/module.md +22 -0
  8. package/extensions/agents/assistant/server.py +197 -0
  9. package/extensions/channels/__init__.py +1 -0
  10. package/extensions/channels/acp_channel/__init__.py +1 -0
  11. package/extensions/channels/acp_channel/entry.py +329 -0
  12. package/extensions/channels/acp_channel/module.md +22 -0
  13. package/extensions/channels/acp_channel/server.py +197 -0
  14. package/extensions/event_hub_bench/entry.py +624 -379
  15. package/extensions/event_hub_bench/module.md +2 -1
  16. package/extensions/services/backup/__init__.py +1 -0
  17. package/extensions/services/backup/entry.py +508 -0
  18. package/extensions/services/backup/module.md +22 -0
  19. package/extensions/services/model_service/__init__.py +1 -0
  20. package/extensions/services/model_service/entry.py +508 -0
  21. package/extensions/services/model_service/module.md +22 -0
  22. package/extensions/services/watchdog/entry.py +468 -102
  23. package/extensions/services/watchdog/module.md +3 -0
  24. package/extensions/services/watchdog/monitor.py +170 -69
  25. package/extensions/services/web/__init__.py +1 -0
  26. package/extensions/services/web/config.yaml +149 -0
  27. package/extensions/services/web/entry.py +390 -0
  28. package/extensions/services/web/module.md +24 -0
  29. package/extensions/services/web/routes/__init__.py +1 -0
  30. package/extensions/services/web/routes/routes_call.py +189 -0
  31. package/extensions/services/web/routes/routes_config.py +512 -0
  32. package/extensions/services/web/routes/routes_contacts.py +98 -0
  33. package/extensions/services/web/routes/routes_devlog.py +99 -0
  34. package/extensions/services/web/routes/routes_phone.py +81 -0
  35. package/extensions/services/web/routes/routes_sms.py +48 -0
  36. package/extensions/services/web/routes/routes_stats.py +17 -0
  37. package/extensions/services/web/routes/routes_voicechat.py +554 -0
  38. package/extensions/services/web/routes/schemas.py +216 -0
  39. package/extensions/services/web/server.py +375 -0
  40. package/extensions/services/web/static/css/style.css +1064 -0
  41. package/extensions/services/web/static/index.html +1445 -0
  42. package/extensions/services/web/static/js/app.js +4671 -0
  43. package/extensions/services/web/vendor/__init__.py +1 -0
  44. package/extensions/services/web/vendor/bluetooth/audio.py +348 -0
  45. package/extensions/services/web/vendor/bluetooth/contacts.py +251 -0
  46. package/extensions/services/web/vendor/bluetooth/manager.py +395 -0
  47. package/extensions/services/web/vendor/bluetooth/sms.py +290 -0
  48. package/extensions/services/web/vendor/bluetooth/telephony.py +274 -0
  49. package/extensions/services/web/vendor/config.py +139 -0
  50. package/extensions/services/web/vendor/conversation/asr.py +936 -0
  51. package/extensions/services/web/vendor/conversation/engine.py +548 -0
  52. package/extensions/services/web/vendor/conversation/llm.py +534 -0
  53. package/extensions/services/web/vendor/conversation/mcp_tools.py +190 -0
  54. package/extensions/services/web/vendor/conversation/tts.py +322 -0
  55. package/extensions/services/web/vendor/conversation/vad.py +138 -0
  56. package/extensions/services/web/vendor/storage/__init__.py +1 -0
  57. package/extensions/services/web/vendor/storage/identity.py +312 -0
  58. package/extensions/services/web/vendor/storage/store.py +507 -0
  59. package/extensions/services/web/vendor/task/manager.py +864 -0
  60. package/extensions/services/web/vendor/task/models.py +45 -0
  61. package/extensions/services/web/vendor/task/webhook.py +263 -0
  62. package/extensions/services/web/vendor/tools/registry.py +321 -0
  63. package/kernel/__init__.py +0 -0
  64. package/kernel/entry.py +407 -0
  65. package/{core/event_hub/hub.py → kernel/event_hub.py} +62 -74
  66. package/kernel/module.md +33 -0
  67. package/{core/registry/store.py → kernel/registry_store.py} +23 -8
  68. package/kernel/rpc_router.py +388 -0
  69. package/kernel/server.py +267 -0
  70. package/launcher/__init__.py +10 -0
  71. package/launcher/__main__.py +6 -0
  72. package/launcher/count_lines.py +258 -0
  73. package/launcher/entry.py +1778 -0
  74. package/launcher/logging_setup.py +289 -0
  75. package/{core/launcher → launcher}/module_scanner.py +11 -6
  76. package/launcher/process_manager.py +880 -0
  77. package/main.py +11 -210
  78. package/package.json +6 -9
  79. package/__init__.py +0 -1
  80. package/__main__.py +0 -15
  81. package/core/event_hub/BENCHMARK.md +0 -94
  82. package/core/event_hub/bench.py +0 -459
  83. package/core/event_hub/bench_extreme.py +0 -308
  84. package/core/event_hub/bench_perf.py +0 -350
  85. package/core/event_hub/entry.py +0 -157
  86. package/core/event_hub/module.md +0 -20
  87. package/core/event_hub/server.py +0 -206
  88. package/core/launcher/entry.py +0 -1158
  89. package/core/launcher/process_manager.py +0 -470
  90. package/core/registry/entry.py +0 -110
  91. package/core/registry/module.md +0 -30
  92. package/core/registry/server.py +0 -289
  93. package/extensions/services/watchdog/server.py +0 -167
  94. /package/{core → extensions/services/web/vendor/bluetooth}/__init__.py +0 -0
  95. /package/{core/event_hub → extensions/services/web/vendor/conversation}/__init__.py +0 -0
  96. /package/{core/launcher → extensions/services/web/vendor/task}/__init__.py +0 -0
  97. /package/{core/registry → extensions/services/web/vendor/tools}/__init__.py +0 -0
  98. /package/{core/event_hub → kernel}/dedup.py +0 -0
  99. /package/{core/event_hub → kernel}/router.py +0 -0
  100. /package/{core/launcher → launcher}/module.md +0 -0
@@ -0,0 +1,1778 @@
1
+ """
2
+ Launcher — the core of Kite. Manages module lifecycle, monitors processes.
3
+
4
+ Thread model:
5
+ - Main thread: asyncio event loop (process management + monitor loop)
6
+ - stdout threads: one daemon thread per child process (ProcessManager)
7
+ - (Windows) keyboard listener thread: polls for 'q' key
8
+
9
+ 2-Phase startup:
10
+ Phase 1: Start Kernel → wait port → connect WS → register self → module.ready
11
+ Phase 2: start remaining enabled modules in topo order (each connects to Kernel WS)
12
+ """
13
+
14
+ import asyncio
15
+ import json
16
+ import os
17
+ import secrets
18
+ import signal
19
+ import sys
20
+ import threading
21
+ import time
22
+ import uuid
23
+
24
+ import websockets
25
+
26
+ from .module_scanner import ModuleScanner, ModuleInfo, LaunchConfig, _parse_frontmatter
27
+ from .process_manager import ProcessManager
28
+
29
# Platform switch: Launcher.run() installs the Windows console handler when
# this is True and the POSIX signal handlers otherwise.
IS_WINDOWS = sys.platform == "win32"

# Shutdown timeout constants (seconds)
SHUTDOWN_TIMEOUT_NON_GRACEFUL = 5 # Non-graceful modules or no ack response
SHUTDOWN_TIMEOUT_PARTIAL = 3 # Graceful module ack'd but no ready
SHUTDOWN_TIMEOUT_READY = 1 # Graceful module sent ready (cleanup done)
SHUTDOWN_TIMEOUT_BULK = 3 # Bulk stop_all() safety net

# Core module names that are started in Phase 1 (not Phase 2)
CORE_MODULE_NAMES = {"kernel"}

# Name of the module started on its own in "Phase 1.5", between the Kernel
# and the remaining Phase 2 modules.
WATCHDOG_MODULE_NAME = "watchdog"
41
+
42
+
43
+ class Launcher:
44
+ """Kite system entry point. Starts Kernel, manages modules."""
45
+
46
def __init__(self, kite_token: str):
    """Build launcher state; no child process is started here.

    Side effects visible in this method: publishes KITE_INSTANCE,
    KITE_MODULE_DATA and KITE_INSTANCE_SUFFIX into ``os.environ`` (and
    KITE_INSTANCE_DIR through ``_resolve_instance_dir``), creates the
    ``launcher/state`` directory and truncates the lifecycle log.

    Args:
        kite_token: Token handed on to ProcessManager.
    """
    self.kite_token = kite_token
    self.instance_id = str(os.getpid())
    os.environ["KITE_INSTANCE"] = self.instance_id

    # The instance workspace has to exist before ProcessManager is built.
    self._resolve_instance_dir()
    launcher_data_dir = os.path.join(os.environ["KITE_INSTANCE_DIR"], "launcher")
    os.environ["KITE_MODULE_DATA"] = launcher_data_dir

    self.process_manager = ProcessManager(
        kite_token,
        self.instance_id,
        on_kite_message=self._on_kite_message,
    )
    self.module_scanner = ModuleScanner(discovery=self._load_discovery())

    # --- module bookkeeping ---
    self.kernel_port: int = 0
    self.modules: dict[str, ModuleInfo] = {}
    self._module_tokens: dict[str, str] = {}    # module_name -> per-module token
    self._desired_states: dict[str, str] = {}   # module_name -> "running" | "stopped"
    self._ready_times: dict[str, float] = {}    # module_name -> seconds from start to ready
    self._exit_reasons: dict[str, str] = {}     # module_name -> reason sent with module.exiting
    # Modules that declared graceful-shutdown support; Kernel defaults to True
    # because it starts before Watchdog can observe it.
    self._graceful_modules: dict[str, bool] = {"kernel": True}

    # --- shutdown coordination ---
    self._shutdown_event = asyncio.Event()
    self._thread_shutdown = threading.Event()
    self._shutdown_complete = threading.Event()  # set once a normal shutdown has finished
    self._shutdown_start_time: float = 0.0
    self._system_shutting_down = False           # blocks Watchdog restarts during shutdown

    # --- Kernel WebSocket client ---
    self._ws: object | None = None
    self._ws_task: asyncio.Task | None = None
    self._ws_connected: asyncio.Event | None = None  # created in _async_main
    self._loop: asyncio.AbstractEventLoop | None = None

    # --- JSON-RPC 2.0 and event plumbing ---
    self._rpc_waiters: dict[str, asyncio.Event] = {}   # rpc_id -> Event
    self._rpc_results: dict[str, dict] = {}            # rpc_id -> response dict
    self._event_waiters: dict[str, tuple[asyncio.Event, dict]] = {}
    # Filled by the ProcessManager stdout callback, i.e. from another thread.
    self._msg_waiters: dict[str, tuple[threading.Event, dict]] = {}

    # --- lifecycle log ---
    suffix = self.process_manager.instance_suffix
    state_dir = os.path.join(os.environ["KITE_INSTANCE_DIR"], "launcher", "state")
    os.makedirs(state_dir, exist_ok=True)
    self._lifecycle_log = os.path.join(state_dir, f"lifecycle{suffix}.jsonl")
    # Start every run with an empty log; failure to truncate is not fatal.
    try:
        with open(self._lifecycle_log, "w", encoding="utf-8"):
            pass
    except Exception:
        pass
    os.environ["KITE_INSTANCE_SUFFIX"] = suffix
119
+
120
+ @staticmethod
121
+ def _fmt_elapsed(seconds: float) -> str:
122
+ """Format elapsed seconds: <1s → 'NNNms', >=1s → 'N.Ns', >=10s → 'NNs'."""
123
+ if seconds < 1:
124
+ return f"{seconds * 1000:.0f}ms"
125
+ if seconds < 10:
126
+ return f"{seconds:.1f}s"
127
+ return f"{seconds:.0f}s"
128
+
129
+ # ── Instance workspace resolution ──
130
+
131
+ @staticmethod
132
+ def _resolve_instance_dir():
133
+ """Resolve KITE_INSTANCE_DIR from KITE_WORKSPACE + KITE_CWD.
134
+ Algorithm: take CWD basename, find matching dir in workspace via .cwd file,
135
+ or create new one. Sets KITE_INSTANCE_DIR env var.
136
+ """
137
+ if os.environ.get("KITE_INSTANCE_DIR"):
138
+ return # already set (e.g. by tests or parent)
139
+
140
+ cwd = os.environ.get("KITE_CWD", os.getcwd())
141
+ workspace = os.environ.get("KITE_WORKSPACE", "")
142
+ if not workspace:
143
+ home = os.environ.get("HOME") or os.environ.get("USERPROFILE") or os.path.expanduser("~")
144
+ workspace = os.path.join(home, ".kite", "workspace")
145
+ os.environ["KITE_WORKSPACE"] = workspace
146
+
147
+ basename = os.path.basename(cwd.rstrip(os.sep)) or "default"
148
+ suffix = 0
149
+
150
+ while True:
151
+ name = basename if suffix == 0 else f"{basename}~{suffix}"
152
+ candidate = os.path.join(workspace, name)
153
+ cwd_file = os.path.join(candidate, ".cwd")
154
+
155
+ if not os.path.exists(candidate):
156
+ # Empty slot — create new workspace
157
+ os.makedirs(candidate, exist_ok=True)
158
+ with open(cwd_file, "w", encoding="utf-8") as f:
159
+ f.write(cwd)
160
+ os.environ["KITE_INSTANCE_DIR"] = candidate
161
+ return
162
+
163
+ if os.path.isfile(cwd_file):
164
+ try:
165
+ with open(cwd_file, "r", encoding="utf-8") as f:
166
+ if f.read().strip() == cwd:
167
+ os.environ["KITE_INSTANCE_DIR"] = candidate
168
+ return
169
+ except Exception:
170
+ pass
171
+
172
+ suffix += 1
173
+
174
+ # ── Kite stdout message callback ──
175
+
176
+ def _on_kite_message(self, module_name: str, msg: dict):
177
+ """Called by ProcessManager stdout reader thread when a kite message is detected.
178
+ Thread-safe: only touches _msg_waiters (dict + threading.Event).
179
+ """
180
+ kite_type = msg.get("kite", "")
181
+ key = f"{module_name}:{kite_type}"
182
+ waiter = self._msg_waiters.get(key)
183
+ if waiter:
184
+ waiter[1].update(msg)
185
+ waiter[0].set()
186
+
187
+ async def _wait_kite_message(self, module_name: str, kite_type: str,
188
+ timeout: float) -> dict | None:
189
+ """Wait for a kite stdout message from a module. Returns msg dict or None on timeout.
190
+ Checks shutdown flag every 0.5s so Ctrl+C is responsive even during Phase 1-2 waits.
191
+ """
192
+ key = f"{module_name}:{kite_type}"
193
+ evt = threading.Event()
194
+ data = {}
195
+ self._msg_waiters[key] = (evt, data)
196
+ shutdown = self._thread_shutdown
197
+ try:
198
+ def _wait():
199
+ deadline = time.monotonic() + timeout
200
+ while time.monotonic() < deadline:
201
+ if evt.wait(timeout=0.5):
202
+ return True
203
+ if shutdown.is_set():
204
+ return False
205
+ return False
206
+ got = await asyncio.get_running_loop().run_in_executor(None, _wait)
207
+ return data if got else None
208
+ finally:
209
+ self._msg_waiters.pop(key, None)
210
+
211
+ # ── Public entry ──
212
+
213
def run(self):
    """Blocking entry point.

    Prints the KITE_* environment plus PID / interpreter / platform,
    installs the platform-specific exit hooks, then drives
    ``_async_main`` with ``asyncio.run``. ``_final_cleanup`` always runs
    afterwards. A RuntimeError is reported as a startup failure unless a
    shutdown had already been requested.
    """
    print("[launcher] ── 环境 ──")
    kite_vars = sorted(name for name in os.environ if name.startswith("KITE_"))
    for name in kite_vars:
        print(f"[launcher] {name} = {os.environ[name]}")
    for label, value in (
        ("PID", os.getpid()),
        ("PYTHON", sys.executable),
        ("PLATFORM", sys.platform),
    ):
        print(f"[launcher] {label} = {value}")

    install_exit_hooks = self._setup_windows_exit if IS_WINDOWS else self._setup_unix_signals
    install_exit_hooks()

    try:
        asyncio.run(self._async_main())
    except KeyboardInterrupt:
        pass
    except RuntimeError as e:
        # A user-requested shutdown is not a startup failure.
        if not self._thread_shutdown.is_set():
            print(f"[launcher] 启动失败: {e}")
    finally:
        self._final_cleanup()
237
+
238
+ def _request_shutdown(self, reason: str = ""):
239
+ """Request graceful shutdown. Thread-safe — can be called from signal handler or any thread."""
240
+ if self._thread_shutdown.is_set():
241
+ return # already shutting down
242
+ print(f"[launcher] {reason or '收到关闭请求'}")
243
+ self._shutdown_start_time = time.monotonic() # Record shutdown start time
244
+ self._thread_shutdown.set()
245
+ # Wake up asyncio event loop immediately (so _monitor_loop / wait_for exits)
246
+ loop = self._loop
247
+ if loop and not loop.is_closed():
248
+ try:
249
+ loop.call_soon_threadsafe(self._shutdown_event.set)
250
+ except RuntimeError:
251
+ pass
252
+ # Safety net: force exit after 10s only if normal shutdown hasn't completed
253
+ def _force():
254
+ if self._shutdown_complete.wait(timeout=10):
255
+ return # Normal shutdown completed — no need to force
256
+ try:
257
+ pm = self.process_manager
258
+ still = [n for n in pm._processes if pm.is_running(n)]
259
+ except Exception:
260
+ still = []
261
+ if still:
262
+ print(f"\033[91m[launcher] 关闭超时,以下模块仍在运行: {', '.join(still)},强制退出\033[0m")
263
+ else:
264
+ print("\033[91m[launcher] 关闭超时,强制退出\033[0m")
265
+ os._exit(1)
266
+ threading.Thread(target=_force, daemon=True).start()
267
+
268
+ def _setup_unix_signals(self):
269
+ """Register SIGTERM/SIGINT handlers on Linux/macOS."""
270
+ def _handler(signum, frame):
271
+ self._request_shutdown(f"收到信号 {signum},正在关闭...")
272
+ signal.signal(signal.SIGTERM, _handler)
273
+ signal.signal(signal.SIGINT, _handler)
274
+
275
def _setup_windows_exit(self):
    """Install the Windows exit hooks: a console control handler and a key poller.

    Why SetConsoleCtrlHandler instead of signal.signal(SIGINT):
    Python only delivers signals while the main thread is executing
    bytecode. When that thread is blocked in C code (asyncio
    ProactorEventLoop -> GetQueuedCompletionStatus), SIGINT never arrives.
    The console control handler is invoked on a separate OS thread, so it
    fires regardless of what the main thread is doing.

    The key poller is a daemon thread: 'q'/'Q' requests a graceful
    shutdown, ESC terminates the process immediately with status 0.
    """
    import ctypes

    handler_proto = ctypes.WINFUNCTYPE(ctypes.c_int, ctypes.c_uint)

    def _on_console_event(ctrl_type):
        # 0 = CTRL_C_EVENT, 1 = CTRL_BREAK_EVENT; everything else is left
        # to the default handling.
        if ctrl_type not in (0, 1):
            return 0
        self._request_shutdown("收到 Ctrl+C,正在关闭...")
        return 1  # handled: suppress the default action (process kill)

    native_handler = handler_proto(_on_console_event)
    # Keep a reference on self so the C callback is not garbage collected.
    self._ctrl_handler_ref = native_handler
    ctypes.windll.kernel32.SetConsoleCtrlHandler(native_handler, 1)

    def _poll_keys():
        import msvcrt
        while not self._thread_shutdown.is_set():
            key = msvcrt.getch() if msvcrt.kbhit() else None
            if key == b'\x1b':  # ESC: leave at once, no graceful path
                print("[launcher] ESC 强制退出")
                os._exit(0)
            if key in (b'q', b'Q'):  # graceful shutdown
                self._request_shutdown("收到退出请求,正在关闭...")
                return
            time.sleep(0.1)

    threading.Thread(target=_poll_keys, daemon=True).start()
312
+
313
+ # ── Async main (2-Phase startup) ──
314
+
315
async def _async_main(self):
    """Run the whole startup sequence, then the monitor loop.

    Order of work: validate core modules, clean leftovers of earlier
    instances (local synchronously, global in an executor), Phase 1
    (Kernel), derive desired states from module config, Phase 1.5
    (Watchdog, if it should run), Phase 2 (remaining modules), persist
    process records, print the startup report, publish ``system.ready``
    and enter ``_monitor_loop``. After each phase the method returns early
    if shutdown was requested. ``_graceful_shutdown_all`` runs in every
    case once Phase 1 has been attempted.
    """
    loop = asyncio.get_running_loop()
    self._loop = loop
    self._ws_connected = asyncio.Event()  # must be created inside the running loop
    t_start = time.monotonic()
    self._start_unix = time.time()
    phase_times = {}
    green, reset = "\033[32m", "\033[0m"

    def _shutdown_requested() -> bool:
        return self._shutdown_event.is_set()

    self._validate_core_modules()

    # Leftovers of this instance dir are cleaned now; other directories are
    # handled in the background and collected after Phase 2.
    local_cleaned = self.process_manager.cleanup_leftovers()
    self._global_cleanup_task = asyncio.ensure_future(
        loop.run_in_executor(None, self.process_manager.cleanup_global_leftovers)
    )

    try:
        # ── Phase 1: Kernel + WebSocket ──
        phase_began = time.monotonic()
        await self._phase1_start_kernel()
        elapsed_p1 = time.monotonic() - phase_began
        phase_times["Phase 1: Kernel"] = elapsed_p1
        print(f"{green}[launcher] ✓ Phase 1 完成: Kernel 已就绪 ({elapsed_p1:.2f}s){reset}")
        if _shutdown_requested():
            return

        # desired_state starts from the configured state: only "enabled"
        # modules should run; core modules are running already.
        for name, info in self.modules.items():
            self._desired_states[name] = "running" if info.state == "enabled" else "stopped"
        for core_name in CORE_MODULE_NAMES:
            self._desired_states[core_name] = "running"

        # ── Phase 1.5: Watchdog ──
        watchdog_info = self.modules.get(WATCHDOG_MODULE_NAME)
        if watchdog_info and self._desired_states.get(WATCHDOG_MODULE_NAME) == "running":
            phase_began = time.monotonic()
            print(f"[launcher] Phase 1.5: 启动 Watchdog...")
            await self._start_one_module(watchdog_info)
            elapsed = time.monotonic() - phase_began
            print(f"{green}[launcher] ✓ Phase 1.5 完成: Watchdog ({elapsed:.2f}s){reset}")
            if _shutdown_requested():
                return

        # ── Phase 2: everything else that should run ──
        phase_began = time.monotonic()
        await self._phase2_start_modules()
        elapsed = time.monotonic() - phase_began
        phase_times["Phase 2: Extensions"] = elapsed
        print(f"{green}[launcher] ✓ Phase 2 完成: 扩展模块已启动 ({elapsed:.2f}s){reset}")
        if _shutdown_requested():
            return

        # ── Post-startup ──
        self.process_manager.persist_records()

        # Give the background cleanup at most 5s to report back.
        global_cleaned = {}
        try:
            global_cleaned = await asyncio.wait_for(self._global_cleanup_task, timeout=5) or {}
        except asyncio.TimeoutError:
            print("[launcher] 警告: 全局遗留清理超时 (5s),跳过")
        except Exception as e:
            print(f"[launcher] 警告: 全局遗留清理出错: {e}")

        # Sum the per-key counters of both cleanup passes.
        cleaned_stats: dict[str, int] = {}
        for stats in (local_cleaned, global_cleaned):
            for key, count in stats.items():
                cleaned_stats[key] = cleaned_stats.get(key, 0) + count

        # The global instance scan runs in the executor to keep the loop free.
        global_instances = await loop.run_in_executor(
            None, self.process_manager.get_global_instances
        )

        # ── Startup report ──
        total_time = time.monotonic() - t_start
        await self._print_startup_report(
            total_time,
            phase_times,
            global_instances=global_instances,
            cleaned_stats=cleaned_stats,
        )
        # Tell every module that system startup is complete.
        await self._publish_event("system.ready", {
            "startup_time": round(total_time, 2),
        })

        print("[launcher] 进入监控循环 (按 Ctrl+C 或 'q' 优雅退出,ESC 强制退出)")
        await self._monitor_loop()
    finally:
        try:
            await self._graceful_shutdown_all()
        except Exception as e:
            print(f"[launcher] 优雅关闭出错: {e}")
415
+
416
+ # ── Phase 1: Start Kernel ──
417
+
418
async def _phase1_start_kernel(self):
    """Bring up the Kernel and attach the launcher to it.

    Steps:
      1. Start the Kernel subprocess (without boot_info) and persist records.
      2. Wait up to 6s for its ``port`` stdout message, which must carry
         both a port and the launcher token.
      3. Export the port as KITE_KERNEL_PORT.
      4. Concurrently: scan modules and generate their tokens, and open
         the Kernel WebSocket, then wait up to 15s for the Kernel's
         ``module.ready`` event.
      5. Unless shutdown was requested: log/publish ``started`` for the
         Kernel, close its stdio and make sure it is present in
         ``self.modules``.

    Raises:
        RuntimeError: the Kernel process could not be started, shutdown was
            requested while waiting, or no port/token arrived in time.
    """
    started_at = time.monotonic()
    pm = self.process_manager

    # ── Step 1: Kernel process ──
    kernel_info = ModuleInfo(
        name="kernel",
        display_name="Kernel",
        type="infrastructure",
        state="enabled",
        runtime="python",
        entry="entry.py",
        module_dir=os.path.join(os.environ["KITE_PROJECT"], "kernel"),
    )
    self._log_lifecycle("starting", "kernel")
    # The Kernel is the one module that gets no boot_info on stdin.
    if not pm.start_module(kernel_info, boot_info=None):
        self._log_lifecycle("start_failed", "kernel")
        raise RuntimeError("启动 Kernel 失败")

    print(f"[launcher] Kernel 进程已启动,等待 Kernel 端口...")

    # Persist right away so the freshly started process is on record.
    pm.persist_records()

    # ── Step 2: port + launcher token from Kernel stdout ──
    port_msg = await self._wait_kite_message("kernel", "port", timeout=6)
    if self._thread_shutdown.is_set():
        # The user asked to stop while we were still starting up.
        raise RuntimeError("启动被用户中断")
    if not (port_msg and port_msg.get("port") and port_msg.get("token")):
        raise RuntimeError("致命错误: Kernel 在 6s 内未报告端口和 token")
    self.kernel_port = int(port_msg["port"])
    self._module_tokens["launcher"] = port_msg["token"]
    waited = time.monotonic() - started_at
    print(f"[launcher] Kernel 端口: {self.kernel_port} (等待 {self._fmt_elapsed(waited)})")

    # ── Step 3: export the port (modules are not told yet) ──
    os.environ["KITE_KERNEL_PORT"] = str(self.kernel_port)

    # ── Step 4: scan + tokens, in parallel with the WebSocket hookup ──
    async def _scan_and_generate_tokens():
        scan_began = time.monotonic()
        self.modules = self.module_scanner.scan()
        for name, info in self.modules.items():
            self._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
        scan_took = time.monotonic() - scan_began
        print(f"[launcher] 发现 {len(self.modules)} 个模块: {', '.join(self.modules.keys()) or '(无)'} (扫描 {self._fmt_elapsed(scan_took)})")
        # Tokens come from a Kernel RPC.
        gen_began = time.monotonic()
        await self._generate_module_tokens()
        gen_took = time.monotonic() - gen_began
        print(f"[launcher] 令牌生成完成 ({self._fmt_elapsed(gen_took)})")

    async def _connect_kernel_ws():
        ws_began = time.monotonic()
        self._ws_task = asyncio.create_task(self._ws_loop())
        # _ws_connected is set once the session is subscribed and registered.
        try:
            await asyncio.wait_for(self._ws_connected.wait(), timeout=5)
        except asyncio.TimeoutError:
            print("[launcher] 警告: WebSocket 连接超时")
            return

        # Then wait for the Kernel's own module.ready event.
        ready = await self._wait_event("module.ready", "kernel", timeout=15)
        if ready:
            self._graceful_modules["kernel"] = bool(ready.get("graceful_shutdown"))
            print("[launcher] Kernel 已就绪")
        else:
            print("\033[91m[launcher] 警告: Kernel 在 15s 内未发送 module.ready\033[0m")
        self._ready_times["kernel"] = time.monotonic() - ws_began

    await asyncio.gather(
        _scan_and_generate_tokens(),
        _connect_kernel_ws(),
    )
    if self._shutdown_event.is_set():
        return

    # ── Step 5: announce the Kernel as started ──
    self._log_lifecycle("started", "kernel")
    await self._publish_event("module.started", {"module_id": "kernel"})
    pm.close_stdio("kernel")

    # Keep the synthetic kernel entry only if the scan did not find one.
    self.modules.setdefault("kernel", kernel_info)
516
+
517
+ # ── Phase 2: Start remaining modules ──
518
+
519
+ async def _phase2_start_modules(self):
520
+ """Start enabled modules (excluding core) in dependency order."""
521
+ to_start = [m for m in self.modules.values()
522
+ if self._desired_states.get(m.name) == "running"
523
+ and m.name not in CORE_MODULE_NAMES
524
+ and m.name != WATCHDOG_MODULE_NAME]
525
+ if not to_start:
526
+ print("[launcher] 没有额外模块需要启动")
527
+ return
528
+
529
+ # Auto-start manual modules if depended upon
530
+ needed = set(m.name for m in to_start)
531
+ for m in list(to_start):
532
+ for dep in m.depends_on:
533
+ if dep not in needed and dep not in CORE_MODULE_NAMES:
534
+ dep_info = self.modules.get(dep)
535
+ if dep_info and dep_info.state != "disabled":
536
+ needed.add(dep)
537
+ to_start.append(dep_info)
538
+ self._desired_states[dep] = "running"
539
+ print(f"[launcher] 自动启动 '{dep}' (被依赖)")
540
+ elif dep_info and dep_info.state == "disabled":
541
+ print(f"[launcher] 错误: '{m.name}' 依赖已禁用的模块 '{dep}'")
542
+
543
+ try:
544
+ layers = self._topo_layers(to_start)
545
+ except RuntimeError as e:
546
+ print(f"[launcher] 错误: {e}")
547
+ return
548
+
549
+ total = sum(len(layer) for layer in layers)
550
+ print(f"[launcher] 正在启动 {total} 个模块...")
551
+ for layer in layers:
552
+ if len(layer) == 1:
553
+ await self._start_one_module(layer[0])
554
+ else:
555
+ await asyncio.gather(*(self._start_one_module(info) for info in layer))
556
+
557
+ # ── Kernel WebSocket connection (JSON-RPC 2.0) ──
558
+
559
+ async def _ws_loop(self):
560
+ """Connect to Kernel, reconnect on failure."""
561
+ while not self._thread_shutdown.is_set():
562
+ try:
563
+ await self._ws_connect()
564
+ except asyncio.CancelledError:
565
+ return
566
+ except Exception as e:
567
+ if not self._system_shutting_down:
568
+ print(f"[launcher] Kernel 连接错误: {e}")
569
+ self._ws = None
570
+ await asyncio.sleep(5)
571
+
572
async def _ws_connect(self):
    """Run one WebSocket session against the Kernel (JSON-RPC 2.0).

    Connects with the launcher token, starts the receive loop as a
    background task, pre-registers the waiter for the Kernel's
    ``module.ready`` event, subscribes to all events, registers the
    launcher in the registry and then signals ``_ws_connected``. The
    coroutine afterwards blocks until the receive loop ends, i.e. until
    the connection is closed.

    The receiver task is cancelled whenever this coroutine leaves before
    the receiver has finished, whether through cancellation or through an
    error during subscribe/register, so no orphaned task outlives the session.
    """
    launcher_token = self._module_tokens.get("launcher", "")
    ws_url = f"ws://127.0.0.1:{self.kernel_port}/ws?token={launcher_token}&id=launcher"
    t_ws_connect = time.monotonic()
    async with websockets.connect(ws_url, open_timeout=3, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
        self._ws = ws
        _ws_s = time.monotonic() - t_ws_connect
        print(f"[launcher] 已连接到 Kernel ({self._fmt_elapsed(_ws_s)})")

        # The receive loop must exist before any RPC call is made: an RPC
        # waits for its response, and only the receiver can deliver it.
        receiver_task = asyncio.create_task(self._ws_receiver(ws))

        try:
            # Register the kernel module.ready waiter before subscribing,
            # so the event cannot slip through before anyone waits for it.
            ready_key = "module.ready:kernel"
            self._event_waiters[ready_key] = (asyncio.Event(), {})

            # Subscribe to all events
            await self._rpc_call(ws, "event.subscribe", {"events": [">"]})

            # Register Launcher itself in the Registry
            await self._rpc_call(ws, "registry.register", {
                "module_id": "launcher",
                "module_type": "infrastructure",
                "events_publish": {
                    "module.started": {},
                    "module.stopped": {},
                    "module.state_changed": {},
                },
                "events_subscribe": [">"],
            })
            print("[launcher] 已注册到 Kernel")

            # The session is usable only after subscription and registration.
            if self._ws_connected:
                self._ws_connected.set()

            # Block until the receiver finishes (connection closed).
            await receiver_task
        finally:
            # Covers CancelledError as well as any failure above.
            if not receiver_task.done():
                receiver_task.cancel()
619
+
620
+ async def _ws_receiver(self, ws):
621
+ """Receive loop: classify incoming messages."""
622
+ try:
623
+ async for raw in ws:
624
+ try:
625
+ msg = json.loads(raw)
626
+ except (json.JSONDecodeError, TypeError):
627
+ continue
628
+ try:
629
+ has_method = "method" in msg
630
+ has_id = "id" in msg
631
+ has_result = "result" in msg
632
+ has_error = "error" in msg
633
+
634
+ if has_method and not has_id:
635
+ # Event Notification (no id)
636
+ await self._handle_event_notification(msg)
637
+ elif has_method and has_id:
638
+ # Incoming RPC request (forwarded by Kernel)
639
+ await self._handle_rpc_request(ws, msg)
640
+ elif has_id and (has_result or has_error):
641
+ # RPC response (to our own call)
642
+ self._handle_rpc_response(msg)
643
+ except Exception as e:
644
+ print(f"[launcher] 消息处理异常(已忽略): {e}")
645
+ except asyncio.CancelledError:
646
+ pass
647
+
648
+ # ── JSON-RPC 2.0 infrastructure ──
649
+
650
+ async def _rpc_call(self, ws, method: str, params: dict = None, timeout: float = 5) -> dict:
651
+ """Send a JSON-RPC 2.0 request and await the response."""
652
+ rpc_id = str(uuid.uuid4())
653
+ msg = {"jsonrpc": "2.0", "id": rpc_id, "method": method}
654
+ if params:
655
+ msg["params"] = params
656
+
657
+ evt = asyncio.Event()
658
+ self._rpc_waiters[rpc_id] = evt
659
+ self._rpc_results[rpc_id] = {}
660
+
661
+ try:
662
+ await ws.send(json.dumps(msg))
663
+ await asyncio.wait_for(evt.wait(), timeout=timeout)
664
+ return self._rpc_results.get(rpc_id, {})
665
+ except asyncio.TimeoutError:
666
+ print(f"[launcher] RPC 超时: {method}")
667
+ return {"error": {"code": -32002, "message": f"RPC timeout: {method}"}}
668
+ finally:
669
+ self._rpc_waiters.pop(rpc_id, None)
670
+ self._rpc_results.pop(rpc_id, None)
671
+
672
+ def _handle_rpc_response(self, msg: dict):
673
+ """Match an incoming RPC response to a pending waiter."""
674
+ rpc_id = msg.get("id", "")
675
+ waiter = self._rpc_waiters.get(rpc_id)
676
+ if waiter:
677
+ self._rpc_results[rpc_id] = msg
678
+ waiter.set()
679
+
680
async def _handle_event_notification(self, msg: dict):
    """Handle an event notification (JSON-RPC 2.0 Notification with method='event').

    Steps, in order:
      1. Wake the waiter registered under ``"<event>:<module_id>"``, if any,
         after merging the event data into the waiter's dict.
      2. A ``module.exiting`` event also wakes the ``module.ready`` waiter of
         the same module and marks its data with ``_exited``.
      3. A ``module.crash`` event prints a red crash summary and the path of
         the crash log.
      4. Only ``module.*`` and ``watchdog.*`` events are echoed to the console.

    Malformed notifications (``params`` that is not an object, non-string
    ``event`` or ``timestamp``) are tolerated instead of raising.
    """
    params = msg.get("params")
    if not isinstance(params, dict):
        # JSON-RPC allows positional (list) params and peers may send null;
        # neither carries the named fields read below.
        params = {}
    source = params.get("source", "unknown")
    event = params.get("event", "")
    if not isinstance(event, str):
        event = ""
    raw_data = params.get("data")
    data = raw_data if isinstance(raw_data, dict) else {}
    ts = params.get("timestamp", "")
    if not isinstance(ts, str):
        ts = "" if ts is None else str(ts)

    # Trigger event waiters
    module_id = data.get("module_id", "")
    waiter = self._event_waiters.get(f"{event}:{module_id}")
    if waiter:
        waiter[1].update(data)
        waiter[0].set()

    # module.exiting also wakes module.ready waiter
    if event == "module.exiting" and module_id:
        ready_waiter = self._event_waiters.get(f"module.ready:{module_id}")
        if ready_waiter:
            ready_waiter[1].update(data)
            ready_waiter[1]["_exited"] = True
            ready_waiter[0].set()

    # module.crash → print red crash summary
    if event == "module.crash" and module_id:
        RED = "\033[91m"
        RESET = "\033[0m"
        exc_type = data.get("exception_type", "Unknown")
        preview = data.get("traceback_preview", "")
        print(f"[launcher] {RED}模块 '{module_id}' 崩溃: {exc_type} — {preview}{RESET}")
        _suffix = os.environ.get("KITE_INSTANCE_SUFFIX", "")
        crash_log = os.path.join(
            os.environ.get("KITE_INSTANCE_DIR", ""),
            module_id, "log", f"crashes{_suffix}.jsonl"
        )
        print(f"[launcher] 崩溃日志: {crash_log}")

    # Only log system events (module.*, watchdog.*) to avoid flooding
    if not event.startswith(("module.", "watchdog.")):
        return

    payload = json.dumps(data, ensure_ascii=False)
    if not ts:
        print(f"[{source}] {event}: {payload}")
        return

    latency_str = ""
    try:
        from datetime import datetime, timezone
        # datetime.fromisoformat() only accepts a trailing "Z" from
        # Python 3.11 on; normalise it so older interpreters parse it too.
        iso = ts[:-1] + "+00:00" if ts.endswith("Z") else ts
        sent = datetime.fromisoformat(iso)
        delay_ms = (datetime.now(timezone.utc) - sent).total_seconds() * 1000
        latency_str = f" ({delay_ms:.1f}ms)"
        local_ts = sent.astimezone().strftime("%H:%M:%S")
    except Exception:
        # Unparseable or naive timestamp: show the HH:MM:SS slice verbatim.
        local_ts = ts[11:19] if len(ts) >= 19 else ts
    print(f"[{source}] {local_ts} {event}{latency_str}: {payload}")
735
+
736
async def _handle_rpc_request(self, ws, msg: dict):
    """Dispatch an incoming RPC request to its handler and send the reply.

    Replies with ``result`` on success, error -32603 when the handler (or
    sending its result) raises, and error -32601 for an unknown method.
    """
    request_id = msg.get("id", "")
    method_name = msg.get("method", "")
    args = msg.get("params", {})

    def envelope(field, value):
        # Serialise a JSON-RPC 2.0 response carrying either result or error.
        return json.dumps({"jsonrpc": "2.0", "id": request_id, field: value})

    dispatch = {
        "list_modules": self._rpc_list_modules,
        "start_module": self._rpc_start_module,
        "stop_module": self._rpc_stop_module,
        "restart_module": self._rpc_restart_module,
        "rescan": self._rpc_rescan,
        "shutdown": self._rpc_shutdown,
    }
    target = dispatch.get(method_name)
    if not target:
        await ws.send(envelope(
            "error", {"code": -32601, "message": f"Method not found: {method_name}"},
        ))
        return

    try:
        outcome = await target(args)
        await ws.send(envelope("result", outcome))
    except Exception as exc:
        await ws.send(envelope("error", {"code": -32603, "message": str(exc)}))
765
+
766
+ # ── Launcher RPC method handlers ──
767
+
768
async def _rpc_list_modules(self, params: dict) -> dict:
    """Report every known module together with its configured and live state.

    ``params`` is accepted for handler-signature uniformity and is not read.
    """
    manager = self.process_manager

    def describe(name, info):
        alive = manager.is_running(name)
        record = manager.get_record(name)
        # A pid is only reported when the process is running AND has a record.
        live = alive and record
        return {
            "name": name,
            "display_name": info.display_name,
            "type": info.type,
            "config_state": info.state,
            "desired_state": self._desired_states.get(name, "stopped"),
            "actual_state": f"running({record.pid})" if live else "stopped",
            "pid": record.pid if live else None,
            "monitor": info.monitor,
        }

    return {"modules": [describe(name, info) for name, info in self.modules.items()]}
785
+
786
async def _rpc_start_module(self, params: dict) -> dict:
    """Start the module named in ``params["name"]``.

    A token is generated and registered first when the module has none yet.

    Raises:
        RuntimeError: unknown module, disabled module, or the process
            manager reported a failed start.
    """
    name = params.get("name", "")
    info = self.modules.get(name)
    if not info:
        raise RuntimeError(f"Module '{name}' not found")
    if info.state == "disabled":
        raise RuntimeError(f"Module '{name}' is disabled")

    tokens = self._module_tokens
    if name not in tokens:
        tokens[name] = secrets.token_hex(32)
        await self._register_new_tokens({name: tokens[name]})

    launched = self.process_manager.start_module(
        info, boot_info={"token": self._module_tokens[name]}
    )
    if not launched:
        self._log_lifecycle("start_failed", name, via="rpc")
        raise RuntimeError(f"Failed to start '{name}'")

    self._desired_states[name] = "running"
    self.process_manager.persist_records()
    record = self.process_manager.get_record(name)
    self._log_lifecycle("started", name, pid=record.pid if record else None, via="rpc")
    await self._publish_event("module.started", {"module_id": name})
    return {"status": "started", "name": name}
811
+
812
async def _rpc_stop_module(self, params: dict) -> dict:
    """Stop the module named in ``params["name"]`` via the graceful path.

    Raises:
        RuntimeError: the module name is unknown.
    """
    name = params.get("name", "")
    if not self.modules.get(name):
        raise RuntimeError(f"Module '{name}' not found")

    why = params.get("reason", "stop_requested")
    # Record the intent before stopping; the stop itself may take a while.
    self._desired_states[name] = "stopped"
    await self._graceful_stop(name, why)
    self.process_manager.persist_records()
    return {"status": "stopped", "name": name}
823
+
824
async def _rpc_restart_module(self, params: dict) -> dict:
    """Restart a module: graceful stop, fresh token, then start again.

    Raises:
        RuntimeError: unknown module, disabled module, or the process
            manager reported a failed start.
    """
    name = params.get("name", "")
    info = self.modules.get(name)
    if not info:
        raise RuntimeError(f"Module '{name}' not found")
    if info.state == "disabled":
        raise RuntimeError(f"Module '{name}' is disabled")

    await self._graceful_stop(name, params.get("reason", "restart"))

    # A restart always issues a new token, unlike a plain start.
    self._module_tokens[name] = secrets.token_hex(32)
    await self._register_new_tokens({name: self._module_tokens[name]})

    launched = self.process_manager.start_module(
        info, boot_info={"token": self._module_tokens[name]}
    )
    if not launched:
        self._log_lifecycle("start_failed", name, via="rpc_restart")
        raise RuntimeError(f"Failed to restart '{name}'")

    self._desired_states[name] = "running"
    self.process_manager.persist_records()
    record = self.process_manager.get_record(name)
    self._log_lifecycle("started", name, pid=record.pid if record else None, via="rpc_restart")
    await self._publish_event("module.started", {"module_id": name})
    return {"status": "restarted", "name": name}
848
+
849
async def _rpc_rescan(self, params: dict) -> dict:
    """Re-run the module scanner and report which modules appeared or vanished.

    Newly found modules get a lifecycle log entry, a desired state derived
    from their configured state, and a freshly registered token.
    ``params`` is not read.
    """
    before = set(self.modules.keys())
    self.modules = self.module_scanner.scan()
    after = set(self.modules.keys())

    added = list(after - before)
    removed = list(before - after)

    for name in added:
        info = self.modules[name]
        self._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
        wanted = "running" if info.state == "enabled" else "stopped"
        self._desired_states[name] = wanted

    if added:
        fresh = {name: secrets.token_hex(32) for name in added}
        self._module_tokens.update(fresh)
        await self._register_new_tokens(fresh)

    return {"added": added, "removed": removed, "total": len(self.modules)}
867
+
868
async def _rpc_shutdown(self, params: dict) -> dict:
    """Request a shutdown of the whole system and acknowledge immediately.

    The optional ``params["reason"]`` is forwarded to ``_request_shutdown``
    and echoed back in the reply.
    """
    why = params.get("reason", "rpc_request")
    self._request_shutdown(f"RPC shutdown request: {why}")
    return {"status": "shutting_down", "reason": why}
873
+
874
+ # ── Event publishing via RPC ──
875
+
876
async def _publish_event(self, event_type: str, data: dict):
    """Publish an event via RPC ``event.publish`` through the Kernel WS.

    The send is fire-and-forget: it runs in a background task and this
    coroutine returns without waiting for it. Send failures are printed,
    never raised. Nothing happens when no WebSocket is connected.

    Args:
        event_type: Event name placed in ``params.event``.
        data: Event payload placed in ``params.data``.
    """
    if not self._ws:
        return
    msg = json.dumps({
        "jsonrpc": "2.0",
        "id": str(uuid.uuid4()),
        "method": "event.publish",
        "params": {
            "event_id": str(uuid.uuid4()),
            "event": event_type,
            "data": data,
        },
    })

    async def _send():
        try:
            await self._ws.send(msg)
        except Exception as e:
            print(f"[launcher] 发布事件失败: {e}")

    # The event loop only keeps weak references to tasks, so an unreferenced
    # task may be garbage-collected before it runs. Hold a strong reference
    # until the task is done.
    pending = getattr(self, "_publish_tasks", None)
    if pending is None:
        pending = self._publish_tasks = set()
    task = asyncio.create_task(_send())
    pending.add(task)
    task.add_done_callback(pending.discard)
898
+
899
async def _wait_event(self, event_type: str, module_id: str, timeout: float) -> dict | None:
    """Block until ``event_type`` arrives for ``module_id`` or ``timeout`` elapses.

    Returns:
        The dict registered with the waiter (filled in by whoever fires it),
        or ``None`` on timeout. The waiter entry is removed either way.
    """
    slot = f"{event_type}:{module_id}"
    fired = asyncio.Event()
    payload = {}
    self._event_waiters[slot] = (fired, payload)
    try:
        await asyncio.wait_for(fired.wait(), timeout=timeout)
    except asyncio.TimeoutError:
        return None
    else:
        return payload
    finally:
        self._event_waiters.pop(slot, None)
912
+
913
async def _graceful_stop(self, name: str, reason: str = "stop_requested", timeout: float = 10,
                         ack_timeout: float = 3):
    """Graceful shutdown: check capability → send event → wait ack → wait ready → kill.

    Modules that did not declare graceful_shutdown in module.ready are
    terminated directly.

    Args:
        name: Module to stop.
        reason: Forwarded in the ``module.shutdown`` event and lifecycle log.
        timeout: Upper bound, in seconds, for the module's cleanup phase.
        ack_timeout: Seconds to wait for ``module.shutdown.ack``.

    Every path logs a "stopped" lifecycle entry and publishes
    ``module.stopped`` once the process manager has stopped the module.
    """
    self._log_lifecycle("stopping", name, reason=reason)

    if not self._graceful_modules.get(name):
        self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
        self._log_lifecycle("stopped", name, reason=reason)
        await self._publish_event("module.stopped", {
            "module_id": name,
            "graceful_shutdown": False,
        })
        return

    # Register waiters BEFORE sending shutdown event
    ack_key = f"module.shutdown.ack:{name}"
    ack_evt = asyncio.Event()
    ack_data = {}
    self._event_waiters[ack_key] = (ack_evt, ack_data)

    ready_key = f"module.shutdown.ready:{name}"
    ready_evt = asyncio.Event()
    ready_data = {}
    self._event_waiters[ready_key] = (ready_evt, ready_data)

    ack = None
    ready = None
    try:
        await self._publish_event("module.shutdown", {
            "module_id": name, "reason": reason, "timeout": timeout,
        })

        # Wait for ack
        try:
            await asyncio.wait_for(ack_evt.wait(), timeout=ack_timeout)
            ack = ack_data
        except asyncio.TimeoutError:
            ack = None

        if ack:
            estimated = ack.get("estimated_cleanup", timeout)
            # The estimate comes from the module; ignore anything that is
            # not a real number instead of failing inside min().
            if isinstance(estimated, bool) or not isinstance(estimated, (int, float)):
                estimated = timeout
            estimated = min(estimated, timeout)

            # Wait for ready
            try:
                await asyncio.wait_for(ready_evt.wait(), timeout=estimated)
                ready = ready_data
            except asyncio.TimeoutError:
                ready = None
    finally:
        # Remove both waiters on every exit path, including cancellation.
        self._event_waiters.pop(ack_key, None)
        self._event_waiters.pop(ready_key, None)

    if not ack:
        kill_timeout = SHUTDOWN_TIMEOUT_NON_GRACEFUL
    elif ready:
        kill_timeout = SHUTDOWN_TIMEOUT_READY
    else:
        kill_timeout = SHUTDOWN_TIMEOUT_PARTIAL
    self.process_manager.stop_module(name, timeout=kill_timeout)

    self._log_lifecycle("stopped", name, reason=reason)
    await self._publish_event("module.stopped", {
        "module_id": name,
        "graceful_shutdown": self._graceful_modules.get(name, False),
    })
981
+
982
async def _graceful_shutdown_all(self):
    """Shut down all modules. Order:
    1. Send shutdown to graceful modules (excl. Kernel) — let them start cleanup
    2. Terminate non-graceful modules (fast, runs during graceful cleanup)
    3. Wait for graceful modules to exit (process monitoring)
    4. Shut down Kernel last (keeps event routing alive throughout)

    Sets ``self._system_shutting_down`` to True and does not reset it.
    """
    self._system_shutting_down = True
    running = [n for n in self.modules if self.process_manager.is_running(n)]
    # Also check core modules
    for cn in CORE_MODULE_NAMES:
        if self.process_manager.is_running(cn) and cn not in running:
            running.append(cn)
    if not running:
        print("[launcher] 没有运行中的模块需要关闭")
        return

    # Split by the capability recorded in self._graceful_modules.
    graceful = [n for n in running if self._graceful_modules.get(n)]
    non_graceful = [n for n in running if not self._graceful_modules.get(n)]

    # Defer Kernel — it must stay alive to route shutdown events
    kernel_deferred = "kernel" in graceful
    graceful_batch = [n for n in graceful if n != "kernel"] if kernel_deferred else graceful

    print(f"[launcher] 正在关闭 {len(running)} 个模块: {', '.join(running)}")

    # Phase 1: Notify graceful modules first (they start cleanup immediately)
    for name in graceful_batch:
        self._log_lifecycle("stopping", name, reason="system_shutdown")
        await self._publish_event("module.shutdown", {
            "module_id": name, "reason": "system_shutdown", "timeout": 5,
        })

    # Phase 2: While graceful modules are cleaning up, terminate non-graceful ones
    if non_graceful:
        print(f"[launcher] 直接终止 {len(non_graceful)} 个不支持优雅退出的模块: {', '.join(non_graceful)}")
        for name in non_graceful:
            self._log_lifecycle("stopping", name, reason="system_shutdown")
            self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
            self._log_lifecycle("stopped", name, reason="system_shutdown")

    # Phase 3: Wait for graceful modules to exit (process monitoring)
    if graceful_batch:
        # Poll once per second for up to 5 seconds (matches the "timeout": 5
        # sent in the module.shutdown event above).
        deadline = time.time() + 5
        while time.time() < deadline:
            still_running = [n for n in graceful_batch if self.process_manager.is_running(n)]
            if not still_running:
                print("[launcher] 所有优雅退出模块已自行退出")
                break
            remaining = max(0, deadline - time.time())
            print(f"[launcher] 等待 {len(still_running)} 个模块退出 ({remaining:.0f}s): {', '.join(still_running)}")
            await asyncio.sleep(1)
        # Force kill survivors
        for name in graceful_batch:
            if self.process_manager.is_running(name):
                self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
            self._log_lifecycle("stopped", name, reason="system_shutdown")

    # Phase 4: All other modules exited — now shut down Kernel
    if kernel_deferred and self.process_manager.is_running("kernel"):
        self._log_lifecycle("stopping", "kernel", reason="system_shutdown")
        print("[launcher] 正在关闭 Kernel...")

        # Call kernel.shutdown RPC (not event)
        rpc_sent = False
        try:
            if self._ws:
                await self._rpc_call(self._ws, "kernel.shutdown", {})
                print("[launcher] Kernel shutdown RPC 已发送")
                rpc_sent = True
            else:
                print("[launcher] WebSocket 未连接,跳过 RPC 调用")
        except Exception as e:
            print(f"[launcher] Kernel shutdown RPC 失败: {e}")

        # Wait for kernel to exit
        if rpc_sent:
            # RPC sent: wait up to 5s for graceful exit
            proc = self.process_manager._processes.get("kernel")
            if proc:
                try:
                    # proc.wait is run in the default executor so the event
                    # loop is not blocked while waiting for the process.
                    loop = asyncio.get_event_loop()
                    await asyncio.wait_for(
                        loop.run_in_executor(None, proc.wait),
                        timeout=5
                    )
                    print("[launcher] Kernel 已退出")
                except asyncio.TimeoutError:
                    print("[launcher] Kernel 5秒内未退出,强制停止")
                    self.process_manager.stop_module("kernel", timeout=SHUTDOWN_TIMEOUT_PARTIAL)
        else:
            # No RPC (WS not connected): use shorter timeout for terminate
            self.process_manager.stop_module("kernel", timeout=2)

        self._log_lifecycle("stopped", "kernel", reason="system_shutdown")

    # Final safety net
    try:
        self.process_manager.stop_all(timeout=SHUTDOWN_TIMEOUT_BULK)
    except Exception as e:
        print(f"[launcher] stop_all 出错: {e}")
1083
+
1084
+ # ── Module startup ──
1085
+
1086
def _topo_sort(self, modules: list[ModuleInfo]) -> list[ModuleInfo]:
    """Order ``modules`` so that each one comes after the modules it depends on.

    Dependencies that are not part of ``modules`` are traversed but do not
    appear in the result.

    Raises:
        RuntimeError: a dependency cycle was found.
    """
    by_name = {mod.name: mod for mod in modules}
    finished = set()
    active = set()  # names on the current depth-first path
    ordered = []

    def walk(node):
        if node in active:
            raise RuntimeError(f"Circular dependency detected involving '{node}'")
        if node in finished:
            return
        mod = by_name.get(node)
        active.add(node)
        for dep in (mod.depends_on if mod else ()):
            walk(dep)
        active.discard(node)
        finished.add(node)
        if mod:
            ordered.append(mod)

    for mod in modules:
        walk(mod.name)
    return ordered
1111
+
1112
def _topo_layers(self, modules: list[ModuleInfo]) -> list[list[ModuleInfo]]:
    """Topological sort into layers. Modules in the same layer have no
    inter-dependencies and can be started in parallel.

    A module's layer index is the length of its longest dependency chain
    within ``modules``; dependencies on names outside ``modules`` are
    ignored. Inside a layer, modules keep the order they have in
    ``modules``, so the result is the same on every run.

    Returns:
        One list per depth, starting at depth 0. An empty input yields a
        single empty layer.

    Raises:
        RuntimeError: a dependency cycle was found.
    """
    name_map = {m.name: m for m in modules}

    # Compute depth (longest path from root) for each module
    depth: dict[str, int] = {}
    in_stack: set[str] = set()

    def get_depth(name: str) -> int:
        if name in depth:
            return depth[name]
        if name in in_stack:
            raise RuntimeError(f"Circular dependency detected involving '{name}'")
        in_stack.add(name)
        d = 0
        for dep in name_map[name].depends_on:
            if dep in name_map:
                d = max(d, get_depth(dep) + 1)
        in_stack.remove(name)
        depth[name] = d
        return d

    # Iterate the dict, not a set of its keys: set order for strings changes
    # between interpreter runs because of hash randomisation.
    for name in name_map:
        get_depth(name)

    # Group by depth, preserving input order within each layer
    max_depth = max(depth.values(), default=0)
    layers: list[list[ModuleInfo]] = [[] for _ in range(max_depth + 1)]
    for name, info in name_map.items():
        layers[depth[name]].append(info)
    return layers
1147
+
1148
async def _start_one_module(self, info: ModuleInfo):
    """Start a single module: publish starting → start process → send kernel_port → wait ready → started → close stdio.

    Returns early, after logging "start_failed", when the process manager
    cannot start the module. Otherwise the "started" lifecycle entry and the
    ``module.started`` event are emitted whether the module became ready,
    announced its own exit, or timed out.
    """
    self._log_lifecycle("starting", info.name)
    await self._publish_event("module.starting", {"module_id": info.name})

    # An empty token is passed when none is recorded for this module.
    token = self._module_tokens.get(info.name, "")
    boot_info = {"token": token}
    t0 = time.monotonic()
    ok = self.process_manager.start_module(info, boot_info=boot_info)
    if not ok:
        self._log_lifecycle("start_failed", info.name)
        return

    # Register waiter BEFORE sending kernel_port
    # This prevents race condition where module connects and sends module.ready before waiter is registered
    ready_key = f"module.ready:{info.name}"
    ready_evt = asyncio.Event()
    ready_data = {}
    self._event_waiters[ready_key] = (ready_evt, ready_data)

    # Send kernel_port via stdin so module can connect to Kernel WS
    self.process_manager.write_stdin(info.name, {
        "kite": "kernel_port",
        "kernel_port": self.kernel_port,
    })

    # Persist immediately after starting to ensure PID is recorded
    self.process_manager.persist_records()

    # Wait for module.ready or module.exiting (whichever comes first)
    timeout = info.launch.timeout
    try:
        await asyncio.wait_for(ready_evt.wait(), timeout=timeout)
        ready = ready_data
    except asyncio.TimeoutError:
        ready = None
    finally:
        self._event_waiters.pop(ready_key, None)

    elapsed = time.monotonic() - t0
    if ready and ready.get("_exited"):
        # Module sent module.exiting before ready — it chose to quit
        reason = ready.get("reason", "unknown")
        self._exit_reasons[info.name] = reason
        print(f"[launcher] 模块 '{info.name}' 主动退出: {reason} ({elapsed:.2f}s)")
    elif ready:
        # Remember whether the module declared graceful-shutdown support
        # and how long it took to become ready.
        self._graceful_modules[info.name] = bool(ready.get("graceful_shutdown"))
        self._ready_times[info.name] = elapsed
        print(f"[launcher] 模块 '{info.name}' 已就绪 ({elapsed:.2f}s)")
    else:
        # Timed out: warn in red and carry on.
        print(f"\033[91m[launcher] 警告: '{info.name}' 在 {timeout}s 内未发送 module.ready\033[0m")

    rec = self.process_manager.get_record(info.name)
    self._log_lifecycle("started", info.name, pid=rec.pid if rec else None)
    await self._publish_event("module.started", {"module_id": info.name})
    self.process_manager.close_stdio(info.name)
1204
+
1205
async def _register_module_tokens(self):
    """Placeholder with no behaviour: the body holds only this docstring.

    NOTE(review): the original docstring said this generates per-module
    tokens and registers them with Kernel via RPC, but no such code is
    present here. It looks like a leftover from a refactor; confirm whether
    callers should use ``_generate_module_tokens`` instead.
    """
    # Include all scanned modules
1208
async def _generate_module_tokens(self):
    """Ask Kernel (RPC ``kernel.generate_tokens``) for tokens of token-less modules.

    Does nothing when every scanned module already has a token. Gives up
    with a printed warning when the WebSocket is not ready within 5 seconds
    or the readiness event was never created. RPC failures are printed,
    never raised.
    """
    missing = [name for name in self.modules if name not in self._module_tokens]
    if not missing:
        return

    # Wait for WebSocket connection to be ready
    if not self._ws_connected:
        print(f"[launcher] 警告: _ws_connected 未初始化")
        return
    try:
        await asyncio.wait_for(self._ws_connected.wait(), timeout=5)
    except asyncio.TimeoutError:
        print(f"[launcher] 警告: WebSocket 未就绪,无法生成令牌")
        return

    # Call Kernel RPC to generate tokens
    try:
        reply = await self._rpc_call(self._ws, "kernel.generate_tokens", {"modules": missing})
        if reply.get("result", {}).get("ok"):
            issued = reply["result"].get("tokens", {})
            self._module_tokens.update(issued)
            print(f"[launcher] Kernel 已生成 {len(issued)} 个模块令牌")
        elif "error" in reply:
            print(f"[launcher] 警告: 令牌生成失败: {reply['error'].get('message', '')}")
    except Exception as e:
        print(f"[launcher] 警告: 生成模块令牌失败: {e}")
1238
+
1239
async def _register_new_tokens(self, tokens: dict):
    """Send a name→token mapping to Kernel through RPC ``kernel.register_tokens``.

    Skipped when there is no WebSocket or ``tokens`` is empty. The outcome
    is only printed; failures never propagate to the caller.
    """
    if not (self._ws and tokens):
        return
    try:
        reply = await self._rpc_call(self._ws, "kernel.register_tokens", tokens)
        accepted = reply.get("result", {}).get("ok")
        if accepted:
            print(f"[launcher] 已注册 {len(tokens)} 个模块令牌")
        elif "error" in reply:
            print(f"[launcher] 警告: 令牌注册失败: {reply['error'].get('message', '')}")
    except Exception as e:
        print(f"[launcher] 警告: 注册模块令牌失败: {e}")
1251
+
1252
+ # ── Validation ──
1253
+
1254
def _validate_core_modules(self):
    """Abort the process unless the ``kernel`` core module looks usable.

    Checks, under ``$KITE_PROJECT/kernel``: the directory exists,
    ``module.md`` exists, and its frontmatter parses to a truthy value.
    Each failure prints a fatal message and exits with status 1.
    """
    def fatal(message):
        print(message)
        sys.exit(1)

    kernel_dir = os.path.join(os.environ["KITE_PROJECT"], "kernel")
    manifest = os.path.join(kernel_dir, "module.md")

    if not os.path.isdir(kernel_dir):
        fatal(f"[launcher] 致命: 核心模块 'kernel' 目录未找到: {kernel_dir}")
    if not os.path.isfile(manifest):
        fatal(f"[launcher] 致命: 核心模块 'kernel' 缺少 module.md: {manifest}")

    try:
        with open(manifest, "r", encoding="utf-8") as handle:
            front = _parse_frontmatter(handle.read())
        if not front:
            # SystemExit is not an Exception subclass, so the handler below
            # does not intercept this exit.
            fatal(f"[launcher] 致命: 核心模块 'kernel' module.md 没有有效的 frontmatter")
    except Exception as e:
        fatal(f"[launcher] 致命: 核心模块 'kernel' module.md 解析错误: {e}")
1274
+
1275
+ # ── Module crash summary ──
1276
+
1277
def _print_module_crash_summary(self, name: str):
    """Read module's crashes.jsonl last record and print red summary to console.
    Complement to module.crash event — reliable even if event was never sent.

    Only the last 4096 bytes of the log are read. The summary is
    best-effort: a missing, empty or unparseable log prints nothing.
    """
    RED = "\033[91m"
    RESET = "\033[0m"
    _suffix = os.environ.get("KITE_INSTANCE_SUFFIX", "")
    crash_log = os.path.join(
        os.environ.get("KITE_INSTANCE_DIR", ""), name, "log", f"crashes{_suffix}.jsonl"
    )
    if not os.path.isfile(crash_log):
        return
    try:
        with open(crash_log, "rb") as f:
            f.seek(0, os.SEEK_END)
            size = f.tell()
            if size == 0:
                return
            f.seek(max(0, size - 4096))
            tail = f.read()
        # The seek can land in the middle of a multi-byte UTF-8 character.
        # Decode leniently: a strict decode would raise and the whole
        # summary would be lost, although only the first (partial) line is
        # affected and just the last line is used.
        lines = tail.decode("utf-8", errors="replace").strip().split("\n")
        last = json.loads(lines[-1])
        exc_type = last.get("exception_type", "Unknown")
        ctx = last.get("context", {})
        file_name = ctx.get("file", "unknown")
        line_no = ctx.get("line", "?")
        print(f"[launcher] {RED}崩溃: "
              f"{exc_type} in {file_name}:{line_no}{RESET}")
        print(f"[launcher] 崩溃日志: {crash_log}")
    except Exception:
        # Deliberately best-effort: the summary is a console convenience and
        # must never disturb the caller.
        pass
1306
+
1307
+ # ── Monitor loop ──
1308
+
1309
async def _monitor_loop(self):
    """Check child processes every second. Handle crashes.
    Uses _shutdown_event (asyncio.Event) so Ctrl+C wakes us immediately.

    Responsibility split:
    - Core module crash → full restart (Launcher handles)
    - Watchdog crash → Launcher restarts directly (up to 3 times)
    - Other module exit → publish module.stopped event only; Watchdog decides restart
    """
    WATCHDOG_MAX_FAIL = 3
    # Counts Watchdog exits seen by this invocation of the loop; it is never
    # reset here.
    watchdog_fail_count = 0

    while not self._shutdown_event.is_set():
        exited = self.process_manager.check_exited()

        for name, rc in exited:
            print(f"[launcher] 模块 '{name}' 退出,返回码 {rc}")
            if rc != 0:
                self._print_module_crash_summary(name)
            self._log_lifecycle("exited", name, exit_code=rc)
            await self._publish_event("module.stopped", {
                "module_id": name, "exit_code": rc,
                "graceful_shutdown": self._graceful_modules.get(name, False),
            })
            info = self.modules.get(name)

            # 1) Core module crash → full restart
            if name in CORE_MODULE_NAMES or (info and info.is_core()):
                print(f"[launcher] 严重: 核心模块 '{name}' 崩溃,正在全部重启...")
                self._log_lifecycle("core_crash", name, exit_code=rc)
                # _full_restart re-enters _monitor_loop itself on success, so
                # this invocation ends here.
                await self._full_restart()
                return

            # 2) Watchdog crash → Launcher restarts directly
            if name == WATCHDOG_MODULE_NAME:
                if self._system_shutting_down:
                    print(f"[launcher] Watchdog 退出(系统关闭中),跳过重启")
                    continue
                watchdog_fail_count += 1
                if watchdog_fail_count <= WATCHDOG_MAX_FAIL and info:
                    print(f"[launcher] Watchdog 崩溃,正在重启 (第 {watchdog_fail_count}/{WATCHDOG_MAX_FAIL} 次)...")
                    await self._start_one_module(info)
                else:
                    self._desired_states[name] = "stopped"
                    self._log_lifecycle("failed", name, reason=f"exceeded {WATCHDOG_MAX_FAIL} retries")
                    print(f"[launcher] Watchdog 失败 {WATCHDOG_MAX_FAIL} 次,已放弃")
                continue

            # 3) Other modules → event already published above; Watchdog decides restart
            # (no restart logic here — Watchdog handles it via module.stopped event)

        if exited:
            self.process_manager.persist_records()

        # Wait 1s but wake immediately on shutdown signal
        try:
            await asyncio.wait_for(self._shutdown_event.wait(), timeout=1)
            return  # shutdown requested
        except asyncio.TimeoutError:
            pass
1369
+
1370
async def _full_restart(self):
    """Stop all modules, regenerate tokens, re-run Phase 1-2.

    On success this resumes ``_monitor_loop`` and only returns when that
    loop ends. A failure while restarting is printed and swallowed.
    """
    print("[launcher] 全量重启: 正在停止所有模块...")

    # Persist records before shutdown so cleanup_leftovers can find survivors
    self.process_manager.persist_records()

    # Disconnect Kernel WS
    if self._ws_task:
        self._ws_task.cancel()
        self._ws_task = None
    self._ws = None
    self._rpc_waiters.clear()
    self._rpc_results.clear()

    await self._graceful_shutdown_all()

    # Cleanup any leftover processes that survived graceful shutdown.
    self.process_manager.cleanup_leftovers()

    self._module_tokens.clear()

    # _graceful_shutdown_all() raises this flag and nothing in it lowers it
    # again. The system is about to come back up, so clear it; otherwise the
    # resumed monitor loop treats every later Watchdog exit as part of a
    # shutdown and never restarts it.
    self._system_shutting_down = False

    # Regenerate kite_token
    self.kite_token = secrets.token_hex(32)
    self.process_manager.kite_token = self.kite_token

    print("[launcher] 全量重启: 重新执行 Phase 1-2...")
    try:
        await self._phase1_start_kernel()
        await self._phase2_start_modules()
        self.process_manager.persist_records()
        print("[launcher] 全量重启完成,恢复监控循环")
        await self._monitor_loop()
    except Exception as e:
        print(f"[launcher] 全量重启失败: {e}")
1405
+
1406
+ # ── Shutdown ──
1407
+
1408
def _final_cleanup(self):
    """Called on exit — stop all processes, clear records.

    Errors during cleanup are printed, not raised. On Windows the process
    is terminated with ``os._exit(0)`` at the end.
    """
    try:
        print("[launcher] 正在执行最终清理...")

        if self._ws_task:
            self._ws_task.cancel()

        # Note: _graceful_shutdown_all() already called stop_all() in _async_main finally block.
        # This is just a safety check — should normally find nothing.
        remaining = [n for n in self.process_manager._processes
                     if self.process_manager.is_running(n)]
        if remaining:
            print(f"[launcher] 警告: 仍有残留进程 (不应出现): {', '.join(remaining)}")
            self.process_manager.stop_all(timeout=SHUTDOWN_TIMEOUT_BULK)
        else:
            print("[launcher] 无残留进程")

        # Clear instance runtime files
        try:
            os.remove(self.process_manager.records_path)
        except OSError:
            # A records file that is missing or cannot be removed is ignored.
            pass
    except Exception as e:
        print(f"[launcher] 最终清理出错: {e}")
    finally:
        # Signal the safety-net thread that normal shutdown has completed
        self._shutdown_complete.set()

    # Calculate and display shutdown time
    if self._shutdown_start_time > 0:
        shutdown_elapsed = time.monotonic() - self._shutdown_start_time
        print(f"[launcher] 再见。(退出耗时: {shutdown_elapsed:.2f}s)")
    else:
        print("[launcher] 再见。")

    if IS_WINDOWS:
        # NOTE(review): os._exit skips normal interpreter shutdown; presumably
        # deliberate on Windows — confirm the reason with the author.
        os._exit(0)
1446
+
1447
+ # ── Startup report ──
1448
+
1449
+ async def _print_startup_report(self, total_time: float, phase_times: dict[str, float], *,
1450
+ global_instances=None, cleaned_stats: dict[str, int] | None = None):
1451
+ """Print a green startup summary with module list and timing."""
1452
+ G = "\033[32m" # green
1453
+ Y = "\033[33m" # yellow
1454
+ R = "\033[0m" # reset
1455
+ B = "\033[1;32m" # bold green
1456
+
1457
+ running = []
1458
+ exited = []
1459
+ stopped = []
1460
+ for name, info in self.modules.items():
1461
+ rec = self.process_manager.get_record(name)
1462
+ is_running = self.process_manager.is_running(name)
1463
+ if is_running and rec:
1464
+ running.append((name, info, rec))
1465
+ elif self._desired_states.get(name) == "running" and not is_running:
1466
+ # Was started but already exited (e.g. module.exiting)
1467
+ exited.append((name, info))
1468
+ else:
1469
+ stopped.append((name, info))
1470
+
1471
+ # Calculate kernel startup time (Phase 1)
1472
+ kernel_time = phase_times.get("Phase 1: Kernel", 0)
1473
+
1474
+ lines = [
1475
+ "",
1476
+ f"{B}{'=' * 60}",
1477
+ f" Kite 内核启动完成 耗时 {kernel_time:.2f}s",
1478
+ f" Kite 全部模块启动完成 总耗时 {total_time:.2f}s",
1479
+ f"{'=' * 60}{R}",
1480
+ ]
1481
+
1482
+ # Phase breakdown
1483
+ lines.append(f"{G} 阶段耗时:{R}")
1484
+
1485
+ # Kernel modules section
1486
+ lines.append(f"{G} 内核模块:{R}")
1487
+ if "Phase 1: Kernel" in phase_times:
1488
+ elapsed = phase_times["Phase 1: Kernel"]
1489
+ lines.append(f"{G} {'Phase 1: Kernel':<26s} {elapsed:>6.2f}s{R}")
1490
+
1491
+ # Extension modules section
1492
+ lines.append(f"{G} 扩展模块:{R}")
1493
+ if "Phase 2: Extensions" in phase_times:
1494
+ elapsed = phase_times["Phase 2: Extensions"]
1495
+ lines.append(f"{G} {'Phase 2: Extensions':<26s} {elapsed:>6.2f}s{R}")
1496
+
1497
+ # Sort running modules by ready time
1498
+ running_sorted = sorted(running, key=lambda x: self._ready_times.get(x[0], float('inf')))
1499
+
1500
+ # Running modules with ready time and elapsed from Kite start
1501
+ DIM = "\033[90m"
1502
+ lines.append(f"{G} 运行中 ({len(running)}):{R}")
1503
+
1504
+ # CJK-aware display width helpers
1505
+ def _dw(s):
1506
+ """Display width: CJK chars count as 2, others as 1."""
1507
+ w = 0
1508
+ for c in str(s):
1509
+ w += 2 if '\u4e00' <= c <= '\u9fff' or '\u3000' <= c <= '\u303f' or '\uff00' <= c <= '\uffef' else 1
1510
+ return w
1511
+
1512
+ def _rpad(s, width):
1513
+ """Left-align s in a field of given display width."""
1514
+ return str(s) + ' ' * max(0, width - _dw(s))
1515
+
1516
+ def _lpad(s, width):
1517
+ """Right-align s in a field of given display width."""
1518
+ return ' ' * max(0, width - _dw(s)) + str(s)
1519
+
1520
+ # Column definitions: (header, align, min_width)
1521
+ headers = ['模块', 'PID', '启动耗时', '进程启动时长', '类型']
1522
+ aligns = ['left', 'right', 'right', 'right', 'left'] # alignment per column
1523
+
1524
+ # Build data rows first to calculate column widths
1525
+ rows = []
1526
+ for name, info, rec in running_sorted:
1527
+ label = info.display_name or name
1528
+ ready_t = self._ready_times.get(name)
1529
+ time_str = f"{ready_t:.2f}s" if ready_t is not None else "—"
1530
+ if ready_t is not None and hasattr(self, '_start_unix'):
1531
+ elapsed_from_start = (rec.started_at + ready_t) - self._start_unix
1532
+ es_str = f"{elapsed_from_start:.2f}s"
1533
+ else:
1534
+ es_str = "—"
1535
+
1536
+ # Check if module timed out (ready_t >= 15s for kernel, >= timeout for others)
1537
+ is_timeout = False
1538
+ if ready_t is not None:
1539
+ if name == "kernel" and ready_t >= 15:
1540
+ is_timeout = True
1541
+ elif name != "kernel" and ready_t >= 15: # Default timeout for other modules
1542
+ is_timeout = True
1543
+
1544
+ rows.append([label, str(rec.pid), time_str, es_str, f"[{info.type}]", is_timeout])
1545
+
1546
+ # Calculate column widths: max of header and all data display widths
1547
+ col_widths = [_dw(h) for h in headers]
1548
+ for row in rows:
1549
+ for i, cell in enumerate(row[:5]): # Only first 5 columns (exclude is_timeout flag)
1550
+ col_widths[i] = max(col_widths[i], _dw(cell))
1551
+
1552
+ # Render header
1553
+ hdr_parts = []
1554
+ for i, h in enumerate(headers):
1555
+ if aligns[i] == 'left':
1556
+ hdr_parts.append(_rpad(h, col_widths[i]))
1557
+ else:
1558
+ hdr_parts.append(_lpad(h, col_widths[i]))
1559
+ lines.append(f"{DIM} {' '.join(hdr_parts)}{R}")
1560
+
1561
+ # Render data rows
1562
+ RED = "\033[91m"
1563
+ for row in rows:
1564
+ is_timeout = row[5] # Last element is the timeout flag
1565
+ parts = []
1566
+ for i, cell in enumerate(row[:5]): # Only first 5 columns
1567
+ if aligns[i] == 'left':
1568
+ parts.append(_rpad(cell, col_widths[i]))
1569
+ else:
1570
+ parts.append(_lpad(cell, col_widths[i]))
1571
+ if is_timeout:
1572
+ lines.append(f"{RED} ✓ {' '.join(parts)}{R}")
1573
+ else:
1574
+ lines.append(f"{G} ✓ {' '.join(parts)}{R}")
1575
+
1576
+ # Exited modules (started but already quit)
1577
+ if exited:
1578
+ lines.append(f"{Y} 已退出 ({len(exited)}):{R}")
1579
+ for name, info in exited:
1580
+ label = info.display_name or name
1581
+ reason = self._exit_reasons.get(name, "")
1582
+ reason_str = f": {reason}" if reason else ""
1583
+ lines.append(f"{Y} ↗ {label:<20s} (主动退出{reason_str}){R}")
1584
+
1585
+ # Stopped modules
1586
+ if stopped:
1587
+ lines.append(f"{G} 未启动 ({len(stopped)}):{R}")
1588
+ for name, info in stopped:
1589
+ label = info.display_name or name
1590
+ lines.append(f"{G} - {label:<20s} ({info.state}){R}")
1591
+
1592
+ lines.append(f"{G} Kernel WS: ws://127.0.0.1:{self.kernel_port}/ws 实例: {self.instance_id}{R}")
1593
+
1594
+ # Query Kernel for web module's api_endpoint via RPC
1595
+ web_url = ""
1596
+ if self._ws:
1597
+ try:
1598
+ resp = await self._rpc_call(self._ws, "registry.get", {"path": "web.api_endpoint"}, timeout=3)
1599
+ val = resp.get("result", {}).get("value")
1600
+ if val and isinstance(val, str):
1601
+ web_url = val.replace("://127.0.0.1:", "://localhost:")
1602
+ except Exception:
1603
+ pass
1604
+ if web_url:
1605
+ lines.append(f"{B} Web 管理后台: {web_url}{R}")
1606
+
1607
+ # Instance info
1608
+ instances = self.process_manager.get_alive_instances()
1609
+ inst_num = self.process_manager.instance_num
1610
+ suffix_display = self.process_manager.instance_suffix or "(无)"
1611
+ inst_dir = os.environ.get("KITE_INSTANCE_DIR", "")
1612
+ cwd = os.environ.get("KITE_CWD", "")
1613
+ debug_flag = " [DEBUG]" if os.environ.get("KITE_DEBUG") == "1" else ""
1614
+ lines.append(f"{G} 当前实例: #{inst_num} 后缀: {suffix_display} PID: {os.getpid()}{debug_flag}{R}")
1615
+ lines.append(f"{G} 实例目录: {inst_dir}{R}")
1616
+ lines.append(f"{G} 工作目录: {cwd}{R}")
1617
+ if len(instances) > 1:
1618
+ lines.append(f"{G} 所有实例:{R}")
1619
+ for i in instances:
1620
+ s = "" if i["num"] == 1 else f"~{i['num']}"
1621
+ debug_tag = " [DEBUG]" if i.get("debug", False) else ""
1622
+ current_tag = " (当前)" if i["is_self"] else ""
1623
+ lines.append(f"{G} #{i['num']} PID {i['launcher_pid']} "
1624
+ f"模块数 {i['module_count']} (processes{s}.json){debug_tag}{current_tag}{R}")
1625
+
1626
+ # Cross-directory instances from other projects
1627
+ if global_instances:
1628
+ my_inst_basename = os.path.basename(os.environ.get("KITE_INSTANCE_DIR", ""))
1629
+ other_instances = [i for i in global_instances
1630
+ if not i["is_self"] and i["instance_dir"] != my_inst_basename]
1631
+ if other_instances:
1632
+ lines.append(f"{G} 其他项目实例:{R}")
1633
+ for i in other_instances:
1634
+ debug_tag = " [DEBUG]" if i.get("debug", False) else ""
1635
+ cwd_display = f" {i['cwd']}" if i["cwd"] else ""
1636
+ lines.append(
1637
+ f"{G} {i['instance_dir']:<20s} "
1638
+ f"#{i['num']} PID {i['launcher_pid']} "
1639
+ f"模块数 {i['module_count']}"
1640
+ f"{cwd_display}{debug_tag}{R}"
1641
+ )
1642
+
1643
+ if cleaned_stats:
1644
+ total = sum(cleaned_stats.values())
1645
+ if len(cleaned_stats) == 1:
1646
+ inst, count = next(iter(cleaned_stats.items()))
1647
+ lines.append(f"{Y} 已清理残留进程: {inst} ({count} 个){R}")
1648
+ else:
1649
+ lines.append(f"{Y} 已清理残留进程 (共 {total} 个):{R}")
1650
+ for inst, count in cleaned_stats.items():
1651
+ lines.append(f"{Y} {inst}: {count} 个{R}")
1652
+
1653
+ lines.append(f"{B}{'=' * 60}{R}")
1654
+ lines.append("")
1655
+
1656
+ print("\n".join(lines))
1657
+
1658
+ # ── Utilities ──
1659
+
1660
+ def _load_discovery(self) -> dict | None:
1661
+ """Read discovery config from launcher's own module.md."""
1662
+ md_path = os.path.join(os.environ["KITE_PROJECT"], "launcher", "module.md")
1663
+ try:
1664
+ with open(md_path, "r", encoding="utf-8") as f:
1665
+ fm = _parse_frontmatter(f.read())
1666
+ discovery = fm.get("discovery")
1667
+ if isinstance(discovery, dict) and discovery:
1668
+ return discovery
1669
+ except Exception as e:
1670
+ print(f"[launcher] 警告: 读取发现配置失败: {e}")
1671
+ return None
1672
+
1673
+ def _log_lifecycle(self, event: str, module: str, **extra):
1674
+ """Append one JSONL line to lifecycle.jsonl."""
1675
+ from datetime import datetime, timezone
1676
+ record = {"ts": datetime.now(timezone.utc).isoformat(), "event": event, "module": module}
1677
+ record.update(extra)
1678
+ try:
1679
+ os.makedirs(os.path.dirname(self._lifecycle_log), exist_ok=True)
1680
+ with open(self._lifecycle_log, "a", encoding="utf-8") as f:
1681
+ f.write(json.dumps(record, ensure_ascii=False) + "\n")
1682
+ except Exception:
1683
+ pass
1684
+
1685
+
1686
+
1687
+ def _update_module_md_state(module_dir: str, new_state: str):
1688
+ """Update the state field in a module's module.md frontmatter."""
1689
+ import re
1690
+ md_path = os.path.join(module_dir, "module.md")
1691
+ if not os.path.isfile(md_path):
1692
+ return
1693
+
1694
+ try:
1695
+ with open(md_path, "r", encoding="utf-8") as f:
1696
+ content = f.read()
1697
+
1698
+ updated = re.sub(
1699
+ r'^(state:\s*)(\S+)',
1700
+ rf'\g<1>{new_state}',
1701
+ content,
1702
+ count=1,
1703
+ flags=re.MULTILINE,
1704
+ )
1705
+
1706
+ with open(md_path, "w", encoding="utf-8") as f:
1707
+ f.write(updated)
1708
+ except Exception as e:
1709
+ print(f"[launcher] 警告: 更新 module.md 状态失败: {e}")
1710
+
1711
+
1712
def start_launcher():
    """Prepare the process environment and run the launcher (called from main.py).

    Steps, in order:
      1. Load a ``.env`` file if ``dotenv`` can be imported.
      2. Give every unset or empty ``KITE_*`` variable its default value.
      3. Turn a ``--debug`` command-line flag into ``KITE_DEBUG=1`` and
         remove it from ``sys.argv``.
      4. Call the logging setup helpers, create the ``Launcher`` with a fresh
         random token, then initialise log files and exception hooks.
      5. Run the launcher; if it raises an ``Exception`` the crash is handed
         to ``write_crash_handled`` and the process exits with status 1.
    """
    # .env loading is a development convenience; the package is optional.
    try:
        from dotenv import load_dotenv
        load_dotenv()
    except ImportError:
        pass

    env = os.environ

    # Project root: the parent of the directory that holds this file.
    this_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(this_dir)

    # Base directory for Kite data lives under the user's home.
    user_home = env.get("HOME") or env.get("USERPROFILE") or os.path.expanduser("~")
    kite_home = os.path.join(user_home, ".kite")

    # Values applied only where the variable is missing or empty.
    fallback_env = {
        "KITE_PROJECT": project_root,
        "KITE_CWD": os.getcwd(),
        "KITE_WORKSPACE": os.path.join(kite_home, "workspace"),
        "KITE_DATA": os.path.join(kite_home, "data"),
        "KITE_MODULES": os.path.join(kite_home, "modules"),
        "KITE_REPO": os.path.join(kite_home, "repo"),
        "KITE_ENV": "development",
    }
    for var, fallback in fallback_env.items():
        if env.get(var):
            continue
        env[var] = fallback

    # CLI flag handling
    debug_flag = "--debug"
    if debug_flag in sys.argv:
        env["KITE_DEBUG"] = "1"
        sys.argv.remove(debug_flag)

    # Logging helpers are imported late, after the environment is in place.
    from .logging_setup import (
        setup_timestamped_print,
        init_log_files,
        setup_exception_hooks,
        reset_time_baseline,
        write_crash_handled,
    )
    setup_timestamped_print()
    reset_time_baseline()

    print("[launcher] Kite 启动中...")

    # Build the launcher with a fresh random token.
    launcher = Launcher(kite_token=secrets.token_hex(32))
    print("[launcher] 启动器实例已创建")

    # Log files are initialised only after the Launcher exists
    # (the original notes KITE_MODULE_DATA is set by this point).
    init_log_files()
    setup_exception_hooks()

    module_data = os.environ.get("KITE_MODULE_DATA", "")
    suffix = launcher.process_manager.instance_suffix
    latest_log = os.path.join(module_data, "log", f"latest{suffix}.log")
    print(f"[launcher] 日志: {latest_log}")

    try:
        launcher.run()
    except Exception as exc:
        write_crash_handled(type(exc), exc, exc.__traceback__)
        sys.exit(1)