@agentunion/kite 1.0.7 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/core/event_hub/entry.py +305 -26
  2. package/core/event_hub/hub.py +8 -0
  3. package/core/event_hub/server.py +80 -17
  4. package/core/kite_log.py +241 -0
  5. package/core/launcher/entry.py +978 -284
  6. package/core/launcher/process_manager.py +456 -46
  7. package/core/registry/entry.py +272 -3
  8. package/core/registry/server.py +339 -289
  9. package/core/registry/store.py +10 -4
  10. package/extensions/agents/__init__.py +1 -0
  11. package/extensions/agents/assistant/__init__.py +1 -0
  12. package/extensions/agents/assistant/entry.py +380 -0
  13. package/extensions/agents/assistant/module.md +22 -0
  14. package/extensions/agents/assistant/server.py +236 -0
  15. package/extensions/channels/__init__.py +1 -0
  16. package/extensions/channels/acp_channel/__init__.py +1 -0
  17. package/extensions/channels/acp_channel/entry.py +380 -0
  18. package/extensions/channels/acp_channel/module.md +22 -0
  19. package/extensions/channels/acp_channel/server.py +236 -0
  20. package/extensions/event_hub_bench/entry.py +664 -379
  21. package/extensions/event_hub_bench/module.md +2 -1
  22. package/extensions/services/backup/__init__.py +1 -0
  23. package/extensions/services/backup/entry.py +380 -0
  24. package/extensions/services/backup/module.md +22 -0
  25. package/extensions/services/backup/server.py +244 -0
  26. package/extensions/services/model_service/__init__.py +1 -0
  27. package/extensions/services/model_service/entry.py +380 -0
  28. package/extensions/services/model_service/module.md +22 -0
  29. package/extensions/services/model_service/server.py +236 -0
  30. package/extensions/services/watchdog/entry.py +460 -147
  31. package/extensions/services/watchdog/module.md +3 -0
  32. package/extensions/services/watchdog/monitor.py +128 -13
  33. package/extensions/services/watchdog/server.py +75 -13
  34. package/extensions/services/web/__init__.py +1 -0
  35. package/extensions/services/web/config.yaml +149 -0
  36. package/extensions/services/web/entry.py +487 -0
  37. package/extensions/services/web/module.md +24 -0
  38. package/extensions/services/web/routes/__init__.py +1 -0
  39. package/extensions/services/web/routes/routes_call.py +189 -0
  40. package/extensions/services/web/routes/routes_config.py +512 -0
  41. package/extensions/services/web/routes/routes_contacts.py +98 -0
  42. package/extensions/services/web/routes/routes_devlog.py +99 -0
  43. package/extensions/services/web/routes/routes_phone.py +81 -0
  44. package/extensions/services/web/routes/routes_sms.py +48 -0
  45. package/extensions/services/web/routes/routes_stats.py +17 -0
  46. package/extensions/services/web/routes/routes_voicechat.py +554 -0
  47. package/extensions/services/web/routes/schemas.py +216 -0
  48. package/extensions/services/web/server.py +332 -0
  49. package/extensions/services/web/static/css/style.css +1064 -0
  50. package/extensions/services/web/static/index.html +1445 -0
  51. package/extensions/services/web/static/js/app.js +4671 -0
  52. package/extensions/services/web/vendor/__init__.py +1 -0
  53. package/extensions/services/web/vendor/bluetooth/__init__.py +0 -0
  54. package/extensions/services/web/vendor/bluetooth/audio.py +348 -0
  55. package/extensions/services/web/vendor/bluetooth/contacts.py +251 -0
  56. package/extensions/services/web/vendor/bluetooth/manager.py +395 -0
  57. package/extensions/services/web/vendor/bluetooth/sms.py +290 -0
  58. package/extensions/services/web/vendor/bluetooth/telephony.py +274 -0
  59. package/extensions/services/web/vendor/config.py +139 -0
  60. package/extensions/services/web/vendor/conversation/__init__.py +0 -0
  61. package/extensions/services/web/vendor/conversation/asr.py +936 -0
  62. package/extensions/services/web/vendor/conversation/engine.py +548 -0
  63. package/extensions/services/web/vendor/conversation/llm.py +534 -0
  64. package/extensions/services/web/vendor/conversation/mcp_tools.py +190 -0
  65. package/extensions/services/web/vendor/conversation/tts.py +322 -0
  66. package/extensions/services/web/vendor/conversation/vad.py +138 -0
  67. package/extensions/services/web/vendor/storage/__init__.py +1 -0
  68. package/extensions/services/web/vendor/storage/identity.py +312 -0
  69. package/extensions/services/web/vendor/storage/store.py +507 -0
  70. package/extensions/services/web/vendor/task/__init__.py +0 -0
  71. package/extensions/services/web/vendor/task/manager.py +864 -0
  72. package/extensions/services/web/vendor/task/models.py +45 -0
  73. package/extensions/services/web/vendor/task/webhook.py +263 -0
  74. package/extensions/services/web/vendor/tools/__init__.py +0 -0
  75. package/extensions/services/web/vendor/tools/registry.py +321 -0
  76. package/main.py +230 -90
  77. package/package.json +1 -1
@@ -8,9 +8,11 @@ Thread model:
8
8
  - (Windows) keyboard listener thread: polls for 'q' key
9
9
 
10
10
  4-Phase startup:
11
- Phase 1: Registry → stdout port → KITE_REGISTRY_PORT → API → register self + tokens
12
- Phase 2: Event Hub → stdin launcher_ws_token → stdout ws_endpoint → WS connect → module.ready
13
- Phase 3: Event Hub → Registry → Registry → Event Hub WS → module.ready
11
+ Phase 1: Registry + Event Hub (parallel start) → Registry stdout port → stdin broadcast port to Event Hub
12
+ → API → register self + tokens → stdin launcher_ws_token to Event Hub
13
+ → stdout ws_endpoint → WS connect → module.ready
14
+ Phase 2: (reserved — Event Hub ready handled in Phase 1)
15
+ Phase 3: Registry delayed ready (Event Hub → Registry → Event Hub WS → module.ready)
14
16
  Phase 4: start remaining enabled modules in topo order
15
17
  """
16
18
 
@@ -34,9 +36,17 @@ from .process_manager import ProcessManager
34
36
 
35
37
  IS_WINDOWS = sys.platform == "win32"
36
38
 
39
+ # Shutdown timeout constants (seconds)
40
+ SHUTDOWN_TIMEOUT_NON_GRACEFUL = 5 # Non-graceful modules or no ack response
41
+ SHUTDOWN_TIMEOUT_PARTIAL = 3 # Graceful module ack'd but no ready
42
+ SHUTDOWN_TIMEOUT_READY = 1 # Graceful module sent ready (cleanup done)
43
+ SHUTDOWN_TIMEOUT_BULK = 3 # Bulk stop_all() safety net
44
+
37
45
  # Core module names that are started in Phase 1-2 (not Phase 4)
38
46
  CORE_MODULE_NAMES = {"registry", "event_hub"}
39
47
 
48
+ WATCHDOG_MODULE_NAME = "watchdog"
49
+
40
50
 
41
51
  class Launcher:
42
52
  """Kite system entry point. Starts Registry, manages modules, exposes API."""
@@ -65,9 +75,9 @@ class Launcher:
65
75
  self.modules: dict[str, ModuleInfo] = {}
66
76
  self._shutdown_event = asyncio.Event()
67
77
  self._thread_shutdown = threading.Event()
78
+ self._shutdown_complete = threading.Event() # Set when normal shutdown finishes
68
79
  self._api_server: uvicorn.Server | None = None
69
80
  self._api_ready = threading.Event()
70
- self._fail_counts: dict[str, int] = {} # module_name -> consecutive failure count
71
81
  self._module_tokens: dict[str, str] = {} # module_name -> per-module token
72
82
 
73
83
  # Three-layer state model: desired_state per module
@@ -83,15 +93,48 @@ class Launcher:
83
93
  # Event waiters: {event_key: (asyncio.Event, data_dict)}
84
94
  self._event_waiters: dict[str, tuple[asyncio.Event, dict]] = {}
85
95
 
96
+ # Module ready times: module_name -> seconds from start to ready
97
+ self._ready_times: dict[str, float] = {}
98
+
99
+ # Shared HTTP client for Registry communication (lazy-init, reuses TCP connections)
100
+ self._http: httpx.AsyncClient | None = None
101
+
102
+ # Module exit reasons: module_name -> reason string (for modules that sent module.exiting)
103
+ self._exit_reasons: dict[str, str] = {}
104
+
105
+ # Graceful shutdown capability: module_name -> True if module declared support
106
+ # Registry and Event Hub default to True (they start before Watchdog can observe)
107
+ self._graceful_modules: dict[str, bool] = {"registry": True, "event_hub": True}
108
+
109
+ # System-wide shutdown flag: prevents Watchdog restart during shutdown
110
+ self._system_shutting_down = False
111
+
86
112
  # Kite stdout message waiters: {waiter_key: (threading.Event, data_dict)}
87
113
  # Used by ProcessManager stdout callback (cross-thread)
88
114
  self._msg_waiters: dict[str, tuple[threading.Event, dict]] = {}
89
115
 
90
- self._lifecycle_log = os.path.join(
91
- os.environ["KITE_INSTANCE_DIR"], "launcher", "lifecycle.jsonl",
92
- )
116
+ suffix = self.process_manager.instance_suffix
117
+ state_dir = os.path.join(os.environ["KITE_INSTANCE_DIR"], "launcher", "state")
118
+ os.makedirs(state_dir, exist_ok=True)
119
+ self._lifecycle_log = os.path.join(state_dir, f"lifecycle{suffix}.jsonl")
120
+ # Clear lifecycle log on startup (like latest.log)
121
+ try:
122
+ with open(self._lifecycle_log, "w", encoding="utf-8") as f:
123
+ pass
124
+ except Exception:
125
+ pass
126
+ os.environ["KITE_INSTANCE_SUFFIX"] = suffix
93
127
  self._app = self._create_api_app()
94
128
 
129
+ @staticmethod
130
+ def _fmt_elapsed(seconds: float) -> str:
131
+ """Format elapsed seconds: <1s → 'NNNms', >=1s → 'N.Ns', >=10s → 'NNs'."""
132
+ if seconds < 1:
133
+ return f"{seconds * 1000:.0f}ms"
134
+ if seconds < 10:
135
+ return f"{seconds:.1f}s"
136
+ return f"{seconds:.0f}s"
137
+
95
138
  # ── Instance workspace resolution ──
96
139
 
97
140
  @staticmethod
@@ -124,7 +167,6 @@ class Launcher:
124
167
  with open(cwd_file, "w", encoding="utf-8") as f:
125
168
  f.write(cwd)
126
169
  os.environ["KITE_INSTANCE_DIR"] = candidate
127
- print(f"[launcher] 实例工作区已创建: {candidate}")
128
170
  return
129
171
 
130
172
  if os.path.isfile(cwd_file):
@@ -132,7 +174,6 @@ class Launcher:
132
174
  with open(cwd_file, "r", encoding="utf-8") as f:
133
175
  if f.read().strip() == cwd:
134
176
  os.environ["KITE_INSTANCE_DIR"] = candidate
135
- print(f"[launcher] 实例工作区已找到: {candidate}")
136
177
  return
137
178
  except Exception:
138
179
  pass
@@ -180,8 +221,7 @@ class Launcher:
180
221
 
181
222
  def run(self):
182
223
  """Synchronous entry point. Sets up signals, runs the async main loop."""
183
- print("[launcher] Kite 启动中...")
184
- print("[launcher] ── 环境变量 ──")
224
+ print("[launcher] ── 环境 ──")
185
225
  for key in sorted(k for k in os.environ if k.startswith("KITE_")):
186
226
  print(f"[launcher] {key} = {os.environ[key]}")
187
227
  print(f"[launcher] PID = {os.getpid()}")
@@ -197,6 +237,8 @@ class Launcher:
197
237
  asyncio.run(self._async_main())
198
238
  except KeyboardInterrupt:
199
239
  pass
240
+ except RuntimeError as e:
241
+ print(f"[launcher] 启动失败: {e}")
200
242
  finally:
201
243
  self._final_cleanup()
202
244
 
@@ -204,7 +246,7 @@ class Launcher:
204
246
  """Request graceful shutdown. Thread-safe — can be called from signal handler or any thread."""
205
247
  if self._thread_shutdown.is_set():
206
248
  return # already shutting down
207
- print(f"\n[launcher] {reason or '收到关闭请求'}")
249
+ print(f"[launcher] {reason or '收到关闭请求'}")
208
250
  self._thread_shutdown.set()
209
251
  # Wake up asyncio event loop immediately (so _monitor_loop / wait_for exits)
210
252
  loop = self._loop
@@ -213,9 +255,19 @@ class Launcher:
213
255
  loop.call_soon_threadsafe(self._shutdown_event.set)
214
256
  except RuntimeError:
215
257
  pass
216
- # Safety net: force exit after 15s no matter what
258
+ # Safety net: force exit after 10s only if normal shutdown hasn't completed
217
259
  def _force():
218
- time.sleep(15)
260
+ if self._shutdown_complete.wait(timeout=10):
261
+ return # Normal shutdown completed — no need to force
262
+ try:
263
+ pm = self.process_manager
264
+ still = [n for n in pm._processes if pm.is_running(n)]
265
+ except Exception:
266
+ still = []
267
+ if still:
268
+ print(f"[launcher] 关闭超时,以下模块仍在运行: {', '.join(still)},强制退出")
269
+ else:
270
+ print("[launcher] 关闭超时,强制退出")
219
271
  os._exit(1)
220
272
  threading.Thread(target=_force, daemon=True).start()
221
273
 
@@ -266,62 +318,145 @@ class Launcher:
266
318
  async def _async_main(self):
267
319
  """Full 4-phase startup sequence, then monitor loop."""
268
320
  self._loop = asyncio.get_running_loop()
321
+ t_start = time.monotonic()
322
+ self._start_unix = time.time()
323
+ phase_times = {}
324
+ G = "\033[32m"
325
+ R = "\033[0m"
269
326
 
270
327
  # Validate core modules exist (mechanism 12)
271
328
  self._validate_core_modules()
272
329
 
273
- # Cleanup leftovers from previous instances
274
- self.process_manager.cleanup_leftovers()
275
-
276
- # Phase 1: Registry bootstrap
277
- await self._phase1_registry()
278
- if self._shutdown_event.is_set(): return
330
+ # Cleanup leftovers from previous instances (current instance dir)
331
+ local_cleaned = self.process_manager.cleanup_leftovers()
279
332
 
280
- # Scan modules (can happen before Phase 2)
281
- self.modules = self.module_scanner.scan()
282
- for name, info in self.modules.items():
283
- self._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
284
- print(f"[launcher] 发现 {len(self.modules)} 个模块: {', '.join(self.modules.keys()) or '(无)'}")
285
-
286
- # Generate per-module tokens (including event_hub and registry)
287
- await self._register_module_tokens()
288
- if self._shutdown_event.is_set(): return
333
+ # Cross-directory leftover cleanup (background, non-blocking)
334
+ # run_in_executor returns a Future (not coroutine), so use ensure_future
335
+ self._global_cleanup_task = asyncio.ensure_future(
336
+ asyncio.get_running_loop().run_in_executor(
337
+ None, self.process_manager.cleanup_global_leftovers
338
+ )
339
+ )
289
340
 
290
- # Phase 2: Event Hub bootstrap
291
- await self._phase2_event_hub()
292
- if self._shutdown_event.is_set(): return
341
+ try:
342
+ # Phase 1+2: Registry + Event Hub parallel bootstrap
343
+ t0 = time.monotonic()
344
+ await self._phase1_parallel_bootstrap()
345
+ elapsed_p1 = time.monotonic() - t0
346
+ phase_times["Phase 1+2: Registry + Event Hub (并行)"] = elapsed_p1
347
+ print(f"{G}[launcher] ✓ Phase 1+2 完成: Registry + Event Hub 已就绪 ({elapsed_p1:.2f}s){R}")
348
+ if self._shutdown_event.is_set(): return
349
+
350
+ # Phase 3: Wait for Registry delayed ready
351
+ t0 = time.monotonic()
352
+ await self._phase3_registry_ready()
353
+ elapsed = time.monotonic() - t0
354
+ phase_times["Phase 3: Registry 事件总线"] = elapsed
355
+ print(f"{G}[launcher] ✓ Phase 3 完成: Registry 已连接事件总线 ({elapsed:.2f}s){R}")
356
+ if self._shutdown_event.is_set(): return
357
+
358
+ # Initialize desired_state from config_state (needed before Phase 3.5)
359
+ for name, info in self.modules.items():
360
+ if info.state == "enabled":
361
+ self._desired_states[name] = "running"
362
+ else: # manual, disabled
363
+ self._desired_states[name] = "stopped"
364
+ # Core modules are already running
365
+ for cn in CORE_MODULE_NAMES:
366
+ self._desired_states[cn] = "running"
367
+
368
+ # Phase 3.5: Watchdog ready
369
+ # If started in parallel (Phase 1), just wait for module.ready
370
+ # Otherwise start it now (fallback)
371
+ watchdog_info = self.modules.get(WATCHDOG_MODULE_NAME)
372
+ if watchdog_info and self._desired_states.get(WATCHDOG_MODULE_NAME) == "running":
373
+ t0 = time.monotonic()
374
+ if getattr(self, '_watchdog_parallel', False):
375
+ print(f"[launcher] Phase 3.5: Watchdog 已并行启动,等待就绪...")
376
+ ready = await self._wait_event("module.ready", "watchdog", timeout=15)
377
+ elapsed = time.monotonic() - t0
378
+ if ready and not ready.get("_exited"):
379
+ self._graceful_modules["watchdog"] = bool(ready.get("graceful_shutdown"))
380
+ self._ready_times["watchdog"] = elapsed
381
+ print(f"[launcher] Watchdog 已就绪")
382
+ self._log_lifecycle("started", "watchdog")
383
+ await self._publish_event("module.started", {"module_id": "watchdog"})
384
+ self.process_manager.close_stdio("watchdog")
385
+ else:
386
+ print(f"[launcher] 警告: Watchdog 在 15s 内未就绪")
387
+ else:
388
+ print(f"[launcher] Phase 3.5: 启动 Watchdog...")
389
+ await self._start_one_module(watchdog_info)
390
+ elapsed = time.monotonic() - t0
391
+ print(f"{G}[launcher] ✓ Phase 3.5 完成: Watchdog ({elapsed:.2f}s){R}")
392
+ if self._shutdown_event.is_set(): return
393
+
394
+ # Phase 4: Start remaining enabled modules
395
+ t0 = time.monotonic()
396
+ await self._phase4_start_modules()
397
+ elapsed = time.monotonic() - t0
398
+ phase_times["Phase 4: Extensions"] = elapsed
399
+ print(f"{G}[launcher] ✓ Phase 4 完成: 扩展模块已启动 ({elapsed:.2f}s){R}")
400
+ if self._shutdown_event.is_set(): return
293
401
 
294
- # Phase 3: Wait for Registry delayed ready
295
- await self._phase3_registry_ready()
296
- if self._shutdown_event.is_set(): return
402
+ # Post-startup
403
+ self.process_manager.persist_records()
404
+ self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
297
405
 
298
- # Phase 4: Start remaining enabled modules
299
- # Initialize desired_state from config_state
300
- for name, info in self.modules.items():
301
- if info.state == "enabled":
302
- self._desired_states[name] = "running"
303
- else: # manual, disabled
304
- self._desired_states[name] = "stopped"
305
- # Core modules are already running
306
- for cn in CORE_MODULE_NAMES:
307
- self._desired_states[cn] = "running"
406
+ # Wait for global leftover cleanup to finish (non-blocking with timeout)
407
+ global_cleaned = {}
408
+ if hasattr(self, '_global_cleanup_task'):
409
+ try:
410
+ global_cleaned = await asyncio.wait_for(self._global_cleanup_task, timeout=5) or {}
411
+ except asyncio.TimeoutError:
412
+ print("[launcher] 警告: 全局遗留清理超时 (5s),跳过")
413
+ except Exception as e:
414
+ print(f"[launcher] 警告: 全局遗留清理出错: {e}")
415
+ # Merge local + global cleanup stats
416
+ cleaned_stats: dict[str, int] = {}
417
+ for d in (local_cleaned, global_cleaned):
418
+ for k, v in d.items():
419
+ cleaned_stats[k] = cleaned_stats.get(k, 0) + v
420
+
421
+ # Global instance scan (via executor to avoid blocking)
422
+ global_instances = await asyncio.get_running_loop().run_in_executor(
423
+ None, self.process_manager.get_global_instances
424
+ )
308
425
 
309
- await self._phase4_start_modules()
310
- if self._shutdown_event.is_set(): return
426
+ # ── Startup report ──
427
+ total_time = time.monotonic() - t_start
428
+ await self._print_startup_report(total_time, phase_times,
429
+ global_instances=global_instances,
430
+ cleaned_stats=cleaned_stats)
431
+ # Notify all modules that system startup is complete
432
+ await self._publish_event("system.ready", {
433
+ "startup_time": round(total_time, 2),
434
+ })
311
435
 
312
- # Post-startup
313
- self.process_manager.persist_records()
314
- self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
436
+ print("[launcher] 进入监控循环 (按 Ctrl+C 或 'q' 退出)")
437
+ await self._monitor_loop()
438
+ finally:
439
+ try:
440
+ await self._graceful_shutdown_all()
441
+ except Exception as e:
442
+ print(f"[launcher] 优雅关闭出错: {e}")
315
443
 
316
- print("[launcher] 进入监控循环 ( Ctrl+C 'q' 退出)")
317
- await self._monitor_loop()
444
+ # ── Phase 1+2: Parallel bootstrap (Registry + Event Hub) ──
318
445
 
319
- await self._graceful_shutdown_all()
446
+ async def _phase1_parallel_bootstrap(self):
447
+ """Start Registry + Event Hub processes in parallel to overlap cold-start time.
320
448
 
321
- # ── Phase 1: Registry ──
449
+ Flow:
450
+ 1. Start Registry + Event Hub processes simultaneously
451
+ 2. Wait for Registry to report port via stdout
452
+ 3. Set KITE_REGISTRY_PORT env (for Phase 3.5/4 modules) + start API
453
+ 4. Scan modules + register self & tokens (parallel)
454
+ 5. Send launcher_ws_token + registry_port to Event Hub via stdin
455
+ 6. Wait for Event Hub ws_endpoint → WS connect → module.ready
456
+ """
457
+ t_registry = time.monotonic()
322
458
 
323
- async def _phase1_registry(self):
324
- """Start Registry → capture port from stdout → set env → start API → register self."""
459
+ # ── Step 1: Start both processes ──
325
460
  registry_dir = os.path.join(os.environ["KITE_PROJECT"], "core", "registry")
326
461
  registry_info = ModuleInfo(
327
462
  name="registry",
@@ -332,30 +467,186 @@ class Launcher:
332
467
  entry="entry.py",
333
468
  module_dir=registry_dir,
334
469
  )
335
-
336
- boot_info = {"token": self.kite_token}
470
+ boot_info_registry = {"token": self.kite_token}
337
471
  self._log_lifecycle("starting", "registry")
338
- ok = self.process_manager.start_module(registry_info, boot_info=boot_info)
472
+ ok = self.process_manager.start_module(registry_info, boot_info=boot_info_registry)
339
473
  if not ok:
340
474
  self._log_lifecycle("start_failed", "registry")
341
475
  raise RuntimeError("启动 Registry 失败")
342
476
 
343
- # Wait for Registry to output port via stdout (mechanism 2)
344
- print("[launcher] 等待 Registry 端口...")
477
+ # Start Event Hub in parallel (before Registry port is known)
478
+ eh_dir = os.path.join(os.environ["KITE_PROJECT"], "core", "event_hub")
479
+ eh_info = ModuleInfo(
480
+ name="event_hub",
481
+ display_name="Event Hub",
482
+ type="infrastructure",
483
+ state="enabled",
484
+ runtime="python",
485
+ entry="entry.py",
486
+ module_dir=eh_dir,
487
+ )
488
+ # Generate Event Hub token early (will register to Registry once it's up)
489
+ eh_token = secrets.token_hex(32)
490
+ self._module_tokens["event_hub"] = eh_token
491
+ boot_info_eh = {"token": eh_token}
492
+ self._log_lifecycle("starting", "event_hub")
493
+ ok = self.process_manager.start_module(eh_info, boot_info=boot_info_eh)
494
+ if not ok:
495
+ self._log_lifecycle("start_failed", "event_hub")
496
+ raise RuntimeError("启动 Event Hub 失败")
497
+
498
+ # Start Watchdog in parallel (before Registry port is known)
499
+ # Watchdog will block on stdin waiting for registry_port
500
+ watchdog_dir = os.path.join(os.environ["KITE_PROJECT"], "extensions", "services", "watchdog")
501
+ watchdog_md = os.path.join(watchdog_dir, "module.md")
502
+ self._watchdog_parallel = False # track whether watchdog was started in parallel
503
+ if os.path.isfile(watchdog_md):
504
+ wd_token = secrets.token_hex(32)
505
+ self._module_tokens["watchdog"] = wd_token
506
+ # Parse watchdog module.md for ModuleInfo
507
+ try:
508
+ with open(watchdog_md, "r", encoding="utf-8") as f:
509
+ wd_fm = _parse_frontmatter(f.read())
510
+ wd_info = ModuleInfo(
511
+ name="watchdog",
512
+ display_name=wd_fm.get("display_name", "Watchdog"),
513
+ type=wd_fm.get("type", "service"),
514
+ state="enabled",
515
+ runtime=wd_fm.get("runtime", "python"),
516
+ entry=wd_fm.get("entry", "entry.py"),
517
+ module_dir=watchdog_dir,
518
+ )
519
+ boot_info_wd = {"token": wd_token}
520
+ self._log_lifecycle("starting", "watchdog")
521
+ ok = self.process_manager.start_module(wd_info, boot_info=boot_info_wd)
522
+ if ok:
523
+ self._watchdog_parallel = True
524
+ else:
525
+ self._log_lifecycle("start_failed", "watchdog")
526
+ print("[launcher] 警告: Watchdog 并行启动失败,将在 Phase 3.5 重试")
527
+ except Exception as e:
528
+ print(f"[launcher] 警告: Watchdog module.md 解析失败: {e}")
529
+
530
+ parallel_modules = "Registry + Event Hub" + (" + Watchdog" if self._watchdog_parallel else "")
531
+ print(f"[launcher] {parallel_modules} 进程已同时启动,等待 Registry 端口...")
532
+
533
+ # Persist immediately after starting core processes
534
+ self.process_manager.persist_records()
535
+
536
+ # ── Step 2: Wait for Registry port ──
345
537
  msg = await self._wait_kite_message("registry", "port", timeout=6)
346
538
  if not msg or not msg.get("port"):
347
539
  raise RuntimeError("致命错误: Registry 在 6s 内未报告端口")
348
540
  self.registry_port = int(msg["port"])
349
- print(f"[launcher] Registry 端口: {self.registry_port}")
541
+ self._ready_times["registry"] = time.monotonic() - t_registry
542
+ _wait_s = time.monotonic() - t_registry
543
+ print(f"[launcher] Registry 端口: {self.registry_port} (等待 {self._fmt_elapsed(_wait_s)})")
350
544
 
351
- # Set KITE_REGISTRY_PORT for all subsequent child processes
545
+ # ── Step 3: Set env + start API + immediately unblock Event Hub ──
352
546
  os.environ["KITE_REGISTRY_PORT"] = str(self.registry_port)
353
-
354
- # Start Launcher API in a separate thread
355
547
  self._start_api_thread()
356
548
 
357
- # Register Launcher itself to Registry
358
- await self._register_self()
549
+ # Send launcher_ws_token + registry_port to Event Hub ASAP (unblock it)
550
+ self._launcher_ws_token = secrets.token_hex(32)
551
+ self.process_manager.write_stdin("event_hub", {
552
+ "kite": "launcher_ws_token",
553
+ "launcher_ws_token": self._launcher_ws_token,
554
+ })
555
+ self.process_manager.write_stdin("event_hub", {
556
+ "kite": "registry_port",
557
+ "registry_port": self.registry_port,
558
+ })
559
+
560
+ # Send registry_port to Watchdog via stdin (if started in parallel)
561
+ # Watchdog will retry querying launcher.api_endpoint until it's available
562
+ if self.process_manager.is_running("watchdog"):
563
+ self.process_manager.write_stdin("watchdog", {
564
+ "kite": "registry_port",
565
+ "registry_port": self.registry_port,
566
+ })
567
+
568
+ # ── Step 4: Scan + register tokens ‖ wait for Event Hub ws_endpoint (parallel) ──
569
+ # Pre-register ws_endpoint waiter BEFORE gather to avoid race condition:
570
+ # module_scanner.scan() is synchronous and blocks the event loop,
571
+ # so the _wait_event_hub_endpoint coroutine wouldn't register its waiter in time.
572
+ ws_waiter_key = "event_hub:ws_endpoint"
573
+ ws_evt = threading.Event()
574
+ ws_data: dict = {}
575
+ self._msg_waiters[ws_waiter_key] = (ws_evt, ws_data)
576
+
577
+ async def _scan_and_register_tokens():
578
+ t_scan = time.monotonic()
579
+ self.modules = self.module_scanner.scan()
580
+ for name, info in self.modules.items():
581
+ self._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
582
+ _scan_s = time.monotonic() - t_scan
583
+ print(f"[launcher] 发现 {len(self.modules)} 个模块: {', '.join(self.modules.keys()) or '(无)'} (扫描 {self._fmt_elapsed(_scan_s)})")
584
+ t_reg = time.monotonic()
585
+ await self._register_module_tokens()
586
+ _reg_s = time.monotonic() - t_reg
587
+ print(f"[launcher] 令牌注册完成 ({self._fmt_elapsed(_reg_s)})")
588
+
589
+ async def _wait_event_hub_endpoint():
590
+ t_wait_eh = time.monotonic()
591
+ print("[launcher] 等待 Event Hub ws_endpoint...")
592
+ shutdown = self._thread_shutdown
593
+ def _wait():
594
+ deadline = time.monotonic() + 10
595
+ while time.monotonic() < deadline:
596
+ if ws_evt.wait(timeout=0.5):
597
+ return True
598
+ if shutdown.is_set():
599
+ return False
600
+ return False
601
+ got = await asyncio.get_running_loop().run_in_executor(None, _wait)
602
+ self._msg_waiters.pop(ws_waiter_key, None)
603
+ if not got or not ws_data.get("ws_endpoint"):
604
+ raise RuntimeError("致命错误: Event Hub 在 10s 内未报告 ws_endpoint")
605
+ self._event_hub_ws_url = ws_data["ws_endpoint"]
606
+ _eh_s = time.monotonic() - t_wait_eh
607
+ print(f"[launcher] Event Hub 已发现: {self._event_hub_ws_url} (等待 {self._fmt_elapsed(_eh_s)})")
608
+
609
+ # Run all three in parallel: register_self + scan_tokens + wait_event_hub
610
+ await asyncio.gather(
611
+ self._register_self(),
612
+ _scan_and_register_tokens(),
613
+ _wait_event_hub_endpoint(),
614
+ )
615
+ if self._shutdown_event.is_set():
616
+ return
617
+
618
+ # ── Step 5: WS connect → module.ready ──
619
+ t_eh = time.monotonic()
620
+ self._ws_task = asyncio.create_task(self._ws_loop())
621
+
622
+ # Wait for Event Hub module.ready (sent when Launcher connects)
623
+ ready = await self._wait_event("module.ready", "event_hub", timeout=15)
624
+ if ready:
625
+ self._graceful_modules["event_hub"] = bool(ready.get("graceful_shutdown"))
626
+ print("[launcher] Event Hub 已就绪")
627
+ else:
628
+ print("[launcher] 警告: Event Hub 在 15s 内未发送 module.ready")
629
+
630
+ self._ready_times["event_hub"] = time.monotonic() - t_eh
631
+ self._log_lifecycle("started", "event_hub")
632
+ await self._publish_event("module.started", {"module_id": "event_hub"})
633
+ self.process_manager.close_stdio("event_hub")
634
+
635
+ # Store eh_info in modules dict if not already present (from scan)
636
+ if "event_hub" not in self.modules:
637
+ self.modules["event_hub"] = eh_info
638
+
639
+ def _get_http(self) -> httpx.AsyncClient:
640
+ """Get shared HTTP client (lazy-init, reuses TCP connections to Registry)."""
641
+ if self._http is None or self._http.is_closed:
642
+ self._http = httpx.AsyncClient(timeout=5)
643
+ return self._http
644
+
645
+ async def _close_http(self):
646
+ """Close shared HTTP client."""
647
+ if self._http and not self._http.is_closed:
648
+ await self._http.aclose()
649
+ self._http = None
359
650
 
360
651
  async def _register_self(self):
361
652
  """Register Launcher itself to Registry."""
@@ -376,86 +667,29 @@ class Launcher:
376
667
  "events_subscribe": [">"],
377
668
  }
378
669
  try:
379
- async with httpx.AsyncClient() as client:
380
- resp = await client.post(url, json=payload, headers=headers, timeout=5)
381
- if resp.status_code == 200:
382
- print("[launcher] 已注册到 Registry")
383
- else:
384
- print(f"[launcher] 警告: Registry 注册返回 {resp.status_code}")
670
+ client = self._get_http()
671
+ resp = await client.post(url, json=payload, headers=headers)
672
+ if resp.status_code == 200:
673
+ print("[launcher] 已注册到 Registry")
674
+ else:
675
+ print(f"[launcher] 警告: Registry 注册返回 {resp.status_code}")
385
676
  except Exception as e:
386
677
  print(f"[launcher] 警告: 注册到 Registry 失败: {e}")
387
678
 
388
- # ── Phase 2: Event Hub ──
389
-
390
- async def _phase2_event_hub(self):
391
- """Start Event Hub → stdin launcher_ws_token → stdout ws_endpoint → WS connect → module.ready."""
392
- # Find event_hub in scanned modules or build manually
393
- eh_info = self.modules.get("event_hub")
394
- if not eh_info:
395
- eh_dir = os.path.join(os.environ["KITE_PROJECT"], "core", "event_hub")
396
- eh_info = ModuleInfo(
397
- name="event_hub",
398
- display_name="Event Hub",
399
- type="infrastructure",
400
- state="enabled",
401
- runtime="python",
402
- entry="entry.py",
403
- module_dir=eh_dir,
404
- )
405
-
406
- token = self._module_tokens.get("event_hub", "")
407
- if not token:
408
- token = secrets.token_hex(32)
409
- self._module_tokens["event_hub"] = token
410
- await self._register_tokens_to_registry({"event_hub": token})
411
-
412
- boot_info = {"token": token}
413
- self._log_lifecycle("starting", "event_hub")
414
- ok = self.process_manager.start_module(eh_info, boot_info=boot_info)
415
- if not ok:
416
- self._log_lifecycle("start_failed", "event_hub")
417
- raise RuntimeError("启动 Event Hub 失败")
418
-
419
- # Send launcher_ws_token via stdin (mechanism 6)
420
- self._launcher_ws_token = secrets.token_hex(32)
421
- self.process_manager.write_stdin("event_hub", {
422
- "kite": "launcher_ws_token",
423
- "launcher_ws_token": self._launcher_ws_token,
424
- })
425
-
426
- # Wait for ws_endpoint from stdout (mechanism 5)
427
- print("[launcher] 等待 Event Hub ws_endpoint...")
428
- msg = await self._wait_kite_message("event_hub", "ws_endpoint", timeout=6)
429
- if not msg or not msg.get("ws_endpoint"):
430
- raise RuntimeError("致命错误: Event Hub 在 6s 内未报告 ws_endpoint")
431
- self._event_hub_ws_url = msg["ws_endpoint"]
432
- print(f"[launcher] Event Hub 已发现: {self._event_hub_ws_url}")
433
-
434
- # Connect to Event Hub WebSocket with launcher_ws_token
435
- self._ws_task = asyncio.create_task(self._ws_loop())
436
-
437
- # Wait for Event Hub module.ready (sent when Launcher connects)
438
- ready = await self._wait_event("module.ready", "event_hub", timeout=15)
439
- if ready:
440
- print("[launcher] Event Hub 已就绪")
441
- else:
442
- print("[launcher] 警告: Event Hub 在 15s 内未发送 module.ready")
443
-
444
- self._log_lifecycle("started", "event_hub")
445
- await self._publish_event("module.started", {"module_id": "event_hub"})
446
- self.process_manager.close_stdio("event_hub")
679
+ # ── (Phase 2 merged into _phase1_parallel_bootstrap) ──
447
680
 
448
681
  # ── Phase 3: Registry delayed ready ──
449
682
 
450
683
  async def _phase3_registry_ready(self):
451
684
  """Wait for Registry module.ready (triggered after Event Hub registers to Registry
452
685
  and Registry connects to Event Hub WS)."""
453
- print("[launcher] 等待 Registry 延迟就绪...")
686
+ print("[launcher] 等待 Registry 连接 Event Hub...")
454
687
  ready = await self._wait_event("module.ready", "registry", timeout=12)
455
688
  if ready:
456
- print("[launcher] Registry 已就绪")
689
+ self._graceful_modules["registry"] = bool(ready.get("graceful_shutdown"))
690
+ print("[launcher] Registry 事件总线连接完成")
457
691
  else:
458
- print("[launcher] 警告: Registry 在 12s 内未发送 module.ready (降级运行)")
692
+ print("[launcher] 警告: Registry 在 12s 内未连接事件总线 (降级运行)")
459
693
 
460
694
  self._log_lifecycle("started", "registry")
461
695
  await self._publish_event("module.started", {"module_id": "registry"})
@@ -467,7 +701,8 @@ class Launcher:
467
701
  """Start enabled modules (excluding core) in dependency order."""
468
702
  to_start = [m for m in self.modules.values()
469
703
  if self._desired_states.get(m.name) == "running"
470
- and m.name not in CORE_MODULE_NAMES]
704
+ and m.name not in CORE_MODULE_NAMES
705
+ and m.name != WATCHDOG_MODULE_NAME]
471
706
  if not to_start:
472
707
  print("[launcher] 没有额外模块需要启动")
473
708
  return
@@ -487,14 +722,18 @@ class Launcher:
487
722
  print(f"[launcher] 错误: '{m.name}' 依赖已禁用的模块 '{dep}'")
488
723
 
489
724
  try:
490
- sorted_modules = self._topo_sort(to_start)
725
+ layers = self._topo_layers(to_start)
491
726
  except RuntimeError as e:
492
727
  print(f"[launcher] 错误: {e}")
493
728
  return
494
729
 
495
- print(f"[launcher] 正在启动 {len(sorted_modules)} 个模块...")
496
- for info in sorted_modules:
497
- await self._start_one_module(info)
730
+ total = sum(len(layer) for layer in layers)
731
+ print(f"[launcher] 正在启动 {total} 个模块...")
732
+ for layer in layers:
733
+ if len(layer) == 1:
734
+ await self._start_one_module(layer[0])
735
+ else:
736
+ await asyncio.gather(*(self._start_one_module(info) for info in layer))
498
737
 
499
738
  # ── Event Hub WebSocket connection ──
500
739
 
@@ -506,16 +745,19 @@ class Launcher:
506
745
  except asyncio.CancelledError:
507
746
  return
508
747
  except Exception as e:
509
- print(f"[launcher] Event Hub 连接错误: {e}")
748
+ if not self._system_shutting_down:
749
+ print(f"[launcher] Event Hub 连接错误: {e}")
510
750
  self._ws = None
511
751
  await asyncio.sleep(5)
512
752
 
513
753
  async def _ws_connect(self):
514
754
  """Single WebSocket session with launcher_ws_token auth."""
515
- ws_url = f"{self._event_hub_ws_url}?token={self._launcher_ws_token}"
516
- async with websockets.connect(ws_url, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
755
+ ws_url = f"{self._event_hub_ws_url}?token={self._launcher_ws_token}&id=launcher"
756
+ t_ws_connect = time.monotonic()
757
+ async with websockets.connect(ws_url, open_timeout=3, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
517
758
  self._ws = ws
518
- print("[launcher] 已连接到 Event Hub")
759
+ _ws_s = time.monotonic() - t_ws_connect
760
+ print(f"[launcher] 已连接到 Event Hub ({self._fmt_elapsed(_ws_s)})")
519
761
 
520
762
  # Subscribe to all events
521
763
  await ws.send(json.dumps({
@@ -529,52 +771,89 @@ class Launcher:
529
771
  msg = json.loads(raw)
530
772
  except (json.JSONDecodeError, TypeError):
531
773
  continue
532
- msg_type = msg.get("type", "")
533
- if msg_type == "event":
534
- source = msg.get("source", "unknown")
535
- event = msg.get("event", "")
536
- data = msg.get("data", {})
537
- # Trigger event waiters
538
- module_id = data.get("module_id", "")
539
- waiter_key = f"{event}:{module_id}"
540
- waiter = self._event_waiters.get(waiter_key)
541
- if waiter:
542
- waiter[1].update(data)
543
- waiter[0].set()
544
- ts = msg.get("timestamp", "")
545
- latency_str = ""
546
- if ts:
547
- try:
548
- from datetime import datetime, timezone
549
- sent = datetime.fromisoformat(ts)
550
- delay_ms = (datetime.now(timezone.utc) - sent).total_seconds() * 1000
551
- latency_str = f" ({delay_ms:.1f}ms)"
552
- local_ts = sent.astimezone().strftime("%H:%M:%S")
553
- except Exception:
554
- local_ts = ts[11:19] if len(ts) >= 19 else ts
555
- print(f"[{source}] {local_ts} {event}{latency_str}: {json.dumps(data, ensure_ascii=False)}")
556
- else:
557
- print(f"[{source}] {event}: {json.dumps(data, ensure_ascii=False)}")
558
- elif msg_type == "error":
559
- print(f"[launcher] Event Hub 错误: {msg.get('message')}")
774
+ try:
775
+ msg_type = msg.get("type", "")
776
+ if msg_type == "event":
777
+ source = msg.get("source", "unknown")
778
+ event = msg.get("event", "")
779
+ data = msg.get("data") if isinstance(msg.get("data"), dict) else {}
780
+ # Trigger event waiters
781
+ module_id = data.get("module_id", "")
782
+ waiter_key = f"{event}:{module_id}"
783
+ waiter = self._event_waiters.get(waiter_key)
784
+ if waiter:
785
+ waiter[1].update(data)
786
+ waiter[0].set()
787
+ # module.exiting also wakes module.ready waiter
788
+ # (module won't send ready — no point waiting)
789
+ if event == "module.exiting" and module_id:
790
+ ready_key = f"module.ready:{module_id}"
791
+ ready_waiter = self._event_waiters.get(ready_key)
792
+ if ready_waiter:
793
+ ready_waiter[1].update(data)
794
+ ready_waiter[1]["_exited"] = True
795
+ ready_waiter[0].set()
796
+ # module.crash print red crash summary (real-time notification)
797
+ if event == "module.crash" and module_id:
798
+ RED = "\033[91m"
799
+ RESET = "\033[0m"
800
+ exc_type = data.get("exception_type", "Unknown")
801
+ preview = data.get("traceback_preview", "")
802
+ severity = data.get("severity", "error")
803
+ print(f"[launcher] {RED}模块 '{module_id}' 崩溃: "
804
+ f"{exc_type} — {preview}{RESET}")
805
+ _suffix = os.environ.get("KITE_INSTANCE_SUFFIX", "")
806
+ crash_log = os.path.join(
807
+ os.environ.get("KITE_INSTANCE_DIR", ""),
808
+ module_id, "log", f"crashes{_suffix}.jsonl"
809
+ )
810
+ print(f"[launcher] 崩溃日志: {crash_log}")
811
+ ts = msg.get("timestamp", "")
812
+ # Only log system events (module.*, watchdog.*) to avoid flooding
813
+ # from benchmark/test traffic
814
+ if not (event.startswith("module.") or event.startswith("watchdog.")):
815
+ continue
816
+ latency_str = ""
817
+ if ts:
818
+ try:
819
+ from datetime import datetime, timezone
820
+ sent = datetime.fromisoformat(ts)
821
+ delay_ms = (datetime.now(timezone.utc) - sent).total_seconds() * 1000
822
+ latency_str = f" ({delay_ms:.1f}ms)"
823
+ local_ts = sent.astimezone().strftime("%H:%M:%S")
824
+ except Exception:
825
+ local_ts = ts[11:19] if len(ts) >= 19 else ts
826
+ print(f"[{source}] {local_ts} {event}{latency_str}: {json.dumps(data, ensure_ascii=False)}")
827
+ else:
828
+ print(f"[{source}] {event}: {json.dumps(data, ensure_ascii=False)}")
829
+ elif msg_type == "error":
830
+ print(f"[launcher] Event Hub 错误: {msg.get('message')}")
831
+ except Exception as e:
832
+ print(f"[launcher] 事件处理异常(已忽略): {e}")
560
833
 
561
834
  async def _publish_event(self, event_type: str, data: dict):
562
- """Publish an event to Event Hub via WebSocket."""
835
+ """Publish an event to Event Hub via WebSocket. Uses create_task to avoid
836
+ deadlock with _ws_connect recv loop (websockets 15.x send can block when
837
+ incoming frames are pending and recv is held by async-for)."""
563
838
  if not self._ws:
564
839
  return
565
840
  from datetime import datetime, timezone
566
- msg = {
841
+ msg = json.dumps({
567
842
  "type": "event",
568
843
  "event_id": str(uuid.uuid4()),
569
844
  "event": event_type,
570
845
  "source": "launcher",
571
846
  "timestamp": datetime.now(timezone.utc).isoformat(),
572
847
  "data": data,
573
- }
574
- try:
575
- await self._ws.send(json.dumps(msg))
576
- except Exception as e:
577
- print(f"[launcher] 发布事件失败: {e}")
848
+ })
849
+
850
+ async def _send():
851
+ try:
852
+ await self._ws.send(msg)
853
+ except Exception as e:
854
+ print(f"[launcher] 发布事件失败: {e}")
855
+
856
+ asyncio.create_task(_send())
578
857
 
579
858
  def _publish_event_threadsafe(self, event_type: str, data: dict):
580
859
  """Publish event from non-async context (API thread). Fire-and-forget."""
@@ -599,53 +878,127 @@ class Launcher:
599
878
  self._event_waiters.pop(key, None)
600
879
 
601
880
  async def _graceful_stop(self, name: str, reason: str = "stop_requested", timeout: float = 10):
602
- """Graceful shutdown: send event → wait ack → wait ready → kill."""
881
+ """Graceful shutdown: check capability → send event → wait ack → wait ready → kill.
882
+ Modules that did not declare graceful_shutdown in module.ready are terminated directly.
883
+ """
603
884
  self._log_lifecycle("stopping", name, reason=reason)
885
+
886
+ if not self._graceful_modules.get(name):
887
+ self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
888
+ self._log_lifecycle("stopped", name, reason=reason)
889
+ await self._publish_event("module.stopped", {
890
+ "module_id": name,
891
+ "graceful_shutdown": False,
892
+ })
893
+ return
894
+
604
895
  await self._publish_event("module.shutdown", {
605
896
  "module_id": name, "reason": reason, "timeout": timeout,
606
897
  })
607
898
 
608
899
  ack = await self._wait_event("module.shutdown.ack", name, timeout=3)
609
900
  if not ack:
610
- self.process_manager.stop_module(name, timeout=5)
611
- await self._publish_event("module.stopped", {"module_id": name})
901
+ self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
902
+ await self._publish_event("module.stopped", {
903
+ "module_id": name,
904
+ "graceful_shutdown": self._graceful_modules.get(name, False),
905
+ })
612
906
  return
613
907
 
614
908
  estimated = min(ack.get("estimated_cleanup", timeout), timeout)
615
909
  ready = await self._wait_event("module.shutdown.ready", name, timeout=estimated)
616
910
  if ready:
617
- self.process_manager.stop_module(name, timeout=1)
911
+ self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_READY)
618
912
  else:
619
- self.process_manager.stop_module(name, timeout=3)
913
+ self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
620
914
 
621
915
  self._log_lifecycle("stopped", name, reason=reason)
622
- await self._publish_event("module.stopped", {"module_id": name})
916
+ await self._publish_event("module.stopped", {
917
+ "module_id": name,
918
+ "graceful_shutdown": self._graceful_modules.get(name, False),
919
+ })
623
920
 
624
921
  async def _graceful_shutdown_all(self):
625
- """Broadcast module.shutdown to all running modules, then force-kill survivors."""
922
+ """Shut down all modules. Order:
923
+ 1. Send shutdown to graceful modules (excl. Event Hub) — let them start cleanup
924
+ 2. Terminate non-graceful modules (fast, runs during graceful cleanup)
925
+ 3. Wait for graceful modules to exit (process monitoring)
926
+ 4. Shut down Event Hub last (keeps event routing alive throughout)
927
+ """
928
+ self._system_shutting_down = True
626
929
  running = [n for n in self.modules if self.process_manager.is_running(n)]
627
930
  # Also check core modules
628
931
  for cn in CORE_MODULE_NAMES:
629
932
  if self.process_manager.is_running(cn) and cn not in running:
630
933
  running.append(cn)
631
934
  if not running:
935
+ print("[launcher] 没有运行中的模块需要关闭")
632
936
  return
633
- print(f"[launcher] 优雅关闭: {', '.join(running)}")
634
- for name in running:
937
+
938
+ graceful = [n for n in running if self._graceful_modules.get(n)]
939
+ non_graceful = [n for n in running if not self._graceful_modules.get(n)]
940
+
941
+ # Defer Event Hub — it must stay alive to route shutdown events
942
+ hub_deferred = "event_hub" in graceful
943
+ graceful_batch = [n for n in graceful if n != "event_hub"] if hub_deferred else graceful
944
+
945
+ print(f"[launcher] 正在关闭 {len(running)} 个模块: {', '.join(running)}")
946
+
947
+ # Phase 1: Notify graceful modules first (they start cleanup immediately)
948
+ for name in graceful_batch:
635
949
  self._log_lifecycle("stopping", name, reason="system_shutdown")
636
950
  await self._publish_event("module.shutdown", {
637
- "module_id": name, "reason": "system_shutdown", "timeout": 10,
951
+ "module_id": name, "reason": "system_shutdown", "timeout": 5,
638
952
  })
639
- deadline = time.time() + 10
640
- while time.time() < deadline:
641
- still_running = [n for n in running if self.process_manager.is_running(n)]
642
- if not still_running:
643
- break
644
- await asyncio.sleep(0.5)
645
- self.process_manager.stop_all(timeout=3)
646
- for name in running:
953
+
954
+ # Phase 2: While graceful modules are cleaning up, terminate non-graceful ones
955
+ if non_graceful:
956
+ print(f"[launcher] 直接终止 {len(non_graceful)} 个不支持优雅退出的模块: {', '.join(non_graceful)}")
957
+ for name in non_graceful:
958
+ self._log_lifecycle("stopping", name, reason="system_shutdown")
959
+ self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
647
960
  self._log_lifecycle("stopped", name, reason="system_shutdown")
648
961
 
962
+ # Phase 3: Wait for graceful modules to exit (process monitoring)
963
+ if graceful_batch:
964
+ deadline = time.time() + 5
965
+ while time.time() < deadline:
966
+ still_running = [n for n in graceful_batch if self.process_manager.is_running(n)]
967
+ if not still_running:
968
+ print("[launcher] 所有优雅退出模块已自行退出")
969
+ break
970
+ remaining = max(0, deadline - time.time())
971
+ print(f"[launcher] 等待 {len(still_running)} 个模块退出 ({remaining:.0f}s): {', '.join(still_running)}")
972
+ await asyncio.sleep(1)
973
+ # Force kill survivors
974
+ for name in graceful_batch:
975
+ if self.process_manager.is_running(name):
976
+ self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
977
+ self._log_lifecycle("stopped", name, reason="system_shutdown")
978
+
979
+ # Phase 4: All other modules exited — now shut down Event Hub
980
+ if hub_deferred and self.process_manager.is_running("event_hub"):
981
+ self._log_lifecycle("stopping", "event_hub", reason="system_shutdown")
982
+ await self._publish_event("module.shutdown", {
983
+ "module_id": "event_hub", "reason": "system_shutdown", "timeout": 5,
984
+ })
985
+ deadline = time.time() + 5
986
+ while time.time() < deadline:
987
+ if not self.process_manager.is_running("event_hub"):
988
+ print("[launcher] Event Hub 已退出")
989
+ break
990
+ await asyncio.sleep(0.5)
991
+ if self.process_manager.is_running("event_hub"):
992
+ self.process_manager.stop_module("event_hub", timeout=SHUTDOWN_TIMEOUT_PARTIAL)
993
+ self._log_lifecycle("stopped", "event_hub", reason="system_shutdown")
994
+
995
+ # Final safety net
996
+ try:
997
+ self.process_manager.stop_all(timeout=SHUTDOWN_TIMEOUT_BULK)
998
+ except Exception as e:
999
+ print(f"[launcher] stop_all 出错: {e}")
1000
+ await self._close_http()
1001
+
649
1002
  # ── Heartbeat to Registry ──
650
1003
 
651
1004
  async def _heartbeat_loop(self):
@@ -653,13 +1006,12 @@ class Launcher:
653
1006
  while not self._thread_shutdown.is_set():
654
1007
  await asyncio.sleep(30)
655
1008
  try:
656
- async with httpx.AsyncClient() as client:
657
- await client.post(
658
- f"http://127.0.0.1:{self.registry_port}/modules",
659
- json={"action": "heartbeat", "module_id": "launcher"},
660
- headers={"Authorization": f"Bearer {self.kite_token}"},
661
- timeout=5,
662
- )
1009
+ client = self._get_http()
1010
+ await client.post(
1011
+ f"http://127.0.0.1:{self.registry_port}/modules",
1012
+ json={"action": "heartbeat", "module_id": "launcher"},
1013
+ headers={"Authorization": f"Bearer {self.kite_token}"},
1014
+ )
663
1015
  except Exception:
664
1016
  pass
665
1017
 
@@ -691,6 +1043,42 @@ class Launcher:
691
1043
  visit(m.name)
692
1044
  return order
693
1045
 
1046
+ def _topo_layers(self, modules: list[ModuleInfo]) -> list[list[ModuleInfo]]:
1047
+ """Topological sort into layers. Modules in the same layer have no
1048
+ inter-dependencies and can be started in parallel."""
1049
+ name_map = {m.name: m for m in modules}
1050
+ all_names = set(name_map.keys())
1051
+
1052
+ # Compute depth (longest path from root) for each module
1053
+ depth: dict[str, int] = {}
1054
+ in_stack: set[str] = set()
1055
+
1056
+ def get_depth(name: str) -> int:
1057
+ if name in depth:
1058
+ return depth[name]
1059
+ if name in in_stack:
1060
+ raise RuntimeError(f"Circular dependency detected involving '{name}'")
1061
+ in_stack.add(name)
1062
+ info = name_map.get(name)
1063
+ d = 0
1064
+ if info:
1065
+ for dep in info.depends_on:
1066
+ if dep in all_names:
1067
+ d = max(d, get_depth(dep) + 1)
1068
+ in_stack.remove(name)
1069
+ depth[name] = d
1070
+ return d
1071
+
1072
+ for name in all_names:
1073
+ get_depth(name)
1074
+
1075
+ # Group by depth
1076
+ max_depth = max(depth.values()) if depth else 0
1077
+ layers: list[list[ModuleInfo]] = [[] for _ in range(max_depth + 1)]
1078
+ for name, d in depth.items():
1079
+ layers[d].append(name_map[name])
1080
+ return layers
1081
+
694
1082
  async def _start_one_module(self, info: ModuleInfo):
695
1083
  """Start a single module: publish starting → start process → wait ready → started → close stdio."""
696
1084
  self._log_lifecycle("starting", info.name)
@@ -698,16 +1086,29 @@ class Launcher:
698
1086
 
699
1087
  token = self._module_tokens.get(info.name, "")
700
1088
  boot_info = {"token": token}
1089
+ t0 = time.monotonic()
701
1090
  ok = self.process_manager.start_module(info, boot_info=boot_info)
702
1091
  if not ok:
703
1092
  self._log_lifecycle("start_failed", info.name)
704
1093
  return
705
1094
 
706
- # Wait for module.ready (configurable timeout, degrade on timeout)
1095
+ # Persist immediately after starting to ensure PID is recorded
1096
+ # (in case launcher crashes before Phase 4 completes)
1097
+ self.process_manager.persist_records()
1098
+
1099
+ # Wait for module.ready or module.exiting (whichever comes first)
707
1100
  timeout = info.launch.timeout
708
1101
  ready = await self._wait_event("module.ready", info.name, timeout=timeout)
709
- if ready:
710
- print(f"[launcher] 模块 '{info.name}' 已就绪")
1102
+ elapsed = time.monotonic() - t0
1103
+ if ready and ready.get("_exited"):
1104
+ # Module sent module.exiting before ready — it chose to quit
1105
+ reason = ready.get("reason", "unknown")
1106
+ self._exit_reasons[info.name] = reason
1107
+ print(f"[launcher] 模块 '{info.name}' 主动退出: {reason} ({elapsed:.2f}s)")
1108
+ elif ready:
1109
+ self._graceful_modules[info.name] = bool(ready.get("graceful_shutdown"))
1110
+ self._ready_times[info.name] = elapsed
1111
+ print(f"[launcher] 模块 '{info.name}' 已就绪 ({elapsed:.2f}s)")
711
1112
  else:
712
1113
  print(f"[launcher] 警告: '{info.name}' 在 {timeout}s 内未发送 module.ready")
713
1114
 
@@ -736,12 +1137,12 @@ class Launcher:
736
1137
  url = f"http://127.0.0.1:{self.registry_port}/tokens"
737
1138
  headers = {"Authorization": f"Bearer {self.kite_token}"}
738
1139
  try:
739
- async with httpx.AsyncClient() as client:
740
- resp = await client.post(url, json=tokens, headers=headers, timeout=5)
741
- if resp.status_code == 200:
742
- print(f"[launcher] 已注册 {len(tokens)} 个模块令牌")
743
- else:
744
- print(f"[launcher] 警告: 令牌注册返回 {resp.status_code}")
1140
+ client = self._get_http()
1141
+ resp = await client.post(url, json=tokens, headers=headers)
1142
+ if resp.status_code == 200:
1143
+ print(f"[launcher] 已注册 {len(tokens)} 个模块令牌")
1144
+ else:
1145
+ print(f"[launcher] 警告: 令牌注册返回 {resp.status_code}")
745
1146
  except Exception as e:
746
1147
  print(f"[launcher] 警告: 注册模块令牌失败: {e}")
747
1148
 
@@ -799,49 +1200,90 @@ class Launcher:
799
1200
 
800
1201
  print(f"[launcher] API 服务器已启动,端口 {self.api_port}")
801
1202
 
1203
+ # ── Module crash summary ──
1204
+
1205
+ def _print_module_crash_summary(self, name: str):
1206
+ """Read module's crashes.jsonl last record and print red summary to console.
1207
+ Complement to module.crash event — reliable even if event was never sent."""
1208
+ RED = "\033[91m"
1209
+ RESET = "\033[0m"
1210
+ _suffix = os.environ.get("KITE_INSTANCE_SUFFIX", "")
1211
+ crash_log = os.path.join(
1212
+ os.environ.get("KITE_INSTANCE_DIR", ""), name, "log", f"crashes{_suffix}.jsonl"
1213
+ )
1214
+ if not os.path.isfile(crash_log):
1215
+ return
1216
+ try:
1217
+ with open(crash_log, "rb") as f:
1218
+ f.seek(0, 2)
1219
+ size = f.tell()
1220
+ if size == 0:
1221
+ return
1222
+ f.seek(max(0, size - 4096))
1223
+ lines = f.read().decode("utf-8").strip().split("\n")
1224
+ last = json.loads(lines[-1])
1225
+ exc_type = last.get("exception_type", "Unknown")
1226
+ ctx = last.get("context", {})
1227
+ file_name = ctx.get("file", "unknown")
1228
+ line_no = ctx.get("line", "?")
1229
+ print(f"[launcher] {RED}崩溃: "
1230
+ f"{exc_type} in {file_name}:{line_no}{RESET}")
1231
+ print(f"[launcher] 崩溃日志: {crash_log}")
1232
+ except Exception:
1233
+ pass
1234
+
802
1235
  # ── Monitor loop ──
803
1236
 
804
1237
  async def _monitor_loop(self):
805
1238
  """Check child processes every second. Handle crashes.
806
1239
  Uses _shutdown_event (asyncio.Event) so Ctrl+C wakes us immediately.
1240
+
1241
+ Responsibility split:
1242
+ - Core module crash → full restart (Launcher handles)
1243
+ - Watchdog crash → Launcher restarts directly (up to 3 times)
1244
+ - Other module exit → publish module.stopped event only; Watchdog decides restart
807
1245
  """
808
- MAX_FAIL = 3
809
- MAX_FAILED_MODULES = 3
1246
+ WATCHDOG_MAX_FAIL = 3
1247
+ watchdog_fail_count = 0
810
1248
 
811
1249
  while not self._shutdown_event.is_set():
812
1250
  exited = self.process_manager.check_exited()
813
1251
 
814
1252
  for name, rc in exited:
815
1253
  print(f"[launcher] 模块 '{name}' 退出,返回码 {rc}")
1254
+ if rc != 0:
1255
+ self._print_module_crash_summary(name)
816
1256
  self._log_lifecycle("exited", name, exit_code=rc)
817
1257
  await self._publish_event("module.stopped", {
818
1258
  "module_id": name, "exit_code": rc,
1259
+ "graceful_shutdown": self._graceful_modules.get(name, False),
819
1260
  })
820
1261
  info = self.modules.get(name)
821
1262
 
822
- # Core module crash → full restart
1263
+ # 1) Core module crash → full restart
823
1264
  if name in CORE_MODULE_NAMES or (info and info.is_core()):
824
1265
  print(f"[launcher] 严重: 核心模块 '{name}' 崩溃,正在全部重启...")
825
1266
  self._log_lifecycle("core_crash", name, exit_code=rc)
826
1267
  await self._full_restart()
827
1268
  return
828
1269
 
829
- # Non-core: attempt restart if desired_state is "running"
830
- self._fail_counts[name] = self._fail_counts.get(name, 0) + 1
831
- count = self._fail_counts[name]
832
-
833
- if count < MAX_FAIL and self._desired_states.get(name) == "running" and info:
834
- print(f"[launcher] 正在重启 '{name}' (第 {count}/{MAX_FAIL} 次)...")
835
- await self._start_one_module(info)
836
- elif count >= MAX_FAIL:
837
- self._desired_states[name] = "stopped"
838
- self._log_lifecycle("failed", name, reason=f"exceeded {MAX_FAIL} retries")
839
- print(f"[launcher] 模块 '{name}' 失败 {MAX_FAIL} 次,已放弃")
1270
+ # 2) Watchdog crash Launcher restarts directly
1271
+ if name == WATCHDOG_MODULE_NAME:
1272
+ if self._system_shutting_down:
1273
+ print(f"[launcher] Watchdog 退出(系统关闭中),跳过重启")
1274
+ continue
1275
+ watchdog_fail_count += 1
1276
+ if watchdog_fail_count <= WATCHDOG_MAX_FAIL and info:
1277
+ print(f"[launcher] Watchdog 崩溃,正在重启 (第 {watchdog_fail_count}/{WATCHDOG_MAX_FAIL} 次)...")
1278
+ await self._start_one_module(info)
1279
+ else:
1280
+ self._desired_states[name] = "stopped"
1281
+ self._log_lifecycle("failed", name, reason=f"exceeded {WATCHDOG_MAX_FAIL} retries")
1282
+ print(f"[launcher] Watchdog 失败 {WATCHDOG_MAX_FAIL} 次,已放弃")
1283
+ continue
840
1284
 
841
- failed_count = sum(1 for c in self._fail_counts.values() if c >= MAX_FAIL)
842
- if failed_count >= MAX_FAILED_MODULES:
843
- print(f"[launcher] {failed_count} 个模块永久失败,启动器退出")
844
- return
1285
+ # 3) Other modules event already published above; Watchdog decides restart
1286
+ # (no restart logic here — Watchdog handles it via module.stopped event)
845
1287
 
846
1288
  if exited:
847
1289
  self.process_manager.persist_records()
@@ -857,6 +1299,9 @@ class Launcher:
857
1299
  """Stop all modules, regenerate tokens, re-run Phase 1-4 (mechanism 10)."""
858
1300
  print("[launcher] 全量重启: 正在停止所有模块...")
859
1301
 
1302
+ # Persist records before shutdown so cleanup_leftovers can find survivors
1303
+ self.process_manager.persist_records()
1304
+
860
1305
  # Disconnect Event Hub WS
861
1306
  if self._ws_task:
862
1307
  self._ws_task.cancel()
@@ -869,7 +1314,13 @@ class Launcher:
869
1314
  self._launcher_ws_token = ""
870
1315
 
871
1316
  await self._graceful_shutdown_all()
872
- self._fail_counts.clear()
1317
+
1318
+ # Cleanup any leftover processes that survived graceful shutdown.
1319
+ # Note: _graceful_shutdown_all() clears _processes/_records dicts, but
1320
+ # cleanup_leftovers() reads from processes.json (persisted above), so it can
1321
+ # still find and kill survivors.
1322
+ self.process_manager.cleanup_leftovers()
1323
+
873
1324
  self._module_tokens.clear()
874
1325
 
875
1326
  # Regenerate kite_token
@@ -878,12 +1329,7 @@ class Launcher:
878
1329
 
879
1330
  print("[launcher] 全量重启: 重新执行 Phase 1-4...")
880
1331
  try:
881
- await self._phase1_registry()
882
- self.modules = self.module_scanner.scan()
883
- for n, info in self.modules.items():
884
- self._log_lifecycle("scanned", n, state=info.state, module_dir=info.module_dir)
885
- await self._register_module_tokens()
886
- await self._phase2_event_hub()
1332
+ await self._phase1_parallel_bootstrap()
887
1333
  await self._phase3_registry_ready()
888
1334
  await self._phase4_start_modules()
889
1335
  self.process_manager.persist_records()
@@ -897,28 +1343,252 @@ class Launcher:
897
1343
 
898
1344
  def _final_cleanup(self):
899
1345
  """Called on exit — stop all processes, stop API, clear records."""
900
- print("[launcher] 正在关闭...")
1346
+ try:
1347
+ print("[launcher] 正在执行最终清理...")
1348
+
1349
+ if self._ws_task:
1350
+ self._ws_task.cancel()
1351
+ if hasattr(self, '_heartbeat_task') and self._heartbeat_task:
1352
+ self._heartbeat_task.cancel()
1353
+
1354
+ # Note: _graceful_shutdown_all() already called stop_all() in _async_main finally block.
1355
+ # This is just a safety check — should normally find nothing.
1356
+ remaining = [n for n in self.process_manager._processes
1357
+ if self.process_manager.is_running(n)]
1358
+ if remaining:
1359
+ print(f"[launcher] 警告: 仍有残留进程 (不应出现): {', '.join(remaining)}")
1360
+ self.process_manager.stop_all(timeout=SHUTDOWN_TIMEOUT_BULK)
1361
+ else:
1362
+ print("[launcher] 无残留进程")
901
1363
 
902
- if self._ws_task:
903
- self._ws_task.cancel()
904
- if hasattr(self, '_heartbeat_task') and self._heartbeat_task:
905
- self._heartbeat_task.cancel()
1364
+ if self._api_server:
1365
+ self._api_server.should_exit = True
906
1366
 
907
- self.process_manager.stop_all(timeout=10)
1367
+ # Clear instance runtime files
1368
+ try:
1369
+ os.remove(self.process_manager.records_path)
1370
+ except OSError:
1371
+ pass
1372
+ except Exception as e:
1373
+ print(f"[launcher] 最终清理出错: {e}")
1374
+ finally:
1375
+ # Signal the safety-net thread that normal shutdown has completed
1376
+ self._shutdown_complete.set()
1377
+ print("[launcher] 再见。")
1378
+
1379
+ if IS_WINDOWS:
1380
+ os._exit(0)
1381
+
1382
+ # ── Startup report ──
1383
+
1384
  async def _print_startup_report(self, total_time: float, phase_times: dict[str, float], *,
                                  global_instances=None, cleaned_stats: dict[str, int] | None = None):
      """Print a green startup summary with module list and timing.

      Args:
          total_time: Total startup duration in seconds, shown in the banner.
          phase_times: Mapping of phase label to elapsed seconds. Only the
              labels looked up below are displayed; others are ignored.
          global_instances: Optional iterable of instance dicts; read keys are
              "is_self", "instance_dir", "num", "launcher_pid", "module_count",
              "cwd" and optionally "debug".
          cleaned_stats: Optional mapping of instance label to the number of
              leftover processes that were cleaned up.

      The report is assembled as a list of lines and emitted with one print().
      """
      G = "\033[32m"  # green
      Y = "\033[33m"  # yellow
      R = "\033[0m"  # reset
      B = "\033[1;32m"  # bold green

      # Classify every known module into running / exited / never started.
      running = []
      exited = []
      stopped = []
      for name, info in self.modules.items():
          rec = self.process_manager.get_record(name)
          is_running = self.process_manager.is_running(name)
          if is_running and rec:
              running.append((name, info, rec))
          elif self._desired_states.get(name) == "running" and not is_running:
              # Was started but already exited (e.g. module.exiting)
              exited.append((name, info))
          else:
              stopped.append((name, info))

      # Calculate kernel startup time (Phase 1+2+3)
      kernel_time = 0
      for phase_name in ["Phase 1+2: Registry + Event Hub (并行)", "Phase 3: Registry 事件总线"]:
          if phase_name in phase_times:
              kernel_time += phase_times[phase_name]

      lines = [
          "",
          f"{B}{'=' * 60}",
          f" Kite 内核启动完成 耗时 {kernel_time:.2f}s",
          f" Kite 全部模块启动完成 总耗时 {total_time:.2f}s",
          f"{'=' * 60}{R}",
      ]

      # Phase breakdown
      lines.append(f"{G} 阶段耗时:{R}")

      # Kernel modules section
      lines.append(f"{G} 内核模块:{R}")
      for phase_name in ["Phase 1+2: Registry + Event Hub (并行)", "Phase 3: Registry 事件总线"]:
          if phase_name in phase_times:
              elapsed = phase_times[phase_name]
              lines.append(f"{G} {phase_name:<26s} {elapsed:>6.2f}s{R}")

      # Extension modules section
      lines.append(f"{G} 扩展模块:{R}")
      if "Phase 4: Extensions" in phase_times:
          elapsed = phase_times["Phase 4: Extensions"]
          lines.append(f"{G} {'Phase 4: Extensions':<26s} {elapsed:>6.2f}s{R}")

      # Sort running modules by ready time (modules without a recorded time sort last)
      running_sorted = sorted(running, key=lambda x: self._ready_times.get(x[0], float('inf')))

      # Running modules with ready time and elapsed from Kite start
      DIM = "\033[90m"
      lines.append(f"{G} 运行中 ({len(running)}):{R}")

      # CJK-aware display width helpers
      def _dw(s):
          """Display width: CJK chars count as 2, others as 1."""
          w = 0
          for c in str(s):
              w += 2 if '\u4e00' <= c <= '\u9fff' or '\u3000' <= c <= '\u303f' or '\uff00' <= c <= '\uffef' else 1
          return w

      def _rpad(s, width):
          """Left-align s in a field of given display width."""
          return str(s) + ' ' * max(0, width - _dw(s))

      def _lpad(s, width):
          """Right-align s in a field of given display width."""
          return ' ' * max(0, width - _dw(s)) + str(s)

      # Column headers and per-column alignment for the running-modules table
      headers = ['模块', 'PID', '启动耗时', '进程启动时长', '类型']
      aligns = ['left', 'right', 'right', 'right', 'left']  # alignment per column

      # Build data rows first to calculate column widths
      rows = []
      for name, info, rec in running_sorted:
          label = info.display_name or name
          ready_t = self._ready_times.get(name)
          time_str = f"{ready_t:.2f}s" if ready_t is not None else "—"
          if ready_t is not None and hasattr(self, '_start_unix'):
              # NOTE(review): presumably rec.started_at and self._start_unix are
              # both Unix timestamps in seconds — confirm against ProcessManager.
              elapsed_from_start = (rec.started_at + ready_t) - self._start_unix
              es_str = f"{elapsed_from_start:.2f}s"
          else:
              es_str = "—"
          rows.append([label, str(rec.pid), time_str, es_str, f"[{info.type}]"])

      # Calculate column widths: max of header and all data display widths
      col_widths = [_dw(h) for h in headers]
      for row in rows:
          for i, cell in enumerate(row):
              col_widths[i] = max(col_widths[i], _dw(cell))

      # Render header
      hdr_parts = []
      for i, h in enumerate(headers):
          if aligns[i] == 'left':
              hdr_parts.append(_rpad(h, col_widths[i]))
          else:
              hdr_parts.append(_lpad(h, col_widths[i]))
      lines.append(f"{DIM} {' '.join(hdr_parts)}{R}")

      # Render data rows
      for row in rows:
          parts = []
          for i, cell in enumerate(row):
              if aligns[i] == 'left':
                  parts.append(_rpad(cell, col_widths[i]))
              else:
                  parts.append(_lpad(cell, col_widths[i]))
          lines.append(f"{G} ✓ {' '.join(parts)}{R}")

      # Exited modules (started but already quit)
      if exited:
          lines.append(f"{Y} 已退出 ({len(exited)}):{R}")
          for name, info in exited:
              label = info.display_name or name
              reason = self._exit_reasons.get(name, "")
              reason_str = f": {reason}" if reason else ""
              lines.append(f"{Y} ↗ {label:<20s} (主动退出{reason_str}){R}")

      # Stopped modules
      if stopped:
          lines.append(f"{G} 未启动 ({len(stopped)}):{R}")
          for name, info in stopped:
              label = info.display_name or name
              lines.append(f"{G} - {label:<20s} ({info.state}){R}")

      lines.append(f"{G} Launcher API: http://127.0.0.1:{self.api_port} 实例: {self.instance_id}{R}")

      # Query Registry for web module's access URL (empty string when unavailable)
      web_url = await self._get_web_url()
      if web_url:
          lines.append(f"{B} Web 管理后台: {web_url}{R}")

      # Instance info
      instances = self.process_manager.get_alive_instances()
      inst_num = self.process_manager.instance_num
      suffix_display = self.process_manager.instance_suffix or "(无)"
      inst_dir = os.environ.get("KITE_INSTANCE_DIR", "")
      cwd = os.environ.get("KITE_CWD", "")
      debug_flag = " [DEBUG]" if os.environ.get("KITE_DEBUG") == "1" else ""
      lines.append(f"{G} 当前实例: #{inst_num} 后缀: {suffix_display} PID: {os.getpid()}{debug_flag}{R}")
      lines.append(f"{G} 实例目录: {inst_dir}{R}")
      lines.append(f"{G} 工作目录: {cwd}{R}")
      # List sibling instances only when more than one is alive
      if len(instances) > 1:
          lines.append(f"{G} 所有实例:{R}")
          for i in instances:
              # Instance #1 uses the unsuffixed records file; others use "~<num>"
              s = "" if i["num"] == 1 else f"~{i['num']}"
              debug_tag = " [DEBUG]" if i.get("debug", False) else ""
              current_tag = " (当前)" if i["is_self"] else ""
              lines.append(f"{G} #{i['num']} PID {i['launcher_pid']} "
                           f"模块数 {i['module_count']} (processes{s}.json){debug_tag}{current_tag}{R}")

      # Cross-directory instances from other projects
      if global_instances:
          my_inst_basename = os.path.basename(os.environ.get("KITE_INSTANCE_DIR", ""))
          # Exclude this launcher and anything sharing its instance directory name
          other_instances = [i for i in global_instances
                             if not i["is_self"] and i["instance_dir"] != my_inst_basename]
          if other_instances:
              lines.append(f"{G} 其他项目实例:{R}")
              for i in other_instances:
                  debug_tag = " [DEBUG]" if i.get("debug", False) else ""
                  cwd_display = f" {i['cwd']}" if i["cwd"] else ""
                  lines.append(
                      f"{G} {i['instance_dir']:<20s} "
                      f"#{i['num']} PID {i['launcher_pid']} "
                      f"模块数 {i['module_count']}"
                      f"{cwd_display}{debug_tag}{R}"
                  )

      # Leftover-process cleanup summary: one line for a single instance,
      # a total plus per-instance breakdown otherwise
      if cleaned_stats:
          total = sum(cleaned_stats.values())
          if len(cleaned_stats) == 1:
              inst, count = next(iter(cleaned_stats.items()))
              lines.append(f"{Y} 已清理残留进程: {inst} ({count} 个){R}")
          else:
              lines.append(f"{Y} 已清理残留进程 (共 {total} 个):{R}")
              for inst, count in cleaned_stats.items():
                  lines.append(f"{Y} {inst}: {count} 个{R}")

      lines.append(f"{B}{'=' * 60}{R}")
      lines.append("")

      print("\n".join(lines))
1574
+
1575
+ async def _get_web_url(self) -> str:
1576
+ """Query Registry for the web module's api_endpoint. Returns URL or empty string."""
914
1577
  try:
915
- os.remove(self.process_manager.records_path)
916
- except OSError:
1578
+ client = self._get_http()
1579
+ resp = await client.get(
1580
+ f"http://127.0.0.1:{self.registry_port}/get/web.api_endpoint",
1581
+ headers={"Authorization": f"Bearer {self.kite_token}"},
1582
+ timeout=3,
1583
+ )
1584
+ if resp.status_code == 200:
1585
+ val = resp.json()
1586
+ if val and isinstance(val, str):
1587
+ # Show localhost instead of 127.0.0.1 for friendliness
1588
+ return val.replace("://127.0.0.1:", "://localhost:")
1589
+ except Exception:
917
1590
  pass
918
- print("[launcher] 再见。")
919
-
920
- if IS_WINDOWS:
921
- os._exit(0)
1591
+ return ""
922
1592
 
923
1593
  # ── Utilities ──
924
1594
 
@@ -930,7 +1600,6 @@ class Launcher:
930
1600
  fm = _parse_frontmatter(f.read())
931
1601
  discovery = fm.get("discovery")
932
1602
  if isinstance(discovery, dict) and discovery:
933
- print(f"[launcher] 发现来源: {', '.join(discovery.keys())}")
934
1603
  return discovery
935
1604
  except Exception as e:
936
1605
  print(f"[launcher] 警告: 读取发现配置失败: {e}")
@@ -960,12 +1629,29 @@ class Launcher:
960
1629
 
961
1630
  def _create_api_app(self) -> FastAPI:
962
1631
  """Create the FastAPI app with Launcher management routes."""
1632
+ from fastapi import Request, HTTPException
963
1633
  app = FastAPI(title="Kite Launcher", docs_url=None, redoc_url=None)
964
1634
  launcher = self
965
1635
 
1636
def _require_auth(request: Request):
    """Reject the request unless it is local and carries the launcher token.

    Two checks run in order:

    1. The peer address must be one of "127.0.0.1", "::1" or "localhost";
       anything else (including an unknown peer) raises 403.
    2. The ``Authorization`` header must have the form ``Bearer <token>``
       and the token must equal ``launcher.kite_token``; otherwise 401.

    Raises:
        HTTPException: 403 for a non-whitelisted peer, 401 for a missing,
            malformed or wrong bearer token.
    """
    # request.client may be absent; an unknown peer is treated as not local.
    client_host = request.client.host if request.client else None
    if client_host not in ("127.0.0.1", "::1", "localhost"):
        raise HTTPException(status_code=403, detail="Access denied: only localhost allowed")

    # Bearer token verification
    auth = request.headers.get("Authorization", "")
    if not auth.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Missing or invalid Authorization header")
    token = auth[7:].strip()  # drop the "Bearer " prefix

    expected = launcher.kite_token
    # Compare in constant time so response timing does not reveal how much
    # of a guessed token matched. Bytes are compared because compare_digest
    # raises TypeError on non-ASCII str, which a hostile header could contain.
    # A non-str expected token can never match a header value, so it is 401.
    if not isinstance(expected, str) or not secrets.compare_digest(
        token.encode("utf-8"), expected.encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Invalid token")
1650
+
966
1651
  @app.get("/launcher/modules")
967
- async def list_modules():
1652
+ async def list_modules(request: Request):
968
1653
  """List all modules and their current status."""
1654
+ _require_auth(request)
969
1655
  result = []
970
1656
  for name, info in launcher.modules.items():
971
1657
  running = launcher.process_manager.is_running(name)
@@ -983,8 +1669,9 @@ class Launcher:
983
1669
  return result
984
1670
 
985
1671
  @app.post("/launcher/modules/{name}/start")
986
- async def start_module(name: str):
1672
+ async def start_module(name: str, request: Request):
987
1673
  """Start a module by name."""
1674
+ _require_auth(request)
988
1675
  info = launcher.modules.get(name)
989
1676
  if not info:
990
1677
  raise HTTPException(404, f"Module '{name}' not found")
@@ -994,13 +1681,12 @@ class Launcher:
994
1681
  if name not in launcher._module_tokens:
995
1682
  launcher._module_tokens[name] = secrets.token_hex(32)
996
1683
  try:
997
- async with httpx.AsyncClient() as client:
998
- await client.post(
999
- f"http://127.0.0.1:{launcher.registry_port}/tokens",
1000
- json={name: launcher._module_tokens[name]},
1001
- headers={"Authorization": f"Bearer {launcher.kite_token}"},
1002
- timeout=5,
1003
- )
1684
+ client = launcher._get_http()
1685
+ await client.post(
1686
+ f"http://127.0.0.1:{launcher.registry_port}/tokens",
1687
+ json={name: launcher._module_tokens[name]},
1688
+ headers={"Authorization": f"Bearer {launcher.kite_token}"},
1689
+ )
1004
1690
  except Exception as e:
1005
1691
  print(f"[launcher] 警告: 注册 {name} 的令牌失败: {e}")
1006
1692
 
@@ -1009,7 +1695,6 @@ class Launcher:
1009
1695
  ok = launcher.process_manager.start_module(info, boot_info=boot_info)
1010
1696
  if ok:
1011
1697
  launcher._desired_states[name] = "running"
1012
- launcher._fail_counts.pop(name, None)
1013
1698
  launcher.process_manager.persist_records()
1014
1699
  rec = launcher.process_manager.get_record(name)
1015
1700
  launcher._log_lifecycle("started", name, pid=rec.pid if rec else None, via="api")
@@ -1019,8 +1704,9 @@ class Launcher:
1019
1704
  raise HTTPException(500, f"Failed to start '{name}'")
1020
1705
 
1021
1706
  @app.post("/launcher/modules/{name}/stop")
1022
- async def stop_module(name: str, body: dict = None):
1707
+ async def stop_module(name: str, request: Request, body: dict = None):
1023
1708
  """Stop a module with graceful shutdown."""
1709
+ _require_auth(request)
1024
1710
  info = launcher.modules.get(name)
1025
1711
  if not info:
1026
1712
  raise HTTPException(404, f"Module '{name}' not found")
@@ -1031,8 +1717,9 @@ class Launcher:
1031
1717
  return {"status": "stopped", "name": name}
1032
1718
 
1033
1719
  @app.post("/launcher/modules/{name}/restart")
1034
- async def restart_module(name: str, body: dict = None):
1720
+ async def restart_module(name: str, request: Request, body: dict = None):
1035
1721
  """Restart a module (stop + start)."""
1722
+ _require_auth(request)
1036
1723
  info = launcher.modules.get(name)
1037
1724
  if not info:
1038
1725
  raise HTTPException(404, f"Module '{name}' not found")
@@ -1042,13 +1729,12 @@ class Launcher:
1042
1729
  await launcher._graceful_stop(name, reason)
1043
1730
  launcher._module_tokens[name] = secrets.token_hex(32)
1044
1731
  try:
1045
- async with httpx.AsyncClient() as client:
1046
- await client.post(
1047
- f"http://127.0.0.1:{launcher.registry_port}/tokens",
1048
- json={name: launcher._module_tokens[name]},
1049
- headers={"Authorization": f"Bearer {launcher.kite_token}"},
1050
- timeout=5,
1051
- )
1732
+ client = launcher._get_http()
1733
+ await client.post(
1734
+ f"http://127.0.0.1:{launcher.registry_port}/tokens",
1735
+ json={name: launcher._module_tokens[name]},
1736
+ headers={"Authorization": f"Bearer {launcher.kite_token}"},
1737
+ )
1052
1738
  except Exception:
1053
1739
  pass
1054
1740
  token = launcher._module_tokens[name]
@@ -1056,7 +1742,6 @@ class Launcher:
1056
1742
  ok = launcher.process_manager.start_module(info, boot_info=boot_info)
1057
1743
  if ok:
1058
1744
  launcher._desired_states[name] = "running"
1059
- launcher._fail_counts.pop(name, None)
1060
1745
  launcher.process_manager.persist_records()
1061
1746
  rec = launcher.process_manager.get_record(name)
1062
1747
  launcher._log_lifecycle("started", name, pid=rec.pid if rec else None, via="restart_api")
@@ -1066,8 +1751,9 @@ class Launcher:
1066
1751
  raise HTTPException(500, f"Failed to restart '{name}'")
1067
1752
 
1068
1753
  @app.post("/launcher/rescan")
1069
- async def rescan_modules():
1754
+ async def rescan_modules(request: Request):
1070
1755
  """Rescan module directories for new/removed modules."""
1756
+ _require_auth(request)
1071
1757
  old_names = set(launcher.modules.keys())
1072
1758
  launcher.modules = launcher.module_scanner.scan()
1073
1759
  new_names = set(launcher.modules.keys())
@@ -1085,20 +1771,28 @@ class Launcher:
1085
1771
  launcher._module_tokens[name] = secrets.token_hex(32)
1086
1772
  new_tokens[name] = launcher._module_tokens[name]
1087
1773
  try:
1088
- async with httpx.AsyncClient() as client:
1089
- await client.post(
1090
- f"http://127.0.0.1:{launcher.registry_port}/tokens",
1091
- json=new_tokens,
1092
- headers={"Authorization": f"Bearer {launcher.kite_token}"},
1093
- timeout=5,
1094
- )
1774
+ client = launcher._get_http()
1775
+ await client.post(
1776
+ f"http://127.0.0.1:{launcher.registry_port}/tokens",
1777
+ json=new_tokens,
1778
+ headers={"Authorization": f"Bearer {launcher.kite_token}"},
1779
+ )
1095
1780
  except Exception:
1096
1781
  pass
1097
1782
  return {"added": added, "removed": removed, "total": len(launcher.modules)}
1098
1783
 
1784
@app.post("/launcher/shutdown")
async def shutdown_launcher(request: Request, body: dict = None):
    """Request a shutdown of the entire Kite system (equivalent to Ctrl+C).

    The caller must pass ``_require_auth``. An optional JSON body may carry
    a "reason"; when the body or the key is absent, "api_request" is used.
    The reason is forwarded to ``launcher._request_shutdown`` and echoed
    back in the response.
    """
    _require_auth(request)
    payload = body if body else {}
    why = payload.get("reason", "api_request")
    launcher._request_shutdown(f"API shutdown request: {why}")
    return {"status": "shutting_down", "reason": why}
1791
+
1099
1792
  @app.put("/launcher/modules/{name}/state")
1100
- async def update_state(name: str, body: dict):
1793
+ async def update_state(name: str, request: Request, body: dict):
1101
1794
  """Update module state (enabled/manual/disabled). Writes to module.md."""
1795
+ _require_auth(request)
1102
1796
  info = launcher.modules.get(name)
1103
1797
  if not info:
1104
1798
  raise HTTPException(404, f"Module '{name}' not found")