@agentunion/kite 1.2.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/CHANGELOG.md +208 -0
  2. package/README.md +48 -0
  3. package/cli.js +1 -1
  4. package/extensions/agents/assistant/entry.py +30 -81
  5. package/extensions/agents/assistant/module.md +1 -1
  6. package/extensions/agents/assistant/server.py +83 -122
  7. package/extensions/channels/acp_channel/entry.py +30 -81
  8. package/extensions/channels/acp_channel/module.md +1 -1
  9. package/extensions/channels/acp_channel/server.py +83 -122
  10. package/extensions/event_hub_bench/entry.py +81 -121
  11. package/extensions/services/backup/entry.py +213 -85
  12. package/extensions/services/model_service/entry.py +213 -85
  13. package/extensions/services/watchdog/entry.py +513 -460
  14. package/extensions/services/watchdog/monitor.py +55 -69
  15. package/extensions/services/web/entry.py +11 -108
  16. package/extensions/services/web/server.py +120 -77
  17. package/{core/registry → kernel}/entry.py +65 -37
  18. package/{core/event_hub/hub.py → kernel/event_hub.py} +61 -81
  19. package/kernel/module.md +33 -0
  20. package/{core/registry/store.py → kernel/registry_store.py} +13 -4
  21. package/kernel/rpc_router.py +388 -0
  22. package/kernel/server.py +267 -0
  23. package/launcher/__init__.py +10 -0
  24. package/launcher/__main__.py +6 -0
  25. package/launcher/count_lines.py +258 -0
  26. package/{core/launcher → launcher}/entry.py +693 -767
  27. package/launcher/logging_setup.py +289 -0
  28. package/{core/launcher → launcher}/module_scanner.py +11 -6
  29. package/main.py +11 -350
  30. package/package.json +6 -9
  31. package/__init__.py +0 -1
  32. package/__main__.py +0 -15
  33. package/core/event_hub/BENCHMARK.md +0 -94
  34. package/core/event_hub/__init__.py +0 -0
  35. package/core/event_hub/bench.py +0 -459
  36. package/core/event_hub/bench_extreme.py +0 -308
  37. package/core/event_hub/bench_perf.py +0 -350
  38. package/core/event_hub/entry.py +0 -436
  39. package/core/event_hub/module.md +0 -20
  40. package/core/event_hub/server.py +0 -269
  41. package/core/kite_log.py +0 -241
  42. package/core/launcher/__init__.py +0 -0
  43. package/core/registry/__init__.py +0 -0
  44. package/core/registry/module.md +0 -30
  45. package/core/registry/server.py +0 -339
  46. package/extensions/services/backup/server.py +0 -244
  47. package/extensions/services/model_service/server.py +0 -236
  48. package/extensions/services/watchdog/server.py +0 -229
  49. /package/{core → kernel}/__init__.py +0 -0
  50. /package/{core/event_hub → kernel}/dedup.py +0 -0
  51. /package/{core/event_hub → kernel}/router.py +0 -0
  52. /package/{core/launcher → launcher}/module.md +0 -0
  53. /package/{core/launcher → launcher}/process_manager.py +0 -0
@@ -1,19 +1,14 @@
1
1
  """
2
- Launcher — the core of Kite. Manages module lifecycle, exposes API, monitors processes.
2
+ Launcher — the core of Kite. Manages module lifecycle, monitors processes.
3
3
 
4
4
  Thread model:
5
5
  - Main thread: asyncio event loop (process management + monitor loop)
6
- - API thread: independent thread running uvicorn + FastAPI
7
6
  - stdout threads: one daemon thread per child process (ProcessManager)
8
7
  - (Windows) keyboard listener thread: polls for 'q' key
9
8
 
10
- 4-Phase startup:
11
- Phase 1: Registry + Event Hub (parallel start) → Registry stdout port → stdin broadcast port to Event Hub
12
- → API → register self + tokens → stdin launcher_ws_token to Event Hub
13
- → stdout ws_endpoint → WS connect → module.ready
14
- Phase 2: (reserved — Event Hub ready handled in Phase 1)
15
- Phase 3: Registry delayed ready (Event Hub → Registry → Event Hub WS → module.ready)
16
- Phase 4: start remaining enabled modules in topo order
9
+ 2-Phase startup:
10
+ Phase 1: Start Kernel → wait port → connect WS → register self → module.ready
11
+ Phase 2: start remaining enabled modules in topo order (each connects to Kernel WS)
17
12
  """
18
13
 
19
14
  import asyncio
@@ -26,10 +21,7 @@ import threading
26
21
  import time
27
22
  import uuid
28
23
 
29
- import httpx
30
- import uvicorn
31
24
  import websockets
32
- from fastapi import FastAPI, HTTPException
33
25
 
34
26
  from .module_scanner import ModuleScanner, ModuleInfo, LaunchConfig, _parse_frontmatter
35
27
  from .process_manager import ProcessManager
@@ -42,14 +34,14 @@ SHUTDOWN_TIMEOUT_PARTIAL = 3 # Graceful module ack'd but no ready
42
34
  SHUTDOWN_TIMEOUT_READY = 1 # Graceful module sent ready (cleanup done)
43
35
  SHUTDOWN_TIMEOUT_BULK = 3 # Bulk stop_all() safety net
44
36
 
45
- # Core module names that are started in Phase 1-2 (not Phase 4)
46
- CORE_MODULE_NAMES = {"registry", "event_hub"}
37
+ # Core module names that are started in Phase 1 (not Phase 2)
38
+ CORE_MODULE_NAMES = {"kernel"}
47
39
 
48
40
  WATCHDOG_MODULE_NAME = "watchdog"
49
41
 
50
42
 
51
43
  class Launcher:
52
- """Kite system entry point. Starts Registry, manages modules, exposes API."""
44
+ """Kite system entry point. Starts Kernel, manages modules."""
53
45
 
54
46
  def __init__(self, kite_token: str):
55
47
  self.kite_token = kite_token
@@ -70,41 +62,41 @@ class Launcher:
70
62
  discovery=self._load_discovery(),
71
63
  )
72
64
 
73
- self.registry_port: int = 0
74
- self.api_port: int = 0
65
+ self.kernel_port: int = 0
75
66
  self.modules: dict[str, ModuleInfo] = {}
76
67
  self._shutdown_event = asyncio.Event()
77
68
  self._thread_shutdown = threading.Event()
78
69
  self._shutdown_complete = threading.Event() # Set when normal shutdown finishes
79
- self._api_server: uvicorn.Server | None = None
80
- self._api_ready = threading.Event()
81
70
  self._module_tokens: dict[str, str] = {} # module_name -> per-module token
82
71
 
83
72
  # Three-layer state model: desired_state per module
84
73
  self._desired_states: dict[str, str] = {} # module_name -> "running" | "stopped"
85
74
 
86
- # Event Hub WebSocket client
87
- self._event_hub_ws_url: str = ""
88
- self._launcher_ws_token: str = ""
75
+ # Kernel WebSocket client
89
76
  self._ws: object | None = None
90
77
  self._ws_task: asyncio.Task | None = None
78
+ self._ws_connected: asyncio.Event | None = None # Created in _async_main, set when WS ready
91
79
  self._loop: asyncio.AbstractEventLoop | None = None
92
80
 
81
+ # JSON-RPC 2.0 infrastructure
82
+ self._rpc_waiters: dict[str, asyncio.Event] = {} # rpc_id -> Event
83
+ self._rpc_results: dict[str, dict] = {} # rpc_id -> response dict
84
+
93
85
  # Event waiters: {event_key: (asyncio.Event, data_dict)}
94
86
  self._event_waiters: dict[str, tuple[asyncio.Event, dict]] = {}
95
87
 
96
88
  # Module ready times: module_name -> seconds from start to ready
97
89
  self._ready_times: dict[str, float] = {}
98
90
 
99
- # Shared HTTP client for Registry communication (lazy-init, reuses TCP connections)
100
- self._http: httpx.AsyncClient | None = None
91
+ # Shutdown timing
92
+ self._shutdown_start_time: float = 0.0
101
93
 
102
94
  # Module exit reasons: module_name -> reason string (for modules that sent module.exiting)
103
95
  self._exit_reasons: dict[str, str] = {}
104
96
 
105
97
  # Graceful shutdown capability: module_name -> True if module declared support
106
- # Registry and Event Hub default to True (they start before Watchdog can observe)
107
- self._graceful_modules: dict[str, bool] = {"registry": True, "event_hub": True}
98
+ # Kernel defaults to True (it starts before Watchdog can observe)
99
+ self._graceful_modules: dict[str, bool] = {"kernel": True}
108
100
 
109
101
  # System-wide shutdown flag: prevents Watchdog restart during shutdown
110
102
  self._system_shutting_down = False
@@ -124,7 +116,6 @@ class Launcher:
124
116
  except Exception:
125
117
  pass
126
118
  os.environ["KITE_INSTANCE_SUFFIX"] = suffix
127
- self._app = self._create_api_app()
128
119
 
129
120
  @staticmethod
130
121
  def _fmt_elapsed(seconds: float) -> str:
@@ -238,7 +229,9 @@ class Launcher:
238
229
  except KeyboardInterrupt:
239
230
  pass
240
231
  except RuntimeError as e:
241
- print(f"[launcher] 启动失败: {e}")
232
+ # Don't print "启动失败" if user requested shutdown
233
+ if not self._thread_shutdown.is_set():
234
+ print(f"[launcher] 启动失败: {e}")
242
235
  finally:
243
236
  self._final_cleanup()
244
237
 
@@ -247,6 +240,7 @@ class Launcher:
247
240
  if self._thread_shutdown.is_set():
248
241
  return # already shutting down
249
242
  print(f"[launcher] {reason or '收到关闭请求'}")
243
+ self._shutdown_start_time = time.monotonic() # Record shutdown start time
250
244
  self._thread_shutdown.set()
251
245
  # Wake up asyncio event loop immediately (so _monitor_loop / wait_for exits)
252
246
  loop = self._loop
@@ -265,9 +259,9 @@ class Launcher:
265
259
  except Exception:
266
260
  still = []
267
261
  if still:
268
- print(f"[launcher] 关闭超时,以下模块仍在运行: {', '.join(still)},强制退出")
262
+ print(f"\033[91m[launcher] 关闭超时,以下模块仍在运行: {', '.join(still)},强制退出\033[0m")
269
263
  else:
270
- print("[launcher] 关闭超时,强制退出")
264
+ print("\033[91m[launcher] 关闭超时,强制退出\033[0m")
271
265
  os._exit(1)
272
266
  threading.Thread(target=_force, daemon=True).start()
273
267
 
@@ -307,31 +301,34 @@ class Launcher:
307
301
  while not self._thread_shutdown.is_set():
308
302
  if msvcrt.kbhit():
309
303
  ch = msvcrt.getch()
310
- if ch in (b'q', b'Q'):
304
+ if ch == b'\x1b': # ESC - force exit immediately
305
+ print("[launcher] ESC 强制退出")
306
+ os._exit(0)
307
+ elif ch in (b'q', b'Q'): # q/Q - graceful shutdown
311
308
  self._request_shutdown("收到退出请求,正在关闭...")
312
309
  return
313
310
  time.sleep(0.1)
314
311
  threading.Thread(target=_listen, daemon=True).start()
315
312
 
316
- # ── Async main (4-Phase startup) ──
313
+ # ── Async main (2-Phase startup) ──
317
314
 
318
315
  async def _async_main(self):
319
- """Full 4-phase startup sequence, then monitor loop."""
316
+ """Full 2-phase startup sequence, then monitor loop."""
320
317
  self._loop = asyncio.get_running_loop()
318
+ self._ws_connected = asyncio.Event() # Create event in async context
321
319
  t_start = time.monotonic()
322
320
  self._start_unix = time.time()
323
321
  phase_times = {}
324
322
  G = "\033[32m"
325
323
  R = "\033[0m"
326
324
 
327
- # Validate core modules exist (mechanism 12)
325
+ # Validate core modules exist
328
326
  self._validate_core_modules()
329
327
 
330
328
  # Cleanup leftovers from previous instances (current instance dir)
331
329
  local_cleaned = self.process_manager.cleanup_leftovers()
332
330
 
333
331
  # Cross-directory leftover cleanup (background, non-blocking)
334
- # run_in_executor returns a Future (not coroutine), so use ensure_future
335
332
  self._global_cleanup_task = asyncio.ensure_future(
336
333
  asyncio.get_running_loop().run_in_executor(
337
334
  None, self.process_manager.cleanup_global_leftovers
@@ -339,23 +336,15 @@ class Launcher:
339
336
  )
340
337
 
341
338
  try:
342
- # Phase 1+2: Registry + Event Hub parallel bootstrap
339
+ # Phase 1: Start Kernel + connect WS
343
340
  t0 = time.monotonic()
344
- await self._phase1_parallel_bootstrap()
341
+ await self._phase1_start_kernel()
345
342
  elapsed_p1 = time.monotonic() - t0
346
- phase_times["Phase 1+2: Registry + Event Hub (并行)"] = elapsed_p1
347
- print(f"{G}[launcher] ✓ Phase 1+2 完成: Registry + Event Hub 已就绪 ({elapsed_p1:.2f}s){R}")
343
+ phase_times["Phase 1: Kernel"] = elapsed_p1
344
+ print(f"{G}[launcher] ✓ Phase 1 完成: Kernel 已就绪 ({elapsed_p1:.2f}s){R}")
348
345
  if self._shutdown_event.is_set(): return
349
346
 
350
- # Phase 3: Wait for Registry delayed ready
351
- t0 = time.monotonic()
352
- await self._phase3_registry_ready()
353
- elapsed = time.monotonic() - t0
354
- phase_times["Phase 3: Registry 事件总线"] = elapsed
355
- print(f"{G}[launcher] ✓ Phase 3 完成: Registry 已连接事件总线 ({elapsed:.2f}s){R}")
356
- if self._shutdown_event.is_set(): return
357
-
358
- # Initialize desired_state from config_state (needed before Phase 3.5)
347
+ # Initialize desired_state from config_state
359
348
  for name, info in self.modules.items():
360
349
  if info.state == "enabled":
361
350
  self._desired_states[name] = "running"
@@ -365,43 +354,26 @@ class Launcher:
365
354
  for cn in CORE_MODULE_NAMES:
366
355
  self._desired_states[cn] = "running"
367
356
 
368
- # Phase 3.5: Watchdog ready
369
- # If started in parallel (Phase 1), just wait for module.ready
370
- # Otherwise start it now (fallback)
357
+ # Phase 1.5: Watchdog
371
358
  watchdog_info = self.modules.get(WATCHDOG_MODULE_NAME)
372
359
  if watchdog_info and self._desired_states.get(WATCHDOG_MODULE_NAME) == "running":
373
360
  t0 = time.monotonic()
374
- if getattr(self, '_watchdog_parallel', False):
375
- print(f"[launcher] Phase 3.5: Watchdog 已并行启动,等待就绪...")
376
- ready = await self._wait_event("module.ready", "watchdog", timeout=15)
377
- elapsed = time.monotonic() - t0
378
- if ready and not ready.get("_exited"):
379
- self._graceful_modules["watchdog"] = bool(ready.get("graceful_shutdown"))
380
- self._ready_times["watchdog"] = elapsed
381
- print(f"[launcher] Watchdog 已就绪")
382
- self._log_lifecycle("started", "watchdog")
383
- await self._publish_event("module.started", {"module_id": "watchdog"})
384
- self.process_manager.close_stdio("watchdog")
385
- else:
386
- print(f"[launcher] 警告: Watchdog 在 15s 内未就绪")
387
- else:
388
- print(f"[launcher] Phase 3.5: 启动 Watchdog...")
389
- await self._start_one_module(watchdog_info)
390
- elapsed = time.monotonic() - t0
391
- print(f"{G}[launcher] ✓ Phase 3.5 完成: Watchdog ({elapsed:.2f}s){R}")
361
+ print(f"[launcher] Phase 1.5: 启动 Watchdog...")
362
+ await self._start_one_module(watchdog_info)
363
+ elapsed = time.monotonic() - t0
364
+ print(f"{G}[launcher] Phase 1.5 完成: Watchdog ({elapsed:.2f}s){R}")
392
365
  if self._shutdown_event.is_set(): return
393
366
 
394
- # Phase 4: Start remaining enabled modules
367
+ # Phase 2: Start remaining enabled modules
395
368
  t0 = time.monotonic()
396
- await self._phase4_start_modules()
369
+ await self._phase2_start_modules()
397
370
  elapsed = time.monotonic() - t0
398
- phase_times["Phase 4: Extensions"] = elapsed
399
- print(f"{G}[launcher] ✓ Phase 4 完成: 扩展模块已启动 ({elapsed:.2f}s){R}")
371
+ phase_times["Phase 2: Extensions"] = elapsed
372
+ print(f"{G}[launcher] ✓ Phase 2 完成: 扩展模块已启动 ({elapsed:.2f}s){R}")
400
373
  if self._shutdown_event.is_set(): return
401
374
 
402
375
  # Post-startup
403
376
  self.process_manager.persist_records()
404
- self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
405
377
 
406
378
  # Wait for global leftover cleanup to finish (non-blocking with timeout)
407
379
  global_cleaned = {}
@@ -433,7 +405,7 @@ class Launcher:
433
405
  "startup_time": round(total_time, 2),
434
406
  })
435
407
 
436
- print("[launcher] 进入监控循环 (按 Ctrl+C 或 'q' 退出)")
408
+ print("[launcher] 进入监控循环 (按 Ctrl+C 或 'q' 优雅退出,ESC 强制退出)")
437
409
  await self._monitor_loop()
438
410
  finally:
439
411
  try:
@@ -441,263 +413,110 @@ class Launcher:
441
413
  except Exception as e:
442
414
  print(f"[launcher] 优雅关闭出错: {e}")
443
415
 
444
- # ── Phase 1+2: Parallel bootstrap (Registry + Event Hub) ──
416
+ # ── Phase 1: Start Kernel ──
445
417
 
446
- async def _phase1_parallel_bootstrap(self):
447
- """Start Registry + Event Hub processes in parallel to overlap cold-start time.
418
+ async def _phase1_start_kernel(self):
419
+ """Start Kernel process, connect WS, register self, wait for module.ready.
448
420
 
449
421
  Flow:
450
- 1. Start Registry + Event Hub processes simultaneously
451
- 2. Wait for Registry to report port via stdout
452
- 3. Set KITE_REGISTRY_PORT env (for Phase 3.5/4 modules) + start API
453
- 4. Scan modules + register self & tokens (parallel)
454
- 5. Send launcher_ws_token + registry_port to Event Hub via stdin
455
- 6. Wait for Event Hub ws_endpoint → WS connect → module.ready
422
+ 1. Start Kernel subprocess
423
+ 2. Wait Kernel stdout port → set KITE_KERNEL_PORT env
424
+ 3. Scan modules + connect WS + generate tokens (parallel)
425
+ 4. Wait module.ready event from Kernel
456
426
  """
457
- t_registry = time.monotonic()
427
+ t_kernel = time.monotonic()
458
428
 
459
- # ── Step 1: Start both processes ──
460
- registry_dir = os.path.join(os.environ["KITE_PROJECT"], "core", "registry")
461
- registry_info = ModuleInfo(
462
- name="registry",
463
- display_name="Registry",
429
+ # ── Step 1: Start Kernel process ──
430
+ kernel_dir = os.path.join(os.environ["KITE_PROJECT"], "kernel")
431
+ kernel_info = ModuleInfo(
432
+ name="kernel",
433
+ display_name="Kernel",
464
434
  type="infrastructure",
465
435
  state="enabled",
466
436
  runtime="python",
467
437
  entry="entry.py",
468
- module_dir=registry_dir,
438
+ module_dir=kernel_dir,
469
439
  )
470
- boot_info_registry = {"token": self.kite_token}
471
- self._log_lifecycle("starting", "registry")
472
- ok = self.process_manager.start_module(registry_info, boot_info=boot_info_registry)
440
+ # Kernel does NOT receive boot_info via stdin
441
+ self._log_lifecycle("starting", "kernel")
442
+ ok = self.process_manager.start_module(kernel_info, boot_info=None)
473
443
  if not ok:
474
- self._log_lifecycle("start_failed", "registry")
475
- raise RuntimeError("启动 Registry 失败")
476
-
477
- # Start Event Hub in parallel (before Registry port is known)
478
- eh_dir = os.path.join(os.environ["KITE_PROJECT"], "core", "event_hub")
479
- eh_info = ModuleInfo(
480
- name="event_hub",
481
- display_name="Event Hub",
482
- type="infrastructure",
483
- state="enabled",
484
- runtime="python",
485
- entry="entry.py",
486
- module_dir=eh_dir,
487
- )
488
- # Generate Event Hub token early (will register to Registry once it's up)
489
- eh_token = secrets.token_hex(32)
490
- self._module_tokens["event_hub"] = eh_token
491
- boot_info_eh = {"token": eh_token}
492
- self._log_lifecycle("starting", "event_hub")
493
- ok = self.process_manager.start_module(eh_info, boot_info=boot_info_eh)
494
- if not ok:
495
- self._log_lifecycle("start_failed", "event_hub")
496
- raise RuntimeError("启动 Event Hub 失败")
497
-
498
- # Start Watchdog in parallel (before Registry port is known)
499
- # Watchdog will block on stdin waiting for registry_port
500
- watchdog_dir = os.path.join(os.environ["KITE_PROJECT"], "extensions", "services", "watchdog")
501
- watchdog_md = os.path.join(watchdog_dir, "module.md")
502
- self._watchdog_parallel = False # track whether watchdog was started in parallel
503
- if os.path.isfile(watchdog_md):
504
- wd_token = secrets.token_hex(32)
505
- self._module_tokens["watchdog"] = wd_token
506
- # Parse watchdog module.md for ModuleInfo
507
- try:
508
- with open(watchdog_md, "r", encoding="utf-8") as f:
509
- wd_fm = _parse_frontmatter(f.read())
510
- wd_info = ModuleInfo(
511
- name="watchdog",
512
- display_name=wd_fm.get("display_name", "Watchdog"),
513
- type=wd_fm.get("type", "service"),
514
- state="enabled",
515
- runtime=wd_fm.get("runtime", "python"),
516
- entry=wd_fm.get("entry", "entry.py"),
517
- module_dir=watchdog_dir,
518
- )
519
- boot_info_wd = {"token": wd_token}
520
- self._log_lifecycle("starting", "watchdog")
521
- ok = self.process_manager.start_module(wd_info, boot_info=boot_info_wd)
522
- if ok:
523
- self._watchdog_parallel = True
524
- else:
525
- self._log_lifecycle("start_failed", "watchdog")
526
- print("[launcher] 警告: Watchdog 并行启动失败,将在 Phase 3.5 重试")
527
- except Exception as e:
528
- print(f"[launcher] 警告: Watchdog module.md 解析失败: {e}")
444
+ self._log_lifecycle("start_failed", "kernel")
445
+ raise RuntimeError("启动 Kernel 失败")
529
446
 
530
- parallel_modules = "Registry + Event Hub" + (" + Watchdog" if self._watchdog_parallel else "")
531
- print(f"[launcher] {parallel_modules} 进程已同时启动,等待 Registry 端口...")
447
+ print(f"[launcher] Kernel 进程已启动,等待 Kernel 端口...")
532
448
 
533
449
  # Persist immediately after starting core processes
534
450
  self.process_manager.persist_records()
535
451
 
536
- # ── Step 2: Wait for Registry port ──
537
- msg = await self._wait_kite_message("registry", "port", timeout=6)
538
- if not msg or not msg.get("port"):
539
- raise RuntimeError("致命错误: Registry 6s 内未报告端口")
540
- self.registry_port = int(msg["port"])
541
- self._ready_times["registry"] = time.monotonic() - t_registry
542
- _wait_s = time.monotonic() - t_registry
543
- print(f"[launcher] Registry 端口: {self.registry_port} (等待 {self._fmt_elapsed(_wait_s)})")
544
-
545
- # ── Step 3: Set env + start API + immediately unblock Event Hub ──
546
- os.environ["KITE_REGISTRY_PORT"] = str(self.registry_port)
547
- self._start_api_thread()
548
-
549
- # Send launcher_ws_token + registry_port to Event Hub ASAP (unblock it)
550
- self._launcher_ws_token = secrets.token_hex(32)
551
- self.process_manager.write_stdin("event_hub", {
552
- "kite": "launcher_ws_token",
553
- "launcher_ws_token": self._launcher_ws_token,
554
- })
555
- self.process_manager.write_stdin("event_hub", {
556
- "kite": "registry_port",
557
- "registry_port": self.registry_port,
558
- })
559
-
560
- # Send registry_port to Watchdog via stdin (if started in parallel)
561
- # Watchdog will retry querying launcher.api_endpoint until it's available
562
- if self.process_manager.is_running("watchdog"):
563
- self.process_manager.write_stdin("watchdog", {
564
- "kite": "registry_port",
565
- "registry_port": self.registry_port,
566
- })
567
-
568
- # ── Step 4: Scan + register tokens ‖ wait for Event Hub ws_endpoint (parallel) ──
569
- # Pre-register ws_endpoint waiter BEFORE gather to avoid race condition:
570
- # module_scanner.scan() is synchronous and blocks the event loop,
571
- # so the _wait_event_hub_endpoint coroutine wouldn't register its waiter in time.
572
- ws_waiter_key = "event_hub:ws_endpoint"
573
- ws_evt = threading.Event()
574
- ws_data: dict = {}
575
- self._msg_waiters[ws_waiter_key] = (ws_evt, ws_data)
576
-
577
- async def _scan_and_register_tokens():
452
+ # ── Step 2: Wait for Kernel port + launcher_token ──
453
+ msg = await self._wait_kite_message("kernel", "port", timeout=6)
454
+ if self._thread_shutdown.is_set():
455
+ # User requested shutdown during startup
456
+ raise RuntimeError("启动被用户中断")
457
+ if not msg or not msg.get("port") or not msg.get("token"):
458
+ raise RuntimeError("致命错误: Kernel 在 6s 内未报告端口和 token")
459
+ self.kernel_port = int(msg["port"])
460
+ launcher_token = msg["token"]
461
+ self._module_tokens["launcher"] = launcher_token
462
+ _wait_s = time.monotonic() - t_kernel
463
+ print(f"[launcher] Kernel 端口: {self.kernel_port} (等待 {self._fmt_elapsed(_wait_s)})")
464
+
465
+ # ── Step 3: Set env (but don't send kernel_port to modules yet) ──
466
+ os.environ["KITE_KERNEL_PORT"] = str(self.kernel_port)
467
+
468
+ # ── Step 4: Scan modules + connect WS + generate tokens (parallel) ──
469
+ async def _scan_and_generate_tokens():
578
470
  t_scan = time.monotonic()
579
471
  self.modules = self.module_scanner.scan()
580
472
  for name, info in self.modules.items():
581
473
  self._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
582
474
  _scan_s = time.monotonic() - t_scan
583
475
  print(f"[launcher] 发现 {len(self.modules)} 个模块: {', '.join(self.modules.keys()) or '(无)'} (扫描 {self._fmt_elapsed(_scan_s)})")
584
- t_reg = time.monotonic()
585
- await self._register_module_tokens()
586
- _reg_s = time.monotonic() - t_reg
587
- print(f"[launcher] 令牌注册完成 ({self._fmt_elapsed(_reg_s)})")
588
-
589
- async def _wait_event_hub_endpoint():
590
- t_wait_eh = time.monotonic()
591
- print("[launcher] 等待 Event Hub ws_endpoint...")
592
- shutdown = self._thread_shutdown
593
- def _wait():
594
- deadline = time.monotonic() + 10
595
- while time.monotonic() < deadline:
596
- if ws_evt.wait(timeout=0.5):
597
- return True
598
- if shutdown.is_set():
599
- return False
600
- return False
601
- got = await asyncio.get_running_loop().run_in_executor(None, _wait)
602
- self._msg_waiters.pop(ws_waiter_key, None)
603
- if not got or not ws_data.get("ws_endpoint"):
604
- raise RuntimeError("致命错误: Event Hub 在 10s 内未报告 ws_endpoint")
605
- self._event_hub_ws_url = ws_data["ws_endpoint"]
606
- _eh_s = time.monotonic() - t_wait_eh
607
- print(f"[launcher] Event Hub 已发现: {self._event_hub_ws_url} (等待 {self._fmt_elapsed(_eh_s)})")
608
-
609
- # Run all three in parallel: register_self + scan_tokens + wait_event_hub
476
+ # Generate tokens via Kernel RPC (after WS connection is ready)
477
+ t_gen = time.monotonic()
478
+ await self._generate_module_tokens()
479
+ _gen_s = time.monotonic() - t_gen
480
+ print(f"[launcher] 令牌生成完成 ({self._fmt_elapsed(_gen_s)})")
481
+
482
+ async def _connect_kernel_ws():
483
+ t_ws = time.monotonic()
484
+ self._ws_task = asyncio.create_task(self._ws_loop())
485
+ # Wait for WebSocket connection to be established and ready
486
+ try:
487
+ await asyncio.wait_for(self._ws_connected.wait(), timeout=5)
488
+ except asyncio.TimeoutError:
489
+ print("[launcher] 警告: WebSocket 连接超时")
490
+ return
491
+
492
+ # Now wait for Kernel module.ready event
493
+ # (waiter is registered inside _ws_connect before _ws_receiver starts)
494
+ ready = await self._wait_event("module.ready", "kernel", timeout=15)
495
+ if ready:
496
+ self._graceful_modules["kernel"] = bool(ready.get("graceful_shutdown"))
497
+ print("[launcher] Kernel 已就绪")
498
+ else:
499
+ print("\033[91m[launcher] 警告: Kernel 15s 内未发送 module.ready\033[0m")
500
+ self._ready_times["kernel"] = time.monotonic() - t_ws
501
+
610
502
  await asyncio.gather(
611
- self._register_self(),
612
- _scan_and_register_tokens(),
613
- _wait_event_hub_endpoint(),
503
+ _scan_and_generate_tokens(),
504
+ _connect_kernel_ws(),
614
505
  )
615
506
  if self._shutdown_event.is_set():
616
507
  return
617
508
 
618
- # ── Step 5: WS connect → module.ready ──
619
- t_eh = time.monotonic()
620
- self._ws_task = asyncio.create_task(self._ws_loop())
621
-
622
- # Wait for Event Hub module.ready (sent when Launcher connects)
623
- ready = await self._wait_event("module.ready", "event_hub", timeout=15)
624
- if ready:
625
- self._graceful_modules["event_hub"] = bool(ready.get("graceful_shutdown"))
626
- print("[launcher] Event Hub 已就绪")
627
- else:
628
- print("[launcher] 警告: Event Hub 在 15s 内未发送 module.ready")
629
-
630
- self._ready_times["event_hub"] = time.monotonic() - t_eh
631
- self._log_lifecycle("started", "event_hub")
632
- await self._publish_event("module.started", {"module_id": "event_hub"})
633
- self.process_manager.close_stdio("event_hub")
634
-
635
- # Store eh_info in modules dict if not already present (from scan)
636
- if "event_hub" not in self.modules:
637
- self.modules["event_hub"] = eh_info
638
-
639
- def _get_http(self) -> httpx.AsyncClient:
640
- """Get shared HTTP client (lazy-init, reuses TCP connections to Registry)."""
641
- if self._http is None or self._http.is_closed:
642
- self._http = httpx.AsyncClient(timeout=5)
643
- return self._http
644
-
645
- async def _close_http(self):
646
- """Close shared HTTP client."""
647
- if self._http and not self._http.is_closed:
648
- await self._http.aclose()
649
- self._http = None
650
-
651
- async def _register_self(self):
652
- """Register Launcher itself to Registry."""
653
- url = f"http://127.0.0.1:{self.registry_port}/modules"
654
- headers = {"Authorization": f"Bearer {self.kite_token}"}
655
- payload = {
656
- "action": "register",
657
- "module_id": "launcher",
658
- "module_type": "infrastructure",
659
- "name": "Launcher",
660
- "api_endpoint": f"http://127.0.0.1:{self.api_port}",
661
- "health_endpoint": "/launcher/modules",
662
- "events_publish": {
663
- "module.started": {},
664
- "module.stopped": {},
665
- "module.state_changed": {},
666
- },
667
- "events_subscribe": [">"],
668
- }
669
- try:
670
- client = self._get_http()
671
- resp = await client.post(url, json=payload, headers=headers)
672
- if resp.status_code == 200:
673
- print("[launcher] 已注册到 Registry")
674
- else:
675
- print(f"[launcher] 警告: Registry 注册返回 {resp.status_code}")
676
- except Exception as e:
677
- print(f"[launcher] 警告: 注册到 Registry 失败: {e}")
678
-
679
- # ── (Phase 2 merged into _phase1_parallel_bootstrap) ──
509
+ self._log_lifecycle("started", "kernel")
510
+ await self._publish_event("module.started", {"module_id": "kernel"})
511
+ self.process_manager.close_stdio("kernel")
680
512
 
681
- # ── Phase 3: Registry delayed ready ──
513
+ # Store kernel_info in modules dict if not already present (from scan)
514
+ if "kernel" not in self.modules:
515
+ self.modules["kernel"] = kernel_info
682
516
 
683
- async def _phase3_registry_ready(self):
684
- """Wait for Registry module.ready (triggered after Event Hub registers to Registry
685
- and Registry connects to Event Hub WS)."""
686
- print("[launcher] 等待 Registry 连接 Event Hub...")
687
- ready = await self._wait_event("module.ready", "registry", timeout=12)
688
- if ready:
689
- self._graceful_modules["registry"] = bool(ready.get("graceful_shutdown"))
690
- print("[launcher] Registry 事件总线连接完成")
691
- else:
692
- print("[launcher] 警告: Registry 在 12s 内未连接事件总线 (降级运行)")
517
+ # ── Phase 2: Start remaining modules ──
693
518
 
694
- self._log_lifecycle("started", "registry")
695
- await self._publish_event("module.started", {"module_id": "registry"})
696
- self.process_manager.close_stdio("registry")
697
-
698
- # ── Phase 4: Start remaining modules ──
699
-
700
- async def _phase4_start_modules(self):
519
+ async def _phase2_start_modules(self):
701
520
  """Start enabled modules (excluding core) in dependency order."""
702
521
  to_start = [m for m in self.modules.values()
703
522
  if self._desired_states.get(m.name) == "running"
@@ -735,10 +554,10 @@ class Launcher:
735
554
  else:
736
555
  await asyncio.gather(*(self._start_one_module(info) for info in layer))
737
556
 
738
- # ── Event Hub WebSocket connection ──
557
+ # ── Kernel WebSocket connection (JSON-RPC 2.0) ──
739
558
 
740
559
  async def _ws_loop(self):
741
- """Connect to Event Hub, reconnect on failure."""
560
+ """Connect to Kernel, reconnect on failure."""
742
561
  while not self._thread_shutdown.is_set():
743
562
  try:
744
563
  await self._ws_connect()
@@ -746,105 +565,327 @@ class Launcher:
746
565
  return
747
566
  except Exception as e:
748
567
  if not self._system_shutting_down:
749
- print(f"[launcher] Event Hub 连接错误: {e}")
568
+ print(f"[launcher] Kernel 连接错误: {e}")
750
569
  self._ws = None
751
570
  await asyncio.sleep(5)
752
571
 
753
572
  async def _ws_connect(self):
754
- """Single WebSocket session with launcher_ws_token auth."""
755
- ws_url = f"{self._event_hub_ws_url}?token={self._launcher_ws_token}&id=launcher"
573
+ """Single WebSocket session with JSON-RPC 2.0 protocol."""
574
+ launcher_token = self._module_tokens.get("launcher", "")
575
+ ws_url = f"ws://127.0.0.1:{self.kernel_port}/ws?token={launcher_token}&id=launcher"
756
576
  t_ws_connect = time.monotonic()
757
577
  async with websockets.connect(ws_url, open_timeout=3, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
758
578
  self._ws = ws
759
579
  _ws_s = time.monotonic() - t_ws_connect
760
- print(f"[launcher] 已连接到 Event Hub ({self._fmt_elapsed(_ws_s)})")
580
+ print(f"[launcher] 已连接到 Kernel ({self._fmt_elapsed(_ws_s)})")
761
581
 
762
- # Subscribe to all events
763
- await ws.send(json.dumps({
764
- "type": "subscribe",
765
- "events": [">"],
766
- }))
582
+ # Start receive loop in background task BEFORE making any RPC calls
583
+ # This prevents deadlock where RPC waits for response but receive loop hasn't started
584
+ receiver_task = asyncio.create_task(self._ws_receiver(ws))
585
+
586
+ try:
587
+ # Register kernel module.ready waiter BEFORE subscribing to events
588
+ # This prevents race condition where event arrives before waiter is registered
589
+ ready_key = "module.ready:kernel"
590
+ ready_evt = asyncio.Event()
591
+ ready_data = {}
592
+ self._event_waiters[ready_key] = (ready_evt, ready_data)
593
+
594
+ # Subscribe to all events
595
+ await self._rpc_call(ws, "event.subscribe", {"events": [">"]})
596
+
597
+ # Register Launcher itself in the Registry
598
+ await self._rpc_call(ws, "registry.register", {
599
+ "module_id": "launcher",
600
+ "module_type": "infrastructure",
601
+ "events_publish": {
602
+ "module.started": {},
603
+ "module.stopped": {},
604
+ "module.state_changed": {},
605
+ },
606
+ "events_subscribe": [">"],
607
+ })
608
+ print("[launcher] 已注册到 Kernel")
609
+
610
+ # Signal that connection is ready (after subscription and registration)
611
+ if self._ws_connected:
612
+ self._ws_connected.set()
613
+
614
+ # Wait for receiver task to complete (connection closed)
615
+ await receiver_task
616
+ except asyncio.CancelledError:
617
+ receiver_task.cancel()
618
+ raise
767
619
 
768
- # Receive loop
620
+ async def _ws_receiver(self, ws):
621
+ """Receive loop: classify incoming messages."""
622
+ try:
769
623
  async for raw in ws:
770
624
  try:
771
625
  msg = json.loads(raw)
772
626
  except (json.JSONDecodeError, TypeError):
773
627
  continue
774
628
  try:
775
- msg_type = msg.get("type", "")
776
- if msg_type == "event":
777
- source = msg.get("source", "unknown")
778
- event = msg.get("event", "")
779
- data = msg.get("data") if isinstance(msg.get("data"), dict) else {}
780
- # Trigger event waiters
781
- module_id = data.get("module_id", "")
782
- waiter_key = f"{event}:{module_id}"
783
- waiter = self._event_waiters.get(waiter_key)
784
- if waiter:
785
- waiter[1].update(data)
786
- waiter[0].set()
787
- # module.exiting also wakes module.ready waiter
788
- # (module won't send ready — no point waiting)
789
- if event == "module.exiting" and module_id:
790
- ready_key = f"module.ready:{module_id}"
791
- ready_waiter = self._event_waiters.get(ready_key)
792
- if ready_waiter:
793
- ready_waiter[1].update(data)
794
- ready_waiter[1]["_exited"] = True
795
- ready_waiter[0].set()
796
- # module.crash → print red crash summary (real-time notification)
797
- if event == "module.crash" and module_id:
798
- RED = "\033[91m"
799
- RESET = "\033[0m"
800
- exc_type = data.get("exception_type", "Unknown")
801
- preview = data.get("traceback_preview", "")
802
- severity = data.get("severity", "error")
803
- print(f"[launcher] {RED}模块 '{module_id}' 崩溃: "
804
- f"{exc_type} — {preview}{RESET}")
805
- _suffix = os.environ.get("KITE_INSTANCE_SUFFIX", "")
806
- crash_log = os.path.join(
807
- os.environ.get("KITE_INSTANCE_DIR", ""),
808
- module_id, "log", f"crashes{_suffix}.jsonl"
809
- )
810
- print(f"[launcher] 崩溃日志: {crash_log}")
811
- ts = msg.get("timestamp", "")
812
- # Only log system events (module.*, watchdog.*) to avoid flooding
813
- # from benchmark/test traffic
814
- if not (event.startswith("module.") or event.startswith("watchdog.")):
815
- continue
816
- latency_str = ""
817
- if ts:
818
- try:
819
- from datetime import datetime, timezone
820
- sent = datetime.fromisoformat(ts)
821
- delay_ms = (datetime.now(timezone.utc) - sent).total_seconds() * 1000
822
- latency_str = f" ({delay_ms:.1f}ms)"
823
- local_ts = sent.astimezone().strftime("%H:%M:%S")
824
- except Exception:
825
- local_ts = ts[11:19] if len(ts) >= 19 else ts
826
- print(f"[{source}] {local_ts} {event}{latency_str}: {json.dumps(data, ensure_ascii=False)}")
827
- else:
828
- print(f"[{source}] {event}: {json.dumps(data, ensure_ascii=False)}")
829
- elif msg_type == "error":
830
- print(f"[launcher] Event Hub 错误: {msg.get('message')}")
629
+ has_method = "method" in msg
630
+ has_id = "id" in msg
631
+ has_result = "result" in msg
632
+ has_error = "error" in msg
633
+
634
+ if has_method and not has_id:
635
+ # Event Notification (no id)
636
+ await self._handle_event_notification(msg)
637
+ elif has_method and has_id:
638
+ # Incoming RPC request (forwarded by Kernel)
639
+ await self._handle_rpc_request(ws, msg)
640
+ elif has_id and (has_result or has_error):
641
+ # RPC response (to our own call)
642
+ self._handle_rpc_response(msg)
831
643
  except Exception as e:
832
- print(f"[launcher] 事件处理异常(已忽略): {e}")
644
+ print(f"[launcher] 消息处理异常(已忽略): {e}")
645
+ except asyncio.CancelledError:
646
+ pass
647
+
648
+ # ── JSON-RPC 2.0 infrastructure ──
649
+
650
+ async def _rpc_call(self, ws, method: str, params: dict = None, timeout: float = 5) -> dict:
651
+ """Send a JSON-RPC 2.0 request and await the response."""
652
+ rpc_id = str(uuid.uuid4())
653
+ msg = {"jsonrpc": "2.0", "id": rpc_id, "method": method}
654
+ if params:
655
+ msg["params"] = params
656
+
657
+ evt = asyncio.Event()
658
+ self._rpc_waiters[rpc_id] = evt
659
+ self._rpc_results[rpc_id] = {}
660
+
661
+ try:
662
+ await ws.send(json.dumps(msg))
663
+ await asyncio.wait_for(evt.wait(), timeout=timeout)
664
+ return self._rpc_results.get(rpc_id, {})
665
+ except asyncio.TimeoutError:
666
+ print(f"[launcher] RPC 超时: {method}")
667
+ return {"error": {"code": -32002, "message": f"RPC timeout: {method}"}}
668
+ finally:
669
+ self._rpc_waiters.pop(rpc_id, None)
670
+ self._rpc_results.pop(rpc_id, None)
671
+
672
+ def _handle_rpc_response(self, msg: dict):
673
+ """Match an incoming RPC response to a pending waiter."""
674
+ rpc_id = msg.get("id", "")
675
+ waiter = self._rpc_waiters.get(rpc_id)
676
+ if waiter:
677
+ self._rpc_results[rpc_id] = msg
678
+ waiter.set()
679
+
680
+ async def _handle_event_notification(self, msg: dict):
681
+ """Handle an event notification (JSON-RPC 2.0 Notification with method='event')."""
682
+ params = msg.get("params", {})
683
+ source = params.get("source", "unknown")
684
+ event = params.get("event", "")
685
+ data = params.get("data") if isinstance(params.get("data"), dict) else {}
686
+ ts = params.get("timestamp", "")
687
+
688
+ # Trigger event waiters
689
+ module_id = data.get("module_id", "")
690
+ waiter_key = f"{event}:{module_id}"
691
+ waiter = self._event_waiters.get(waiter_key)
692
+ if waiter:
693
+ waiter[1].update(data)
694
+ waiter[0].set()
695
+
696
+ # module.exiting also wakes module.ready waiter
697
+ if event == "module.exiting" and module_id:
698
+ ready_key = f"module.ready:{module_id}"
699
+ ready_waiter = self._event_waiters.get(ready_key)
700
+ if ready_waiter:
701
+ ready_waiter[1].update(data)
702
+ ready_waiter[1]["_exited"] = True
703
+ ready_waiter[0].set()
704
+
705
+ # module.crash → print red crash summary
706
+ if event == "module.crash" and module_id:
707
+ RED = "\033[91m"
708
+ RESET = "\033[0m"
709
+ exc_type = data.get("exception_type", "Unknown")
710
+ preview = data.get("traceback_preview", "")
711
+ print(f"[launcher] {RED}模块 '{module_id}' 崩溃: {exc_type} — {preview}{RESET}")
712
+ _suffix = os.environ.get("KITE_INSTANCE_SUFFIX", "")
713
+ crash_log = os.path.join(
714
+ os.environ.get("KITE_INSTANCE_DIR", ""),
715
+ module_id, "log", f"crashes{_suffix}.jsonl"
716
+ )
717
+ print(f"[launcher] 崩溃日志: {crash_log}")
718
+
719
+ # Only log system events (module.*, watchdog.*) to avoid flooding
720
+ if not (event.startswith("module.") or event.startswith("watchdog.")):
721
+ return
722
+ latency_str = ""
723
+ if ts:
724
+ try:
725
+ from datetime import datetime, timezone
726
+ sent = datetime.fromisoformat(ts)
727
+ delay_ms = (datetime.now(timezone.utc) - sent).total_seconds() * 1000
728
+ latency_str = f" ({delay_ms:.1f}ms)"
729
+ local_ts = sent.astimezone().strftime("%H:%M:%S")
730
+ except Exception:
731
+ local_ts = ts[11:19] if len(ts) >= 19 else ts
732
+ print(f"[{source}] {local_ts} {event}{latency_str}: {json.dumps(data, ensure_ascii=False)}")
733
+ else:
734
+ print(f"[{source}] {event}: {json.dumps(data, ensure_ascii=False)}")
735
+
736
+ async def _handle_rpc_request(self, ws, msg: dict):
737
+ """Handle an incoming RPC request forwarded by Kernel (launcher.* methods)."""
738
+ rpc_id = msg.get("id", "")
739
+ method = msg.get("method", "")
740
+ params = msg.get("params", {})
741
+
742
+ handlers = {
743
+ "list_modules": self._rpc_list_modules,
744
+ "start_module": self._rpc_start_module,
745
+ "stop_module": self._rpc_stop_module,
746
+ "restart_module": self._rpc_restart_module,
747
+ "rescan": self._rpc_rescan,
748
+ "shutdown": self._rpc_shutdown,
749
+ }
750
+ handler = handlers.get(method)
751
+ if handler:
752
+ try:
753
+ result = await handler(params)
754
+ await ws.send(json.dumps({"jsonrpc": "2.0", "id": rpc_id, "result": result}))
755
+ except Exception as e:
756
+ await ws.send(json.dumps({
757
+ "jsonrpc": "2.0", "id": rpc_id,
758
+ "error": {"code": -32603, "message": str(e)},
759
+ }))
760
+ else:
761
+ await ws.send(json.dumps({
762
+ "jsonrpc": "2.0", "id": rpc_id,
763
+ "error": {"code": -32601, "message": f"Method not found: {method}"},
764
+ }))
765
+
766
+ # ── Launcher RPC method handlers ──
767
+
768
+ async def _rpc_list_modules(self, params: dict) -> dict:
769
+ """List all modules and their current status."""
770
+ result = []
771
+ for name, info in self.modules.items():
772
+ running = self.process_manager.is_running(name)
773
+ rec = self.process_manager.get_record(name)
774
+ result.append({
775
+ "name": name,
776
+ "display_name": info.display_name,
777
+ "type": info.type,
778
+ "config_state": info.state,
779
+ "desired_state": self._desired_states.get(name, "stopped"),
780
+ "actual_state": f"running({rec.pid})" if running and rec else "stopped",
781
+ "pid": rec.pid if running and rec else None,
782
+ "monitor": info.monitor,
783
+ })
784
+ return {"modules": result}
785
+
786
+ async def _rpc_start_module(self, params: dict) -> dict:
787
+ """Start a module by name."""
788
+ name = params.get("name", "")
789
+ info = self.modules.get(name)
790
+ if not info:
791
+ raise RuntimeError(f"Module '{name}' not found")
792
+ if info.state == "disabled":
793
+ raise RuntimeError(f"Module '{name}' is disabled")
794
+
795
+ if name not in self._module_tokens:
796
+ self._module_tokens[name] = secrets.token_hex(32)
797
+ await self._register_new_tokens({name: self._module_tokens[name]})
798
+
799
+ token = self._module_tokens[name]
800
+ boot_info = {"token": token}
801
+ ok = self.process_manager.start_module(info, boot_info=boot_info)
802
+ if ok:
803
+ self._desired_states[name] = "running"
804
+ self.process_manager.persist_records()
805
+ rec = self.process_manager.get_record(name)
806
+ self._log_lifecycle("started", name, pid=rec.pid if rec else None, via="rpc")
807
+ await self._publish_event("module.started", {"module_id": name})
808
+ return {"status": "started", "name": name}
809
+ self._log_lifecycle("start_failed", name, via="rpc")
810
+ raise RuntimeError(f"Failed to start '{name}'")
811
+
812
+ async def _rpc_stop_module(self, params: dict) -> dict:
813
+ """Stop a module with graceful shutdown."""
814
+ name = params.get("name", "")
815
+ info = self.modules.get(name)
816
+ if not info:
817
+ raise RuntimeError(f"Module '{name}' not found")
818
+ reason = params.get("reason", "stop_requested")
819
+ self._desired_states[name] = "stopped"
820
+ await self._graceful_stop(name, reason)
821
+ self.process_manager.persist_records()
822
+ return {"status": "stopped", "name": name}
823
+
824
+ async def _rpc_restart_module(self, params: dict) -> dict:
825
+ """Restart a module (stop + start)."""
826
+ name = params.get("name", "")
827
+ info = self.modules.get(name)
828
+ if not info:
829
+ raise RuntimeError(f"Module '{name}' not found")
830
+ if info.state == "disabled":
831
+ raise RuntimeError(f"Module '{name}' is disabled")
832
+ reason = params.get("reason", "restart")
833
+ await self._graceful_stop(name, reason)
834
+ self._module_tokens[name] = secrets.token_hex(32)
835
+ await self._register_new_tokens({name: self._module_tokens[name]})
836
+ token = self._module_tokens[name]
837
+ boot_info = {"token": token}
838
+ ok = self.process_manager.start_module(info, boot_info=boot_info)
839
+ if ok:
840
+ self._desired_states[name] = "running"
841
+ self.process_manager.persist_records()
842
+ rec = self.process_manager.get_record(name)
843
+ self._log_lifecycle("started", name, pid=rec.pid if rec else None, via="rpc_restart")
844
+ await self._publish_event("module.started", {"module_id": name})
845
+ return {"status": "restarted", "name": name}
846
+ self._log_lifecycle("start_failed", name, via="rpc_restart")
847
+ raise RuntimeError(f"Failed to restart '{name}'")
848
+
849
+ async def _rpc_rescan(self, params: dict) -> dict:
850
+ """Rescan module directories for new/removed modules."""
851
+ old_names = set(self.modules.keys())
852
+ self.modules = self.module_scanner.scan()
853
+ new_names = set(self.modules.keys())
854
+ added = list(new_names - old_names)
855
+ removed = list(old_names - new_names)
856
+ for name in added:
857
+ info = self.modules[name]
858
+ self._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
859
+ self._desired_states[name] = "running" if info.state == "enabled" else "stopped"
860
+ if added:
861
+ new_tokens = {}
862
+ for name in added:
863
+ self._module_tokens[name] = secrets.token_hex(32)
864
+ new_tokens[name] = self._module_tokens[name]
865
+ await self._register_new_tokens(new_tokens)
866
+ return {"added": added, "removed": removed, "total": len(self.modules)}
867
+
868
+ async def _rpc_shutdown(self, params: dict) -> dict:
869
+ """Shutdown the entire Kite system."""
870
+ reason = params.get("reason", "rpc_request")
871
+ self._request_shutdown(f"RPC shutdown request: {reason}")
872
+ return {"status": "shutting_down", "reason": reason}
873
+
874
+ # ── Event publishing via RPC ──
833
875
 
834
876
  async def _publish_event(self, event_type: str, data: dict):
835
- """Publish an event to Event Hub via WebSocket. Uses create_task to avoid
836
- deadlock with _ws_connect recv loop (websockets 15.x send can block when
837
- incoming frames are pending and recv is held by async-for)."""
877
+ """Publish an event via RPC event.publish through Kernel WS."""
838
878
  if not self._ws:
839
879
  return
840
- from datetime import datetime, timezone
841
880
  msg = json.dumps({
842
- "type": "event",
843
- "event_id": str(uuid.uuid4()),
844
- "event": event_type,
845
- "source": "launcher",
846
- "timestamp": datetime.now(timezone.utc).isoformat(),
847
- "data": data,
881
+ "jsonrpc": "2.0",
882
+ "id": str(uuid.uuid4()),
883
+ "method": "event.publish",
884
+ "params": {
885
+ "event_id": str(uuid.uuid4()),
886
+ "event": event_type,
887
+ "data": data,
888
+ },
848
889
  })
849
890
 
850
891
  async def _send():
@@ -855,14 +896,6 @@ class Launcher:
855
896
 
856
897
  asyncio.create_task(_send())
857
898
 
858
- def _publish_event_threadsafe(self, event_type: str, data: dict):
859
- """Publish event from non-async context (API thread). Fire-and-forget."""
860
- if not self._ws or not self._loop:
861
- return
862
- asyncio.run_coroutine_threadsafe(
863
- self._publish_event(event_type, data), self._loop,
864
- )
865
-
866
899
  async def _wait_event(self, event_type: str, module_id: str, timeout: float) -> dict | None:
867
900
  """Wait for a specific event from a module. Returns data dict or None on timeout."""
868
901
  key = f"{event_type}:{module_id}"
@@ -892,12 +925,32 @@ class Launcher:
892
925
  })
893
926
  return
894
927
 
928
+ # Register waiters BEFORE sending shutdown event
929
+ ack_key = f"module.shutdown.ack:{name}"
930
+ ack_evt = asyncio.Event()
931
+ ack_data = {}
932
+ self._event_waiters[ack_key] = (ack_evt, ack_data)
933
+
934
+ ready_key = f"module.shutdown.ready:{name}"
935
+ ready_evt = asyncio.Event()
936
+ ready_data = {}
937
+ self._event_waiters[ready_key] = (ready_evt, ready_data)
938
+
895
939
  await self._publish_event("module.shutdown", {
896
940
  "module_id": name, "reason": reason, "timeout": timeout,
897
941
  })
898
942
 
899
- ack = await self._wait_event("module.shutdown.ack", name, timeout=3)
943
+ # Wait for ack
944
+ try:
945
+ await asyncio.wait_for(ack_evt.wait(), timeout=3)
946
+ ack = ack_data
947
+ except asyncio.TimeoutError:
948
+ ack = None
949
+ finally:
950
+ self._event_waiters.pop(ack_key, None)
951
+
900
952
  if not ack:
953
+ self._event_waiters.pop(ready_key, None)
901
954
  self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
902
955
  await self._publish_event("module.stopped", {
903
956
  "module_id": name,
@@ -906,7 +959,15 @@ class Launcher:
906
959
  return
907
960
 
908
961
  estimated = min(ack.get("estimated_cleanup", timeout), timeout)
909
- ready = await self._wait_event("module.shutdown.ready", name, timeout=estimated)
962
+
963
+ # Wait for ready
964
+ try:
965
+ await asyncio.wait_for(ready_evt.wait(), timeout=estimated)
966
+ ready = ready_data
967
+ except asyncio.TimeoutError:
968
+ ready = None
969
+ finally:
970
+ self._event_waiters.pop(ready_key, None)
910
971
  if ready:
911
972
  self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_READY)
912
973
  else:
@@ -920,10 +981,10 @@ class Launcher:
920
981
 
921
982
  async def _graceful_shutdown_all(self):
922
983
  """Shut down all modules. Order:
923
- 1. Send shutdown to graceful modules (excl. Event Hub) — let them start cleanup
984
+ 1. Send shutdown to graceful modules (excl. Kernel) — let them start cleanup
924
985
  2. Terminate non-graceful modules (fast, runs during graceful cleanup)
925
986
  3. Wait for graceful modules to exit (process monitoring)
926
- 4. Shut down Event Hub last (keeps event routing alive throughout)
987
+ 4. Shut down Kernel last (keeps event routing alive throughout)
927
988
  """
928
989
  self._system_shutting_down = True
929
990
  running = [n for n in self.modules if self.process_manager.is_running(n)]
@@ -938,9 +999,9 @@ class Launcher:
938
999
  graceful = [n for n in running if self._graceful_modules.get(n)]
939
1000
  non_graceful = [n for n in running if not self._graceful_modules.get(n)]
940
1001
 
941
- # Defer Event Hub — it must stay alive to route shutdown events
942
- hub_deferred = "event_hub" in graceful
943
- graceful_batch = [n for n in graceful if n != "event_hub"] if hub_deferred else graceful
1002
+ # Defer Kernel — it must stay alive to route shutdown events
1003
+ kernel_deferred = "kernel" in graceful
1004
+ graceful_batch = [n for n in graceful if n != "kernel"] if kernel_deferred else graceful
944
1005
 
945
1006
  print(f"[launcher] 正在关闭 {len(running)} 个模块: {', '.join(running)}")
946
1007
 
@@ -976,44 +1037,49 @@ class Launcher:
976
1037
  self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
977
1038
  self._log_lifecycle("stopped", name, reason="system_shutdown")
978
1039
 
979
- # Phase 4: All other modules exited — now shut down Event Hub
980
- if hub_deferred and self.process_manager.is_running("event_hub"):
981
- self._log_lifecycle("stopping", "event_hub", reason="system_shutdown")
982
- await self._publish_event("module.shutdown", {
983
- "module_id": "event_hub", "reason": "system_shutdown", "timeout": 5,
984
- })
985
- deadline = time.time() + 5
986
- while time.time() < deadline:
987
- if not self.process_manager.is_running("event_hub"):
988
- print("[launcher] Event Hub 已退出")
989
- break
990
- await asyncio.sleep(0.5)
991
- if self.process_manager.is_running("event_hub"):
992
- self.process_manager.stop_module("event_hub", timeout=SHUTDOWN_TIMEOUT_PARTIAL)
993
- self._log_lifecycle("stopped", "event_hub", reason="system_shutdown")
1040
+ # Phase 4: All other modules exited — now shut down Kernel
1041
+ if kernel_deferred and self.process_manager.is_running("kernel"):
1042
+ self._log_lifecycle("stopping", "kernel", reason="system_shutdown")
1043
+ print("[launcher] 正在关闭 Kernel...")
1044
+
1045
+ # Call kernel.shutdown RPC (not event)
1046
+ rpc_sent = False
1047
+ try:
1048
+ if self._ws:
1049
+ await self._rpc_call(self._ws, "kernel.shutdown", {})
1050
+ print("[launcher] Kernel shutdown RPC 已发送")
1051
+ rpc_sent = True
1052
+ else:
1053
+ print("[launcher] WebSocket 未连接,跳过 RPC 调用")
1054
+ except Exception as e:
1055
+ print(f"[launcher] Kernel shutdown RPC 失败: {e}")
1056
+
1057
+ # Wait for kernel to exit
1058
+ if rpc_sent:
1059
+ # RPC sent: wait up to 5s for graceful exit
1060
+ proc = self.process_manager._processes.get("kernel")
1061
+ if proc:
1062
+ try:
1063
+ loop = asyncio.get_event_loop()
1064
+ await asyncio.wait_for(
1065
+ loop.run_in_executor(None, proc.wait),
1066
+ timeout=5
1067
+ )
1068
+ print("[launcher] Kernel 已退出")
1069
+ except asyncio.TimeoutError:
1070
+ print("[launcher] Kernel 5秒内未退出,强制停止")
1071
+ self.process_manager.stop_module("kernel", timeout=SHUTDOWN_TIMEOUT_PARTIAL)
1072
+ else:
1073
+ # No RPC (WS not connected): use shorter timeout for terminate
1074
+ self.process_manager.stop_module("kernel", timeout=2)
1075
+
1076
+ self._log_lifecycle("stopped", "kernel", reason="system_shutdown")
994
1077
 
995
1078
  # Final safety net
996
1079
  try:
997
1080
  self.process_manager.stop_all(timeout=SHUTDOWN_TIMEOUT_BULK)
998
1081
  except Exception as e:
999
1082
  print(f"[launcher] stop_all 出错: {e}")
1000
- await self._close_http()
1001
-
1002
- # ── Heartbeat to Registry ──
1003
-
1004
- async def _heartbeat_loop(self):
1005
- """Send heartbeat to Registry every 30 seconds."""
1006
- while not self._thread_shutdown.is_set():
1007
- await asyncio.sleep(30)
1008
- try:
1009
- client = self._get_http()
1010
- await client.post(
1011
- f"http://127.0.0.1:{self.registry_port}/modules",
1012
- json={"action": "heartbeat", "module_id": "launcher"},
1013
- headers={"Authorization": f"Bearer {self.kite_token}"},
1014
- )
1015
- except Exception:
1016
- pass
1017
1083
 
1018
1084
  # ── Module startup ──
1019
1085
 
@@ -1080,7 +1146,7 @@ class Launcher:
1080
1146
  return layers
1081
1147
 
1082
1148
  async def _start_one_module(self, info: ModuleInfo):
1083
- """Start a single module: publish starting → start process → wait ready → started → close stdio."""
1149
+ """Start a single module: publish starting → start process → send kernel_port → wait ready → started → close stdio."""
1084
1150
  self._log_lifecycle("starting", info.name)
1085
1151
  await self._publish_event("module.starting", {"module_id": info.name})
1086
1152
 
@@ -1092,13 +1158,32 @@ class Launcher:
1092
1158
  self._log_lifecycle("start_failed", info.name)
1093
1159
  return
1094
1160
 
1161
+ # Register waiter BEFORE sending kernel_port
1162
+ # This prevents race condition where module connects and sends module.ready before waiter is registered
1163
+ ready_key = f"module.ready:{info.name}"
1164
+ ready_evt = asyncio.Event()
1165
+ ready_data = {}
1166
+ self._event_waiters[ready_key] = (ready_evt, ready_data)
1167
+
1168
+ # Send kernel_port via stdin so module can connect to Kernel WS
1169
+ self.process_manager.write_stdin(info.name, {
1170
+ "kite": "kernel_port",
1171
+ "kernel_port": self.kernel_port,
1172
+ })
1173
+
1095
1174
  # Persist immediately after starting to ensure PID is recorded
1096
- # (in case launcher crashes before Phase 4 completes)
1097
1175
  self.process_manager.persist_records()
1098
1176
 
1099
1177
  # Wait for module.ready or module.exiting (whichever comes first)
1100
1178
  timeout = info.launch.timeout
1101
- ready = await self._wait_event("module.ready", info.name, timeout=timeout)
1179
+ try:
1180
+ await asyncio.wait_for(ready_evt.wait(), timeout=timeout)
1181
+ ready = ready_data
1182
+ except asyncio.TimeoutError:
1183
+ ready = None
1184
+ finally:
1185
+ self._event_waiters.pop(ready_key, None)
1186
+
1102
1187
  elapsed = time.monotonic() - t0
1103
1188
  if ready and ready.get("_exited"):
1104
1189
  # Module sent module.exiting before ready — it chose to quit
@@ -1110,7 +1195,7 @@ class Launcher:
1110
1195
  self._ready_times[info.name] = elapsed
1111
1196
  print(f"[launcher] 模块 '{info.name}' 已就绪 ({elapsed:.2f}s)")
1112
1197
  else:
1113
- print(f"[launcher] 警告: '{info.name}' 在 {timeout}s 内未发送 module.ready")
1198
+ print(f"\033[91m[launcher] 警告: '{info.name}' 在 {timeout}s 内未发送 module.ready\033[0m")
1114
1199
 
1115
1200
  rec = self.process_manager.get_record(info.name)
1116
1201
  self._log_lifecycle("started", info.name, pid=rec.pid if rec else None)
@@ -1118,87 +1203,74 @@ class Launcher:
1118
1203
  self.process_manager.close_stdio(info.name)
1119
1204
 
1120
1205
  async def _register_module_tokens(self):
1121
- """Generate per-module tokens and register the mapping to Registry."""
1122
- # Include all scanned modules + core modules
1123
- for name in self.modules:
1124
- if name not in self._module_tokens:
1125
- self._module_tokens[name] = secrets.token_hex(32)
1126
- # Ensure registry has a token
1127
- if "registry" not in self._module_tokens:
1128
- self._module_tokens["registry"] = secrets.token_hex(32)
1206
+ """Generate per-module tokens and register the mapping to Kernel via RPC."""
1207
+ # Include all scanned modules
1208
+ async def _generate_module_tokens(self):
1209
+ """Request Kernel to generate tokens for all scanned modules via RPC."""
1210
+ # Collect module names that need tokens
1211
+ module_names = [name for name in self.modules if name not in self._module_tokens]
1212
+
1213
+ if not module_names:
1214
+ return
1129
1215
 
1130
- if not self._module_tokens:
1216
+ # Wait for WebSocket connection to be ready
1217
+ if self._ws_connected:
1218
+ try:
1219
+ await asyncio.wait_for(self._ws_connected.wait(), timeout=5)
1220
+ except asyncio.TimeoutError:
1221
+ print(f"[launcher] 警告: WebSocket 未就绪,无法生成令牌")
1222
+ return
1223
+ else:
1224
+ print(f"[launcher] 警告: _ws_connected 未初始化")
1131
1225
  return
1132
1226
 
1133
- await self._register_tokens_to_registry(self._module_tokens)
1227
+ # Call Kernel RPC to generate tokens
1228
+ try:
1229
+ result = await self._rpc_call(self._ws, "kernel.generate_tokens", {"modules": module_names})
1230
+ if result.get("result", {}).get("ok"):
1231
+ tokens = result["result"].get("tokens", {})
1232
+ self._module_tokens.update(tokens)
1233
+ print(f"[launcher] Kernel 已生成 {len(tokens)} 个模块令牌")
1234
+ elif "error" in result:
1235
+ print(f"[launcher] 警告: 令牌生成失败: {result['error'].get('message', '')}")
1236
+ except Exception as e:
1237
+ print(f"[launcher] 警告: 生成模块令牌失败: {e}")
1134
1238
 
1135
- async def _register_tokens_to_registry(self, tokens: dict):
1136
- """Register token mapping to Registry via POST /tokens."""
1137
- url = f"http://127.0.0.1:{self.registry_port}/tokens"
1138
- headers = {"Authorization": f"Bearer {self.kite_token}"}
1239
+ async def _register_new_tokens(self, tokens: dict):
1240
+ """Register new token mapping to Kernel via RPC kernel.register_tokens."""
1241
+ if not self._ws or not tokens:
1242
+ return
1139
1243
  try:
1140
- client = self._get_http()
1141
- resp = await client.post(url, json=tokens, headers=headers)
1142
- if resp.status_code == 200:
1244
+ result = await self._rpc_call(self._ws, "kernel.register_tokens", tokens)
1245
+ if result.get("result", {}).get("ok"):
1143
1246
  print(f"[launcher] 已注册 {len(tokens)} 个模块令牌")
1144
- else:
1145
- print(f"[launcher] 警告: 令牌注册返回 {resp.status_code}")
1247
+ elif "error" in result:
1248
+ print(f"[launcher] 警告: 令牌注册失败: {result['error'].get('message', '')}")
1146
1249
  except Exception as e:
1147
1250
  print(f"[launcher] 警告: 注册模块令牌失败: {e}")
1148
1251
 
1149
1252
  # ── Validation ──
1150
1253
 
1151
1254
  def _validate_core_modules(self):
1152
- """Validate core modules exist (mechanism 12)."""
1255
+ """Validate core modules exist."""
1153
1256
  project_root = os.environ["KITE_PROJECT"]
1154
- for name in ("registry", "event_hub"):
1155
- mod_dir = os.path.join(project_root, "core", name)
1156
- md_path = os.path.join(mod_dir, "module.md")
1157
- if not os.path.isdir(mod_dir):
1158
- print(f"[launcher] 致命: 核心模块 '{name}' 目录未找到: {mod_dir}")
1159
- sys.exit(1)
1160
- if not os.path.isfile(md_path):
1161
- print(f"[launcher] 致命: 核心模块 '{name}' 缺少 module.md: {md_path}")
1162
- sys.exit(1)
1163
- # Try to parse frontmatter
1164
- try:
1165
- with open(md_path, "r", encoding="utf-8") as f:
1166
- fm = _parse_frontmatter(f.read())
1167
- if not fm:
1168
- print(f"[launcher] 致命: 核心模块 '{name}' module.md 没有有效的 frontmatter")
1169
- sys.exit(1)
1170
- except Exception as e:
1171
- print(f"[launcher] 致命: 核心模块 '{name}' module.md 解析错误: {e}")
1257
+ mod_dir = os.path.join(project_root, "kernel")
1258
+ md_path = os.path.join(mod_dir, "module.md")
1259
+ if not os.path.isdir(mod_dir):
1260
+ print(f"[launcher] 致命: 核心模块 'kernel' 目录未找到: {mod_dir}")
1261
+ sys.exit(1)
1262
+ if not os.path.isfile(md_path):
1263
+ print(f"[launcher] 致命: 核心模块 'kernel' 缺少 module.md: {md_path}")
1264
+ sys.exit(1)
1265
+ try:
1266
+ with open(md_path, "r", encoding="utf-8") as f:
1267
+ fm = _parse_frontmatter(f.read())
1268
+ if not fm:
1269
+ print(f"[launcher] 致命: 核心模块 'kernel' module.md 没有有效的 frontmatter")
1172
1270
  sys.exit(1)
1173
-
1174
- # ── API thread ──
1175
-
1176
- def _start_api_thread(self):
1177
- """Start the Launcher API server in a separate thread with OS-assigned port."""
1178
- self.api_port = self._get_free_port()
1179
- config = uvicorn.Config(
1180
- self._app,
1181
- host="127.0.0.1",
1182
- port=self.api_port,
1183
- log_level="warning",
1184
- )
1185
- self._api_server = uvicorn.Server(config)
1186
-
1187
- def _run():
1188
- self._api_server.run()
1189
-
1190
- t = threading.Thread(target=_run, daemon=True)
1191
- t.start()
1192
-
1193
- deadline = time.time() + 5
1194
- while time.time() < deadline:
1195
- if self._api_server.started:
1196
- break
1197
- time.sleep(0.05)
1198
- else:
1199
- print("[launcher] 警告: API 服务器可能尚未完全就绪")
1200
-
1201
- print(f"[launcher] API 服务器已启动,端口 {self.api_port}")
1271
+ except Exception as e:
1272
+ print(f"[launcher] 致命: 核心模块 'kernel' module.md 解析错误: {e}")
1273
+ sys.exit(1)
1202
1274
 
1203
1275
  # ── Module crash summary ──
1204
1276
 
@@ -1296,29 +1368,23 @@ class Launcher:
1296
1368
  pass
1297
1369
 
1298
1370
  async def _full_restart(self):
1299
- """Stop all modules, regenerate tokens, re-run Phase 1-4 (mechanism 10)."""
1371
+ """Stop all modules, regenerate tokens, re-run Phase 1-2."""
1300
1372
  print("[launcher] 全量重启: 正在停止所有模块...")
1301
1373
 
1302
1374
  # Persist records before shutdown so cleanup_leftovers can find survivors
1303
1375
  self.process_manager.persist_records()
1304
1376
 
1305
- # Disconnect Event Hub WS
1377
+ # Disconnect Kernel WS
1306
1378
  if self._ws_task:
1307
1379
  self._ws_task.cancel()
1308
1380
  self._ws_task = None
1309
- if hasattr(self, '_heartbeat_task') and self._heartbeat_task:
1310
- self._heartbeat_task.cancel()
1311
- self._heartbeat_task = None
1312
1381
  self._ws = None
1313
- self._event_hub_ws_url = ""
1314
- self._launcher_ws_token = ""
1382
+ self._rpc_waiters.clear()
1383
+ self._rpc_results.clear()
1315
1384
 
1316
1385
  await self._graceful_shutdown_all()
1317
1386
 
1318
1387
  # Cleanup any leftover processes that survived graceful shutdown.
1319
- # Note: _graceful_shutdown_all() clears _processes/_records dicts, but
1320
- # cleanup_leftovers() reads from processes.json (persisted above), so it can
1321
- # still find and kill survivors.
1322
1388
  self.process_manager.cleanup_leftovers()
1323
1389
 
1324
1390
  self._module_tokens.clear()
@@ -1327,13 +1393,11 @@ class Launcher:
1327
1393
  self.kite_token = secrets.token_hex(32)
1328
1394
  self.process_manager.kite_token = self.kite_token
1329
1395
 
1330
- print("[launcher] 全量重启: 重新执行 Phase 1-4...")
1396
+ print("[launcher] 全量重启: 重新执行 Phase 1-2...")
1331
1397
  try:
1332
- await self._phase1_parallel_bootstrap()
1333
- await self._phase3_registry_ready()
1334
- await self._phase4_start_modules()
1398
+ await self._phase1_start_kernel()
1399
+ await self._phase2_start_modules()
1335
1400
  self.process_manager.persist_records()
1336
- self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
1337
1401
  print("[launcher] 全量重启完成,恢复监控循环")
1338
1402
  await self._monitor_loop()
1339
1403
  except Exception as e:
@@ -1342,14 +1406,12 @@ class Launcher:
1342
1406
  # ── Shutdown ──
1343
1407
 
1344
1408
  def _final_cleanup(self):
1345
- """Called on exit — stop all processes, stop API, clear records."""
1409
+ """Called on exit — stop all processes, clear records."""
1346
1410
  try:
1347
1411
  print("[launcher] 正在执行最终清理...")
1348
1412
 
1349
1413
  if self._ws_task:
1350
1414
  self._ws_task.cancel()
1351
- if hasattr(self, '_heartbeat_task') and self._heartbeat_task:
1352
- self._heartbeat_task.cancel()
1353
1415
 
1354
1416
  # Note: _graceful_shutdown_all() already called stop_all() in _async_main finally block.
1355
1417
  # This is just a safety check — should normally find nothing.
@@ -1361,9 +1423,6 @@ class Launcher:
1361
1423
  else:
1362
1424
  print("[launcher] 无残留进程")
1363
1425
 
1364
- if self._api_server:
1365
- self._api_server.should_exit = True
1366
-
1367
1426
  # Clear instance runtime files
1368
1427
  try:
1369
1428
  os.remove(self.process_manager.records_path)
@@ -1374,7 +1433,13 @@ class Launcher:
1374
1433
  finally:
1375
1434
  # Signal the safety-net thread that normal shutdown has completed
1376
1435
  self._shutdown_complete.set()
1377
- print("[launcher] 再见。")
1436
+
1437
+ # Calculate and display shutdown time
1438
+ if self._shutdown_start_time > 0:
1439
+ shutdown_elapsed = time.monotonic() - self._shutdown_start_time
1440
+ print(f"[launcher] 再见。(退出耗时: {shutdown_elapsed:.2f}s)")
1441
+ else:
1442
+ print("[launcher] 再见。")
1378
1443
 
1379
1444
  if IS_WINDOWS:
1380
1445
  os._exit(0)
@@ -1403,11 +1468,8 @@ class Launcher:
1403
1468
  else:
1404
1469
  stopped.append((name, info))
1405
1470
 
1406
- # Calculate kernel startup time (Phase 1+2+3)
1407
- kernel_time = 0
1408
- for phase_name in ["Phase 1+2: Registry + Event Hub (并行)", "Phase 3: Registry 事件总线"]:
1409
- if phase_name in phase_times:
1410
- kernel_time += phase_times[phase_name]
1471
+ # Calculate kernel startup time (Phase 1)
1472
+ kernel_time = phase_times.get("Phase 1: Kernel", 0)
1411
1473
 
1412
1474
  lines = [
1413
1475
  "",
@@ -1422,16 +1484,15 @@ class Launcher:
1422
1484
 
1423
1485
  # Kernel modules section
1424
1486
  lines.append(f"{G} 内核模块:{R}")
1425
- for phase_name in ["Phase 1+2: Registry + Event Hub (并行)", "Phase 3: Registry 事件总线"]:
1426
- if phase_name in phase_times:
1427
- elapsed = phase_times[phase_name]
1428
- lines.append(f"{G} {phase_name:<26s} {elapsed:>6.2f}s{R}")
1487
+ if "Phase 1: Kernel" in phase_times:
1488
+ elapsed = phase_times["Phase 1: Kernel"]
1489
+ lines.append(f"{G} {'Phase 1: Kernel':<26s} {elapsed:>6.2f}s{R}")
1429
1490
 
1430
1491
  # Extension modules section
1431
1492
  lines.append(f"{G} 扩展模块:{R}")
1432
- if "Phase 4: Extensions" in phase_times:
1433
- elapsed = phase_times["Phase 4: Extensions"]
1434
- lines.append(f"{G} {'Phase 4: Extensions':<26s} {elapsed:>6.2f}s{R}")
1493
+ if "Phase 2: Extensions" in phase_times:
1494
+ elapsed = phase_times["Phase 2: Extensions"]
1495
+ lines.append(f"{G} {'Phase 2: Extensions':<26s} {elapsed:>6.2f}s{R}")
1435
1496
 
1436
1497
  # Sort running modules by ready time
1437
1498
  running_sorted = sorted(running, key=lambda x: self._ready_times.get(x[0], float('inf')))
@@ -1471,12 +1532,21 @@ class Launcher:
1471
1532
  es_str = f"{elapsed_from_start:.2f}s"
1472
1533
  else:
1473
1534
  es_str = "—"
1474
- rows.append([label, str(rec.pid), time_str, es_str, f"[{info.type}]"])
1535
+
1536
+ # Check if module timed out (ready_t >= 15s for kernel, >= timeout for others)
1537
+ is_timeout = False
1538
+ if ready_t is not None:
1539
+ if name == "kernel" and ready_t >= 15:
1540
+ is_timeout = True
1541
+ elif name != "kernel" and ready_t >= 15: # Default timeout for other modules
1542
+ is_timeout = True
1543
+
1544
+ rows.append([label, str(rec.pid), time_str, es_str, f"[{info.type}]", is_timeout])
1475
1545
 
1476
1546
  # Calculate column widths: max of header and all data display widths
1477
1547
  col_widths = [_dw(h) for h in headers]
1478
1548
  for row in rows:
1479
- for i, cell in enumerate(row):
1549
+ for i, cell in enumerate(row[:5]): # Only first 5 columns (exclude is_timeout flag)
1480
1550
  col_widths[i] = max(col_widths[i], _dw(cell))
1481
1551
 
1482
1552
  # Render header
@@ -1489,14 +1559,19 @@ class Launcher:
1489
1559
  lines.append(f"{DIM} {' '.join(hdr_parts)}{R}")
1490
1560
 
1491
1561
  # Render data rows
1562
+ RED = "\033[91m"
1492
1563
  for row in rows:
1564
+ is_timeout = row[5] # Last element is the timeout flag
1493
1565
  parts = []
1494
- for i, cell in enumerate(row):
1566
+ for i, cell in enumerate(row[:5]): # Only first 5 columns
1495
1567
  if aligns[i] == 'left':
1496
1568
  parts.append(_rpad(cell, col_widths[i]))
1497
1569
  else:
1498
1570
  parts.append(_lpad(cell, col_widths[i]))
1499
- lines.append(f"{G} ✓ {' '.join(parts)}{R}")
1571
+ if is_timeout:
1572
+ lines.append(f"{RED} ✓ {' '.join(parts)}{R}")
1573
+ else:
1574
+ lines.append(f"{G} ✓ {' '.join(parts)}{R}")
1500
1575
 
1501
1576
  # Exited modules (started but already quit)
1502
1577
  if exited:
@@ -1514,10 +1589,18 @@ class Launcher:
1514
1589
  label = info.display_name or name
1515
1590
  lines.append(f"{G} - {label:<20s} ({info.state}){R}")
1516
1591
 
1517
- lines.append(f"{G} Launcher API: http://127.0.0.1:{self.api_port} 实例: {self.instance_id}{R}")
1592
+ lines.append(f"{G} Kernel WS: ws://127.0.0.1:{self.kernel_port}/ws 实例: {self.instance_id}{R}")
1518
1593
 
1519
- # Query Registry for web module's access URL
1520
- web_url = await self._get_web_url()
1594
+ # Query Kernel for web module's api_endpoint via RPC
1595
+ web_url = ""
1596
+ if self._ws:
1597
+ try:
1598
+ resp = await self._rpc_call(self._ws, "registry.get", {"path": "web.api_endpoint"}, timeout=3)
1599
+ val = resp.get("result", {}).get("value")
1600
+ if val and isinstance(val, str):
1601
+ web_url = val.replace("://127.0.0.1:", "://localhost:")
1602
+ except Exception:
1603
+ pass
1521
1604
  if web_url:
1522
1605
  lines.append(f"{B} Web 管理后台: {web_url}{R}")
1523
1606
 
@@ -1572,29 +1655,11 @@ class Launcher:
1572
1655
 
1573
1656
  print("\n".join(lines))
1574
1657
 
1575
- async def _get_web_url(self) -> str:
1576
- """Query Registry for the web module's api_endpoint. Returns URL or empty string."""
1577
- try:
1578
- client = self._get_http()
1579
- resp = await client.get(
1580
- f"http://127.0.0.1:{self.registry_port}/get/web.api_endpoint",
1581
- headers={"Authorization": f"Bearer {self.kite_token}"},
1582
- timeout=3,
1583
- )
1584
- if resp.status_code == 200:
1585
- val = resp.json()
1586
- if val and isinstance(val, str):
1587
- # Show localhost instead of 127.0.0.1 for friendliness
1588
- return val.replace("://127.0.0.1:", "://localhost:")
1589
- except Exception:
1590
- pass
1591
- return ""
1592
-
1593
1658
  # ── Utilities ──
1594
1659
 
1595
1660
  def _load_discovery(self) -> dict | None:
1596
1661
  """Read discovery config from launcher's own module.md."""
1597
- md_path = os.path.join(os.environ["KITE_PROJECT"], "core", "launcher", "module.md")
1662
+ md_path = os.path.join(os.environ["KITE_PROJECT"], "launcher", "module.md")
1598
1663
  try:
1599
1664
  with open(md_path, "r", encoding="utf-8") as f:
1600
1665
  fm = _parse_frontmatter(f.read())
@@ -1617,214 +1682,6 @@ class Launcher:
1617
1682
  except Exception:
1618
1683
  pass
1619
1684
 
1620
- @staticmethod
1621
- def _get_free_port() -> int:
1622
- """Get a free port assigned by the OS (bind to port 0)."""
1623
- import socket
1624
- with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
1625
- s.bind(("127.0.0.1", 0))
1626
- return s.getsockname()[1]
1627
-
1628
- # ── API app ──
1629
-
1630
- def _create_api_app(self) -> FastAPI:
1631
- """Create the FastAPI app with Launcher management routes."""
1632
- from fastapi import Request, HTTPException
1633
- app = FastAPI(title="Kite Launcher", docs_url=None, redoc_url=None)
1634
- launcher = self
1635
-
1636
- def _require_auth(request: Request):
1637
- """Verify Bearer token and IP whitelist. Raise 401/403 on failure."""
1638
- # IP whitelist: only allow 127.0.0.1
1639
- client_host = request.client.host if request.client else None
1640
- if client_host not in ("127.0.0.1", "::1", "localhost"):
1641
- raise HTTPException(status_code=403, detail="Access denied: only localhost allowed")
1642
-
1643
- # Bearer token verification
1644
- auth = request.headers.get("Authorization", "")
1645
- if not auth.startswith("Bearer "):
1646
- raise HTTPException(status_code=401, detail="Missing or invalid Authorization header")
1647
- token = auth[7:].strip()
1648
- if token != launcher.kite_token:
1649
- raise HTTPException(status_code=401, detail="Invalid token")
1650
-
1651
- @app.get("/launcher/modules")
1652
- async def list_modules(request: Request):
1653
- """List all modules and their current status."""
1654
- _require_auth(request)
1655
- result = []
1656
- for name, info in launcher.modules.items():
1657
- running = launcher.process_manager.is_running(name)
1658
- rec = launcher.process_manager.get_record(name)
1659
- result.append({
1660
- "name": name,
1661
- "display_name": info.display_name,
1662
- "type": info.type,
1663
- "config_state": info.state,
1664
- "desired_state": launcher._desired_states.get(name, "stopped"),
1665
- "actual_state": f"running({rec.pid})" if running and rec else "stopped",
1666
- "pid": rec.pid if running and rec else None,
1667
- "monitor": info.monitor,
1668
- })
1669
- return result
1670
-
1671
- @app.post("/launcher/modules/{name}/start")
1672
- async def start_module(name: str, request: Request):
1673
- """Start a module by name."""
1674
- _require_auth(request)
1675
- info = launcher.modules.get(name)
1676
- if not info:
1677
- raise HTTPException(404, f"Module '{name}' not found")
1678
- if info.state == "disabled":
1679
- raise HTTPException(403, f"Module '{name}' is disabled")
1680
-
1681
- if name not in launcher._module_tokens:
1682
- launcher._module_tokens[name] = secrets.token_hex(32)
1683
- try:
1684
- client = launcher._get_http()
1685
- await client.post(
1686
- f"http://127.0.0.1:{launcher.registry_port}/tokens",
1687
- json={name: launcher._module_tokens[name]},
1688
- headers={"Authorization": f"Bearer {launcher.kite_token}"},
1689
- )
1690
- except Exception as e:
1691
- print(f"[launcher] 警告: 注册 {name} 的令牌失败: {e}")
1692
-
1693
- token = launcher._module_tokens[name]
1694
- boot_info = {"token": token}
1695
- ok = launcher.process_manager.start_module(info, boot_info=boot_info)
1696
- if ok:
1697
- launcher._desired_states[name] = "running"
1698
- launcher.process_manager.persist_records()
1699
- rec = launcher.process_manager.get_record(name)
1700
- launcher._log_lifecycle("started", name, pid=rec.pid if rec else None, via="api")
1701
- launcher._publish_event_threadsafe("module.started", {"module_id": name})
1702
- return {"status": "started", "name": name}
1703
- launcher._log_lifecycle("start_failed", name, via="api")
1704
- raise HTTPException(500, f"Failed to start '{name}'")
1705
-
1706
- @app.post("/launcher/modules/{name}/stop")
1707
- async def stop_module(name: str, request: Request, body: dict = None):
1708
- """Stop a module with graceful shutdown."""
1709
- _require_auth(request)
1710
- info = launcher.modules.get(name)
1711
- if not info:
1712
- raise HTTPException(404, f"Module '{name}' not found")
1713
- reason = (body or {}).get("reason", "stop_requested")
1714
- launcher._desired_states[name] = "stopped"
1715
- await launcher._graceful_stop(name, reason)
1716
- launcher.process_manager.persist_records()
1717
- return {"status": "stopped", "name": name}
1718
-
1719
- @app.post("/launcher/modules/{name}/restart")
1720
- async def restart_module(name: str, request: Request, body: dict = None):
1721
- """Restart a module (stop + start)."""
1722
- _require_auth(request)
1723
- info = launcher.modules.get(name)
1724
- if not info:
1725
- raise HTTPException(404, f"Module '{name}' not found")
1726
- if info.state == "disabled":
1727
- raise HTTPException(403, f"Module '{name}' is disabled")
1728
- reason = (body or {}).get("reason", "restart")
1729
- await launcher._graceful_stop(name, reason)
1730
- launcher._module_tokens[name] = secrets.token_hex(32)
1731
- try:
1732
- client = launcher._get_http()
1733
- await client.post(
1734
- f"http://127.0.0.1:{launcher.registry_port}/tokens",
1735
- json={name: launcher._module_tokens[name]},
1736
- headers={"Authorization": f"Bearer {launcher.kite_token}"},
1737
- )
1738
- except Exception:
1739
- pass
1740
- token = launcher._module_tokens[name]
1741
- boot_info = {"token": token}
1742
- ok = launcher.process_manager.start_module(info, boot_info=boot_info)
1743
- if ok:
1744
- launcher._desired_states[name] = "running"
1745
- launcher.process_manager.persist_records()
1746
- rec = launcher.process_manager.get_record(name)
1747
- launcher._log_lifecycle("started", name, pid=rec.pid if rec else None, via="restart_api")
1748
- launcher._publish_event_threadsafe("module.started", {"module_id": name})
1749
- return {"status": "restarted", "name": name}
1750
- launcher._log_lifecycle("start_failed", name, via="restart_api")
1751
- raise HTTPException(500, f"Failed to restart '{name}'")
1752
-
1753
- @app.post("/launcher/rescan")
1754
- async def rescan_modules(request: Request):
1755
- """Rescan module directories for new/removed modules."""
1756
- _require_auth(request)
1757
- old_names = set(launcher.modules.keys())
1758
- launcher.modules = launcher.module_scanner.scan()
1759
- new_names = set(launcher.modules.keys())
1760
- added = list(new_names - old_names)
1761
- removed = list(old_names - new_names)
1762
- for name in added:
1763
- info = launcher.modules[name]
1764
- launcher._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
1765
- for name in added:
1766
- info = launcher.modules[name]
1767
- launcher._desired_states[name] = "running" if info.state == "enabled" else "stopped"
1768
- if added:
1769
- new_tokens = {}
1770
- for name in added:
1771
- launcher._module_tokens[name] = secrets.token_hex(32)
1772
- new_tokens[name] = launcher._module_tokens[name]
1773
- try:
1774
- client = launcher._get_http()
1775
- await client.post(
1776
- f"http://127.0.0.1:{launcher.registry_port}/tokens",
1777
- json=new_tokens,
1778
- headers={"Authorization": f"Bearer {launcher.kite_token}"},
1779
- )
1780
- except Exception:
1781
- pass
1782
- return {"added": added, "removed": removed, "total": len(launcher.modules)}
1783
-
1784
- @app.post("/launcher/shutdown")
1785
- async def shutdown_launcher(request: Request, body: dict = None):
1786
- """Shutdown the entire Kite system (equivalent to Ctrl+C)."""
1787
- _require_auth(request)
1788
- reason = (body or {}).get("reason", "api_request")
1789
- launcher._request_shutdown(f"API shutdown request: {reason}")
1790
- return {"status": "shutting_down", "reason": reason}
1791
-
1792
- @app.put("/launcher/modules/{name}/state")
1793
- async def update_state(name: str, request: Request, body: dict):
1794
- """Update module state (enabled/manual/disabled). Writes to module.md."""
1795
- _require_auth(request)
1796
- info = launcher.modules.get(name)
1797
- if not info:
1798
- raise HTTPException(404, f"Module '{name}' not found")
1799
-
1800
- new_state = body.get("state", "")
1801
- if new_state not in ("enabled", "manual", "disabled"):
1802
- raise HTTPException(400, "state must be enabled, manual, or disabled")
1803
-
1804
- if info.is_core() and new_state == "disabled":
1805
- raise HTTPException(403, "Core modules cannot be disabled")
1806
-
1807
- old_state = info.state
1808
- info.state = new_state
1809
-
1810
- if new_state == "enabled":
1811
- launcher._desired_states[name] = "running"
1812
- else:
1813
- launcher._desired_states[name] = "stopped"
1814
-
1815
- _update_module_md_state(info.module_dir, new_state)
1816
- launcher._publish_event_threadsafe("module.state_changed", {
1817
- "module_id": name,
1818
- "old_state": old_state,
1819
- "new_state": new_state,
1820
- })
1821
- return {
1822
- "name": name,
1823
- "old_state": old_state,
1824
- "new_state": new_state,
1825
- }
1826
-
1827
- return app
1828
1685
 
1829
1686
 
1830
1687
  def _update_module_md_state(module_dir: str, new_state: str):
@@ -1850,3 +1707,72 @@ def _update_module_md_state(module_dir: str, new_state: str):
1850
1707
  f.write(updated)
1851
1708
  except Exception as e:
1852
1709
  print(f"[launcher] 警告: 更新 module.md 状态失败: {e}")
1710
+
1711
+
1712
+ def start_launcher():
1713
+ """Entry point called from main.py. Sets up environment and starts launcher."""
1714
+ # Load .env (development convenience)
1715
+ try:
1716
+ from dotenv import load_dotenv
1717
+ load_dotenv()
1718
+ except ImportError:
1719
+ pass
1720
+
1721
+ # Resolve project root
1722
+ project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
1723
+
1724
+ # Home base for Kite data
1725
+ home = os.environ.get("HOME") or os.environ.get("USERPROFILE") or os.path.expanduser("~")
1726
+ kite_home = os.path.join(home, ".kite")
1727
+
1728
+ # Set KITE_* defaults
1729
+ defaults = {
1730
+ "KITE_PROJECT": project_root,
1731
+ "KITE_CWD": os.getcwd(),
1732
+ "KITE_WORKSPACE": os.path.join(kite_home, "workspace"),
1733
+ "KITE_DATA": os.path.join(kite_home, "data"),
1734
+ "KITE_MODULES": os.path.join(kite_home, "modules"),
1735
+ "KITE_REPO": os.path.join(kite_home, "repo"),
1736
+ "KITE_ENV": "development",
1737
+ }
1738
+ for key, value in defaults.items():
1739
+ if not os.environ.get(key):
1740
+ os.environ[key] = value
1741
+
1742
+ # Parse CLI args
1743
+ if "--debug" in sys.argv:
1744
+ os.environ["KITE_DEBUG"] = "1"
1745
+ sys.argv.remove("--debug")
1746
+
1747
+ # Setup logging
1748
+ from .logging_setup import (
1749
+ setup_timestamped_print,
1750
+ init_log_files,
1751
+ setup_exception_hooks,
1752
+ reset_time_baseline,
1753
+ write_crash_handled
1754
+ )
1755
+ setup_timestamped_print()
1756
+ reset_time_baseline()
1757
+
1758
+ print("[launcher] Kite 启动中...")
1759
+
1760
+ # Create and run launcher
1761
+ token = secrets.token_hex(32)
1762
+ launcher = Launcher(kite_token=token)
1763
+ print("[launcher] 启动器实例已创建")
1764
+
1765
+ # Initialize log files (KITE_MODULE_DATA is now set)
1766
+ init_log_files()
1767
+ setup_exception_hooks()
1768
+
1769
+ log_dir = os.path.join(os.environ.get("KITE_MODULE_DATA", ""), "log")
1770
+ suffix = launcher.process_manager.instance_suffix
1771
+ latest_log = os.path.join(log_dir, f"latest{suffix}.log")
1772
+ print(f"[launcher] 日志: {latest_log}")
1773
+
1774
+ try:
1775
+ launcher.run()
1776
+ except Exception as e:
1777
+ write_crash_handled(type(e), e, e.__traceback__)
1778
+ sys.exit(1)