@agentunion/kite 1.0.6 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. package/cli.js +127 -25
  2. package/core/event_hub/entry.py +384 -61
  3. package/core/event_hub/hub.py +8 -0
  4. package/core/event_hub/module.md +0 -1
  5. package/core/event_hub/server.py +169 -38
  6. package/core/kite_log.py +241 -0
  7. package/core/launcher/entry.py +1306 -425
  8. package/core/launcher/module_scanner.py +10 -9
  9. package/core/launcher/process_manager.py +555 -121
  10. package/core/registry/entry.py +335 -30
  11. package/core/registry/server.py +339 -256
  12. package/core/registry/store.py +13 -2
  13. package/extensions/agents/__init__.py +1 -0
  14. package/extensions/agents/assistant/__init__.py +1 -0
  15. package/extensions/agents/assistant/entry.py +380 -0
  16. package/extensions/agents/assistant/module.md +22 -0
  17. package/extensions/agents/assistant/server.py +236 -0
  18. package/extensions/channels/__init__.py +1 -0
  19. package/extensions/channels/acp_channel/__init__.py +1 -0
  20. package/extensions/channels/acp_channel/entry.py +380 -0
  21. package/extensions/channels/acp_channel/module.md +22 -0
  22. package/extensions/channels/acp_channel/server.py +236 -0
  23. package/{core → extensions}/event_hub_bench/entry.py +664 -371
  24. package/{core → extensions}/event_hub_bench/module.md +4 -2
  25. package/extensions/services/backup/__init__.py +1 -0
  26. package/extensions/services/backup/entry.py +380 -0
  27. package/extensions/services/backup/module.md +22 -0
  28. package/extensions/services/backup/server.py +244 -0
  29. package/extensions/services/model_service/__init__.py +1 -0
  30. package/extensions/services/model_service/entry.py +380 -0
  31. package/extensions/services/model_service/module.md +22 -0
  32. package/extensions/services/model_service/server.py +236 -0
  33. package/extensions/services/watchdog/entry.py +460 -143
  34. package/extensions/services/watchdog/module.md +3 -0
  35. package/extensions/services/watchdog/monitor.py +128 -13
  36. package/extensions/services/watchdog/server.py +75 -13
  37. package/extensions/services/web/__init__.py +1 -0
  38. package/extensions/services/web/config.yaml +149 -0
  39. package/extensions/services/web/entry.py +487 -0
  40. package/extensions/services/web/module.md +24 -0
  41. package/extensions/services/web/routes/__init__.py +1 -0
  42. package/extensions/services/web/routes/routes_call.py +189 -0
  43. package/extensions/services/web/routes/routes_config.py +512 -0
  44. package/extensions/services/web/routes/routes_contacts.py +98 -0
  45. package/extensions/services/web/routes/routes_devlog.py +99 -0
  46. package/extensions/services/web/routes/routes_phone.py +81 -0
  47. package/extensions/services/web/routes/routes_sms.py +48 -0
  48. package/extensions/services/web/routes/routes_stats.py +17 -0
  49. package/extensions/services/web/routes/routes_voicechat.py +554 -0
  50. package/extensions/services/web/routes/schemas.py +216 -0
  51. package/extensions/services/web/server.py +332 -0
  52. package/extensions/services/web/static/css/style.css +1064 -0
  53. package/extensions/services/web/static/index.html +1445 -0
  54. package/extensions/services/web/static/js/app.js +4671 -0
  55. package/extensions/services/web/vendor/__init__.py +1 -0
  56. package/extensions/services/web/vendor/bluetooth/audio.py +348 -0
  57. package/extensions/services/web/vendor/bluetooth/contacts.py +251 -0
  58. package/extensions/services/web/vendor/bluetooth/manager.py +395 -0
  59. package/extensions/services/web/vendor/bluetooth/sms.py +290 -0
  60. package/extensions/services/web/vendor/bluetooth/telephony.py +274 -0
  61. package/extensions/services/web/vendor/config.py +139 -0
  62. package/extensions/services/web/vendor/conversation/__init__.py +0 -0
  63. package/extensions/services/web/vendor/conversation/asr.py +936 -0
  64. package/extensions/services/web/vendor/conversation/engine.py +548 -0
  65. package/extensions/services/web/vendor/conversation/llm.py +534 -0
  66. package/extensions/services/web/vendor/conversation/mcp_tools.py +190 -0
  67. package/extensions/services/web/vendor/conversation/tts.py +322 -0
  68. package/extensions/services/web/vendor/conversation/vad.py +138 -0
  69. package/extensions/services/web/vendor/storage/__init__.py +1 -0
  70. package/extensions/services/web/vendor/storage/identity.py +312 -0
  71. package/extensions/services/web/vendor/storage/store.py +507 -0
  72. package/extensions/services/web/vendor/task/__init__.py +0 -0
  73. package/extensions/services/web/vendor/task/manager.py +864 -0
  74. package/extensions/services/web/vendor/task/models.py +45 -0
  75. package/extensions/services/web/vendor/task/webhook.py +263 -0
  76. package/extensions/services/web/vendor/tools/__init__.py +0 -0
  77. package/extensions/services/web/vendor/tools/registry.py +321 -0
  78. package/main.py +344 -4
  79. package/package.json +11 -2
  80. package/core/__pycache__/__init__.cpython-313.pyc +0 -0
  81. package/core/__pycache__/data_dir.cpython-313.pyc +0 -0
  82. package/core/data_dir.py +0 -62
  83. package/core/event_hub/__pycache__/__init__.cpython-313.pyc +0 -0
  84. package/core/event_hub/__pycache__/bench.cpython-313.pyc +0 -0
  85. package/core/event_hub/__pycache__/bench_perf.cpython-313.pyc +0 -0
  86. package/core/event_hub/__pycache__/dedup.cpython-313.pyc +0 -0
  87. package/core/event_hub/__pycache__/entry.cpython-313.pyc +0 -0
  88. package/core/event_hub/__pycache__/hub.cpython-313.pyc +0 -0
  89. package/core/event_hub/__pycache__/router.cpython-313.pyc +0 -0
  90. package/core/event_hub/__pycache__/server.cpython-313.pyc +0 -0
  91. package/core/event_hub/bench_results/2026-02-28_13-26-48.json +0 -51
  92. package/core/event_hub/bench_results/2026-02-28_13-44-45.json +0 -51
  93. package/core/event_hub/bench_results/2026-02-28_13-45-39.json +0 -51
  94. package/core/launcher/__pycache__/__init__.cpython-313.pyc +0 -0
  95. package/core/launcher/__pycache__/entry.cpython-313.pyc +0 -0
  96. package/core/launcher/__pycache__/module_scanner.cpython-313.pyc +0 -0
  97. package/core/launcher/__pycache__/process_manager.cpython-313.pyc +0 -0
  98. package/core/launcher/data/log/lifecycle.jsonl +0 -1158
  99. package/core/launcher/data/token.txt +0 -1
  100. package/core/registry/__pycache__/__init__.cpython-313.pyc +0 -0
  101. package/core/registry/__pycache__/entry.cpython-313.pyc +0 -0
  102. package/core/registry/__pycache__/server.cpython-313.pyc +0 -0
  103. package/core/registry/__pycache__/store.cpython-313.pyc +0 -0
  104. package/core/registry/data/port.txt +0 -1
  105. package/core/registry/data/port_484.txt +0 -1
  106. package/extensions/__pycache__/__init__.cpython-313.pyc +0 -0
  107. package/extensions/services/__pycache__/__init__.cpython-313.pyc +0 -0
  108. package/extensions/services/watchdog/__pycache__/__init__.cpython-313.pyc +0 -0
  109. package/extensions/services/watchdog/__pycache__/entry.cpython-313.pyc +0 -0
  110. package/extensions/services/watchdog/__pycache__/monitor.cpython-313.pyc +0 -0
  111. package/extensions/services/watchdog/__pycache__/server.cpython-313.pyc +0 -0
  112. /package/{core/event_hub/bench_results/.gitkeep → extensions/services/web/vendor/bluetooth/__init__.py} +0 -0
@@ -4,8 +4,16 @@ Launcher — the core of Kite. Manages module lifecycle, exposes API, monitors p
4
4
  Thread model:
5
5
  - Main thread: asyncio event loop (process management + monitor loop)
6
6
  - API thread: independent thread running uvicorn + FastAPI
7
- - stdout threads: one daemon thread per child process
7
+ - stdout threads: one daemon thread per child process (ProcessManager)
8
8
  - (Windows) keyboard listener thread: polls for 'q' key
9
+
10
+ 4-Phase startup:
11
+ Phase 1: Registry + Event Hub (parallel start) → Registry stdout port → stdin broadcast port to Event Hub
12
+ → API → register self + tokens → stdin launcher_ws_token to Event Hub
13
+ → stdout ws_endpoint → WS connect → module.ready
14
+ Phase 2: (reserved — Event Hub ready handled in Phase 1)
15
+ Phase 3: Registry delayed ready (Event Hub → Registry → Event Hub WS → module.ready)
16
+ Phase 4: start remaining enabled modules in topo order
9
17
  """
10
18
 
11
19
  import asyncio
@@ -22,25 +30,43 @@ import httpx
22
30
  import uvicorn
23
31
  import websockets
24
32
  from fastapi import FastAPI, HTTPException
25
- from fastapi.responses import JSONResponse
26
33
 
27
- from .module_scanner import ModuleScanner, ModuleInfo, _parse_frontmatter
34
+ from .module_scanner import ModuleScanner, ModuleInfo, LaunchConfig, _parse_frontmatter
28
35
  from .process_manager import ProcessManager
29
- from core.data_dir import get_launcher_data_dir
30
36
 
31
37
  IS_WINDOWS = sys.platform == "win32"
32
38
 
39
+ # Shutdown timeout constants (seconds)
40
+ SHUTDOWN_TIMEOUT_NON_GRACEFUL = 5 # Non-graceful modules or no ack response
41
+ SHUTDOWN_TIMEOUT_PARTIAL = 3 # Graceful module ack'd but no ready
42
+ SHUTDOWN_TIMEOUT_READY = 1 # Graceful module sent ready (cleanup done)
43
+ SHUTDOWN_TIMEOUT_BULK = 3 # Bulk stop_all() safety net
44
+
45
+ # Core module names that are started in Phase 1-2 (not Phase 4)
46
+ CORE_MODULE_NAMES = {"registry", "event_hub"}
47
+
48
+ WATCHDOG_MODULE_NAME = "watchdog"
49
+
33
50
 
34
51
  class Launcher:
35
52
  """Kite system entry point. Starts Registry, manages modules, exposes API."""
36
53
 
37
54
  def __init__(self, kite_token: str):
38
55
  self.kite_token = kite_token
39
- self.project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
40
56
  self.instance_id = str(os.getpid())
41
- self.process_manager = ProcessManager(self.project_root, kite_token, self.instance_id)
57
+ os.environ["KITE_INSTANCE"] = self.instance_id
58
+
59
+ # Resolve instance workspace (must happen before ProcessManager init)
60
+ self._resolve_instance_dir()
61
+ os.environ["KITE_MODULE_DATA"] = os.path.join(
62
+ os.environ["KITE_INSTANCE_DIR"], "launcher",
63
+ )
64
+
65
+ self.process_manager = ProcessManager(
66
+ kite_token, self.instance_id,
67
+ on_kite_message=self._on_kite_message,
68
+ )
42
69
  self.module_scanner = ModuleScanner(
43
- self.project_root,
44
70
  discovery=self._load_discovery(),
45
71
  )
46
72
 
@@ -49,36 +75,158 @@ class Launcher:
49
75
  self.modules: dict[str, ModuleInfo] = {}
50
76
  self._shutdown_event = asyncio.Event()
51
77
  self._thread_shutdown = threading.Event()
78
+ self._shutdown_complete = threading.Event() # Set when normal shutdown finishes
52
79
  self._api_server: uvicorn.Server | None = None
53
80
  self._api_ready = threading.Event()
54
- self._fail_counts: dict[str, int] = {} # module_name -> consecutive failure count
55
81
  self._module_tokens: dict[str, str] = {} # module_name -> per-module token
56
82
 
57
83
  # Three-layer state model: desired_state per module
58
- # Initialized from config_state: enabled→running, manual→stopped, disabled→stopped
59
84
  self._desired_states: dict[str, str] = {} # module_name -> "running" | "stopped"
60
85
 
61
86
  # Event Hub WebSocket client
62
87
  self._event_hub_ws_url: str = ""
88
+ self._launcher_ws_token: str = ""
63
89
  self._ws: object | None = None
64
90
  self._ws_task: asyncio.Task | None = None
65
91
  self._loop: asyncio.AbstractEventLoop | None = None
66
92
 
67
93
  # Event waiters: {event_key: (asyncio.Event, data_dict)}
68
- # event_key format: "event_type:module_id"
69
94
  self._event_waiters: dict[str, tuple[asyncio.Event, dict]] = {}
70
95
 
71
- self._lifecycle_log = os.path.join(
72
- get_launcher_data_dir(), "lifecycle.jsonl",
73
- )
96
+ # Module ready times: module_name -> seconds from start to ready
97
+ self._ready_times: dict[str, float] = {}
98
+
99
+ # Shared HTTP client for Registry communication (lazy-init, reuses TCP connections)
100
+ self._http: httpx.AsyncClient | None = None
101
+
102
+ # Module exit reasons: module_name -> reason string (for modules that sent module.exiting)
103
+ self._exit_reasons: dict[str, str] = {}
104
+
105
+ # Graceful shutdown capability: module_name -> True if module declared support
106
+ # Registry and Event Hub default to True (they start before Watchdog can observe)
107
+ self._graceful_modules: dict[str, bool] = {"registry": True, "event_hub": True}
108
+
109
+ # System-wide shutdown flag: prevents Watchdog restart during shutdown
110
+ self._system_shutting_down = False
111
+
112
+ # Kite stdout message waiters: {waiter_key: (threading.Event, data_dict)}
113
+ # Used by ProcessManager stdout callback (cross-thread)
114
+ self._msg_waiters: dict[str, tuple[threading.Event, dict]] = {}
115
+
116
+ suffix = self.process_manager.instance_suffix
117
+ state_dir = os.path.join(os.environ["KITE_INSTANCE_DIR"], "launcher", "state")
118
+ os.makedirs(state_dir, exist_ok=True)
119
+ self._lifecycle_log = os.path.join(state_dir, f"lifecycle{suffix}.jsonl")
120
+ # Clear lifecycle log on startup (like latest.log)
121
+ try:
122
+ with open(self._lifecycle_log, "w", encoding="utf-8") as f:
123
+ pass
124
+ except Exception:
125
+ pass
126
+ os.environ["KITE_INSTANCE_SUFFIX"] = suffix
74
127
  self._app = self._create_api_app()
75
128
 
129
+ @staticmethod
130
+ def _fmt_elapsed(seconds: float) -> str:
131
+ """Format elapsed seconds: <1s → 'NNNms', >=1s → 'N.Ns', >=10s → 'NNs'."""
132
+ if seconds < 1:
133
+ return f"{seconds * 1000:.0f}ms"
134
+ if seconds < 10:
135
+ return f"{seconds:.1f}s"
136
+ return f"{seconds:.0f}s"
137
+
138
+ # ── Instance workspace resolution ──
139
+
140
+ @staticmethod
141
+ def _resolve_instance_dir():
142
+ """Resolve KITE_INSTANCE_DIR from KITE_WORKSPACE + KITE_CWD.
143
+ Algorithm: take CWD basename, find matching dir in workspace via .cwd file,
144
+ or create new one. Sets KITE_INSTANCE_DIR env var.
145
+ """
146
+ if os.environ.get("KITE_INSTANCE_DIR"):
147
+ return # already set (e.g. by tests or parent)
148
+
149
+ cwd = os.environ.get("KITE_CWD", os.getcwd())
150
+ workspace = os.environ.get("KITE_WORKSPACE", "")
151
+ if not workspace:
152
+ home = os.environ.get("HOME") or os.environ.get("USERPROFILE") or os.path.expanduser("~")
153
+ workspace = os.path.join(home, ".kite", "workspace")
154
+ os.environ["KITE_WORKSPACE"] = workspace
155
+
156
+ basename = os.path.basename(cwd.rstrip(os.sep)) or "default"
157
+ suffix = 0
158
+
159
+ while True:
160
+ name = basename if suffix == 0 else f"{basename}~{suffix}"
161
+ candidate = os.path.join(workspace, name)
162
+ cwd_file = os.path.join(candidate, ".cwd")
163
+
164
+ if not os.path.exists(candidate):
165
+ # Empty slot — create new workspace
166
+ os.makedirs(candidate, exist_ok=True)
167
+ with open(cwd_file, "w", encoding="utf-8") as f:
168
+ f.write(cwd)
169
+ os.environ["KITE_INSTANCE_DIR"] = candidate
170
+ return
171
+
172
+ if os.path.isfile(cwd_file):
173
+ try:
174
+ with open(cwd_file, "r", encoding="utf-8") as f:
175
+ if f.read().strip() == cwd:
176
+ os.environ["KITE_INSTANCE_DIR"] = candidate
177
+ return
178
+ except Exception:
179
+ pass
180
+
181
+ suffix += 1
182
+
183
+ # ── Kite stdout message callback ──
184
+
185
def _on_kite_message(self, module_name: str, msg: dict):
    """Hand a kite stdout message to the waiter registered for it, if any.

    Invoked from the ProcessManager stdout reader thread. The waiter is
    looked up in ``_msg_waiters`` under ``"<module_name>:<kite type>"``; when
    one exists, the message is merged into its data dict and its
    ``threading.Event`` is set. Messages nobody waits for are ignored.
    """
    entry = self._msg_waiters.get(f"{module_name}:{msg.get('kite', '')}")
    if not entry:
        return
    event, payload = entry
    payload.update(msg)
    event.set()
195
+
196
+ async def _wait_kite_message(self, module_name: str, kite_type: str,
197
+ timeout: float) -> dict | None:
198
+ """Wait for a kite stdout message from a module. Returns msg dict or None on timeout.
199
+ Checks shutdown flag every 0.5s so Ctrl+C is responsive even during Phase 1-2 waits.
200
+ """
201
+ key = f"{module_name}:{kite_type}"
202
+ evt = threading.Event()
203
+ data = {}
204
+ self._msg_waiters[key] = (evt, data)
205
+ shutdown = self._thread_shutdown
206
+ try:
207
+ def _wait():
208
+ deadline = time.monotonic() + timeout
209
+ while time.monotonic() < deadline:
210
+ if evt.wait(timeout=0.5):
211
+ return True
212
+ if shutdown.is_set():
213
+ return False
214
+ return False
215
+ got = await asyncio.get_running_loop().run_in_executor(None, _wait)
216
+ return data if got else None
217
+ finally:
218
+ self._msg_waiters.pop(key, None)
219
+
76
220
  # ── Public entry ──
77
221
 
78
222
  def run(self):
79
223
  """Synchronous entry point. Sets up signals, runs the async main loop."""
80
- print("[launcher] Kite starting...")
81
- print(f"[launcher] Project root: {self.project_root}")
224
+ print("[launcher] ── 环境 ──")
225
+ for key in sorted(k for k in os.environ if k.startswith("KITE_")):
226
+ print(f"[launcher] {key} = {os.environ[key]}")
227
+ print(f"[launcher] PID = {os.getpid()}")
228
+ print(f"[launcher] PYTHON = {sys.executable}")
229
+ print(f"[launcher] PLATFORM = {sys.platform}")
82
230
 
83
231
  if IS_WINDOWS:
84
232
  self._setup_windows_exit()
@@ -89,116 +237,505 @@ class Launcher:
89
237
  asyncio.run(self._async_main())
90
238
  except KeyboardInterrupt:
91
239
  pass
240
+ except RuntimeError as e:
241
+ print(f"[launcher] 启动失败: {e}")
92
242
  finally:
93
243
  self._final_cleanup()
94
244
 
245
def _request_shutdown(self, reason: str = ""):
    """Ask the launcher to shut down gracefully.

    May be called from a signal handler or any thread. Repeated calls are
    no-ops once the thread-level shutdown flag is set. Besides raising that
    flag, the asyncio shutdown event is set through the loop (when one is
    running) and a daemon watchdog thread is started that hard-exits the
    process if normal shutdown has not completed within 10 seconds.
    """
    if self._thread_shutdown.is_set():
        return  # shutdown already in progress

    print(f"[launcher] {reason or '收到关闭请求'}")
    self._thread_shutdown.set()

    # Wake the event loop right away so _monitor_loop / wait_for return.
    event_loop = self._loop
    if event_loop and not event_loop.is_closed():
        try:
            event_loop.call_soon_threadsafe(self._shutdown_event.set)
        except RuntimeError:
            pass

    def _exit_if_stuck():
        # Normal shutdown finished in time: nothing to force.
        if self._shutdown_complete.wait(timeout=10):
            return
        try:
            manager = self.process_manager
            lingering = [name for name in manager._processes if manager.is_running(name)]
        except Exception:
            lingering = []
        if lingering:
            message = f"[launcher] 关闭超时,以下模块仍在运行: {', '.join(lingering)},强制退出"
        else:
            message = "[launcher] 关闭超时,强制退出"
        print(message)
        os._exit(1)

    guard = threading.Thread(target=_exit_if_stuck, daemon=True)
    guard.start()
273
+
95
274
def _setup_unix_signals(self):
    """Install SIGTERM and SIGINT handlers (Linux/macOS) that request shutdown."""
    def _on_signal(signum, frame):
        self._request_shutdown(f"收到信号 {signum},正在关闭...")

    for sig in (signal.SIGTERM, signal.SIGINT):
        signal.signal(sig, _on_signal)
102
280
 
103
281
def _setup_windows_exit(self):
    """Install Windows exit triggers: a console control handler and a 'q' key poller.

    Why not signal.signal(SIGINT)?
    Python only delivers signals while the main thread is executing bytecode.
    When the main thread is blocked in C code (asyncio ProactorEventLoop →
    GetQueuedCompletionStatus), SIGINT is never delivered.
    SetConsoleCtrlHandler invokes its callback on a separate OS thread, so it
    works regardless of what the main thread is doing.
    """
    import ctypes

    ctrl_c_event, ctrl_break_event = 0, 1

    @ctypes.WINFUNCTYPE(ctypes.c_int, ctypes.c_uint)
    def _ctrl_handler(ctrl_type):
        if ctrl_type not in (ctrl_c_event, ctrl_break_event):
            return 0
        self._request_shutdown("收到 Ctrl+C,正在关闭...")
        return 1  # handled — suppress the default action (which kills the process)

    # Keep a reference on the instance so the C callback is not garbage-collected.
    self._ctrl_handler_ref = _ctrl_handler
    ctypes.windll.kernel32.SetConsoleCtrlHandler(_ctrl_handler, 1)

    def _poll_keyboard():
        # 'q' / 'Q' is picked up by polling msvcrt every 100 ms.
        import msvcrt
        while not self._thread_shutdown.is_set():
            if msvcrt.kbhit() and msvcrt.getch() in (b'q', b'Q'):
                self._request_shutdown("收到退出请求,正在关闭...")
                return
            time.sleep(0.1)

    listener = threading.Thread(target=_poll_keyboard, daemon=True)
    listener.start()
117
315
 
118
- # ── Async main ──
316
+ # ── Async main (4-Phase startup) ──
119
317
 
120
318
async def _async_main(self):
    """Run the full 4-phase startup sequence, then the monitor loop.

    Order of operations:
      1. Validate core modules and clean up leftovers of previous instances
         (local cleanup inline, cross-directory cleanup in an executor).
      2. Phase 1+2: Registry + Event Hub parallel bootstrap.
      3. Phase 3: wait for Registry's delayed ready.
      4. Derive desired_state for every module from its config state.
      5. Phase 3.5: wait for (or start) Watchdog.
      6. Phase 4: start the remaining enabled modules.
      7. Persist records, start the heartbeat, print the startup report,
         publish ``system.ready`` and enter the monitor loop.

    After every phase the shutdown event is checked so that a shutdown
    request aborts startup early. ``_graceful_shutdown_all`` always runs on
    the way out, whether startup completed, was aborted or raised.
    """
    self._loop = asyncio.get_running_loop()
    t_start = time.monotonic()
    self._start_unix = time.time()
    phase_times = {}
    G = "\033[32m"  # ANSI green
    R = "\033[0m"   # ANSI reset

    # Validate core modules exist (mechanism 12)
    self._validate_core_modules()

    # Cleanup leftovers from previous instances (current instance dir)
    local_cleaned = self.process_manager.cleanup_leftovers()

    # Cross-directory leftover cleanup (background, non-blocking).
    # run_in_executor returns a Future (not a coroutine), so use ensure_future.
    self._global_cleanup_task = asyncio.ensure_future(
        asyncio.get_running_loop().run_in_executor(
            None, self.process_manager.cleanup_global_leftovers
        )
    )

    try:
        # Phase 1+2: Registry + Event Hub parallel bootstrap
        t0 = time.monotonic()
        await self._phase1_parallel_bootstrap()
        elapsed_p1 = time.monotonic() - t0
        phase_times["Phase 1+2: Registry + Event Hub (并行)"] = elapsed_p1
        print(f"{G}[launcher] ✓ Phase 1+2 完成: Registry + Event Hub 已就绪 ({elapsed_p1:.2f}s){R}")
        if self._shutdown_event.is_set():
            return

        # Phase 3: Wait for Registry delayed ready
        t0 = time.monotonic()
        await self._phase3_registry_ready()
        elapsed = time.monotonic() - t0
        phase_times["Phase 3: Registry 事件总线"] = elapsed
        print(f"{G}[launcher] ✓ Phase 3 完成: Registry 已连接事件总线 ({elapsed:.2f}s){R}")
        if self._shutdown_event.is_set():
            return

        # Initialize desired_state from config_state (needed before Phase 3.5)
        for name, info in self.modules.items():
            # enabled → running; manual / disabled → stopped
            self._desired_states[name] = "running" if info.state == "enabled" else "stopped"
        # Core modules are already running
        for cn in CORE_MODULE_NAMES:
            self._desired_states[cn] = "running"

        # Phase 3.5: Watchdog ready
        # If started in parallel (Phase 1), just wait for module.ready;
        # otherwise start it now (fallback).
        watchdog_info = self.modules.get(WATCHDOG_MODULE_NAME)
        if watchdog_info and self._desired_states.get(WATCHDOG_MODULE_NAME) == "running":
            t0 = time.monotonic()
            if getattr(self, '_watchdog_parallel', False):
                print("[launcher] Phase 3.5: Watchdog 已并行启动,等待就绪...")
                ready = await self._wait_event("module.ready", WATCHDOG_MODULE_NAME, timeout=15)
                elapsed = time.monotonic() - t0
                if ready and not ready.get("_exited"):
                    self._graceful_modules[WATCHDOG_MODULE_NAME] = bool(ready.get("graceful_shutdown"))
                    self._ready_times[WATCHDOG_MODULE_NAME] = elapsed
                    print("[launcher] Watchdog 已就绪")
                    self._log_lifecycle("started", WATCHDOG_MODULE_NAME)
                    await self._publish_event("module.started", {"module_id": WATCHDOG_MODULE_NAME})
                    self.process_manager.close_stdio(WATCHDOG_MODULE_NAME)
                else:
                    print("[launcher] 警告: Watchdog 在 15s 内未就绪")
            else:
                print("[launcher] Phase 3.5: 启动 Watchdog...")
                await self._start_one_module(watchdog_info)
            elapsed = time.monotonic() - t0
            print(f"{G}[launcher] ✓ Phase 3.5 完成: Watchdog ({elapsed:.2f}s){R}")
            if self._shutdown_event.is_set():
                return

        # Phase 4: Start remaining enabled modules
        t0 = time.monotonic()
        await self._phase4_start_modules()
        elapsed = time.monotonic() - t0
        phase_times["Phase 4: Extensions"] = elapsed
        print(f"{G}[launcher] ✓ Phase 4 完成: 扩展模块已启动 ({elapsed:.2f}s){R}")
        if self._shutdown_event.is_set():
            return

        # Post-startup
        self.process_manager.persist_records()
        self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())

        # Wait (bounded) for the global leftover cleanup started above.
        global_cleaned = {}
        try:
            global_cleaned = await asyncio.wait_for(self._global_cleanup_task, timeout=5) or {}
        except asyncio.TimeoutError:
            print("[launcher] 警告: 全局遗留清理超时 (5s),跳过")
        except Exception as e:
            print(f"[launcher] 警告: 全局遗留清理出错: {e}")

        # Merge local + global cleanup stats
        cleaned_stats: dict[str, int] = {}
        for stats in (local_cleaned, global_cleaned):
            for key, count in stats.items():
                cleaned_stats[key] = cleaned_stats.get(key, 0) + count

        # Global instance scan (via executor to avoid blocking)
        global_instances = await asyncio.get_running_loop().run_in_executor(
            None, self.process_manager.get_global_instances
        )

        # ── Startup report ──
        total_time = time.monotonic() - t_start
        await self._print_startup_report(total_time, phase_times,
                                         global_instances=global_instances,
                                         cleaned_stats=cleaned_stats)
        # Notify all modules that system startup is complete
        await self._publish_event("system.ready", {
            "startup_time": round(total_time, 2),
        })

        print("[launcher] 进入监控循环 (按 Ctrl+C 或 'q' 退出)")
        await self._monitor_loop()
    finally:
        # Always attempt a graceful shutdown, even when startup aborted or raised.
        try:
            await self._graceful_shutdown_all()
        except Exception as e:
            print(f"[launcher] 优雅关闭出错: {e}")
132
443
 
133
- # Step 4: register Launcher to Registry
134
- await self._register_self()
444
+ # ── Phase 1+2: Parallel bootstrap (Registry + Event Hub) ──
135
445
 
136
- # Step 5: scan modules
137
- self.modules = self.module_scanner.scan()
138
- for name, info in self.modules.items():
139
- self._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
140
- print(f"[launcher] Found {len(self.modules)} module(s): {', '.join(self.modules.keys()) or '(none)'}")
446
+ async def _phase1_parallel_bootstrap(self):
447
+ """Start Registry + Event Hub processes in parallel to overlap cold-start time.
141
448
 
142
- # Step 5.5: initialize desired_state from config_state
143
- for name, info in self.modules.items():
144
- if info.state == "enabled":
145
- self._desired_states[name] = "running"
146
- else: # manual, disabled
147
- self._desired_states[name] = "stopped"
449
+ Flow:
450
+ 1. Start Registry + Event Hub processes simultaneously
451
+ 2. Wait for Registry to report port via stdout
452
+ 3. Set KITE_REGISTRY_PORT env (for Phase 3.5/4 modules) + start API
453
+ 4. Scan modules + register self & tokens (parallel)
454
+ 5. Send launcher_ws_token + registry_port to Event Hub via stdin
455
+ 6. Wait for Event Hub ws_endpoint → WS connect → module.ready
456
+ """
457
+ t_registry = time.monotonic()
148
458
 
149
- # Step 6: generate per-module tokens and register to Registry
150
- await self._register_module_tokens()
459
+ # ── Step 1: Start both processes ──
460
+ registry_dir = os.path.join(os.environ["KITE_PROJECT"], "core", "registry")
461
+ registry_info = ModuleInfo(
462
+ name="registry",
463
+ display_name="Registry",
464
+ type="infrastructure",
465
+ state="enabled",
466
+ runtime="python",
467
+ entry="entry.py",
468
+ module_dir=registry_dir,
469
+ )
470
+ boot_info_registry = {"token": self.kite_token}
471
+ self._log_lifecycle("starting", "registry")
472
+ ok = self.process_manager.start_module(registry_info, boot_info=boot_info_registry)
473
+ if not ok:
474
+ self._log_lifecycle("start_failed", "registry")
475
+ raise RuntimeError("启动 Registry 失败")
476
+
477
+ # Start Event Hub in parallel (before Registry port is known)
478
+ eh_dir = os.path.join(os.environ["KITE_PROJECT"], "core", "event_hub")
479
+ eh_info = ModuleInfo(
480
+ name="event_hub",
481
+ display_name="Event Hub",
482
+ type="infrastructure",
483
+ state="enabled",
484
+ runtime="python",
485
+ entry="entry.py",
486
+ module_dir=eh_dir,
487
+ )
488
+ # Generate Event Hub token early (will register to Registry once it's up)
489
+ eh_token = secrets.token_hex(32)
490
+ self._module_tokens["event_hub"] = eh_token
491
+ boot_info_eh = {"token": eh_token}
492
+ self._log_lifecycle("starting", "event_hub")
493
+ ok = self.process_manager.start_module(eh_info, boot_info=boot_info_eh)
494
+ if not ok:
495
+ self._log_lifecycle("start_failed", "event_hub")
496
+ raise RuntimeError("启动 Event Hub 失败")
497
+
498
+ # Start Watchdog in parallel (before Registry port is known)
499
+ # Watchdog will block on stdin waiting for registry_port
500
+ watchdog_dir = os.path.join(os.environ["KITE_PROJECT"], "extensions", "services", "watchdog")
501
+ watchdog_md = os.path.join(watchdog_dir, "module.md")
502
+ self._watchdog_parallel = False # track whether watchdog was started in parallel
503
+ if os.path.isfile(watchdog_md):
504
+ wd_token = secrets.token_hex(32)
505
+ self._module_tokens["watchdog"] = wd_token
506
+ # Parse watchdog module.md for ModuleInfo
507
+ try:
508
+ with open(watchdog_md, "r", encoding="utf-8") as f:
509
+ wd_fm = _parse_frontmatter(f.read())
510
+ wd_info = ModuleInfo(
511
+ name="watchdog",
512
+ display_name=wd_fm.get("display_name", "Watchdog"),
513
+ type=wd_fm.get("type", "service"),
514
+ state="enabled",
515
+ runtime=wd_fm.get("runtime", "python"),
516
+ entry=wd_fm.get("entry", "entry.py"),
517
+ module_dir=watchdog_dir,
518
+ )
519
+ boot_info_wd = {"token": wd_token}
520
+ self._log_lifecycle("starting", "watchdog")
521
+ ok = self.process_manager.start_module(wd_info, boot_info=boot_info_wd)
522
+ if ok:
523
+ self._watchdog_parallel = True
524
+ else:
525
+ self._log_lifecycle("start_failed", "watchdog")
526
+ print("[launcher] 警告: Watchdog 并行启动失败,将在 Phase 3.5 重试")
527
+ except Exception as e:
528
+ print(f"[launcher] 警告: Watchdog module.md 解析失败: {e}")
151
529
 
152
- # Step 7: start enabled modules
153
- await self._start_enabled_modules()
530
+ parallel_modules = "Registry + Event Hub" + (" + Watchdog" if self._watchdog_parallel else "")
531
+ print(f"[launcher] {parallel_modules} 进程已同时启动,等待 Registry 端口...")
154
532
 
155
- # Step 8: persist records
533
+ # Persist immediately after starting core processes
156
534
  self.process_manager.persist_records()
157
535
 
158
- # Step 9: connect to Event Hub (best-effort, non-blocking)
159
- await self._connect_event_hub()
536
+ # ── Step 2: Wait for Registry port ──
537
+ msg = await self._wait_kite_message("registry", "port", timeout=6)
538
+ if not msg or not msg.get("port"):
539
+ raise RuntimeError("致命错误: Registry 在 6s 内未报告端口")
540
+ self.registry_port = int(msg["port"])
541
+ self._ready_times["registry"] = time.monotonic() - t_registry
542
+ _wait_s = time.monotonic() - t_registry
543
+ print(f"[launcher] Registry 端口: {self.registry_port} (等待 {self._fmt_elapsed(_wait_s)})")
544
+
545
+ # ── Step 3: Set env + start API + immediately unblock Event Hub ──
546
+ os.environ["KITE_REGISTRY_PORT"] = str(self.registry_port)
547
+ self._start_api_thread()
160
548
 
161
- # Step 10: start heartbeat to Registry
162
- self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
549
+ # Send launcher_ws_token + registry_port to Event Hub ASAP (unblock it)
550
+ self._launcher_ws_token = secrets.token_hex(32)
551
+ self.process_manager.write_stdin("event_hub", {
552
+ "kite": "launcher_ws_token",
553
+ "launcher_ws_token": self._launcher_ws_token,
554
+ })
555
+ self.process_manager.write_stdin("event_hub", {
556
+ "kite": "registry_port",
557
+ "registry_port": self.registry_port,
558
+ })
163
559
 
164
- # Step 11: monitor loop
165
- print("[launcher] Entering monitor loop (press Ctrl+C or 'q' to exit)")
166
- await self._monitor_loop()
560
+ # Send registry_port to Watchdog via stdin (if started in parallel)
561
+ # Watchdog will retry querying launcher.api_endpoint until it's available
562
+ if self.process_manager.is_running("watchdog"):
563
+ self.process_manager.write_stdin("watchdog", {
564
+ "kite": "registry_port",
565
+ "registry_port": self.registry_port,
566
+ })
167
567
 
168
- # Graceful shutdown all modules before event loop closes
169
- await self._graceful_shutdown_all()
568
+ # ── Step 4: Scan + register tokens ‖ wait for Event Hub ws_endpoint (parallel) ──
569
+ # Pre-register ws_endpoint waiter BEFORE gather to avoid race condition:
570
+ # module_scanner.scan() is synchronous and blocks the event loop,
571
+ # so the _wait_event_hub_endpoint coroutine wouldn't register its waiter in time.
572
+ ws_waiter_key = "event_hub:ws_endpoint"
573
+ ws_evt = threading.Event()
574
+ ws_data: dict = {}
575
+ self._msg_waiters[ws_waiter_key] = (ws_evt, ws_data)
576
+
577
+ async def _scan_and_register_tokens():
578
+ t_scan = time.monotonic()
579
+ self.modules = self.module_scanner.scan()
580
+ for name, info in self.modules.items():
581
+ self._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
582
+ _scan_s = time.monotonic() - t_scan
583
+ print(f"[launcher] 发现 {len(self.modules)} 个模块: {', '.join(self.modules.keys()) or '(无)'} (扫描 {self._fmt_elapsed(_scan_s)})")
584
+ t_reg = time.monotonic()
585
+ await self._register_module_tokens()
586
+ _reg_s = time.monotonic() - t_reg
587
+ print(f"[launcher] 令牌注册完成 ({self._fmt_elapsed(_reg_s)})")
588
+
589
+ async def _wait_event_hub_endpoint():
590
+ t_wait_eh = time.monotonic()
591
+ print("[launcher] 等待 Event Hub ws_endpoint...")
592
+ shutdown = self._thread_shutdown
593
+ def _wait():
594
+ deadline = time.monotonic() + 10
595
+ while time.monotonic() < deadline:
596
+ if ws_evt.wait(timeout=0.5):
597
+ return True
598
+ if shutdown.is_set():
599
+ return False
600
+ return False
601
+ got = await asyncio.get_running_loop().run_in_executor(None, _wait)
602
+ self._msg_waiters.pop(ws_waiter_key, None)
603
+ if not got or not ws_data.get("ws_endpoint"):
604
+ raise RuntimeError("致命错误: Event Hub 在 10s 内未报告 ws_endpoint")
605
+ self._event_hub_ws_url = ws_data["ws_endpoint"]
606
+ _eh_s = time.monotonic() - t_wait_eh
607
+ print(f"[launcher] Event Hub 已发现: {self._event_hub_ws_url} (等待 {self._fmt_elapsed(_eh_s)})")
608
+
609
+ # Run all three in parallel: register_self + scan_tokens + wait_event_hub
610
+ await asyncio.gather(
611
+ self._register_self(),
612
+ _scan_and_register_tokens(),
613
+ _wait_event_hub_endpoint(),
614
+ )
615
+ if self._shutdown_event.is_set():
616
+ return
617
+
618
+ # ── Step 5: WS connect → module.ready ──
619
+ t_eh = time.monotonic()
620
+ self._ws_task = asyncio.create_task(self._ws_loop())
621
+
622
+ # Wait for Event Hub module.ready (sent when Launcher connects)
623
+ ready = await self._wait_event("module.ready", "event_hub", timeout=15)
624
+ if ready:
625
+ self._graceful_modules["event_hub"] = bool(ready.get("graceful_shutdown"))
626
+ print("[launcher] Event Hub 已就绪")
627
+ else:
628
+ print("[launcher] 警告: Event Hub 在 15s 内未发送 module.ready")
170
629
 
171
- # ── Event Hub connection ──
630
+ self._ready_times["event_hub"] = time.monotonic() - t_eh
631
+ self._log_lifecycle("started", "event_hub")
632
+ await self._publish_event("module.started", {"module_id": "event_hub"})
633
+ self.process_manager.close_stdio("event_hub")
172
634
 
173
- async def _connect_event_hub(self):
174
- """Discover Event Hub WS endpoint (with retry) and start background client."""
175
- url = f"http://127.0.0.1:{self.registry_port}"
635
+ # Store eh_info in modules dict if not already present (from scan)
636
+ if "event_hub" not in self.modules:
637
+ self.modules["event_hub"] = eh_info
638
+
639
+ def _get_http(self) -> httpx.AsyncClient:
640
+ """Get shared HTTP client (lazy-init, reuses TCP connections to Registry)."""
641
+ if self._http is None or self._http.is_closed:
642
+ self._http = httpx.AsyncClient(timeout=5)
643
+ return self._http
644
+
645
+ async def _close_http(self):
646
+ """Close shared HTTP client."""
647
+ if self._http and not self._http.is_closed:
648
+ await self._http.aclose()
649
+ self._http = None
650
+
651
+ async def _register_self(self):
652
+ """Register Launcher itself to Registry."""
653
+ url = f"http://127.0.0.1:{self.registry_port}/modules"
176
654
  headers = {"Authorization": f"Bearer {self.kite_token}"}
655
+ payload = {
656
+ "action": "register",
657
+ "module_id": "launcher",
658
+ "module_type": "infrastructure",
659
+ "name": "Launcher",
660
+ "api_endpoint": f"http://127.0.0.1:{self.api_port}",
661
+ "health_endpoint": "/launcher/modules",
662
+ "events_publish": {
663
+ "module.started": {},
664
+ "module.stopped": {},
665
+ "module.state_changed": {},
666
+ },
667
+ "events_subscribe": [">"],
668
+ }
669
+ try:
670
+ client = self._get_http()
671
+ resp = await client.post(url, json=payload, headers=headers)
672
+ if resp.status_code == 200:
673
+ print("[launcher] 已注册到 Registry")
674
+ else:
675
+ print(f"[launcher] 警告: Registry 注册返回 {resp.status_code}")
676
+ except Exception as e:
677
+ print(f"[launcher] 警告: 注册到 Registry 失败: {e}")
177
678
 
178
- # Event Hub needs time to start and register itself to Registry
179
- print("[launcher] Waiting for Event Hub to register...")
180
- deadline = time.time() + 15
181
- while time.time() < deadline:
182
- try:
183
- async with httpx.AsyncClient() as client:
184
- resp = await client.get(
185
- f"{url}/get/event_hub.metadata.ws_endpoint",
186
- headers=headers, timeout=3,
187
- )
188
- if resp.status_code == 200:
189
- self._event_hub_ws_url = resp.json()
190
- if self._event_hub_ws_url:
191
- break
192
- except Exception:
193
- pass
194
- await asyncio.sleep(1)
679
+ # ── (Phase 2 merged into _phase1_parallel_bootstrap) ──
195
680
 
196
- if not self._event_hub_ws_url:
197
- print("[launcher] WARNING: Could not discover Event Hub WS, events disabled")
681
+ # ── Phase 3: Registry delayed ready ──
682
+
683
+ async def _phase3_registry_ready(self):
684
+ """Wait for Registry module.ready (triggered after Event Hub registers to Registry
685
+ and Registry connects to Event Hub WS)."""
686
+ print("[launcher] 等待 Registry 连接 Event Hub...")
687
+ ready = await self._wait_event("module.ready", "registry", timeout=12)
688
+ if ready:
689
+ self._graceful_modules["registry"] = bool(ready.get("graceful_shutdown"))
690
+ print("[launcher] Registry 事件总线连接完成")
691
+ else:
692
+ print("[launcher] 警告: Registry 在 12s 内未连接事件总线 (降级运行)")
693
+
694
+ self._log_lifecycle("started", "registry")
695
+ await self._publish_event("module.started", {"module_id": "registry"})
696
+ self.process_manager.close_stdio("registry")
697
+
698
+ # ── Phase 4: Start remaining modules ──
699
+
700
async def _phase4_start_modules(self):
    """Start enabled modules (excluding core and Watchdog) in dependency order.

    Modules whose desired state is "running" are collected, any non-disabled
    dependency they need is pulled in automatically (transitively), and the
    result is started layer by layer: modules in one layer have no
    dependencies on each other and are started concurrently.
    """
    to_start = [m for m in self.modules.values()
                if self._desired_states.get(m.name) == "running"
                and m.name not in CORE_MODULE_NAMES
                and m.name != WATCHDOG_MODULE_NAME]
    if not to_start:
        print("[launcher] 没有额外模块需要启动")
        return

    # Auto-start manual modules if depended upon.
    # Walk the list by index so that modules appended below are visited as
    # well; iterating a snapshot would leave the dependencies of an
    # auto-started dependency unresolved.
    needed = {m.name for m in to_start}
    idx = 0
    while idx < len(to_start):
        m = to_start[idx]
        idx += 1
        for dep in m.depends_on:
            if dep in needed or dep in CORE_MODULE_NAMES:
                continue
            dep_info = self.modules.get(dep)
            if not dep_info:
                continue
            if dep_info.state != "disabled":
                needed.add(dep)
                to_start.append(dep_info)
                self._desired_states[dep] = "running"
                print(f"[launcher] 自动启动 '{dep}' (被依赖)")
            else:
                print(f"[launcher] 错误: '{m.name}' 依赖已禁用的模块 '{dep}'")

    try:
        layers = self._topo_layers(to_start)
    except RuntimeError as e:
        # Raised on a circular dependency; nothing is started in that case.
        print(f"[launcher] 错误: {e}")
        return

    total = sum(len(layer) for layer in layers)
    print(f"[launcher] 正在启动 {total} 个模块...")
    for layer in layers:
        if len(layer) == 1:
            await self._start_one_module(layer[0])
        else:
            await asyncio.gather(*(self._start_one_module(info) for info in layer))
737
+
738
+ # ── Event Hub WebSocket connection ──
202
739
 
203
740
  async def _ws_loop(self):
204
741
  """Connect to Event Hub, reconnect on failure."""
@@ -208,16 +745,19 @@ class Launcher:
208
745
  except asyncio.CancelledError:
209
746
  return
210
747
  except Exception as e:
211
- print(f"[launcher] Event Hub connection error: {e}")
748
+ if not self._system_shutting_down:
749
+ print(f"[launcher] Event Hub 连接错误: {e}")
212
750
  self._ws = None
213
751
  await asyncio.sleep(5)
214
752
 
215
753
  async def _ws_connect(self):
216
- """Single WebSocket session: connect, subscribe to all events, display them."""
217
- ws_url = f"{self._event_hub_ws_url}?token={self.kite_token}"
218
- async with websockets.connect(ws_url) as ws:
754
+ """Single WebSocket session with launcher_ws_token auth."""
755
+ ws_url = f"{self._event_hub_ws_url}?token={self._launcher_ws_token}&id=launcher"
756
+ t_ws_connect = time.monotonic()
757
+ async with websockets.connect(ws_url, open_timeout=3, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
219
758
  self._ws = ws
220
- print("[launcher] Connected to Event Hub")
759
+ _ws_s = time.monotonic() - t_ws_connect
760
+ print(f"[launcher] 已连接到 Event Hub ({self._fmt_elapsed(_ws_s)})")
221
761
 
222
762
  # Subscribe to all events
223
763
  await ws.send(json.dumps({
@@ -231,52 +771,89 @@ class Launcher:
231
771
  msg = json.loads(raw)
232
772
  except (json.JSONDecodeError, TypeError):
233
773
  continue
234
- msg_type = msg.get("type", "")
235
- if msg_type == "event":
236
- source = msg.get("source", "unknown")
237
- event = msg.get("event", "")
238
- data = msg.get("data", {})
239
- # Trigger event waiters
240
- module_id = data.get("module_id", "")
241
- waiter_key = f"{event}:{module_id}"
242
- waiter = self._event_waiters.get(waiter_key)
243
- if waiter:
244
- waiter[1].update(data)
245
- waiter[0].set()
246
- ts = msg.get("timestamp", "")
247
- latency_str = ""
248
- if ts:
249
- try:
250
- from datetime import datetime, timezone
251
- sent = datetime.fromisoformat(ts)
252
- delay_ms = (datetime.now(timezone.utc) - sent).total_seconds() * 1000
253
- latency_str = f" ({delay_ms:.1f}ms)"
254
- local_ts = sent.astimezone().strftime("%H:%M:%S")
255
- except Exception:
256
- local_ts = ts[11:19] if len(ts) >= 19 else ts
257
- print(f"[{source}] {local_ts} {event}{latency_str}: {json.dumps(data, ensure_ascii=False)}")
258
- else:
259
- print(f"[{source}] {event}: {json.dumps(data, ensure_ascii=False)}")
260
- elif msg_type == "error":
261
- print(f"[launcher] Event Hub error: {msg.get('message')}")
774
+ try:
775
+ msg_type = msg.get("type", "")
776
+ if msg_type == "event":
777
+ source = msg.get("source", "unknown")
778
+ event = msg.get("event", "")
779
+ data = msg.get("data") if isinstance(msg.get("data"), dict) else {}
780
+ # Trigger event waiters
781
+ module_id = data.get("module_id", "")
782
+ waiter_key = f"{event}:{module_id}"
783
+ waiter = self._event_waiters.get(waiter_key)
784
+ if waiter:
785
+ waiter[1].update(data)
786
+ waiter[0].set()
787
+ # module.exiting also wakes module.ready waiter
788
+ # (module won't send ready — no point waiting)
789
+ if event == "module.exiting" and module_id:
790
+ ready_key = f"module.ready:{module_id}"
791
+ ready_waiter = self._event_waiters.get(ready_key)
792
+ if ready_waiter:
793
+ ready_waiter[1].update(data)
794
+ ready_waiter[1]["_exited"] = True
795
+ ready_waiter[0].set()
796
+ # module.crash print red crash summary (real-time notification)
797
+ if event == "module.crash" and module_id:
798
+ RED = "\033[91m"
799
+ RESET = "\033[0m"
800
+ exc_type = data.get("exception_type", "Unknown")
801
+ preview = data.get("traceback_preview", "")
802
+ severity = data.get("severity", "error")
803
+ print(f"[launcher] {RED}模块 '{module_id}' 崩溃: "
804
+ f"{exc_type} — {preview}{RESET}")
805
+ _suffix = os.environ.get("KITE_INSTANCE_SUFFIX", "")
806
+ crash_log = os.path.join(
807
+ os.environ.get("KITE_INSTANCE_DIR", ""),
808
+ module_id, "log", f"crashes{_suffix}.jsonl"
809
+ )
810
+ print(f"[launcher] 崩溃日志: {crash_log}")
811
+ ts = msg.get("timestamp", "")
812
+ # Only log system events (module.*, watchdog.*) to avoid flooding
813
+ # from benchmark/test traffic
814
+ if not (event.startswith("module.") or event.startswith("watchdog.")):
815
+ continue
816
+ latency_str = ""
817
+ if ts:
818
+ try:
819
+ from datetime import datetime, timezone
820
+ sent = datetime.fromisoformat(ts)
821
+ delay_ms = (datetime.now(timezone.utc) - sent).total_seconds() * 1000
822
+ latency_str = f" ({delay_ms:.1f}ms)"
823
+ local_ts = sent.astimezone().strftime("%H:%M:%S")
824
+ except Exception:
825
+ local_ts = ts[11:19] if len(ts) >= 19 else ts
826
+ print(f"[{source}] {local_ts} {event}{latency_str}: {json.dumps(data, ensure_ascii=False)}")
827
+ else:
828
+ print(f"[{source}] {event}: {json.dumps(data, ensure_ascii=False)}")
829
+ elif msg_type == "error":
830
+ print(f"[launcher] Event Hub 错误: {msg.get('message')}")
831
+ except Exception as e:
832
+ print(f"[launcher] 事件处理异常(已忽略): {e}")
262
833
 
263
834
  async def _publish_event(self, event_type: str, data: dict):
264
- """Publish an event to Event Hub via WebSocket."""
835
+ """Publish an event to Event Hub via WebSocket. Uses create_task to avoid
836
+ deadlock with _ws_connect recv loop (websockets 15.x send can block when
837
+ incoming frames are pending and recv is held by async-for)."""
265
838
  if not self._ws:
266
839
  return
267
840
  from datetime import datetime, timezone
268
- msg = {
841
+ msg = json.dumps({
269
842
  "type": "event",
270
843
  "event_id": str(uuid.uuid4()),
271
844
  "event": event_type,
272
845
  "source": "launcher",
273
846
  "timestamp": datetime.now(timezone.utc).isoformat(),
274
847
  "data": data,
275
- }
276
- try:
277
- await self._ws.send(json.dumps(msg))
278
- except Exception as e:
279
- print(f"[launcher] Failed to publish event: {e}")
848
+ })
849
+
850
+ async def _send():
851
+ try:
852
+ await self._ws.send(msg)
853
+ except Exception as e:
854
+ print(f"[launcher] 发布事件失败: {e}")
855
+
856
+ asyncio.create_task(_send())
280
857
 
281
858
  def _publish_event_threadsafe(self, event_type: str, data: dict):
282
859
  """Publish event from non-async context (API thread). Fire-and-forget."""
@@ -301,57 +878,127 @@ class Launcher:
301
878
  self._event_waiters.pop(key, None)
302
879
 
303
880
async def _graceful_stop(self, name: str, reason: str = "stop_requested", timeout: float = 10):
    """Graceful shutdown: check capability → send event → wait ack → wait ready → kill.

    Modules that did not declare graceful_shutdown in module.ready are
    terminated directly. Otherwise a module.shutdown event is published and
    the module gets 3 s to acknowledge; after an ack it gets up to
    ``timeout`` seconds (or its own shorter estimate) to report
    module.shutdown.ready before the process is stopped.

    Every path ends with a "stopped" lifecycle record and a module.stopped
    event.

    Args:
        name: Module id to stop.
        reason: Free-form reason forwarded in the log and shutdown event.
        timeout: Upper bound, in seconds, on the module's cleanup time.
    """
    self._log_lifecycle("stopping", name, reason=reason)

    if not self._graceful_modules.get(name):
        self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
        self._log_lifecycle("stopped", name, reason=reason)
        await self._publish_event("module.stopped", {
            "module_id": name,
            "graceful_shutdown": False,
        })
        return

    await self._publish_event("module.shutdown", {
        "module_id": name, "reason": reason, "timeout": timeout,
    })

    ack = await self._wait_event("module.shutdown.ack", name, timeout=3)
    if not ack:
        # No acknowledgement: fall back to a direct stop.
        self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
    else:
        # estimated_cleanup comes from the module itself; ignore values that
        # are not numbers instead of failing the whole stop sequence.
        try:
            estimated = min(float(ack.get("estimated_cleanup", timeout)), timeout)
        except (TypeError, ValueError):
            estimated = timeout
        ready = await self._wait_event("module.shutdown.ready", name, timeout=estimated)
        if ready:
            self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_READY)
        else:
            self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)

    # Shared tail so the no-ack path is logged as "stopped" like the others.
    self._log_lifecycle("stopped", name, reason=reason)
    await self._publish_event("module.stopped", {
        "module_id": name,
        "graceful_shutdown": self._graceful_modules.get(name, False),
    })
331
920
 
332
921
async def _graceful_shutdown_all(self):
    """Shut down all modules. Order:
    1. Send shutdown to graceful modules (excl. Event Hub) — let them start cleanup
    2. Terminate non-graceful modules (fast, runs during graceful cleanup)
    3. Wait for graceful modules to exit (process monitoring)
    4. Shut down Event Hub last (keeps event routing alive throughout)

    Event Hub is deferred to step 4 whether or not it declared graceful
    shutdown. The shared HTTP client is closed on every exit path.
    """
    self._system_shutting_down = True
    try:
        running = [n for n in self.modules if self.process_manager.is_running(n)]
        # Also check core modules
        for cn in CORE_MODULE_NAMES:
            if cn not in running and self.process_manager.is_running(cn):
                running.append(cn)
        if not running:
            print("[launcher] 没有运行中的模块需要关闭")
            return

        # Defer Event Hub — it must stay alive to route shutdown events,
        # even when it never declared graceful shutdown.
        hub_running = "event_hub" in running
        others = [n for n in running if n != "event_hub"]
        graceful_batch = [n for n in others if self._graceful_modules.get(n)]
        non_graceful = [n for n in others if not self._graceful_modules.get(n)]

        print(f"[launcher] 正在关闭 {len(running)} 个模块: {', '.join(running)}")

        # Phase 1: Notify graceful modules first (they start cleanup immediately)
        for name in graceful_batch:
            self._log_lifecycle("stopping", name, reason="system_shutdown")
            await self._publish_event("module.shutdown", {
                "module_id": name, "reason": "system_shutdown", "timeout": 5,
            })
        if graceful_batch:
            # _publish_event only schedules the send; yield once so the
            # notifications go out before the synchronous stops below.
            await asyncio.sleep(0)

        # Phase 2: While graceful modules are cleaning up, terminate non-graceful ones
        if non_graceful:
            print(f"[launcher] 直接终止 {len(non_graceful)} 个不支持优雅退出的模块: {', '.join(non_graceful)}")
            for name in non_graceful:
                self._log_lifecycle("stopping", name, reason="system_shutdown")
                self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
                self._log_lifecycle("stopped", name, reason="system_shutdown")

        # Phase 3: Wait for graceful modules to exit (process monitoring)
        if graceful_batch:
            # Monotonic clock: the 5 s budget is immune to wall-clock jumps.
            deadline = time.monotonic() + 5
            while time.monotonic() < deadline:
                still_running = [n for n in graceful_batch if self.process_manager.is_running(n)]
                if not still_running:
                    print("[launcher] 所有优雅退出模块已自行退出")
                    break
                remaining = max(0, deadline - time.monotonic())
                print(f"[launcher] 等待 {len(still_running)} 个模块退出 ({remaining:.0f}s): {', '.join(still_running)}")
                await asyncio.sleep(1)
            # Force kill survivors
            for name in graceful_batch:
                if self.process_manager.is_running(name):
                    self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
                self._log_lifecycle("stopped", name, reason="system_shutdown")

        # Phase 4: All other modules exited — now shut down Event Hub
        if hub_running and self.process_manager.is_running("event_hub"):
            self._log_lifecycle("stopping", "event_hub", reason="system_shutdown")
            if self._graceful_modules.get("event_hub"):
                await self._publish_event("module.shutdown", {
                    "module_id": "event_hub", "reason": "system_shutdown", "timeout": 5,
                })
                deadline = time.monotonic() + 5
                while time.monotonic() < deadline:
                    if not self.process_manager.is_running("event_hub"):
                        print("[launcher] Event Hub 已退出")
                        break
                    await asyncio.sleep(0.5)
            if self.process_manager.is_running("event_hub"):
                self.process_manager.stop_module("event_hub", timeout=SHUTDOWN_TIMEOUT_PARTIAL)
            self._log_lifecycle("stopped", "event_hub", reason="system_shutdown")

        # Final safety net
        try:
            self.process_manager.stop_all(timeout=SHUTDOWN_TIMEOUT_BULK)
        except Exception as e:
            print(f"[launcher] stop_all 出错: {e}")
    finally:
        # Runs on the early "nothing to stop" return and on errors as well.
        await self._close_http()
1001
+
355
1002
  # ── Heartbeat to Registry ──
356
1003
 
357
1004
  async def _heartbeat_loop(self):
@@ -359,110 +1006,15 @@ class Launcher:
359
1006
  while not self._thread_shutdown.is_set():
360
1007
  await asyncio.sleep(30)
361
1008
  try:
362
- async with httpx.AsyncClient() as client:
363
- await client.post(
364
- f"http://127.0.0.1:{self.registry_port}/modules",
365
- json={"action": "heartbeat", "module_id": "launcher"},
366
- headers={"Authorization": f"Bearer {self.kite_token}"},
367
- timeout=5,
368
- )
1009
+ client = self._get_http()
1010
+ await client.post(
1011
+ f"http://127.0.0.1:{self.registry_port}/modules",
1012
+ json={"action": "heartbeat", "module_id": "launcher"},
1013
+ headers={"Authorization": f"Bearer {self.kite_token}"},
1014
+ )
369
1015
  except Exception:
370
1016
  pass
371
1017
 
372
- # ── Registry startup ──
373
-
374
- async def _start_registry(self):
375
- """Start Registry as a subprocess, wait for it to write port.txt and /health to respond."""
376
- registry_dir = os.path.join(self.project_root, "core", "registry")
377
- if not os.path.isdir(registry_dir):
378
- raise RuntimeError(f"Registry module not found at {registry_dir}")
379
-
380
- # Use centralized data directory
381
- from core.data_dir import get_registry_data_dir
382
- registry_data_dir = get_registry_data_dir()
383
-
384
- # Clean our instance's port file before starting
385
- port_file = os.path.join(registry_data_dir, f"port_{self.instance_id}.txt")
386
- if os.path.isfile(port_file):
387
- os.remove(port_file)
388
-
389
- registry_info = ModuleInfo(
390
- name="registry",
391
- display_name="Registry",
392
- type="infrastructure",
393
- state="enabled",
394
- runtime="python",
395
- entry="entry.py",
396
- module_dir=registry_dir,
397
- )
398
-
399
- # Pass launcher_token + bind config via stdin
400
- boot_info = {"token": self.kite_token, "registry_port": 0, "bind": "127.0.0.1", "instance_id": self.instance_id}
401
- ok = self.process_manager.start_module(registry_info, boot_info=boot_info)
402
- if not ok:
403
- raise RuntimeError("Failed to start Registry")
404
-
405
- # Wait for Registry to write port.txt
406
- print("[launcher] Waiting for Registry to report its port...")
407
- deadline = time.time() + 10
408
- while time.time() < deadline:
409
- if os.path.isfile(port_file):
410
- try:
411
- with open(port_file, "r") as f:
412
- self.registry_port = int(f.read().strip())
413
- break
414
- except (ValueError, OSError):
415
- pass
416
- await asyncio.sleep(0.2)
417
- else:
418
- raise RuntimeError("Registry failed to write port.txt within 10s")
419
-
420
- # Poll /health until ready
421
- url = f"http://127.0.0.1:{self.registry_port}/health"
422
- print(f"[launcher] Registry on port {self.registry_port}, waiting for health check...")
423
-
424
- deadline = time.time() + 10
425
- async with httpx.AsyncClient() as client:
426
- while time.time() < deadline:
427
- try:
428
- resp = await client.get(url, timeout=1)
429
- if resp.status_code == 200:
430
- print("[launcher] Registry is ready")
431
- return
432
- except Exception:
433
- pass
434
- await asyncio.sleep(0.2)
435
-
436
- raise RuntimeError("Registry failed to become ready within 10s")
437
-
438
- async def _register_self(self):
439
- """Register Launcher itself to Registry using new API."""
440
- url = f"http://127.0.0.1:{self.registry_port}/modules"
441
- headers = {"Authorization": f"Bearer {self.kite_token}"}
442
- payload = {
443
- "action": "register",
444
- "module_id": "launcher",
445
- "module_type": "infrastructure",
446
- "name": "Launcher",
447
- "api_endpoint": f"http://127.0.0.1:{self.api_port}",
448
- "health_endpoint": "/launcher/modules",
449
- "events_publish": {
450
- "module.started": {},
451
- "module.stopped": {},
452
- "module.state_changed": {},
453
- },
454
- "events_subscribe": [">"],
455
- }
456
- try:
457
- async with httpx.AsyncClient() as client:
458
- resp = await client.post(url, json=payload, headers=headers, timeout=5)
459
- if resp.status_code == 200:
460
- print("[launcher] Registered self to Registry")
461
- else:
462
- print(f"[launcher] WARNING: Registry registration returned {resp.status_code}")
463
- except Exception as e:
464
- print(f"[launcher] WARNING: failed to register to Registry: {e}")
465
-
466
1018
  # ── Module startup ──
467
1019
 
468
1020
  def _topo_sort(self, modules: list[ModuleInfo]) -> list[ModuleInfo]:
@@ -491,86 +1043,133 @@ class Launcher:
491
1043
  visit(m.name)
492
1044
  return order
493
1045
 
1046
+ def _topo_layers(self, modules: list[ModuleInfo]) -> list[list[ModuleInfo]]:
1047
+ """Topological sort into layers. Modules in the same layer have no
1048
+ inter-dependencies and can be started in parallel."""
1049
+ name_map = {m.name: m for m in modules}
1050
+ all_names = set(name_map.keys())
1051
+
1052
+ # Compute depth (longest path from root) for each module
1053
+ depth: dict[str, int] = {}
1054
+ in_stack: set[str] = set()
1055
+
1056
+ def get_depth(name: str) -> int:
1057
+ if name in depth:
1058
+ return depth[name]
1059
+ if name in in_stack:
1060
+ raise RuntimeError(f"Circular dependency detected involving '{name}'")
1061
+ in_stack.add(name)
1062
+ info = name_map.get(name)
1063
+ d = 0
1064
+ if info:
1065
+ for dep in info.depends_on:
1066
+ if dep in all_names:
1067
+ d = max(d, get_depth(dep) + 1)
1068
+ in_stack.remove(name)
1069
+ depth[name] = d
1070
+ return d
1071
+
1072
+ for name in all_names:
1073
+ get_depth(name)
1074
+
1075
+ # Group by depth
1076
+ max_depth = max(depth.values()) if depth else 0
1077
+ layers: list[list[ModuleInfo]] = [[] for _ in range(max_depth + 1)]
1078
+ for name, d in depth.items():
1079
+ layers[d].append(name_map[name])
1080
+ return layers
1081
+
494
1082
  async def _start_one_module(self, info: ModuleInfo):
495
- """Start a single module: publish starting event, start process, wait for ready."""
1083
+ """Start a single module: publish starting start process wait ready → started → close stdio."""
496
1084
  self._log_lifecycle("starting", info.name)
497
1085
  await self._publish_event("module.starting", {"module_id": info.name})
498
1086
 
499
1087
  token = self._module_tokens.get(info.name, "")
500
- boot_info = {
501
- "token": token,
502
- "registry_port": self.registry_port,
503
- "preferred_port": info.preferred_port,
504
- "advertise_ip": "127.0.0.1",
505
- }
1088
+ boot_info = {"token": token}
1089
+ t0 = time.monotonic()
506
1090
  ok = self.process_manager.start_module(info, boot_info=boot_info)
507
1091
  if not ok:
508
1092
  self._log_lifecycle("start_failed", info.name)
509
1093
  return
510
1094
 
511
- # Wait for module.ready (configurable timeout, degrade on timeout)
1095
+ # Persist immediately after starting to ensure PID is recorded
1096
+ # (in case launcher crashes before Phase 4 completes)
1097
+ self.process_manager.persist_records()
1098
+
1099
+ # Wait for module.ready or module.exiting (whichever comes first)
512
1100
  timeout = info.launch.timeout
513
1101
  ready = await self._wait_event("module.ready", info.name, timeout=timeout)
514
- if ready:
515
- print(f"[launcher] Module '{info.name}' is ready")
1102
+ elapsed = time.monotonic() - t0
1103
+ if ready and ready.get("_exited"):
1104
+ # Module sent module.exiting before ready — it chose to quit
1105
+ reason = ready.get("reason", "unknown")
1106
+ self._exit_reasons[info.name] = reason
1107
+ print(f"[launcher] 模块 '{info.name}' 主动退出: {reason} ({elapsed:.2f}s)")
1108
+ elif ready:
1109
+ self._graceful_modules[info.name] = bool(ready.get("graceful_shutdown"))
1110
+ self._ready_times[info.name] = elapsed
1111
+ print(f"[launcher] 模块 '{info.name}' 已就绪 ({elapsed:.2f}s)")
516
1112
  else:
517
- print(f"[launcher] WARNING: '{info.name}' did not send module.ready within {timeout}s")
1113
+ print(f"[launcher] 警告: '{info.name}' {timeout}s 内未发送 module.ready")
518
1114
 
519
1115
  rec = self.process_manager.get_record(info.name)
520
1116
  self._log_lifecycle("started", info.name, pid=rec.pid if rec else None)
521
1117
  await self._publish_event("module.started", {"module_id": info.name})
522
-
523
- async def _start_enabled_modules(self):
524
- """Start modules in dependency order, auto-starting manual deps if needed."""
525
- to_start = [m for m in self.modules.values()
526
- if self._desired_states.get(m.name) == "running"]
527
- if not to_start:
528
- print("[launcher] No modules to start")
529
- return
530
-
531
- # Auto-start manual modules if depended upon
532
- needed = set(m.name for m in to_start)
533
- for m in to_start:
534
- for dep in m.depends_on:
535
- if dep not in needed:
536
- dep_info = self.modules.get(dep)
537
- if dep_info and dep_info.state != "disabled":
538
- needed.add(dep)
539
- to_start.append(dep_info)
540
- self._desired_states[dep] = "running"
541
- print(f"[launcher] Auto-starting '{dep}' (dependency)")
542
- elif dep_info and dep_info.state == "disabled":
543
- print(f"[launcher] ERROR: '{m.name}' depends on disabled module '{dep}'")
544
-
545
- try:
546
- sorted_modules = self._topo_sort(to_start)
547
- except RuntimeError as e:
548
- print(f"[launcher] ERROR: {e}")
549
- return
550
-
551
- print(f"[launcher] Starting {len(sorted_modules)} module(s)...")
552
- for info in sorted_modules:
553
- await self._start_one_module(info)
1118
+ self.process_manager.close_stdio(info.name)
554
1119
 
555
1120
  async def _register_module_tokens(self):
556
1121
  """Generate per-module tokens and register the mapping to Registry."""
1122
+ # Include all scanned modules + core modules
557
1123
  for name in self.modules:
558
- self._module_tokens[name] = secrets.token_hex(32)
1124
+ if name not in self._module_tokens:
1125
+ self._module_tokens[name] = secrets.token_hex(32)
1126
+ # Ensure registry has a token
1127
+ if "registry" not in self._module_tokens:
1128
+ self._module_tokens["registry"] = secrets.token_hex(32)
559
1129
 
560
1130
  if not self._module_tokens:
561
1131
  return
562
1132
 
1133
+ await self._register_tokens_to_registry(self._module_tokens)
1134
+
1135
+ async def _register_tokens_to_registry(self, tokens: dict):
1136
+ """Register token mapping to Registry via POST /tokens."""
563
1137
  url = f"http://127.0.0.1:{self.registry_port}/tokens"
564
1138
  headers = {"Authorization": f"Bearer {self.kite_token}"}
565
1139
  try:
566
- async with httpx.AsyncClient() as client:
567
- resp = await client.post(url, json=self._module_tokens, headers=headers, timeout=5)
568
- if resp.status_code == 200:
569
- print(f"[launcher] Registered {len(self._module_tokens)} module token(s)")
570
- else:
571
- print(f"[launcher] WARNING: token registration returned {resp.status_code}")
1140
+ client = self._get_http()
1141
+ resp = await client.post(url, json=tokens, headers=headers)
1142
+ if resp.status_code == 200:
1143
+ print(f"[launcher] 已注册 {len(tokens)} 个模块令牌")
1144
+ else:
1145
+ print(f"[launcher] 警告: 令牌注册返回 {resp.status_code}")
572
1146
  except Exception as e:
573
- print(f"[launcher] WARNING: failed to register module tokens: {e}")
1147
+ print(f"[launcher] 警告: 注册模块令牌失败: {e}")
1148
+
1149
+ # ── Validation ──
1150
+
1151
+ def _validate_core_modules(self):
1152
+ """Validate core modules exist (mechanism 12)."""
1153
+ project_root = os.environ["KITE_PROJECT"]
1154
+ for name in ("registry", "event_hub"):
1155
+ mod_dir = os.path.join(project_root, "core", name)
1156
+ md_path = os.path.join(mod_dir, "module.md")
1157
+ if not os.path.isdir(mod_dir):
1158
+ print(f"[launcher] 致命: 核心模块 '{name}' 目录未找到: {mod_dir}")
1159
+ sys.exit(1)
1160
+ if not os.path.isfile(md_path):
1161
+ print(f"[launcher] 致命: 核心模块 '{name}' 缺少 module.md: {md_path}")
1162
+ sys.exit(1)
1163
+ # Try to parse frontmatter
1164
+ try:
1165
+ with open(md_path, "r", encoding="utf-8") as f:
1166
+ fm = _parse_frontmatter(f.read())
1167
+ if not fm:
1168
+ print(f"[launcher] 致命: 核心模块 '{name}' module.md 没有有效的 frontmatter")
1169
+ sys.exit(1)
1170
+ except Exception as e:
1171
+ print(f"[launcher] 致命: 核心模块 '{name}' module.md 解析错误: {e}")
1172
+ sys.exit(1)
574
1173
 
575
1174
  # ── API thread ──
576
1175
 
@@ -591,70 +1190,119 @@ class Launcher:
591
1190
  t = threading.Thread(target=_run, daemon=True)
592
1191
  t.start()
593
1192
 
594
- # Wait for API server to actually be ready before proceeding
595
1193
  deadline = time.time() + 5
596
1194
  while time.time() < deadline:
597
1195
  if self._api_server.started:
598
1196
  break
599
1197
  time.sleep(0.05)
600
1198
  else:
601
- print("[launcher] WARNING: API server may not be fully ready")
1199
+ print("[launcher] 警告: API 服务器可能尚未完全就绪")
602
1200
 
603
- print(f"[launcher] API server started on port {self.api_port}")
1201
+ print(f"[launcher] API 服务器已启动,端口 {self.api_port}")
1202
+
1203
+ # ── Module crash summary ──
1204
+
1205
def _print_module_crash_summary(self, name: str):
    """Print a red one-line summary of a module's most recent crash record.

    Reads the last line of
    ``<KITE_INSTANCE_DIR>/<name>/log/crashes<KITE_INSTANCE_SUFFIX>.jsonl`` and
    prints its ``exception_type`` together with the ``file``/``line`` found
    under its ``context`` key, followed by the path of the crash log.
    Complements the ``module.crash`` event: this works even if that event was
    never sent.

    Best-effort by design: a missing, empty or unparsable log prints nothing
    and no exception escapes to the caller.

    Args:
        name: Module name, used as the directory name under the instance dir.
    """
    RED = "\033[91m"
    RESET = "\033[0m"
    suffix = os.environ.get("KITE_INSTANCE_SUFFIX", "")
    crash_log = os.path.join(
        os.environ.get("KITE_INSTANCE_DIR", ""), name, "log", f"crashes{suffix}.jsonl"
    )
    if not os.path.isfile(crash_log):
        return
    try:
        with open(crash_log, "rb") as f:
            f.seek(0, os.SEEK_END)
            size = f.tell()
            if size == 0:
                return
            # Read a tail window and widen it until it holds the whole last
            # record (a newline precedes it) or the whole file has been read.
            window = 4096
            while True:
                start = max(0, size - window)
                f.seek(start)
                tail = f.read().strip()
                cut = tail.rfind(b"\n")
                if cut != -1 or start == 0:
                    break
                window *= 2

        # Split on the newline *byte* before decoding: the window may begin in
        # the middle of a multi-byte UTF-8 character, but the last line cannot.
        last = json.loads(tail[cut + 1:].decode("utf-8", errors="replace"))
        if not isinstance(last, dict):
            return
        ctx = last.get("context")
        if not isinstance(ctx, dict):
            ctx = {}  # tolerate a missing or null context
        exc_type = last.get("exception_type", "Unknown")
        file_name = ctx.get("file", "unknown")
        line_no = ctx.get("line", "?")
        print(f"[launcher] {RED}崩溃: "
              f"{exc_type} in {file_name}:{line_no}{RESET}")
        print(f"[launcher] 崩溃日志: {crash_log}")
    except Exception:
        # Deliberately silent: the summary is a console convenience and must
        # never disturb the caller that is handling the module exit.
        return
604
1234
 
605
1235
  # ── Monitor loop ──
606
1236
 
607
1237
  async def _monitor_loop(self):
608
- """Check child processes every second. Handle crashes."""
609
- MAX_FAIL = 3
610
- MAX_FAILED_MODULES = 3
611
-
612
- while not self._thread_shutdown.is_set():
1238
+ """Check child processes every second. Handle crashes.
1239
+ Uses _shutdown_event (asyncio.Event) so Ctrl+C wakes us immediately.
1240
+
1241
+ Responsibility split:
1242
+ - Core module crash → full restart (Launcher handles)
1243
+ - Watchdog crash → Launcher restarts directly (up to 3 times)
1244
+ - Other module exit → publish module.stopped event only; Watchdog decides restart
1245
+ """
1246
+ WATCHDOG_MAX_FAIL = 3
1247
+ watchdog_fail_count = 0
1248
+
1249
+ while not self._shutdown_event.is_set():
613
1250
  exited = self.process_manager.check_exited()
614
1251
 
615
1252
  for name, rc in exited:
616
- print(f"[launcher] Module '{name}' exited with code {rc}")
1253
+ print(f"[launcher] 模块 '{name}' 退出,返回码 {rc}")
1254
+ if rc != 0:
1255
+ self._print_module_crash_summary(name)
617
1256
  self._log_lifecycle("exited", name, exit_code=rc)
618
1257
  await self._publish_event("module.stopped", {
619
1258
  "module_id": name, "exit_code": rc,
1259
+ "graceful_shutdown": self._graceful_modules.get(name, False),
620
1260
  })
621
1261
  info = self.modules.get(name)
622
1262
 
623
- # Core module crash → full restart
624
- if info and info.is_core(self.project_root):
625
- print(f"[launcher] CRITICAL: core module '{name}' crashed, restarting all...")
1263
+ # 1) Core module crash → full restart
1264
+ if name in CORE_MODULE_NAMES or (info and info.is_core()):
1265
+ print(f"[launcher] 严重: 核心模块 '{name}' 崩溃,正在全部重启...")
626
1266
  self._log_lifecycle("core_crash", name, exit_code=rc)
627
1267
  await self._full_restart()
628
1268
  return
629
1269
 
630
- # Non-core: attempt restart if desired_state is "running"
631
- self._fail_counts[name] = self._fail_counts.get(name, 0) + 1
632
- count = self._fail_counts[name]
633
-
634
- if count < MAX_FAIL and self._desired_states.get(name) == "running" and info:
635
- print(f"[launcher] Restarting '{name}' (attempt {count}/{MAX_FAIL})...")
636
- await self._start_one_module(info)
637
- elif count >= MAX_FAIL:
638
- self._desired_states[name] = "stopped"
639
- self._log_lifecycle("failed", name, reason=f"exceeded {MAX_FAIL} retries")
640
- print(f"[launcher] Module '{name}' failed {MAX_FAIL} times, giving up")
1270
+ # 2) Watchdog crash Launcher restarts directly
1271
+ if name == WATCHDOG_MODULE_NAME:
1272
+ if self._system_shutting_down:
1273
+ print(f"[launcher] Watchdog 退出(系统关闭中),跳过重启")
1274
+ continue
1275
+ watchdog_fail_count += 1
1276
+ if watchdog_fail_count <= WATCHDOG_MAX_FAIL and info:
1277
+ print(f"[launcher] Watchdog 崩溃,正在重启 (第 {watchdog_fail_count}/{WATCHDOG_MAX_FAIL} 次)...")
1278
+ await self._start_one_module(info)
1279
+ else:
1280
+ self._desired_states[name] = "stopped"
1281
+ self._log_lifecycle("failed", name, reason=f"exceeded {WATCHDOG_MAX_FAIL} retries")
1282
+ print(f"[launcher] Watchdog 失败 {WATCHDOG_MAX_FAIL} 次,已放弃")
1283
+ continue
641
1284
 
642
- # Too many failed modules → exit
643
- failed_count = sum(1 for c in self._fail_counts.values() if c >= MAX_FAIL)
644
- if failed_count >= MAX_FAILED_MODULES:
645
- print(f"[launcher] {failed_count} modules permanently failed, Launcher exiting")
646
- return
1285
+ # 3) Other modules → event already published above; Watchdog decides restart
1286
+ # (no restart logic here Watchdog handles it via module.stopped event)
647
1287
 
648
1288
  if exited:
649
1289
  self.process_manager.persist_records()
650
1290
 
651
- await asyncio.sleep(1)
1291
+ # Wait 1s but wake immediately on shutdown signal
1292
+ try:
1293
+ await asyncio.wait_for(self._shutdown_event.wait(), timeout=1)
1294
+ return # shutdown requested
1295
+ except asyncio.TimeoutError:
1296
+ pass
652
1297
 
653
1298
  async def _full_restart(self):
654
- """Stop all modules, then re-run the startup sequence."""
655
- print("[launcher] Full restart: stopping all modules...")
1299
+ """Stop all modules, regenerate tokens, re-run Phase 1-4 (mechanism 10)."""
1300
+ print("[launcher] 全量重启: 正在停止所有模块...")
1301
+
1302
+ # Persist records before shutdown so cleanup_leftovers can find survivors
1303
+ self.process_manager.persist_records()
656
1304
 
657
- # Disconnect Event Hub
1305
+ # Disconnect Event Hub WS
658
1306
  if self._ws_task:
659
1307
  self._ws_task.cancel()
660
1308
  self._ws_task = None
@@ -662,79 +1310,303 @@ class Launcher:
662
1310
  self._heartbeat_task.cancel()
663
1311
  self._heartbeat_task = None
664
1312
  self._ws = None
1313
+ self._event_hub_ws_url = ""
1314
+ self._launcher_ws_token = ""
665
1315
 
666
1316
  await self._graceful_shutdown_all()
667
- self._fail_counts.clear()
1317
+
1318
+ # Cleanup any leftover processes that survived graceful shutdown.
1319
+ # Note: _graceful_shutdown_all() clears _processes/_records dicts, but
1320
+ # cleanup_leftovers() reads from processes.json (persisted above), so it can
1321
+ # still find and kill survivors.
1322
+ self.process_manager.cleanup_leftovers()
668
1323
 
669
1324
  self._module_tokens.clear()
670
1325
 
671
- print("[launcher] Full restart: re-running startup sequence...")
1326
+ # Regenerate kite_token
1327
+ self.kite_token = secrets.token_hex(32)
1328
+ self.process_manager.kite_token = self.kite_token
1329
+
1330
+ print("[launcher] 全量重启: 重新执行 Phase 1-4...")
672
1331
  try:
673
- await self._start_registry()
674
- await self._register_self()
675
- self.modules = self.module_scanner.scan()
676
- for n, info in self.modules.items():
677
- self._log_lifecycle("scanned", n, state=info.state, module_dir=info.module_dir)
678
- await self._register_module_tokens()
679
- await self._start_enabled_modules()
1332
+ await self._phase1_parallel_bootstrap()
1333
+ await self._phase3_registry_ready()
1334
+ await self._phase4_start_modules()
680
1335
  self.process_manager.persist_records()
681
- await self._connect_event_hub()
682
- print("[launcher] Full restart complete, resuming monitor loop")
1336
+ self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
1337
+ print("[launcher] 全量重启完成,恢复监控循环")
683
1338
  await self._monitor_loop()
684
1339
  except Exception as e:
685
- print(f"[launcher] Full restart failed: {e}")
1340
+ print(f"[launcher] 全量重启失败: {e}")
686
1341
 
687
1342
  # ── Shutdown ──
688
1343
 
689
1344
  def _final_cleanup(self):
690
1345
  """Called on exit — stop all processes, stop API, clear records."""
691
- print("[launcher] Shutting down...")
1346
+ try:
1347
+ print("[launcher] 正在执行最终清理...")
1348
+
1349
+ if self._ws_task:
1350
+ self._ws_task.cancel()
1351
+ if hasattr(self, '_heartbeat_task') and self._heartbeat_task:
1352
+ self._heartbeat_task.cancel()
1353
+
1354
+ # Note: _graceful_shutdown_all() already called stop_all() in _async_main finally block.
1355
+ # This is just a safety check — should normally find nothing.
1356
+ remaining = [n for n in self.process_manager._processes
1357
+ if self.process_manager.is_running(n)]
1358
+ if remaining:
1359
+ print(f"[launcher] 警告: 仍有残留进程 (不应出现): {', '.join(remaining)}")
1360
+ self.process_manager.stop_all(timeout=SHUTDOWN_TIMEOUT_BULK)
1361
+ else:
1362
+ print("[launcher] 无残留进程")
692
1363
 
693
- if self._ws_task:
694
- self._ws_task.cancel()
695
- if hasattr(self, '_heartbeat_task') and self._heartbeat_task:
696
- self._heartbeat_task.cancel()
1364
+ if self._api_server:
1365
+ self._api_server.should_exit = True
697
1366
 
698
- self.process_manager.stop_all(timeout=10)
1367
+ # Clear instance runtime files
1368
+ try:
1369
+ os.remove(self.process_manager.records_path)
1370
+ except OSError:
1371
+ pass
1372
+ except Exception as e:
1373
+ print(f"[launcher] 最终清理出错: {e}")
1374
+ finally:
1375
+ # Signal the safety-net thread that normal shutdown has completed
1376
+ self._shutdown_complete.set()
1377
+ print("[launcher] 再见。")
1378
+
1379
+ if IS_WINDOWS:
1380
+ os._exit(0)
1381
+
1382
+ # ── Startup report ──
1383
+
1384
+ async def _print_startup_report(self, total_time: float, phase_times: dict[str, float], *,
1385
+ global_instances=None, cleaned_stats: dict[str, int] | None = None):
1386
+ """Print a green startup summary with module list and timing."""
1387
+ G = "\033[32m" # green
1388
+ Y = "\033[33m" # yellow
1389
+ R = "\033[0m" # reset
1390
+ B = "\033[1;32m" # bold green
1391
+
1392
+ running = []
1393
+ exited = []
1394
+ stopped = []
1395
+ for name, info in self.modules.items():
1396
+ rec = self.process_manager.get_record(name)
1397
+ is_running = self.process_manager.is_running(name)
1398
+ if is_running and rec:
1399
+ running.append((name, info, rec))
1400
+ elif self._desired_states.get(name) == "running" and not is_running:
1401
+ # Was started but already exited (e.g. module.exiting)
1402
+ exited.append((name, info))
1403
+ else:
1404
+ stopped.append((name, info))
1405
+
1406
+ # Calculate kernel startup time (Phase 1+2+3)
1407
+ kernel_time = 0
1408
+ for phase_name in ["Phase 1+2: Registry + Event Hub (并行)", "Phase 3: Registry 事件总线"]:
1409
+ if phase_name in phase_times:
1410
+ kernel_time += phase_times[phase_name]
1411
+
1412
+ lines = [
1413
+ "",
1414
+ f"{B}{'=' * 60}",
1415
+ f" Kite 内核启动完成 耗时 {kernel_time:.2f}s",
1416
+ f" Kite 全部模块启动完成 总耗时 {total_time:.2f}s",
1417
+ f"{'=' * 60}{R}",
1418
+ ]
1419
+
1420
+ # Phase breakdown
1421
+ lines.append(f"{G} 阶段耗时:{R}")
1422
+
1423
+ # Kernel modules section
1424
+ lines.append(f"{G} 内核模块:{R}")
1425
+ for phase_name in ["Phase 1+2: Registry + Event Hub (并行)", "Phase 3: Registry 事件总线"]:
1426
+ if phase_name in phase_times:
1427
+ elapsed = phase_times[phase_name]
1428
+ lines.append(f"{G} {phase_name:<26s} {elapsed:>6.2f}s{R}")
1429
+
1430
+ # Extension modules section
1431
+ lines.append(f"{G} 扩展模块:{R}")
1432
+ if "Phase 4: Extensions" in phase_times:
1433
+ elapsed = phase_times["Phase 4: Extensions"]
1434
+ lines.append(f"{G} {'Phase 4: Extensions':<26s} {elapsed:>6.2f}s{R}")
1435
+
1436
+ # Sort running modules by ready time
1437
+ running_sorted = sorted(running, key=lambda x: self._ready_times.get(x[0], float('inf')))
1438
+
1439
+ # Running modules with ready time and elapsed from Kite start
1440
+ DIM = "\033[90m"
1441
+ lines.append(f"{G} 运行中 ({len(running)}):{R}")
1442
+
1443
+ # CJK-aware display width helpers
1444
+ def _dw(s):
1445
+ """Display width: CJK chars count as 2, others as 1."""
1446
+ w = 0
1447
+ for c in str(s):
1448
+ w += 2 if '\u4e00' <= c <= '\u9fff' or '\u3000' <= c <= '\u303f' or '\uff00' <= c <= '\uffef' else 1
1449
+ return w
1450
+
1451
+ def _rpad(s, width):
1452
+ """Left-align s in a field of given display width."""
1453
+ return str(s) + ' ' * max(0, width - _dw(s))
1454
+
1455
+ def _lpad(s, width):
1456
+ """Right-align s in a field of given display width."""
1457
+ return ' ' * max(0, width - _dw(s)) + str(s)
1458
+
1459
+ # Column definitions: (header, align, min_width)
1460
+ headers = ['模块', 'PID', '启动耗时', '进程启动时长', '类型']
1461
+ aligns = ['left', 'right', 'right', 'right', 'left'] # alignment per column
1462
+
1463
+ # Build data rows first to calculate column widths
1464
+ rows = []
1465
+ for name, info, rec in running_sorted:
1466
+ label = info.display_name or name
1467
+ ready_t = self._ready_times.get(name)
1468
+ time_str = f"{ready_t:.2f}s" if ready_t is not None else "—"
1469
+ if ready_t is not None and hasattr(self, '_start_unix'):
1470
+ elapsed_from_start = (rec.started_at + ready_t) - self._start_unix
1471
+ es_str = f"{elapsed_from_start:.2f}s"
1472
+ else:
1473
+ es_str = "—"
1474
+ rows.append([label, str(rec.pid), time_str, es_str, f"[{info.type}]"])
1475
+
1476
+ # Calculate column widths: max of header and all data display widths
1477
+ col_widths = [_dw(h) for h in headers]
1478
+ for row in rows:
1479
+ for i, cell in enumerate(row):
1480
+ col_widths[i] = max(col_widths[i], _dw(cell))
1481
+
1482
+ # Render header
1483
+ hdr_parts = []
1484
+ for i, h in enumerate(headers):
1485
+ if aligns[i] == 'left':
1486
+ hdr_parts.append(_rpad(h, col_widths[i]))
1487
+ else:
1488
+ hdr_parts.append(_lpad(h, col_widths[i]))
1489
+ lines.append(f"{DIM} {' '.join(hdr_parts)}{R}")
1490
+
1491
+ # Render data rows
1492
+ for row in rows:
1493
+ parts = []
1494
+ for i, cell in enumerate(row):
1495
+ if aligns[i] == 'left':
1496
+ parts.append(_rpad(cell, col_widths[i]))
1497
+ else:
1498
+ parts.append(_lpad(cell, col_widths[i]))
1499
+ lines.append(f"{G} ✓ {' '.join(parts)}{R}")
1500
+
1501
+ # Exited modules (started but already quit)
1502
+ if exited:
1503
+ lines.append(f"{Y} 已退出 ({len(exited)}):{R}")
1504
+ for name, info in exited:
1505
+ label = info.display_name or name
1506
+ reason = self._exit_reasons.get(name, "")
1507
+ reason_str = f": {reason}" if reason else ""
1508
+ lines.append(f"{Y} ↗ {label:<20s} (主动退出{reason_str}){R}")
1509
+
1510
+ # Stopped modules
1511
+ if stopped:
1512
+ lines.append(f"{G} 未启动 ({len(stopped)}):{R}")
1513
+ for name, info in stopped:
1514
+ label = info.display_name or name
1515
+ lines.append(f"{G} - {label:<20s} ({info.state}){R}")
1516
+
1517
+ lines.append(f"{G} Launcher API: http://127.0.0.1:{self.api_port} 实例: {self.instance_id}{R}")
1518
+
1519
+ # Query Registry for web module's access URL
1520
+ web_url = await self._get_web_url()
1521
+ if web_url:
1522
+ lines.append(f"{B} Web 管理后台: {web_url}{R}")
1523
+
1524
+ # Instance info
1525
+ instances = self.process_manager.get_alive_instances()
1526
+ inst_num = self.process_manager.instance_num
1527
+ suffix_display = self.process_manager.instance_suffix or "(无)"
1528
+ inst_dir = os.environ.get("KITE_INSTANCE_DIR", "")
1529
+ cwd = os.environ.get("KITE_CWD", "")
1530
+ debug_flag = " [DEBUG]" if os.environ.get("KITE_DEBUG") == "1" else ""
1531
+ lines.append(f"{G} 当前实例: #{inst_num} 后缀: {suffix_display} PID: {os.getpid()}{debug_flag}{R}")
1532
+ lines.append(f"{G} 实例目录: {inst_dir}{R}")
1533
+ lines.append(f"{G} 工作目录: {cwd}{R}")
1534
+ if len(instances) > 1:
1535
+ lines.append(f"{G} 所有实例:{R}")
1536
+ for i in instances:
1537
+ s = "" if i["num"] == 1 else f"~{i['num']}"
1538
+ debug_tag = " [DEBUG]" if i.get("debug", False) else ""
1539
+ current_tag = " (当前)" if i["is_self"] else ""
1540
+ lines.append(f"{G} #{i['num']} PID {i['launcher_pid']} "
1541
+ f"模块数 {i['module_count']} (processes{s}.json){debug_tag}{current_tag}{R}")
1542
+
1543
+ # Cross-directory instances from other projects
1544
+ if global_instances:
1545
+ my_inst_basename = os.path.basename(os.environ.get("KITE_INSTANCE_DIR", ""))
1546
+ other_instances = [i for i in global_instances
1547
+ if not i["is_self"] and i["instance_dir"] != my_inst_basename]
1548
+ if other_instances:
1549
+ lines.append(f"{G} 其他项目实例:{R}")
1550
+ for i in other_instances:
1551
+ debug_tag = " [DEBUG]" if i.get("debug", False) else ""
1552
+ cwd_display = f" {i['cwd']}" if i["cwd"] else ""
1553
+ lines.append(
1554
+ f"{G} {i['instance_dir']:<20s} "
1555
+ f"#{i['num']} PID {i['launcher_pid']} "
1556
+ f"模块数 {i['module_count']}"
1557
+ f"{cwd_display}{debug_tag}{R}"
1558
+ )
699
1559
 
700
- if self._api_server:
701
- self._api_server.should_exit = True
1560
+ if cleaned_stats:
1561
+ total = sum(cleaned_stats.values())
1562
+ if len(cleaned_stats) == 1:
1563
+ inst, count = next(iter(cleaned_stats.items()))
1564
+ lines.append(f"{Y} 已清理残留进程: {inst} ({count} 个){R}")
1565
+ else:
1566
+ lines.append(f"{Y} 已清理残留进程 (共 {total} 个):{R}")
1567
+ for inst, count in cleaned_stats.items():
1568
+ lines.append(f"{Y} {inst}: {count} 个{R}")
702
1569
 
703
- # Clear instance runtime files
704
- self.process_manager._write_records_file([])
705
- try:
706
- os.remove(self.process_manager.records_path)
707
- except OSError:
708
- pass
709
- from core.data_dir import get_registry_data_dir
710
- port_file = os.path.join(get_registry_data_dir(), f"port_{self.instance_id}.txt")
1570
+ lines.append(f"{B}{'=' * 60}{R}")
1571
+ lines.append("")
1572
+
1573
+ print("\n".join(lines))
1574
+
1575
async def _get_web_url(self) -> str:
    """Look up the web module's ``api_endpoint`` in the Registry.

    Returns:
        The endpoint with ``127.0.0.1`` shown as ``localhost``, or an empty
        string when the Registry does not answer 200, the value is not a
        non-empty string, or the request fails for any reason.
    """
    try:
        http = self._get_http()
        reply = await http.get(
            f"http://127.0.0.1:{self.registry_port}/get/web.api_endpoint",
            headers={"Authorization": f"Bearer {self.kite_token}"},
            timeout=3,
        )
        if reply.status_code != 200:
            return ""
        endpoint = reply.json()
        if isinstance(endpoint, str) and endpoint:
            # "localhost" reads friendlier than the numeric loopback address.
            return endpoint.replace("://127.0.0.1:", "://localhost:")
    except Exception:
        pass
    return ""
719
1592
 
720
1593
  # ── Utilities ──
721
1594
 
722
1595
  def _load_discovery(self) -> dict | None:
723
1596
  """Read discovery config from launcher's own module.md."""
724
- md_path = os.path.join(self.project_root, "core", "launcher", "module.md")
1597
+ md_path = os.path.join(os.environ["KITE_PROJECT"], "core", "launcher", "module.md")
725
1598
  try:
726
1599
  with open(md_path, "r", encoding="utf-8") as f:
727
1600
  fm = _parse_frontmatter(f.read())
728
1601
  discovery = fm.get("discovery")
729
1602
  if isinstance(discovery, dict) and discovery:
730
- print(f"[launcher] Discovery sources: {', '.join(discovery.keys())}")
731
1603
  return discovery
732
1604
  except Exception as e:
733
- print(f"[launcher] WARNING: failed to read discovery config: {e}")
1605
+ print(f"[launcher] 警告: 读取发现配置失败: {e}")
734
1606
  return None
735
1607
 
736
1608
  def _log_lifecycle(self, event: str, module: str, **extra):
737
- """Append one JSONL line to core/launcher/data/lifecycle.jsonl."""
1609
+ """Append one JSONL line to lifecycle.jsonl."""
738
1610
  from datetime import datetime, timezone
739
1611
  record = {"ts": datetime.now(timezone.utc).isoformat(), "event": event, "module": module}
740
1612
  record.update(extra)
@@ -757,12 +1629,29 @@ class Launcher:
757
1629
 
758
1630
  def _create_api_app(self) -> FastAPI:
759
1631
  """Create the FastAPI app with Launcher management routes."""
1632
+ from fastapi import Request, HTTPException
760
1633
  app = FastAPI(title="Kite Launcher", docs_url=None, redoc_url=None)
761
- launcher = self # closure reference
1634
+ launcher = self
1635
+
1636
def _require_auth(request: Request):
    """Allow a request only from loopback and only with the launcher's Bearer token.

    Args:
        request: The incoming request; its peer address and ``Authorization``
            header are inspected.

    Raises:
        HTTPException: 403 when the peer address is not a loopback host;
            401 when the ``Authorization`` header is missing, is not a Bearer
            credential, or carries a token other than ``launcher.kite_token``.
    """
    # Address allow-list: IPv4/IPv6 loopback, plus the literal "localhost".
    client_host = request.client.host if request.client else None
    if client_host not in ("127.0.0.1", "::1", "localhost"):
        raise HTTPException(status_code=403, detail="Access denied: only localhost allowed")

    # Bearer token verification.
    auth = request.headers.get("Authorization", "")
    if not auth.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Missing or invalid Authorization header")
    token = auth[7:].strip()
    # Constant-time comparison so response timing does not leak how much of a
    # guessed token matched. Compared as bytes because compare_digest rejects
    # non-ASCII str, which would turn a bad header into a server error.
    if not secrets.compare_digest(token.encode("utf-8"), launcher.kite_token.encode("utf-8")):
        raise HTTPException(status_code=401, detail="Invalid token")
762
1650
 
763
1651
  @app.get("/launcher/modules")
764
- async def list_modules():
765
- """List all modules and their current status (three-layer state model)."""
1652
+ async def list_modules(request: Request):
1653
+ """List all modules and their current status."""
1654
+ _require_auth(request)
766
1655
  result = []
767
1656
  for name, info in launcher.modules.items():
768
1657
  running = launcher.process_manager.is_running(name)
@@ -780,39 +1669,32 @@ class Launcher:
780
1669
  return result
781
1670
 
782
1671
  @app.post("/launcher/modules/{name}/start")
783
- async def start_module(name: str):
784
- """Start a module by name. Generates token and passes boot_info via stdin."""
1672
+ async def start_module(name: str, request: Request):
1673
+ """Start a module by name."""
1674
+ _require_auth(request)
785
1675
  info = launcher.modules.get(name)
786
1676
  if not info:
787
1677
  raise HTTPException(404, f"Module '{name}' not found")
788
1678
  if info.state == "disabled":
789
1679
  raise HTTPException(403, f"Module '{name}' is disabled")
790
1680
 
791
- # Generate token if not already present
792
1681
  if name not in launcher._module_tokens:
793
1682
  launcher._module_tokens[name] = secrets.token_hex(32)
794
- # Register the new token to Registry
795
1683
  try:
796
- async with httpx.AsyncClient() as client:
797
- await client.post(
798
- f"http://127.0.0.1:{launcher.registry_port}/tokens",
799
- json={name: launcher._module_tokens[name]},
800
- headers={"Authorization": f"Bearer {launcher.kite_token}"},
801
- timeout=5,
802
- )
1684
+ client = launcher._get_http()
1685
+ await client.post(
1686
+ f"http://127.0.0.1:{launcher.registry_port}/tokens",
1687
+ json={name: launcher._module_tokens[name]},
1688
+ headers={"Authorization": f"Bearer {launcher.kite_token}"},
1689
+ )
803
1690
  except Exception as e:
804
- print(f"[launcher] WARNING: failed to register token for {name}: {e}")
1691
+ print(f"[launcher] 警告: 注册 {name} 的令牌失败: {e}")
805
1692
 
806
1693
  token = launcher._module_tokens[name]
807
- boot_info = {
808
- "token": token,
809
- "registry_port": launcher.registry_port,
810
- "preferred_port": info.preferred_port,
811
- }
1694
+ boot_info = {"token": token}
812
1695
  ok = launcher.process_manager.start_module(info, boot_info=boot_info)
813
1696
  if ok:
814
1697
  launcher._desired_states[name] = "running"
815
- launcher._fail_counts.pop(name, None)
816
1698
  launcher.process_manager.persist_records()
817
1699
  rec = launcher.process_manager.get_record(name)
818
1700
  launcher._log_lifecycle("started", name, pid=rec.pid if rec else None, via="api")
@@ -822,8 +1704,9 @@ class Launcher:
822
1704
  raise HTTPException(500, f"Failed to start '{name}'")
823
1705
 
824
1706
  @app.post("/launcher/modules/{name}/stop")
825
- async def stop_module(name: str, body: dict = None):
826
- """Stop a module with graceful shutdown. Accepts optional reason."""
1707
+ async def stop_module(name: str, request: Request, body: dict = None):
1708
+ """Stop a module with graceful shutdown."""
1709
+ _require_auth(request)
827
1710
  info = launcher.modules.get(name)
828
1711
  if not info:
829
1712
  raise HTTPException(404, f"Module '{name}' not found")
@@ -834,8 +1717,9 @@ class Launcher:
834
1717
  return {"status": "stopped", "name": name}
835
1718
 
836
1719
  @app.post("/launcher/modules/{name}/restart")
837
- async def restart_module(name: str, body: dict = None):
1720
+ async def restart_module(name: str, request: Request, body: dict = None):
838
1721
  """Restart a module (stop + start)."""
1722
+ _require_auth(request)
839
1723
  info = launcher.modules.get(name)
840
1724
  if not info:
841
1725
  raise HTTPException(404, f"Module '{name}' not found")
@@ -843,28 +1727,21 @@ class Launcher:
843
1727
  raise HTTPException(403, f"Module '{name}' is disabled")
844
1728
  reason = (body or {}).get("reason", "restart")
845
1729
  await launcher._graceful_stop(name, reason)
846
- # Re-generate token
847
1730
  launcher._module_tokens[name] = secrets.token_hex(32)
848
1731
  try:
849
- async with httpx.AsyncClient() as client:
850
- await client.post(
851
- f"http://127.0.0.1:{launcher.registry_port}/tokens",
852
- json={name: launcher._module_tokens[name]},
853
- headers={"Authorization": f"Bearer {launcher.kite_token}"},
854
- timeout=5,
855
- )
1732
+ client = launcher._get_http()
1733
+ await client.post(
1734
+ f"http://127.0.0.1:{launcher.registry_port}/tokens",
1735
+ json={name: launcher._module_tokens[name]},
1736
+ headers={"Authorization": f"Bearer {launcher.kite_token}"},
1737
+ )
856
1738
  except Exception:
857
1739
  pass
858
1740
  token = launcher._module_tokens[name]
859
- boot_info = {
860
- "token": token,
861
- "registry_port": launcher.registry_port,
862
- "preferred_port": info.preferred_port,
863
- }
1741
+ boot_info = {"token": token}
864
1742
  ok = launcher.process_manager.start_module(info, boot_info=boot_info)
865
1743
  if ok:
866
1744
  launcher._desired_states[name] = "running"
867
- launcher._fail_counts.pop(name, None)
868
1745
  launcher.process_manager.persist_records()
869
1746
  rec = launcher.process_manager.get_record(name)
870
1747
  launcher._log_lifecycle("started", name, pid=rec.pid if rec else None, via="restart_api")
@@ -874,8 +1751,9 @@ class Launcher:
874
1751
  raise HTTPException(500, f"Failed to restart '{name}'")
875
1752
 
876
1753
  @app.post("/launcher/rescan")
877
- async def rescan_modules():
1754
+ async def rescan_modules(request: Request):
878
1755
  """Rescan module directories for new/removed modules."""
1756
+ _require_auth(request)
879
1757
  old_names = set(launcher.modules.keys())
880
1758
  launcher.modules = launcher.module_scanner.scan()
881
1759
  new_names = set(launcher.modules.keys())
@@ -884,31 +1762,37 @@ class Launcher:
884
1762
  for name in added:
885
1763
  info = launcher.modules[name]
886
1764
  launcher._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
887
- # Initialize desired_state for new modules
888
1765
  for name in added:
889
1766
  info = launcher.modules[name]
890
1767
  launcher._desired_states[name] = "running" if info.state == "enabled" else "stopped"
891
- # Register tokens for new modules
892
1768
  if added:
893
1769
  new_tokens = {}
894
1770
  for name in added:
895
1771
  launcher._module_tokens[name] = secrets.token_hex(32)
896
1772
  new_tokens[name] = launcher._module_tokens[name]
897
1773
  try:
898
- async with httpx.AsyncClient() as client:
899
- await client.post(
900
- f"http://127.0.0.1:{launcher.registry_port}/tokens",
901
- json=new_tokens,
902
- headers={"Authorization": f"Bearer {launcher.kite_token}"},
903
- timeout=5,
904
- )
1774
+ client = launcher._get_http()
1775
+ await client.post(
1776
+ f"http://127.0.0.1:{launcher.registry_port}/tokens",
1777
+ json=new_tokens,
1778
+ headers={"Authorization": f"Bearer {launcher.kite_token}"},
1779
+ )
905
1780
  except Exception:
906
1781
  pass
907
1782
  return {"added": added, "removed": removed, "total": len(launcher.modules)}
908
1783
 
1784
@app.post("/launcher/shutdown")
async def shutdown_launcher(request: Request, body: dict = None):
    """Request shutdown of the whole Kite system (same effect as Ctrl+C).

    The optional JSON body may carry a ``reason``; it defaults to
    ``"api_request"`` and is echoed back in the response.
    """
    _require_auth(request)
    payload = body if body else {}
    reason = payload.get("reason", "api_request")
    launcher._request_shutdown(f"API shutdown request: {reason}")
    return {"status": "shutting_down", "reason": reason}
1791
+
909
1792
  @app.put("/launcher/modules/{name}/state")
910
- async def update_state(name: str, body: dict):
1793
+ async def update_state(name: str, request: Request, body: dict):
911
1794
  """Update module state (enabled/manual/disabled). Writes to module.md."""
1795
+ _require_auth(request)
912
1796
  info = launcher.modules.get(name)
913
1797
  if not info:
914
1798
  raise HTTPException(404, f"Module '{name}' not found")
@@ -917,14 +1801,12 @@ class Launcher:
917
1801
  if new_state not in ("enabled", "manual", "disabled"):
918
1802
  raise HTTPException(400, "state must be enabled, manual, or disabled")
919
1803
 
920
- # Core modules cannot be disabled
921
- if info.is_core(launcher.project_root) and new_state == "disabled":
1804
+ if info.is_core() and new_state == "disabled":
922
1805
  raise HTTPException(403, "Core modules cannot be disabled")
923
1806
 
924
1807
  old_state = info.state
925
1808
  info.state = new_state
926
1809
 
927
- # Update desired_state to match new config_state
928
1810
  if new_state == "enabled":
929
1811
  launcher._desired_states[name] = "running"
930
1812
  else:
@@ -956,7 +1838,6 @@ def _update_module_md_state(module_dir: str, new_state: str):
956
1838
  with open(md_path, "r", encoding="utf-8") as f:
957
1839
  content = f.read()
958
1840
 
959
- # Replace state: xxx in frontmatter
960
1841
  updated = re.sub(
961
1842
  r'^(state:\s*)(\S+)',
962
1843
  rf'\g<1>{new_state}',
@@ -968,4 +1849,4 @@ def _update_module_md_state(module_dir: str, new_state: str):
968
1849
  with open(md_path, "w", encoding="utf-8") as f:
969
1850
  f.write(updated)
970
1851
  except Exception as e:
971
- print(f"[launcher] WARNING: failed to update module.md state: {e}")
1852
+ print(f"[launcher] 警告: 更新 module.md 状态失败: {e}")