@agentunion/kite 1.0.6 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/cli.js +127 -25
  2. package/core/event_hub/entry.py +105 -61
  3. package/core/event_hub/module.md +0 -1
  4. package/core/event_hub/server.py +96 -28
  5. package/core/launcher/entry.py +477 -290
  6. package/core/launcher/module_scanner.py +10 -9
  7. package/core/launcher/process_manager.py +120 -96
  8. package/core/registry/entry.py +66 -30
  9. package/core/registry/server.py +47 -14
  10. package/core/registry/store.py +6 -1
  11. package/{core → extensions}/event_hub_bench/entry.py +17 -9
  12. package/{core → extensions}/event_hub_bench/module.md +2 -1
  13. package/extensions/services/watchdog/entry.py +11 -7
  14. package/extensions/services/watchdog/server.py +1 -1
  15. package/main.py +204 -4
  16. package/package.json +11 -2
  17. package/core/__pycache__/__init__.cpython-313.pyc +0 -0
  18. package/core/__pycache__/data_dir.cpython-313.pyc +0 -0
  19. package/core/data_dir.py +0 -62
  20. package/core/event_hub/__pycache__/__init__.cpython-313.pyc +0 -0
  21. package/core/event_hub/__pycache__/bench.cpython-313.pyc +0 -0
  22. package/core/event_hub/__pycache__/bench_perf.cpython-313.pyc +0 -0
  23. package/core/event_hub/__pycache__/dedup.cpython-313.pyc +0 -0
  24. package/core/event_hub/__pycache__/entry.cpython-313.pyc +0 -0
  25. package/core/event_hub/__pycache__/hub.cpython-313.pyc +0 -0
  26. package/core/event_hub/__pycache__/router.cpython-313.pyc +0 -0
  27. package/core/event_hub/__pycache__/server.cpython-313.pyc +0 -0
  28. package/core/event_hub/bench_results/.gitkeep +0 -0
  29. package/core/event_hub/bench_results/2026-02-28_13-26-48.json +0 -51
  30. package/core/event_hub/bench_results/2026-02-28_13-44-45.json +0 -51
  31. package/core/event_hub/bench_results/2026-02-28_13-45-39.json +0 -51
  32. package/core/launcher/__pycache__/__init__.cpython-313.pyc +0 -0
  33. package/core/launcher/__pycache__/entry.cpython-313.pyc +0 -0
  34. package/core/launcher/__pycache__/module_scanner.cpython-313.pyc +0 -0
  35. package/core/launcher/__pycache__/process_manager.cpython-313.pyc +0 -0
  36. package/core/launcher/data/log/lifecycle.jsonl +0 -1158
  37. package/core/launcher/data/token.txt +0 -1
  38. package/core/registry/__pycache__/__init__.cpython-313.pyc +0 -0
  39. package/core/registry/__pycache__/entry.cpython-313.pyc +0 -0
  40. package/core/registry/__pycache__/server.cpython-313.pyc +0 -0
  41. package/core/registry/__pycache__/store.cpython-313.pyc +0 -0
  42. package/core/registry/data/port.txt +0 -1
  43. package/core/registry/data/port_484.txt +0 -1
  44. package/extensions/__pycache__/__init__.cpython-313.pyc +0 -0
  45. package/extensions/services/__pycache__/__init__.cpython-313.pyc +0 -0
  46. package/extensions/services/watchdog/__pycache__/__init__.cpython-313.pyc +0 -0
  47. package/extensions/services/watchdog/__pycache__/entry.cpython-313.pyc +0 -0
  48. package/extensions/services/watchdog/__pycache__/monitor.cpython-313.pyc +0 -0
  49. package/extensions/services/watchdog/__pycache__/server.cpython-313.pyc +0 -0
@@ -4,8 +4,14 @@ Launcher — the core of Kite. Manages module lifecycle, exposes API, monitors p
4
4
  Thread model:
5
5
  - Main thread: asyncio event loop (process management + monitor loop)
6
6
  - API thread: independent thread running uvicorn + FastAPI
7
- - stdout threads: one daemon thread per child process
7
+ - stdout threads: one daemon thread per child process (ProcessManager)
8
8
  - (Windows) keyboard listener thread: polls for 'q' key
9
+
10
+ 4-Phase startup:
11
+ Phase 1: Registry → stdout port → KITE_REGISTRY_PORT → API → register self + tokens
12
+ Phase 2: Event Hub → stdin launcher_ws_token → stdout ws_endpoint → WS connect → module.ready
13
+ Phase 3: Event Hub registers to Registry → Registry connects to Event Hub WS → Registry module.ready
14
+ Phase 4: start remaining enabled modules in topo order
9
15
  """
10
16
 
11
17
  import asyncio
@@ -22,25 +28,35 @@ import httpx
22
28
  import uvicorn
23
29
  import websockets
24
30
  from fastapi import FastAPI, HTTPException
25
- from fastapi.responses import JSONResponse
26
31
 
27
- from .module_scanner import ModuleScanner, ModuleInfo, _parse_frontmatter
32
+ from .module_scanner import ModuleScanner, ModuleInfo, LaunchConfig, _parse_frontmatter
28
33
  from .process_manager import ProcessManager
29
- from core.data_dir import get_launcher_data_dir
30
34
 
31
35
  IS_WINDOWS = sys.platform == "win32"
32
36
 
37
+ # Core module names that are started in Phase 1-2 (not Phase 4)
38
+ CORE_MODULE_NAMES = {"registry", "event_hub"}
39
+
33
40
 
34
41
  class Launcher:
35
42
  """Kite system entry point. Starts Registry, manages modules, exposes API."""
36
43
 
37
44
  def __init__(self, kite_token: str):
38
45
  self.kite_token = kite_token
39
- self.project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
40
46
  self.instance_id = str(os.getpid())
41
- self.process_manager = ProcessManager(self.project_root, kite_token, self.instance_id)
47
+ os.environ["KITE_INSTANCE"] = self.instance_id
48
+
49
+ # Resolve instance workspace (must happen before ProcessManager init)
50
+ self._resolve_instance_dir()
51
+ os.environ["KITE_MODULE_DATA"] = os.path.join(
52
+ os.environ["KITE_INSTANCE_DIR"], "launcher",
53
+ )
54
+
55
+ self.process_manager = ProcessManager(
56
+ kite_token, self.instance_id,
57
+ on_kite_message=self._on_kite_message,
58
+ )
42
59
  self.module_scanner = ModuleScanner(
43
- self.project_root,
44
60
  discovery=self._load_discovery(),
45
61
  )
46
62
 
@@ -55,30 +71,122 @@ class Launcher:
55
71
  self._module_tokens: dict[str, str] = {} # module_name -> per-module token
56
72
 
57
73
  # Three-layer state model: desired_state per module
58
- # Initialized from config_state: enabled→running, manual→stopped, disabled→stopped
59
74
  self._desired_states: dict[str, str] = {} # module_name -> "running" | "stopped"
60
75
 
61
76
  # Event Hub WebSocket client
62
77
  self._event_hub_ws_url: str = ""
78
+ self._launcher_ws_token: str = ""
63
79
  self._ws: object | None = None
64
80
  self._ws_task: asyncio.Task | None = None
65
81
  self._loop: asyncio.AbstractEventLoop | None = None
66
82
 
67
83
  # Event waiters: {event_key: (asyncio.Event, data_dict)}
68
- # event_key format: "event_type:module_id"
69
84
  self._event_waiters: dict[str, tuple[asyncio.Event, dict]] = {}
70
85
 
86
+ # Kite stdout message waiters: {waiter_key: (threading.Event, data_dict)}
87
+ # Used by ProcessManager stdout callback (cross-thread)
88
+ self._msg_waiters: dict[str, tuple[threading.Event, dict]] = {}
89
+
71
90
  self._lifecycle_log = os.path.join(
72
- get_launcher_data_dir(), "lifecycle.jsonl",
91
+ os.environ["KITE_INSTANCE_DIR"], "launcher", "lifecycle.jsonl",
73
92
  )
74
93
  self._app = self._create_api_app()
75
94
 
95
+ # ── Instance workspace resolution ──
96
+
97
+ @staticmethod
98
+ def _resolve_instance_dir():
99
+ """Resolve KITE_INSTANCE_DIR from KITE_WORKSPACE + KITE_CWD.
100
+ Algorithm: take CWD basename, find matching dir in workspace via .cwd file,
101
+ or create new one. Sets KITE_INSTANCE_DIR env var.
102
+ """
103
+ if os.environ.get("KITE_INSTANCE_DIR"):
104
+ return # already set (e.g. by tests or parent)
105
+
106
+ cwd = os.environ.get("KITE_CWD", os.getcwd())
107
+ workspace = os.environ.get("KITE_WORKSPACE", "")
108
+ if not workspace:
109
+ home = os.environ.get("HOME") or os.environ.get("USERPROFILE") or os.path.expanduser("~")
110
+ workspace = os.path.join(home, ".kite", "workspace")
111
+ os.environ["KITE_WORKSPACE"] = workspace
112
+
113
+ basename = os.path.basename(cwd.rstrip(os.sep)) or "default"
114
+ suffix = 0
115
+
116
+ while True:
117
+ name = basename if suffix == 0 else f"{basename}~{suffix}"
118
+ candidate = os.path.join(workspace, name)
119
+ cwd_file = os.path.join(candidate, ".cwd")
120
+
121
+ if not os.path.exists(candidate):
122
+ # Empty slot — create new workspace
123
+ os.makedirs(candidate, exist_ok=True)
124
+ with open(cwd_file, "w", encoding="utf-8") as f:
125
+ f.write(cwd)
126
+ os.environ["KITE_INSTANCE_DIR"] = candidate
127
+ print(f"[launcher] 实例工作区已创建: {candidate}")
128
+ return
129
+
130
+ if os.path.isfile(cwd_file):
131
+ try:
132
+ with open(cwd_file, "r", encoding="utf-8") as f:
133
+ if f.read().strip() == cwd:
134
+ os.environ["KITE_INSTANCE_DIR"] = candidate
135
+ print(f"[launcher] 实例工作区已找到: {candidate}")
136
+ return
137
+ except Exception:
138
+ pass
139
+
140
+ suffix += 1
141
+
142
+ # ── Kite stdout message callback ──
143
+
144
+ def _on_kite_message(self, module_name: str, msg: dict):
145
+ """Called by ProcessManager stdout reader thread when a kite message is detected.
146
+ Thread-safe: only touches _msg_waiters (dict + threading.Event).
147
+ """
148
+ kite_type = msg.get("kite", "")
149
+ key = f"{module_name}:{kite_type}"
150
+ waiter = self._msg_waiters.get(key)
151
+ if waiter:
152
+ waiter[1].update(msg)
153
+ waiter[0].set()
154
+
155
+ async def _wait_kite_message(self, module_name: str, kite_type: str,
156
+ timeout: float) -> dict | None:
157
+ """Wait for a kite stdout message from a module. Returns msg dict or None on timeout.
158
+ Checks shutdown flag every 0.5s so Ctrl+C is responsive even during Phase 1-2 waits.
159
+ """
160
+ key = f"{module_name}:{kite_type}"
161
+ evt = threading.Event()
162
+ data = {}
163
+ self._msg_waiters[key] = (evt, data)
164
+ shutdown = self._thread_shutdown
165
+ try:
166
+ def _wait():
167
+ deadline = time.monotonic() + timeout
168
+ while time.monotonic() < deadline:
169
+ if evt.wait(timeout=0.5):
170
+ return True
171
+ if shutdown.is_set():
172
+ return False
173
+ return False
174
+ got = await asyncio.get_running_loop().run_in_executor(None, _wait)
175
+ return data if got else None
176
+ finally:
177
+ self._msg_waiters.pop(key, None)
178
+
76
179
  # ── Public entry ──
77
180
 
78
181
  def run(self):
79
182
  """Synchronous entry point. Sets up signals, runs the async main loop."""
80
- print("[launcher] Kite starting...")
81
- print(f"[launcher] Project root: {self.project_root}")
183
+ print("[launcher] Kite 启动中...")
184
+ print("[launcher] ── 环境变量 ──")
185
+ for key in sorted(k for k in os.environ if k.startswith("KITE_")):
186
+ print(f"[launcher] {key} = {os.environ[key]}")
187
+ print(f"[launcher] PID = {os.getpid()}")
188
+ print(f"[launcher] PYTHON = {sys.executable}")
189
+ print(f"[launcher] PLATFORM = {sys.platform}")
82
190
 
83
191
  if IS_WINDOWS:
84
192
  self._setup_windows_exit()
@@ -92,113 +200,303 @@ class Launcher:
92
200
  finally:
93
201
  self._final_cleanup()
94
202
 
203
+ def _request_shutdown(self, reason: str = ""):
204
+ """Request graceful shutdown. Thread-safe — can be called from signal handler or any thread."""
205
+ if self._thread_shutdown.is_set():
206
+ return # already shutting down
207
+ print(f"\n[launcher] {reason or '收到关闭请求'}")
208
+ self._thread_shutdown.set()
209
+ # Wake up asyncio event loop immediately (so _monitor_loop / wait_for exits)
210
+ loop = self._loop
211
+ if loop and not loop.is_closed():
212
+ try:
213
+ loop.call_soon_threadsafe(self._shutdown_event.set)
214
+ except RuntimeError:
215
+ pass
216
+ # Safety net: force exit after 15s no matter what
217
+ def _force():
218
+ time.sleep(15)
219
+ os._exit(1)
220
+ threading.Thread(target=_force, daemon=True).start()
221
+
95
222
  def _setup_unix_signals(self):
96
223
  """Register SIGTERM/SIGINT handlers on Linux/macOS."""
97
224
  def _handler(signum, frame):
98
- print(f"\n[launcher] Received signal {signum}, shutting down...")
99
- self._thread_shutdown.set()
225
+ self._request_shutdown(f"收到信号 {signum},正在关闭...")
100
226
  signal.signal(signal.SIGTERM, _handler)
101
227
  signal.signal(signal.SIGINT, _handler)
102
228
 
103
229
  def _setup_windows_exit(self):
104
- """Start a daemon thread that listens for 'q' or Ctrl+C on Windows."""
230
+ """SetConsoleCtrlHandler for Ctrl+C + daemon thread for 'q' key.
231
+
232
+ Why not signal.signal(SIGINT)?
233
+ Python's signal delivery requires the main thread to be executing bytecode.
234
+ When the main thread is blocked in C code (asyncio ProactorEventLoop →
235
+ GetQueuedCompletionStatus), SIGINT is never delivered.
236
+ SetConsoleCtrlHandler runs its callback in a separate OS thread, so it
237
+ always works regardless of what the main thread is doing.
238
+ """
239
+ import ctypes
240
+
241
+ @ctypes.WINFUNCTYPE(ctypes.c_int, ctypes.c_uint)
242
+ def _ctrl_handler(ctrl_type):
243
+ if ctrl_type in (0, 1): # CTRL_C_EVENT, CTRL_BREAK_EVENT
244
+ self._request_shutdown("收到 Ctrl+C,正在关闭...")
245
+ return 1 # handled — prevent default (which kills the process)
246
+ return 0
247
+
248
+ # prevent GC of the C callback
249
+ self._ctrl_handler_ref = _ctrl_handler
250
+ ctypes.windll.kernel32.SetConsoleCtrlHandler(_ctrl_handler, 1)
251
+
252
+ # 'q' key: handle via msvcrt polling
105
253
  def _listen():
106
254
  import msvcrt
107
255
  while not self._thread_shutdown.is_set():
108
256
  if msvcrt.kbhit():
109
257
  ch = msvcrt.getch()
110
- if ch in (b'q', b'Q', b'\x03'): # q or Ctrl+C
111
- print("\n[launcher] Exit requested, shutting down...")
112
- self._thread_shutdown.set()
258
+ if ch in (b'q', b'Q'):
259
+ self._request_shutdown("收到退出请求,正在关闭...")
113
260
  return
114
261
  time.sleep(0.1)
115
- t = threading.Thread(target=_listen, daemon=True)
116
- t.start()
262
+ threading.Thread(target=_listen, daemon=True).start()
117
263
 
118
- # ── Async main ──
264
+ # ── Async main (4-Phase startup) ──
119
265
 
120
266
  async def _async_main(self):
121
- """Full startup sequence, then monitor loop."""
267
+ """Full 4-phase startup sequence, then monitor loop."""
122
268
  self._loop = asyncio.get_running_loop()
123
269
 
124
- # Step 1: cleanup leftovers
125
- self.process_manager.cleanup_leftovers()
270
+ # Validate core modules exist (mechanism 12)
271
+ self._validate_core_modules()
126
272
 
127
- # Step 2: start Registry
128
- await self._start_registry()
273
+ # Cleanup leftovers from previous instances
274
+ self.process_manager.cleanup_leftovers()
129
275
 
130
- # Step 3: start Launcher API in a separate thread
131
- self._start_api_thread()
276
+ # Phase 1: Registry bootstrap
277
+ await self._phase1_registry()
278
+ if self._shutdown_event.is_set(): return
132
279
 
133
- # Step 4: register Launcher to Registry
134
- await self._register_self()
135
-
136
- # Step 5: scan modules
280
+ # Scan modules (can happen before Phase 2)
137
281
  self.modules = self.module_scanner.scan()
138
282
  for name, info in self.modules.items():
139
283
  self._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
140
- print(f"[launcher] Found {len(self.modules)} module(s): {', '.join(self.modules.keys()) or '(none)'}")
284
+ print(f"[launcher] 发现 {len(self.modules)} 个模块: {', '.join(self.modules.keys()) or '()'}")
285
+
286
+ # Generate per-module tokens (including event_hub and registry)
287
+ await self._register_module_tokens()
288
+ if self._shutdown_event.is_set(): return
289
+
290
+ # Phase 2: Event Hub bootstrap
291
+ await self._phase2_event_hub()
292
+ if self._shutdown_event.is_set(): return
141
293
 
142
- # Step 5.5: initialize desired_state from config_state
294
+ # Phase 3: Wait for Registry delayed ready
295
+ await self._phase3_registry_ready()
296
+ if self._shutdown_event.is_set(): return
297
+
298
+ # Phase 4: Start remaining enabled modules
299
+ # Initialize desired_state from config_state
143
300
  for name, info in self.modules.items():
144
301
  if info.state == "enabled":
145
302
  self._desired_states[name] = "running"
146
303
  else: # manual, disabled
147
304
  self._desired_states[name] = "stopped"
305
+ # Core modules are already running
306
+ for cn in CORE_MODULE_NAMES:
307
+ self._desired_states[cn] = "running"
148
308
 
149
- # Step 6: generate per-module tokens and register to Registry
150
- await self._register_module_tokens()
151
-
152
- # Step 7: start enabled modules
153
- await self._start_enabled_modules()
309
+ await self._phase4_start_modules()
310
+ if self._shutdown_event.is_set(): return
154
311
 
155
- # Step 8: persist records
312
+ # Post-startup
156
313
  self.process_manager.persist_records()
157
-
158
- # Step 9: connect to Event Hub (best-effort, non-blocking)
159
- await self._connect_event_hub()
160
-
161
- # Step 10: start heartbeat to Registry
162
314
  self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
163
315
 
164
- # Step 11: monitor loop
165
- print("[launcher] Entering monitor loop (press Ctrl+C or 'q' to exit)")
316
+ print("[launcher] 进入监控循环 (按 Ctrl+C 或 'q' 退出)")
166
317
  await self._monitor_loop()
167
318
 
168
- # Graceful shutdown all modules before event loop closes
169
319
  await self._graceful_shutdown_all()
170
320
 
171
- # ── Event Hub connection ──
321
+ # ── Phase 1: Registry ──
322
+
323
+ async def _phase1_registry(self):
324
+ """Start Registry → capture port from stdout → set env → start API → register self."""
325
+ registry_dir = os.path.join(os.environ["KITE_PROJECT"], "core", "registry")
326
+ registry_info = ModuleInfo(
327
+ name="registry",
328
+ display_name="Registry",
329
+ type="infrastructure",
330
+ state="enabled",
331
+ runtime="python",
332
+ entry="entry.py",
333
+ module_dir=registry_dir,
334
+ )
335
+
336
+ boot_info = {"token": self.kite_token}
337
+ self._log_lifecycle("starting", "registry")
338
+ ok = self.process_manager.start_module(registry_info, boot_info=boot_info)
339
+ if not ok:
340
+ self._log_lifecycle("start_failed", "registry")
341
+ raise RuntimeError("启动 Registry 失败")
342
+
343
+ # Wait for Registry to output port via stdout (mechanism 2)
344
+ print("[launcher] 等待 Registry 端口...")
345
+ msg = await self._wait_kite_message("registry", "port", timeout=6)
346
+ if not msg or not msg.get("port"):
347
+ raise RuntimeError("致命错误: Registry 在 6s 内未报告端口")
348
+ self.registry_port = int(msg["port"])
349
+ print(f"[launcher] Registry 端口: {self.registry_port}")
350
+
351
+ # Set KITE_REGISTRY_PORT for all subsequent child processes
352
+ os.environ["KITE_REGISTRY_PORT"] = str(self.registry_port)
353
+
354
+ # Start Launcher API in a separate thread
355
+ self._start_api_thread()
172
356
 
173
- async def _connect_event_hub(self):
174
- """Discover Event Hub WS endpoint (with retry) and start background client."""
175
- url = f"http://127.0.0.1:{self.registry_port}"
357
+ # Register Launcher itself to Registry
358
+ await self._register_self()
359
+
360
+ async def _register_self(self):
361
+ """Register Launcher itself to Registry."""
362
+ url = f"http://127.0.0.1:{self.registry_port}/modules"
176
363
  headers = {"Authorization": f"Bearer {self.kite_token}"}
364
+ payload = {
365
+ "action": "register",
366
+ "module_id": "launcher",
367
+ "module_type": "infrastructure",
368
+ "name": "Launcher",
369
+ "api_endpoint": f"http://127.0.0.1:{self.api_port}",
370
+ "health_endpoint": "/launcher/modules",
371
+ "events_publish": {
372
+ "module.started": {},
373
+ "module.stopped": {},
374
+ "module.state_changed": {},
375
+ },
376
+ "events_subscribe": [">"],
377
+ }
378
+ try:
379
+ async with httpx.AsyncClient() as client:
380
+ resp = await client.post(url, json=payload, headers=headers, timeout=5)
381
+ if resp.status_code == 200:
382
+ print("[launcher] 已注册到 Registry")
383
+ else:
384
+ print(f"[launcher] 警告: Registry 注册返回 {resp.status_code}")
385
+ except Exception as e:
386
+ print(f"[launcher] 警告: 注册到 Registry 失败: {e}")
387
+
388
+ # ── Phase 2: Event Hub ──
389
+
390
+ async def _phase2_event_hub(self):
391
+ """Start Event Hub → stdin launcher_ws_token → stdout ws_endpoint → WS connect → module.ready."""
392
+ # Find event_hub in scanned modules or build manually
393
+ eh_info = self.modules.get("event_hub")
394
+ if not eh_info:
395
+ eh_dir = os.path.join(os.environ["KITE_PROJECT"], "core", "event_hub")
396
+ eh_info = ModuleInfo(
397
+ name="event_hub",
398
+ display_name="Event Hub",
399
+ type="infrastructure",
400
+ state="enabled",
401
+ runtime="python",
402
+ entry="entry.py",
403
+ module_dir=eh_dir,
404
+ )
405
+
406
+ token = self._module_tokens.get("event_hub", "")
407
+ if not token:
408
+ token = secrets.token_hex(32)
409
+ self._module_tokens["event_hub"] = token
410
+ await self._register_tokens_to_registry({"event_hub": token})
411
+
412
+ boot_info = {"token": token}
413
+ self._log_lifecycle("starting", "event_hub")
414
+ ok = self.process_manager.start_module(eh_info, boot_info=boot_info)
415
+ if not ok:
416
+ self._log_lifecycle("start_failed", "event_hub")
417
+ raise RuntimeError("启动 Event Hub 失败")
418
+
419
+ # Send launcher_ws_token via stdin (mechanism 6)
420
+ self._launcher_ws_token = secrets.token_hex(32)
421
+ self.process_manager.write_stdin("event_hub", {
422
+ "kite": "launcher_ws_token",
423
+ "launcher_ws_token": self._launcher_ws_token,
424
+ })
177
425
 
178
- # Event Hub needs time to start and register itself to Registry
179
- print("[launcher] Waiting for Event Hub to register...")
180
- deadline = time.time() + 15
181
- while time.time() < deadline:
182
- try:
183
- async with httpx.AsyncClient() as client:
184
- resp = await client.get(
185
- f"{url}/get/event_hub.metadata.ws_endpoint",
186
- headers=headers, timeout=3,
187
- )
188
- if resp.status_code == 200:
189
- self._event_hub_ws_url = resp.json()
190
- if self._event_hub_ws_url:
191
- break
192
- except Exception:
193
- pass
194
- await asyncio.sleep(1)
426
+ # Wait for ws_endpoint from stdout (mechanism 5)
427
+ print("[launcher] 等待 Event Hub ws_endpoint...")
428
+ msg = await self._wait_kite_message("event_hub", "ws_endpoint", timeout=6)
429
+ if not msg or not msg.get("ws_endpoint"):
430
+ raise RuntimeError("致命错误: Event Hub 在 6s 内未报告 ws_endpoint")
431
+ self._event_hub_ws_url = msg["ws_endpoint"]
432
+ print(f"[launcher] Event Hub 已发现: {self._event_hub_ws_url}")
433
+
434
+ # Connect to Event Hub WebSocket with launcher_ws_token
435
+ self._ws_task = asyncio.create_task(self._ws_loop())
436
+
437
+ # Wait for Event Hub module.ready (sent when Launcher connects)
438
+ ready = await self._wait_event("module.ready", "event_hub", timeout=15)
439
+ if ready:
440
+ print("[launcher] Event Hub 已就绪")
441
+ else:
442
+ print("[launcher] 警告: Event Hub 在 15s 内未发送 module.ready")
443
+
444
+ self._log_lifecycle("started", "event_hub")
445
+ await self._publish_event("module.started", {"module_id": "event_hub"})
446
+ self.process_manager.close_stdio("event_hub")
447
+
448
+ # ── Phase 3: Registry delayed ready ──
195
449
 
196
- if not self._event_hub_ws_url:
197
- print("[launcher] WARNING: Could not discover Event Hub WS, events disabled")
450
+ async def _phase3_registry_ready(self):
451
+ """Wait for Registry module.ready (triggered after Event Hub registers to Registry
452
+ and Registry connects to Event Hub WS)."""
453
+ print("[launcher] 等待 Registry 延迟就绪...")
454
+ ready = await self._wait_event("module.ready", "registry", timeout=12)
455
+ if ready:
456
+ print("[launcher] Registry 已就绪")
457
+ else:
458
+ print("[launcher] 警告: Registry 在 12s 内未发送 module.ready (降级运行)")
459
+
460
+ self._log_lifecycle("started", "registry")
461
+ await self._publish_event("module.started", {"module_id": "registry"})
462
+ self.process_manager.close_stdio("registry")
463
+
464
+ # ── Phase 4: Start remaining modules ──
465
+
466
+ async def _phase4_start_modules(self):
467
+ """Start enabled modules (excluding core) in dependency order."""
468
+ to_start = [m for m in self.modules.values()
469
+ if self._desired_states.get(m.name) == "running"
470
+ and m.name not in CORE_MODULE_NAMES]
471
+ if not to_start:
472
+ print("[launcher] 没有额外模块需要启动")
198
473
  return
199
474
 
200
- print(f"[launcher] Event Hub discovered: {self._event_hub_ws_url}")
201
- self._ws_task = asyncio.create_task(self._ws_loop())
475
+ # Auto-start manual modules if depended upon
476
+ needed = set(m.name for m in to_start)
477
+ for m in list(to_start):
478
+ for dep in m.depends_on:
479
+ if dep not in needed and dep not in CORE_MODULE_NAMES:
480
+ dep_info = self.modules.get(dep)
481
+ if dep_info and dep_info.state != "disabled":
482
+ needed.add(dep)
483
+ to_start.append(dep_info)
484
+ self._desired_states[dep] = "running"
485
+ print(f"[launcher] 自动启动 '{dep}' (被依赖)")
486
+ elif dep_info and dep_info.state == "disabled":
487
+ print(f"[launcher] 错误: '{m.name}' 依赖已禁用的模块 '{dep}'")
488
+
489
+ try:
490
+ sorted_modules = self._topo_sort(to_start)
491
+ except RuntimeError as e:
492
+ print(f"[launcher] 错误: {e}")
493
+ return
494
+
495
+ print(f"[launcher] 正在启动 {len(sorted_modules)} 个模块...")
496
+ for info in sorted_modules:
497
+ await self._start_one_module(info)
498
+
499
+ # ── Event Hub WebSocket connection ──
202
500
 
203
501
  async def _ws_loop(self):
204
502
  """Connect to Event Hub, reconnect on failure."""
@@ -208,16 +506,16 @@ class Launcher:
208
506
  except asyncio.CancelledError:
209
507
  return
210
508
  except Exception as e:
211
- print(f"[launcher] Event Hub connection error: {e}")
509
+ print(f"[launcher] Event Hub 连接错误: {e}")
212
510
  self._ws = None
213
511
  await asyncio.sleep(5)
214
512
 
215
513
  async def _ws_connect(self):
216
- """Single WebSocket session: connect, subscribe to all events, display them."""
217
- ws_url = f"{self._event_hub_ws_url}?token={self.kite_token}"
218
- async with websockets.connect(ws_url) as ws:
514
+ """Single WebSocket session with launcher_ws_token auth."""
515
+ ws_url = f"{self._event_hub_ws_url}?token={self._launcher_ws_token}"
516
+ async with websockets.connect(ws_url, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
219
517
  self._ws = ws
220
- print("[launcher] Connected to Event Hub")
518
+ print("[launcher] 已连接到 Event Hub")
221
519
 
222
520
  # Subscribe to all events
223
521
  await ws.send(json.dumps({
@@ -258,7 +556,7 @@ class Launcher:
258
556
  else:
259
557
  print(f"[{source}] {event}: {json.dumps(data, ensure_ascii=False)}")
260
558
  elif msg_type == "error":
261
- print(f"[launcher] Event Hub error: {msg.get('message')}")
559
+ print(f"[launcher] Event Hub 错误: {msg.get('message')}")
262
560
 
263
561
  async def _publish_event(self, event_type: str, data: dict):
264
562
  """Publish an event to Event Hub via WebSocket."""
@@ -276,7 +574,7 @@ class Launcher:
276
574
  try:
277
575
  await self._ws.send(json.dumps(msg))
278
576
  except Exception as e:
279
- print(f"[launcher] Failed to publish event: {e}")
577
+ print(f"[launcher] 发布事件失败: {e}")
280
578
 
281
579
  def _publish_event_threadsafe(self, event_type: str, data: dict):
282
580
  """Publish event from non-async context (API thread). Fire-and-forget."""
@@ -303,27 +601,21 @@ class Launcher:
303
601
  async def _graceful_stop(self, name: str, reason: str = "stop_requested", timeout: float = 10):
304
602
  """Graceful shutdown: send event → wait ack → wait ready → kill."""
305
603
  self._log_lifecycle("stopping", name, reason=reason)
306
- # Step 1: send module.shutdown event
307
604
  await self._publish_event("module.shutdown", {
308
605
  "module_id": name, "reason": reason, "timeout": timeout,
309
606
  })
310
607
 
311
- # Step 2: wait for ack (3s)
312
608
  ack = await self._wait_event("module.shutdown.ack", name, timeout=3)
313
609
  if not ack:
314
- # No ack — fallback to direct terminate
315
610
  self.process_manager.stop_module(name, timeout=5)
316
611
  await self._publish_event("module.stopped", {"module_id": name})
317
612
  return
318
613
 
319
- # Step 3: wait for ready
320
614
  estimated = min(ack.get("estimated_cleanup", timeout), timeout)
321
615
  ready = await self._wait_event("module.shutdown.ready", name, timeout=estimated)
322
616
  if ready:
323
- # Module is ready to die — kill immediately
324
617
  self.process_manager.stop_module(name, timeout=1)
325
618
  else:
326
- # Timeout — force stop
327
619
  self.process_manager.stop_module(name, timeout=3)
328
620
 
329
621
  self._log_lifecycle("stopped", name, reason=reason)
@@ -332,16 +624,18 @@ class Launcher:
332
624
  async def _graceful_shutdown_all(self):
333
625
  """Broadcast module.shutdown to all running modules, then force-kill survivors."""
334
626
  running = [n for n in self.modules if self.process_manager.is_running(n)]
627
+ # Also check core modules
628
+ for cn in CORE_MODULE_NAMES:
629
+ if self.process_manager.is_running(cn) and cn not in running:
630
+ running.append(cn)
335
631
  if not running:
336
632
  return
337
- print(f"[launcher] Graceful shutdown: {', '.join(running)}")
338
- # Broadcast shutdown event
633
+ print(f"[launcher] 优雅关闭: {', '.join(running)}")
339
634
  for name in running:
340
635
  self._log_lifecycle("stopping", name, reason="system_shutdown")
341
636
  await self._publish_event("module.shutdown", {
342
637
  "module_id": name, "reason": "system_shutdown", "timeout": 10,
343
638
  })
344
- # Wait up to 10s total, then force-kill
345
639
  deadline = time.time() + 10
346
640
  while time.time() < deadline:
347
641
  still_running = [n for n in running if self.process_manager.is_running(n)]
@@ -369,100 +663,6 @@ class Launcher:
369
663
  except Exception:
370
664
  pass
371
665
 
372
- # ── Registry startup ──
373
-
374
- async def _start_registry(self):
375
- """Start Registry as a subprocess, wait for it to write port.txt and /health to respond."""
376
- registry_dir = os.path.join(self.project_root, "core", "registry")
377
- if not os.path.isdir(registry_dir):
378
- raise RuntimeError(f"Registry module not found at {registry_dir}")
379
-
380
- # Use centralized data directory
381
- from core.data_dir import get_registry_data_dir
382
- registry_data_dir = get_registry_data_dir()
383
-
384
- # Clean our instance's port file before starting
385
- port_file = os.path.join(registry_data_dir, f"port_{self.instance_id}.txt")
386
- if os.path.isfile(port_file):
387
- os.remove(port_file)
388
-
389
- registry_info = ModuleInfo(
390
- name="registry",
391
- display_name="Registry",
392
- type="infrastructure",
393
- state="enabled",
394
- runtime="python",
395
- entry="entry.py",
396
- module_dir=registry_dir,
397
- )
398
-
399
- # Pass launcher_token + bind config via stdin
400
- boot_info = {"token": self.kite_token, "registry_port": 0, "bind": "127.0.0.1", "instance_id": self.instance_id}
401
- ok = self.process_manager.start_module(registry_info, boot_info=boot_info)
402
- if not ok:
403
- raise RuntimeError("Failed to start Registry")
404
-
405
- # Wait for Registry to write port.txt
406
- print("[launcher] Waiting for Registry to report its port...")
407
- deadline = time.time() + 10
408
- while time.time() < deadline:
409
- if os.path.isfile(port_file):
410
- try:
411
- with open(port_file, "r") as f:
412
- self.registry_port = int(f.read().strip())
413
- break
414
- except (ValueError, OSError):
415
- pass
416
- await asyncio.sleep(0.2)
417
- else:
418
- raise RuntimeError("Registry failed to write port.txt within 10s")
419
-
420
- # Poll /health until ready
421
- url = f"http://127.0.0.1:{self.registry_port}/health"
422
- print(f"[launcher] Registry on port {self.registry_port}, waiting for health check...")
423
-
424
- deadline = time.time() + 10
425
- async with httpx.AsyncClient() as client:
426
- while time.time() < deadline:
427
- try:
428
- resp = await client.get(url, timeout=1)
429
- if resp.status_code == 200:
430
- print("[launcher] Registry is ready")
431
- return
432
- except Exception:
433
- pass
434
- await asyncio.sleep(0.2)
435
-
436
- raise RuntimeError("Registry failed to become ready within 10s")
437
-
438
- async def _register_self(self):
439
- """Register Launcher itself to Registry using new API."""
440
- url = f"http://127.0.0.1:{self.registry_port}/modules"
441
- headers = {"Authorization": f"Bearer {self.kite_token}"}
442
- payload = {
443
- "action": "register",
444
- "module_id": "launcher",
445
- "module_type": "infrastructure",
446
- "name": "Launcher",
447
- "api_endpoint": f"http://127.0.0.1:{self.api_port}",
448
- "health_endpoint": "/launcher/modules",
449
- "events_publish": {
450
- "module.started": {},
451
- "module.stopped": {},
452
- "module.state_changed": {},
453
- },
454
- "events_subscribe": [">"],
455
- }
456
- try:
457
- async with httpx.AsyncClient() as client:
458
- resp = await client.post(url, json=payload, headers=headers, timeout=5)
459
- if resp.status_code == 200:
460
- print("[launcher] Registered self to Registry")
461
- else:
462
- print(f"[launcher] WARNING: Registry registration returned {resp.status_code}")
463
- except Exception as e:
464
- print(f"[launcher] WARNING: failed to register to Registry: {e}")
465
-
466
666
  # ── Module startup ──
467
667
 
468
668
  def _topo_sort(self, modules: list[ModuleInfo]) -> list[ModuleInfo]:
@@ -492,17 +692,12 @@ class Launcher:
492
692
  return order
493
693
 
494
694
  async def _start_one_module(self, info: ModuleInfo):
495
- """Start a single module: publish starting event, start process, wait for ready."""
695
+ """Start a single module: publish starting → start process → wait ready → started → close stdio."""
496
696
  self._log_lifecycle("starting", info.name)
497
697
  await self._publish_event("module.starting", {"module_id": info.name})
498
698
 
499
699
  token = self._module_tokens.get(info.name, "")
500
- boot_info = {
501
- "token": token,
502
- "registry_port": self.registry_port,
503
- "preferred_port": info.preferred_port,
504
- "advertise_ip": "127.0.0.1",
505
- }
700
+ boot_info = {"token": token}
506
701
  ok = self.process_manager.start_module(info, boot_info=boot_info)
507
702
  if not ok:
508
703
  self._log_lifecycle("start_failed", info.name)
@@ -512,65 +707,68 @@ class Launcher:
512
707
  timeout = info.launch.timeout
513
708
  ready = await self._wait_event("module.ready", info.name, timeout=timeout)
514
709
  if ready:
515
- print(f"[launcher] Module '{info.name}' is ready")
710
+ print(f"[launcher] 模块 '{info.name}' 已就绪")
516
711
  else:
517
- print(f"[launcher] WARNING: '{info.name}' did not send module.ready within {timeout}s")
712
+ print(f"[launcher] 警告: '{info.name}' {timeout}s 内未发送 module.ready")
518
713
 
519
714
  rec = self.process_manager.get_record(info.name)
520
715
  self._log_lifecycle("started", info.name, pid=rec.pid if rec else None)
521
716
  await self._publish_event("module.started", {"module_id": info.name})
522
-
523
- async def _start_enabled_modules(self):
524
- """Start modules in dependency order, auto-starting manual deps if needed."""
525
- to_start = [m for m in self.modules.values()
526
- if self._desired_states.get(m.name) == "running"]
527
- if not to_start:
528
- print("[launcher] No modules to start")
529
- return
530
-
531
- # Auto-start manual modules if depended upon
532
- needed = set(m.name for m in to_start)
533
- for m in to_start:
534
- for dep in m.depends_on:
535
- if dep not in needed:
536
- dep_info = self.modules.get(dep)
537
- if dep_info and dep_info.state != "disabled":
538
- needed.add(dep)
539
- to_start.append(dep_info)
540
- self._desired_states[dep] = "running"
541
- print(f"[launcher] Auto-starting '{dep}' (dependency)")
542
- elif dep_info and dep_info.state == "disabled":
543
- print(f"[launcher] ERROR: '{m.name}' depends on disabled module '{dep}'")
544
-
545
- try:
546
- sorted_modules = self._topo_sort(to_start)
547
- except RuntimeError as e:
548
- print(f"[launcher] ERROR: {e}")
549
- return
550
-
551
- print(f"[launcher] Starting {len(sorted_modules)} module(s)...")
552
- for info in sorted_modules:
553
- await self._start_one_module(info)
717
+ self.process_manager.close_stdio(info.name)
554
718
 
555
719
  async def _register_module_tokens(self):
556
720
  """Generate per-module tokens and register the mapping to Registry."""
721
+ # Include all scanned modules + core modules
557
722
  for name in self.modules:
558
- self._module_tokens[name] = secrets.token_hex(32)
723
+ if name not in self._module_tokens:
724
+ self._module_tokens[name] = secrets.token_hex(32)
725
+ # Ensure registry has a token
726
+ if "registry" not in self._module_tokens:
727
+ self._module_tokens["registry"] = secrets.token_hex(32)
559
728
 
560
729
  if not self._module_tokens:
561
730
  return
562
731
 
732
+ await self._register_tokens_to_registry(self._module_tokens)
733
+
734
+ async def _register_tokens_to_registry(self, tokens: dict):
735
+ """Register token mapping to Registry via POST /tokens."""
563
736
  url = f"http://127.0.0.1:{self.registry_port}/tokens"
564
737
  headers = {"Authorization": f"Bearer {self.kite_token}"}
565
738
  try:
566
739
  async with httpx.AsyncClient() as client:
567
- resp = await client.post(url, json=self._module_tokens, headers=headers, timeout=5)
740
+ resp = await client.post(url, json=tokens, headers=headers, timeout=5)
568
741
  if resp.status_code == 200:
569
- print(f"[launcher] Registered {len(self._module_tokens)} module token(s)")
742
+ print(f"[launcher] 已注册 {len(tokens)} 个模块令牌")
570
743
  else:
571
- print(f"[launcher] WARNING: token registration returned {resp.status_code}")
744
+ print(f"[launcher] 警告: 令牌注册返回 {resp.status_code}")
572
745
  except Exception as e:
573
- print(f"[launcher] WARNING: failed to register module tokens: {e}")
746
+ print(f"[launcher] 警告: 注册模块令牌失败: {e}")
747
+
748
+ # ── Validation ──
749
+
750
+ def _validate_core_modules(self):
751
+ """Validate core modules exist (mechanism 12)."""
752
+ project_root = os.environ["KITE_PROJECT"]
753
+ for name in ("registry", "event_hub"):
754
+ mod_dir = os.path.join(project_root, "core", name)
755
+ md_path = os.path.join(mod_dir, "module.md")
756
+ if not os.path.isdir(mod_dir):
757
+ print(f"[launcher] 致命: 核心模块 '{name}' 目录未找到: {mod_dir}")
758
+ sys.exit(1)
759
+ if not os.path.isfile(md_path):
760
+ print(f"[launcher] 致命: 核心模块 '{name}' 缺少 module.md: {md_path}")
761
+ sys.exit(1)
762
+ # Try to parse frontmatter
763
+ try:
764
+ with open(md_path, "r", encoding="utf-8") as f:
765
+ fm = _parse_frontmatter(f.read())
766
+ if not fm:
767
+ print(f"[launcher] 致命: 核心模块 '{name}' module.md 没有有效的 frontmatter")
768
+ sys.exit(1)
769
+ except Exception as e:
770
+ print(f"[launcher] 致命: 核心模块 '{name}' module.md 解析错误: {e}")
771
+ sys.exit(1)
574
772
 
575
773
  # ── API thread ──
576
774
 
@@ -591,29 +789,30 @@ class Launcher:
591
789
  t = threading.Thread(target=_run, daemon=True)
592
790
  t.start()
593
791
 
594
- # Wait for API server to actually be ready before proceeding
595
792
  deadline = time.time() + 5
596
793
  while time.time() < deadline:
597
794
  if self._api_server.started:
598
795
  break
599
796
  time.sleep(0.05)
600
797
  else:
601
- print("[launcher] WARNING: API server may not be fully ready")
798
+ print("[launcher] 警告: API 服务器可能尚未完全就绪")
602
799
 
603
- print(f"[launcher] API server started on port {self.api_port}")
800
+ print(f"[launcher] API 服务器已启动,端口 {self.api_port}")
604
801
 
605
802
  # ── Monitor loop ──
606
803
 
607
804
  async def _monitor_loop(self):
608
- """Check child processes every second. Handle crashes."""
805
+ """Check child processes every second. Handle crashes.
806
+ Uses _shutdown_event (asyncio.Event) so Ctrl+C wakes us immediately.
807
+ """
609
808
  MAX_FAIL = 3
610
809
  MAX_FAILED_MODULES = 3
611
810
 
612
- while not self._thread_shutdown.is_set():
811
+ while not self._shutdown_event.is_set():
613
812
  exited = self.process_manager.check_exited()
614
813
 
615
814
  for name, rc in exited:
616
- print(f"[launcher] Module '{name}' exited with code {rc}")
815
+ print(f"[launcher] 模块 '{name}' 退出,返回码 {rc}")
617
816
  self._log_lifecycle("exited", name, exit_code=rc)
618
817
  await self._publish_event("module.stopped", {
619
818
  "module_id": name, "exit_code": rc,
@@ -621,8 +820,8 @@ class Launcher:
621
820
  info = self.modules.get(name)
622
821
 
623
822
  # Core module crash → full restart
624
- if info and info.is_core(self.project_root):
625
- print(f"[launcher] CRITICAL: core module '{name}' crashed, restarting all...")
823
+ if name in CORE_MODULE_NAMES or (info and info.is_core()):
824
+ print(f"[launcher] 严重: 核心模块 '{name}' 崩溃,正在全部重启...")
626
825
  self._log_lifecycle("core_crash", name, exit_code=rc)
627
826
  await self._full_restart()
628
827
  return
@@ -632,29 +831,33 @@ class Launcher:
632
831
  count = self._fail_counts[name]
633
832
 
634
833
  if count < MAX_FAIL and self._desired_states.get(name) == "running" and info:
635
- print(f"[launcher] Restarting '{name}' (attempt {count}/{MAX_FAIL})...")
834
+ print(f"[launcher] 正在重启 '{name}' ( {count}/{MAX_FAIL})...")
636
835
  await self._start_one_module(info)
637
836
  elif count >= MAX_FAIL:
638
837
  self._desired_states[name] = "stopped"
639
838
  self._log_lifecycle("failed", name, reason=f"exceeded {MAX_FAIL} retries")
640
- print(f"[launcher] Module '{name}' failed {MAX_FAIL} times, giving up")
839
+ print(f"[launcher] 模块 '{name}' 失败 {MAX_FAIL} 次,已放弃")
641
840
 
642
- # Too many failed modules → exit
643
841
  failed_count = sum(1 for c in self._fail_counts.values() if c >= MAX_FAIL)
644
842
  if failed_count >= MAX_FAILED_MODULES:
645
- print(f"[launcher] {failed_count} modules permanently failed, Launcher exiting")
843
+ print(f"[launcher] {failed_count} 个模块永久失败,启动器退出")
646
844
  return
647
845
 
648
846
  if exited:
649
847
  self.process_manager.persist_records()
650
848
 
651
- await asyncio.sleep(1)
849
+ # Wait 1s but wake immediately on shutdown signal
850
+ try:
851
+ await asyncio.wait_for(self._shutdown_event.wait(), timeout=1)
852
+ return # shutdown requested
853
+ except asyncio.TimeoutError:
854
+ pass
652
855
 
653
856
  async def _full_restart(self):
654
- """Stop all modules, then re-run the startup sequence."""
655
- print("[launcher] Full restart: stopping all modules...")
857
+ """Stop all modules, regenerate tokens, re-run Phase 1-4 (mechanism 10)."""
858
+ print("[launcher] 全量重启: 正在停止所有模块...")
656
859
 
657
- # Disconnect Event Hub
860
+ # Disconnect Event Hub WS
658
861
  if self._ws_task:
659
862
  self._ws_task.cancel()
660
863
  self._ws_task = None
@@ -662,33 +865,39 @@ class Launcher:
662
865
  self._heartbeat_task.cancel()
663
866
  self._heartbeat_task = None
664
867
  self._ws = None
868
+ self._event_hub_ws_url = ""
869
+ self._launcher_ws_token = ""
665
870
 
666
871
  await self._graceful_shutdown_all()
667
872
  self._fail_counts.clear()
668
-
669
873
  self._module_tokens.clear()
670
874
 
671
- print("[launcher] Full restart: re-running startup sequence...")
875
+ # Regenerate kite_token
876
+ self.kite_token = secrets.token_hex(32)
877
+ self.process_manager.kite_token = self.kite_token
878
+
879
+ print("[launcher] 全量重启: 重新执行 Phase 1-4...")
672
880
  try:
673
- await self._start_registry()
674
- await self._register_self()
881
+ await self._phase1_registry()
675
882
  self.modules = self.module_scanner.scan()
676
883
  for n, info in self.modules.items():
677
884
  self._log_lifecycle("scanned", n, state=info.state, module_dir=info.module_dir)
678
885
  await self._register_module_tokens()
679
- await self._start_enabled_modules()
886
+ await self._phase2_event_hub()
887
+ await self._phase3_registry_ready()
888
+ await self._phase4_start_modules()
680
889
  self.process_manager.persist_records()
681
- await self._connect_event_hub()
682
- print("[launcher] Full restart complete, resuming monitor loop")
890
+ self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
891
+ print("[launcher] 全量重启完成,恢复监控循环")
683
892
  await self._monitor_loop()
684
893
  except Exception as e:
685
- print(f"[launcher] Full restart failed: {e}")
894
+ print(f"[launcher] 全量重启失败: {e}")
686
895
 
687
896
  # ── Shutdown ──
688
897
 
689
898
  def _final_cleanup(self):
690
899
  """Called on exit — stop all processes, stop API, clear records."""
691
- print("[launcher] Shutting down...")
900
+ print("[launcher] 正在关闭...")
692
901
 
693
902
  if self._ws_task:
694
903
  self._ws_task.cancel()
@@ -706,13 +915,7 @@ class Launcher:
706
915
  os.remove(self.process_manager.records_path)
707
916
  except OSError:
708
917
  pass
709
- from core.data_dir import get_registry_data_dir
710
- port_file = os.path.join(get_registry_data_dir(), f"port_{self.instance_id}.txt")
711
- try:
712
- os.remove(port_file)
713
- except OSError:
714
- pass
715
- print("[launcher] Goodbye.")
918
+ print("[launcher] 再见。")
716
919
 
717
920
  if IS_WINDOWS:
718
921
  os._exit(0)
@@ -721,20 +924,20 @@ class Launcher:
721
924
 
722
925
  def _load_discovery(self) -> dict | None:
723
926
  """Read discovery config from launcher's own module.md."""
724
- md_path = os.path.join(self.project_root, "core", "launcher", "module.md")
927
+ md_path = os.path.join(os.environ["KITE_PROJECT"], "core", "launcher", "module.md")
725
928
  try:
726
929
  with open(md_path, "r", encoding="utf-8") as f:
727
930
  fm = _parse_frontmatter(f.read())
728
931
  discovery = fm.get("discovery")
729
932
  if isinstance(discovery, dict) and discovery:
730
- print(f"[launcher] Discovery sources: {', '.join(discovery.keys())}")
933
+ print(f"[launcher] 发现来源: {', '.join(discovery.keys())}")
731
934
  return discovery
732
935
  except Exception as e:
733
- print(f"[launcher] WARNING: failed to read discovery config: {e}")
936
+ print(f"[launcher] 警告: 读取发现配置失败: {e}")
734
937
  return None
735
938
 
736
939
  def _log_lifecycle(self, event: str, module: str, **extra):
737
- """Append one JSONL line to core/launcher/data/lifecycle.jsonl."""
940
+ """Append one JSONL line to lifecycle.jsonl."""
738
941
  from datetime import datetime, timezone
739
942
  record = {"ts": datetime.now(timezone.utc).isoformat(), "event": event, "module": module}
740
943
  record.update(extra)
@@ -758,11 +961,11 @@ class Launcher:
758
961
  def _create_api_app(self) -> FastAPI:
759
962
  """Create the FastAPI app with Launcher management routes."""
760
963
  app = FastAPI(title="Kite Launcher", docs_url=None, redoc_url=None)
761
- launcher = self # closure reference
964
+ launcher = self
762
965
 
763
966
  @app.get("/launcher/modules")
764
967
  async def list_modules():
765
- """List all modules and their current status (three-layer state model)."""
968
+ """List all modules and their current status."""
766
969
  result = []
767
970
  for name, info in launcher.modules.items():
768
971
  running = launcher.process_manager.is_running(name)
@@ -781,17 +984,15 @@ class Launcher:
781
984
 
782
985
  @app.post("/launcher/modules/{name}/start")
783
986
  async def start_module(name: str):
784
- """Start a module by name. Generates token and passes boot_info via stdin."""
987
+ """Start a module by name."""
785
988
  info = launcher.modules.get(name)
786
989
  if not info:
787
990
  raise HTTPException(404, f"Module '{name}' not found")
788
991
  if info.state == "disabled":
789
992
  raise HTTPException(403, f"Module '{name}' is disabled")
790
993
 
791
- # Generate token if not already present
792
994
  if name not in launcher._module_tokens:
793
995
  launcher._module_tokens[name] = secrets.token_hex(32)
794
- # Register the new token to Registry
795
996
  try:
796
997
  async with httpx.AsyncClient() as client:
797
998
  await client.post(
@@ -801,14 +1002,10 @@ class Launcher:
801
1002
  timeout=5,
802
1003
  )
803
1004
  except Exception as e:
804
- print(f"[launcher] WARNING: failed to register token for {name}: {e}")
1005
+ print(f"[launcher] 警告: 注册 {name} 的令牌失败: {e}")
805
1006
 
806
1007
  token = launcher._module_tokens[name]
807
- boot_info = {
808
- "token": token,
809
- "registry_port": launcher.registry_port,
810
- "preferred_port": info.preferred_port,
811
- }
1008
+ boot_info = {"token": token}
812
1009
  ok = launcher.process_manager.start_module(info, boot_info=boot_info)
813
1010
  if ok:
814
1011
  launcher._desired_states[name] = "running"
@@ -823,7 +1020,7 @@ class Launcher:
823
1020
 
824
1021
  @app.post("/launcher/modules/{name}/stop")
825
1022
  async def stop_module(name: str, body: dict = None):
826
- """Stop a module with graceful shutdown. Accepts optional reason."""
1023
+ """Stop a module with graceful shutdown."""
827
1024
  info = launcher.modules.get(name)
828
1025
  if not info:
829
1026
  raise HTTPException(404, f"Module '{name}' not found")
@@ -843,7 +1040,6 @@ class Launcher:
843
1040
  raise HTTPException(403, f"Module '{name}' is disabled")
844
1041
  reason = (body or {}).get("reason", "restart")
845
1042
  await launcher._graceful_stop(name, reason)
846
- # Re-generate token
847
1043
  launcher._module_tokens[name] = secrets.token_hex(32)
848
1044
  try:
849
1045
  async with httpx.AsyncClient() as client:
@@ -856,11 +1052,7 @@ class Launcher:
856
1052
  except Exception:
857
1053
  pass
858
1054
  token = launcher._module_tokens[name]
859
- boot_info = {
860
- "token": token,
861
- "registry_port": launcher.registry_port,
862
- "preferred_port": info.preferred_port,
863
- }
1055
+ boot_info = {"token": token}
864
1056
  ok = launcher.process_manager.start_module(info, boot_info=boot_info)
865
1057
  if ok:
866
1058
  launcher._desired_states[name] = "running"
@@ -884,11 +1076,9 @@ class Launcher:
884
1076
  for name in added:
885
1077
  info = launcher.modules[name]
886
1078
  launcher._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
887
- # Initialize desired_state for new modules
888
1079
  for name in added:
889
1080
  info = launcher.modules[name]
890
1081
  launcher._desired_states[name] = "running" if info.state == "enabled" else "stopped"
891
- # Register tokens for new modules
892
1082
  if added:
893
1083
  new_tokens = {}
894
1084
  for name in added:
@@ -917,14 +1107,12 @@ class Launcher:
917
1107
  if new_state not in ("enabled", "manual", "disabled"):
918
1108
  raise HTTPException(400, "state must be enabled, manual, or disabled")
919
1109
 
920
- # Core modules cannot be disabled
921
- if info.is_core(launcher.project_root) and new_state == "disabled":
1110
+ if info.is_core() and new_state == "disabled":
922
1111
  raise HTTPException(403, "Core modules cannot be disabled")
923
1112
 
924
1113
  old_state = info.state
925
1114
  info.state = new_state
926
1115
 
927
- # Update desired_state to match new config_state
928
1116
  if new_state == "enabled":
929
1117
  launcher._desired_states[name] = "running"
930
1118
  else:
@@ -956,7 +1144,6 @@ def _update_module_md_state(module_dir: str, new_state: str):
956
1144
  with open(md_path, "r", encoding="utf-8") as f:
957
1145
  content = f.read()
958
1146
 
959
- # Replace state: xxx in frontmatter
960
1147
  updated = re.sub(
961
1148
  r'^(state:\s*)(\S+)',
962
1149
  rf'\g<1>{new_state}',
@@ -968,4 +1155,4 @@ def _update_module_md_state(module_dir: str, new_state: str):
968
1155
  with open(md_path, "w", encoding="utf-8") as f:
969
1156
  f.write(updated)
970
1157
  except Exception as e:
971
- print(f"[launcher] WARNING: failed to update module.md state: {e}")
1158
+ print(f"[launcher] 警告: 更新 module.md 状态失败: {e}")