@agentunion/kite 1.2.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +208 -0
- package/README.md +48 -0
- package/cli.js +1 -1
- package/extensions/agents/assistant/entry.py +30 -81
- package/extensions/agents/assistant/module.md +1 -1
- package/extensions/agents/assistant/server.py +83 -122
- package/extensions/channels/acp_channel/entry.py +30 -81
- package/extensions/channels/acp_channel/module.md +1 -1
- package/extensions/channels/acp_channel/server.py +83 -122
- package/extensions/event_hub_bench/entry.py +81 -121
- package/extensions/services/backup/entry.py +213 -85
- package/extensions/services/model_service/entry.py +213 -85
- package/extensions/services/watchdog/entry.py +513 -460
- package/extensions/services/watchdog/monitor.py +55 -69
- package/extensions/services/web/entry.py +11 -108
- package/extensions/services/web/server.py +120 -77
- package/{core/registry → kernel}/entry.py +65 -37
- package/{core/event_hub/hub.py → kernel/event_hub.py} +61 -81
- package/kernel/module.md +33 -0
- package/{core/registry/store.py → kernel/registry_store.py} +13 -4
- package/kernel/rpc_router.py +388 -0
- package/kernel/server.py +267 -0
- package/launcher/__init__.py +10 -0
- package/launcher/__main__.py +6 -0
- package/launcher/count_lines.py +258 -0
- package/{core/launcher → launcher}/entry.py +693 -767
- package/launcher/logging_setup.py +289 -0
- package/{core/launcher → launcher}/module_scanner.py +11 -6
- package/main.py +11 -350
- package/package.json +6 -9
- package/__init__.py +0 -1
- package/__main__.py +0 -15
- package/core/event_hub/BENCHMARK.md +0 -94
- package/core/event_hub/__init__.py +0 -0
- package/core/event_hub/bench.py +0 -459
- package/core/event_hub/bench_extreme.py +0 -308
- package/core/event_hub/bench_perf.py +0 -350
- package/core/event_hub/entry.py +0 -436
- package/core/event_hub/module.md +0 -20
- package/core/event_hub/server.py +0 -269
- package/core/kite_log.py +0 -241
- package/core/launcher/__init__.py +0 -0
- package/core/registry/__init__.py +0 -0
- package/core/registry/module.md +0 -30
- package/core/registry/server.py +0 -339
- package/extensions/services/backup/server.py +0 -244
- package/extensions/services/model_service/server.py +0 -236
- package/extensions/services/watchdog/server.py +0 -229
- /package/{core → kernel}/__init__.py +0 -0
- /package/{core/event_hub → kernel}/dedup.py +0 -0
- /package/{core/event_hub → kernel}/router.py +0 -0
- /package/{core/launcher → launcher}/module.md +0 -0
- /package/{core/launcher → launcher}/process_manager.py +0 -0
|
@@ -1,19 +1,14 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Launcher — the core of Kite. Manages module lifecycle,
|
|
2
|
+
Launcher — the core of Kite. Manages module lifecycle, monitors processes.
|
|
3
3
|
|
|
4
4
|
Thread model:
|
|
5
5
|
- Main thread: asyncio event loop (process management + monitor loop)
|
|
6
|
-
- API thread: independent thread running uvicorn + FastAPI
|
|
7
6
|
- stdout threads: one daemon thread per child process (ProcessManager)
|
|
8
7
|
- (Windows) keyboard listener thread: polls for 'q' key
|
|
9
8
|
|
|
10
|
-
|
|
11
|
-
Phase 1:
|
|
12
|
-
|
|
13
|
-
→ stdout ws_endpoint → WS connect → module.ready
|
|
14
|
-
Phase 2: (reserved — Event Hub ready handled in Phase 1)
|
|
15
|
-
Phase 3: Registry delayed ready (Event Hub → Registry → Event Hub WS → module.ready)
|
|
16
|
-
Phase 4: start remaining enabled modules in topo order
|
|
9
|
+
2-Phase startup:
|
|
10
|
+
Phase 1: Start Kernel → wait port → connect WS → register self → module.ready
|
|
11
|
+
Phase 2: start remaining enabled modules in topo order (each connects to Kernel WS)
|
|
17
12
|
"""
|
|
18
13
|
|
|
19
14
|
import asyncio
|
|
@@ -26,10 +21,7 @@ import threading
|
|
|
26
21
|
import time
|
|
27
22
|
import uuid
|
|
28
23
|
|
|
29
|
-
import httpx
|
|
30
|
-
import uvicorn
|
|
31
24
|
import websockets
|
|
32
|
-
from fastapi import FastAPI, HTTPException
|
|
33
25
|
|
|
34
26
|
from .module_scanner import ModuleScanner, ModuleInfo, LaunchConfig, _parse_frontmatter
|
|
35
27
|
from .process_manager import ProcessManager
|
|
@@ -42,14 +34,14 @@ SHUTDOWN_TIMEOUT_PARTIAL = 3 # Graceful module ack'd but no ready
|
|
|
42
34
|
SHUTDOWN_TIMEOUT_READY = 1 # Graceful module sent ready (cleanup done)
|
|
43
35
|
SHUTDOWN_TIMEOUT_BULK = 3 # Bulk stop_all() safety net
|
|
44
36
|
|
|
45
|
-
# Core module names that are started in Phase 1
|
|
46
|
-
CORE_MODULE_NAMES = {"
|
|
37
|
+
# Core module names that are started in Phase 1 (not Phase 2)
|
|
38
|
+
CORE_MODULE_NAMES = {"kernel"}
|
|
47
39
|
|
|
48
40
|
WATCHDOG_MODULE_NAME = "watchdog"
|
|
49
41
|
|
|
50
42
|
|
|
51
43
|
class Launcher:
|
|
52
|
-
"""Kite system entry point. Starts
|
|
44
|
+
"""Kite system entry point. Starts Kernel, manages modules."""
|
|
53
45
|
|
|
54
46
|
def __init__(self, kite_token: str):
|
|
55
47
|
self.kite_token = kite_token
|
|
@@ -70,41 +62,41 @@ class Launcher:
|
|
|
70
62
|
discovery=self._load_discovery(),
|
|
71
63
|
)
|
|
72
64
|
|
|
73
|
-
self.
|
|
74
|
-
self.api_port: int = 0
|
|
65
|
+
self.kernel_port: int = 0
|
|
75
66
|
self.modules: dict[str, ModuleInfo] = {}
|
|
76
67
|
self._shutdown_event = asyncio.Event()
|
|
77
68
|
self._thread_shutdown = threading.Event()
|
|
78
69
|
self._shutdown_complete = threading.Event() # Set when normal shutdown finishes
|
|
79
|
-
self._api_server: uvicorn.Server | None = None
|
|
80
|
-
self._api_ready = threading.Event()
|
|
81
70
|
self._module_tokens: dict[str, str] = {} # module_name -> per-module token
|
|
82
71
|
|
|
83
72
|
# Three-layer state model: desired_state per module
|
|
84
73
|
self._desired_states: dict[str, str] = {} # module_name -> "running" | "stopped"
|
|
85
74
|
|
|
86
|
-
#
|
|
87
|
-
self._event_hub_ws_url: str = ""
|
|
88
|
-
self._launcher_ws_token: str = ""
|
|
75
|
+
# Kernel WebSocket client
|
|
89
76
|
self._ws: object | None = None
|
|
90
77
|
self._ws_task: asyncio.Task | None = None
|
|
78
|
+
self._ws_connected: asyncio.Event | None = None # Created in _async_main, set when WS ready
|
|
91
79
|
self._loop: asyncio.AbstractEventLoop | None = None
|
|
92
80
|
|
|
81
|
+
# JSON-RPC 2.0 infrastructure
|
|
82
|
+
self._rpc_waiters: dict[str, asyncio.Event] = {} # rpc_id -> Event
|
|
83
|
+
self._rpc_results: dict[str, dict] = {} # rpc_id -> response dict
|
|
84
|
+
|
|
93
85
|
# Event waiters: {event_key: (asyncio.Event, data_dict)}
|
|
94
86
|
self._event_waiters: dict[str, tuple[asyncio.Event, dict]] = {}
|
|
95
87
|
|
|
96
88
|
# Module ready times: module_name -> seconds from start to ready
|
|
97
89
|
self._ready_times: dict[str, float] = {}
|
|
98
90
|
|
|
99
|
-
#
|
|
100
|
-
self.
|
|
91
|
+
# Shutdown timing
|
|
92
|
+
self._shutdown_start_time: float = 0.0
|
|
101
93
|
|
|
102
94
|
# Module exit reasons: module_name -> reason string (for modules that sent module.exiting)
|
|
103
95
|
self._exit_reasons: dict[str, str] = {}
|
|
104
96
|
|
|
105
97
|
# Graceful shutdown capability: module_name -> True if module declared support
|
|
106
|
-
#
|
|
107
|
-
self._graceful_modules: dict[str, bool] = {"
|
|
98
|
+
# Kernel defaults to True (it starts before Watchdog can observe)
|
|
99
|
+
self._graceful_modules: dict[str, bool] = {"kernel": True}
|
|
108
100
|
|
|
109
101
|
# System-wide shutdown flag: prevents Watchdog restart during shutdown
|
|
110
102
|
self._system_shutting_down = False
|
|
@@ -124,7 +116,6 @@ class Launcher:
|
|
|
124
116
|
except Exception:
|
|
125
117
|
pass
|
|
126
118
|
os.environ["KITE_INSTANCE_SUFFIX"] = suffix
|
|
127
|
-
self._app = self._create_api_app()
|
|
128
119
|
|
|
129
120
|
@staticmethod
|
|
130
121
|
def _fmt_elapsed(seconds: float) -> str:
|
|
@@ -238,7 +229,9 @@ class Launcher:
|
|
|
238
229
|
except KeyboardInterrupt:
|
|
239
230
|
pass
|
|
240
231
|
except RuntimeError as e:
|
|
241
|
-
print
|
|
232
|
+
# Don't print "启动失败" if user requested shutdown
|
|
233
|
+
if not self._thread_shutdown.is_set():
|
|
234
|
+
print(f"[launcher] 启动失败: {e}")
|
|
242
235
|
finally:
|
|
243
236
|
self._final_cleanup()
|
|
244
237
|
|
|
@@ -247,6 +240,7 @@ class Launcher:
|
|
|
247
240
|
if self._thread_shutdown.is_set():
|
|
248
241
|
return # already shutting down
|
|
249
242
|
print(f"[launcher] {reason or '收到关闭请求'}")
|
|
243
|
+
self._shutdown_start_time = time.monotonic() # Record shutdown start time
|
|
250
244
|
self._thread_shutdown.set()
|
|
251
245
|
# Wake up asyncio event loop immediately (so _monitor_loop / wait_for exits)
|
|
252
246
|
loop = self._loop
|
|
@@ -265,9 +259,9 @@ class Launcher:
|
|
|
265
259
|
except Exception:
|
|
266
260
|
still = []
|
|
267
261
|
if still:
|
|
268
|
-
print(f"[launcher] 关闭超时,以下模块仍在运行: {', '.join(still)}
|
|
262
|
+
print(f"\033[91m[launcher] 关闭超时,以下模块仍在运行: {', '.join(still)},强制退出\033[0m")
|
|
269
263
|
else:
|
|
270
|
-
print("[launcher]
|
|
264
|
+
print("\033[91m[launcher] 关闭超时,强制退出\033[0m")
|
|
271
265
|
os._exit(1)
|
|
272
266
|
threading.Thread(target=_force, daemon=True).start()
|
|
273
267
|
|
|
@@ -307,31 +301,34 @@ class Launcher:
|
|
|
307
301
|
while not self._thread_shutdown.is_set():
|
|
308
302
|
if msvcrt.kbhit():
|
|
309
303
|
ch = msvcrt.getch()
|
|
310
|
-
if ch
|
|
304
|
+
if ch == b'\x1b': # ESC - force exit immediately
|
|
305
|
+
print("[launcher] ESC 强制退出")
|
|
306
|
+
os._exit(0)
|
|
307
|
+
elif ch in (b'q', b'Q'): # q/Q - graceful shutdown
|
|
311
308
|
self._request_shutdown("收到退出请求,正在关闭...")
|
|
312
309
|
return
|
|
313
310
|
time.sleep(0.1)
|
|
314
311
|
threading.Thread(target=_listen, daemon=True).start()
|
|
315
312
|
|
|
316
|
-
# ── Async main (
|
|
313
|
+
# ── Async main (2-Phase startup) ──
|
|
317
314
|
|
|
318
315
|
async def _async_main(self):
|
|
319
|
-
"""Full
|
|
316
|
+
"""Full 2-phase startup sequence, then monitor loop."""
|
|
320
317
|
self._loop = asyncio.get_running_loop()
|
|
318
|
+
self._ws_connected = asyncio.Event() # Create event in async context
|
|
321
319
|
t_start = time.monotonic()
|
|
322
320
|
self._start_unix = time.time()
|
|
323
321
|
phase_times = {}
|
|
324
322
|
G = "\033[32m"
|
|
325
323
|
R = "\033[0m"
|
|
326
324
|
|
|
327
|
-
# Validate core modules exist
|
|
325
|
+
# Validate core modules exist
|
|
328
326
|
self._validate_core_modules()
|
|
329
327
|
|
|
330
328
|
# Cleanup leftovers from previous instances (current instance dir)
|
|
331
329
|
local_cleaned = self.process_manager.cleanup_leftovers()
|
|
332
330
|
|
|
333
331
|
# Cross-directory leftover cleanup (background, non-blocking)
|
|
334
|
-
# run_in_executor returns a Future (not coroutine), so use ensure_future
|
|
335
332
|
self._global_cleanup_task = asyncio.ensure_future(
|
|
336
333
|
asyncio.get_running_loop().run_in_executor(
|
|
337
334
|
None, self.process_manager.cleanup_global_leftovers
|
|
@@ -339,23 +336,15 @@ class Launcher:
|
|
|
339
336
|
)
|
|
340
337
|
|
|
341
338
|
try:
|
|
342
|
-
# Phase 1
|
|
339
|
+
# Phase 1: Start Kernel + connect WS
|
|
343
340
|
t0 = time.monotonic()
|
|
344
|
-
await self.
|
|
341
|
+
await self._phase1_start_kernel()
|
|
345
342
|
elapsed_p1 = time.monotonic() - t0
|
|
346
|
-
phase_times["Phase 1
|
|
347
|
-
print(f"{G}[launcher] ✓ Phase 1
|
|
343
|
+
phase_times["Phase 1: Kernel"] = elapsed_p1
|
|
344
|
+
print(f"{G}[launcher] ✓ Phase 1 完成: Kernel 已就绪 ({elapsed_p1:.2f}s){R}")
|
|
348
345
|
if self._shutdown_event.is_set(): return
|
|
349
346
|
|
|
350
|
-
#
|
|
351
|
-
t0 = time.monotonic()
|
|
352
|
-
await self._phase3_registry_ready()
|
|
353
|
-
elapsed = time.monotonic() - t0
|
|
354
|
-
phase_times["Phase 3: Registry 事件总线"] = elapsed
|
|
355
|
-
print(f"{G}[launcher] ✓ Phase 3 完成: Registry 已连接事件总线 ({elapsed:.2f}s){R}")
|
|
356
|
-
if self._shutdown_event.is_set(): return
|
|
357
|
-
|
|
358
|
-
# Initialize desired_state from config_state (needed before Phase 3.5)
|
|
347
|
+
# Initialize desired_state from config_state
|
|
359
348
|
for name, info in self.modules.items():
|
|
360
349
|
if info.state == "enabled":
|
|
361
350
|
self._desired_states[name] = "running"
|
|
@@ -365,43 +354,26 @@ class Launcher:
|
|
|
365
354
|
for cn in CORE_MODULE_NAMES:
|
|
366
355
|
self._desired_states[cn] = "running"
|
|
367
356
|
|
|
368
|
-
# Phase
|
|
369
|
-
# If started in parallel (Phase 1), just wait for module.ready
|
|
370
|
-
# Otherwise start it now (fallback)
|
|
357
|
+
# Phase 1.5: Watchdog
|
|
371
358
|
watchdog_info = self.modules.get(WATCHDOG_MODULE_NAME)
|
|
372
359
|
if watchdog_info and self._desired_states.get(WATCHDOG_MODULE_NAME) == "running":
|
|
373
360
|
t0 = time.monotonic()
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
if ready and not ready.get("_exited"):
|
|
379
|
-
self._graceful_modules["watchdog"] = bool(ready.get("graceful_shutdown"))
|
|
380
|
-
self._ready_times["watchdog"] = elapsed
|
|
381
|
-
print(f"[launcher] Watchdog 已就绪")
|
|
382
|
-
self._log_lifecycle("started", "watchdog")
|
|
383
|
-
await self._publish_event("module.started", {"module_id": "watchdog"})
|
|
384
|
-
self.process_manager.close_stdio("watchdog")
|
|
385
|
-
else:
|
|
386
|
-
print(f"[launcher] 警告: Watchdog 在 15s 内未就绪")
|
|
387
|
-
else:
|
|
388
|
-
print(f"[launcher] Phase 3.5: 启动 Watchdog...")
|
|
389
|
-
await self._start_one_module(watchdog_info)
|
|
390
|
-
elapsed = time.monotonic() - t0
|
|
391
|
-
print(f"{G}[launcher] ✓ Phase 3.5 完成: Watchdog ({elapsed:.2f}s){R}")
|
|
361
|
+
print(f"[launcher] Phase 1.5: 启动 Watchdog...")
|
|
362
|
+
await self._start_one_module(watchdog_info)
|
|
363
|
+
elapsed = time.monotonic() - t0
|
|
364
|
+
print(f"{G}[launcher] ✓ Phase 1.5 完成: Watchdog ({elapsed:.2f}s){R}")
|
|
392
365
|
if self._shutdown_event.is_set(): return
|
|
393
366
|
|
|
394
|
-
# Phase
|
|
367
|
+
# Phase 2: Start remaining enabled modules
|
|
395
368
|
t0 = time.monotonic()
|
|
396
|
-
await self.
|
|
369
|
+
await self._phase2_start_modules()
|
|
397
370
|
elapsed = time.monotonic() - t0
|
|
398
|
-
phase_times["Phase
|
|
399
|
-
print(f"{G}[launcher] ✓ Phase
|
|
371
|
+
phase_times["Phase 2: Extensions"] = elapsed
|
|
372
|
+
print(f"{G}[launcher] ✓ Phase 2 完成: 扩展模块已启动 ({elapsed:.2f}s){R}")
|
|
400
373
|
if self._shutdown_event.is_set(): return
|
|
401
374
|
|
|
402
375
|
# Post-startup
|
|
403
376
|
self.process_manager.persist_records()
|
|
404
|
-
self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
|
|
405
377
|
|
|
406
378
|
# Wait for global leftover cleanup to finish (non-blocking with timeout)
|
|
407
379
|
global_cleaned = {}
|
|
@@ -433,7 +405,7 @@ class Launcher:
|
|
|
433
405
|
"startup_time": round(total_time, 2),
|
|
434
406
|
})
|
|
435
407
|
|
|
436
|
-
print("[launcher] 进入监控循环 (按 Ctrl+C 或 'q'
|
|
408
|
+
print("[launcher] 进入监控循环 (按 Ctrl+C 或 'q' 优雅退出,ESC 强制退出)")
|
|
437
409
|
await self._monitor_loop()
|
|
438
410
|
finally:
|
|
439
411
|
try:
|
|
@@ -441,263 +413,110 @@ class Launcher:
|
|
|
441
413
|
except Exception as e:
|
|
442
414
|
print(f"[launcher] 优雅关闭出错: {e}")
|
|
443
415
|
|
|
444
|
-
# ── Phase 1
|
|
416
|
+
# ── Phase 1: Start Kernel ──
|
|
445
417
|
|
|
446
|
-
async def
|
|
447
|
-
"""Start
|
|
418
|
+
async def _phase1_start_kernel(self):
|
|
419
|
+
"""Start Kernel process, connect WS, register self, wait for module.ready.
|
|
448
420
|
|
|
449
421
|
Flow:
|
|
450
|
-
1. Start
|
|
451
|
-
2. Wait
|
|
452
|
-
3.
|
|
453
|
-
4.
|
|
454
|
-
5. Send launcher_ws_token + registry_port to Event Hub via stdin
|
|
455
|
-
6. Wait for Event Hub ws_endpoint → WS connect → module.ready
|
|
422
|
+
1. Start Kernel subprocess
|
|
423
|
+
2. Wait Kernel stdout port → set KITE_KERNEL_PORT env
|
|
424
|
+
3. Scan modules + connect WS + generate tokens (parallel)
|
|
425
|
+
4. Wait module.ready event from Kernel
|
|
456
426
|
"""
|
|
457
|
-
|
|
427
|
+
t_kernel = time.monotonic()
|
|
458
428
|
|
|
459
|
-
# ── Step 1: Start
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
name="
|
|
463
|
-
display_name="
|
|
429
|
+
# ── Step 1: Start Kernel process ──
|
|
430
|
+
kernel_dir = os.path.join(os.environ["KITE_PROJECT"], "kernel")
|
|
431
|
+
kernel_info = ModuleInfo(
|
|
432
|
+
name="kernel",
|
|
433
|
+
display_name="Kernel",
|
|
464
434
|
type="infrastructure",
|
|
465
435
|
state="enabled",
|
|
466
436
|
runtime="python",
|
|
467
437
|
entry="entry.py",
|
|
468
|
-
module_dir=
|
|
438
|
+
module_dir=kernel_dir,
|
|
469
439
|
)
|
|
470
|
-
|
|
471
|
-
self._log_lifecycle("starting", "
|
|
472
|
-
ok = self.process_manager.start_module(
|
|
440
|
+
# Kernel does NOT receive boot_info via stdin
|
|
441
|
+
self._log_lifecycle("starting", "kernel")
|
|
442
|
+
ok = self.process_manager.start_module(kernel_info, boot_info=None)
|
|
473
443
|
if not ok:
|
|
474
|
-
self._log_lifecycle("start_failed", "
|
|
475
|
-
raise RuntimeError("启动
|
|
476
|
-
|
|
477
|
-
# Start Event Hub in parallel (before Registry port is known)
|
|
478
|
-
eh_dir = os.path.join(os.environ["KITE_PROJECT"], "core", "event_hub")
|
|
479
|
-
eh_info = ModuleInfo(
|
|
480
|
-
name="event_hub",
|
|
481
|
-
display_name="Event Hub",
|
|
482
|
-
type="infrastructure",
|
|
483
|
-
state="enabled",
|
|
484
|
-
runtime="python",
|
|
485
|
-
entry="entry.py",
|
|
486
|
-
module_dir=eh_dir,
|
|
487
|
-
)
|
|
488
|
-
# Generate Event Hub token early (will register to Registry once it's up)
|
|
489
|
-
eh_token = secrets.token_hex(32)
|
|
490
|
-
self._module_tokens["event_hub"] = eh_token
|
|
491
|
-
boot_info_eh = {"token": eh_token}
|
|
492
|
-
self._log_lifecycle("starting", "event_hub")
|
|
493
|
-
ok = self.process_manager.start_module(eh_info, boot_info=boot_info_eh)
|
|
494
|
-
if not ok:
|
|
495
|
-
self._log_lifecycle("start_failed", "event_hub")
|
|
496
|
-
raise RuntimeError("启动 Event Hub 失败")
|
|
497
|
-
|
|
498
|
-
# Start Watchdog in parallel (before Registry port is known)
|
|
499
|
-
# Watchdog will block on stdin waiting for registry_port
|
|
500
|
-
watchdog_dir = os.path.join(os.environ["KITE_PROJECT"], "extensions", "services", "watchdog")
|
|
501
|
-
watchdog_md = os.path.join(watchdog_dir, "module.md")
|
|
502
|
-
self._watchdog_parallel = False # track whether watchdog was started in parallel
|
|
503
|
-
if os.path.isfile(watchdog_md):
|
|
504
|
-
wd_token = secrets.token_hex(32)
|
|
505
|
-
self._module_tokens["watchdog"] = wd_token
|
|
506
|
-
# Parse watchdog module.md for ModuleInfo
|
|
507
|
-
try:
|
|
508
|
-
with open(watchdog_md, "r", encoding="utf-8") as f:
|
|
509
|
-
wd_fm = _parse_frontmatter(f.read())
|
|
510
|
-
wd_info = ModuleInfo(
|
|
511
|
-
name="watchdog",
|
|
512
|
-
display_name=wd_fm.get("display_name", "Watchdog"),
|
|
513
|
-
type=wd_fm.get("type", "service"),
|
|
514
|
-
state="enabled",
|
|
515
|
-
runtime=wd_fm.get("runtime", "python"),
|
|
516
|
-
entry=wd_fm.get("entry", "entry.py"),
|
|
517
|
-
module_dir=watchdog_dir,
|
|
518
|
-
)
|
|
519
|
-
boot_info_wd = {"token": wd_token}
|
|
520
|
-
self._log_lifecycle("starting", "watchdog")
|
|
521
|
-
ok = self.process_manager.start_module(wd_info, boot_info=boot_info_wd)
|
|
522
|
-
if ok:
|
|
523
|
-
self._watchdog_parallel = True
|
|
524
|
-
else:
|
|
525
|
-
self._log_lifecycle("start_failed", "watchdog")
|
|
526
|
-
print("[launcher] 警告: Watchdog 并行启动失败,将在 Phase 3.5 重试")
|
|
527
|
-
except Exception as e:
|
|
528
|
-
print(f"[launcher] 警告: Watchdog module.md 解析失败: {e}")
|
|
444
|
+
self._log_lifecycle("start_failed", "kernel")
|
|
445
|
+
raise RuntimeError("启动 Kernel 失败")
|
|
529
446
|
|
|
530
|
-
|
|
531
|
-
print(f"[launcher] {parallel_modules} 进程已同时启动,等待 Registry 端口...")
|
|
447
|
+
print(f"[launcher] Kernel 进程已启动,等待 Kernel 端口...")
|
|
532
448
|
|
|
533
449
|
# Persist immediately after starting core processes
|
|
534
450
|
self.process_manager.persist_records()
|
|
535
451
|
|
|
536
|
-
# ── Step 2: Wait for
|
|
537
|
-
msg = await self._wait_kite_message("
|
|
538
|
-
if
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
self.
|
|
548
|
-
|
|
549
|
-
#
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
})
|
|
555
|
-
self.process_manager.write_stdin("event_hub", {
|
|
556
|
-
"kite": "registry_port",
|
|
557
|
-
"registry_port": self.registry_port,
|
|
558
|
-
})
|
|
559
|
-
|
|
560
|
-
# Send registry_port to Watchdog via stdin (if started in parallel)
|
|
561
|
-
# Watchdog will retry querying launcher.api_endpoint until it's available
|
|
562
|
-
if self.process_manager.is_running("watchdog"):
|
|
563
|
-
self.process_manager.write_stdin("watchdog", {
|
|
564
|
-
"kite": "registry_port",
|
|
565
|
-
"registry_port": self.registry_port,
|
|
566
|
-
})
|
|
567
|
-
|
|
568
|
-
# ── Step 4: Scan + register tokens ‖ wait for Event Hub ws_endpoint (parallel) ──
|
|
569
|
-
# Pre-register ws_endpoint waiter BEFORE gather to avoid race condition:
|
|
570
|
-
# module_scanner.scan() is synchronous and blocks the event loop,
|
|
571
|
-
# so the _wait_event_hub_endpoint coroutine wouldn't register its waiter in time.
|
|
572
|
-
ws_waiter_key = "event_hub:ws_endpoint"
|
|
573
|
-
ws_evt = threading.Event()
|
|
574
|
-
ws_data: dict = {}
|
|
575
|
-
self._msg_waiters[ws_waiter_key] = (ws_evt, ws_data)
|
|
576
|
-
|
|
577
|
-
async def _scan_and_register_tokens():
|
|
452
|
+
# ── Step 2: Wait for Kernel port + launcher_token ──
|
|
453
|
+
msg = await self._wait_kite_message("kernel", "port", timeout=6)
|
|
454
|
+
if self._thread_shutdown.is_set():
|
|
455
|
+
# User requested shutdown during startup
|
|
456
|
+
raise RuntimeError("启动被用户中断")
|
|
457
|
+
if not msg or not msg.get("port") or not msg.get("token"):
|
|
458
|
+
raise RuntimeError("致命错误: Kernel 在 6s 内未报告端口和 token")
|
|
459
|
+
self.kernel_port = int(msg["port"])
|
|
460
|
+
launcher_token = msg["token"]
|
|
461
|
+
self._module_tokens["launcher"] = launcher_token
|
|
462
|
+
_wait_s = time.monotonic() - t_kernel
|
|
463
|
+
print(f"[launcher] Kernel 端口: {self.kernel_port} (等待 {self._fmt_elapsed(_wait_s)})")
|
|
464
|
+
|
|
465
|
+
# ── Step 3: Set env (but don't send kernel_port to modules yet) ──
|
|
466
|
+
os.environ["KITE_KERNEL_PORT"] = str(self.kernel_port)
|
|
467
|
+
|
|
468
|
+
# ── Step 4: Scan modules + connect WS + generate tokens (parallel) ──
|
|
469
|
+
async def _scan_and_generate_tokens():
|
|
578
470
|
t_scan = time.monotonic()
|
|
579
471
|
self.modules = self.module_scanner.scan()
|
|
580
472
|
for name, info in self.modules.items():
|
|
581
473
|
self._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
|
|
582
474
|
_scan_s = time.monotonic() - t_scan
|
|
583
475
|
print(f"[launcher] 发现 {len(self.modules)} 个模块: {', '.join(self.modules.keys()) or '(无)'} (扫描 {self._fmt_elapsed(_scan_s)})")
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
self.
|
|
603
|
-
if
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
476
|
+
# Generate tokens via Kernel RPC (after WS connection is ready)
|
|
477
|
+
t_gen = time.monotonic()
|
|
478
|
+
await self._generate_module_tokens()
|
|
479
|
+
_gen_s = time.monotonic() - t_gen
|
|
480
|
+
print(f"[launcher] 令牌生成完成 ({self._fmt_elapsed(_gen_s)})")
|
|
481
|
+
|
|
482
|
+
async def _connect_kernel_ws():
|
|
483
|
+
t_ws = time.monotonic()
|
|
484
|
+
self._ws_task = asyncio.create_task(self._ws_loop())
|
|
485
|
+
# Wait for WebSocket connection to be established and ready
|
|
486
|
+
try:
|
|
487
|
+
await asyncio.wait_for(self._ws_connected.wait(), timeout=5)
|
|
488
|
+
except asyncio.TimeoutError:
|
|
489
|
+
print("[launcher] 警告: WebSocket 连接超时")
|
|
490
|
+
return
|
|
491
|
+
|
|
492
|
+
# Now wait for Kernel module.ready event
|
|
493
|
+
# (waiter is registered inside _ws_connect before _ws_receiver starts)
|
|
494
|
+
ready = await self._wait_event("module.ready", "kernel", timeout=15)
|
|
495
|
+
if ready:
|
|
496
|
+
self._graceful_modules["kernel"] = bool(ready.get("graceful_shutdown"))
|
|
497
|
+
print("[launcher] Kernel 已就绪")
|
|
498
|
+
else:
|
|
499
|
+
print("\033[91m[launcher] 警告: Kernel 在 15s 内未发送 module.ready\033[0m")
|
|
500
|
+
self._ready_times["kernel"] = time.monotonic() - t_ws
|
|
501
|
+
|
|
610
502
|
await asyncio.gather(
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
_wait_event_hub_endpoint(),
|
|
503
|
+
_scan_and_generate_tokens(),
|
|
504
|
+
_connect_kernel_ws(),
|
|
614
505
|
)
|
|
615
506
|
if self._shutdown_event.is_set():
|
|
616
507
|
return
|
|
617
508
|
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
self.
|
|
621
|
-
|
|
622
|
-
# Wait for Event Hub module.ready (sent when Launcher connects)
|
|
623
|
-
ready = await self._wait_event("module.ready", "event_hub", timeout=15)
|
|
624
|
-
if ready:
|
|
625
|
-
self._graceful_modules["event_hub"] = bool(ready.get("graceful_shutdown"))
|
|
626
|
-
print("[launcher] Event Hub 已就绪")
|
|
627
|
-
else:
|
|
628
|
-
print("[launcher] 警告: Event Hub 在 15s 内未发送 module.ready")
|
|
629
|
-
|
|
630
|
-
self._ready_times["event_hub"] = time.monotonic() - t_eh
|
|
631
|
-
self._log_lifecycle("started", "event_hub")
|
|
632
|
-
await self._publish_event("module.started", {"module_id": "event_hub"})
|
|
633
|
-
self.process_manager.close_stdio("event_hub")
|
|
634
|
-
|
|
635
|
-
# Store eh_info in modules dict if not already present (from scan)
|
|
636
|
-
if "event_hub" not in self.modules:
|
|
637
|
-
self.modules["event_hub"] = eh_info
|
|
638
|
-
|
|
639
|
-
def _get_http(self) -> httpx.AsyncClient:
|
|
640
|
-
"""Get shared HTTP client (lazy-init, reuses TCP connections to Registry)."""
|
|
641
|
-
if self._http is None or self._http.is_closed:
|
|
642
|
-
self._http = httpx.AsyncClient(timeout=5)
|
|
643
|
-
return self._http
|
|
644
|
-
|
|
645
|
-
async def _close_http(self):
|
|
646
|
-
"""Close shared HTTP client."""
|
|
647
|
-
if self._http and not self._http.is_closed:
|
|
648
|
-
await self._http.aclose()
|
|
649
|
-
self._http = None
|
|
650
|
-
|
|
651
|
-
async def _register_self(self):
|
|
652
|
-
"""Register Launcher itself to Registry."""
|
|
653
|
-
url = f"http://127.0.0.1:{self.registry_port}/modules"
|
|
654
|
-
headers = {"Authorization": f"Bearer {self.kite_token}"}
|
|
655
|
-
payload = {
|
|
656
|
-
"action": "register",
|
|
657
|
-
"module_id": "launcher",
|
|
658
|
-
"module_type": "infrastructure",
|
|
659
|
-
"name": "Launcher",
|
|
660
|
-
"api_endpoint": f"http://127.0.0.1:{self.api_port}",
|
|
661
|
-
"health_endpoint": "/launcher/modules",
|
|
662
|
-
"events_publish": {
|
|
663
|
-
"module.started": {},
|
|
664
|
-
"module.stopped": {},
|
|
665
|
-
"module.state_changed": {},
|
|
666
|
-
},
|
|
667
|
-
"events_subscribe": [">"],
|
|
668
|
-
}
|
|
669
|
-
try:
|
|
670
|
-
client = self._get_http()
|
|
671
|
-
resp = await client.post(url, json=payload, headers=headers)
|
|
672
|
-
if resp.status_code == 200:
|
|
673
|
-
print("[launcher] 已注册到 Registry")
|
|
674
|
-
else:
|
|
675
|
-
print(f"[launcher] 警告: Registry 注册返回 {resp.status_code}")
|
|
676
|
-
except Exception as e:
|
|
677
|
-
print(f"[launcher] 警告: 注册到 Registry 失败: {e}")
|
|
678
|
-
|
|
679
|
-
# ── (Phase 2 merged into _phase1_parallel_bootstrap) ──
|
|
509
|
+
self._log_lifecycle("started", "kernel")
|
|
510
|
+
await self._publish_event("module.started", {"module_id": "kernel"})
|
|
511
|
+
self.process_manager.close_stdio("kernel")
|
|
680
512
|
|
|
681
|
-
|
|
513
|
+
# Store kernel_info in modules dict if not already present (from scan)
|
|
514
|
+
if "kernel" not in self.modules:
|
|
515
|
+
self.modules["kernel"] = kernel_info
|
|
682
516
|
|
|
683
|
-
|
|
684
|
-
"""Wait for Registry module.ready (triggered after Event Hub registers to Registry
|
|
685
|
-
and Registry connects to Event Hub WS)."""
|
|
686
|
-
print("[launcher] 等待 Registry 连接 Event Hub...")
|
|
687
|
-
ready = await self._wait_event("module.ready", "registry", timeout=12)
|
|
688
|
-
if ready:
|
|
689
|
-
self._graceful_modules["registry"] = bool(ready.get("graceful_shutdown"))
|
|
690
|
-
print("[launcher] Registry 事件总线连接完成")
|
|
691
|
-
else:
|
|
692
|
-
print("[launcher] 警告: Registry 在 12s 内未连接事件总线 (降级运行)")
|
|
517
|
+
# ── Phase 2: Start remaining modules ──
|
|
693
518
|
|
|
694
|
-
|
|
695
|
-
await self._publish_event("module.started", {"module_id": "registry"})
|
|
696
|
-
self.process_manager.close_stdio("registry")
|
|
697
|
-
|
|
698
|
-
# ── Phase 4: Start remaining modules ──
|
|
699
|
-
|
|
700
|
-
async def _phase4_start_modules(self):
|
|
519
|
+
async def _phase2_start_modules(self):
|
|
701
520
|
"""Start enabled modules (excluding core) in dependency order."""
|
|
702
521
|
to_start = [m for m in self.modules.values()
|
|
703
522
|
if self._desired_states.get(m.name) == "running"
|
|
@@ -735,10 +554,10 @@ class Launcher:
|
|
|
735
554
|
else:
|
|
736
555
|
await asyncio.gather(*(self._start_one_module(info) for info in layer))
|
|
737
556
|
|
|
738
|
-
# ──
|
|
557
|
+
# ── Kernel WebSocket connection (JSON-RPC 2.0) ──
|
|
739
558
|
|
|
740
559
|
async def _ws_loop(self):
|
|
741
|
-
"""Connect to
|
|
560
|
+
"""Connect to Kernel, reconnect on failure."""
|
|
742
561
|
while not self._thread_shutdown.is_set():
|
|
743
562
|
try:
|
|
744
563
|
await self._ws_connect()
|
|
@@ -746,105 +565,327 @@ class Launcher:
|
|
|
746
565
|
return
|
|
747
566
|
except Exception as e:
|
|
748
567
|
if not self._system_shutting_down:
|
|
749
|
-
print(f"[launcher]
|
|
568
|
+
print(f"[launcher] Kernel 连接错误: {e}")
|
|
750
569
|
self._ws = None
|
|
751
570
|
await asyncio.sleep(5)
|
|
752
571
|
|
|
753
572
|
async def _ws_connect(self):
|
|
754
|
-
"""Single WebSocket session with
|
|
755
|
-
|
|
573
|
+
"""Single WebSocket session with JSON-RPC 2.0 protocol."""
|
|
574
|
+
launcher_token = self._module_tokens.get("launcher", "")
|
|
575
|
+
ws_url = f"ws://127.0.0.1:{self.kernel_port}/ws?token={launcher_token}&id=launcher"
|
|
756
576
|
t_ws_connect = time.monotonic()
|
|
757
577
|
async with websockets.connect(ws_url, open_timeout=3, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
|
|
758
578
|
self._ws = ws
|
|
759
579
|
_ws_s = time.monotonic() - t_ws_connect
|
|
760
|
-
print(f"[launcher] 已连接到
|
|
580
|
+
print(f"[launcher] 已连接到 Kernel ({self._fmt_elapsed(_ws_s)})")
|
|
761
581
|
|
|
762
|
-
#
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
582
|
+
# Start receive loop in background task BEFORE making any RPC calls
|
|
583
|
+
# This prevents deadlock where RPC waits for response but receive loop hasn't started
|
|
584
|
+
receiver_task = asyncio.create_task(self._ws_receiver(ws))
|
|
585
|
+
|
|
586
|
+
try:
|
|
587
|
+
# Register kernel module.ready waiter BEFORE subscribing to events
|
|
588
|
+
# This prevents race condition where event arrives before waiter is registered
|
|
589
|
+
ready_key = "module.ready:kernel"
|
|
590
|
+
ready_evt = asyncio.Event()
|
|
591
|
+
ready_data = {}
|
|
592
|
+
self._event_waiters[ready_key] = (ready_evt, ready_data)
|
|
593
|
+
|
|
594
|
+
# Subscribe to all events
|
|
595
|
+
await self._rpc_call(ws, "event.subscribe", {"events": [">"]})
|
|
596
|
+
|
|
597
|
+
# Register Launcher itself in the Registry
|
|
598
|
+
await self._rpc_call(ws, "registry.register", {
|
|
599
|
+
"module_id": "launcher",
|
|
600
|
+
"module_type": "infrastructure",
|
|
601
|
+
"events_publish": {
|
|
602
|
+
"module.started": {},
|
|
603
|
+
"module.stopped": {},
|
|
604
|
+
"module.state_changed": {},
|
|
605
|
+
},
|
|
606
|
+
"events_subscribe": [">"],
|
|
607
|
+
})
|
|
608
|
+
print("[launcher] 已注册到 Kernel")
|
|
609
|
+
|
|
610
|
+
# Signal that connection is ready (after subscription and registration)
|
|
611
|
+
if self._ws_connected:
|
|
612
|
+
self._ws_connected.set()
|
|
613
|
+
|
|
614
|
+
# Wait for receiver task to complete (connection closed)
|
|
615
|
+
await receiver_task
|
|
616
|
+
except asyncio.CancelledError:
|
|
617
|
+
receiver_task.cancel()
|
|
618
|
+
raise
|
|
767
619
|
|
|
768
|
-
|
|
620
|
+
async def _ws_receiver(self, ws):
    """Receive loop: decode each frame from Kernel and route it by JSON-RPC shape.

    Routing:
      * ``method`` without ``id``            -> event notification (handled inline)
      * ``method`` with ``id``               -> incoming RPC request (background task)
      * ``id`` with ``result`` or ``error``  -> response to one of our own calls

    Incoming requests are deliberately NOT awaited inline. Their handlers issue
    RPC calls of their own and wait on event waiters, and both are only
    satisfied when this loop reads the next frame; awaiting a handler here
    would stall the loop until the nested wait timed out. As a consequence,
    request handlers may overlap with each other.

    Args:
        ws: Connected WebSocket. It is iterated with ``async for`` and handed
            to the request handler so it can send the reply.
    """
    # Strong references: the event loop only keeps weak references to tasks.
    in_flight = set()

    async def _serve_request(request):
        # Same policy as the inline path: a failing handler must not kill the loop.
        try:
            await self._handle_rpc_request(ws, request)
        except Exception as e:
            print(f"[launcher] 消息处理异常(已忽略): {e}")

    try:
        async for raw in ws:
            try:
                msg = json.loads(raw)
            except (json.JSONDecodeError, TypeError):
                continue  # not JSON: drop the frame
            try:
                has_method = "method" in msg
                has_id = "id" in msg

                if has_method and not has_id:
                    # Event Notification (no id)
                    await self._handle_event_notification(msg)
                elif has_method and has_id:
                    # Incoming RPC request (forwarded by Kernel)
                    task = asyncio.create_task(_serve_request(msg))
                    in_flight.add(task)
                    task.add_done_callback(in_flight.discard)
                elif has_id and ("result" in msg or "error" in msg):
                    # RPC response (to our own call)
                    self._handle_rpc_response(msg)
            except Exception as e:
                print(f"[launcher] 消息处理异常(已忽略): {e}")

        # Connection closed: let handlers that are still running finish, so the
        # coroutine returns only once every request has been dealt with.
        if in_flight:
            await asyncio.gather(*in_flight, return_exceptions=True)
    except asyncio.CancelledError:
        # Cancellation of the receiver also cancels the handlers it spawned.
        for task in tuple(in_flight):
            task.cancel()
|
|
647
|
+
|
|
648
|
+
# ── JSON-RPC 2.0 infrastructure ──
|
|
649
|
+
|
|
650
|
+
async def _rpc_call(self, ws, method: str, params: dict = None, timeout: float = 5) -> dict:
|
|
651
|
+
"""Send a JSON-RPC 2.0 request and await the response."""
|
|
652
|
+
rpc_id = str(uuid.uuid4())
|
|
653
|
+
msg = {"jsonrpc": "2.0", "id": rpc_id, "method": method}
|
|
654
|
+
if params:
|
|
655
|
+
msg["params"] = params
|
|
656
|
+
|
|
657
|
+
evt = asyncio.Event()
|
|
658
|
+
self._rpc_waiters[rpc_id] = evt
|
|
659
|
+
self._rpc_results[rpc_id] = {}
|
|
660
|
+
|
|
661
|
+
try:
|
|
662
|
+
await ws.send(json.dumps(msg))
|
|
663
|
+
await asyncio.wait_for(evt.wait(), timeout=timeout)
|
|
664
|
+
return self._rpc_results.get(rpc_id, {})
|
|
665
|
+
except asyncio.TimeoutError:
|
|
666
|
+
print(f"[launcher] RPC 超时: {method}")
|
|
667
|
+
return {"error": {"code": -32002, "message": f"RPC timeout: {method}"}}
|
|
668
|
+
finally:
|
|
669
|
+
self._rpc_waiters.pop(rpc_id, None)
|
|
670
|
+
self._rpc_results.pop(rpc_id, None)
|
|
671
|
+
|
|
672
|
+
def _handle_rpc_response(self, msg: dict):
    """Deliver an RPC response to the call that is waiting for it.

    The response is looked up by its ``id`` in ``self._rpc_waiters``. When a
    waiter exists, the whole message is stored in ``self._rpc_results`` and the
    waiter is set; responses with no registered waiter are ignored.
    """
    request_id = msg.get("id", "")
    pending = self._rpc_waiters.get(request_id)
    if not pending:
        return  # nobody is waiting for this id (any more)
    self._rpc_results[request_id] = msg
    pending.set()
|
|
679
|
+
|
|
680
|
+
async def _handle_event_notification(self, msg: dict):
    """Handle an event notification (JSON-RPC 2.0 Notification with method='event').

    Steps, in order:
      1. Wake the waiter registered under ``"<event>:<module_id>"``, copying the
         event data into the waiter's data dict.
      2. For ``module.exiting``, also wake the ``module.ready`` waiter of the
         same module and flag its data with ``_exited``.
      3. For ``module.crash``, print a red summary and the crash-log path.
      4. Print ``module.*`` / ``watchdog.*`` events; all other events are not
         logged.

    Malformed notifications (``params`` not an object, ``event`` not a string,
    ``timestamp`` not a string) are normalised instead of raising.

    Args:
        msg: Decoded notification; ``params`` may carry ``source``, ``event``,
            ``data`` and ``timestamp``.
    """
    params = msg.get("params")
    if not isinstance(params, dict):
        params = {}
    source = params.get("source", "unknown")
    event = params.get("event", "")
    if not isinstance(event, str):
        event = str(event)
    data = params.get("data") if isinstance(params.get("data"), dict) else {}
    ts = params.get("timestamp", "")

    # Trigger event waiters
    module_id = data.get("module_id", "")
    waiter = self._event_waiters.get(f"{event}:{module_id}")
    if waiter:
        waiter[1].update(data)
        waiter[0].set()

    # module.exiting also wakes module.ready waiter
    if event == "module.exiting" and module_id:
        ready_waiter = self._event_waiters.get(f"module.ready:{module_id}")
        if ready_waiter:
            ready_waiter[1].update(data)
            ready_waiter[1]["_exited"] = True
            ready_waiter[0].set()

    # module.crash → print red crash summary
    if event == "module.crash" and module_id:
        RED = "\033[91m"
        RESET = "\033[0m"
        exc_type = data.get("exception_type", "Unknown")
        preview = data.get("traceback_preview", "")
        print(f"[launcher] {RED}模块 '{module_id}' 崩溃: {exc_type} — {preview}{RESET}")
        _suffix = os.environ.get("KITE_INSTANCE_SUFFIX", "")
        crash_log = os.path.join(
            os.environ.get("KITE_INSTANCE_DIR", ""),
            module_id, "log", f"crashes{_suffix}.jsonl"
        )
        print(f"[launcher] 崩溃日志: {crash_log}")

    # Only log system events (module.*, watchdog.*) to avoid flooding
    if not event.startswith(("module.", "watchdog.")):
        return

    if not ts:
        print(f"[{source}] {event}: {json.dumps(data, ensure_ascii=False)}")
        return

    latency_str = ""
    try:
        from datetime import datetime, timezone
        sent = datetime.fromisoformat(ts)
        delay_ms = (datetime.now(timezone.utc) - sent).total_seconds() * 1000
        latency_str = f" ({delay_ms:.1f}ms)"
        local_ts = sent.astimezone().strftime("%H:%M:%S")
    except Exception:
        # Unparseable (or timezone-naive) timestamp: fall back to the raw
        # HH:MM:SS slice of an ISO string, or the raw text when it is shorter.
        raw_ts = ts if isinstance(ts, str) else str(ts)
        local_ts = raw_ts[11:19] if len(raw_ts) >= 19 else raw_ts
    print(f"[{source}] {local_ts} {event}{latency_str}: {json.dumps(data, ensure_ascii=False)}")
|
|
735
|
+
|
|
736
|
+
async def _handle_rpc_request(self, ws, msg: dict):
    """Handle an incoming RPC request forwarded by Kernel (launcher.* methods).

    The method name is looked up in a dispatch table of ``_rpc_*`` handlers and
    exactly one JSON-RPC 2.0 reply is sent on ``ws``:

      * unknown method                     -> error -32601
      * ``params`` present but not an object -> error -32602
      * handler (or result encoding) raises  -> error -32603 with ``str(exc)``
      * otherwise                          -> ``result`` returned by the handler

    Missing or ``null`` params are passed to the handler as ``{}``. The reply
    is built first and sent once, so a failing send is never retried from the
    error branch or reported as a handler error.

    Args:
        ws: WebSocket used to send the reply.
        msg: Decoded request with ``id``, ``method`` and optional ``params``.
    """
    rpc_id = msg.get("id", "")
    method = msg.get("method", "")
    params = msg.get("params")
    if params is None:
        params = {}

    handlers = {
        "list_modules": self._rpc_list_modules,
        "start_module": self._rpc_start_module,
        "stop_module": self._rpc_stop_module,
        "restart_module": self._rpc_restart_module,
        "rescan": self._rpc_rescan,
        "shutdown": self._rpc_shutdown,
    }

    def _error(code, message):
        return json.dumps({
            "jsonrpc": "2.0", "id": rpc_id,
            "error": {"code": code, "message": message},
        })

    handler = handlers.get(method)
    if handler is None:
        payload = _error(-32601, f"Method not found: {method}")
    elif not isinstance(params, dict):
        payload = _error(-32602, "Invalid params: expected an object")
    else:
        try:
            result = await handler(params)
            # Encode inside the try: an unserialisable result is an internal error.
            payload = json.dumps({"jsonrpc": "2.0", "id": rpc_id, "result": result})
        except Exception as e:
            payload = _error(-32603, str(e))

    await ws.send(payload)
|
|
765
|
+
|
|
766
|
+
# ── Launcher RPC method handlers ──
|
|
767
|
+
|
|
768
|
+
async def _rpc_list_modules(self, params: dict) -> dict:
    """Report every scanned module together with its configured, desired and actual state.

    Args:
        params: Unused.

    Returns:
        ``{"modules": [...]}`` with one entry per item of ``self.modules``.
        ``actual_state`` is ``"running(<pid>)"`` and ``pid`` is set only when
        the process manager reports the module as running and has a record
        for it; otherwise ``"stopped"`` / ``None``.
    """
    manager = self.process_manager

    def describe(name, info):
        alive = manager.is_running(name)
        record = manager.get_record(name)
        has_live_record = bool(alive and record)
        return {
            "name": name,
            "display_name": info.display_name,
            "type": info.type,
            "config_state": info.state,
            "desired_state": self._desired_states.get(name, "stopped"),
            "actual_state": f"running({record.pid})" if has_live_record else "stopped",
            "pid": record.pid if has_live_record else None,
            "monitor": info.monitor,
        }

    return {"modules": [describe(name, info) for name, info in self.modules.items()]}
|
|
785
|
+
|
|
786
|
+
async def _rpc_start_module(self, params: dict) -> dict:
    """Start the module named by ``params["name"]``.

    A token is generated and registered with Kernel the first time a module is
    started this way; an existing token is reused. On success the desired state
    becomes ``"running"``, process records are persisted and ``module.started``
    is published.

    NOTE(review): the regular startup path in this file also writes a
    ``kernel_port`` message to the module's stdin after launching it; this path
    does not. Confirm whether ``process_manager.start_module`` already covers it.

    Args:
        params: Request params; reads ``name``.

    Returns:
        ``{"status": "started", "name": name}``.

    Raises:
        RuntimeError: Unknown module, disabled module, or the process manager
            failed to start it.
    """
    name = params.get("name", "")
    info = self.modules.get(name)
    if not info:
        raise RuntimeError(f"Module '{name}' not found")
    if info.state == "disabled":
        raise RuntimeError(f"Module '{name}' is disabled")

    if name not in self._module_tokens:
        self._module_tokens[name] = secrets.token_hex(32)
        await self._register_new_tokens({name: self._module_tokens[name]})

    launched = self.process_manager.start_module(
        info, boot_info={"token": self._module_tokens[name]}
    )
    if not launched:
        self._log_lifecycle("start_failed", name, via="rpc")
        raise RuntimeError(f"Failed to start '{name}'")

    self._desired_states[name] = "running"
    self.process_manager.persist_records()
    record = self.process_manager.get_record(name)
    self._log_lifecycle("started", name, pid=record.pid if record else None, via="rpc")
    await self._publish_event("module.started", {"module_id": name})
    return {"status": "started", "name": name}
|
|
811
|
+
|
|
812
|
+
async def _rpc_stop_module(self, params: dict) -> dict:
    """Stop the module named by ``params["name"]`` via the graceful-stop path.

    The desired state is set to ``"stopped"`` before stopping, and process
    records are persisted afterwards.

    Args:
        params: Request params; reads ``name`` and optional ``reason``
            (default ``"stop_requested"``).

    Returns:
        ``{"status": "stopped", "name": name}``.

    Raises:
        RuntimeError: The module is not known.
    """
    name = params.get("name", "")
    if not self.modules.get(name):
        raise RuntimeError(f"Module '{name}' not found")

    why = params.get("reason", "stop_requested")
    self._desired_states[name] = "stopped"
    await self._graceful_stop(name, why)
    self.process_manager.persist_records()
    return {"status": "stopped", "name": name}
|
|
823
|
+
|
|
824
|
+
async def _rpc_restart_module(self, params: dict) -> dict:
    """Restart the module named by ``params["name"]`` (graceful stop, then start).

    A fresh token is always generated and registered with Kernel before the
    module is started again. On success the desired state becomes
    ``"running"``, process records are persisted and ``module.started`` is
    published.

    NOTE(review): the regular startup path in this file also writes a
    ``kernel_port`` message to the module's stdin after launching it; this path
    does not. Confirm whether ``process_manager.start_module`` already covers it.

    Args:
        params: Request params; reads ``name`` and optional ``reason``
            (default ``"restart"``).

    Returns:
        ``{"status": "restarted", "name": name}``.

    Raises:
        RuntimeError: Unknown module, disabled module, or the start failed.
    """
    name = params.get("name", "")
    info = self.modules.get(name)
    if not info:
        raise RuntimeError(f"Module '{name}' not found")
    if info.state == "disabled":
        raise RuntimeError(f"Module '{name}' is disabled")

    await self._graceful_stop(name, params.get("reason", "restart"))

    # Rotate the token on every restart.
    self._module_tokens[name] = secrets.token_hex(32)
    await self._register_new_tokens({name: self._module_tokens[name]})

    launched = self.process_manager.start_module(
        info, boot_info={"token": self._module_tokens[name]}
    )
    if not launched:
        self._log_lifecycle("start_failed", name, via="rpc_restart")
        raise RuntimeError(f"Failed to restart '{name}'")

    self._desired_states[name] = "running"
    self.process_manager.persist_records()
    record = self.process_manager.get_record(name)
    self._log_lifecycle("started", name, pid=record.pid if record else None, via="rpc_restart")
    await self._publish_event("module.started", {"module_id": name})
    return {"status": "restarted", "name": name}
|
|
848
|
+
|
|
849
|
+
async def _rpc_rescan(self, params: dict) -> dict:
    """Rescan module directories and report which modules appeared or disappeared.

    ``self.modules`` is replaced by the scanner's result. Each newly found
    module gets a lifecycle log entry, a desired state (``"running"`` when its
    configured state is ``"enabled"``, else ``"stopped"``) and a fresh token;
    all new tokens are registered with Kernel in one call.

    NOTE(review): modules that disappeared are only reported; their entries in
    ``_desired_states`` / ``_module_tokens`` are left untouched here.

    Args:
        params: Unused.

    Returns:
        ``{"added": [...], "removed": [...], "total": n}``; both lists are
        sorted so the reply does not depend on set iteration order.
    """
    before = set(self.modules)
    self.modules = self.module_scanner.scan()
    after = set(self.modules)

    added = sorted(after - before)
    removed = sorted(before - after)

    new_tokens = {}
    for name in added:
        info = self.modules[name]
        self._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
        self._desired_states[name] = "running" if info.state == "enabled" else "stopped"
        token = secrets.token_hex(32)
        self._module_tokens[name] = token
        new_tokens[name] = token

    if new_tokens:
        await self._register_new_tokens(new_tokens)

    return {"added": added, "removed": removed, "total": len(self.modules)}
|
|
867
|
+
|
|
868
|
+
async def _rpc_shutdown(self, params: dict) -> dict:
    """Request a shutdown of the whole Kite system.

    The request is handed to ``self._request_shutdown``; this handler returns
    right after making that call.

    Args:
        params: Request params; reads optional ``reason``
            (default ``"rpc_request"``).

    Returns:
        ``{"status": "shutting_down", "reason": reason}``.
    """
    why = params.get("reason", "rpc_request")
    self._request_shutdown(f"RPC shutdown request: {why}")
    return {"status": "shutting_down", "reason": why}
|
|
873
|
+
|
|
874
|
+
# ── Event publishing via RPC ──
|
|
833
875
|
|
|
834
876
|
async def _publish_event(self, event_type: str, data: dict):
|
|
835
|
-
"""Publish an event
|
|
836
|
-
deadlock with _ws_connect recv loop (websockets 15.x send can block when
|
|
837
|
-
incoming frames are pending and recv is held by async-for)."""
|
|
877
|
+
"""Publish an event via RPC event.publish through Kernel WS."""
|
|
838
878
|
if not self._ws:
|
|
839
879
|
return
|
|
840
|
-
from datetime import datetime, timezone
|
|
841
880
|
msg = json.dumps({
|
|
842
|
-
"
|
|
843
|
-
"
|
|
844
|
-
"
|
|
845
|
-
"
|
|
846
|
-
|
|
847
|
-
|
|
881
|
+
"jsonrpc": "2.0",
|
|
882
|
+
"id": str(uuid.uuid4()),
|
|
883
|
+
"method": "event.publish",
|
|
884
|
+
"params": {
|
|
885
|
+
"event_id": str(uuid.uuid4()),
|
|
886
|
+
"event": event_type,
|
|
887
|
+
"data": data,
|
|
888
|
+
},
|
|
848
889
|
})
|
|
849
890
|
|
|
850
891
|
async def _send():
|
|
@@ -855,14 +896,6 @@ class Launcher:
|
|
|
855
896
|
|
|
856
897
|
asyncio.create_task(_send())
|
|
857
898
|
|
|
858
|
-
def _publish_event_threadsafe(self, event_type: str, data: dict):
|
|
859
|
-
"""Publish event from non-async context (API thread). Fire-and-forget."""
|
|
860
|
-
if not self._ws or not self._loop:
|
|
861
|
-
return
|
|
862
|
-
asyncio.run_coroutine_threadsafe(
|
|
863
|
-
self._publish_event(event_type, data), self._loop,
|
|
864
|
-
)
|
|
865
|
-
|
|
866
899
|
async def _wait_event(self, event_type: str, module_id: str, timeout: float) -> dict | None:
|
|
867
900
|
"""Wait for a specific event from a module. Returns data dict or None on timeout."""
|
|
868
901
|
key = f"{event_type}:{module_id}"
|
|
@@ -892,12 +925,32 @@ class Launcher:
|
|
|
892
925
|
})
|
|
893
926
|
return
|
|
894
927
|
|
|
928
|
+
# Register waiters BEFORE sending shutdown event
|
|
929
|
+
ack_key = f"module.shutdown.ack:{name}"
|
|
930
|
+
ack_evt = asyncio.Event()
|
|
931
|
+
ack_data = {}
|
|
932
|
+
self._event_waiters[ack_key] = (ack_evt, ack_data)
|
|
933
|
+
|
|
934
|
+
ready_key = f"module.shutdown.ready:{name}"
|
|
935
|
+
ready_evt = asyncio.Event()
|
|
936
|
+
ready_data = {}
|
|
937
|
+
self._event_waiters[ready_key] = (ready_evt, ready_data)
|
|
938
|
+
|
|
895
939
|
await self._publish_event("module.shutdown", {
|
|
896
940
|
"module_id": name, "reason": reason, "timeout": timeout,
|
|
897
941
|
})
|
|
898
942
|
|
|
899
|
-
|
|
943
|
+
# Wait for ack
|
|
944
|
+
try:
|
|
945
|
+
await asyncio.wait_for(ack_evt.wait(), timeout=3)
|
|
946
|
+
ack = ack_data
|
|
947
|
+
except asyncio.TimeoutError:
|
|
948
|
+
ack = None
|
|
949
|
+
finally:
|
|
950
|
+
self._event_waiters.pop(ack_key, None)
|
|
951
|
+
|
|
900
952
|
if not ack:
|
|
953
|
+
self._event_waiters.pop(ready_key, None)
|
|
901
954
|
self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
|
|
902
955
|
await self._publish_event("module.stopped", {
|
|
903
956
|
"module_id": name,
|
|
@@ -906,7 +959,15 @@ class Launcher:
|
|
|
906
959
|
return
|
|
907
960
|
|
|
908
961
|
estimated = min(ack.get("estimated_cleanup", timeout), timeout)
|
|
909
|
-
|
|
962
|
+
|
|
963
|
+
# Wait for ready
|
|
964
|
+
try:
|
|
965
|
+
await asyncio.wait_for(ready_evt.wait(), timeout=estimated)
|
|
966
|
+
ready = ready_data
|
|
967
|
+
except asyncio.TimeoutError:
|
|
968
|
+
ready = None
|
|
969
|
+
finally:
|
|
970
|
+
self._event_waiters.pop(ready_key, None)
|
|
910
971
|
if ready:
|
|
911
972
|
self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_READY)
|
|
912
973
|
else:
|
|
@@ -920,10 +981,10 @@ class Launcher:
|
|
|
920
981
|
|
|
921
982
|
async def _graceful_shutdown_all(self):
|
|
922
983
|
"""Shut down all modules. Order:
|
|
923
|
-
1. Send shutdown to graceful modules (excl.
|
|
984
|
+
1. Send shutdown to graceful modules (excl. Kernel) — let them start cleanup
|
|
924
985
|
2. Terminate non-graceful modules (fast, runs during graceful cleanup)
|
|
925
986
|
3. Wait for graceful modules to exit (process monitoring)
|
|
926
|
-
4. Shut down
|
|
987
|
+
4. Shut down Kernel last (keeps event routing alive throughout)
|
|
927
988
|
"""
|
|
928
989
|
self._system_shutting_down = True
|
|
929
990
|
running = [n for n in self.modules if self.process_manager.is_running(n)]
|
|
@@ -938,9 +999,9 @@ class Launcher:
|
|
|
938
999
|
graceful = [n for n in running if self._graceful_modules.get(n)]
|
|
939
1000
|
non_graceful = [n for n in running if not self._graceful_modules.get(n)]
|
|
940
1001
|
|
|
941
|
-
# Defer
|
|
942
|
-
|
|
943
|
-
graceful_batch = [n for n in graceful if n != "
|
|
1002
|
+
# Defer Kernel — it must stay alive to route shutdown events
|
|
1003
|
+
kernel_deferred = "kernel" in graceful
|
|
1004
|
+
graceful_batch = [n for n in graceful if n != "kernel"] if kernel_deferred else graceful
|
|
944
1005
|
|
|
945
1006
|
print(f"[launcher] 正在关闭 {len(running)} 个模块: {', '.join(running)}")
|
|
946
1007
|
|
|
@@ -976,44 +1037,49 @@ class Launcher:
|
|
|
976
1037
|
self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
|
|
977
1038
|
self._log_lifecycle("stopped", name, reason="system_shutdown")
|
|
978
1039
|
|
|
979
|
-
# Phase 4: All other modules exited — now shut down
|
|
980
|
-
if
|
|
981
|
-
self._log_lifecycle("stopping", "
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
if
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
1040
|
+
# Phase 4: All other modules exited — now shut down Kernel
|
|
1041
|
+
if kernel_deferred and self.process_manager.is_running("kernel"):
|
|
1042
|
+
self._log_lifecycle("stopping", "kernel", reason="system_shutdown")
|
|
1043
|
+
print("[launcher] 正在关闭 Kernel...")
|
|
1044
|
+
|
|
1045
|
+
# Call kernel.shutdown RPC (not event)
|
|
1046
|
+
rpc_sent = False
|
|
1047
|
+
try:
|
|
1048
|
+
if self._ws:
|
|
1049
|
+
await self._rpc_call(self._ws, "kernel.shutdown", {})
|
|
1050
|
+
print("[launcher] Kernel shutdown RPC 已发送")
|
|
1051
|
+
rpc_sent = True
|
|
1052
|
+
else:
|
|
1053
|
+
print("[launcher] WebSocket 未连接,跳过 RPC 调用")
|
|
1054
|
+
except Exception as e:
|
|
1055
|
+
print(f"[launcher] Kernel shutdown RPC 失败: {e}")
|
|
1056
|
+
|
|
1057
|
+
# Wait for kernel to exit
|
|
1058
|
+
if rpc_sent:
|
|
1059
|
+
# RPC sent: wait up to 5s for graceful exit
|
|
1060
|
+
proc = self.process_manager._processes.get("kernel")
|
|
1061
|
+
if proc:
|
|
1062
|
+
try:
|
|
1063
|
+
loop = asyncio.get_event_loop()
|
|
1064
|
+
await asyncio.wait_for(
|
|
1065
|
+
loop.run_in_executor(None, proc.wait),
|
|
1066
|
+
timeout=5
|
|
1067
|
+
)
|
|
1068
|
+
print("[launcher] Kernel 已退出")
|
|
1069
|
+
except asyncio.TimeoutError:
|
|
1070
|
+
print("[launcher] Kernel 5秒内未退出,强制停止")
|
|
1071
|
+
self.process_manager.stop_module("kernel", timeout=SHUTDOWN_TIMEOUT_PARTIAL)
|
|
1072
|
+
else:
|
|
1073
|
+
# No RPC (WS not connected): use shorter timeout for terminate
|
|
1074
|
+
self.process_manager.stop_module("kernel", timeout=2)
|
|
1075
|
+
|
|
1076
|
+
self._log_lifecycle("stopped", "kernel", reason="system_shutdown")
|
|
994
1077
|
|
|
995
1078
|
# Final safety net
|
|
996
1079
|
try:
|
|
997
1080
|
self.process_manager.stop_all(timeout=SHUTDOWN_TIMEOUT_BULK)
|
|
998
1081
|
except Exception as e:
|
|
999
1082
|
print(f"[launcher] stop_all 出错: {e}")
|
|
1000
|
-
await self._close_http()
|
|
1001
|
-
|
|
1002
|
-
# ── Heartbeat to Registry ──
|
|
1003
|
-
|
|
1004
|
-
async def _heartbeat_loop(self):
|
|
1005
|
-
"""Send heartbeat to Registry every 30 seconds."""
|
|
1006
|
-
while not self._thread_shutdown.is_set():
|
|
1007
|
-
await asyncio.sleep(30)
|
|
1008
|
-
try:
|
|
1009
|
-
client = self._get_http()
|
|
1010
|
-
await client.post(
|
|
1011
|
-
f"http://127.0.0.1:{self.registry_port}/modules",
|
|
1012
|
-
json={"action": "heartbeat", "module_id": "launcher"},
|
|
1013
|
-
headers={"Authorization": f"Bearer {self.kite_token}"},
|
|
1014
|
-
)
|
|
1015
|
-
except Exception:
|
|
1016
|
-
pass
|
|
1017
1083
|
|
|
1018
1084
|
# ── Module startup ──
|
|
1019
1085
|
|
|
@@ -1080,7 +1146,7 @@ class Launcher:
|
|
|
1080
1146
|
return layers
|
|
1081
1147
|
|
|
1082
1148
|
async def _start_one_module(self, info: ModuleInfo):
|
|
1083
|
-
"""Start a single module: publish starting → start process → wait ready → started → close stdio."""
|
|
1149
|
+
"""Start a single module: publish starting → start process → send kernel_port → wait ready → started → close stdio."""
|
|
1084
1150
|
self._log_lifecycle("starting", info.name)
|
|
1085
1151
|
await self._publish_event("module.starting", {"module_id": info.name})
|
|
1086
1152
|
|
|
@@ -1092,13 +1158,32 @@ class Launcher:
|
|
|
1092
1158
|
self._log_lifecycle("start_failed", info.name)
|
|
1093
1159
|
return
|
|
1094
1160
|
|
|
1161
|
+
# Register waiter BEFORE sending kernel_port
|
|
1162
|
+
# This prevents race condition where module connects and sends module.ready before waiter is registered
|
|
1163
|
+
ready_key = f"module.ready:{info.name}"
|
|
1164
|
+
ready_evt = asyncio.Event()
|
|
1165
|
+
ready_data = {}
|
|
1166
|
+
self._event_waiters[ready_key] = (ready_evt, ready_data)
|
|
1167
|
+
|
|
1168
|
+
# Send kernel_port via stdin so module can connect to Kernel WS
|
|
1169
|
+
self.process_manager.write_stdin(info.name, {
|
|
1170
|
+
"kite": "kernel_port",
|
|
1171
|
+
"kernel_port": self.kernel_port,
|
|
1172
|
+
})
|
|
1173
|
+
|
|
1095
1174
|
# Persist immediately after starting to ensure PID is recorded
|
|
1096
|
-
# (in case launcher crashes before Phase 4 completes)
|
|
1097
1175
|
self.process_manager.persist_records()
|
|
1098
1176
|
|
|
1099
1177
|
# Wait for module.ready or module.exiting (whichever comes first)
|
|
1100
1178
|
timeout = info.launch.timeout
|
|
1101
|
-
|
|
1179
|
+
try:
|
|
1180
|
+
await asyncio.wait_for(ready_evt.wait(), timeout=timeout)
|
|
1181
|
+
ready = ready_data
|
|
1182
|
+
except asyncio.TimeoutError:
|
|
1183
|
+
ready = None
|
|
1184
|
+
finally:
|
|
1185
|
+
self._event_waiters.pop(ready_key, None)
|
|
1186
|
+
|
|
1102
1187
|
elapsed = time.monotonic() - t0
|
|
1103
1188
|
if ready and ready.get("_exited"):
|
|
1104
1189
|
# Module sent module.exiting before ready — it chose to quit
|
|
@@ -1110,7 +1195,7 @@ class Launcher:
|
|
|
1110
1195
|
self._ready_times[info.name] = elapsed
|
|
1111
1196
|
print(f"[launcher] 模块 '{info.name}' 已就绪 ({elapsed:.2f}s)")
|
|
1112
1197
|
else:
|
|
1113
|
-
print(f"[launcher] 警告: '{info.name}' 在 {timeout}s 内未发送 module.ready")
|
|
1198
|
+
print(f"\033[91m[launcher] 警告: '{info.name}' 在 {timeout}s 内未发送 module.ready\033[0m")
|
|
1114
1199
|
|
|
1115
1200
|
rec = self.process_manager.get_record(info.name)
|
|
1116
1201
|
self._log_lifecycle("started", info.name, pid=rec.pid if rec else None)
|
|
@@ -1118,87 +1203,74 @@ class Launcher:
|
|
|
1118
1203
|
self.process_manager.close_stdio(info.name)
|
|
1119
1204
|
|
|
1120
1205
|
async def _register_module_tokens(self):
    """Do nothing; kept so existing callers of this name keep working.

    NOTE(review): this method has no body beyond its documentation. It appears
    to be a leftover of the former token-registration step; in this file the
    tokens are requested from Kernel by ``_generate_module_tokens`` and extra
    ones are registered by ``_register_new_tokens``. Confirm there are no
    remaining callers, then remove it.
    """
    return None
|
|
1208
|
+
async def _generate_module_tokens(self):
    """Ask Kernel (RPC ``kernel.generate_tokens``) for tokens for modules that lack one.

    Only modules in ``self.modules`` without an entry in ``self._module_tokens``
    are requested. The call is made once ``self._ws_connected`` is set, waiting
    up to 5 seconds for it. Tokens from an ``ok`` reply are merged into
    ``self._module_tokens``. Every failure is reported with a warning and never
    raised, including a reply that carries neither ``ok`` nor ``error``.
    """
    # Collect module names that need tokens
    missing = [name for name in self.modules if name not in self._module_tokens]
    if not missing:
        return

    # Wait for WebSocket connection to be ready
    if not self._ws_connected:
        print(f"[launcher] 警告: _ws_connected 未初始化")
        return
    try:
        await asyncio.wait_for(self._ws_connected.wait(), timeout=5)
    except asyncio.TimeoutError:
        print(f"[launcher] 警告: WebSocket 未就绪,无法生成令牌")
        return

    # Call Kernel RPC to generate tokens
    try:
        result = await self._rpc_call(self._ws, "kernel.generate_tokens", {"modules": missing})
        if result.get("result", {}).get("ok"):
            tokens = result["result"].get("tokens", {})
            self._module_tokens.update(tokens)
            print(f"[launcher] Kernel 已生成 {len(tokens)} 个模块令牌")
        elif "error" in result:
            print(f"[launcher] 警告: 令牌生成失败: {result['error'].get('message', '')}")
        else:
            # Neither ok nor error: previously dropped without any trace.
            print("[launcher] 警告: 令牌生成失败: Kernel 返回了非预期的响应")
    except Exception as e:
        print(f"[launcher] 警告: 生成模块令牌失败: {e}")
|
|
1134
1238
|
|
|
1135
|
-
async def
|
|
1136
|
-
"""Register token mapping to
|
|
1137
|
-
|
|
1138
|
-
|
|
1239
|
+
async def _register_new_tokens(self, tokens: dict):
    """Register a token mapping with Kernel via RPC ``kernel.register_tokens``.

    Does nothing when there is no WebSocket or ``tokens`` is empty. Every
    failure is reported with a warning and never raised, including a reply
    that carries neither ``ok`` nor ``error``.

    Args:
        tokens: Mapping sent as the RPC params (callers in this file pass
            module name -> token).
    """
    if not self._ws or not tokens:
        return
    try:
        result = await self._rpc_call(self._ws, "kernel.register_tokens", tokens)
        if result.get("result", {}).get("ok"):
            print(f"[launcher] 已注册 {len(tokens)} 个模块令牌")
        elif "error" in result:
            print(f"[launcher] 警告: 令牌注册失败: {result['error'].get('message', '')}")
        else:
            # Neither ok nor error: previously dropped without any trace.
            print("[launcher] 警告: 令牌注册失败: Kernel 返回了非预期的响应")
    except Exception as e:
        print(f"[launcher] 警告: 注册模块令牌失败: {e}")
|
|
1148
1251
|
|
|
1149
1252
|
# ── Validation ──
|
|
1150
1253
|
|
|
1151
1254
|
def _validate_core_modules(self):
    """Check that the core ``kernel`` module is present and has a usable module.md.

    Looks for ``<KITE_PROJECT>/kernel/module.md``. A missing directory, a
    missing file, empty front matter or a parse error each print a fatal
    message and terminate the process with exit status 1.

    Raises:
        KeyError: The ``KITE_PROJECT`` environment variable is not set.
        SystemExit: Any of the checks failed.
    """
    kernel_dir = os.path.join(os.environ["KITE_PROJECT"], "kernel")
    manifest = os.path.join(kernel_dir, "module.md")

    def fatal(message):
        print(message)
        sys.exit(1)

    if not os.path.isdir(kernel_dir):
        fatal(f"[launcher] 致命: 核心模块 'kernel' 目录未找到: {kernel_dir}")
    if not os.path.isfile(manifest):
        fatal(f"[launcher] 致命: 核心模块 'kernel' 缺少 module.md: {manifest}")

    try:
        with open(manifest, "r", encoding="utf-8") as f:
            frontmatter = _parse_frontmatter(f.read())
        if not frontmatter:
            # SystemExit is not an Exception, so it passes the handler below.
            fatal(f"[launcher] 致命: 核心模块 'kernel' module.md 没有有效的 frontmatter")
    except Exception as e:
        fatal(f"[launcher] 致命: 核心模块 'kernel' module.md 解析错误: {e}")
|
|
1202
1274
|
|
|
1203
1275
|
# ── Module crash summary ──
|
|
1204
1276
|
|
|
@@ -1296,29 +1368,23 @@ class Launcher:
|
|
|
1296
1368
|
pass
|
|
1297
1369
|
|
|
1298
1370
|
async def _full_restart(self):
|
|
1299
|
-
"""Stop all modules, regenerate tokens, re-run Phase 1-
|
|
1371
|
+
"""Stop all modules, regenerate tokens, re-run Phase 1-2."""
|
|
1300
1372
|
print("[launcher] 全量重启: 正在停止所有模块...")
|
|
1301
1373
|
|
|
1302
1374
|
# Persist records before shutdown so cleanup_leftovers can find survivors
|
|
1303
1375
|
self.process_manager.persist_records()
|
|
1304
1376
|
|
|
1305
|
-
# Disconnect
|
|
1377
|
+
# Disconnect Kernel WS
|
|
1306
1378
|
if self._ws_task:
|
|
1307
1379
|
self._ws_task.cancel()
|
|
1308
1380
|
self._ws_task = None
|
|
1309
|
-
if hasattr(self, '_heartbeat_task') and self._heartbeat_task:
|
|
1310
|
-
self._heartbeat_task.cancel()
|
|
1311
|
-
self._heartbeat_task = None
|
|
1312
1381
|
self._ws = None
|
|
1313
|
-
self.
|
|
1314
|
-
self.
|
|
1382
|
+
self._rpc_waiters.clear()
|
|
1383
|
+
self._rpc_results.clear()
|
|
1315
1384
|
|
|
1316
1385
|
await self._graceful_shutdown_all()
|
|
1317
1386
|
|
|
1318
1387
|
# Cleanup any leftover processes that survived graceful shutdown.
|
|
1319
|
-
# Note: _graceful_shutdown_all() clears _processes/_records dicts, but
|
|
1320
|
-
# cleanup_leftovers() reads from processes.json (persisted above), so it can
|
|
1321
|
-
# still find and kill survivors.
|
|
1322
1388
|
self.process_manager.cleanup_leftovers()
|
|
1323
1389
|
|
|
1324
1390
|
self._module_tokens.clear()
|
|
@@ -1327,13 +1393,11 @@ class Launcher:
|
|
|
1327
1393
|
self.kite_token = secrets.token_hex(32)
|
|
1328
1394
|
self.process_manager.kite_token = self.kite_token
|
|
1329
1395
|
|
|
1330
|
-
print("[launcher] 全量重启: 重新执行 Phase 1-
|
|
1396
|
+
print("[launcher] 全量重启: 重新执行 Phase 1-2...")
|
|
1331
1397
|
try:
|
|
1332
|
-
await self.
|
|
1333
|
-
await self.
|
|
1334
|
-
await self._phase4_start_modules()
|
|
1398
|
+
await self._phase1_start_kernel()
|
|
1399
|
+
await self._phase2_start_modules()
|
|
1335
1400
|
self.process_manager.persist_records()
|
|
1336
|
-
self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
|
|
1337
1401
|
print("[launcher] 全量重启完成,恢复监控循环")
|
|
1338
1402
|
await self._monitor_loop()
|
|
1339
1403
|
except Exception as e:
|
|
@@ -1342,14 +1406,12 @@ class Launcher:
|
|
|
1342
1406
|
# ── Shutdown ──
|
|
1343
1407
|
|
|
1344
1408
|
def _final_cleanup(self):
|
|
1345
|
-
"""Called on exit — stop all processes,
|
|
1409
|
+
"""Called on exit — stop all processes, clear records."""
|
|
1346
1410
|
try:
|
|
1347
1411
|
print("[launcher] 正在执行最终清理...")
|
|
1348
1412
|
|
|
1349
1413
|
if self._ws_task:
|
|
1350
1414
|
self._ws_task.cancel()
|
|
1351
|
-
if hasattr(self, '_heartbeat_task') and self._heartbeat_task:
|
|
1352
|
-
self._heartbeat_task.cancel()
|
|
1353
1415
|
|
|
1354
1416
|
# Note: _graceful_shutdown_all() already called stop_all() in _async_main finally block.
|
|
1355
1417
|
# This is just a safety check — should normally find nothing.
|
|
@@ -1361,9 +1423,6 @@ class Launcher:
|
|
|
1361
1423
|
else:
|
|
1362
1424
|
print("[launcher] 无残留进程")
|
|
1363
1425
|
|
|
1364
|
-
if self._api_server:
|
|
1365
|
-
self._api_server.should_exit = True
|
|
1366
|
-
|
|
1367
1426
|
# Clear instance runtime files
|
|
1368
1427
|
try:
|
|
1369
1428
|
os.remove(self.process_manager.records_path)
|
|
@@ -1374,7 +1433,13 @@ class Launcher:
|
|
|
1374
1433
|
finally:
|
|
1375
1434
|
# Signal the safety-net thread that normal shutdown has completed
|
|
1376
1435
|
self._shutdown_complete.set()
|
|
1377
|
-
|
|
1436
|
+
|
|
1437
|
+
# Calculate and display shutdown time
|
|
1438
|
+
if self._shutdown_start_time > 0:
|
|
1439
|
+
shutdown_elapsed = time.monotonic() - self._shutdown_start_time
|
|
1440
|
+
print(f"[launcher] 再见。(退出耗时: {shutdown_elapsed:.2f}s)")
|
|
1441
|
+
else:
|
|
1442
|
+
print("[launcher] 再见。")
|
|
1378
1443
|
|
|
1379
1444
|
if IS_WINDOWS:
|
|
1380
1445
|
os._exit(0)
|
|
@@ -1403,11 +1468,8 @@ class Launcher:
|
|
|
1403
1468
|
else:
|
|
1404
1469
|
stopped.append((name, info))
|
|
1405
1470
|
|
|
1406
|
-
# Calculate kernel startup time (Phase 1
|
|
1407
|
-
kernel_time = 0
|
|
1408
|
-
for phase_name in ["Phase 1+2: Registry + Event Hub (并行)", "Phase 3: Registry 事件总线"]:
|
|
1409
|
-
if phase_name in phase_times:
|
|
1410
|
-
kernel_time += phase_times[phase_name]
|
|
1471
|
+
# Calculate kernel startup time (Phase 1)
|
|
1472
|
+
kernel_time = phase_times.get("Phase 1: Kernel", 0)
|
|
1411
1473
|
|
|
1412
1474
|
lines = [
|
|
1413
1475
|
"",
|
|
@@ -1422,16 +1484,15 @@ class Launcher:
|
|
|
1422
1484
|
|
|
1423
1485
|
# Kernel modules section
|
|
1424
1486
|
lines.append(f"{G} 内核模块:{R}")
|
|
1425
|
-
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
lines.append(f"{G} {phase_name:<26s} {elapsed:>6.2f}s{R}")
|
|
1487
|
+
if "Phase 1: Kernel" in phase_times:
|
|
1488
|
+
elapsed = phase_times["Phase 1: Kernel"]
|
|
1489
|
+
lines.append(f"{G} {'Phase 1: Kernel':<26s} {elapsed:>6.2f}s{R}")
|
|
1429
1490
|
|
|
1430
1491
|
# Extension modules section
|
|
1431
1492
|
lines.append(f"{G} 扩展模块:{R}")
|
|
1432
|
-
if "Phase
|
|
1433
|
-
elapsed = phase_times["Phase
|
|
1434
|
-
lines.append(f"{G} {'Phase
|
|
1493
|
+
if "Phase 2: Extensions" in phase_times:
|
|
1494
|
+
elapsed = phase_times["Phase 2: Extensions"]
|
|
1495
|
+
lines.append(f"{G} {'Phase 2: Extensions':<26s} {elapsed:>6.2f}s{R}")
|
|
1435
1496
|
|
|
1436
1497
|
# Sort running modules by ready time
|
|
1437
1498
|
running_sorted = sorted(running, key=lambda x: self._ready_times.get(x[0], float('inf')))
|
|
@@ -1471,12 +1532,21 @@ class Launcher:
|
|
|
1471
1532
|
es_str = f"{elapsed_from_start:.2f}s"
|
|
1472
1533
|
else:
|
|
1473
1534
|
es_str = "—"
|
|
1474
|
-
|
|
1535
|
+
|
|
1536
|
+
# Flag modules that hit the startup timeout (ready_t >= 15s; the same hard-coded 15s threshold is used for kernel and all other modules)
|
|
1537
|
+
is_timeout = False
|
|
1538
|
+
if ready_t is not None:
|
|
1539
|
+
if name == "kernel" and ready_t >= 15:
|
|
1540
|
+
is_timeout = True
|
|
1541
|
+
elif name != "kernel" and ready_t >= 15: # Default timeout for other modules
|
|
1542
|
+
is_timeout = True
|
|
1543
|
+
|
|
1544
|
+
rows.append([label, str(rec.pid), time_str, es_str, f"[{info.type}]", is_timeout])
|
|
1475
1545
|
|
|
1476
1546
|
# Calculate column widths: max of header and all data display widths
|
|
1477
1547
|
col_widths = [_dw(h) for h in headers]
|
|
1478
1548
|
for row in rows:
|
|
1479
|
-
for i, cell in enumerate(row):
|
|
1549
|
+
for i, cell in enumerate(row[:5]): # Only first 5 columns (exclude is_timeout flag)
|
|
1480
1550
|
col_widths[i] = max(col_widths[i], _dw(cell))
|
|
1481
1551
|
|
|
1482
1552
|
# Render header
|
|
@@ -1489,14 +1559,19 @@ class Launcher:
|
|
|
1489
1559
|
lines.append(f"{DIM} {' '.join(hdr_parts)}{R}")
|
|
1490
1560
|
|
|
1491
1561
|
# Render data rows
|
|
1562
|
+
RED = "\033[91m"
|
|
1492
1563
|
for row in rows:
|
|
1564
|
+
is_timeout = row[5] # Last element is the timeout flag
|
|
1493
1565
|
parts = []
|
|
1494
|
-
for i, cell in enumerate(row):
|
|
1566
|
+
for i, cell in enumerate(row[:5]): # Only first 5 columns
|
|
1495
1567
|
if aligns[i] == 'left':
|
|
1496
1568
|
parts.append(_rpad(cell, col_widths[i]))
|
|
1497
1569
|
else:
|
|
1498
1570
|
parts.append(_lpad(cell, col_widths[i]))
|
|
1499
|
-
|
|
1571
|
+
if is_timeout:
|
|
1572
|
+
lines.append(f"{RED} ✓ {' '.join(parts)}{R}")
|
|
1573
|
+
else:
|
|
1574
|
+
lines.append(f"{G} ✓ {' '.join(parts)}{R}")
|
|
1500
1575
|
|
|
1501
1576
|
# Exited modules (started but already quit)
|
|
1502
1577
|
if exited:
|
|
@@ -1514,10 +1589,18 @@ class Launcher:
|
|
|
1514
1589
|
label = info.display_name or name
|
|
1515
1590
|
lines.append(f"{G} - {label:<20s} ({info.state}){R}")
|
|
1516
1591
|
|
|
1517
|
-
lines.append(f"{G}
|
|
1592
|
+
lines.append(f"{G} Kernel WS: ws://127.0.0.1:{self.kernel_port}/ws 实例: {self.instance_id}{R}")
|
|
1518
1593
|
|
|
1519
|
-
# Query
|
|
1520
|
-
web_url =
|
|
1594
|
+
# Query Kernel for web module's api_endpoint via RPC
|
|
1595
|
+
web_url = ""
|
|
1596
|
+
if self._ws:
|
|
1597
|
+
try:
|
|
1598
|
+
resp = await self._rpc_call(self._ws, "registry.get", {"path": "web.api_endpoint"}, timeout=3)
|
|
1599
|
+
val = resp.get("result", {}).get("value")
|
|
1600
|
+
if val and isinstance(val, str):
|
|
1601
|
+
web_url = val.replace("://127.0.0.1:", "://localhost:")
|
|
1602
|
+
except Exception:
|
|
1603
|
+
pass
|
|
1521
1604
|
if web_url:
|
|
1522
1605
|
lines.append(f"{B} Web 管理后台: {web_url}{R}")
|
|
1523
1606
|
|
|
@@ -1572,29 +1655,11 @@ class Launcher:
|
|
|
1572
1655
|
|
|
1573
1656
|
print("\n".join(lines))
|
|
1574
1657
|
|
|
1575
|
-
async def _get_web_url(self) -> str:
|
|
1576
|
-
"""Query Registry for the web module's api_endpoint. Returns URL or empty string."""
|
|
1577
|
-
try:
|
|
1578
|
-
client = self._get_http()
|
|
1579
|
-
resp = await client.get(
|
|
1580
|
-
f"http://127.0.0.1:{self.registry_port}/get/web.api_endpoint",
|
|
1581
|
-
headers={"Authorization": f"Bearer {self.kite_token}"},
|
|
1582
|
-
timeout=3,
|
|
1583
|
-
)
|
|
1584
|
-
if resp.status_code == 200:
|
|
1585
|
-
val = resp.json()
|
|
1586
|
-
if val and isinstance(val, str):
|
|
1587
|
-
# Show localhost instead of 127.0.0.1 for friendliness
|
|
1588
|
-
return val.replace("://127.0.0.1:", "://localhost:")
|
|
1589
|
-
except Exception:
|
|
1590
|
-
pass
|
|
1591
|
-
return ""
|
|
1592
|
-
|
|
1593
1658
|
# ── Utilities ──
|
|
1594
1659
|
|
|
1595
1660
|
def _load_discovery(self) -> dict | None:
|
|
1596
1661
|
"""Read discovery config from launcher's own module.md."""
|
|
1597
|
-
md_path = os.path.join(os.environ["KITE_PROJECT"], "
|
|
1662
|
+
md_path = os.path.join(os.environ["KITE_PROJECT"], "launcher", "module.md")
|
|
1598
1663
|
try:
|
|
1599
1664
|
with open(md_path, "r", encoding="utf-8") as f:
|
|
1600
1665
|
fm = _parse_frontmatter(f.read())
|
|
@@ -1617,214 +1682,6 @@ class Launcher:
|
|
|
1617
1682
|
except Exception:
|
|
1618
1683
|
pass
|
|
1619
1684
|
|
|
1620
|
-
@staticmethod
|
|
1621
|
-
def _get_free_port() -> int:
|
|
1622
|
-
"""Get a free port assigned by the OS (bind to port 0)."""
|
|
1623
|
-
import socket
|
|
1624
|
-
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
1625
|
-
s.bind(("127.0.0.1", 0))
|
|
1626
|
-
return s.getsockname()[1]
|
|
1627
|
-
|
|
1628
|
-
# ── API app ──
|
|
1629
|
-
|
|
1630
|
-
def _create_api_app(self) -> FastAPI:
|
|
1631
|
-
"""Create the FastAPI app with Launcher management routes."""
|
|
1632
|
-
from fastapi import Request, HTTPException
|
|
1633
|
-
app = FastAPI(title="Kite Launcher", docs_url=None, redoc_url=None)
|
|
1634
|
-
launcher = self
|
|
1635
|
-
|
|
1636
|
-
def _require_auth(request: Request):
|
|
1637
|
-
"""Verify Bearer token and IP whitelist. Raise 401/403 on failure."""
|
|
1638
|
-
# IP whitelist: only allow 127.0.0.1
|
|
1639
|
-
client_host = request.client.host if request.client else None
|
|
1640
|
-
if client_host not in ("127.0.0.1", "::1", "localhost"):
|
|
1641
|
-
raise HTTPException(status_code=403, detail="Access denied: only localhost allowed")
|
|
1642
|
-
|
|
1643
|
-
# Bearer token verification
|
|
1644
|
-
auth = request.headers.get("Authorization", "")
|
|
1645
|
-
if not auth.startswith("Bearer "):
|
|
1646
|
-
raise HTTPException(status_code=401, detail="Missing or invalid Authorization header")
|
|
1647
|
-
token = auth[7:].strip()
|
|
1648
|
-
if token != launcher.kite_token:
|
|
1649
|
-
raise HTTPException(status_code=401, detail="Invalid token")
|
|
1650
|
-
|
|
1651
|
-
@app.get("/launcher/modules")
|
|
1652
|
-
async def list_modules(request: Request):
|
|
1653
|
-
"""List all modules and their current status."""
|
|
1654
|
-
_require_auth(request)
|
|
1655
|
-
result = []
|
|
1656
|
-
for name, info in launcher.modules.items():
|
|
1657
|
-
running = launcher.process_manager.is_running(name)
|
|
1658
|
-
rec = launcher.process_manager.get_record(name)
|
|
1659
|
-
result.append({
|
|
1660
|
-
"name": name,
|
|
1661
|
-
"display_name": info.display_name,
|
|
1662
|
-
"type": info.type,
|
|
1663
|
-
"config_state": info.state,
|
|
1664
|
-
"desired_state": launcher._desired_states.get(name, "stopped"),
|
|
1665
|
-
"actual_state": f"running({rec.pid})" if running and rec else "stopped",
|
|
1666
|
-
"pid": rec.pid if running and rec else None,
|
|
1667
|
-
"monitor": info.monitor,
|
|
1668
|
-
})
|
|
1669
|
-
return result
|
|
1670
|
-
|
|
1671
|
-
@app.post("/launcher/modules/{name}/start")
|
|
1672
|
-
async def start_module(name: str, request: Request):
|
|
1673
|
-
"""Start a module by name."""
|
|
1674
|
-
_require_auth(request)
|
|
1675
|
-
info = launcher.modules.get(name)
|
|
1676
|
-
if not info:
|
|
1677
|
-
raise HTTPException(404, f"Module '{name}' not found")
|
|
1678
|
-
if info.state == "disabled":
|
|
1679
|
-
raise HTTPException(403, f"Module '{name}' is disabled")
|
|
1680
|
-
|
|
1681
|
-
if name not in launcher._module_tokens:
|
|
1682
|
-
launcher._module_tokens[name] = secrets.token_hex(32)
|
|
1683
|
-
try:
|
|
1684
|
-
client = launcher._get_http()
|
|
1685
|
-
await client.post(
|
|
1686
|
-
f"http://127.0.0.1:{launcher.registry_port}/tokens",
|
|
1687
|
-
json={name: launcher._module_tokens[name]},
|
|
1688
|
-
headers={"Authorization": f"Bearer {launcher.kite_token}"},
|
|
1689
|
-
)
|
|
1690
|
-
except Exception as e:
|
|
1691
|
-
print(f"[launcher] 警告: 注册 {name} 的令牌失败: {e}")
|
|
1692
|
-
|
|
1693
|
-
token = launcher._module_tokens[name]
|
|
1694
|
-
boot_info = {"token": token}
|
|
1695
|
-
ok = launcher.process_manager.start_module(info, boot_info=boot_info)
|
|
1696
|
-
if ok:
|
|
1697
|
-
launcher._desired_states[name] = "running"
|
|
1698
|
-
launcher.process_manager.persist_records()
|
|
1699
|
-
rec = launcher.process_manager.get_record(name)
|
|
1700
|
-
launcher._log_lifecycle("started", name, pid=rec.pid if rec else None, via="api")
|
|
1701
|
-
launcher._publish_event_threadsafe("module.started", {"module_id": name})
|
|
1702
|
-
return {"status": "started", "name": name}
|
|
1703
|
-
launcher._log_lifecycle("start_failed", name, via="api")
|
|
1704
|
-
raise HTTPException(500, f"Failed to start '{name}'")
|
|
1705
|
-
|
|
1706
|
-
@app.post("/launcher/modules/{name}/stop")
|
|
1707
|
-
async def stop_module(name: str, request: Request, body: dict = None):
|
|
1708
|
-
"""Stop a module with graceful shutdown."""
|
|
1709
|
-
_require_auth(request)
|
|
1710
|
-
info = launcher.modules.get(name)
|
|
1711
|
-
if not info:
|
|
1712
|
-
raise HTTPException(404, f"Module '{name}' not found")
|
|
1713
|
-
reason = (body or {}).get("reason", "stop_requested")
|
|
1714
|
-
launcher._desired_states[name] = "stopped"
|
|
1715
|
-
await launcher._graceful_stop(name, reason)
|
|
1716
|
-
launcher.process_manager.persist_records()
|
|
1717
|
-
return {"status": "stopped", "name": name}
|
|
1718
|
-
|
|
1719
|
-
@app.post("/launcher/modules/{name}/restart")
|
|
1720
|
-
async def restart_module(name: str, request: Request, body: dict = None):
|
|
1721
|
-
"""Restart a module (stop + start)."""
|
|
1722
|
-
_require_auth(request)
|
|
1723
|
-
info = launcher.modules.get(name)
|
|
1724
|
-
if not info:
|
|
1725
|
-
raise HTTPException(404, f"Module '{name}' not found")
|
|
1726
|
-
if info.state == "disabled":
|
|
1727
|
-
raise HTTPException(403, f"Module '{name}' is disabled")
|
|
1728
|
-
reason = (body or {}).get("reason", "restart")
|
|
1729
|
-
await launcher._graceful_stop(name, reason)
|
|
1730
|
-
launcher._module_tokens[name] = secrets.token_hex(32)
|
|
1731
|
-
try:
|
|
1732
|
-
client = launcher._get_http()
|
|
1733
|
-
await client.post(
|
|
1734
|
-
f"http://127.0.0.1:{launcher.registry_port}/tokens",
|
|
1735
|
-
json={name: launcher._module_tokens[name]},
|
|
1736
|
-
headers={"Authorization": f"Bearer {launcher.kite_token}"},
|
|
1737
|
-
)
|
|
1738
|
-
except Exception:
|
|
1739
|
-
pass
|
|
1740
|
-
token = launcher._module_tokens[name]
|
|
1741
|
-
boot_info = {"token": token}
|
|
1742
|
-
ok = launcher.process_manager.start_module(info, boot_info=boot_info)
|
|
1743
|
-
if ok:
|
|
1744
|
-
launcher._desired_states[name] = "running"
|
|
1745
|
-
launcher.process_manager.persist_records()
|
|
1746
|
-
rec = launcher.process_manager.get_record(name)
|
|
1747
|
-
launcher._log_lifecycle("started", name, pid=rec.pid if rec else None, via="restart_api")
|
|
1748
|
-
launcher._publish_event_threadsafe("module.started", {"module_id": name})
|
|
1749
|
-
return {"status": "restarted", "name": name}
|
|
1750
|
-
launcher._log_lifecycle("start_failed", name, via="restart_api")
|
|
1751
|
-
raise HTTPException(500, f"Failed to restart '{name}'")
|
|
1752
|
-
|
|
1753
|
-
@app.post("/launcher/rescan")
|
|
1754
|
-
async def rescan_modules(request: Request):
|
|
1755
|
-
"""Rescan module directories for new/removed modules."""
|
|
1756
|
-
_require_auth(request)
|
|
1757
|
-
old_names = set(launcher.modules.keys())
|
|
1758
|
-
launcher.modules = launcher.module_scanner.scan()
|
|
1759
|
-
new_names = set(launcher.modules.keys())
|
|
1760
|
-
added = list(new_names - old_names)
|
|
1761
|
-
removed = list(old_names - new_names)
|
|
1762
|
-
for name in added:
|
|
1763
|
-
info = launcher.modules[name]
|
|
1764
|
-
launcher._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
|
|
1765
|
-
for name in added:
|
|
1766
|
-
info = launcher.modules[name]
|
|
1767
|
-
launcher._desired_states[name] = "running" if info.state == "enabled" else "stopped"
|
|
1768
|
-
if added:
|
|
1769
|
-
new_tokens = {}
|
|
1770
|
-
for name in added:
|
|
1771
|
-
launcher._module_tokens[name] = secrets.token_hex(32)
|
|
1772
|
-
new_tokens[name] = launcher._module_tokens[name]
|
|
1773
|
-
try:
|
|
1774
|
-
client = launcher._get_http()
|
|
1775
|
-
await client.post(
|
|
1776
|
-
f"http://127.0.0.1:{launcher.registry_port}/tokens",
|
|
1777
|
-
json=new_tokens,
|
|
1778
|
-
headers={"Authorization": f"Bearer {launcher.kite_token}"},
|
|
1779
|
-
)
|
|
1780
|
-
except Exception:
|
|
1781
|
-
pass
|
|
1782
|
-
return {"added": added, "removed": removed, "total": len(launcher.modules)}
|
|
1783
|
-
|
|
1784
|
-
@app.post("/launcher/shutdown")
|
|
1785
|
-
async def shutdown_launcher(request: Request, body: dict = None):
|
|
1786
|
-
"""Shutdown the entire Kite system (equivalent to Ctrl+C)."""
|
|
1787
|
-
_require_auth(request)
|
|
1788
|
-
reason = (body or {}).get("reason", "api_request")
|
|
1789
|
-
launcher._request_shutdown(f"API shutdown request: {reason}")
|
|
1790
|
-
return {"status": "shutting_down", "reason": reason}
|
|
1791
|
-
|
|
1792
|
-
@app.put("/launcher/modules/{name}/state")
|
|
1793
|
-
async def update_state(name: str, request: Request, body: dict):
|
|
1794
|
-
"""Update module state (enabled/manual/disabled). Writes to module.md."""
|
|
1795
|
-
_require_auth(request)
|
|
1796
|
-
info = launcher.modules.get(name)
|
|
1797
|
-
if not info:
|
|
1798
|
-
raise HTTPException(404, f"Module '{name}' not found")
|
|
1799
|
-
|
|
1800
|
-
new_state = body.get("state", "")
|
|
1801
|
-
if new_state not in ("enabled", "manual", "disabled"):
|
|
1802
|
-
raise HTTPException(400, "state must be enabled, manual, or disabled")
|
|
1803
|
-
|
|
1804
|
-
if info.is_core() and new_state == "disabled":
|
|
1805
|
-
raise HTTPException(403, "Core modules cannot be disabled")
|
|
1806
|
-
|
|
1807
|
-
old_state = info.state
|
|
1808
|
-
info.state = new_state
|
|
1809
|
-
|
|
1810
|
-
if new_state == "enabled":
|
|
1811
|
-
launcher._desired_states[name] = "running"
|
|
1812
|
-
else:
|
|
1813
|
-
launcher._desired_states[name] = "stopped"
|
|
1814
|
-
|
|
1815
|
-
_update_module_md_state(info.module_dir, new_state)
|
|
1816
|
-
launcher._publish_event_threadsafe("module.state_changed", {
|
|
1817
|
-
"module_id": name,
|
|
1818
|
-
"old_state": old_state,
|
|
1819
|
-
"new_state": new_state,
|
|
1820
|
-
})
|
|
1821
|
-
return {
|
|
1822
|
-
"name": name,
|
|
1823
|
-
"old_state": old_state,
|
|
1824
|
-
"new_state": new_state,
|
|
1825
|
-
}
|
|
1826
|
-
|
|
1827
|
-
return app
|
|
1828
1685
|
|
|
1829
1686
|
|
|
1830
1687
|
def _update_module_md_state(module_dir: str, new_state: str):
|
|
@@ -1850,3 +1707,72 @@ def _update_module_md_state(module_dir: str, new_state: str):
|
|
|
1850
1707
|
f.write(updated)
|
|
1851
1708
|
except Exception as e:
|
|
1852
1709
|
print(f"[launcher] 警告: 更新 module.md 状态失败: {e}")
|
|
1710
|
+
|
|
1711
|
+
|
|
1712
|
+
def start_launcher():
    """Bootstrap the process environment and run the Launcher.

    Invoked from main.py. The steps are, in order:

    1. Load a ``.env`` file when python-dotenv is installed.
    2. Fill in any unset (or empty) ``KITE_*`` environment variables.
    3. Consume the ``--debug`` CLI flag, exporting ``KITE_DEBUG=1``.
    4. Install the timestamped ``print`` and reset the time baseline.
    5. Build the ``Launcher`` with a fresh random token, then initialise
       log files and exception hooks.
    6. Run the launcher; on an unhandled exception, record the crash and
       exit with status 1.
    """
    # python-dotenv is optional: skip .env loading when it is not installed.
    try:
        from dotenv import load_dotenv
        load_dotenv()
    except ImportError:
        pass

    # This file lives one directory below the project root.
    package_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(package_dir)

    # Locate the user's home directory: HOME, then USERPROFILE, then "~".
    user_home = os.environ.get("HOME")
    if not user_home:
        user_home = os.environ.get("USERPROFILE")
    if not user_home:
        user_home = os.path.expanduser("~")
    kite_home = os.path.join(user_home, ".kite")

    # Fallback values, applied only where the variable is missing or empty.
    fallbacks = {
        "KITE_PROJECT": project_root,
        "KITE_CWD": os.getcwd(),
        "KITE_WORKSPACE": os.path.join(kite_home, "workspace"),
        "KITE_DATA": os.path.join(kite_home, "data"),
        "KITE_MODULES": os.path.join(kite_home, "modules"),
        "KITE_REPO": os.path.join(kite_home, "repo"),
        "KITE_ENV": "development",
    }
    for env_name, fallback in fallbacks.items():
        if os.environ.get(env_name):
            continue
        os.environ[env_name] = fallback

    # Consume the --debug flag so later argument handling does not see it.
    debug_requested = "--debug" in sys.argv
    if debug_requested:
        os.environ["KITE_DEBUG"] = "1"
        sys.argv.remove("--debug")

    # Logging helpers are imported here, after the environment is prepared.
    from .logging_setup import (
        init_log_files,
        reset_time_baseline,
        setup_exception_hooks,
        setup_timestamped_print,
        write_crash_handled,
    )
    setup_timestamped_print()
    reset_time_baseline()

    print("[launcher] Kite 启动中...")

    # Each run gets its own random token.
    launcher = Launcher(kite_token=secrets.token_hex(32))
    print("[launcher] 启动器实例已创建")

    # Log files are initialised only after the Launcher has been constructed.
    init_log_files()
    setup_exception_hooks()

    latest_log = os.path.join(
        os.environ.get("KITE_MODULE_DATA", ""),
        "log",
        f"latest{launcher.process_manager.instance_suffix}.log",
    )
    print(f"[launcher] 日志: {latest_log}")

    # Top-level boundary: record any unhandled failure, then exit non-zero.
    try:
        launcher.run()
    except Exception as exc:
        write_crash_handled(type(exc), exc, exc.__traceback__)
        sys.exit(1)
|