@agentunion/kite 1.0.6 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +127 -25
- package/core/event_hub/entry.py +105 -61
- package/core/event_hub/module.md +0 -1
- package/core/event_hub/server.py +96 -28
- package/core/launcher/entry.py +477 -290
- package/core/launcher/module_scanner.py +10 -9
- package/core/launcher/process_manager.py +120 -96
- package/core/registry/entry.py +66 -30
- package/core/registry/server.py +47 -14
- package/core/registry/store.py +6 -1
- package/{core → extensions}/event_hub_bench/entry.py +17 -9
- package/{core → extensions}/event_hub_bench/module.md +2 -1
- package/extensions/services/watchdog/entry.py +11 -7
- package/extensions/services/watchdog/server.py +1 -1
- package/main.py +204 -4
- package/package.json +11 -2
- package/core/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/__pycache__/data_dir.cpython-313.pyc +0 -0
- package/core/data_dir.py +0 -62
- package/core/event_hub/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/bench.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/bench_perf.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/dedup.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/entry.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/hub.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/router.cpython-313.pyc +0 -0
- package/core/event_hub/__pycache__/server.cpython-313.pyc +0 -0
- package/core/event_hub/bench_results/.gitkeep +0 -0
- package/core/event_hub/bench_results/2026-02-28_13-26-48.json +0 -51
- package/core/event_hub/bench_results/2026-02-28_13-44-45.json +0 -51
- package/core/event_hub/bench_results/2026-02-28_13-45-39.json +0 -51
- package/core/launcher/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/launcher/__pycache__/entry.cpython-313.pyc +0 -0
- package/core/launcher/__pycache__/module_scanner.cpython-313.pyc +0 -0
- package/core/launcher/__pycache__/process_manager.cpython-313.pyc +0 -0
- package/core/launcher/data/log/lifecycle.jsonl +0 -1158
- package/core/launcher/data/token.txt +0 -1
- package/core/registry/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/registry/__pycache__/entry.cpython-313.pyc +0 -0
- package/core/registry/__pycache__/server.cpython-313.pyc +0 -0
- package/core/registry/__pycache__/store.cpython-313.pyc +0 -0
- package/core/registry/data/port.txt +0 -1
- package/core/registry/data/port_484.txt +0 -1
- package/extensions/__pycache__/__init__.cpython-313.pyc +0 -0
- package/extensions/services/__pycache__/__init__.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/__init__.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/entry.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/monitor.cpython-313.pyc +0 -0
- package/extensions/services/watchdog/__pycache__/server.cpython-313.pyc +0 -0
package/core/launcher/entry.py
CHANGED
|
@@ -4,8 +4,14 @@ Launcher — the core of Kite. Manages module lifecycle, exposes API, monitors p
|
|
|
4
4
|
Thread model:
|
|
5
5
|
- Main thread: asyncio event loop (process management + monitor loop)
|
|
6
6
|
- API thread: independent thread running uvicorn + FastAPI
|
|
7
|
-
- stdout threads: one daemon thread per child process
|
|
7
|
+
- stdout threads: one daemon thread per child process (ProcessManager)
|
|
8
8
|
- (Windows) keyboard listener thread: polls for 'q' key
|
|
9
|
+
|
|
10
|
+
4-Phase startup:
|
|
11
|
+
Phase 1: Registry → stdout port → KITE_REGISTRY_PORT → API → register self + tokens
|
|
12
|
+
Phase 2: Event Hub → stdin launcher_ws_token → stdout ws_endpoint → WS connect → module.ready
|
|
13
|
+
Phase 3: Event Hub → Registry → Registry → Event Hub WS → module.ready
|
|
14
|
+
Phase 4: start remaining enabled modules in topo order
|
|
9
15
|
"""
|
|
10
16
|
|
|
11
17
|
import asyncio
|
|
@@ -22,25 +28,35 @@ import httpx
|
|
|
22
28
|
import uvicorn
|
|
23
29
|
import websockets
|
|
24
30
|
from fastapi import FastAPI, HTTPException
|
|
25
|
-
from fastapi.responses import JSONResponse
|
|
26
31
|
|
|
27
|
-
from .module_scanner import ModuleScanner, ModuleInfo, _parse_frontmatter
|
|
32
|
+
from .module_scanner import ModuleScanner, ModuleInfo, LaunchConfig, _parse_frontmatter
|
|
28
33
|
from .process_manager import ProcessManager
|
|
29
|
-
from core.data_dir import get_launcher_data_dir
|
|
30
34
|
|
|
31
35
|
IS_WINDOWS = sys.platform == "win32"
|
|
32
36
|
|
|
37
|
+
# Core module names that are started in Phase 1-2 (not Phase 4)
|
|
38
|
+
CORE_MODULE_NAMES = {"registry", "event_hub"}
|
|
39
|
+
|
|
33
40
|
|
|
34
41
|
class Launcher:
|
|
35
42
|
"""Kite system entry point. Starts Registry, manages modules, exposes API."""
|
|
36
43
|
|
|
37
44
|
def __init__(self, kite_token: str):
|
|
38
45
|
self.kite_token = kite_token
|
|
39
|
-
self.project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
40
46
|
self.instance_id = str(os.getpid())
|
|
41
|
-
|
|
47
|
+
os.environ["KITE_INSTANCE"] = self.instance_id
|
|
48
|
+
|
|
49
|
+
# Resolve instance workspace (must happen before ProcessManager init)
|
|
50
|
+
self._resolve_instance_dir()
|
|
51
|
+
os.environ["KITE_MODULE_DATA"] = os.path.join(
|
|
52
|
+
os.environ["KITE_INSTANCE_DIR"], "launcher",
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
self.process_manager = ProcessManager(
|
|
56
|
+
kite_token, self.instance_id,
|
|
57
|
+
on_kite_message=self._on_kite_message,
|
|
58
|
+
)
|
|
42
59
|
self.module_scanner = ModuleScanner(
|
|
43
|
-
self.project_root,
|
|
44
60
|
discovery=self._load_discovery(),
|
|
45
61
|
)
|
|
46
62
|
|
|
@@ -55,30 +71,122 @@ class Launcher:
|
|
|
55
71
|
self._module_tokens: dict[str, str] = {} # module_name -> per-module token
|
|
56
72
|
|
|
57
73
|
# Three-layer state model: desired_state per module
|
|
58
|
-
# Initialized from config_state: enabled→running, manual→stopped, disabled→stopped
|
|
59
74
|
self._desired_states: dict[str, str] = {} # module_name -> "running" | "stopped"
|
|
60
75
|
|
|
61
76
|
# Event Hub WebSocket client
|
|
62
77
|
self._event_hub_ws_url: str = ""
|
|
78
|
+
self._launcher_ws_token: str = ""
|
|
63
79
|
self._ws: object | None = None
|
|
64
80
|
self._ws_task: asyncio.Task | None = None
|
|
65
81
|
self._loop: asyncio.AbstractEventLoop | None = None
|
|
66
82
|
|
|
67
83
|
# Event waiters: {event_key: (asyncio.Event, data_dict)}
|
|
68
|
-
# event_key format: "event_type:module_id"
|
|
69
84
|
self._event_waiters: dict[str, tuple[asyncio.Event, dict]] = {}
|
|
70
85
|
|
|
86
|
+
# Kite stdout message waiters: {waiter_key: (threading.Event, data_dict)}
|
|
87
|
+
# Used by ProcessManager stdout callback (cross-thread)
|
|
88
|
+
self._msg_waiters: dict[str, tuple[threading.Event, dict]] = {}
|
|
89
|
+
|
|
71
90
|
self._lifecycle_log = os.path.join(
|
|
72
|
-
|
|
91
|
+
os.environ["KITE_INSTANCE_DIR"], "launcher", "lifecycle.jsonl",
|
|
73
92
|
)
|
|
74
93
|
self._app = self._create_api_app()
|
|
75
94
|
|
|
95
|
+
# ── Instance workspace resolution ──
|
|
96
|
+
|
|
97
|
+
@staticmethod
|
|
98
|
+
def _resolve_instance_dir():
|
|
99
|
+
"""Resolve KITE_INSTANCE_DIR from KITE_WORKSPACE + KITE_CWD.
|
|
100
|
+
Algorithm: take CWD basename, find matching dir in workspace via .cwd file,
|
|
101
|
+
or create new one. Sets KITE_INSTANCE_DIR env var.
|
|
102
|
+
"""
|
|
103
|
+
if os.environ.get("KITE_INSTANCE_DIR"):
|
|
104
|
+
return # already set (e.g. by tests or parent)
|
|
105
|
+
|
|
106
|
+
cwd = os.environ.get("KITE_CWD", os.getcwd())
|
|
107
|
+
workspace = os.environ.get("KITE_WORKSPACE", "")
|
|
108
|
+
if not workspace:
|
|
109
|
+
home = os.environ.get("HOME") or os.environ.get("USERPROFILE") or os.path.expanduser("~")
|
|
110
|
+
workspace = os.path.join(home, ".kite", "workspace")
|
|
111
|
+
os.environ["KITE_WORKSPACE"] = workspace
|
|
112
|
+
|
|
113
|
+
basename = os.path.basename(cwd.rstrip(os.sep)) or "default"
|
|
114
|
+
suffix = 0
|
|
115
|
+
|
|
116
|
+
while True:
|
|
117
|
+
name = basename if suffix == 0 else f"{basename}~{suffix}"
|
|
118
|
+
candidate = os.path.join(workspace, name)
|
|
119
|
+
cwd_file = os.path.join(candidate, ".cwd")
|
|
120
|
+
|
|
121
|
+
if not os.path.exists(candidate):
|
|
122
|
+
# Empty slot — create new workspace
|
|
123
|
+
os.makedirs(candidate, exist_ok=True)
|
|
124
|
+
with open(cwd_file, "w", encoding="utf-8") as f:
|
|
125
|
+
f.write(cwd)
|
|
126
|
+
os.environ["KITE_INSTANCE_DIR"] = candidate
|
|
127
|
+
print(f"[launcher] 实例工作区已创建: {candidate}")
|
|
128
|
+
return
|
|
129
|
+
|
|
130
|
+
if os.path.isfile(cwd_file):
|
|
131
|
+
try:
|
|
132
|
+
with open(cwd_file, "r", encoding="utf-8") as f:
|
|
133
|
+
if f.read().strip() == cwd:
|
|
134
|
+
os.environ["KITE_INSTANCE_DIR"] = candidate
|
|
135
|
+
print(f"[launcher] 实例工作区已找到: {candidate}")
|
|
136
|
+
return
|
|
137
|
+
except Exception:
|
|
138
|
+
pass
|
|
139
|
+
|
|
140
|
+
suffix += 1
|
|
141
|
+
|
|
142
|
+
# ── Kite stdout message callback ──
|
|
143
|
+
|
|
144
|
+
def _on_kite_message(self, module_name: str, msg: dict):
|
|
145
|
+
"""Called by ProcessManager stdout reader thread when a kite message is detected.
|
|
146
|
+
Thread-safe: only touches _msg_waiters (dict + threading.Event).
|
|
147
|
+
"""
|
|
148
|
+
kite_type = msg.get("kite", "")
|
|
149
|
+
key = f"{module_name}:{kite_type}"
|
|
150
|
+
waiter = self._msg_waiters.get(key)
|
|
151
|
+
if waiter:
|
|
152
|
+
waiter[1].update(msg)
|
|
153
|
+
waiter[0].set()
|
|
154
|
+
|
|
155
|
+
async def _wait_kite_message(self, module_name: str, kite_type: str,
|
|
156
|
+
timeout: float) -> dict | None:
|
|
157
|
+
"""Wait for a kite stdout message from a module. Returns msg dict or None on timeout.
|
|
158
|
+
Checks shutdown flag every 0.5s so Ctrl+C is responsive even during Phase 1-2 waits.
|
|
159
|
+
"""
|
|
160
|
+
key = f"{module_name}:{kite_type}"
|
|
161
|
+
evt = threading.Event()
|
|
162
|
+
data = {}
|
|
163
|
+
self._msg_waiters[key] = (evt, data)
|
|
164
|
+
shutdown = self._thread_shutdown
|
|
165
|
+
try:
|
|
166
|
+
def _wait():
|
|
167
|
+
deadline = time.monotonic() + timeout
|
|
168
|
+
while time.monotonic() < deadline:
|
|
169
|
+
if evt.wait(timeout=0.5):
|
|
170
|
+
return True
|
|
171
|
+
if shutdown.is_set():
|
|
172
|
+
return False
|
|
173
|
+
return False
|
|
174
|
+
got = await asyncio.get_running_loop().run_in_executor(None, _wait)
|
|
175
|
+
return data if got else None
|
|
176
|
+
finally:
|
|
177
|
+
self._msg_waiters.pop(key, None)
|
|
178
|
+
|
|
76
179
|
# ── Public entry ──
|
|
77
180
|
|
|
78
181
|
def run(self):
|
|
79
182
|
"""Synchronous entry point. Sets up signals, runs the async main loop."""
|
|
80
|
-
print("[launcher] Kite
|
|
81
|
-
print(
|
|
183
|
+
print("[launcher] Kite 启动中...")
|
|
184
|
+
print("[launcher] ── 环境变量 ──")
|
|
185
|
+
for key in sorted(k for k in os.environ if k.startswith("KITE_")):
|
|
186
|
+
print(f"[launcher] {key} = {os.environ[key]}")
|
|
187
|
+
print(f"[launcher] PID = {os.getpid()}")
|
|
188
|
+
print(f"[launcher] PYTHON = {sys.executable}")
|
|
189
|
+
print(f"[launcher] PLATFORM = {sys.platform}")
|
|
82
190
|
|
|
83
191
|
if IS_WINDOWS:
|
|
84
192
|
self._setup_windows_exit()
|
|
@@ -92,113 +200,303 @@ class Launcher:
|
|
|
92
200
|
finally:
|
|
93
201
|
self._final_cleanup()
|
|
94
202
|
|
|
203
|
+
def _request_shutdown(self, reason: str = ""):
|
|
204
|
+
"""Request graceful shutdown. Thread-safe — can be called from signal handler or any thread."""
|
|
205
|
+
if self._thread_shutdown.is_set():
|
|
206
|
+
return # already shutting down
|
|
207
|
+
print(f"\n[launcher] {reason or '收到关闭请求'}")
|
|
208
|
+
self._thread_shutdown.set()
|
|
209
|
+
# Wake up asyncio event loop immediately (so _monitor_loop / wait_for exits)
|
|
210
|
+
loop = self._loop
|
|
211
|
+
if loop and not loop.is_closed():
|
|
212
|
+
try:
|
|
213
|
+
loop.call_soon_threadsafe(self._shutdown_event.set)
|
|
214
|
+
except RuntimeError:
|
|
215
|
+
pass
|
|
216
|
+
# Safety net: force exit after 15s no matter what
|
|
217
|
+
def _force():
|
|
218
|
+
time.sleep(15)
|
|
219
|
+
os._exit(1)
|
|
220
|
+
threading.Thread(target=_force, daemon=True).start()
|
|
221
|
+
|
|
95
222
|
def _setup_unix_signals(self):
|
|
96
223
|
"""Register SIGTERM/SIGINT handlers on Linux/macOS."""
|
|
97
224
|
def _handler(signum, frame):
|
|
98
|
-
|
|
99
|
-
self._thread_shutdown.set()
|
|
225
|
+
self._request_shutdown(f"收到信号 {signum},正在关闭...")
|
|
100
226
|
signal.signal(signal.SIGTERM, _handler)
|
|
101
227
|
signal.signal(signal.SIGINT, _handler)
|
|
102
228
|
|
|
103
229
|
def _setup_windows_exit(self):
|
|
104
|
-
"""
|
|
230
|
+
"""SetConsoleCtrlHandler for Ctrl+C + daemon thread for 'q' key.
|
|
231
|
+
|
|
232
|
+
Why not signal.signal(SIGINT)?
|
|
233
|
+
Python's signal delivery requires the main thread to be executing bytecode.
|
|
234
|
+
When the main thread is blocked in C code (asyncio ProactorEventLoop →
|
|
235
|
+
GetQueuedCompletionStatus), SIGINT is never delivered.
|
|
236
|
+
SetConsoleCtrlHandler runs its callback in a separate OS thread, so it
|
|
237
|
+
always works regardless of what the main thread is doing.
|
|
238
|
+
"""
|
|
239
|
+
import ctypes
|
|
240
|
+
|
|
241
|
+
@ctypes.WINFUNCTYPE(ctypes.c_int, ctypes.c_uint)
|
|
242
|
+
def _ctrl_handler(ctrl_type):
|
|
243
|
+
if ctrl_type in (0, 1): # CTRL_C_EVENT, CTRL_BREAK_EVENT
|
|
244
|
+
self._request_shutdown("收到 Ctrl+C,正在关闭...")
|
|
245
|
+
return 1 # handled — prevent default (which kills the process)
|
|
246
|
+
return 0
|
|
247
|
+
|
|
248
|
+
# prevent GC of the C callback
|
|
249
|
+
self._ctrl_handler_ref = _ctrl_handler
|
|
250
|
+
ctypes.windll.kernel32.SetConsoleCtrlHandler(_ctrl_handler, 1)
|
|
251
|
+
|
|
252
|
+
# 'q' key: handle via msvcrt polling
|
|
105
253
|
def _listen():
|
|
106
254
|
import msvcrt
|
|
107
255
|
while not self._thread_shutdown.is_set():
|
|
108
256
|
if msvcrt.kbhit():
|
|
109
257
|
ch = msvcrt.getch()
|
|
110
|
-
if ch in (b'q', b'Q'
|
|
111
|
-
|
|
112
|
-
self._thread_shutdown.set()
|
|
258
|
+
if ch in (b'q', b'Q'):
|
|
259
|
+
self._request_shutdown("收到退出请求,正在关闭...")
|
|
113
260
|
return
|
|
114
261
|
time.sleep(0.1)
|
|
115
|
-
|
|
116
|
-
t.start()
|
|
262
|
+
threading.Thread(target=_listen, daemon=True).start()
|
|
117
263
|
|
|
118
|
-
# ── Async main ──
|
|
264
|
+
# ── Async main (4-Phase startup) ──
|
|
119
265
|
|
|
120
266
|
async def _async_main(self):
|
|
121
|
-
"""Full startup sequence, then monitor loop."""
|
|
267
|
+
"""Full 4-phase startup sequence, then monitor loop."""
|
|
122
268
|
self._loop = asyncio.get_running_loop()
|
|
123
269
|
|
|
124
|
-
#
|
|
125
|
-
self.
|
|
270
|
+
# Validate core modules exist (mechanism 12)
|
|
271
|
+
self._validate_core_modules()
|
|
126
272
|
|
|
127
|
-
#
|
|
128
|
-
|
|
273
|
+
# Cleanup leftovers from previous instances
|
|
274
|
+
self.process_manager.cleanup_leftovers()
|
|
129
275
|
|
|
130
|
-
#
|
|
131
|
-
self.
|
|
276
|
+
# Phase 1: Registry bootstrap
|
|
277
|
+
await self._phase1_registry()
|
|
278
|
+
if self._shutdown_event.is_set(): return
|
|
132
279
|
|
|
133
|
-
#
|
|
134
|
-
await self._register_self()
|
|
135
|
-
|
|
136
|
-
# Step 5: scan modules
|
|
280
|
+
# Scan modules (can happen before Phase 2)
|
|
137
281
|
self.modules = self.module_scanner.scan()
|
|
138
282
|
for name, info in self.modules.items():
|
|
139
283
|
self._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
|
|
140
|
-
print(f"[launcher]
|
|
284
|
+
print(f"[launcher] 发现 {len(self.modules)} 个模块: {', '.join(self.modules.keys()) or '(无)'}")
|
|
285
|
+
|
|
286
|
+
# Generate per-module tokens (including event_hub and registry)
|
|
287
|
+
await self._register_module_tokens()
|
|
288
|
+
if self._shutdown_event.is_set(): return
|
|
289
|
+
|
|
290
|
+
# Phase 2: Event Hub bootstrap
|
|
291
|
+
await self._phase2_event_hub()
|
|
292
|
+
if self._shutdown_event.is_set(): return
|
|
141
293
|
|
|
142
|
-
#
|
|
294
|
+
# Phase 3: Wait for Registry delayed ready
|
|
295
|
+
await self._phase3_registry_ready()
|
|
296
|
+
if self._shutdown_event.is_set(): return
|
|
297
|
+
|
|
298
|
+
# Phase 4: Start remaining enabled modules
|
|
299
|
+
# Initialize desired_state from config_state
|
|
143
300
|
for name, info in self.modules.items():
|
|
144
301
|
if info.state == "enabled":
|
|
145
302
|
self._desired_states[name] = "running"
|
|
146
303
|
else: # manual, disabled
|
|
147
304
|
self._desired_states[name] = "stopped"
|
|
305
|
+
# Core modules are already running
|
|
306
|
+
for cn in CORE_MODULE_NAMES:
|
|
307
|
+
self._desired_states[cn] = "running"
|
|
148
308
|
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
# Step 7: start enabled modules
|
|
153
|
-
await self._start_enabled_modules()
|
|
309
|
+
await self._phase4_start_modules()
|
|
310
|
+
if self._shutdown_event.is_set(): return
|
|
154
311
|
|
|
155
|
-
#
|
|
312
|
+
# Post-startup
|
|
156
313
|
self.process_manager.persist_records()
|
|
157
|
-
|
|
158
|
-
# Step 9: connect to Event Hub (best-effort, non-blocking)
|
|
159
|
-
await self._connect_event_hub()
|
|
160
|
-
|
|
161
|
-
# Step 10: start heartbeat to Registry
|
|
162
314
|
self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
|
|
163
315
|
|
|
164
|
-
|
|
165
|
-
print("[launcher] Entering monitor loop (press Ctrl+C or 'q' to exit)")
|
|
316
|
+
print("[launcher] 进入监控循环 (按 Ctrl+C 或 'q' 退出)")
|
|
166
317
|
await self._monitor_loop()
|
|
167
318
|
|
|
168
|
-
# Graceful shutdown all modules before event loop closes
|
|
169
319
|
await self._graceful_shutdown_all()
|
|
170
320
|
|
|
171
|
-
# ──
|
|
321
|
+
# ── Phase 1: Registry ──
|
|
322
|
+
|
|
323
|
+
async def _phase1_registry(self):
|
|
324
|
+
"""Start Registry → capture port from stdout → set env → start API → register self."""
|
|
325
|
+
registry_dir = os.path.join(os.environ["KITE_PROJECT"], "core", "registry")
|
|
326
|
+
registry_info = ModuleInfo(
|
|
327
|
+
name="registry",
|
|
328
|
+
display_name="Registry",
|
|
329
|
+
type="infrastructure",
|
|
330
|
+
state="enabled",
|
|
331
|
+
runtime="python",
|
|
332
|
+
entry="entry.py",
|
|
333
|
+
module_dir=registry_dir,
|
|
334
|
+
)
|
|
335
|
+
|
|
336
|
+
boot_info = {"token": self.kite_token}
|
|
337
|
+
self._log_lifecycle("starting", "registry")
|
|
338
|
+
ok = self.process_manager.start_module(registry_info, boot_info=boot_info)
|
|
339
|
+
if not ok:
|
|
340
|
+
self._log_lifecycle("start_failed", "registry")
|
|
341
|
+
raise RuntimeError("启动 Registry 失败")
|
|
342
|
+
|
|
343
|
+
# Wait for Registry to output port via stdout (mechanism 2)
|
|
344
|
+
print("[launcher] 等待 Registry 端口...")
|
|
345
|
+
msg = await self._wait_kite_message("registry", "port", timeout=6)
|
|
346
|
+
if not msg or not msg.get("port"):
|
|
347
|
+
raise RuntimeError("致命错误: Registry 在 6s 内未报告端口")
|
|
348
|
+
self.registry_port = int(msg["port"])
|
|
349
|
+
print(f"[launcher] Registry 端口: {self.registry_port}")
|
|
350
|
+
|
|
351
|
+
# Set KITE_REGISTRY_PORT for all subsequent child processes
|
|
352
|
+
os.environ["KITE_REGISTRY_PORT"] = str(self.registry_port)
|
|
353
|
+
|
|
354
|
+
# Start Launcher API in a separate thread
|
|
355
|
+
self._start_api_thread()
|
|
172
356
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
357
|
+
# Register Launcher itself to Registry
|
|
358
|
+
await self._register_self()
|
|
359
|
+
|
|
360
|
+
async def _register_self(self):
|
|
361
|
+
"""Register Launcher itself to Registry."""
|
|
362
|
+
url = f"http://127.0.0.1:{self.registry_port}/modules"
|
|
176
363
|
headers = {"Authorization": f"Bearer {self.kite_token}"}
|
|
364
|
+
payload = {
|
|
365
|
+
"action": "register",
|
|
366
|
+
"module_id": "launcher",
|
|
367
|
+
"module_type": "infrastructure",
|
|
368
|
+
"name": "Launcher",
|
|
369
|
+
"api_endpoint": f"http://127.0.0.1:{self.api_port}",
|
|
370
|
+
"health_endpoint": "/launcher/modules",
|
|
371
|
+
"events_publish": {
|
|
372
|
+
"module.started": {},
|
|
373
|
+
"module.stopped": {},
|
|
374
|
+
"module.state_changed": {},
|
|
375
|
+
},
|
|
376
|
+
"events_subscribe": [">"],
|
|
377
|
+
}
|
|
378
|
+
try:
|
|
379
|
+
async with httpx.AsyncClient() as client:
|
|
380
|
+
resp = await client.post(url, json=payload, headers=headers, timeout=5)
|
|
381
|
+
if resp.status_code == 200:
|
|
382
|
+
print("[launcher] 已注册到 Registry")
|
|
383
|
+
else:
|
|
384
|
+
print(f"[launcher] 警告: Registry 注册返回 {resp.status_code}")
|
|
385
|
+
except Exception as e:
|
|
386
|
+
print(f"[launcher] 警告: 注册到 Registry 失败: {e}")
|
|
387
|
+
|
|
388
|
+
# ── Phase 2: Event Hub ──
|
|
389
|
+
|
|
390
|
+
async def _phase2_event_hub(self):
|
|
391
|
+
"""Start Event Hub → stdin launcher_ws_token → stdout ws_endpoint → WS connect → module.ready."""
|
|
392
|
+
# Find event_hub in scanned modules or build manually
|
|
393
|
+
eh_info = self.modules.get("event_hub")
|
|
394
|
+
if not eh_info:
|
|
395
|
+
eh_dir = os.path.join(os.environ["KITE_PROJECT"], "core", "event_hub")
|
|
396
|
+
eh_info = ModuleInfo(
|
|
397
|
+
name="event_hub",
|
|
398
|
+
display_name="Event Hub",
|
|
399
|
+
type="infrastructure",
|
|
400
|
+
state="enabled",
|
|
401
|
+
runtime="python",
|
|
402
|
+
entry="entry.py",
|
|
403
|
+
module_dir=eh_dir,
|
|
404
|
+
)
|
|
405
|
+
|
|
406
|
+
token = self._module_tokens.get("event_hub", "")
|
|
407
|
+
if not token:
|
|
408
|
+
token = secrets.token_hex(32)
|
|
409
|
+
self._module_tokens["event_hub"] = token
|
|
410
|
+
await self._register_tokens_to_registry({"event_hub": token})
|
|
411
|
+
|
|
412
|
+
boot_info = {"token": token}
|
|
413
|
+
self._log_lifecycle("starting", "event_hub")
|
|
414
|
+
ok = self.process_manager.start_module(eh_info, boot_info=boot_info)
|
|
415
|
+
if not ok:
|
|
416
|
+
self._log_lifecycle("start_failed", "event_hub")
|
|
417
|
+
raise RuntimeError("启动 Event Hub 失败")
|
|
418
|
+
|
|
419
|
+
# Send launcher_ws_token via stdin (mechanism 6)
|
|
420
|
+
self._launcher_ws_token = secrets.token_hex(32)
|
|
421
|
+
self.process_manager.write_stdin("event_hub", {
|
|
422
|
+
"kite": "launcher_ws_token",
|
|
423
|
+
"launcher_ws_token": self._launcher_ws_token,
|
|
424
|
+
})
|
|
177
425
|
|
|
178
|
-
#
|
|
179
|
-
print("[launcher]
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
426
|
+
# Wait for ws_endpoint from stdout (mechanism 5)
|
|
427
|
+
print("[launcher] 等待 Event Hub ws_endpoint...")
|
|
428
|
+
msg = await self._wait_kite_message("event_hub", "ws_endpoint", timeout=6)
|
|
429
|
+
if not msg or not msg.get("ws_endpoint"):
|
|
430
|
+
raise RuntimeError("致命错误: Event Hub 在 6s 内未报告 ws_endpoint")
|
|
431
|
+
self._event_hub_ws_url = msg["ws_endpoint"]
|
|
432
|
+
print(f"[launcher] Event Hub 已发现: {self._event_hub_ws_url}")
|
|
433
|
+
|
|
434
|
+
# Connect to Event Hub WebSocket with launcher_ws_token
|
|
435
|
+
self._ws_task = asyncio.create_task(self._ws_loop())
|
|
436
|
+
|
|
437
|
+
# Wait for Event Hub module.ready (sent when Launcher connects)
|
|
438
|
+
ready = await self._wait_event("module.ready", "event_hub", timeout=15)
|
|
439
|
+
if ready:
|
|
440
|
+
print("[launcher] Event Hub 已就绪")
|
|
441
|
+
else:
|
|
442
|
+
print("[launcher] 警告: Event Hub 在 15s 内未发送 module.ready")
|
|
443
|
+
|
|
444
|
+
self._log_lifecycle("started", "event_hub")
|
|
445
|
+
await self._publish_event("module.started", {"module_id": "event_hub"})
|
|
446
|
+
self.process_manager.close_stdio("event_hub")
|
|
447
|
+
|
|
448
|
+
# ── Phase 3: Registry delayed ready ──
|
|
195
449
|
|
|
196
|
-
|
|
197
|
-
|
|
450
|
+
async def _phase3_registry_ready(self):
|
|
451
|
+
"""Wait for Registry module.ready (triggered after Event Hub registers to Registry
|
|
452
|
+
and Registry connects to Event Hub WS)."""
|
|
453
|
+
print("[launcher] 等待 Registry 延迟就绪...")
|
|
454
|
+
ready = await self._wait_event("module.ready", "registry", timeout=12)
|
|
455
|
+
if ready:
|
|
456
|
+
print("[launcher] Registry 已就绪")
|
|
457
|
+
else:
|
|
458
|
+
print("[launcher] 警告: Registry 在 12s 内未发送 module.ready (降级运行)")
|
|
459
|
+
|
|
460
|
+
self._log_lifecycle("started", "registry")
|
|
461
|
+
await self._publish_event("module.started", {"module_id": "registry"})
|
|
462
|
+
self.process_manager.close_stdio("registry")
|
|
463
|
+
|
|
464
|
+
# ── Phase 4: Start remaining modules ──
|
|
465
|
+
|
|
466
|
+
async def _phase4_start_modules(self):
|
|
467
|
+
"""Start enabled modules (excluding core) in dependency order."""
|
|
468
|
+
to_start = [m for m in self.modules.values()
|
|
469
|
+
if self._desired_states.get(m.name) == "running"
|
|
470
|
+
and m.name not in CORE_MODULE_NAMES]
|
|
471
|
+
if not to_start:
|
|
472
|
+
print("[launcher] 没有额外模块需要启动")
|
|
198
473
|
return
|
|
199
474
|
|
|
200
|
-
|
|
201
|
-
|
|
475
|
+
# Auto-start manual modules if depended upon
|
|
476
|
+
needed = set(m.name for m in to_start)
|
|
477
|
+
for m in list(to_start):
|
|
478
|
+
for dep in m.depends_on:
|
|
479
|
+
if dep not in needed and dep not in CORE_MODULE_NAMES:
|
|
480
|
+
dep_info = self.modules.get(dep)
|
|
481
|
+
if dep_info and dep_info.state != "disabled":
|
|
482
|
+
needed.add(dep)
|
|
483
|
+
to_start.append(dep_info)
|
|
484
|
+
self._desired_states[dep] = "running"
|
|
485
|
+
print(f"[launcher] 自动启动 '{dep}' (被依赖)")
|
|
486
|
+
elif dep_info and dep_info.state == "disabled":
|
|
487
|
+
print(f"[launcher] 错误: '{m.name}' 依赖已禁用的模块 '{dep}'")
|
|
488
|
+
|
|
489
|
+
try:
|
|
490
|
+
sorted_modules = self._topo_sort(to_start)
|
|
491
|
+
except RuntimeError as e:
|
|
492
|
+
print(f"[launcher] 错误: {e}")
|
|
493
|
+
return
|
|
494
|
+
|
|
495
|
+
print(f"[launcher] 正在启动 {len(sorted_modules)} 个模块...")
|
|
496
|
+
for info in sorted_modules:
|
|
497
|
+
await self._start_one_module(info)
|
|
498
|
+
|
|
499
|
+
# ── Event Hub WebSocket connection ──
|
|
202
500
|
|
|
203
501
|
async def _ws_loop(self):
|
|
204
502
|
"""Connect to Event Hub, reconnect on failure."""
|
|
@@ -208,16 +506,16 @@ class Launcher:
|
|
|
208
506
|
except asyncio.CancelledError:
|
|
209
507
|
return
|
|
210
508
|
except Exception as e:
|
|
211
|
-
print(f"[launcher] Event Hub
|
|
509
|
+
print(f"[launcher] Event Hub 连接错误: {e}")
|
|
212
510
|
self._ws = None
|
|
213
511
|
await asyncio.sleep(5)
|
|
214
512
|
|
|
215
513
|
async def _ws_connect(self):
|
|
216
|
-
"""Single WebSocket session
|
|
217
|
-
ws_url = f"{self._event_hub_ws_url}?token={self.
|
|
218
|
-
async with websockets.connect(ws_url) as ws:
|
|
514
|
+
"""Single WebSocket session with launcher_ws_token auth."""
|
|
515
|
+
ws_url = f"{self._event_hub_ws_url}?token={self._launcher_ws_token}"
|
|
516
|
+
async with websockets.connect(ws_url, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
|
|
219
517
|
self._ws = ws
|
|
220
|
-
print("[launcher]
|
|
518
|
+
print("[launcher] 已连接到 Event Hub")
|
|
221
519
|
|
|
222
520
|
# Subscribe to all events
|
|
223
521
|
await ws.send(json.dumps({
|
|
@@ -258,7 +556,7 @@ class Launcher:
|
|
|
258
556
|
else:
|
|
259
557
|
print(f"[{source}] {event}: {json.dumps(data, ensure_ascii=False)}")
|
|
260
558
|
elif msg_type == "error":
|
|
261
|
-
print(f"[launcher] Event Hub
|
|
559
|
+
print(f"[launcher] Event Hub 错误: {msg.get('message')}")
|
|
262
560
|
|
|
263
561
|
async def _publish_event(self, event_type: str, data: dict):
|
|
264
562
|
"""Publish an event to Event Hub via WebSocket."""
|
|
@@ -276,7 +574,7 @@ class Launcher:
|
|
|
276
574
|
try:
|
|
277
575
|
await self._ws.send(json.dumps(msg))
|
|
278
576
|
except Exception as e:
|
|
279
|
-
print(f"[launcher]
|
|
577
|
+
print(f"[launcher] 发布事件失败: {e}")
|
|
280
578
|
|
|
281
579
|
def _publish_event_threadsafe(self, event_type: str, data: dict):
|
|
282
580
|
"""Publish event from non-async context (API thread). Fire-and-forget."""
|
|
@@ -303,27 +601,21 @@ class Launcher:
|
|
|
303
601
|
async def _graceful_stop(self, name: str, reason: str = "stop_requested", timeout: float = 10):
|
|
304
602
|
"""Graceful shutdown: send event → wait ack → wait ready → kill."""
|
|
305
603
|
self._log_lifecycle("stopping", name, reason=reason)
|
|
306
|
-
# Step 1: send module.shutdown event
|
|
307
604
|
await self._publish_event("module.shutdown", {
|
|
308
605
|
"module_id": name, "reason": reason, "timeout": timeout,
|
|
309
606
|
})
|
|
310
607
|
|
|
311
|
-
# Step 2: wait for ack (3s)
|
|
312
608
|
ack = await self._wait_event("module.shutdown.ack", name, timeout=3)
|
|
313
609
|
if not ack:
|
|
314
|
-
# No ack — fallback to direct terminate
|
|
315
610
|
self.process_manager.stop_module(name, timeout=5)
|
|
316
611
|
await self._publish_event("module.stopped", {"module_id": name})
|
|
317
612
|
return
|
|
318
613
|
|
|
319
|
-
# Step 3: wait for ready
|
|
320
614
|
estimated = min(ack.get("estimated_cleanup", timeout), timeout)
|
|
321
615
|
ready = await self._wait_event("module.shutdown.ready", name, timeout=estimated)
|
|
322
616
|
if ready:
|
|
323
|
-
# Module is ready to die — kill immediately
|
|
324
617
|
self.process_manager.stop_module(name, timeout=1)
|
|
325
618
|
else:
|
|
326
|
-
# Timeout — force stop
|
|
327
619
|
self.process_manager.stop_module(name, timeout=3)
|
|
328
620
|
|
|
329
621
|
self._log_lifecycle("stopped", name, reason=reason)
|
|
@@ -332,16 +624,18 @@ class Launcher:
|
|
|
332
624
|
async def _graceful_shutdown_all(self):
|
|
333
625
|
"""Broadcast module.shutdown to all running modules, then force-kill survivors."""
|
|
334
626
|
running = [n for n in self.modules if self.process_manager.is_running(n)]
|
|
627
|
+
# Also check core modules
|
|
628
|
+
for cn in CORE_MODULE_NAMES:
|
|
629
|
+
if self.process_manager.is_running(cn) and cn not in running:
|
|
630
|
+
running.append(cn)
|
|
335
631
|
if not running:
|
|
336
632
|
return
|
|
337
|
-
print(f"[launcher]
|
|
338
|
-
# Broadcast shutdown event
|
|
633
|
+
print(f"[launcher] 优雅关闭: {', '.join(running)}")
|
|
339
634
|
for name in running:
|
|
340
635
|
self._log_lifecycle("stopping", name, reason="system_shutdown")
|
|
341
636
|
await self._publish_event("module.shutdown", {
|
|
342
637
|
"module_id": name, "reason": "system_shutdown", "timeout": 10,
|
|
343
638
|
})
|
|
344
|
-
# Wait up to 10s total, then force-kill
|
|
345
639
|
deadline = time.time() + 10
|
|
346
640
|
while time.time() < deadline:
|
|
347
641
|
still_running = [n for n in running if self.process_manager.is_running(n)]
|
|
@@ -369,100 +663,6 @@ class Launcher:
|
|
|
369
663
|
except Exception:
|
|
370
664
|
pass
|
|
371
665
|
|
|
372
|
-
# ── Registry startup ──
|
|
373
|
-
|
|
374
|
-
async def _start_registry(self):
|
|
375
|
-
"""Start Registry as a subprocess, wait for it to write port.txt and /health to respond."""
|
|
376
|
-
registry_dir = os.path.join(self.project_root, "core", "registry")
|
|
377
|
-
if not os.path.isdir(registry_dir):
|
|
378
|
-
raise RuntimeError(f"Registry module not found at {registry_dir}")
|
|
379
|
-
|
|
380
|
-
# Use centralized data directory
|
|
381
|
-
from core.data_dir import get_registry_data_dir
|
|
382
|
-
registry_data_dir = get_registry_data_dir()
|
|
383
|
-
|
|
384
|
-
# Clean our instance's port file before starting
|
|
385
|
-
port_file = os.path.join(registry_data_dir, f"port_{self.instance_id}.txt")
|
|
386
|
-
if os.path.isfile(port_file):
|
|
387
|
-
os.remove(port_file)
|
|
388
|
-
|
|
389
|
-
registry_info = ModuleInfo(
|
|
390
|
-
name="registry",
|
|
391
|
-
display_name="Registry",
|
|
392
|
-
type="infrastructure",
|
|
393
|
-
state="enabled",
|
|
394
|
-
runtime="python",
|
|
395
|
-
entry="entry.py",
|
|
396
|
-
module_dir=registry_dir,
|
|
397
|
-
)
|
|
398
|
-
|
|
399
|
-
# Pass launcher_token + bind config via stdin
|
|
400
|
-
boot_info = {"token": self.kite_token, "registry_port": 0, "bind": "127.0.0.1", "instance_id": self.instance_id}
|
|
401
|
-
ok = self.process_manager.start_module(registry_info, boot_info=boot_info)
|
|
402
|
-
if not ok:
|
|
403
|
-
raise RuntimeError("Failed to start Registry")
|
|
404
|
-
|
|
405
|
-
# Wait for Registry to write port.txt
|
|
406
|
-
print("[launcher] Waiting for Registry to report its port...")
|
|
407
|
-
deadline = time.time() + 10
|
|
408
|
-
while time.time() < deadline:
|
|
409
|
-
if os.path.isfile(port_file):
|
|
410
|
-
try:
|
|
411
|
-
with open(port_file, "r") as f:
|
|
412
|
-
self.registry_port = int(f.read().strip())
|
|
413
|
-
break
|
|
414
|
-
except (ValueError, OSError):
|
|
415
|
-
pass
|
|
416
|
-
await asyncio.sleep(0.2)
|
|
417
|
-
else:
|
|
418
|
-
raise RuntimeError("Registry failed to write port.txt within 10s")
|
|
419
|
-
|
|
420
|
-
# Poll /health until ready
|
|
421
|
-
url = f"http://127.0.0.1:{self.registry_port}/health"
|
|
422
|
-
print(f"[launcher] Registry on port {self.registry_port}, waiting for health check...")
|
|
423
|
-
|
|
424
|
-
deadline = time.time() + 10
|
|
425
|
-
async with httpx.AsyncClient() as client:
|
|
426
|
-
while time.time() < deadline:
|
|
427
|
-
try:
|
|
428
|
-
resp = await client.get(url, timeout=1)
|
|
429
|
-
if resp.status_code == 200:
|
|
430
|
-
print("[launcher] Registry is ready")
|
|
431
|
-
return
|
|
432
|
-
except Exception:
|
|
433
|
-
pass
|
|
434
|
-
await asyncio.sleep(0.2)
|
|
435
|
-
|
|
436
|
-
raise RuntimeError("Registry failed to become ready within 10s")
|
|
437
|
-
|
|
438
|
-
async def _register_self(self):
|
|
439
|
-
"""Register Launcher itself to Registry using new API."""
|
|
440
|
-
url = f"http://127.0.0.1:{self.registry_port}/modules"
|
|
441
|
-
headers = {"Authorization": f"Bearer {self.kite_token}"}
|
|
442
|
-
payload = {
|
|
443
|
-
"action": "register",
|
|
444
|
-
"module_id": "launcher",
|
|
445
|
-
"module_type": "infrastructure",
|
|
446
|
-
"name": "Launcher",
|
|
447
|
-
"api_endpoint": f"http://127.0.0.1:{self.api_port}",
|
|
448
|
-
"health_endpoint": "/launcher/modules",
|
|
449
|
-
"events_publish": {
|
|
450
|
-
"module.started": {},
|
|
451
|
-
"module.stopped": {},
|
|
452
|
-
"module.state_changed": {},
|
|
453
|
-
},
|
|
454
|
-
"events_subscribe": [">"],
|
|
455
|
-
}
|
|
456
|
-
try:
|
|
457
|
-
async with httpx.AsyncClient() as client:
|
|
458
|
-
resp = await client.post(url, json=payload, headers=headers, timeout=5)
|
|
459
|
-
if resp.status_code == 200:
|
|
460
|
-
print("[launcher] Registered self to Registry")
|
|
461
|
-
else:
|
|
462
|
-
print(f"[launcher] WARNING: Registry registration returned {resp.status_code}")
|
|
463
|
-
except Exception as e:
|
|
464
|
-
print(f"[launcher] WARNING: failed to register to Registry: {e}")
|
|
465
|
-
|
|
466
666
|
# ── Module startup ──
|
|
467
667
|
|
|
468
668
|
def _topo_sort(self, modules: list[ModuleInfo]) -> list[ModuleInfo]:
|
|
@@ -492,17 +692,12 @@ class Launcher:
|
|
|
492
692
|
return order
|
|
493
693
|
|
|
494
694
|
async def _start_one_module(self, info: ModuleInfo):
|
|
495
|
-
"""Start a single module: publish starting
|
|
695
|
+
"""Start a single module: publish starting → start process → wait ready → started → close stdio."""
|
|
496
696
|
self._log_lifecycle("starting", info.name)
|
|
497
697
|
await self._publish_event("module.starting", {"module_id": info.name})
|
|
498
698
|
|
|
499
699
|
token = self._module_tokens.get(info.name, "")
|
|
500
|
-
boot_info = {
|
|
501
|
-
"token": token,
|
|
502
|
-
"registry_port": self.registry_port,
|
|
503
|
-
"preferred_port": info.preferred_port,
|
|
504
|
-
"advertise_ip": "127.0.0.1",
|
|
505
|
-
}
|
|
700
|
+
boot_info = {"token": token}
|
|
506
701
|
ok = self.process_manager.start_module(info, boot_info=boot_info)
|
|
507
702
|
if not ok:
|
|
508
703
|
self._log_lifecycle("start_failed", info.name)
|
|
@@ -512,65 +707,68 @@ class Launcher:
|
|
|
512
707
|
timeout = info.launch.timeout
|
|
513
708
|
ready = await self._wait_event("module.ready", info.name, timeout=timeout)
|
|
514
709
|
if ready:
|
|
515
|
-
print(f"[launcher]
|
|
710
|
+
print(f"[launcher] 模块 '{info.name}' 已就绪")
|
|
516
711
|
else:
|
|
517
|
-
print(f"[launcher]
|
|
712
|
+
print(f"[launcher] 警告: '{info.name}' 在 {timeout}s 内未发送 module.ready")
|
|
518
713
|
|
|
519
714
|
rec = self.process_manager.get_record(info.name)
|
|
520
715
|
self._log_lifecycle("started", info.name, pid=rec.pid if rec else None)
|
|
521
716
|
await self._publish_event("module.started", {"module_id": info.name})
|
|
522
|
-
|
|
523
|
-
async def _start_enabled_modules(self):
|
|
524
|
-
"""Start modules in dependency order, auto-starting manual deps if needed."""
|
|
525
|
-
to_start = [m for m in self.modules.values()
|
|
526
|
-
if self._desired_states.get(m.name) == "running"]
|
|
527
|
-
if not to_start:
|
|
528
|
-
print("[launcher] No modules to start")
|
|
529
|
-
return
|
|
530
|
-
|
|
531
|
-
# Auto-start manual modules if depended upon
|
|
532
|
-
needed = set(m.name for m in to_start)
|
|
533
|
-
for m in to_start:
|
|
534
|
-
for dep in m.depends_on:
|
|
535
|
-
if dep not in needed:
|
|
536
|
-
dep_info = self.modules.get(dep)
|
|
537
|
-
if dep_info and dep_info.state != "disabled":
|
|
538
|
-
needed.add(dep)
|
|
539
|
-
to_start.append(dep_info)
|
|
540
|
-
self._desired_states[dep] = "running"
|
|
541
|
-
print(f"[launcher] Auto-starting '{dep}' (dependency)")
|
|
542
|
-
elif dep_info and dep_info.state == "disabled":
|
|
543
|
-
print(f"[launcher] ERROR: '{m.name}' depends on disabled module '{dep}'")
|
|
544
|
-
|
|
545
|
-
try:
|
|
546
|
-
sorted_modules = self._topo_sort(to_start)
|
|
547
|
-
except RuntimeError as e:
|
|
548
|
-
print(f"[launcher] ERROR: {e}")
|
|
549
|
-
return
|
|
550
|
-
|
|
551
|
-
print(f"[launcher] Starting {len(sorted_modules)} module(s)...")
|
|
552
|
-
for info in sorted_modules:
|
|
553
|
-
await self._start_one_module(info)
|
|
717
|
+
self.process_manager.close_stdio(info.name)
|
|
554
718
|
|
|
555
719
|
async def _register_module_tokens(self):
|
|
556
720
|
"""Generate per-module tokens and register the mapping to Registry."""
|
|
721
|
+
# Include all scanned modules + core modules
|
|
557
722
|
for name in self.modules:
|
|
558
|
-
|
|
723
|
+
if name not in self._module_tokens:
|
|
724
|
+
self._module_tokens[name] = secrets.token_hex(32)
|
|
725
|
+
# Ensure registry has a token
|
|
726
|
+
if "registry" not in self._module_tokens:
|
|
727
|
+
self._module_tokens["registry"] = secrets.token_hex(32)
|
|
559
728
|
|
|
560
729
|
if not self._module_tokens:
|
|
561
730
|
return
|
|
562
731
|
|
|
732
|
+
await self._register_tokens_to_registry(self._module_tokens)
|
|
733
|
+
|
|
734
|
+
async def _register_tokens_to_registry(self, tokens: dict):
|
|
735
|
+
"""Register token mapping to Registry via POST /tokens."""
|
|
563
736
|
url = f"http://127.0.0.1:{self.registry_port}/tokens"
|
|
564
737
|
headers = {"Authorization": f"Bearer {self.kite_token}"}
|
|
565
738
|
try:
|
|
566
739
|
async with httpx.AsyncClient() as client:
|
|
567
|
-
resp = await client.post(url, json=
|
|
740
|
+
resp = await client.post(url, json=tokens, headers=headers, timeout=5)
|
|
568
741
|
if resp.status_code == 200:
|
|
569
|
-
print(f"[launcher]
|
|
742
|
+
print(f"[launcher] 已注册 {len(tokens)} 个模块令牌")
|
|
570
743
|
else:
|
|
571
|
-
print(f"[launcher]
|
|
744
|
+
print(f"[launcher] 警告: 令牌注册返回 {resp.status_code}")
|
|
572
745
|
except Exception as e:
|
|
573
|
-
print(f"[launcher]
|
|
746
|
+
print(f"[launcher] 警告: 注册模块令牌失败: {e}")
|
|
747
|
+
|
|
748
|
+
# ── Validation ──
|
|
749
|
+
|
|
750
|
+
def _validate_core_modules(self):
|
|
751
|
+
"""Validate core modules exist (mechanism 12)."""
|
|
752
|
+
project_root = os.environ["KITE_PROJECT"]
|
|
753
|
+
for name in ("registry", "event_hub"):
|
|
754
|
+
mod_dir = os.path.join(project_root, "core", name)
|
|
755
|
+
md_path = os.path.join(mod_dir, "module.md")
|
|
756
|
+
if not os.path.isdir(mod_dir):
|
|
757
|
+
print(f"[launcher] 致命: 核心模块 '{name}' 目录未找到: {mod_dir}")
|
|
758
|
+
sys.exit(1)
|
|
759
|
+
if not os.path.isfile(md_path):
|
|
760
|
+
print(f"[launcher] 致命: 核心模块 '{name}' 缺少 module.md: {md_path}")
|
|
761
|
+
sys.exit(1)
|
|
762
|
+
# Try to parse frontmatter
|
|
763
|
+
try:
|
|
764
|
+
with open(md_path, "r", encoding="utf-8") as f:
|
|
765
|
+
fm = _parse_frontmatter(f.read())
|
|
766
|
+
if not fm:
|
|
767
|
+
print(f"[launcher] 致命: 核心模块 '{name}' module.md 没有有效的 frontmatter")
|
|
768
|
+
sys.exit(1)
|
|
769
|
+
except Exception as e:
|
|
770
|
+
print(f"[launcher] 致命: 核心模块 '{name}' module.md 解析错误: {e}")
|
|
771
|
+
sys.exit(1)
|
|
574
772
|
|
|
575
773
|
# ── API thread ──
|
|
576
774
|
|
|
@@ -591,29 +789,30 @@ class Launcher:
|
|
|
591
789
|
t = threading.Thread(target=_run, daemon=True)
|
|
592
790
|
t.start()
|
|
593
791
|
|
|
594
|
-
# Wait for API server to actually be ready before proceeding
|
|
595
792
|
deadline = time.time() + 5
|
|
596
793
|
while time.time() < deadline:
|
|
597
794
|
if self._api_server.started:
|
|
598
795
|
break
|
|
599
796
|
time.sleep(0.05)
|
|
600
797
|
else:
|
|
601
|
-
print("[launcher]
|
|
798
|
+
print("[launcher] 警告: API 服务器可能尚未完全就绪")
|
|
602
799
|
|
|
603
|
-
print(f"[launcher] API
|
|
800
|
+
print(f"[launcher] API 服务器已启动,端口 {self.api_port}")
|
|
604
801
|
|
|
605
802
|
# ── Monitor loop ──
|
|
606
803
|
|
|
607
804
|
async def _monitor_loop(self):
|
|
608
|
-
"""Check child processes every second. Handle crashes.
|
|
805
|
+
"""Check child processes every second. Handle crashes.
|
|
806
|
+
Uses _shutdown_event (asyncio.Event) so Ctrl+C wakes us immediately.
|
|
807
|
+
"""
|
|
609
808
|
MAX_FAIL = 3
|
|
610
809
|
MAX_FAILED_MODULES = 3
|
|
611
810
|
|
|
612
|
-
while not self.
|
|
811
|
+
while not self._shutdown_event.is_set():
|
|
613
812
|
exited = self.process_manager.check_exited()
|
|
614
813
|
|
|
615
814
|
for name, rc in exited:
|
|
616
|
-
print(f"[launcher]
|
|
815
|
+
print(f"[launcher] 模块 '{name}' 退出,返回码 {rc}")
|
|
617
816
|
self._log_lifecycle("exited", name, exit_code=rc)
|
|
618
817
|
await self._publish_event("module.stopped", {
|
|
619
818
|
"module_id": name, "exit_code": rc,
|
|
@@ -621,8 +820,8 @@ class Launcher:
|
|
|
621
820
|
info = self.modules.get(name)
|
|
622
821
|
|
|
623
822
|
# Core module crash → full restart
|
|
624
|
-
if info and info.is_core(
|
|
625
|
-
print(f"[launcher]
|
|
823
|
+
if name in CORE_MODULE_NAMES or (info and info.is_core()):
|
|
824
|
+
print(f"[launcher] 严重: 核心模块 '{name}' 崩溃,正在全部重启...")
|
|
626
825
|
self._log_lifecycle("core_crash", name, exit_code=rc)
|
|
627
826
|
await self._full_restart()
|
|
628
827
|
return
|
|
@@ -632,29 +831,33 @@ class Launcher:
|
|
|
632
831
|
count = self._fail_counts[name]
|
|
633
832
|
|
|
634
833
|
if count < MAX_FAIL and self._desired_states.get(name) == "running" and info:
|
|
635
|
-
print(f"[launcher]
|
|
834
|
+
print(f"[launcher] 正在重启 '{name}' (第 {count}/{MAX_FAIL} 次)...")
|
|
636
835
|
await self._start_one_module(info)
|
|
637
836
|
elif count >= MAX_FAIL:
|
|
638
837
|
self._desired_states[name] = "stopped"
|
|
639
838
|
self._log_lifecycle("failed", name, reason=f"exceeded {MAX_FAIL} retries")
|
|
640
|
-
print(f"[launcher]
|
|
839
|
+
print(f"[launcher] 模块 '{name}' 失败 {MAX_FAIL} 次,已放弃")
|
|
641
840
|
|
|
642
|
-
# Too many failed modules → exit
|
|
643
841
|
failed_count = sum(1 for c in self._fail_counts.values() if c >= MAX_FAIL)
|
|
644
842
|
if failed_count >= MAX_FAILED_MODULES:
|
|
645
|
-
print(f"[launcher] {failed_count}
|
|
843
|
+
print(f"[launcher] {failed_count} 个模块永久失败,启动器退出")
|
|
646
844
|
return
|
|
647
845
|
|
|
648
846
|
if exited:
|
|
649
847
|
self.process_manager.persist_records()
|
|
650
848
|
|
|
651
|
-
|
|
849
|
+
# Wait 1s but wake immediately on shutdown signal
|
|
850
|
+
try:
|
|
851
|
+
await asyncio.wait_for(self._shutdown_event.wait(), timeout=1)
|
|
852
|
+
return # shutdown requested
|
|
853
|
+
except asyncio.TimeoutError:
|
|
854
|
+
pass
|
|
652
855
|
|
|
653
856
|
async def _full_restart(self):
|
|
654
|
-
"""Stop all modules,
|
|
655
|
-
print("[launcher]
|
|
857
|
+
"""Stop all modules, regenerate tokens, re-run Phase 1-4 (mechanism 10)."""
|
|
858
|
+
print("[launcher] 全量重启: 正在停止所有模块...")
|
|
656
859
|
|
|
657
|
-
# Disconnect Event Hub
|
|
860
|
+
# Disconnect Event Hub WS
|
|
658
861
|
if self._ws_task:
|
|
659
862
|
self._ws_task.cancel()
|
|
660
863
|
self._ws_task = None
|
|
@@ -662,33 +865,39 @@ class Launcher:
|
|
|
662
865
|
self._heartbeat_task.cancel()
|
|
663
866
|
self._heartbeat_task = None
|
|
664
867
|
self._ws = None
|
|
868
|
+
self._event_hub_ws_url = ""
|
|
869
|
+
self._launcher_ws_token = ""
|
|
665
870
|
|
|
666
871
|
await self._graceful_shutdown_all()
|
|
667
872
|
self._fail_counts.clear()
|
|
668
|
-
|
|
669
873
|
self._module_tokens.clear()
|
|
670
874
|
|
|
671
|
-
|
|
875
|
+
# Regenerate kite_token
|
|
876
|
+
self.kite_token = secrets.token_hex(32)
|
|
877
|
+
self.process_manager.kite_token = self.kite_token
|
|
878
|
+
|
|
879
|
+
print("[launcher] 全量重启: 重新执行 Phase 1-4...")
|
|
672
880
|
try:
|
|
673
|
-
await self.
|
|
674
|
-
await self._register_self()
|
|
881
|
+
await self._phase1_registry()
|
|
675
882
|
self.modules = self.module_scanner.scan()
|
|
676
883
|
for n, info in self.modules.items():
|
|
677
884
|
self._log_lifecycle("scanned", n, state=info.state, module_dir=info.module_dir)
|
|
678
885
|
await self._register_module_tokens()
|
|
679
|
-
await self.
|
|
886
|
+
await self._phase2_event_hub()
|
|
887
|
+
await self._phase3_registry_ready()
|
|
888
|
+
await self._phase4_start_modules()
|
|
680
889
|
self.process_manager.persist_records()
|
|
681
|
-
|
|
682
|
-
print("[launcher]
|
|
890
|
+
self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
|
|
891
|
+
print("[launcher] 全量重启完成,恢复监控循环")
|
|
683
892
|
await self._monitor_loop()
|
|
684
893
|
except Exception as e:
|
|
685
|
-
print(f"[launcher]
|
|
894
|
+
print(f"[launcher] 全量重启失败: {e}")
|
|
686
895
|
|
|
687
896
|
# ── Shutdown ──
|
|
688
897
|
|
|
689
898
|
def _final_cleanup(self):
|
|
690
899
|
"""Called on exit — stop all processes, stop API, clear records."""
|
|
691
|
-
print("[launcher]
|
|
900
|
+
print("[launcher] 正在关闭...")
|
|
692
901
|
|
|
693
902
|
if self._ws_task:
|
|
694
903
|
self._ws_task.cancel()
|
|
@@ -706,13 +915,7 @@ class Launcher:
|
|
|
706
915
|
os.remove(self.process_manager.records_path)
|
|
707
916
|
except OSError:
|
|
708
917
|
pass
|
|
709
|
-
|
|
710
|
-
port_file = os.path.join(get_registry_data_dir(), f"port_{self.instance_id}.txt")
|
|
711
|
-
try:
|
|
712
|
-
os.remove(port_file)
|
|
713
|
-
except OSError:
|
|
714
|
-
pass
|
|
715
|
-
print("[launcher] Goodbye.")
|
|
918
|
+
print("[launcher] 再见。")
|
|
716
919
|
|
|
717
920
|
if IS_WINDOWS:
|
|
718
921
|
os._exit(0)
|
|
@@ -721,20 +924,20 @@ class Launcher:
|
|
|
721
924
|
|
|
722
925
|
def _load_discovery(self) -> dict | None:
|
|
723
926
|
"""Read discovery config from launcher's own module.md."""
|
|
724
|
-
md_path = os.path.join(
|
|
927
|
+
md_path = os.path.join(os.environ["KITE_PROJECT"], "core", "launcher", "module.md")
|
|
725
928
|
try:
|
|
726
929
|
with open(md_path, "r", encoding="utf-8") as f:
|
|
727
930
|
fm = _parse_frontmatter(f.read())
|
|
728
931
|
discovery = fm.get("discovery")
|
|
729
932
|
if isinstance(discovery, dict) and discovery:
|
|
730
|
-
print(f"[launcher]
|
|
933
|
+
print(f"[launcher] 发现来源: {', '.join(discovery.keys())}")
|
|
731
934
|
return discovery
|
|
732
935
|
except Exception as e:
|
|
733
|
-
print(f"[launcher]
|
|
936
|
+
print(f"[launcher] 警告: 读取发现配置失败: {e}")
|
|
734
937
|
return None
|
|
735
938
|
|
|
736
939
|
def _log_lifecycle(self, event: str, module: str, **extra):
|
|
737
|
-
"""Append one JSONL line to
|
|
940
|
+
"""Append one JSONL line to lifecycle.jsonl."""
|
|
738
941
|
from datetime import datetime, timezone
|
|
739
942
|
record = {"ts": datetime.now(timezone.utc).isoformat(), "event": event, "module": module}
|
|
740
943
|
record.update(extra)
|
|
@@ -758,11 +961,11 @@ class Launcher:
|
|
|
758
961
|
def _create_api_app(self) -> FastAPI:
|
|
759
962
|
"""Create the FastAPI app with Launcher management routes."""
|
|
760
963
|
app = FastAPI(title="Kite Launcher", docs_url=None, redoc_url=None)
|
|
761
|
-
launcher = self
|
|
964
|
+
launcher = self
|
|
762
965
|
|
|
763
966
|
@app.get("/launcher/modules")
|
|
764
967
|
async def list_modules():
|
|
765
|
-
"""List all modules and their current status
|
|
968
|
+
"""List all modules and their current status."""
|
|
766
969
|
result = []
|
|
767
970
|
for name, info in launcher.modules.items():
|
|
768
971
|
running = launcher.process_manager.is_running(name)
|
|
@@ -781,17 +984,15 @@ class Launcher:
|
|
|
781
984
|
|
|
782
985
|
@app.post("/launcher/modules/{name}/start")
|
|
783
986
|
async def start_module(name: str):
|
|
784
|
-
"""Start a module by name.
|
|
987
|
+
"""Start a module by name."""
|
|
785
988
|
info = launcher.modules.get(name)
|
|
786
989
|
if not info:
|
|
787
990
|
raise HTTPException(404, f"Module '{name}' not found")
|
|
788
991
|
if info.state == "disabled":
|
|
789
992
|
raise HTTPException(403, f"Module '{name}' is disabled")
|
|
790
993
|
|
|
791
|
-
# Generate token if not already present
|
|
792
994
|
if name not in launcher._module_tokens:
|
|
793
995
|
launcher._module_tokens[name] = secrets.token_hex(32)
|
|
794
|
-
# Register the new token to Registry
|
|
795
996
|
try:
|
|
796
997
|
async with httpx.AsyncClient() as client:
|
|
797
998
|
await client.post(
|
|
@@ -801,14 +1002,10 @@ class Launcher:
|
|
|
801
1002
|
timeout=5,
|
|
802
1003
|
)
|
|
803
1004
|
except Exception as e:
|
|
804
|
-
print(f"[launcher]
|
|
1005
|
+
print(f"[launcher] 警告: 注册 {name} 的令牌失败: {e}")
|
|
805
1006
|
|
|
806
1007
|
token = launcher._module_tokens[name]
|
|
807
|
-
boot_info = {
|
|
808
|
-
"token": token,
|
|
809
|
-
"registry_port": launcher.registry_port,
|
|
810
|
-
"preferred_port": info.preferred_port,
|
|
811
|
-
}
|
|
1008
|
+
boot_info = {"token": token}
|
|
812
1009
|
ok = launcher.process_manager.start_module(info, boot_info=boot_info)
|
|
813
1010
|
if ok:
|
|
814
1011
|
launcher._desired_states[name] = "running"
|
|
@@ -823,7 +1020,7 @@ class Launcher:
|
|
|
823
1020
|
|
|
824
1021
|
@app.post("/launcher/modules/{name}/stop")
|
|
825
1022
|
async def stop_module(name: str, body: dict = None):
|
|
826
|
-
"""Stop a module with graceful shutdown.
|
|
1023
|
+
"""Stop a module with graceful shutdown."""
|
|
827
1024
|
info = launcher.modules.get(name)
|
|
828
1025
|
if not info:
|
|
829
1026
|
raise HTTPException(404, f"Module '{name}' not found")
|
|
@@ -843,7 +1040,6 @@ class Launcher:
|
|
|
843
1040
|
raise HTTPException(403, f"Module '{name}' is disabled")
|
|
844
1041
|
reason = (body or {}).get("reason", "restart")
|
|
845
1042
|
await launcher._graceful_stop(name, reason)
|
|
846
|
-
# Re-generate token
|
|
847
1043
|
launcher._module_tokens[name] = secrets.token_hex(32)
|
|
848
1044
|
try:
|
|
849
1045
|
async with httpx.AsyncClient() as client:
|
|
@@ -856,11 +1052,7 @@ class Launcher:
|
|
|
856
1052
|
except Exception:
|
|
857
1053
|
pass
|
|
858
1054
|
token = launcher._module_tokens[name]
|
|
859
|
-
boot_info = {
|
|
860
|
-
"token": token,
|
|
861
|
-
"registry_port": launcher.registry_port,
|
|
862
|
-
"preferred_port": info.preferred_port,
|
|
863
|
-
}
|
|
1055
|
+
boot_info = {"token": token}
|
|
864
1056
|
ok = launcher.process_manager.start_module(info, boot_info=boot_info)
|
|
865
1057
|
if ok:
|
|
866
1058
|
launcher._desired_states[name] = "running"
|
|
@@ -884,11 +1076,9 @@ class Launcher:
|
|
|
884
1076
|
for name in added:
|
|
885
1077
|
info = launcher.modules[name]
|
|
886
1078
|
launcher._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
|
|
887
|
-
# Initialize desired_state for new modules
|
|
888
1079
|
for name in added:
|
|
889
1080
|
info = launcher.modules[name]
|
|
890
1081
|
launcher._desired_states[name] = "running" if info.state == "enabled" else "stopped"
|
|
891
|
-
# Register tokens for new modules
|
|
892
1082
|
if added:
|
|
893
1083
|
new_tokens = {}
|
|
894
1084
|
for name in added:
|
|
@@ -917,14 +1107,12 @@ class Launcher:
|
|
|
917
1107
|
if new_state not in ("enabled", "manual", "disabled"):
|
|
918
1108
|
raise HTTPException(400, "state must be enabled, manual, or disabled")
|
|
919
1109
|
|
|
920
|
-
|
|
921
|
-
if info.is_core(launcher.project_root) and new_state == "disabled":
|
|
1110
|
+
if info.is_core() and new_state == "disabled":
|
|
922
1111
|
raise HTTPException(403, "Core modules cannot be disabled")
|
|
923
1112
|
|
|
924
1113
|
old_state = info.state
|
|
925
1114
|
info.state = new_state
|
|
926
1115
|
|
|
927
|
-
# Update desired_state to match new config_state
|
|
928
1116
|
if new_state == "enabled":
|
|
929
1117
|
launcher._desired_states[name] = "running"
|
|
930
1118
|
else:
|
|
@@ -956,7 +1144,6 @@ def _update_module_md_state(module_dir: str, new_state: str):
|
|
|
956
1144
|
with open(md_path, "r", encoding="utf-8") as f:
|
|
957
1145
|
content = f.read()
|
|
958
1146
|
|
|
959
|
-
# Replace state: xxx in frontmatter
|
|
960
1147
|
updated = re.sub(
|
|
961
1148
|
r'^(state:\s*)(\S+)',
|
|
962
1149
|
rf'\g<1>{new_state}',
|
|
@@ -968,4 +1155,4 @@ def _update_module_md_state(module_dir: str, new_state: str):
|
|
|
968
1155
|
with open(md_path, "w", encoding="utf-8") as f:
|
|
969
1156
|
f.write(updated)
|
|
970
1157
|
except Exception as e:
|
|
971
|
-
print(f"[launcher]
|
|
1158
|
+
print(f"[launcher] 警告: 更新 module.md 状态失败: {e}")
|