@agentunion/kite 1.0.7 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/event_hub/entry.py +305 -26
- package/core/event_hub/hub.py +8 -0
- package/core/event_hub/server.py +80 -17
- package/core/kite_log.py +241 -0
- package/core/launcher/entry.py +978 -284
- package/core/launcher/process_manager.py +456 -46
- package/core/registry/entry.py +272 -3
- package/core/registry/server.py +339 -289
- package/core/registry/store.py +10 -4
- package/extensions/agents/__init__.py +1 -0
- package/extensions/agents/assistant/__init__.py +1 -0
- package/extensions/agents/assistant/entry.py +380 -0
- package/extensions/agents/assistant/module.md +22 -0
- package/extensions/agents/assistant/server.py +236 -0
- package/extensions/channels/__init__.py +1 -0
- package/extensions/channels/acp_channel/__init__.py +1 -0
- package/extensions/channels/acp_channel/entry.py +380 -0
- package/extensions/channels/acp_channel/module.md +22 -0
- package/extensions/channels/acp_channel/server.py +236 -0
- package/extensions/event_hub_bench/entry.py +664 -379
- package/extensions/event_hub_bench/module.md +2 -1
- package/extensions/services/backup/__init__.py +1 -0
- package/extensions/services/backup/entry.py +380 -0
- package/extensions/services/backup/module.md +22 -0
- package/extensions/services/backup/server.py +244 -0
- package/extensions/services/model_service/__init__.py +1 -0
- package/extensions/services/model_service/entry.py +380 -0
- package/extensions/services/model_service/module.md +22 -0
- package/extensions/services/model_service/server.py +236 -0
- package/extensions/services/watchdog/entry.py +460 -147
- package/extensions/services/watchdog/module.md +3 -0
- package/extensions/services/watchdog/monitor.py +128 -13
- package/extensions/services/watchdog/server.py +75 -13
- package/extensions/services/web/__init__.py +1 -0
- package/extensions/services/web/config.yaml +149 -0
- package/extensions/services/web/entry.py +487 -0
- package/extensions/services/web/module.md +24 -0
- package/extensions/services/web/routes/__init__.py +1 -0
- package/extensions/services/web/routes/routes_call.py +189 -0
- package/extensions/services/web/routes/routes_config.py +512 -0
- package/extensions/services/web/routes/routes_contacts.py +98 -0
- package/extensions/services/web/routes/routes_devlog.py +99 -0
- package/extensions/services/web/routes/routes_phone.py +81 -0
- package/extensions/services/web/routes/routes_sms.py +48 -0
- package/extensions/services/web/routes/routes_stats.py +17 -0
- package/extensions/services/web/routes/routes_voicechat.py +554 -0
- package/extensions/services/web/routes/schemas.py +216 -0
- package/extensions/services/web/server.py +332 -0
- package/extensions/services/web/static/css/style.css +1064 -0
- package/extensions/services/web/static/index.html +1445 -0
- package/extensions/services/web/static/js/app.js +4671 -0
- package/extensions/services/web/vendor/__init__.py +1 -0
- package/extensions/services/web/vendor/bluetooth/__init__.py +0 -0
- package/extensions/services/web/vendor/bluetooth/audio.py +348 -0
- package/extensions/services/web/vendor/bluetooth/contacts.py +251 -0
- package/extensions/services/web/vendor/bluetooth/manager.py +395 -0
- package/extensions/services/web/vendor/bluetooth/sms.py +290 -0
- package/extensions/services/web/vendor/bluetooth/telephony.py +274 -0
- package/extensions/services/web/vendor/config.py +139 -0
- package/extensions/services/web/vendor/conversation/__init__.py +0 -0
- package/extensions/services/web/vendor/conversation/asr.py +936 -0
- package/extensions/services/web/vendor/conversation/engine.py +548 -0
- package/extensions/services/web/vendor/conversation/llm.py +534 -0
- package/extensions/services/web/vendor/conversation/mcp_tools.py +190 -0
- package/extensions/services/web/vendor/conversation/tts.py +322 -0
- package/extensions/services/web/vendor/conversation/vad.py +138 -0
- package/extensions/services/web/vendor/storage/__init__.py +1 -0
- package/extensions/services/web/vendor/storage/identity.py +312 -0
- package/extensions/services/web/vendor/storage/store.py +507 -0
- package/extensions/services/web/vendor/task/__init__.py +0 -0
- package/extensions/services/web/vendor/task/manager.py +864 -0
- package/extensions/services/web/vendor/task/models.py +45 -0
- package/extensions/services/web/vendor/task/webhook.py +263 -0
- package/extensions/services/web/vendor/tools/__init__.py +0 -0
- package/extensions/services/web/vendor/tools/registry.py +321 -0
- package/main.py +230 -90
- package/package.json +1 -1
package/core/launcher/entry.py
CHANGED
|
@@ -8,9 +8,11 @@ Thread model:
|
|
|
8
8
|
- (Windows) keyboard listener thread: polls for 'q' key
|
|
9
9
|
|
|
10
10
|
4-Phase startup:
|
|
11
|
-
Phase 1: Registry → stdout port →
|
|
12
|
-
|
|
13
|
-
|
|
11
|
+
Phase 1: Registry + Event Hub (parallel start) → Registry stdout port → stdin broadcast port to Event Hub
|
|
12
|
+
→ API → register self + tokens → stdin launcher_ws_token to Event Hub
|
|
13
|
+
→ stdout ws_endpoint → WS connect → module.ready
|
|
14
|
+
Phase 2: (reserved — Event Hub ready handled in Phase 1)
|
|
15
|
+
Phase 3: Registry delayed ready (Event Hub → Registry → Event Hub WS → module.ready)
|
|
14
16
|
Phase 4: start remaining enabled modules in topo order
|
|
15
17
|
"""
|
|
16
18
|
|
|
@@ -34,9 +36,17 @@ from .process_manager import ProcessManager
|
|
|
34
36
|
|
|
35
37
|
IS_WINDOWS = sys.platform == "win32"
|
|
36
38
|
|
|
39
|
+
# Shutdown timeout constants (seconds)
|
|
40
|
+
SHUTDOWN_TIMEOUT_NON_GRACEFUL = 5 # Non-graceful modules or no ack response
|
|
41
|
+
SHUTDOWN_TIMEOUT_PARTIAL = 3 # Graceful module ack'd but no ready
|
|
42
|
+
SHUTDOWN_TIMEOUT_READY = 1 # Graceful module sent ready (cleanup done)
|
|
43
|
+
SHUTDOWN_TIMEOUT_BULK = 3 # Bulk stop_all() safety net
|
|
44
|
+
|
|
37
45
|
# Core module names that are started in Phase 1-2 (not Phase 4)
|
|
38
46
|
CORE_MODULE_NAMES = {"registry", "event_hub"}
|
|
39
47
|
|
|
48
|
+
WATCHDOG_MODULE_NAME = "watchdog"
|
|
49
|
+
|
|
40
50
|
|
|
41
51
|
class Launcher:
|
|
42
52
|
"""Kite system entry point. Starts Registry, manages modules, exposes API."""
|
|
@@ -65,9 +75,9 @@ class Launcher:
|
|
|
65
75
|
self.modules: dict[str, ModuleInfo] = {}
|
|
66
76
|
self._shutdown_event = asyncio.Event()
|
|
67
77
|
self._thread_shutdown = threading.Event()
|
|
78
|
+
self._shutdown_complete = threading.Event() # Set when normal shutdown finishes
|
|
68
79
|
self._api_server: uvicorn.Server | None = None
|
|
69
80
|
self._api_ready = threading.Event()
|
|
70
|
-
self._fail_counts: dict[str, int] = {} # module_name -> consecutive failure count
|
|
71
81
|
self._module_tokens: dict[str, str] = {} # module_name -> per-module token
|
|
72
82
|
|
|
73
83
|
# Three-layer state model: desired_state per module
|
|
@@ -83,15 +93,48 @@ class Launcher:
|
|
|
83
93
|
# Event waiters: {event_key: (asyncio.Event, data_dict)}
|
|
84
94
|
self._event_waiters: dict[str, tuple[asyncio.Event, dict]] = {}
|
|
85
95
|
|
|
96
|
+
# Module ready times: module_name -> seconds from start to ready
|
|
97
|
+
self._ready_times: dict[str, float] = {}
|
|
98
|
+
|
|
99
|
+
# Shared HTTP client for Registry communication (lazy-init, reuses TCP connections)
|
|
100
|
+
self._http: httpx.AsyncClient | None = None
|
|
101
|
+
|
|
102
|
+
# Module exit reasons: module_name -> reason string (for modules that sent module.exiting)
|
|
103
|
+
self._exit_reasons: dict[str, str] = {}
|
|
104
|
+
|
|
105
|
+
# Graceful shutdown capability: module_name -> True if module declared support
|
|
106
|
+
# Registry and Event Hub default to True (they start before Watchdog can observe)
|
|
107
|
+
self._graceful_modules: dict[str, bool] = {"registry": True, "event_hub": True}
|
|
108
|
+
|
|
109
|
+
# System-wide shutdown flag: prevents Watchdog restart during shutdown
|
|
110
|
+
self._system_shutting_down = False
|
|
111
|
+
|
|
86
112
|
# Kite stdout message waiters: {waiter_key: (threading.Event, data_dict)}
|
|
87
113
|
# Used by ProcessManager stdout callback (cross-thread)
|
|
88
114
|
self._msg_waiters: dict[str, tuple[threading.Event, dict]] = {}
|
|
89
115
|
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
)
|
|
116
|
+
suffix = self.process_manager.instance_suffix
|
|
117
|
+
state_dir = os.path.join(os.environ["KITE_INSTANCE_DIR"], "launcher", "state")
|
|
118
|
+
os.makedirs(state_dir, exist_ok=True)
|
|
119
|
+
self._lifecycle_log = os.path.join(state_dir, f"lifecycle{suffix}.jsonl")
|
|
120
|
+
# Clear lifecycle log on startup (like latest.log)
|
|
121
|
+
try:
|
|
122
|
+
with open(self._lifecycle_log, "w", encoding="utf-8") as f:
|
|
123
|
+
pass
|
|
124
|
+
except Exception:
|
|
125
|
+
pass
|
|
126
|
+
os.environ["KITE_INSTANCE_SUFFIX"] = suffix
|
|
93
127
|
self._app = self._create_api_app()
|
|
94
128
|
|
|
129
|
+
@staticmethod
|
|
130
|
+
def _fmt_elapsed(seconds: float) -> str:
|
|
131
|
+
"""Format elapsed seconds: <1s → 'NNNms', >=1s → 'N.Ns', >=10s → 'NNs'."""
|
|
132
|
+
if seconds < 1:
|
|
133
|
+
return f"{seconds * 1000:.0f}ms"
|
|
134
|
+
if seconds < 10:
|
|
135
|
+
return f"{seconds:.1f}s"
|
|
136
|
+
return f"{seconds:.0f}s"
|
|
137
|
+
|
|
95
138
|
# ── Instance workspace resolution ──
|
|
96
139
|
|
|
97
140
|
@staticmethod
|
|
@@ -124,7 +167,6 @@ class Launcher:
|
|
|
124
167
|
with open(cwd_file, "w", encoding="utf-8") as f:
|
|
125
168
|
f.write(cwd)
|
|
126
169
|
os.environ["KITE_INSTANCE_DIR"] = candidate
|
|
127
|
-
print(f"[launcher] 实例工作区已创建: {candidate}")
|
|
128
170
|
return
|
|
129
171
|
|
|
130
172
|
if os.path.isfile(cwd_file):
|
|
@@ -132,7 +174,6 @@ class Launcher:
|
|
|
132
174
|
with open(cwd_file, "r", encoding="utf-8") as f:
|
|
133
175
|
if f.read().strip() == cwd:
|
|
134
176
|
os.environ["KITE_INSTANCE_DIR"] = candidate
|
|
135
|
-
print(f"[launcher] 实例工作区已找到: {candidate}")
|
|
136
177
|
return
|
|
137
178
|
except Exception:
|
|
138
179
|
pass
|
|
@@ -180,8 +221,7 @@ class Launcher:
|
|
|
180
221
|
|
|
181
222
|
def run(self):
|
|
182
223
|
"""Synchronous entry point. Sets up signals, runs the async main loop."""
|
|
183
|
-
print("[launcher]
|
|
184
|
-
print("[launcher] ── 环境变量 ──")
|
|
224
|
+
print("[launcher] ── 环境 ──")
|
|
185
225
|
for key in sorted(k for k in os.environ if k.startswith("KITE_")):
|
|
186
226
|
print(f"[launcher] {key} = {os.environ[key]}")
|
|
187
227
|
print(f"[launcher] PID = {os.getpid()}")
|
|
@@ -197,6 +237,8 @@ class Launcher:
|
|
|
197
237
|
asyncio.run(self._async_main())
|
|
198
238
|
except KeyboardInterrupt:
|
|
199
239
|
pass
|
|
240
|
+
except RuntimeError as e:
|
|
241
|
+
print(f"[launcher] 启动失败: {e}")
|
|
200
242
|
finally:
|
|
201
243
|
self._final_cleanup()
|
|
202
244
|
|
|
@@ -204,7 +246,7 @@ class Launcher:
|
|
|
204
246
|
"""Request graceful shutdown. Thread-safe — can be called from signal handler or any thread."""
|
|
205
247
|
if self._thread_shutdown.is_set():
|
|
206
248
|
return # already shutting down
|
|
207
|
-
print(f"
|
|
249
|
+
print(f"[launcher] {reason or '收到关闭请求'}")
|
|
208
250
|
self._thread_shutdown.set()
|
|
209
251
|
# Wake up asyncio event loop immediately (so _monitor_loop / wait_for exits)
|
|
210
252
|
loop = self._loop
|
|
@@ -213,9 +255,19 @@ class Launcher:
|
|
|
213
255
|
loop.call_soon_threadsafe(self._shutdown_event.set)
|
|
214
256
|
except RuntimeError:
|
|
215
257
|
pass
|
|
216
|
-
# Safety net: force exit after
|
|
258
|
+
# Safety net: force exit after 10s only if normal shutdown hasn't completed
|
|
217
259
|
def _force():
|
|
218
|
-
|
|
260
|
+
if self._shutdown_complete.wait(timeout=10):
|
|
261
|
+
return # Normal shutdown completed — no need to force
|
|
262
|
+
try:
|
|
263
|
+
pm = self.process_manager
|
|
264
|
+
still = [n for n in pm._processes if pm.is_running(n)]
|
|
265
|
+
except Exception:
|
|
266
|
+
still = []
|
|
267
|
+
if still:
|
|
268
|
+
print(f"[launcher] 关闭超时,以下模块仍在运行: {', '.join(still)},强制退出")
|
|
269
|
+
else:
|
|
270
|
+
print("[launcher] 关闭超时,强制退出")
|
|
219
271
|
os._exit(1)
|
|
220
272
|
threading.Thread(target=_force, daemon=True).start()
|
|
221
273
|
|
|
@@ -266,62 +318,145 @@ class Launcher:
|
|
|
266
318
|
async def _async_main(self):
|
|
267
319
|
"""Full 4-phase startup sequence, then monitor loop."""
|
|
268
320
|
self._loop = asyncio.get_running_loop()
|
|
321
|
+
t_start = time.monotonic()
|
|
322
|
+
self._start_unix = time.time()
|
|
323
|
+
phase_times = {}
|
|
324
|
+
G = "\033[32m"
|
|
325
|
+
R = "\033[0m"
|
|
269
326
|
|
|
270
327
|
# Validate core modules exist (mechanism 12)
|
|
271
328
|
self._validate_core_modules()
|
|
272
329
|
|
|
273
|
-
# Cleanup leftovers from previous instances
|
|
274
|
-
self.process_manager.cleanup_leftovers()
|
|
275
|
-
|
|
276
|
-
# Phase 1: Registry bootstrap
|
|
277
|
-
await self._phase1_registry()
|
|
278
|
-
if self._shutdown_event.is_set(): return
|
|
330
|
+
# Cleanup leftovers from previous instances (current instance dir)
|
|
331
|
+
local_cleaned = self.process_manager.cleanup_leftovers()
|
|
279
332
|
|
|
280
|
-
#
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
await self._register_module_tokens()
|
|
288
|
-
if self._shutdown_event.is_set(): return
|
|
333
|
+
# Cross-directory leftover cleanup (background, non-blocking)
|
|
334
|
+
# run_in_executor returns a Future (not coroutine), so use ensure_future
|
|
335
|
+
self._global_cleanup_task = asyncio.ensure_future(
|
|
336
|
+
asyncio.get_running_loop().run_in_executor(
|
|
337
|
+
None, self.process_manager.cleanup_global_leftovers
|
|
338
|
+
)
|
|
339
|
+
)
|
|
289
340
|
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
341
|
+
try:
|
|
342
|
+
# Phase 1+2: Registry + Event Hub parallel bootstrap
|
|
343
|
+
t0 = time.monotonic()
|
|
344
|
+
await self._phase1_parallel_bootstrap()
|
|
345
|
+
elapsed_p1 = time.monotonic() - t0
|
|
346
|
+
phase_times["Phase 1+2: Registry + Event Hub (并行)"] = elapsed_p1
|
|
347
|
+
print(f"{G}[launcher] ✓ Phase 1+2 完成: Registry + Event Hub 已就绪 ({elapsed_p1:.2f}s){R}")
|
|
348
|
+
if self._shutdown_event.is_set(): return
|
|
349
|
+
|
|
350
|
+
# Phase 3: Wait for Registry delayed ready
|
|
351
|
+
t0 = time.monotonic()
|
|
352
|
+
await self._phase3_registry_ready()
|
|
353
|
+
elapsed = time.monotonic() - t0
|
|
354
|
+
phase_times["Phase 3: Registry 事件总线"] = elapsed
|
|
355
|
+
print(f"{G}[launcher] ✓ Phase 3 完成: Registry 已连接事件总线 ({elapsed:.2f}s){R}")
|
|
356
|
+
if self._shutdown_event.is_set(): return
|
|
357
|
+
|
|
358
|
+
# Initialize desired_state from config_state (needed before Phase 3.5)
|
|
359
|
+
for name, info in self.modules.items():
|
|
360
|
+
if info.state == "enabled":
|
|
361
|
+
self._desired_states[name] = "running"
|
|
362
|
+
else: # manual, disabled
|
|
363
|
+
self._desired_states[name] = "stopped"
|
|
364
|
+
# Core modules are already running
|
|
365
|
+
for cn in CORE_MODULE_NAMES:
|
|
366
|
+
self._desired_states[cn] = "running"
|
|
367
|
+
|
|
368
|
+
# Phase 3.5: Watchdog ready
|
|
369
|
+
# If started in parallel (Phase 1), just wait for module.ready
|
|
370
|
+
# Otherwise start it now (fallback)
|
|
371
|
+
watchdog_info = self.modules.get(WATCHDOG_MODULE_NAME)
|
|
372
|
+
if watchdog_info and self._desired_states.get(WATCHDOG_MODULE_NAME) == "running":
|
|
373
|
+
t0 = time.monotonic()
|
|
374
|
+
if getattr(self, '_watchdog_parallel', False):
|
|
375
|
+
print(f"[launcher] Phase 3.5: Watchdog 已并行启动,等待就绪...")
|
|
376
|
+
ready = await self._wait_event("module.ready", "watchdog", timeout=15)
|
|
377
|
+
elapsed = time.monotonic() - t0
|
|
378
|
+
if ready and not ready.get("_exited"):
|
|
379
|
+
self._graceful_modules["watchdog"] = bool(ready.get("graceful_shutdown"))
|
|
380
|
+
self._ready_times["watchdog"] = elapsed
|
|
381
|
+
print(f"[launcher] Watchdog 已就绪")
|
|
382
|
+
self._log_lifecycle("started", "watchdog")
|
|
383
|
+
await self._publish_event("module.started", {"module_id": "watchdog"})
|
|
384
|
+
self.process_manager.close_stdio("watchdog")
|
|
385
|
+
else:
|
|
386
|
+
print(f"[launcher] 警告: Watchdog 在 15s 内未就绪")
|
|
387
|
+
else:
|
|
388
|
+
print(f"[launcher] Phase 3.5: 启动 Watchdog...")
|
|
389
|
+
await self._start_one_module(watchdog_info)
|
|
390
|
+
elapsed = time.monotonic() - t0
|
|
391
|
+
print(f"{G}[launcher] ✓ Phase 3.5 完成: Watchdog ({elapsed:.2f}s){R}")
|
|
392
|
+
if self._shutdown_event.is_set(): return
|
|
393
|
+
|
|
394
|
+
# Phase 4: Start remaining enabled modules
|
|
395
|
+
t0 = time.monotonic()
|
|
396
|
+
await self._phase4_start_modules()
|
|
397
|
+
elapsed = time.monotonic() - t0
|
|
398
|
+
phase_times["Phase 4: Extensions"] = elapsed
|
|
399
|
+
print(f"{G}[launcher] ✓ Phase 4 完成: 扩展模块已启动 ({elapsed:.2f}s){R}")
|
|
400
|
+
if self._shutdown_event.is_set(): return
|
|
293
401
|
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
402
|
+
# Post-startup
|
|
403
|
+
self.process_manager.persist_records()
|
|
404
|
+
self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
|
|
297
405
|
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
406
|
+
# Wait for global leftover cleanup to finish (non-blocking with timeout)
|
|
407
|
+
global_cleaned = {}
|
|
408
|
+
if hasattr(self, '_global_cleanup_task'):
|
|
409
|
+
try:
|
|
410
|
+
global_cleaned = await asyncio.wait_for(self._global_cleanup_task, timeout=5) or {}
|
|
411
|
+
except asyncio.TimeoutError:
|
|
412
|
+
print("[launcher] 警告: 全局遗留清理超时 (5s),跳过")
|
|
413
|
+
except Exception as e:
|
|
414
|
+
print(f"[launcher] 警告: 全局遗留清理出错: {e}")
|
|
415
|
+
# Merge local + global cleanup stats
|
|
416
|
+
cleaned_stats: dict[str, int] = {}
|
|
417
|
+
for d in (local_cleaned, global_cleaned):
|
|
418
|
+
for k, v in d.items():
|
|
419
|
+
cleaned_stats[k] = cleaned_stats.get(k, 0) + v
|
|
420
|
+
|
|
421
|
+
# Global instance scan (via executor to avoid blocking)
|
|
422
|
+
global_instances = await asyncio.get_running_loop().run_in_executor(
|
|
423
|
+
None, self.process_manager.get_global_instances
|
|
424
|
+
)
|
|
308
425
|
|
|
309
|
-
|
|
310
|
-
|
|
426
|
+
# ── Startup report ──
|
|
427
|
+
total_time = time.monotonic() - t_start
|
|
428
|
+
await self._print_startup_report(total_time, phase_times,
|
|
429
|
+
global_instances=global_instances,
|
|
430
|
+
cleaned_stats=cleaned_stats)
|
|
431
|
+
# Notify all modules that system startup is complete
|
|
432
|
+
await self._publish_event("system.ready", {
|
|
433
|
+
"startup_time": round(total_time, 2),
|
|
434
|
+
})
|
|
311
435
|
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
436
|
+
print("[launcher] 进入监控循环 (按 Ctrl+C 或 'q' 退出)")
|
|
437
|
+
await self._monitor_loop()
|
|
438
|
+
finally:
|
|
439
|
+
try:
|
|
440
|
+
await self._graceful_shutdown_all()
|
|
441
|
+
except Exception as e:
|
|
442
|
+
print(f"[launcher] 优雅关闭出错: {e}")
|
|
315
443
|
|
|
316
|
-
|
|
317
|
-
await self._monitor_loop()
|
|
444
|
+
# ── Phase 1+2: Parallel bootstrap (Registry + Event Hub) ──
|
|
318
445
|
|
|
319
|
-
|
|
446
|
+
async def _phase1_parallel_bootstrap(self):
|
|
447
|
+
"""Start Registry + Event Hub processes in parallel to overlap cold-start time.
|
|
320
448
|
|
|
321
|
-
|
|
449
|
+
Flow:
|
|
450
|
+
1. Start Registry + Event Hub processes simultaneously
|
|
451
|
+
2. Wait for Registry to report port via stdout
|
|
452
|
+
3. Set KITE_REGISTRY_PORT env (for Phase 3.5/4 modules) + start API
|
|
453
|
+
4. Scan modules + register self & tokens (parallel)
|
|
454
|
+
5. Send launcher_ws_token + registry_port to Event Hub via stdin
|
|
455
|
+
6. Wait for Event Hub ws_endpoint → WS connect → module.ready
|
|
456
|
+
"""
|
|
457
|
+
t_registry = time.monotonic()
|
|
322
458
|
|
|
323
|
-
|
|
324
|
-
"""Start Registry → capture port from stdout → set env → start API → register self."""
|
|
459
|
+
# ── Step 1: Start both processes ──
|
|
325
460
|
registry_dir = os.path.join(os.environ["KITE_PROJECT"], "core", "registry")
|
|
326
461
|
registry_info = ModuleInfo(
|
|
327
462
|
name="registry",
|
|
@@ -332,30 +467,186 @@ class Launcher:
|
|
|
332
467
|
entry="entry.py",
|
|
333
468
|
module_dir=registry_dir,
|
|
334
469
|
)
|
|
335
|
-
|
|
336
|
-
boot_info = {"token": self.kite_token}
|
|
470
|
+
boot_info_registry = {"token": self.kite_token}
|
|
337
471
|
self._log_lifecycle("starting", "registry")
|
|
338
|
-
ok = self.process_manager.start_module(registry_info, boot_info=
|
|
472
|
+
ok = self.process_manager.start_module(registry_info, boot_info=boot_info_registry)
|
|
339
473
|
if not ok:
|
|
340
474
|
self._log_lifecycle("start_failed", "registry")
|
|
341
475
|
raise RuntimeError("启动 Registry 失败")
|
|
342
476
|
|
|
343
|
-
#
|
|
344
|
-
|
|
477
|
+
# Start Event Hub in parallel (before Registry port is known)
|
|
478
|
+
eh_dir = os.path.join(os.environ["KITE_PROJECT"], "core", "event_hub")
|
|
479
|
+
eh_info = ModuleInfo(
|
|
480
|
+
name="event_hub",
|
|
481
|
+
display_name="Event Hub",
|
|
482
|
+
type="infrastructure",
|
|
483
|
+
state="enabled",
|
|
484
|
+
runtime="python",
|
|
485
|
+
entry="entry.py",
|
|
486
|
+
module_dir=eh_dir,
|
|
487
|
+
)
|
|
488
|
+
# Generate Event Hub token early (will register to Registry once it's up)
|
|
489
|
+
eh_token = secrets.token_hex(32)
|
|
490
|
+
self._module_tokens["event_hub"] = eh_token
|
|
491
|
+
boot_info_eh = {"token": eh_token}
|
|
492
|
+
self._log_lifecycle("starting", "event_hub")
|
|
493
|
+
ok = self.process_manager.start_module(eh_info, boot_info=boot_info_eh)
|
|
494
|
+
if not ok:
|
|
495
|
+
self._log_lifecycle("start_failed", "event_hub")
|
|
496
|
+
raise RuntimeError("启动 Event Hub 失败")
|
|
497
|
+
|
|
498
|
+
# Start Watchdog in parallel (before Registry port is known)
|
|
499
|
+
# Watchdog will block on stdin waiting for registry_port
|
|
500
|
+
watchdog_dir = os.path.join(os.environ["KITE_PROJECT"], "extensions", "services", "watchdog")
|
|
501
|
+
watchdog_md = os.path.join(watchdog_dir, "module.md")
|
|
502
|
+
self._watchdog_parallel = False # track whether watchdog was started in parallel
|
|
503
|
+
if os.path.isfile(watchdog_md):
|
|
504
|
+
wd_token = secrets.token_hex(32)
|
|
505
|
+
self._module_tokens["watchdog"] = wd_token
|
|
506
|
+
# Parse watchdog module.md for ModuleInfo
|
|
507
|
+
try:
|
|
508
|
+
with open(watchdog_md, "r", encoding="utf-8") as f:
|
|
509
|
+
wd_fm = _parse_frontmatter(f.read())
|
|
510
|
+
wd_info = ModuleInfo(
|
|
511
|
+
name="watchdog",
|
|
512
|
+
display_name=wd_fm.get("display_name", "Watchdog"),
|
|
513
|
+
type=wd_fm.get("type", "service"),
|
|
514
|
+
state="enabled",
|
|
515
|
+
runtime=wd_fm.get("runtime", "python"),
|
|
516
|
+
entry=wd_fm.get("entry", "entry.py"),
|
|
517
|
+
module_dir=watchdog_dir,
|
|
518
|
+
)
|
|
519
|
+
boot_info_wd = {"token": wd_token}
|
|
520
|
+
self._log_lifecycle("starting", "watchdog")
|
|
521
|
+
ok = self.process_manager.start_module(wd_info, boot_info=boot_info_wd)
|
|
522
|
+
if ok:
|
|
523
|
+
self._watchdog_parallel = True
|
|
524
|
+
else:
|
|
525
|
+
self._log_lifecycle("start_failed", "watchdog")
|
|
526
|
+
print("[launcher] 警告: Watchdog 并行启动失败,将在 Phase 3.5 重试")
|
|
527
|
+
except Exception as e:
|
|
528
|
+
print(f"[launcher] 警告: Watchdog module.md 解析失败: {e}")
|
|
529
|
+
|
|
530
|
+
parallel_modules = "Registry + Event Hub" + (" + Watchdog" if self._watchdog_parallel else "")
|
|
531
|
+
print(f"[launcher] {parallel_modules} 进程已同时启动,等待 Registry 端口...")
|
|
532
|
+
|
|
533
|
+
# Persist immediately after starting core processes
|
|
534
|
+
self.process_manager.persist_records()
|
|
535
|
+
|
|
536
|
+
# ── Step 2: Wait for Registry port ──
|
|
345
537
|
msg = await self._wait_kite_message("registry", "port", timeout=6)
|
|
346
538
|
if not msg or not msg.get("port"):
|
|
347
539
|
raise RuntimeError("致命错误: Registry 在 6s 内未报告端口")
|
|
348
540
|
self.registry_port = int(msg["port"])
|
|
349
|
-
|
|
541
|
+
self._ready_times["registry"] = time.monotonic() - t_registry
|
|
542
|
+
_wait_s = time.monotonic() - t_registry
|
|
543
|
+
print(f"[launcher] Registry 端口: {self.registry_port} (等待 {self._fmt_elapsed(_wait_s)})")
|
|
350
544
|
|
|
351
|
-
# Set
|
|
545
|
+
# ── Step 3: Set env + start API + immediately unblock Event Hub ──
|
|
352
546
|
os.environ["KITE_REGISTRY_PORT"] = str(self.registry_port)
|
|
353
|
-
|
|
354
|
-
# Start Launcher API in a separate thread
|
|
355
547
|
self._start_api_thread()
|
|
356
548
|
|
|
357
|
-
#
|
|
358
|
-
|
|
549
|
+
# Send launcher_ws_token + registry_port to Event Hub ASAP (unblock it)
|
|
550
|
+
self._launcher_ws_token = secrets.token_hex(32)
|
|
551
|
+
self.process_manager.write_stdin("event_hub", {
|
|
552
|
+
"kite": "launcher_ws_token",
|
|
553
|
+
"launcher_ws_token": self._launcher_ws_token,
|
|
554
|
+
})
|
|
555
|
+
self.process_manager.write_stdin("event_hub", {
|
|
556
|
+
"kite": "registry_port",
|
|
557
|
+
"registry_port": self.registry_port,
|
|
558
|
+
})
|
|
559
|
+
|
|
560
|
+
# Send registry_port to Watchdog via stdin (if started in parallel)
|
|
561
|
+
# Watchdog will retry querying launcher.api_endpoint until it's available
|
|
562
|
+
if self.process_manager.is_running("watchdog"):
|
|
563
|
+
self.process_manager.write_stdin("watchdog", {
|
|
564
|
+
"kite": "registry_port",
|
|
565
|
+
"registry_port": self.registry_port,
|
|
566
|
+
})
|
|
567
|
+
|
|
568
|
+
# ── Step 4: Scan + register tokens ‖ wait for Event Hub ws_endpoint (parallel) ──
|
|
569
|
+
# Pre-register ws_endpoint waiter BEFORE gather to avoid race condition:
|
|
570
|
+
# module_scanner.scan() is synchronous and blocks the event loop,
|
|
571
|
+
# so the _wait_event_hub_endpoint coroutine wouldn't register its waiter in time.
|
|
572
|
+
ws_waiter_key = "event_hub:ws_endpoint"
|
|
573
|
+
ws_evt = threading.Event()
|
|
574
|
+
ws_data: dict = {}
|
|
575
|
+
self._msg_waiters[ws_waiter_key] = (ws_evt, ws_data)
|
|
576
|
+
|
|
577
|
+
async def _scan_and_register_tokens():
|
|
578
|
+
t_scan = time.monotonic()
|
|
579
|
+
self.modules = self.module_scanner.scan()
|
|
580
|
+
for name, info in self.modules.items():
|
|
581
|
+
self._log_lifecycle("scanned", name, state=info.state, module_dir=info.module_dir)
|
|
582
|
+
_scan_s = time.monotonic() - t_scan
|
|
583
|
+
print(f"[launcher] 发现 {len(self.modules)} 个模块: {', '.join(self.modules.keys()) or '(无)'} (扫描 {self._fmt_elapsed(_scan_s)})")
|
|
584
|
+
t_reg = time.monotonic()
|
|
585
|
+
await self._register_module_tokens()
|
|
586
|
+
_reg_s = time.monotonic() - t_reg
|
|
587
|
+
print(f"[launcher] 令牌注册完成 ({self._fmt_elapsed(_reg_s)})")
|
|
588
|
+
|
|
589
|
+
async def _wait_event_hub_endpoint():
|
|
590
|
+
t_wait_eh = time.monotonic()
|
|
591
|
+
print("[launcher] 等待 Event Hub ws_endpoint...")
|
|
592
|
+
shutdown = self._thread_shutdown
|
|
593
|
+
def _wait():
|
|
594
|
+
deadline = time.monotonic() + 10
|
|
595
|
+
while time.monotonic() < deadline:
|
|
596
|
+
if ws_evt.wait(timeout=0.5):
|
|
597
|
+
return True
|
|
598
|
+
if shutdown.is_set():
|
|
599
|
+
return False
|
|
600
|
+
return False
|
|
601
|
+
got = await asyncio.get_running_loop().run_in_executor(None, _wait)
|
|
602
|
+
self._msg_waiters.pop(ws_waiter_key, None)
|
|
603
|
+
if not got or not ws_data.get("ws_endpoint"):
|
|
604
|
+
raise RuntimeError("致命错误: Event Hub 在 10s 内未报告 ws_endpoint")
|
|
605
|
+
self._event_hub_ws_url = ws_data["ws_endpoint"]
|
|
606
|
+
_eh_s = time.monotonic() - t_wait_eh
|
|
607
|
+
print(f"[launcher] Event Hub 已发现: {self._event_hub_ws_url} (等待 {self._fmt_elapsed(_eh_s)})")
|
|
608
|
+
|
|
609
|
+
# Run all three in parallel: register_self + scan_tokens + wait_event_hub
|
|
610
|
+
await asyncio.gather(
|
|
611
|
+
self._register_self(),
|
|
612
|
+
_scan_and_register_tokens(),
|
|
613
|
+
_wait_event_hub_endpoint(),
|
|
614
|
+
)
|
|
615
|
+
if self._shutdown_event.is_set():
|
|
616
|
+
return
|
|
617
|
+
|
|
618
|
+
# ── Step 5: WS connect → module.ready ──
|
|
619
|
+
t_eh = time.monotonic()
|
|
620
|
+
self._ws_task = asyncio.create_task(self._ws_loop())
|
|
621
|
+
|
|
622
|
+
# Wait for Event Hub module.ready (sent when Launcher connects)
|
|
623
|
+
ready = await self._wait_event("module.ready", "event_hub", timeout=15)
|
|
624
|
+
if ready:
|
|
625
|
+
self._graceful_modules["event_hub"] = bool(ready.get("graceful_shutdown"))
|
|
626
|
+
print("[launcher] Event Hub 已就绪")
|
|
627
|
+
else:
|
|
628
|
+
print("[launcher] 警告: Event Hub 在 15s 内未发送 module.ready")
|
|
629
|
+
|
|
630
|
+
self._ready_times["event_hub"] = time.monotonic() - t_eh
|
|
631
|
+
self._log_lifecycle("started", "event_hub")
|
|
632
|
+
await self._publish_event("module.started", {"module_id": "event_hub"})
|
|
633
|
+
self.process_manager.close_stdio("event_hub")
|
|
634
|
+
|
|
635
|
+
# Store eh_info in modules dict if not already present (from scan)
|
|
636
|
+
if "event_hub" not in self.modules:
|
|
637
|
+
self.modules["event_hub"] = eh_info
|
|
638
|
+
|
|
639
|
+
def _get_http(self) -> httpx.AsyncClient:
    """Return the shared HTTP client, creating it on first use.

    A new ``httpx.AsyncClient`` (5-second timeout) is built when no client
    has been stored yet or when the stored one reports ``is_closed``;
    otherwise the stored client is handed back unchanged so it can be reused.
    """
    client = self._http
    if client is None or client.is_closed:
        client = httpx.AsyncClient(timeout=5)
        self._http = client
    return client
|
|
644
|
+
|
|
645
|
+
async def _close_http(self):
|
|
646
|
+
"""Close shared HTTP client."""
|
|
647
|
+
if self._http and not self._http.is_closed:
|
|
648
|
+
await self._http.aclose()
|
|
649
|
+
self._http = None
|
|
359
650
|
|
|
360
651
|
async def _register_self(self):
|
|
361
652
|
"""Register Launcher itself to Registry."""
|
|
@@ -376,86 +667,29 @@ class Launcher:
|
|
|
376
667
|
"events_subscribe": [">"],
|
|
377
668
|
}
|
|
378
669
|
try:
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
670
|
+
client = self._get_http()
|
|
671
|
+
resp = await client.post(url, json=payload, headers=headers)
|
|
672
|
+
if resp.status_code == 200:
|
|
673
|
+
print("[launcher] 已注册到 Registry")
|
|
674
|
+
else:
|
|
675
|
+
print(f"[launcher] 警告: Registry 注册返回 {resp.status_code}")
|
|
385
676
|
except Exception as e:
|
|
386
677
|
print(f"[launcher] 警告: 注册到 Registry 失败: {e}")
|
|
387
678
|
|
|
388
|
-
# ── Phase 2
|
|
389
|
-
|
|
390
|
-
async def _phase2_event_hub(self):
|
|
391
|
-
"""Start Event Hub → stdin launcher_ws_token → stdout ws_endpoint → WS connect → module.ready."""
|
|
392
|
-
# Find event_hub in scanned modules or build manually
|
|
393
|
-
eh_info = self.modules.get("event_hub")
|
|
394
|
-
if not eh_info:
|
|
395
|
-
eh_dir = os.path.join(os.environ["KITE_PROJECT"], "core", "event_hub")
|
|
396
|
-
eh_info = ModuleInfo(
|
|
397
|
-
name="event_hub",
|
|
398
|
-
display_name="Event Hub",
|
|
399
|
-
type="infrastructure",
|
|
400
|
-
state="enabled",
|
|
401
|
-
runtime="python",
|
|
402
|
-
entry="entry.py",
|
|
403
|
-
module_dir=eh_dir,
|
|
404
|
-
)
|
|
405
|
-
|
|
406
|
-
token = self._module_tokens.get("event_hub", "")
|
|
407
|
-
if not token:
|
|
408
|
-
token = secrets.token_hex(32)
|
|
409
|
-
self._module_tokens["event_hub"] = token
|
|
410
|
-
await self._register_tokens_to_registry({"event_hub": token})
|
|
411
|
-
|
|
412
|
-
boot_info = {"token": token}
|
|
413
|
-
self._log_lifecycle("starting", "event_hub")
|
|
414
|
-
ok = self.process_manager.start_module(eh_info, boot_info=boot_info)
|
|
415
|
-
if not ok:
|
|
416
|
-
self._log_lifecycle("start_failed", "event_hub")
|
|
417
|
-
raise RuntimeError("启动 Event Hub 失败")
|
|
418
|
-
|
|
419
|
-
# Send launcher_ws_token via stdin (mechanism 6)
|
|
420
|
-
self._launcher_ws_token = secrets.token_hex(32)
|
|
421
|
-
self.process_manager.write_stdin("event_hub", {
|
|
422
|
-
"kite": "launcher_ws_token",
|
|
423
|
-
"launcher_ws_token": self._launcher_ws_token,
|
|
424
|
-
})
|
|
425
|
-
|
|
426
|
-
# Wait for ws_endpoint from stdout (mechanism 5)
|
|
427
|
-
print("[launcher] 等待 Event Hub ws_endpoint...")
|
|
428
|
-
msg = await self._wait_kite_message("event_hub", "ws_endpoint", timeout=6)
|
|
429
|
-
if not msg or not msg.get("ws_endpoint"):
|
|
430
|
-
raise RuntimeError("致命错误: Event Hub 在 6s 内未报告 ws_endpoint")
|
|
431
|
-
self._event_hub_ws_url = msg["ws_endpoint"]
|
|
432
|
-
print(f"[launcher] Event Hub 已发现: {self._event_hub_ws_url}")
|
|
433
|
-
|
|
434
|
-
# Connect to Event Hub WebSocket with launcher_ws_token
|
|
435
|
-
self._ws_task = asyncio.create_task(self._ws_loop())
|
|
436
|
-
|
|
437
|
-
# Wait for Event Hub module.ready (sent when Launcher connects)
|
|
438
|
-
ready = await self._wait_event("module.ready", "event_hub", timeout=15)
|
|
439
|
-
if ready:
|
|
440
|
-
print("[launcher] Event Hub 已就绪")
|
|
441
|
-
else:
|
|
442
|
-
print("[launcher] 警告: Event Hub 在 15s 内未发送 module.ready")
|
|
443
|
-
|
|
444
|
-
self._log_lifecycle("started", "event_hub")
|
|
445
|
-
await self._publish_event("module.started", {"module_id": "event_hub"})
|
|
446
|
-
self.process_manager.close_stdio("event_hub")
|
|
679
|
+
# ── (Phase 2 merged into _phase1_parallel_bootstrap) ──
|
|
447
680
|
|
|
448
681
|
# ── Phase 3: Registry delayed ready ──
|
|
449
682
|
|
|
450
683
|
async def _phase3_registry_ready(self):
|
|
451
684
|
"""Wait for Registry module.ready (triggered after Event Hub registers to Registry
|
|
452
685
|
and Registry connects to Event Hub WS)."""
|
|
453
|
-
print("[launcher] 等待 Registry
|
|
686
|
+
print("[launcher] 等待 Registry 连接 Event Hub...")
|
|
454
687
|
ready = await self._wait_event("module.ready", "registry", timeout=12)
|
|
455
688
|
if ready:
|
|
456
|
-
|
|
689
|
+
self._graceful_modules["registry"] = bool(ready.get("graceful_shutdown"))
|
|
690
|
+
print("[launcher] Registry 事件总线连接完成")
|
|
457
691
|
else:
|
|
458
|
-
print("[launcher] 警告: Registry 在 12s
|
|
692
|
+
print("[launcher] 警告: Registry 在 12s 内未连接事件总线 (降级运行)")
|
|
459
693
|
|
|
460
694
|
self._log_lifecycle("started", "registry")
|
|
461
695
|
await self._publish_event("module.started", {"module_id": "registry"})
|
|
@@ -467,7 +701,8 @@ class Launcher:
|
|
|
467
701
|
"""Start enabled modules (excluding core) in dependency order."""
|
|
468
702
|
to_start = [m for m in self.modules.values()
|
|
469
703
|
if self._desired_states.get(m.name) == "running"
|
|
470
|
-
and m.name not in CORE_MODULE_NAMES
|
|
704
|
+
and m.name not in CORE_MODULE_NAMES
|
|
705
|
+
and m.name != WATCHDOG_MODULE_NAME]
|
|
471
706
|
if not to_start:
|
|
472
707
|
print("[launcher] 没有额外模块需要启动")
|
|
473
708
|
return
|
|
@@ -487,14 +722,18 @@ class Launcher:
|
|
|
487
722
|
print(f"[launcher] 错误: '{m.name}' 依赖已禁用的模块 '{dep}'")
|
|
488
723
|
|
|
489
724
|
try:
|
|
490
|
-
|
|
725
|
+
layers = self._topo_layers(to_start)
|
|
491
726
|
except RuntimeError as e:
|
|
492
727
|
print(f"[launcher] 错误: {e}")
|
|
493
728
|
return
|
|
494
729
|
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
730
|
+
total = sum(len(layer) for layer in layers)
|
|
731
|
+
print(f"[launcher] 正在启动 {total} 个模块...")
|
|
732
|
+
for layer in layers:
|
|
733
|
+
if len(layer) == 1:
|
|
734
|
+
await self._start_one_module(layer[0])
|
|
735
|
+
else:
|
|
736
|
+
await asyncio.gather(*(self._start_one_module(info) for info in layer))
|
|
498
737
|
|
|
499
738
|
# ── Event Hub WebSocket connection ──
|
|
500
739
|
|
|
@@ -506,16 +745,19 @@ class Launcher:
|
|
|
506
745
|
except asyncio.CancelledError:
|
|
507
746
|
return
|
|
508
747
|
except Exception as e:
|
|
509
|
-
|
|
748
|
+
if not self._system_shutting_down:
|
|
749
|
+
print(f"[launcher] Event Hub 连接错误: {e}")
|
|
510
750
|
self._ws = None
|
|
511
751
|
await asyncio.sleep(5)
|
|
512
752
|
|
|
513
753
|
async def _ws_connect(self):
|
|
514
754
|
"""Single WebSocket session with launcher_ws_token auth."""
|
|
515
|
-
ws_url = f"{self._event_hub_ws_url}?token={self._launcher_ws_token}"
|
|
516
|
-
|
|
755
|
+
ws_url = f"{self._event_hub_ws_url}?token={self._launcher_ws_token}&id=launcher"
|
|
756
|
+
t_ws_connect = time.monotonic()
|
|
757
|
+
async with websockets.connect(ws_url, open_timeout=3, ping_interval=None, ping_timeout=None, close_timeout=10) as ws:
|
|
517
758
|
self._ws = ws
|
|
518
|
-
|
|
759
|
+
_ws_s = time.monotonic() - t_ws_connect
|
|
760
|
+
print(f"[launcher] 已连接到 Event Hub ({self._fmt_elapsed(_ws_s)})")
|
|
519
761
|
|
|
520
762
|
# Subscribe to all events
|
|
521
763
|
await ws.send(json.dumps({
|
|
@@ -529,52 +771,89 @@ class Launcher:
|
|
|
529
771
|
msg = json.loads(raw)
|
|
530
772
|
except (json.JSONDecodeError, TypeError):
|
|
531
773
|
continue
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
waiter
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
774
|
+
try:
|
|
775
|
+
msg_type = msg.get("type", "")
|
|
776
|
+
if msg_type == "event":
|
|
777
|
+
source = msg.get("source", "unknown")
|
|
778
|
+
event = msg.get("event", "")
|
|
779
|
+
data = msg.get("data") if isinstance(msg.get("data"), dict) else {}
|
|
780
|
+
# Trigger event waiters
|
|
781
|
+
module_id = data.get("module_id", "")
|
|
782
|
+
waiter_key = f"{event}:{module_id}"
|
|
783
|
+
waiter = self._event_waiters.get(waiter_key)
|
|
784
|
+
if waiter:
|
|
785
|
+
waiter[1].update(data)
|
|
786
|
+
waiter[0].set()
|
|
787
|
+
# module.exiting also wakes module.ready waiter
|
|
788
|
+
# (module won't send ready — no point waiting)
|
|
789
|
+
if event == "module.exiting" and module_id:
|
|
790
|
+
ready_key = f"module.ready:{module_id}"
|
|
791
|
+
ready_waiter = self._event_waiters.get(ready_key)
|
|
792
|
+
if ready_waiter:
|
|
793
|
+
ready_waiter[1].update(data)
|
|
794
|
+
ready_waiter[1]["_exited"] = True
|
|
795
|
+
ready_waiter[0].set()
|
|
796
|
+
# module.crash → print red crash summary (real-time notification)
|
|
797
|
+
if event == "module.crash" and module_id:
|
|
798
|
+
RED = "\033[91m"
|
|
799
|
+
RESET = "\033[0m"
|
|
800
|
+
exc_type = data.get("exception_type", "Unknown")
|
|
801
|
+
preview = data.get("traceback_preview", "")
|
|
802
|
+
severity = data.get("severity", "error")
|
|
803
|
+
print(f"[launcher] {RED}模块 '{module_id}' 崩溃: "
|
|
804
|
+
f"{exc_type} — {preview}{RESET}")
|
|
805
|
+
_suffix = os.environ.get("KITE_INSTANCE_SUFFIX", "")
|
|
806
|
+
crash_log = os.path.join(
|
|
807
|
+
os.environ.get("KITE_INSTANCE_DIR", ""),
|
|
808
|
+
module_id, "log", f"crashes{_suffix}.jsonl"
|
|
809
|
+
)
|
|
810
|
+
print(f"[launcher] 崩溃日志: {crash_log}")
|
|
811
|
+
ts = msg.get("timestamp", "")
|
|
812
|
+
# Only log system events (module.*, watchdog.*) to avoid flooding
|
|
813
|
+
# from benchmark/test traffic
|
|
814
|
+
if not (event.startswith("module.") or event.startswith("watchdog.")):
|
|
815
|
+
continue
|
|
816
|
+
latency_str = ""
|
|
817
|
+
if ts:
|
|
818
|
+
try:
|
|
819
|
+
from datetime import datetime, timezone
|
|
820
|
+
sent = datetime.fromisoformat(ts)
|
|
821
|
+
delay_ms = (datetime.now(timezone.utc) - sent).total_seconds() * 1000
|
|
822
|
+
latency_str = f" ({delay_ms:.1f}ms)"
|
|
823
|
+
local_ts = sent.astimezone().strftime("%H:%M:%S")
|
|
824
|
+
except Exception:
|
|
825
|
+
local_ts = ts[11:19] if len(ts) >= 19 else ts
|
|
826
|
+
print(f"[{source}] {local_ts} {event}{latency_str}: {json.dumps(data, ensure_ascii=False)}")
|
|
827
|
+
else:
|
|
828
|
+
print(f"[{source}] {event}: {json.dumps(data, ensure_ascii=False)}")
|
|
829
|
+
elif msg_type == "error":
|
|
830
|
+
print(f"[launcher] Event Hub 错误: {msg.get('message')}")
|
|
831
|
+
except Exception as e:
|
|
832
|
+
print(f"[launcher] 事件处理异常(已忽略): {e}")
|
|
560
833
|
|
|
561
834
|
async def _publish_event(self, event_type: str, data: dict):
|
|
562
|
-
"""Publish an event to Event Hub via WebSocket.
|
|
835
|
+
"""Publish an event to Event Hub via WebSocket. Uses create_task to avoid
|
|
836
|
+
deadlock with _ws_connect recv loop (websockets 15.x send can block when
|
|
837
|
+
incoming frames are pending and recv is held by async-for)."""
|
|
563
838
|
if not self._ws:
|
|
564
839
|
return
|
|
565
840
|
from datetime import datetime, timezone
|
|
566
|
-
msg = {
|
|
841
|
+
msg = json.dumps({
|
|
567
842
|
"type": "event",
|
|
568
843
|
"event_id": str(uuid.uuid4()),
|
|
569
844
|
"event": event_type,
|
|
570
845
|
"source": "launcher",
|
|
571
846
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
572
847
|
"data": data,
|
|
573
|
-
}
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
848
|
+
})
|
|
849
|
+
|
|
850
|
+
async def _send():
|
|
851
|
+
try:
|
|
852
|
+
await self._ws.send(msg)
|
|
853
|
+
except Exception as e:
|
|
854
|
+
print(f"[launcher] 发布事件失败: {e}")
|
|
855
|
+
|
|
856
|
+
asyncio.create_task(_send())
|
|
578
857
|
|
|
579
858
|
def _publish_event_threadsafe(self, event_type: str, data: dict):
|
|
580
859
|
"""Publish event from non-async context (API thread). Fire-and-forget."""
|
|
@@ -599,53 +878,127 @@ class Launcher:
|
|
|
599
878
|
self._event_waiters.pop(key, None)
|
|
600
879
|
|
|
601
880
|
async def _graceful_stop(self, name: str, reason: str = "stop_requested", timeout: float = 10):
    """Stop one module, gracefully if it declared support for it.

    Sequence: check capability → send ``module.shutdown`` → wait for
    ``module.shutdown.ack`` → wait for ``module.shutdown.ready`` → terminate
    the process. Modules that did not declare ``graceful_shutdown`` in
    ``module.ready`` are terminated directly.

    Every exit path logs a ``stopped`` lifecycle entry and publishes
    ``module.stopped``.

    Args:
        name: Module id.
        reason: Reason recorded in the lifecycle log and the shutdown event.
        timeout: Upper bound (seconds) on the cleanup time granted after ack.
    """
    self._log_lifecycle("stopping", name, reason=reason)

    if not self._graceful_modules.get(name):
        self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
        self._log_lifecycle("stopped", name, reason=reason)
        await self._publish_event("module.stopped", {
            "module_id": name,
            "graceful_shutdown": False,
        })
        return

    await self._publish_event("module.shutdown", {
        "module_id": name, "reason": reason, "timeout": timeout,
    })

    ack = await self._wait_event("module.shutdown.ack", name, timeout=3)
    if not ack:
        # No acknowledgement: treat the module as unresponsive.
        self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_NON_GRACEFUL)
        # Keep the lifecycle log balanced with the "stopping" entry above,
        # as the other two exit paths do.
        self._log_lifecycle("stopped", name, reason=reason)
        await self._publish_event("module.stopped", {
            "module_id": name,
            "graceful_shutdown": self._graceful_modules.get(name, False),
        })
        return

    # estimated_cleanup comes from the module itself; fall back to the
    # caller's timeout if it is missing or not a number, and never exceed it.
    estimated = ack.get("estimated_cleanup", timeout)
    if isinstance(estimated, bool) or not isinstance(estimated, (int, float)):
        estimated = timeout
    estimated = max(0, min(estimated, timeout))

    ready = await self._wait_event("module.shutdown.ready", name, timeout=estimated)
    if ready:
        self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_READY)
    else:
        self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)

    self._log_lifecycle("stopped", name, reason=reason)
    await self._publish_event("module.stopped", {
        "module_id": name,
        "graceful_shutdown": self._graceful_modules.get(name, False),
    })
|
|
623
920
|
|
|
624
921
|
async def _graceful_shutdown_all(self):
    """Shut down all running modules. Order:

    1. Send shutdown to graceful modules (excl. Event Hub) — let them start cleanup
    2. Terminate non-graceful modules (runs during graceful cleanup)
    3. Wait for graceful modules to exit (process monitoring), then force-stop survivors
    4. Shut down Event Hub last (keeps event routing alive throughout)

    Event Hub is always deferred to step 4, whether or not it declared
    graceful shutdown. A final ``stop_all`` acts as a safety net and the
    shared HTTP client is closed on every path.
    """
    self._system_shutting_down = True
    running = [n for n in self.modules if self.process_manager.is_running(n)]
    # Also check core modules
    for cn in CORE_MODULE_NAMES:
        if self.process_manager.is_running(cn) and cn not in running:
            running.append(cn)
    if not running:
        print("[launcher] 没有运行中的模块需要关闭")
        # Release the HTTP client on this path too (the normal path does so
        # unconditionally at the end).
        await self._close_http()
        return

    # Defer Event Hub — it must stay alive to route shutdown events
    hub_running = "event_hub" in running
    hub_graceful = bool(self._graceful_modules.get("event_hub"))
    others = [n for n in running if n != "event_hub"]
    graceful_batch = [n for n in others if self._graceful_modules.get(n)]
    non_graceful = [n for n in others if not self._graceful_modules.get(n)]

    print(f"[launcher] 正在关闭 {len(running)} 个模块: {', '.join(running)}")

    # Phase 1: Notify graceful modules first (they start cleanup immediately)
    for name in graceful_batch:
        self._log_lifecycle("stopping", name, reason="system_shutdown")
        await self._publish_event("module.shutdown", {
            "module_id": name, "reason": "system_shutdown", "timeout": 5,
        })
    if graceful_batch:
        # _publish_event only schedules the send as a task. Yield once so the
        # queued notifications get a chance to go out before the synchronous
        # stop_module calls below (presumably blocking up to their timeout).
        await asyncio.sleep(0)

    # Phase 2: While graceful modules are cleaning up, terminate non-graceful ones
    if non_graceful:
        print(f"[launcher] 直接终止 {len(non_graceful)} 个不支持优雅退出的模块: {', '.join(non_graceful)}")
        for name in non_graceful:
            self._log_lifecycle("stopping", name, reason="system_shutdown")
            self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
            self._log_lifecycle("stopped", name, reason="system_shutdown")

    # Phase 3: Wait for graceful modules to exit (process monitoring)
    if graceful_batch:
        # Monotonic clock: immune to wall-clock adjustments during the wait.
        deadline = time.monotonic() + 5
        while time.monotonic() < deadline:
            still_running = [n for n in graceful_batch if self.process_manager.is_running(n)]
            if not still_running:
                print("[launcher] 所有优雅退出模块已自行退出")
                break
            remaining = max(0, deadline - time.monotonic())
            print(f"[launcher] 等待 {len(still_running)} 个模块退出 ({remaining:.0f}s): {', '.join(still_running)}")
            await asyncio.sleep(1)
        # Force kill survivors
        for name in graceful_batch:
            if self.process_manager.is_running(name):
                self.process_manager.stop_module(name, timeout=SHUTDOWN_TIMEOUT_PARTIAL)
            self._log_lifecycle("stopped", name, reason="system_shutdown")

    # Phase 4: All other modules exited — now shut down Event Hub
    if hub_running and self.process_manager.is_running("event_hub"):
        self._log_lifecycle("stopping", "event_hub", reason="system_shutdown")
        if hub_graceful:
            await self._publish_event("module.shutdown", {
                "module_id": "event_hub", "reason": "system_shutdown", "timeout": 5,
            })
            deadline = time.monotonic() + 5
            while time.monotonic() < deadline:
                if not self.process_manager.is_running("event_hub"):
                    print("[launcher] Event Hub 已退出")
                    break
                await asyncio.sleep(0.5)
        if self.process_manager.is_running("event_hub"):
            self.process_manager.stop_module("event_hub", timeout=SHUTDOWN_TIMEOUT_PARTIAL)
        self._log_lifecycle("stopped", "event_hub", reason="system_shutdown")

    # Final safety net
    try:
        self.process_manager.stop_all(timeout=SHUTDOWN_TIMEOUT_BULK)
    except Exception as e:
        print(f"[launcher] stop_all 出错: {e}")
    await self._close_http()
|
|
1001
|
+
|
|
649
1002
|
# ── Heartbeat to Registry ──
|
|
650
1003
|
|
|
651
1004
|
async def _heartbeat_loop(self):
|
|
@@ -653,13 +1006,12 @@ class Launcher:
|
|
|
653
1006
|
while not self._thread_shutdown.is_set():
|
|
654
1007
|
await asyncio.sleep(30)
|
|
655
1008
|
try:
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
)
|
|
1009
|
+
client = self._get_http()
|
|
1010
|
+
await client.post(
|
|
1011
|
+
f"http://127.0.0.1:{self.registry_port}/modules",
|
|
1012
|
+
json={"action": "heartbeat", "module_id": "launcher"},
|
|
1013
|
+
headers={"Authorization": f"Bearer {self.kite_token}"},
|
|
1014
|
+
)
|
|
663
1015
|
except Exception:
|
|
664
1016
|
pass
|
|
665
1017
|
|
|
@@ -691,6 +1043,42 @@ class Launcher:
|
|
|
691
1043
|
visit(m.name)
|
|
692
1044
|
return order
|
|
693
1045
|
|
|
1046
|
+
def _topo_layers(self, modules: list[ModuleInfo]) -> list[list[ModuleInfo]]:
|
|
1047
|
+
"""Topological sort into layers. Modules in the same layer have no
|
|
1048
|
+
inter-dependencies and can be started in parallel."""
|
|
1049
|
+
name_map = {m.name: m for m in modules}
|
|
1050
|
+
all_names = set(name_map.keys())
|
|
1051
|
+
|
|
1052
|
+
# Compute depth (longest path from root) for each module
|
|
1053
|
+
depth: dict[str, int] = {}
|
|
1054
|
+
in_stack: set[str] = set()
|
|
1055
|
+
|
|
1056
|
+
def get_depth(name: str) -> int:
|
|
1057
|
+
if name in depth:
|
|
1058
|
+
return depth[name]
|
|
1059
|
+
if name in in_stack:
|
|
1060
|
+
raise RuntimeError(f"Circular dependency detected involving '{name}'")
|
|
1061
|
+
in_stack.add(name)
|
|
1062
|
+
info = name_map.get(name)
|
|
1063
|
+
d = 0
|
|
1064
|
+
if info:
|
|
1065
|
+
for dep in info.depends_on:
|
|
1066
|
+
if dep in all_names:
|
|
1067
|
+
d = max(d, get_depth(dep) + 1)
|
|
1068
|
+
in_stack.remove(name)
|
|
1069
|
+
depth[name] = d
|
|
1070
|
+
return d
|
|
1071
|
+
|
|
1072
|
+
for name in all_names:
|
|
1073
|
+
get_depth(name)
|
|
1074
|
+
|
|
1075
|
+
# Group by depth
|
|
1076
|
+
max_depth = max(depth.values()) if depth else 0
|
|
1077
|
+
layers: list[list[ModuleInfo]] = [[] for _ in range(max_depth + 1)]
|
|
1078
|
+
for name, d in depth.items():
|
|
1079
|
+
layers[d].append(name_map[name])
|
|
1080
|
+
return layers
|
|
1081
|
+
|
|
694
1082
|
async def _start_one_module(self, info: ModuleInfo):
|
|
695
1083
|
"""Start a single module: publish starting → start process → wait ready → started → close stdio."""
|
|
696
1084
|
self._log_lifecycle("starting", info.name)
|
|
@@ -698,16 +1086,29 @@ class Launcher:
|
|
|
698
1086
|
|
|
699
1087
|
token = self._module_tokens.get(info.name, "")
|
|
700
1088
|
boot_info = {"token": token}
|
|
1089
|
+
t0 = time.monotonic()
|
|
701
1090
|
ok = self.process_manager.start_module(info, boot_info=boot_info)
|
|
702
1091
|
if not ok:
|
|
703
1092
|
self._log_lifecycle("start_failed", info.name)
|
|
704
1093
|
return
|
|
705
1094
|
|
|
706
|
-
#
|
|
1095
|
+
# Persist immediately after starting to ensure PID is recorded
|
|
1096
|
+
# (in case launcher crashes before Phase 4 completes)
|
|
1097
|
+
self.process_manager.persist_records()
|
|
1098
|
+
|
|
1099
|
+
# Wait for module.ready or module.exiting (whichever comes first)
|
|
707
1100
|
timeout = info.launch.timeout
|
|
708
1101
|
ready = await self._wait_event("module.ready", info.name, timeout=timeout)
|
|
709
|
-
|
|
710
|
-
|
|
1102
|
+
elapsed = time.monotonic() - t0
|
|
1103
|
+
if ready and ready.get("_exited"):
|
|
1104
|
+
# Module sent module.exiting before ready — it chose to quit
|
|
1105
|
+
reason = ready.get("reason", "unknown")
|
|
1106
|
+
self._exit_reasons[info.name] = reason
|
|
1107
|
+
print(f"[launcher] 模块 '{info.name}' 主动退出: {reason} ({elapsed:.2f}s)")
|
|
1108
|
+
elif ready:
|
|
1109
|
+
self._graceful_modules[info.name] = bool(ready.get("graceful_shutdown"))
|
|
1110
|
+
self._ready_times[info.name] = elapsed
|
|
1111
|
+
print(f"[launcher] 模块 '{info.name}' 已就绪 ({elapsed:.2f}s)")
|
|
711
1112
|
else:
|
|
712
1113
|
print(f"[launcher] 警告: '{info.name}' 在 {timeout}s 内未发送 module.ready")
|
|
713
1114
|
|
|
@@ -736,12 +1137,12 @@ class Launcher:
|
|
|
736
1137
|
url = f"http://127.0.0.1:{self.registry_port}/tokens"
|
|
737
1138
|
headers = {"Authorization": f"Bearer {self.kite_token}"}
|
|
738
1139
|
try:
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
1140
|
+
client = self._get_http()
|
|
1141
|
+
resp = await client.post(url, json=tokens, headers=headers)
|
|
1142
|
+
if resp.status_code == 200:
|
|
1143
|
+
print(f"[launcher] 已注册 {len(tokens)} 个模块令牌")
|
|
1144
|
+
else:
|
|
1145
|
+
print(f"[launcher] 警告: 令牌注册返回 {resp.status_code}")
|
|
745
1146
|
except Exception as e:
|
|
746
1147
|
print(f"[launcher] 警告: 注册模块令牌失败: {e}")
|
|
747
1148
|
|
|
@@ -799,49 +1200,90 @@ class Launcher:
|
|
|
799
1200
|
|
|
800
1201
|
print(f"[launcher] API 服务器已启动,端口 {self.api_port}")
|
|
801
1202
|
|
|
1203
|
+
# ── Module crash summary ──
|
|
1204
|
+
|
|
1205
|
+
def _print_module_crash_summary(self, name: str):
|
|
1206
|
+
"""Read module's crashes.jsonl last record and print red summary to console.
|
|
1207
|
+
Complement to module.crash event — reliable even if event was never sent."""
|
|
1208
|
+
RED = "\033[91m"
|
|
1209
|
+
RESET = "\033[0m"
|
|
1210
|
+
_suffix = os.environ.get("KITE_INSTANCE_SUFFIX", "")
|
|
1211
|
+
crash_log = os.path.join(
|
|
1212
|
+
os.environ.get("KITE_INSTANCE_DIR", ""), name, "log", f"crashes{_suffix}.jsonl"
|
|
1213
|
+
)
|
|
1214
|
+
if not os.path.isfile(crash_log):
|
|
1215
|
+
return
|
|
1216
|
+
try:
|
|
1217
|
+
with open(crash_log, "rb") as f:
|
|
1218
|
+
f.seek(0, 2)
|
|
1219
|
+
size = f.tell()
|
|
1220
|
+
if size == 0:
|
|
1221
|
+
return
|
|
1222
|
+
f.seek(max(0, size - 4096))
|
|
1223
|
+
lines = f.read().decode("utf-8").strip().split("\n")
|
|
1224
|
+
last = json.loads(lines[-1])
|
|
1225
|
+
exc_type = last.get("exception_type", "Unknown")
|
|
1226
|
+
ctx = last.get("context", {})
|
|
1227
|
+
file_name = ctx.get("file", "unknown")
|
|
1228
|
+
line_no = ctx.get("line", "?")
|
|
1229
|
+
print(f"[launcher] {RED}崩溃: "
|
|
1230
|
+
f"{exc_type} in {file_name}:{line_no}{RESET}")
|
|
1231
|
+
print(f"[launcher] 崩溃日志: {crash_log}")
|
|
1232
|
+
except Exception:
|
|
1233
|
+
pass
|
|
1234
|
+
|
|
802
1235
|
# ── Monitor loop ──
|
|
803
1236
|
|
|
804
1237
|
async def _monitor_loop(self):
|
|
805
1238
|
"""Check child processes every second. Handle crashes.
|
|
806
1239
|
Uses _shutdown_event (asyncio.Event) so Ctrl+C wakes us immediately.
|
|
1240
|
+
|
|
1241
|
+
Responsibility split:
|
|
1242
|
+
- Core module crash → full restart (Launcher handles)
|
|
1243
|
+
- Watchdog crash → Launcher restarts directly (up to 3 times)
|
|
1244
|
+
- Other module exit → publish module.stopped event only; Watchdog decides restart
|
|
807
1245
|
"""
|
|
808
|
-
|
|
809
|
-
|
|
1246
|
+
WATCHDOG_MAX_FAIL = 3
|
|
1247
|
+
watchdog_fail_count = 0
|
|
810
1248
|
|
|
811
1249
|
while not self._shutdown_event.is_set():
|
|
812
1250
|
exited = self.process_manager.check_exited()
|
|
813
1251
|
|
|
814
1252
|
for name, rc in exited:
|
|
815
1253
|
print(f"[launcher] 模块 '{name}' 退出,返回码 {rc}")
|
|
1254
|
+
if rc != 0:
|
|
1255
|
+
self._print_module_crash_summary(name)
|
|
816
1256
|
self._log_lifecycle("exited", name, exit_code=rc)
|
|
817
1257
|
await self._publish_event("module.stopped", {
|
|
818
1258
|
"module_id": name, "exit_code": rc,
|
|
1259
|
+
"graceful_shutdown": self._graceful_modules.get(name, False),
|
|
819
1260
|
})
|
|
820
1261
|
info = self.modules.get(name)
|
|
821
1262
|
|
|
822
|
-
# Core module crash → full restart
|
|
1263
|
+
# 1) Core module crash → full restart
|
|
823
1264
|
if name in CORE_MODULE_NAMES or (info and info.is_core()):
|
|
824
1265
|
print(f"[launcher] 严重: 核心模块 '{name}' 崩溃,正在全部重启...")
|
|
825
1266
|
self._log_lifecycle("core_crash", name, exit_code=rc)
|
|
826
1267
|
await self._full_restart()
|
|
827
1268
|
return
|
|
828
1269
|
|
|
829
|
-
#
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
1270
|
+
# 2) Watchdog crash → Launcher restarts directly
|
|
1271
|
+
if name == WATCHDOG_MODULE_NAME:
|
|
1272
|
+
if self._system_shutting_down:
|
|
1273
|
+
print(f"[launcher] Watchdog 退出(系统关闭中),跳过重启")
|
|
1274
|
+
continue
|
|
1275
|
+
watchdog_fail_count += 1
|
|
1276
|
+
if watchdog_fail_count <= WATCHDOG_MAX_FAIL and info:
|
|
1277
|
+
print(f"[launcher] Watchdog 崩溃,正在重启 (第 {watchdog_fail_count}/{WATCHDOG_MAX_FAIL} 次)...")
|
|
1278
|
+
await self._start_one_module(info)
|
|
1279
|
+
else:
|
|
1280
|
+
self._desired_states[name] = "stopped"
|
|
1281
|
+
self._log_lifecycle("failed", name, reason=f"exceeded {WATCHDOG_MAX_FAIL} retries")
|
|
1282
|
+
print(f"[launcher] Watchdog 失败 {WATCHDOG_MAX_FAIL} 次,已放弃")
|
|
1283
|
+
continue
|
|
840
1284
|
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
print(f"[launcher] {failed_count} 个模块永久失败,启动器退出")
|
|
844
|
-
return
|
|
1285
|
+
# 3) Other modules → event already published above; Watchdog decides restart
|
|
1286
|
+
# (no restart logic here — Watchdog handles it via module.stopped event)
|
|
845
1287
|
|
|
846
1288
|
if exited:
|
|
847
1289
|
self.process_manager.persist_records()
|
|
@@ -857,6 +1299,9 @@ class Launcher:
|
|
|
857
1299
|
"""Stop all modules, regenerate tokens, re-run Phase 1-4 (mechanism 10)."""
|
|
858
1300
|
print("[launcher] 全量重启: 正在停止所有模块...")
|
|
859
1301
|
|
|
1302
|
+
# Persist records before shutdown so cleanup_leftovers can find survivors
|
|
1303
|
+
self.process_manager.persist_records()
|
|
1304
|
+
|
|
860
1305
|
# Disconnect Event Hub WS
|
|
861
1306
|
if self._ws_task:
|
|
862
1307
|
self._ws_task.cancel()
|
|
@@ -869,7 +1314,13 @@ class Launcher:
|
|
|
869
1314
|
self._launcher_ws_token = ""
|
|
870
1315
|
|
|
871
1316
|
await self._graceful_shutdown_all()
|
|
872
|
-
|
|
1317
|
+
|
|
1318
|
+
# Cleanup any leftover processes that survived graceful shutdown.
|
|
1319
|
+
# Note: _graceful_shutdown_all() clears _processes/_records dicts, but
|
|
1320
|
+
# cleanup_leftovers() reads from processes.json (persisted above), so it can
|
|
1321
|
+
# still find and kill survivors.
|
|
1322
|
+
self.process_manager.cleanup_leftovers()
|
|
1323
|
+
|
|
873
1324
|
self._module_tokens.clear()
|
|
874
1325
|
|
|
875
1326
|
# Regenerate kite_token
|
|
@@ -878,12 +1329,7 @@ class Launcher:
|
|
|
878
1329
|
|
|
879
1330
|
print("[launcher] 全量重启: 重新执行 Phase 1-4...")
|
|
880
1331
|
try:
|
|
881
|
-
await self.
|
|
882
|
-
self.modules = self.module_scanner.scan()
|
|
883
|
-
for n, info in self.modules.items():
|
|
884
|
-
self._log_lifecycle("scanned", n, state=info.state, module_dir=info.module_dir)
|
|
885
|
-
await self._register_module_tokens()
|
|
886
|
-
await self._phase2_event_hub()
|
|
1332
|
+
await self._phase1_parallel_bootstrap()
|
|
887
1333
|
await self._phase3_registry_ready()
|
|
888
1334
|
await self._phase4_start_modules()
|
|
889
1335
|
self.process_manager.persist_records()
|
|
@@ -897,28 +1343,252 @@ class Launcher:
|
|
|
897
1343
|
|
|
898
1344
|
def _final_cleanup(self):
    """Called on exit — stop all processes, stop API, clear records.

    Cancels the WebSocket and heartbeat tasks, force-stops any process that
    is still running, asks the API server to exit and removes the persisted
    process records file. Errors are printed, never raised. The
    ``_shutdown_complete`` event is always set so the safety-net thread knows
    normal shutdown finished. On Windows the process is then ended with
    ``os._exit(0)``.
    """
    try:
        print("[launcher] 正在执行最终清理...")

        if self._ws_task:
            self._ws_task.cancel()
        if hasattr(self, '_heartbeat_task') and self._heartbeat_task:
            self._heartbeat_task.cancel()

        # Note: _graceful_shutdown_all() already called stop_all() in _async_main finally block.
        # This is just a safety check — should normally find nothing.
        remaining = [n for n in self.process_manager._processes
                     if self.process_manager.is_running(n)]
        if remaining:
            print(f"[launcher] 警告: 仍有残留进程 (不应出现): {', '.join(remaining)}")
            self.process_manager.stop_all(timeout=SHUTDOWN_TIMEOUT_BULK)
        else:
            print("[launcher] 无残留进程")

        if self._api_server:
            self._api_server.should_exit = True

        # Clear instance runtime files
        try:
            os.remove(self.process_manager.records_path)
        except OSError:
            pass
    except Exception as e:
        print(f"[launcher] 最终清理出错: {e}")
    finally:
        # Signal the safety-net thread that normal shutdown has completed
        self._shutdown_complete.set()
        print("[launcher] 再见。")

    if IS_WINDOWS:
        # os._exit() skips interpreter cleanup, including flushing stdio
        # buffers; flush explicitly so the messages above are not lost when
        # output is redirected to a pipe or file.
        import sys
        for stream in (sys.stdout, sys.stderr):
            try:
                stream.flush()
            except (AttributeError, OSError, ValueError):
                pass
        os._exit(0)
|
|
1381
|
+
|
|
1382
|
+
# ── Startup report ──
|
|
1383
|
+
|
|
1384
|
+
async def _print_startup_report(self, total_time: float, phase_times: dict[str, float], *,
|
|
1385
|
+
global_instances=None, cleaned_stats: dict[str, int] | None = None):
|
|
1386
|
+
"""Print a green startup summary with module list and timing."""
|
|
1387
|
+
G = "\033[32m" # green
|
|
1388
|
+
Y = "\033[33m" # yellow
|
|
1389
|
+
R = "\033[0m" # reset
|
|
1390
|
+
B = "\033[1;32m" # bold green
|
|
1391
|
+
|
|
1392
|
+
running = []
|
|
1393
|
+
exited = []
|
|
1394
|
+
stopped = []
|
|
1395
|
+
for name, info in self.modules.items():
|
|
1396
|
+
rec = self.process_manager.get_record(name)
|
|
1397
|
+
is_running = self.process_manager.is_running(name)
|
|
1398
|
+
if is_running and rec:
|
|
1399
|
+
running.append((name, info, rec))
|
|
1400
|
+
elif self._desired_states.get(name) == "running" and not is_running:
|
|
1401
|
+
# Was started but already exited (e.g. module.exiting)
|
|
1402
|
+
exited.append((name, info))
|
|
1403
|
+
else:
|
|
1404
|
+
stopped.append((name, info))
|
|
1405
|
+
|
|
1406
|
+
# Calculate kernel startup time (Phase 1+2+3)
|
|
1407
|
+
kernel_time = 0
|
|
1408
|
+
for phase_name in ["Phase 1+2: Registry + Event Hub (并行)", "Phase 3: Registry 事件总线"]:
|
|
1409
|
+
if phase_name in phase_times:
|
|
1410
|
+
kernel_time += phase_times[phase_name]
|
|
1411
|
+
|
|
1412
|
+
lines = [
|
|
1413
|
+
"",
|
|
1414
|
+
f"{B}{'=' * 60}",
|
|
1415
|
+
f" Kite 内核启动完成 耗时 {kernel_time:.2f}s",
|
|
1416
|
+
f" Kite 全部模块启动完成 总耗时 {total_time:.2f}s",
|
|
1417
|
+
f"{'=' * 60}{R}",
|
|
1418
|
+
]
|
|
1419
|
+
|
|
1420
|
+
# Phase breakdown
|
|
1421
|
+
lines.append(f"{G} 阶段耗时:{R}")
|
|
1422
|
+
|
|
1423
|
+
# Kernel modules section
|
|
1424
|
+
lines.append(f"{G} 内核模块:{R}")
|
|
1425
|
+
for phase_name in ["Phase 1+2: Registry + Event Hub (并行)", "Phase 3: Registry 事件总线"]:
|
|
1426
|
+
if phase_name in phase_times:
|
|
1427
|
+
elapsed = phase_times[phase_name]
|
|
1428
|
+
lines.append(f"{G} {phase_name:<26s} {elapsed:>6.2f}s{R}")
|
|
1429
|
+
|
|
1430
|
+
# Extension modules section
|
|
1431
|
+
lines.append(f"{G} 扩展模块:{R}")
|
|
1432
|
+
if "Phase 4: Extensions" in phase_times:
|
|
1433
|
+
elapsed = phase_times["Phase 4: Extensions"]
|
|
1434
|
+
lines.append(f"{G} {'Phase 4: Extensions':<26s} {elapsed:>6.2f}s{R}")
|
|
1435
|
+
|
|
1436
|
+
# Sort running modules by ready time
|
|
1437
|
+
running_sorted = sorted(running, key=lambda x: self._ready_times.get(x[0], float('inf')))
|
|
1438
|
+
|
|
1439
|
+
# Running modules with ready time and elapsed from Kite start
|
|
1440
|
+
DIM = "\033[90m"
|
|
1441
|
+
lines.append(f"{G} 运行中 ({len(running)}):{R}")
|
|
1442
|
+
|
|
1443
|
+
# CJK-aware display width helpers
|
|
1444
|
+
def _dw(s):
|
|
1445
|
+
"""Display width: CJK chars count as 2, others as 1."""
|
|
1446
|
+
w = 0
|
|
1447
|
+
for c in str(s):
|
|
1448
|
+
w += 2 if '\u4e00' <= c <= '\u9fff' or '\u3000' <= c <= '\u303f' or '\uff00' <= c <= '\uffef' else 1
|
|
1449
|
+
return w
|
|
1450
|
+
|
|
1451
|
+
def _rpad(s, width):
|
|
1452
|
+
"""Left-align s in a field of given display width."""
|
|
1453
|
+
return str(s) + ' ' * max(0, width - _dw(s))
|
|
1454
|
+
|
|
1455
|
+
def _lpad(s, width):
    """Right-align ``s``: prepend spaces until it fills ``width`` display columns.

    Text already at least ``width`` columns wide is returned unpadded.
    """
    text = str(s)
    missing = width - _dw(s)
    if missing <= 0:
        return text
    return ' ' * missing + text
|
|
1458
|
+
|
|
1459
|
+
# Column definitions: (header, align, min_width)
|
|
1460
|
+
headers = ['模块', 'PID', '启动耗时', '进程启动时长', '类型']
|
|
1461
|
+
aligns = ['left', 'right', 'right', 'right', 'left'] # alignment per column
|
|
1462
|
+
|
|
1463
|
+
# Build data rows first to calculate column widths
|
|
1464
|
+
rows = []
|
|
1465
|
+
for name, info, rec in running_sorted:
|
|
1466
|
+
label = info.display_name or name
|
|
1467
|
+
ready_t = self._ready_times.get(name)
|
|
1468
|
+
time_str = f"{ready_t:.2f}s" if ready_t is not None else "—"
|
|
1469
|
+
if ready_t is not None and hasattr(self, '_start_unix'):
|
|
1470
|
+
elapsed_from_start = (rec.started_at + ready_t) - self._start_unix
|
|
1471
|
+
es_str = f"{elapsed_from_start:.2f}s"
|
|
1472
|
+
else:
|
|
1473
|
+
es_str = "—"
|
|
1474
|
+
rows.append([label, str(rec.pid), time_str, es_str, f"[{info.type}]"])
|
|
1475
|
+
|
|
1476
|
+
# Calculate column widths: max of header and all data display widths
|
|
1477
|
+
col_widths = [_dw(h) for h in headers]
|
|
1478
|
+
for row in rows:
|
|
1479
|
+
for i, cell in enumerate(row):
|
|
1480
|
+
col_widths[i] = max(col_widths[i], _dw(cell))
|
|
1481
|
+
|
|
1482
|
+
# Render header
|
|
1483
|
+
hdr_parts = []
|
|
1484
|
+
for i, h in enumerate(headers):
|
|
1485
|
+
if aligns[i] == 'left':
|
|
1486
|
+
hdr_parts.append(_rpad(h, col_widths[i]))
|
|
1487
|
+
else:
|
|
1488
|
+
hdr_parts.append(_lpad(h, col_widths[i]))
|
|
1489
|
+
lines.append(f"{DIM} {' '.join(hdr_parts)}{R}")
|
|
1490
|
+
|
|
1491
|
+
# Render data rows
|
|
1492
|
+
for row in rows:
|
|
1493
|
+
parts = []
|
|
1494
|
+
for i, cell in enumerate(row):
|
|
1495
|
+
if aligns[i] == 'left':
|
|
1496
|
+
parts.append(_rpad(cell, col_widths[i]))
|
|
1497
|
+
else:
|
|
1498
|
+
parts.append(_lpad(cell, col_widths[i]))
|
|
1499
|
+
lines.append(f"{G} ✓ {' '.join(parts)}{R}")
|
|
1500
|
+
|
|
1501
|
+
# Exited modules (started but already quit)
|
|
1502
|
+
if exited:
|
|
1503
|
+
lines.append(f"{Y} 已退出 ({len(exited)}):{R}")
|
|
1504
|
+
for name, info in exited:
|
|
1505
|
+
label = info.display_name or name
|
|
1506
|
+
reason = self._exit_reasons.get(name, "")
|
|
1507
|
+
reason_str = f": {reason}" if reason else ""
|
|
1508
|
+
lines.append(f"{Y} ↗ {label:<20s} (主动退出{reason_str}){R}")
|
|
1509
|
+
|
|
1510
|
+
# Stopped modules
|
|
1511
|
+
if stopped:
|
|
1512
|
+
lines.append(f"{G} 未启动 ({len(stopped)}):{R}")
|
|
1513
|
+
for name, info in stopped:
|
|
1514
|
+
label = info.display_name or name
|
|
1515
|
+
lines.append(f"{G} - {label:<20s} ({info.state}){R}")
|
|
1516
|
+
|
|
1517
|
+
lines.append(f"{G} Launcher API: http://127.0.0.1:{self.api_port} 实例: {self.instance_id}{R}")
|
|
1518
|
+
|
|
1519
|
+
# Query Registry for web module's access URL
|
|
1520
|
+
web_url = await self._get_web_url()
|
|
1521
|
+
if web_url:
|
|
1522
|
+
lines.append(f"{B} Web 管理后台: {web_url}{R}")
|
|
1523
|
+
|
|
1524
|
+
# Instance info
|
|
1525
|
+
instances = self.process_manager.get_alive_instances()
|
|
1526
|
+
inst_num = self.process_manager.instance_num
|
|
1527
|
+
suffix_display = self.process_manager.instance_suffix or "(无)"
|
|
1528
|
+
inst_dir = os.environ.get("KITE_INSTANCE_DIR", "")
|
|
1529
|
+
cwd = os.environ.get("KITE_CWD", "")
|
|
1530
|
+
debug_flag = " [DEBUG]" if os.environ.get("KITE_DEBUG") == "1" else ""
|
|
1531
|
+
lines.append(f"{G} 当前实例: #{inst_num} 后缀: {suffix_display} PID: {os.getpid()}{debug_flag}{R}")
|
|
1532
|
+
lines.append(f"{G} 实例目录: {inst_dir}{R}")
|
|
1533
|
+
lines.append(f"{G} 工作目录: {cwd}{R}")
|
|
1534
|
+
if len(instances) > 1:
|
|
1535
|
+
lines.append(f"{G} 所有实例:{R}")
|
|
1536
|
+
for i in instances:
|
|
1537
|
+
s = "" if i["num"] == 1 else f"~{i['num']}"
|
|
1538
|
+
debug_tag = " [DEBUG]" if i.get("debug", False) else ""
|
|
1539
|
+
current_tag = " (当前)" if i["is_self"] else ""
|
|
1540
|
+
lines.append(f"{G} #{i['num']} PID {i['launcher_pid']} "
|
|
1541
|
+
f"模块数 {i['module_count']} (processes{s}.json){debug_tag}{current_tag}{R}")
|
|
1542
|
+
|
|
1543
|
+
# Cross-directory instances from other projects
|
|
1544
|
+
if global_instances:
|
|
1545
|
+
my_inst_basename = os.path.basename(os.environ.get("KITE_INSTANCE_DIR", ""))
|
|
1546
|
+
other_instances = [i for i in global_instances
|
|
1547
|
+
if not i["is_self"] and i["instance_dir"] != my_inst_basename]
|
|
1548
|
+
if other_instances:
|
|
1549
|
+
lines.append(f"{G} 其他项目实例:{R}")
|
|
1550
|
+
for i in other_instances:
|
|
1551
|
+
debug_tag = " [DEBUG]" if i.get("debug", False) else ""
|
|
1552
|
+
cwd_display = f" {i['cwd']}" if i["cwd"] else ""
|
|
1553
|
+
lines.append(
|
|
1554
|
+
f"{G} {i['instance_dir']:<20s} "
|
|
1555
|
+
f"#{i['num']} PID {i['launcher_pid']} "
|
|
1556
|
+
f"模块数 {i['module_count']}"
|
|
1557
|
+
f"{cwd_display}{debug_tag}{R}"
|
|
1558
|
+
)
|
|
908
1559
|
|
|
909
|
-
if
|
|
910
|
-
|
|
1560
|
+
if cleaned_stats:
|
|
1561
|
+
total = sum(cleaned_stats.values())
|
|
1562
|
+
if len(cleaned_stats) == 1:
|
|
1563
|
+
inst, count = next(iter(cleaned_stats.items()))
|
|
1564
|
+
lines.append(f"{Y} 已清理残留进程: {inst} ({count} 个){R}")
|
|
1565
|
+
else:
|
|
1566
|
+
lines.append(f"{Y} 已清理残留进程 (共 {total} 个):{R}")
|
|
1567
|
+
for inst, count in cleaned_stats.items():
|
|
1568
|
+
lines.append(f"{Y} {inst}: {count} 个{R}")
|
|
911
1569
|
|
|
912
|
-
|
|
913
|
-
|
|
1570
|
+
lines.append(f"{B}{'=' * 60}{R}")
|
|
1571
|
+
lines.append("")
|
|
1572
|
+
|
|
1573
|
+
print("\n".join(lines))
|
|
1574
|
+
|
|
1575
|
+
async def _get_web_url(self) -> str:
|
|
1576
|
+
"""Query Registry for the web module's api_endpoint. Returns URL or empty string."""
|
|
914
1577
|
try:
|
|
915
|
-
|
|
916
|
-
|
|
1578
|
+
client = self._get_http()
|
|
1579
|
+
resp = await client.get(
|
|
1580
|
+
f"http://127.0.0.1:{self.registry_port}/get/web.api_endpoint",
|
|
1581
|
+
headers={"Authorization": f"Bearer {self.kite_token}"},
|
|
1582
|
+
timeout=3,
|
|
1583
|
+
)
|
|
1584
|
+
if resp.status_code == 200:
|
|
1585
|
+
val = resp.json()
|
|
1586
|
+
if val and isinstance(val, str):
|
|
1587
|
+
# Show localhost instead of 127.0.0.1 for friendliness
|
|
1588
|
+
return val.replace("://127.0.0.1:", "://localhost:")
|
|
1589
|
+
except Exception:
|
|
917
1590
|
pass
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
if IS_WINDOWS:
|
|
921
|
-
os._exit(0)
|
|
1591
|
+
return ""
|
|
922
1592
|
|
|
923
1593
|
# ── Utilities ──
|
|
924
1594
|
|
|
@@ -930,7 +1600,6 @@ class Launcher:
|
|
|
930
1600
|
fm = _parse_frontmatter(f.read())
|
|
931
1601
|
discovery = fm.get("discovery")
|
|
932
1602
|
if isinstance(discovery, dict) and discovery:
|
|
933
|
-
print(f"[launcher] 发现来源: {', '.join(discovery.keys())}")
|
|
934
1603
|
return discovery
|
|
935
1604
|
except Exception as e:
|
|
936
1605
|
print(f"[launcher] 警告: 读取发现配置失败: {e}")
|
|
@@ -960,12 +1629,29 @@ class Launcher:
|
|
|
960
1629
|
|
|
961
1630
|
def _create_api_app(self) -> FastAPI:
|
|
962
1631
|
"""Create the FastAPI app with Launcher management routes."""
|
|
1632
|
+
from fastapi import Request, HTTPException
|
|
963
1633
|
app = FastAPI(title="Kite Launcher", docs_url=None, redoc_url=None)
|
|
964
1634
|
launcher = self
|
|
965
1635
|
|
|
1636
|
+
def _require_auth(request: Request):
    """Allow the request only if it comes from loopback with the launcher token.

    Args:
        request: The incoming request; its peer address and ``Authorization``
            header are inspected.

    Raises:
        HTTPException: 403 when the peer address is not a loopback address;
            401 when the ``Authorization`` header is missing, is not a
            ``Bearer`` credential, or its token does not match
            ``launcher.kite_token``.
    """
    # Function-scope import keeps this helper self-contained.
    from hmac import compare_digest

    # IP whitelist: only loopback peers are allowed.
    client_host = request.client.host if request.client else None
    if client_host not in ("127.0.0.1", "::1", "localhost"):
        raise HTTPException(status_code=403, detail="Access denied: only localhost allowed")

    # Bearer token verification
    auth = request.headers.get("Authorization", "")
    if not auth.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Missing or invalid Authorization header")
    token = auth[7:].strip()

    # NOTE(review): assumes launcher.kite_token is a str (it is interpolated
    # into Bearer headers elsewhere in this file) — confirm.
    expected = launcher.kite_token or ""
    # Fail closed if no launcher token is configured, so an empty presented
    # token can never authenticate. Compare as bytes in constant time:
    # compare_digest rejects non-ASCII str, and header values are untrusted.
    if not expected or not compare_digest(token.encode("utf-8"), expected.encode("utf-8")):
        raise HTTPException(status_code=401, detail="Invalid token")
|
|
1650
|
+
|
|
966
1651
|
@app.get("/launcher/modules")
|
|
967
|
-
async def list_modules():
|
|
1652
|
+
async def list_modules(request: Request):
|
|
968
1653
|
"""List all modules and their current status."""
|
|
1654
|
+
_require_auth(request)
|
|
969
1655
|
result = []
|
|
970
1656
|
for name, info in launcher.modules.items():
|
|
971
1657
|
running = launcher.process_manager.is_running(name)
|
|
@@ -983,8 +1669,9 @@ class Launcher:
|
|
|
983
1669
|
return result
|
|
984
1670
|
|
|
985
1671
|
@app.post("/launcher/modules/{name}/start")
|
|
986
|
-
async def start_module(name: str):
|
|
1672
|
+
async def start_module(name: str, request: Request):
|
|
987
1673
|
"""Start a module by name."""
|
|
1674
|
+
_require_auth(request)
|
|
988
1675
|
info = launcher.modules.get(name)
|
|
989
1676
|
if not info:
|
|
990
1677
|
raise HTTPException(404, f"Module '{name}' not found")
|
|
@@ -994,13 +1681,12 @@ class Launcher:
|
|
|
994
1681
|
if name not in launcher._module_tokens:
|
|
995
1682
|
launcher._module_tokens[name] = secrets.token_hex(32)
|
|
996
1683
|
try:
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
)
|
|
1684
|
+
client = launcher._get_http()
|
|
1685
|
+
await client.post(
|
|
1686
|
+
f"http://127.0.0.1:{launcher.registry_port}/tokens",
|
|
1687
|
+
json={name: launcher._module_tokens[name]},
|
|
1688
|
+
headers={"Authorization": f"Bearer {launcher.kite_token}"},
|
|
1689
|
+
)
|
|
1004
1690
|
except Exception as e:
|
|
1005
1691
|
print(f"[launcher] 警告: 注册 {name} 的令牌失败: {e}")
|
|
1006
1692
|
|
|
@@ -1009,7 +1695,6 @@ class Launcher:
|
|
|
1009
1695
|
ok = launcher.process_manager.start_module(info, boot_info=boot_info)
|
|
1010
1696
|
if ok:
|
|
1011
1697
|
launcher._desired_states[name] = "running"
|
|
1012
|
-
launcher._fail_counts.pop(name, None)
|
|
1013
1698
|
launcher.process_manager.persist_records()
|
|
1014
1699
|
rec = launcher.process_manager.get_record(name)
|
|
1015
1700
|
launcher._log_lifecycle("started", name, pid=rec.pid if rec else None, via="api")
|
|
@@ -1019,8 +1704,9 @@ class Launcher:
|
|
|
1019
1704
|
raise HTTPException(500, f"Failed to start '{name}'")
|
|
1020
1705
|
|
|
1021
1706
|
@app.post("/launcher/modules/{name}/stop")
|
|
1022
|
-
async def stop_module(name: str, body: dict = None):
|
|
1707
|
+
async def stop_module(name: str, request: Request, body: dict = None):
|
|
1023
1708
|
"""Stop a module with graceful shutdown."""
|
|
1709
|
+
_require_auth(request)
|
|
1024
1710
|
info = launcher.modules.get(name)
|
|
1025
1711
|
if not info:
|
|
1026
1712
|
raise HTTPException(404, f"Module '{name}' not found")
|
|
@@ -1031,8 +1717,9 @@ class Launcher:
|
|
|
1031
1717
|
return {"status": "stopped", "name": name}
|
|
1032
1718
|
|
|
1033
1719
|
@app.post("/launcher/modules/{name}/restart")
|
|
1034
|
-
async def restart_module(name: str, body: dict = None):
|
|
1720
|
+
async def restart_module(name: str, request: Request, body: dict = None):
|
|
1035
1721
|
"""Restart a module (stop + start)."""
|
|
1722
|
+
_require_auth(request)
|
|
1036
1723
|
info = launcher.modules.get(name)
|
|
1037
1724
|
if not info:
|
|
1038
1725
|
raise HTTPException(404, f"Module '{name}' not found")
|
|
@@ -1042,13 +1729,12 @@ class Launcher:
|
|
|
1042
1729
|
await launcher._graceful_stop(name, reason)
|
|
1043
1730
|
launcher._module_tokens[name] = secrets.token_hex(32)
|
|
1044
1731
|
try:
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
)
|
|
1732
|
+
client = launcher._get_http()
|
|
1733
|
+
await client.post(
|
|
1734
|
+
f"http://127.0.0.1:{launcher.registry_port}/tokens",
|
|
1735
|
+
json={name: launcher._module_tokens[name]},
|
|
1736
|
+
headers={"Authorization": f"Bearer {launcher.kite_token}"},
|
|
1737
|
+
)
|
|
1052
1738
|
except Exception:
|
|
1053
1739
|
pass
|
|
1054
1740
|
token = launcher._module_tokens[name]
|
|
@@ -1056,7 +1742,6 @@ class Launcher:
|
|
|
1056
1742
|
ok = launcher.process_manager.start_module(info, boot_info=boot_info)
|
|
1057
1743
|
if ok:
|
|
1058
1744
|
launcher._desired_states[name] = "running"
|
|
1059
|
-
launcher._fail_counts.pop(name, None)
|
|
1060
1745
|
launcher.process_manager.persist_records()
|
|
1061
1746
|
rec = launcher.process_manager.get_record(name)
|
|
1062
1747
|
launcher._log_lifecycle("started", name, pid=rec.pid if rec else None, via="restart_api")
|
|
@@ -1066,8 +1751,9 @@ class Launcher:
|
|
|
1066
1751
|
raise HTTPException(500, f"Failed to restart '{name}'")
|
|
1067
1752
|
|
|
1068
1753
|
@app.post("/launcher/rescan")
|
|
1069
|
-
async def rescan_modules():
|
|
1754
|
+
async def rescan_modules(request: Request):
|
|
1070
1755
|
"""Rescan module directories for new/removed modules."""
|
|
1756
|
+
_require_auth(request)
|
|
1071
1757
|
old_names = set(launcher.modules.keys())
|
|
1072
1758
|
launcher.modules = launcher.module_scanner.scan()
|
|
1073
1759
|
new_names = set(launcher.modules.keys())
|
|
@@ -1085,20 +1771,28 @@ class Launcher:
|
|
|
1085
1771
|
launcher._module_tokens[name] = secrets.token_hex(32)
|
|
1086
1772
|
new_tokens[name] = launcher._module_tokens[name]
|
|
1087
1773
|
try:
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
)
|
|
1774
|
+
client = launcher._get_http()
|
|
1775
|
+
await client.post(
|
|
1776
|
+
f"http://127.0.0.1:{launcher.registry_port}/tokens",
|
|
1777
|
+
json=new_tokens,
|
|
1778
|
+
headers={"Authorization": f"Bearer {launcher.kite_token}"},
|
|
1779
|
+
)
|
|
1095
1780
|
except Exception:
|
|
1096
1781
|
pass
|
|
1097
1782
|
return {"added": added, "removed": removed, "total": len(launcher.modules)}
|
|
1098
1783
|
|
|
1784
|
+
@app.post("/launcher/shutdown")
async def shutdown_launcher(request: Request, body: dict = None):
    """Shut down the entire Kite system (equivalent to Ctrl+C).

    The caller must pass ``_require_auth``. An optional ``reason`` field in
    the JSON body is forwarded to the launcher's shutdown request and echoed
    in the response; it defaults to ``"api_request"``.
    """
    _require_auth(request)
    payload = body if body else {}
    why = payload.get("reason", "api_request")
    launcher._request_shutdown(f"API shutdown request: {why}")
    return {"status": "shutting_down", "reason": why}
|
|
1791
|
+
|
|
1099
1792
|
@app.put("/launcher/modules/{name}/state")
|
|
1100
|
-
async def update_state(name: str, body: dict):
|
|
1793
|
+
async def update_state(name: str, request: Request, body: dict):
|
|
1101
1794
|
"""Update module state (enabled/manual/disabled). Writes to module.md."""
|
|
1795
|
+
_require_auth(request)
|
|
1102
1796
|
info = launcher.modules.get(name)
|
|
1103
1797
|
if not info:
|
|
1104
1798
|
raise HTTPException(404, f"Module '{name}' not found")
|